linux/net/ipv6/route.c
<<
>>
Prefs
   1/*
   2 *      Linux INET6 implementation
   3 *      FIB front-end.
   4 *
   5 *      Authors:
   6 *      Pedro Roque             <roque@di.fc.ul.pt>
   7 *
   8 *      This program is free software; you can redistribute it and/or
   9 *      modify it under the terms of the GNU General Public License
  10 *      as published by the Free Software Foundation; either version
  11 *      2 of the License, or (at your option) any later version.
  12 */
  13
  14/*      Changes:
  15 *
  16 *      YOSHIFUJI Hideaki @USAGI
  17 *              reworked default router selection.
  18 *              - respect outgoing interface
  19 *              - select from (probably) reachable routers (i.e.
  20 *              routers in REACHABLE, STALE, DELAY or PROBE states).
  21 *              - always select the same router if it is (probably)
  22 *              reachable.  otherwise, round-robin the list.
  23 *      Ville Nuorvala
  24 *              Fixed routing subtrees.
  25 */
  26
  27#define pr_fmt(fmt) "IPv6: " fmt
  28
  29#include <linux/capability.h>
  30#include <linux/errno.h>
  31#include <linux/export.h>
  32#include <linux/types.h>
  33#include <linux/times.h>
  34#include <linux/socket.h>
  35#include <linux/sockios.h>
  36#include <linux/net.h>
  37#include <linux/route.h>
  38#include <linux/netdevice.h>
  39#include <linux/in6.h>
  40#include <linux/mroute6.h>
  41#include <linux/init.h>
  42#include <linux/if_arp.h>
  43#include <linux/proc_fs.h>
  44#include <linux/seq_file.h>
  45#include <linux/nsproxy.h>
  46#include <linux/slab.h>
  47#include <linux/jhash.h>
  48#include <net/net_namespace.h>
  49#include <net/snmp.h>
  50#include <net/ipv6.h>
  51#include <net/ip6_fib.h>
  52#include <net/ip6_route.h>
  53#include <net/ndisc.h>
  54#include <net/addrconf.h>
  55#include <net/tcp.h>
  56#include <linux/rtnetlink.h>
  57#include <net/dst.h>
  58#include <net/dst_metadata.h>
  59#include <net/xfrm.h>
  60#include <net/netevent.h>
  61#include <net/netlink.h>
  62#include <net/nexthop.h>
  63#include <net/lwtunnel.h>
  64#include <net/ip_tunnels.h>
  65#include <net/l3mdev.h>
  66#include <net/ip.h>
  67#include <linux/uaccess.h>
  68
  69#ifdef CONFIG_SYSCTL
  70#include <linux/sysctl.h>
  71#endif
  72
  73static int ip6_rt_type_to_error(u8 fib6_type);
  74
  75#define CREATE_TRACE_POINTS
  76#include <trace/events/fib6.h>
  77EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
  78#undef CREATE_TRACE_POINTS
  79
  80enum rt6_nud_state {
  81        RT6_NUD_FAIL_HARD = -3,
  82        RT6_NUD_FAIL_PROBE = -2,
  83        RT6_NUD_FAIL_DO_RR = -1,
  84        RT6_NUD_SUCCEED = 1
  85};
  86
  87static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
  88static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
  89static unsigned int      ip6_mtu(const struct dst_entry *dst);
  90static struct dst_entry *ip6_negative_advice(struct dst_entry *);
  91static void             ip6_dst_destroy(struct dst_entry *);
  92static void             ip6_dst_ifdown(struct dst_entry *,
  93                                       struct net_device *dev, int how);
  94static int               ip6_dst_gc(struct dst_ops *ops);
  95
  96static int              ip6_pkt_discard(struct sk_buff *skb);
  97static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
  98static int              ip6_pkt_prohibit(struct sk_buff *skb);
  99static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
 100static void             ip6_link_failure(struct sk_buff *skb);
 101static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 102                                           struct sk_buff *skb, u32 mtu);
 103static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
 104                                        struct sk_buff *skb);
 105static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
 106static size_t rt6_nlmsg_size(struct fib6_info *rt);
 107static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 108                         struct fib6_info *rt, struct dst_entry *dst,
 109                         struct in6_addr *dest, struct in6_addr *src,
 110                         int iif, int type, u32 portid, u32 seq,
 111                         unsigned int flags);
 112static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
 113                                           struct in6_addr *daddr,
 114                                           struct in6_addr *saddr);
 115
 116#ifdef CONFIG_IPV6_ROUTE_INFO
 117static struct fib6_info *rt6_add_route_info(struct net *net,
 118                                           const struct in6_addr *prefix, int prefixlen,
 119                                           const struct in6_addr *gwaddr,
 120                                           struct net_device *dev,
 121                                           unsigned int pref);
 122static struct fib6_info *rt6_get_route_info(struct net *net,
 123                                           const struct in6_addr *prefix, int prefixlen,
 124                                           const struct in6_addr *gwaddr,
 125                                           struct net_device *dev);
 126#endif
 127
 128struct uncached_list {
 129        spinlock_t              lock;
 130        struct list_head        head;
 131};
 132
 133static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
 134
 135void rt6_uncached_list_add(struct rt6_info *rt)
 136{
 137        struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
 138
 139        rt->rt6i_uncached_list = ul;
 140
 141        spin_lock_bh(&ul->lock);
 142        list_add_tail(&rt->rt6i_uncached, &ul->head);
 143        spin_unlock_bh(&ul->lock);
 144}
 145
 146void rt6_uncached_list_del(struct rt6_info *rt)
 147{
 148        if (!list_empty(&rt->rt6i_uncached)) {
 149                struct uncached_list *ul = rt->rt6i_uncached_list;
 150                struct net *net = dev_net(rt->dst.dev);
 151
 152                spin_lock_bh(&ul->lock);
 153                list_del(&rt->rt6i_uncached);
 154                atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
 155                spin_unlock_bh(&ul->lock);
 156        }
 157}
 158
 159static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
 160{
 161        struct net_device *loopback_dev = net->loopback_dev;
 162        int cpu;
 163
 164        if (dev == loopback_dev)
 165                return;
 166
 167        for_each_possible_cpu(cpu) {
 168                struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
 169                struct rt6_info *rt;
 170
 171                spin_lock_bh(&ul->lock);
 172                list_for_each_entry(rt, &ul->head, rt6i_uncached) {
 173                        struct inet6_dev *rt_idev = rt->rt6i_idev;
 174                        struct net_device *rt_dev = rt->dst.dev;
 175
 176                        if (rt_idev->dev == dev) {
 177                                rt->rt6i_idev = in6_dev_get(loopback_dev);
 178                                in6_dev_put(rt_idev);
 179                        }
 180
 181                        if (rt_dev == dev) {
 182                                rt->dst.dev = loopback_dev;
 183                                dev_hold(rt->dst.dev);
 184                                dev_put(rt_dev);
 185                        }
 186                }
 187                spin_unlock_bh(&ul->lock);
 188        }
 189}
 190
 191static inline const void *choose_neigh_daddr(const struct in6_addr *p,
 192                                             struct sk_buff *skb,
 193                                             const void *daddr)
 194{
 195        if (!ipv6_addr_any(p))
 196                return (const void *) p;
 197        else if (skb)
 198                return &ipv6_hdr(skb)->daddr;
 199        return daddr;
 200}
 201
 202struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
 203                                   struct net_device *dev,
 204                                   struct sk_buff *skb,
 205                                   const void *daddr)
 206{
 207        struct neighbour *n;
 208
 209        daddr = choose_neigh_daddr(gw, skb, daddr);
 210        n = __ipv6_neigh_lookup(dev, daddr);
 211        if (n)
 212                return n;
 213        return neigh_create(&nd_tbl, daddr, dev);
 214}
 215
 216static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
 217                                              struct sk_buff *skb,
 218                                              const void *daddr)
 219{
 220        const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
 221
 222        return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
 223}
 224
 225static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 226{
 227        struct net_device *dev = dst->dev;
 228        struct rt6_info *rt = (struct rt6_info *)dst;
 229
 230        daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
 231        if (!daddr)
 232                return;
 233        if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
 234                return;
 235        if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
 236                return;
 237        __ipv6_confirm_neigh(dev, daddr);
 238}
 239
 240static struct dst_ops ip6_dst_ops_template = {
 241        .family                 =       AF_INET6,
 242        .gc                     =       ip6_dst_gc,
 243        .gc_thresh              =       1024,
 244        .check                  =       ip6_dst_check,
 245        .default_advmss         =       ip6_default_advmss,
 246        .mtu                    =       ip6_mtu,
 247        .cow_metrics            =       dst_cow_metrics_generic,
 248        .destroy                =       ip6_dst_destroy,
 249        .ifdown                 =       ip6_dst_ifdown,
 250        .negative_advice        =       ip6_negative_advice,
 251        .link_failure           =       ip6_link_failure,
 252        .update_pmtu            =       ip6_rt_update_pmtu,
 253        .redirect               =       rt6_do_redirect,
 254        .local_out              =       __ip6_local_out,
 255        .neigh_lookup           =       ip6_dst_neigh_lookup,
 256        .confirm_neigh          =       ip6_confirm_neigh,
 257};
 258
 259static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
 260{
 261        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
 262
 263        return mtu ? : dst->dev->mtu;
 264}
 265
 266static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
 267                                         struct sk_buff *skb, u32 mtu)
 268{
 269}
 270
 271static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
 272                                      struct sk_buff *skb)
 273{
 274}
 275
 276static struct dst_ops ip6_dst_blackhole_ops = {
 277        .family                 =       AF_INET6,
 278        .destroy                =       ip6_dst_destroy,
 279        .check                  =       ip6_dst_check,
 280        .mtu                    =       ip6_blackhole_mtu,
 281        .default_advmss         =       ip6_default_advmss,
 282        .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
 283        .redirect               =       ip6_rt_blackhole_redirect,
 284        .cow_metrics            =       dst_cow_metrics_generic,
 285        .neigh_lookup           =       ip6_dst_neigh_lookup,
 286};
 287
 288static const u32 ip6_template_metrics[RTAX_MAX] = {
 289        [RTAX_HOPLIMIT - 1] = 0,
 290};
 291
 292static const struct fib6_info fib6_null_entry_template = {
 293        .fib6_flags     = (RTF_REJECT | RTF_NONEXTHOP),
 294        .fib6_protocol  = RTPROT_KERNEL,
 295        .fib6_metric    = ~(u32)0,
 296        .fib6_ref       = ATOMIC_INIT(1),
 297        .fib6_type      = RTN_UNREACHABLE,
 298        .fib6_metrics   = (struct dst_metrics *)&dst_default_metrics,
 299};
 300
 301static const struct rt6_info ip6_null_entry_template = {
 302        .dst = {
 303                .__refcnt       = ATOMIC_INIT(1),
 304                .__use          = 1,
 305                .obsolete       = DST_OBSOLETE_FORCE_CHK,
 306                .error          = -ENETUNREACH,
 307                .input          = ip6_pkt_discard,
 308                .output         = ip6_pkt_discard_out,
 309        },
 310        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
 311};
 312
 313#ifdef CONFIG_IPV6_MULTIPLE_TABLES
 314
 315static const struct rt6_info ip6_prohibit_entry_template = {
 316        .dst = {
 317                .__refcnt       = ATOMIC_INIT(1),
 318                .__use          = 1,
 319                .obsolete       = DST_OBSOLETE_FORCE_CHK,
 320                .error          = -EACCES,
 321                .input          = ip6_pkt_prohibit,
 322                .output         = ip6_pkt_prohibit_out,
 323        },
 324        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
 325};
 326
 327static const struct rt6_info ip6_blk_hole_entry_template = {
 328        .dst = {
 329                .__refcnt       = ATOMIC_INIT(1),
 330                .__use          = 1,
 331                .obsolete       = DST_OBSOLETE_FORCE_CHK,
 332                .error          = -EINVAL,
 333                .input          = dst_discard,
 334                .output         = dst_discard_out,
 335        },
 336        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
 337};
 338
 339#endif
 340
 341static void rt6_info_init(struct rt6_info *rt)
 342{
 343        struct dst_entry *dst = &rt->dst;
 344
 345        memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
 346        INIT_LIST_HEAD(&rt->rt6i_uncached);
 347}
 348
 349/* allocate dst with ip6_dst_ops */
 350struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
 351                               int flags)
 352{
 353        struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
 354                                        1, DST_OBSOLETE_FORCE_CHK, flags);
 355
 356        if (rt) {
 357                rt6_info_init(rt);
 358                atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
 359        }
 360
 361        return rt;
 362}
 363EXPORT_SYMBOL(ip6_dst_alloc);
 364
 365static void ip6_dst_destroy(struct dst_entry *dst)
 366{
 367        struct rt6_info *rt = (struct rt6_info *)dst;
 368        struct fib6_info *from;
 369        struct inet6_dev *idev;
 370
 371        ip_dst_metrics_put(dst);
 372        rt6_uncached_list_del(rt);
 373
 374        idev = rt->rt6i_idev;
 375        if (idev) {
 376                rt->rt6i_idev = NULL;
 377                in6_dev_put(idev);
 378        }
 379
 380        rcu_read_lock();
 381        from = rcu_dereference(rt->from);
 382        rcu_assign_pointer(rt->from, NULL);
 383        fib6_info_release(from);
 384        rcu_read_unlock();
 385}
 386
 387static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 388                           int how)
 389{
 390        struct rt6_info *rt = (struct rt6_info *)dst;
 391        struct inet6_dev *idev = rt->rt6i_idev;
 392        struct net_device *loopback_dev =
 393                dev_net(dev)->loopback_dev;
 394
 395        if (idev && idev->dev != loopback_dev) {
 396                struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
 397                if (loopback_idev) {
 398                        rt->rt6i_idev = loopback_idev;
 399                        in6_dev_put(idev);
 400                }
 401        }
 402}
 403
 404static bool __rt6_check_expired(const struct rt6_info *rt)
 405{
 406        if (rt->rt6i_flags & RTF_EXPIRES)
 407                return time_after(jiffies, rt->dst.expires);
 408        else
 409                return false;
 410}
 411
 412static bool rt6_check_expired(const struct rt6_info *rt)
 413{
 414        struct fib6_info *from;
 415
 416        from = rcu_dereference(rt->from);
 417
 418        if (rt->rt6i_flags & RTF_EXPIRES) {
 419                if (time_after(jiffies, rt->dst.expires))
 420                        return true;
 421        } else if (from) {
 422                return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
 423                        fib6_check_expired(from);
 424        }
 425        return false;
 426}
 427
 428struct fib6_info *fib6_multipath_select(const struct net *net,
 429                                        struct fib6_info *match,
 430                                        struct flowi6 *fl6, int oif,
 431                                        const struct sk_buff *skb,
 432                                        int strict)
 433{
 434        struct fib6_info *sibling, *next_sibling;
 435
 436        /* We might have already computed the hash for ICMPv6 errors. In such
 437         * case it will always be non-zero. Otherwise now is the time to do it.
 438         */
 439        if (!fl6->mp_hash)
 440                fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
 441
 442        if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
 443                return match;
 444
 445        list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
 446                                 fib6_siblings) {
 447                int nh_upper_bound;
 448
 449                nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
 450                if (fl6->mp_hash > nh_upper_bound)
 451                        continue;
 452                if (rt6_score_route(sibling, oif, strict) < 0)
 453                        break;
 454                match = sibling;
 455                break;
 456        }
 457
 458        return match;
 459}
 460
 461/*
 462 *      Route lookup. rcu_read_lock() should be held.
 463 */
 464
 465static inline struct fib6_info *rt6_device_match(struct net *net,
 466                                                 struct fib6_info *rt,
 467                                                    const struct in6_addr *saddr,
 468                                                    int oif,
 469                                                    int flags)
 470{
 471        struct fib6_info *sprt;
 472
 473        if (!oif && ipv6_addr_any(saddr) &&
 474            !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
 475                return rt;
 476
 477        for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
 478                const struct net_device *dev = sprt->fib6_nh.nh_dev;
 479
 480                if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
 481                        continue;
 482
 483                if (oif) {
 484                        if (dev->ifindex == oif)
 485                                return sprt;
 486                } else {
 487                        if (ipv6_chk_addr(net, saddr, dev,
 488                                          flags & RT6_LOOKUP_F_IFACE))
 489                                return sprt;
 490                }
 491        }
 492
 493        if (oif && flags & RT6_LOOKUP_F_IFACE)
 494                return net->ipv6.fib6_null_entry;
 495
 496        return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
 497}
 498
 499#ifdef CONFIG_IPV6_ROUTER_PREF
 500struct __rt6_probe_work {
 501        struct work_struct work;
 502        struct in6_addr target;
 503        struct net_device *dev;
 504};
 505
 506static void rt6_probe_deferred(struct work_struct *w)
 507{
 508        struct in6_addr mcaddr;
 509        struct __rt6_probe_work *work =
 510                container_of(w, struct __rt6_probe_work, work);
 511
 512        addrconf_addr_solict_mult(&work->target, &mcaddr);
 513        ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
 514        dev_put(work->dev);
 515        kfree(work);
 516}
 517
 518static void rt6_probe(struct fib6_info *rt)
 519{
 520        struct __rt6_probe_work *work = NULL;
 521        const struct in6_addr *nh_gw;
 522        struct neighbour *neigh;
 523        struct net_device *dev;
 524        struct inet6_dev *idev;
 525
 526        /*
 527         * Okay, this does not seem to be appropriate
 528         * for now, however, we need to check if it
 529         * is really so; aka Router Reachability Probing.
 530         *
 531         * Router Reachability Probe MUST be rate-limited
 532         * to no more than one per minute.
 533         */
 534        if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
 535                return;
 536
 537        nh_gw = &rt->fib6_nh.nh_gw;
 538        dev = rt->fib6_nh.nh_dev;
 539        rcu_read_lock_bh();
 540        idev = __in6_dev_get(dev);
 541        neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
 542        if (neigh) {
 543                if (neigh->nud_state & NUD_VALID)
 544                        goto out;
 545
 546                write_lock(&neigh->lock);
 547                if (!(neigh->nud_state & NUD_VALID) &&
 548                    time_after(jiffies,
 549                               neigh->updated + idev->cnf.rtr_probe_interval)) {
 550                        work = kmalloc(sizeof(*work), GFP_ATOMIC);
 551                        if (work)
 552                                __neigh_set_probe_once(neigh);
 553                }
 554                write_unlock(&neigh->lock);
 555        } else if (time_after(jiffies, rt->last_probe +
 556                                       idev->cnf.rtr_probe_interval)) {
 557                work = kmalloc(sizeof(*work), GFP_ATOMIC);
 558        }
 559
 560        if (work) {
 561                rt->last_probe = jiffies;
 562                INIT_WORK(&work->work, rt6_probe_deferred);
 563                work->target = *nh_gw;
 564                dev_hold(dev);
 565                work->dev = dev;
 566                schedule_work(&work->work);
 567        }
 568
 569out:
 570        rcu_read_unlock_bh();
 571}
 572#else
 573static inline void rt6_probe(struct fib6_info *rt)
 574{
 575}
 576#endif
 577
 578/*
 579 * Default Router Selection (RFC 2461 6.3.6)
 580 */
 581static inline int rt6_check_dev(struct fib6_info *rt, int oif)
 582{
 583        const struct net_device *dev = rt->fib6_nh.nh_dev;
 584
 585        if (!oif || dev->ifindex == oif)
 586                return 2;
 587        return 0;
 588}
 589
 590static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
 591{
 592        enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
 593        struct neighbour *neigh;
 594
 595        if (rt->fib6_flags & RTF_NONEXTHOP ||
 596            !(rt->fib6_flags & RTF_GATEWAY))
 597                return RT6_NUD_SUCCEED;
 598
 599        rcu_read_lock_bh();
 600        neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
 601                                          &rt->fib6_nh.nh_gw);
 602        if (neigh) {
 603                read_lock(&neigh->lock);
 604                if (neigh->nud_state & NUD_VALID)
 605                        ret = RT6_NUD_SUCCEED;
 606#ifdef CONFIG_IPV6_ROUTER_PREF
 607                else if (!(neigh->nud_state & NUD_FAILED))
 608                        ret = RT6_NUD_SUCCEED;
 609                else
 610                        ret = RT6_NUD_FAIL_PROBE;
 611#endif
 612                read_unlock(&neigh->lock);
 613        } else {
 614                ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
 615                      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
 616        }
 617        rcu_read_unlock_bh();
 618
 619        return ret;
 620}
 621
 622static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
 623{
 624        int m;
 625
 626        m = rt6_check_dev(rt, oif);
 627        if (!m && (strict & RT6_LOOKUP_F_IFACE))
 628                return RT6_NUD_FAIL_HARD;
 629#ifdef CONFIG_IPV6_ROUTER_PREF
 630        m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
 631#endif
 632        if (strict & RT6_LOOKUP_F_REACHABLE) {
 633                int n = rt6_check_neigh(rt);
 634                if (n < 0)
 635                        return n;
 636        }
 637        return m;
 638}
 639
 640/* called with rc_read_lock held */
 641static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
 642{
 643        const struct net_device *dev = fib6_info_nh_dev(f6i);
 644        bool rc = false;
 645
 646        if (dev) {
 647                const struct inet6_dev *idev = __in6_dev_get(dev);
 648
 649                rc = !!idev->cnf.ignore_routes_with_linkdown;
 650        }
 651
 652        return rc;
 653}
 654
 655static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
 656                                   int *mpri, struct fib6_info *match,
 657                                   bool *do_rr)
 658{
 659        int m;
 660        bool match_do_rr = false;
 661
 662        if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
 663                goto out;
 664
 665        if (fib6_ignore_linkdown(rt) &&
 666            rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
 667            !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
 668                goto out;
 669
 670        if (fib6_check_expired(rt))
 671                goto out;
 672
 673        m = rt6_score_route(rt, oif, strict);
 674        if (m == RT6_NUD_FAIL_DO_RR) {
 675                match_do_rr = true;
 676                m = 0; /* lowest valid score */
 677        } else if (m == RT6_NUD_FAIL_HARD) {
 678                goto out;
 679        }
 680
 681        if (strict & RT6_LOOKUP_F_REACHABLE)
 682                rt6_probe(rt);
 683
 684        /* note that m can be RT6_NUD_FAIL_PROBE at this point */
 685        if (m > *mpri) {
 686                *do_rr = match_do_rr;
 687                *mpri = m;
 688                match = rt;
 689        }
 690out:
 691        return match;
 692}
 693
 694static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
 695                                     struct fib6_info *leaf,
 696                                     struct fib6_info *rr_head,
 697                                     u32 metric, int oif, int strict,
 698                                     bool *do_rr)
 699{
 700        struct fib6_info *rt, *match, *cont;
 701        int mpri = -1;
 702
 703        match = NULL;
 704        cont = NULL;
 705        for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
 706                if (rt->fib6_metric != metric) {
 707                        cont = rt;
 708                        break;
 709                }
 710
 711                match = find_match(rt, oif, strict, &mpri, match, do_rr);
 712        }
 713
 714        for (rt = leaf; rt && rt != rr_head;
 715             rt = rcu_dereference(rt->fib6_next)) {
 716                if (rt->fib6_metric != metric) {
 717                        cont = rt;
 718                        break;
 719                }
 720
 721                match = find_match(rt, oif, strict, &mpri, match, do_rr);
 722        }
 723
 724        if (match || !cont)
 725                return match;
 726
 727        for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
 728                match = find_match(rt, oif, strict, &mpri, match, do_rr);
 729
 730        return match;
 731}
 732
 733static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
 734                                   int oif, int strict)
 735{
 736        struct fib6_info *leaf = rcu_dereference(fn->leaf);
 737        struct fib6_info *match, *rt0;
 738        bool do_rr = false;
 739        int key_plen;
 740
 741        if (!leaf || leaf == net->ipv6.fib6_null_entry)
 742                return net->ipv6.fib6_null_entry;
 743
 744        rt0 = rcu_dereference(fn->rr_ptr);
 745        if (!rt0)
 746                rt0 = leaf;
 747
 748        /* Double check to make sure fn is not an intermediate node
 749         * and fn->leaf does not points to its child's leaf
 750         * (This might happen if all routes under fn are deleted from
 751         * the tree and fib6_repair_tree() is called on the node.)
 752         */
 753        key_plen = rt0->fib6_dst.plen;
 754#ifdef CONFIG_IPV6_SUBTREES
 755        if (rt0->fib6_src.plen)
 756                key_plen = rt0->fib6_src.plen;
 757#endif
 758        if (fn->fn_bit != key_plen)
 759                return net->ipv6.fib6_null_entry;
 760
 761        match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
 762                             &do_rr);
 763
 764        if (do_rr) {
 765                struct fib6_info *next = rcu_dereference(rt0->fib6_next);
 766
 767                /* no entries matched; do round-robin */
 768                if (!next || next->fib6_metric != rt0->fib6_metric)
 769                        next = leaf;
 770
 771                if (next != rt0) {
 772                        spin_lock_bh(&leaf->fib6_table->tb6_lock);
 773                        /* make sure next is not being deleted from the tree */
 774                        if (next->fib6_node)
 775                                rcu_assign_pointer(fn->rr_ptr, next);
 776                        spin_unlock_bh(&leaf->fib6_table->tb6_lock);
 777                }
 778        }
 779
 780        return match ? match : net->ipv6.fib6_null_entry;
 781}
 782
 783static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
 784{
 785        return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
 786}
 787
 788#ifdef CONFIG_IPV6_ROUTE_INFO
 789int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 790                  const struct in6_addr *gwaddr)
 791{
 792        struct net *net = dev_net(dev);
 793        struct route_info *rinfo = (struct route_info *) opt;
 794        struct in6_addr prefix_buf, *prefix;
 795        unsigned int pref;
 796        unsigned long lifetime;
 797        struct fib6_info *rt;
 798
 799        if (len < sizeof(struct route_info)) {
 800                return -EINVAL;
 801        }
 802
 803        /* Sanity check for prefix_len and length */
 804        if (rinfo->length > 3) {
 805                return -EINVAL;
 806        } else if (rinfo->prefix_len > 128) {
 807                return -EINVAL;
 808        } else if (rinfo->prefix_len > 64) {
 809                if (rinfo->length < 2) {
 810                        return -EINVAL;
 811                }
 812        } else if (rinfo->prefix_len > 0) {
 813                if (rinfo->length < 1) {
 814                        return -EINVAL;
 815                }
 816        }
 817
 818        pref = rinfo->route_pref;
 819        if (pref == ICMPV6_ROUTER_PREF_INVALID)
 820                return -EINVAL;
 821
 822        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
 823
 824        if (rinfo->length == 3)
 825                prefix = (struct in6_addr *)rinfo->prefix;
 826        else {
 827                /* this function is safe */
 828                ipv6_addr_prefix(&prefix_buf,
 829                                 (struct in6_addr *)rinfo->prefix,
 830                                 rinfo->prefix_len);
 831                prefix = &prefix_buf;
 832        }
 833
 834        if (rinfo->prefix_len == 0)
 835                rt = rt6_get_dflt_router(net, gwaddr, dev);
 836        else
 837                rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
 838                                        gwaddr, dev);
 839
 840        if (rt && !lifetime) {
 841                ip6_del_rt(net, rt);
 842                rt = NULL;
 843        }
 844
 845        if (!rt && lifetime)
 846                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
 847                                        dev, pref);
 848        else if (rt)
 849                rt->fib6_flags = RTF_ROUTEINFO |
 850                                 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
 851
 852        if (rt) {
 853                if (!addrconf_finite_timeout(lifetime))
 854                        fib6_clean_expires(rt);
 855                else
 856                        fib6_set_expires(rt, jiffies + HZ * lifetime);
 857
 858                fib6_info_release(rt);
 859        }
 860        return 0;
 861}
 862#endif
 863
 864/*
 865 *      Misc support functions
 866 */
 867
 868/* called with rcu_lock held */
 869static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
 870{
 871        struct net_device *dev = rt->fib6_nh.nh_dev;
 872
 873        if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
 874                /* for copies of local routes, dst->dev needs to be the
 875                 * device if it is a master device, the master device if
 876                 * device is enslaved, and the loopback as the default
 877                 */
 878                if (netif_is_l3_slave(dev) &&
 879                    !rt6_need_strict(&rt->fib6_dst.addr))
 880                        dev = l3mdev_master_dev_rcu(dev);
 881                else if (!netif_is_l3_master(dev))
 882                        dev = dev_net(dev)->loopback_dev;
 883                /* last case is netif_is_l3_master(dev) is true in which
 884                 * case we want dev returned to be dev
 885                 */
 886        }
 887
 888        return dev;
 889}
 890
 891static const int fib6_prop[RTN_MAX + 1] = {
 892        [RTN_UNSPEC]    = 0,
 893        [RTN_UNICAST]   = 0,
 894        [RTN_LOCAL]     = 0,
 895        [RTN_BROADCAST] = 0,
 896        [RTN_ANYCAST]   = 0,
 897        [RTN_MULTICAST] = 0,
 898        [RTN_BLACKHOLE] = -EINVAL,
 899        [RTN_UNREACHABLE] = -EHOSTUNREACH,
 900        [RTN_PROHIBIT]  = -EACCES,
 901        [RTN_THROW]     = -EAGAIN,
 902        [RTN_NAT]       = -EINVAL,
 903        [RTN_XRESOLVE]  = -EINVAL,
 904};
 905
 906static int ip6_rt_type_to_error(u8 fib6_type)
 907{
 908        return fib6_prop[fib6_type];
 909}
 910
 911static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
 912{
 913        unsigned short flags = 0;
 914
 915        if (rt->dst_nocount)
 916                flags |= DST_NOCOUNT;
 917        if (rt->dst_nopolicy)
 918                flags |= DST_NOPOLICY;
 919        if (rt->dst_host)
 920                flags |= DST_HOST;
 921
 922        return flags;
 923}
 924
 925static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
 926{
 927        rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
 928
 929        switch (ort->fib6_type) {
 930        case RTN_BLACKHOLE:
 931                rt->dst.output = dst_discard_out;
 932                rt->dst.input = dst_discard;
 933                break;
 934        case RTN_PROHIBIT:
 935                rt->dst.output = ip6_pkt_prohibit_out;
 936                rt->dst.input = ip6_pkt_prohibit;
 937                break;
 938        case RTN_THROW:
 939        case RTN_UNREACHABLE:
 940        default:
 941                rt->dst.output = ip6_pkt_discard_out;
 942                rt->dst.input = ip6_pkt_discard;
 943                break;
 944        }
 945}
 946
 947static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
 948{
 949        if (ort->fib6_flags & RTF_REJECT) {
 950                ip6_rt_init_dst_reject(rt, ort);
 951                return;
 952        }
 953
 954        rt->dst.error = 0;
 955        rt->dst.output = ip6_output;
 956
 957        if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
 958                rt->dst.input = ip6_input;
 959        } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
 960                rt->dst.input = ip6_mc_input;
 961        } else {
 962                rt->dst.input = ip6_forward;
 963        }
 964
 965        if (ort->fib6_nh.nh_lwtstate) {
 966                rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
 967                lwtunnel_set_redirect(&rt->dst);
 968        }
 969
 970        rt->dst.lastuse = jiffies;
 971}
 972
 973/* Caller must already hold reference to @from */
 974static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
 975{
 976        rt->rt6i_flags &= ~RTF_EXPIRES;
 977        rcu_assign_pointer(rt->from, from);
 978        ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
 979}
 980
 981/* Caller must already hold reference to @ort */
 982static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
 983{
 984        struct net_device *dev = fib6_info_nh_dev(ort);
 985
 986        ip6_rt_init_dst(rt, ort);
 987
 988        rt->rt6i_dst = ort->fib6_dst;
 989        rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
 990        rt->rt6i_gateway = ort->fib6_nh.nh_gw;
 991        rt->rt6i_flags = ort->fib6_flags;
 992        rt6_set_from(rt, ort);
 993#ifdef CONFIG_IPV6_SUBTREES
 994        rt->rt6i_src = ort->fib6_src;
 995#endif
 996}
 997
 998static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
 999                                        struct in6_addr *saddr)
1000{
1001        struct fib6_node *pn, *sn;
1002        while (1) {
1003                if (fn->fn_flags & RTN_TL_ROOT)
1004                        return NULL;
1005                pn = rcu_dereference(fn->parent);
1006                sn = FIB6_SUBTREE(pn);
1007                if (sn && sn != fn)
1008                        fn = fib6_node_lookup(sn, NULL, saddr);
1009                else
1010                        fn = pn;
1011                if (fn->fn_flags & RTN_RTINFO)
1012                        return fn;
1013        }
1014}
1015
1016static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1017                          bool null_fallback)
1018{
1019        struct rt6_info *rt = *prt;
1020
1021        if (dst_hold_safe(&rt->dst))
1022                return true;
1023        if (null_fallback) {
1024                rt = net->ipv6.ip6_null_entry;
1025                dst_hold(&rt->dst);
1026        } else {
1027                rt = NULL;
1028        }
1029        *prt = rt;
1030        return false;
1031}
1032
1033/* called with rcu_lock held */
1034static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1035{
1036        unsigned short flags = fib6_info_dst_flags(rt);
1037        struct net_device *dev = rt->fib6_nh.nh_dev;
1038        struct rt6_info *nrt;
1039
1040        if (!fib6_info_hold_safe(rt))
1041                return NULL;
1042
1043        nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1044        if (nrt)
1045                ip6_rt_copy_init(nrt, rt);
1046        else
1047                fib6_info_release(rt);
1048
1049        return nrt;
1050}
1051
1052static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1053                                             struct fib6_table *table,
1054                                             struct flowi6 *fl6,
1055                                             const struct sk_buff *skb,
1056                                             int flags)
1057{
1058        struct fib6_info *f6i;
1059        struct fib6_node *fn;
1060        struct rt6_info *rt;
1061
1062        if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1063                flags &= ~RT6_LOOKUP_F_IFACE;
1064
1065        rcu_read_lock();
1066        fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1067restart:
1068        f6i = rcu_dereference(fn->leaf);
1069        if (!f6i) {
1070                f6i = net->ipv6.fib6_null_entry;
1071        } else {
1072                f6i = rt6_device_match(net, f6i, &fl6->saddr,
1073                                      fl6->flowi6_oif, flags);
1074                if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1075                        f6i = fib6_multipath_select(net, f6i, fl6,
1076                                                    fl6->flowi6_oif, skb,
1077                                                    flags);
1078        }
1079        if (f6i == net->ipv6.fib6_null_entry) {
1080                fn = fib6_backtrack(fn, &fl6->saddr);
1081                if (fn)
1082                        goto restart;
1083        }
1084
1085        trace_fib6_table_lookup(net, f6i, table, fl6);
1086
1087        /* Search through exception table */
1088        rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1089        if (rt) {
1090                if (ip6_hold_safe(net, &rt, true))
1091                        dst_use_noref(&rt->dst, jiffies);
1092        } else if (f6i == net->ipv6.fib6_null_entry) {
1093                rt = net->ipv6.ip6_null_entry;
1094                dst_hold(&rt->dst);
1095        } else {
1096                rt = ip6_create_rt_rcu(f6i);
1097                if (!rt) {
1098                        rt = net->ipv6.ip6_null_entry;
1099                        dst_hold(&rt->dst);
1100                }
1101        }
1102
1103        rcu_read_unlock();
1104
1105        return rt;
1106}
1107
1108struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1109                                   const struct sk_buff *skb, int flags)
1110{
1111        return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1112}
1113EXPORT_SYMBOL_GPL(ip6_route_lookup);
1114
1115struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1116                            const struct in6_addr *saddr, int oif,
1117                            const struct sk_buff *skb, int strict)
1118{
1119        struct flowi6 fl6 = {
1120                .flowi6_oif = oif,
1121                .daddr = *daddr,
1122        };
1123        struct dst_entry *dst;
1124        int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1125
1126        if (saddr) {
1127                memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1128                flags |= RT6_LOOKUP_F_HAS_SADDR;
1129        }
1130
1131        dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1132        if (dst->error == 0)
1133                return (struct rt6_info *) dst;
1134
1135        dst_release(dst);
1136
1137        return NULL;
1138}
1139EXPORT_SYMBOL(rt6_lookup);
1140
1141/* ip6_ins_rt is called with FREE table->tb6_lock.
1142 * It takes new route entry, the addition fails by any reason the
1143 * route is released.
1144 * Caller must hold dst before calling it.
1145 */
1146
1147static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1148                        struct netlink_ext_ack *extack)
1149{
1150        int err;
1151        struct fib6_table *table;
1152
1153        table = rt->fib6_table;
1154        spin_lock_bh(&table->tb6_lock);
1155        err = fib6_add(&table->tb6_root, rt, info, extack);
1156        spin_unlock_bh(&table->tb6_lock);
1157
1158        return err;
1159}
1160
1161int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1162{
1163        struct nl_info info = { .nl_net = net, };
1164
1165        return __ip6_ins_rt(rt, &info, NULL);
1166}
1167
1168static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1169                                           const struct in6_addr *daddr,
1170                                           const struct in6_addr *saddr)
1171{
1172        struct net_device *dev;
1173        struct rt6_info *rt;
1174
1175        /*
1176         *      Clone the route.
1177         */
1178
1179        if (!fib6_info_hold_safe(ort))
1180                return NULL;
1181
1182        dev = ip6_rt_get_dev_rcu(ort);
1183        rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1184        if (!rt) {
1185                fib6_info_release(ort);
1186                return NULL;
1187        }
1188
1189        ip6_rt_copy_init(rt, ort);
1190        rt->rt6i_flags |= RTF_CACHE;
1191        rt->dst.flags |= DST_HOST;
1192        rt->rt6i_dst.addr = *daddr;
1193        rt->rt6i_dst.plen = 128;
1194
1195        if (!rt6_is_gw_or_nonexthop(ort)) {
1196                if (ort->fib6_dst.plen != 128 &&
1197                    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1198                        rt->rt6i_flags |= RTF_ANYCAST;
1199#ifdef CONFIG_IPV6_SUBTREES
1200                if (rt->rt6i_src.plen && saddr) {
1201                        rt->rt6i_src.addr = *saddr;
1202                        rt->rt6i_src.plen = 128;
1203                }
1204#endif
1205        }
1206
1207        return rt;
1208}
1209
1210static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1211{
1212        unsigned short flags = fib6_info_dst_flags(rt);
1213        struct net_device *dev;
1214        struct rt6_info *pcpu_rt;
1215
1216        if (!fib6_info_hold_safe(rt))
1217                return NULL;
1218
1219        rcu_read_lock();
1220        dev = ip6_rt_get_dev_rcu(rt);
1221        pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1222        rcu_read_unlock();
1223        if (!pcpu_rt) {
1224                fib6_info_release(rt);
1225                return NULL;
1226        }
1227        ip6_rt_copy_init(pcpu_rt, rt);
1228        pcpu_rt->rt6i_flags |= RTF_PCPU;
1229        return pcpu_rt;
1230}
1231
1232/* It should be called with rcu_read_lock() acquired */
1233static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1234{
1235        struct rt6_info *pcpu_rt, **p;
1236
1237        p = this_cpu_ptr(rt->rt6i_pcpu);
1238        pcpu_rt = *p;
1239
1240        if (pcpu_rt)
1241                ip6_hold_safe(NULL, &pcpu_rt, false);
1242
1243        return pcpu_rt;
1244}
1245
1246static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1247                                            struct fib6_info *rt)
1248{
1249        struct rt6_info *pcpu_rt, *prev, **p;
1250
1251        pcpu_rt = ip6_rt_pcpu_alloc(rt);
1252        if (!pcpu_rt) {
1253                dst_hold(&net->ipv6.ip6_null_entry->dst);
1254                return net->ipv6.ip6_null_entry;
1255        }
1256
1257        dst_hold(&pcpu_rt->dst);
1258        p = this_cpu_ptr(rt->rt6i_pcpu);
1259        prev = cmpxchg(p, NULL, pcpu_rt);
1260        BUG_ON(prev);
1261
1262        return pcpu_rt;
1263}
1264
1265/* exception hash table implementation
1266 */
1267static DEFINE_SPINLOCK(rt6_exception_lock);
1268
1269/* Remove rt6_ex from hash table and free the memory
1270 * Caller must hold rt6_exception_lock
1271 */
1272static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1273                                 struct rt6_exception *rt6_ex)
1274{
1275        struct net *net;
1276
1277        if (!bucket || !rt6_ex)
1278                return;
1279
1280        net = dev_net(rt6_ex->rt6i->dst.dev);
1281        hlist_del_rcu(&rt6_ex->hlist);
1282        dst_release(&rt6_ex->rt6i->dst);
1283        kfree_rcu(rt6_ex, rcu);
1284        WARN_ON_ONCE(!bucket->depth);
1285        bucket->depth--;
1286        net->ipv6.rt6_stats->fib_rt_cache--;
1287}
1288
1289/* Remove oldest rt6_ex in bucket and free the memory
1290 * Caller must hold rt6_exception_lock
1291 */
1292static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1293{
1294        struct rt6_exception *rt6_ex, *oldest = NULL;
1295
1296        if (!bucket)
1297                return;
1298
1299        hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1300                if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1301                        oldest = rt6_ex;
1302        }
1303        rt6_remove_exception(bucket, oldest);
1304}
1305
1306static u32 rt6_exception_hash(const struct in6_addr *dst,
1307                              const struct in6_addr *src)
1308{
1309        static u32 seed __read_mostly;
1310        u32 val;
1311
1312        net_get_random_once(&seed, sizeof(seed));
1313        val = jhash(dst, sizeof(*dst), seed);
1314
1315#ifdef CONFIG_IPV6_SUBTREES
1316        if (src)
1317                val = jhash(src, sizeof(*src), val);
1318#endif
1319        return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1320}
1321
1322/* Helper function to find the cached rt in the hash table
1323 * and update bucket pointer to point to the bucket for this
1324 * (daddr, saddr) pair
1325 * Caller must hold rt6_exception_lock
1326 */
1327static struct rt6_exception *
1328__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1329                              const struct in6_addr *daddr,
1330                              const struct in6_addr *saddr)
1331{
1332        struct rt6_exception *rt6_ex;
1333        u32 hval;
1334
1335        if (!(*bucket) || !daddr)
1336                return NULL;
1337
1338        hval = rt6_exception_hash(daddr, saddr);
1339        *bucket += hval;
1340
1341        hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1342                struct rt6_info *rt6 = rt6_ex->rt6i;
1343                bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1344
1345#ifdef CONFIG_IPV6_SUBTREES
1346                if (matched && saddr)
1347                        matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1348#endif
1349                if (matched)
1350                        return rt6_ex;
1351        }
1352        return NULL;
1353}
1354
1355/* Helper function to find the cached rt in the hash table
1356 * and update bucket pointer to point to the bucket for this
1357 * (daddr, saddr) pair
1358 * Caller must hold rcu_read_lock()
1359 */
1360static struct rt6_exception *
1361__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1362                         const struct in6_addr *daddr,
1363                         const struct in6_addr *saddr)
1364{
1365        struct rt6_exception *rt6_ex;
1366        u32 hval;
1367
1368        WARN_ON_ONCE(!rcu_read_lock_held());
1369
1370        if (!(*bucket) || !daddr)
1371                return NULL;
1372
1373        hval = rt6_exception_hash(daddr, saddr);
1374        *bucket += hval;
1375
1376        hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1377                struct rt6_info *rt6 = rt6_ex->rt6i;
1378                bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1379
1380#ifdef CONFIG_IPV6_SUBTREES
1381                if (matched && saddr)
1382                        matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1383#endif
1384                if (matched)
1385                        return rt6_ex;
1386        }
1387        return NULL;
1388}
1389
1390static unsigned int fib6_mtu(const struct fib6_info *rt)
1391{
1392        unsigned int mtu;
1393
1394        if (rt->fib6_pmtu) {
1395                mtu = rt->fib6_pmtu;
1396        } else {
1397                struct net_device *dev = fib6_info_nh_dev(rt);
1398                struct inet6_dev *idev;
1399
1400                rcu_read_lock();
1401                idev = __in6_dev_get(dev);
1402                mtu = idev->cnf.mtu6;
1403                rcu_read_unlock();
1404        }
1405
1406        mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1407
1408        return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1409}
1410
1411static int rt6_insert_exception(struct rt6_info *nrt,
1412                                struct fib6_info *ort)
1413{
1414        struct net *net = dev_net(nrt->dst.dev);
1415        struct rt6_exception_bucket *bucket;
1416        struct in6_addr *src_key = NULL;
1417        struct rt6_exception *rt6_ex;
1418        int err = 0;
1419
1420        spin_lock_bh(&rt6_exception_lock);
1421
1422        if (ort->exception_bucket_flushed) {
1423                err = -EINVAL;
1424                goto out;
1425        }
1426
1427        bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1428                                        lockdep_is_held(&rt6_exception_lock));
1429        if (!bucket) {
1430                bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1431                                 GFP_ATOMIC);
1432                if (!bucket) {
1433                        err = -ENOMEM;
1434                        goto out;
1435                }
1436                rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1437        }
1438
1439#ifdef CONFIG_IPV6_SUBTREES
1440        /* rt6i_src.plen != 0 indicates ort is in subtree
1441         * and exception table is indexed by a hash of
1442         * both rt6i_dst and rt6i_src.
1443         * Otherwise, the exception table is indexed by
1444         * a hash of only rt6i_dst.
1445         */
1446        if (ort->fib6_src.plen)
1447                src_key = &nrt->rt6i_src.addr;
1448#endif
1449        /* rt6_mtu_change() might lower mtu on ort.
1450         * Only insert this exception route if its mtu
1451         * is less than ort's mtu value.
1452         */
1453        if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1454                err = -EINVAL;
1455                goto out;
1456        }
1457
1458        rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1459                                               src_key);
1460        if (rt6_ex)
1461                rt6_remove_exception(bucket, rt6_ex);
1462
1463        rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1464        if (!rt6_ex) {
1465                err = -ENOMEM;
1466                goto out;
1467        }
1468        rt6_ex->rt6i = nrt;
1469        rt6_ex->stamp = jiffies;
1470        hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1471        bucket->depth++;
1472        net->ipv6.rt6_stats->fib_rt_cache++;
1473
1474        if (bucket->depth > FIB6_MAX_DEPTH)
1475                rt6_exception_remove_oldest(bucket);
1476
1477out:
1478        spin_unlock_bh(&rt6_exception_lock);
1479
1480        /* Update fn->fn_sernum to invalidate all cached dst */
1481        if (!err) {
1482                spin_lock_bh(&ort->fib6_table->tb6_lock);
1483                fib6_update_sernum(net, ort);
1484                spin_unlock_bh(&ort->fib6_table->tb6_lock);
1485                fib6_force_start_gc(net);
1486        }
1487
1488        return err;
1489}
1490
1491void rt6_flush_exceptions(struct fib6_info *rt)
1492{
1493        struct rt6_exception_bucket *bucket;
1494        struct rt6_exception *rt6_ex;
1495        struct hlist_node *tmp;
1496        int i;
1497
1498        spin_lock_bh(&rt6_exception_lock);
1499        /* Prevent rt6_insert_exception() to recreate the bucket list */
1500        rt->exception_bucket_flushed = 1;
1501
1502        bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1503                                    lockdep_is_held(&rt6_exception_lock));
1504        if (!bucket)
1505                goto out;
1506
1507        for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1508                hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1509                        rt6_remove_exception(bucket, rt6_ex);
1510                WARN_ON_ONCE(bucket->depth);
1511                bucket++;
1512        }
1513
1514out:
1515        spin_unlock_bh(&rt6_exception_lock);
1516}
1517
1518/* Find cached rt in the hash table inside passed in rt
1519 * Caller has to hold rcu_read_lock()
1520 */
1521static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1522                                           struct in6_addr *daddr,
1523                                           struct in6_addr *saddr)
1524{
1525        struct rt6_exception_bucket *bucket;
1526        struct in6_addr *src_key = NULL;
1527        struct rt6_exception *rt6_ex;
1528        struct rt6_info *res = NULL;
1529
1530        bucket = rcu_dereference(rt->rt6i_exception_bucket);
1531
1532#ifdef CONFIG_IPV6_SUBTREES
1533        /* rt6i_src.plen != 0 indicates rt is in subtree
1534         * and exception table is indexed by a hash of
1535         * both rt6i_dst and rt6i_src.
1536         * Otherwise, the exception table is indexed by
1537         * a hash of only rt6i_dst.
1538         */
1539        if (rt->fib6_src.plen)
1540                src_key = saddr;
1541#endif
1542        rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1543
1544        if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1545                res = rt6_ex->rt6i;
1546
1547        return res;
1548}
1549
1550/* Remove the passed in cached rt from the hash table that contains it */
1551static int rt6_remove_exception_rt(struct rt6_info *rt)
1552{
1553        struct rt6_exception_bucket *bucket;
1554        struct in6_addr *src_key = NULL;
1555        struct rt6_exception *rt6_ex;
1556        struct fib6_info *from;
1557        int err;
1558
1559        from = rcu_dereference(rt->from);
1560        if (!from ||
1561            !(rt->rt6i_flags & RTF_CACHE))
1562                return -EINVAL;
1563
1564        if (!rcu_access_pointer(from->rt6i_exception_bucket))
1565                return -ENOENT;
1566
1567        spin_lock_bh(&rt6_exception_lock);
1568        bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1569                                    lockdep_is_held(&rt6_exception_lock));
1570#ifdef CONFIG_IPV6_SUBTREES
1571        /* rt6i_src.plen != 0 indicates 'from' is in subtree
1572         * and exception table is indexed by a hash of
1573         * both rt6i_dst and rt6i_src.
1574         * Otherwise, the exception table is indexed by
1575         * a hash of only rt6i_dst.
1576         */
1577        if (from->fib6_src.plen)
1578                src_key = &rt->rt6i_src.addr;
1579#endif
1580        rt6_ex = __rt6_find_exception_spinlock(&bucket,
1581                                               &rt->rt6i_dst.addr,
1582                                               src_key);
1583        if (rt6_ex) {
1584                rt6_remove_exception(bucket, rt6_ex);
1585                err = 0;
1586        } else {
1587                err = -ENOENT;
1588        }
1589
1590        spin_unlock_bh(&rt6_exception_lock);
1591        return err;
1592}
1593
1594/* Find rt6_ex which contains the passed in rt cache and
1595 * refresh its stamp
1596 */
1597static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1598{
1599        struct rt6_exception_bucket *bucket;
1600        struct fib6_info *from = rt->from;
1601        struct in6_addr *src_key = NULL;
1602        struct rt6_exception *rt6_ex;
1603
1604        if (!from ||
1605            !(rt->rt6i_flags & RTF_CACHE))
1606                return;
1607
1608        rcu_read_lock();
1609        bucket = rcu_dereference(from->rt6i_exception_bucket);
1610
1611#ifdef CONFIG_IPV6_SUBTREES
1612        /* rt6i_src.plen != 0 indicates 'from' is in subtree
1613         * and exception table is indexed by a hash of
1614         * both rt6i_dst and rt6i_src.
1615         * Otherwise, the exception table is indexed by
1616         * a hash of only rt6i_dst.
1617         */
1618        if (from->fib6_src.plen)
1619                src_key = &rt->rt6i_src.addr;
1620#endif
1621        rt6_ex = __rt6_find_exception_rcu(&bucket,
1622                                          &rt->rt6i_dst.addr,
1623                                          src_key);
1624        if (rt6_ex)
1625                rt6_ex->stamp = jiffies;
1626
1627        rcu_read_unlock();
1628}
1629
1630static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1631                                         struct rt6_info *rt, int mtu)
1632{
1633        /* If the new MTU is lower than the route PMTU, this new MTU will be the
1634         * lowest MTU in the path: always allow updating the route PMTU to
1635         * reflect PMTU decreases.
1636         *
1637         * If the new MTU is higher, and the route PMTU is equal to the local
1638         * MTU, this means the old MTU is the lowest in the path, so allow
1639         * updating it: if other nodes now have lower MTUs, PMTU discovery will
1640         * handle this.
1641         */
1642
1643        if (dst_mtu(&rt->dst) >= mtu)
1644                return true;
1645
1646        if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1647                return true;
1648
1649        return false;
1650}
1651
1652static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1653                                       struct fib6_info *rt, int mtu)
1654{
1655        struct rt6_exception_bucket *bucket;
1656        struct rt6_exception *rt6_ex;
1657        int i;
1658
1659        bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1660                                        lockdep_is_held(&rt6_exception_lock));
1661
1662        if (!bucket)
1663                return;
1664
1665        for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1666                hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1667                        struct rt6_info *entry = rt6_ex->rt6i;
1668
1669                        /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1670                         * route), the metrics of its rt->from have already
1671                         * been updated.
1672                         */
1673                        if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1674                            rt6_mtu_change_route_allowed(idev, entry, mtu))
1675                                dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1676                }
1677                bucket++;
1678        }
1679}
1680
1681#define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1682
1683static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1684                                        struct in6_addr *gateway)
1685{
1686        struct rt6_exception_bucket *bucket;
1687        struct rt6_exception *rt6_ex;
1688        struct hlist_node *tmp;
1689        int i;
1690
1691        if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1692                return;
1693
1694        spin_lock_bh(&rt6_exception_lock);
1695        bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1696                                     lockdep_is_held(&rt6_exception_lock));
1697
1698        if (bucket) {
1699                for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1700                        hlist_for_each_entry_safe(rt6_ex, tmp,
1701                                                  &bucket->chain, hlist) {
1702                                struct rt6_info *entry = rt6_ex->rt6i;
1703
1704                                if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1705                                    RTF_CACHE_GATEWAY &&
1706                                    ipv6_addr_equal(gateway,
1707                                                    &entry->rt6i_gateway)) {
1708                                        rt6_remove_exception(bucket, rt6_ex);
1709                                }
1710                        }
1711                        bucket++;
1712                }
1713        }
1714
1715        spin_unlock_bh(&rt6_exception_lock);
1716}
1717
1718static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1719                                      struct rt6_exception *rt6_ex,
1720                                      struct fib6_gc_args *gc_args,
1721                                      unsigned long now)
1722{
1723        struct rt6_info *rt = rt6_ex->rt6i;
1724
1725        /* we are pruning and obsoleting aged-out and non gateway exceptions
1726         * even if others have still references to them, so that on next
1727         * dst_check() such references can be dropped.
1728         * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1729         * expired, independently from their aging, as per RFC 8201 section 4
1730         */
1731        if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1732                if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1733                        RT6_TRACE("aging clone %p\n", rt);
1734                        rt6_remove_exception(bucket, rt6_ex);
1735                        return;
1736                }
1737        } else if (time_after(jiffies, rt->dst.expires)) {
1738                RT6_TRACE("purging expired route %p\n", rt);
1739                rt6_remove_exception(bucket, rt6_ex);
1740                return;
1741        }
1742
1743        if (rt->rt6i_flags & RTF_GATEWAY) {
1744                struct neighbour *neigh;
1745                __u8 neigh_flags = 0;
1746
1747                neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1748                if (neigh)
1749                        neigh_flags = neigh->flags;
1750
1751                if (!(neigh_flags & NTF_ROUTER)) {
1752                        RT6_TRACE("purging route %p via non-router but gateway\n",
1753                                  rt);
1754                        rt6_remove_exception(bucket, rt6_ex);
1755                        return;
1756                }
1757        }
1758
1759        gc_args->more++;
1760}
1761
1762void rt6_age_exceptions(struct fib6_info *rt,
1763                        struct fib6_gc_args *gc_args,
1764                        unsigned long now)
1765{
1766        struct rt6_exception_bucket *bucket;
1767        struct rt6_exception *rt6_ex;
1768        struct hlist_node *tmp;
1769        int i;
1770
1771        if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1772                return;
1773
1774        rcu_read_lock_bh();
1775        spin_lock(&rt6_exception_lock);
1776        bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1777                                    lockdep_is_held(&rt6_exception_lock));
1778
1779        if (bucket) {
1780                for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1781                        hlist_for_each_entry_safe(rt6_ex, tmp,
1782                                                  &bucket->chain, hlist) {
1783                                rt6_age_examine_exception(bucket, rt6_ex,
1784                                                          gc_args, now);
1785                        }
1786                        bucket++;
1787                }
1788        }
1789        spin_unlock(&rt6_exception_lock);
1790        rcu_read_unlock_bh();
1791}
1792
1793/* must be called with rcu lock held */
1794struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1795                                    int oif, struct flowi6 *fl6, int strict)
1796{
1797        struct fib6_node *fn, *saved_fn;
1798        struct fib6_info *f6i;
1799
1800        fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1801        saved_fn = fn;
1802
1803        if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1804                oif = 0;
1805
1806redo_rt6_select:
1807        f6i = rt6_select(net, fn, oif, strict);
1808        if (f6i == net->ipv6.fib6_null_entry) {
1809                fn = fib6_backtrack(fn, &fl6->saddr);
1810                if (fn)
1811                        goto redo_rt6_select;
1812                else if (strict & RT6_LOOKUP_F_REACHABLE) {
1813                        /* also consider unreachable route */
1814                        strict &= ~RT6_LOOKUP_F_REACHABLE;
1815                        fn = saved_fn;
1816                        goto redo_rt6_select;
1817                }
1818        }
1819
1820        trace_fib6_table_lookup(net, f6i, table, fl6);
1821
1822        return f6i;
1823}
1824
1825struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1826                               int oif, struct flowi6 *fl6,
1827                               const struct sk_buff *skb, int flags)
1828{
1829        struct fib6_info *f6i;
1830        struct rt6_info *rt;
1831        int strict = 0;
1832
1833        strict |= flags & RT6_LOOKUP_F_IFACE;
1834        strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1835        if (net->ipv6.devconf_all->forwarding == 0)
1836                strict |= RT6_LOOKUP_F_REACHABLE;
1837
1838        rcu_read_lock();
1839
1840        f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1841        if (f6i->fib6_nsiblings)
1842                f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1843
1844        if (f6i == net->ipv6.fib6_null_entry) {
1845                rt = net->ipv6.ip6_null_entry;
1846                rcu_read_unlock();
1847                dst_hold(&rt->dst);
1848                return rt;
1849        }
1850
1851        /*Search through exception table */
1852        rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1853        if (rt) {
1854                if (ip6_hold_safe(net, &rt, true))
1855                        dst_use_noref(&rt->dst, jiffies);
1856
1857                rcu_read_unlock();
1858                return rt;
1859        } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1860                            !(f6i->fib6_flags & RTF_GATEWAY))) {
1861                /* Create a RTF_CACHE clone which will not be
1862                 * owned by the fib6 tree.  It is for the special case where
1863                 * the daddr in the skb during the neighbor look-up is different
1864                 * from the fl6->daddr used to look-up route here.
1865                 */
1866                struct rt6_info *uncached_rt;
1867
1868                uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1869
1870                rcu_read_unlock();
1871
1872                if (uncached_rt) {
1873                        /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1874                         * No need for another dst_hold()
1875                         */
1876                        rt6_uncached_list_add(uncached_rt);
1877                        atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1878                } else {
1879                        uncached_rt = net->ipv6.ip6_null_entry;
1880                        dst_hold(&uncached_rt->dst);
1881                }
1882
1883                return uncached_rt;
1884        } else {
1885                /* Get a percpu copy */
1886
1887                struct rt6_info *pcpu_rt;
1888
1889                local_bh_disable();
1890                pcpu_rt = rt6_get_pcpu_route(f6i);
1891
1892                if (!pcpu_rt)
1893                        pcpu_rt = rt6_make_pcpu_route(net, f6i);
1894
1895                local_bh_enable();
1896                rcu_read_unlock();
1897
1898                return pcpu_rt;
1899        }
1900}
1901EXPORT_SYMBOL_GPL(ip6_pol_route);
1902
1903static struct rt6_info *ip6_pol_route_input(struct net *net,
1904                                            struct fib6_table *table,
1905                                            struct flowi6 *fl6,
1906                                            const struct sk_buff *skb,
1907                                            int flags)
1908{
1909        return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1910}
1911
1912struct dst_entry *ip6_route_input_lookup(struct net *net,
1913                                         struct net_device *dev,
1914                                         struct flowi6 *fl6,
1915                                         const struct sk_buff *skb,
1916                                         int flags)
1917{
1918        if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1919                flags |= RT6_LOOKUP_F_IFACE;
1920
1921        return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1922}
1923EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1924
1925static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1926                                  struct flow_keys *keys,
1927                                  struct flow_keys *flkeys)
1928{
1929        const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1930        const struct ipv6hdr *key_iph = outer_iph;
1931        struct flow_keys *_flkeys = flkeys;
1932        const struct ipv6hdr *inner_iph;
1933        const struct icmp6hdr *icmph;
1934        struct ipv6hdr _inner_iph;
1935        struct icmp6hdr _icmph;
1936
1937        if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1938                goto out;
1939
1940        icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1941                                   sizeof(_icmph), &_icmph);
1942        if (!icmph)
1943                goto out;
1944
1945        if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1946            icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1947            icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1948            icmph->icmp6_type != ICMPV6_PARAMPROB)
1949                goto out;
1950
1951        inner_iph = skb_header_pointer(skb,
1952                                       skb_transport_offset(skb) + sizeof(*icmph),
1953                                       sizeof(_inner_iph), &_inner_iph);
1954        if (!inner_iph)
1955                goto out;
1956
1957        key_iph = inner_iph;
1958        _flkeys = NULL;
1959out:
1960        if (_flkeys) {
1961                keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1962                keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1963                keys->tags.flow_label = _flkeys->tags.flow_label;
1964                keys->basic.ip_proto = _flkeys->basic.ip_proto;
1965        } else {
1966                keys->addrs.v6addrs.src = key_iph->saddr;
1967                keys->addrs.v6addrs.dst = key_iph->daddr;
1968                keys->tags.flow_label = ip6_flowlabel(key_iph);
1969                keys->basic.ip_proto = key_iph->nexthdr;
1970        }
1971}
1972
1973/* if skb is set it will be used and fl6 can be NULL */
1974u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1975                       const struct sk_buff *skb, struct flow_keys *flkeys)
1976{
1977        struct flow_keys hash_keys;
1978        u32 mhash;
1979
1980        switch (ip6_multipath_hash_policy(net)) {
1981        case 0:
1982                memset(&hash_keys, 0, sizeof(hash_keys));
1983                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1984                if (skb) {
1985                        ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1986                } else {
1987                        hash_keys.addrs.v6addrs.src = fl6->saddr;
1988                        hash_keys.addrs.v6addrs.dst = fl6->daddr;
1989                        hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
1990                        hash_keys.basic.ip_proto = fl6->flowi6_proto;
1991                }
1992                break;
1993        case 1:
1994                if (skb) {
1995                        unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1996                        struct flow_keys keys;
1997
1998                        /* short-circuit if we already have L4 hash present */
1999                        if (skb->l4_hash)
2000                                return skb_get_hash_raw(skb) >> 1;
2001
2002                        memset(&hash_keys, 0, sizeof(hash_keys));
2003
2004                        if (!flkeys) {
2005                                skb_flow_dissect_flow_keys(skb, &keys, flag);
2006                                flkeys = &keys;
2007                        }
2008                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2009                        hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2010                        hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2011                        hash_keys.ports.src = flkeys->ports.src;
2012                        hash_keys.ports.dst = flkeys->ports.dst;
2013                        hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2014                } else {
2015                        memset(&hash_keys, 0, sizeof(hash_keys));
2016                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2017                        hash_keys.addrs.v6addrs.src = fl6->saddr;
2018                        hash_keys.addrs.v6addrs.dst = fl6->daddr;
2019                        hash_keys.ports.src = fl6->fl6_sport;
2020                        hash_keys.ports.dst = fl6->fl6_dport;
2021                        hash_keys.basic.ip_proto = fl6->flowi6_proto;
2022                }
2023                break;
2024        }
2025        mhash = flow_hash_from_keys(&hash_keys);
2026
2027        return mhash >> 1;
2028}
2029
2030void ip6_route_input(struct sk_buff *skb)
2031{
2032        const struct ipv6hdr *iph = ipv6_hdr(skb);
2033        struct net *net = dev_net(skb->dev);
2034        int flags = RT6_LOOKUP_F_HAS_SADDR;
2035        struct ip_tunnel_info *tun_info;
2036        struct flowi6 fl6 = {
2037                .flowi6_iif = skb->dev->ifindex,
2038                .daddr = iph->daddr,
2039                .saddr = iph->saddr,
2040                .flowlabel = ip6_flowinfo(iph),
2041                .flowi6_mark = skb->mark,
2042                .flowi6_proto = iph->nexthdr,
2043        };
2044        struct flow_keys *flkeys = NULL, _flkeys;
2045
2046        tun_info = skb_tunnel_info(skb);
2047        if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2048                fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2049
2050        if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2051                flkeys = &_flkeys;
2052
2053        if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2054                fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2055        skb_dst_drop(skb);
2056        skb_dst_set(skb,
2057                    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2058}
2059
2060static struct rt6_info *ip6_pol_route_output(struct net *net,
2061                                             struct fib6_table *table,
2062                                             struct flowi6 *fl6,
2063                                             const struct sk_buff *skb,
2064                                             int flags)
2065{
2066        return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2067}
2068
2069struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2070                                         struct flowi6 *fl6, int flags)
2071{
2072        bool any_src;
2073
2074        if (ipv6_addr_type(&fl6->daddr) &
2075            (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2076                struct dst_entry *dst;
2077
2078                dst = l3mdev_link_scope_lookup(net, fl6);
2079                if (dst)
2080                        return dst;
2081        }
2082
2083        fl6->flowi6_iif = LOOPBACK_IFINDEX;
2084
2085        any_src = ipv6_addr_any(&fl6->saddr);
2086        if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2087            (fl6->flowi6_oif && any_src))
2088                flags |= RT6_LOOKUP_F_IFACE;
2089
2090        if (!any_src)
2091                flags |= RT6_LOOKUP_F_HAS_SADDR;
2092        else if (sk)
2093                flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2094
2095        return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2096}
2097EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2098
2099struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2100{
2101        struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2102        struct net_device *loopback_dev = net->loopback_dev;
2103        struct dst_entry *new = NULL;
2104
2105        rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2106                       DST_OBSOLETE_DEAD, 0);
2107        if (rt) {
2108                rt6_info_init(rt);
2109                atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2110
2111                new = &rt->dst;
2112                new->__use = 1;
2113                new->input = dst_discard;
2114                new->output = dst_discard_out;
2115
2116                dst_copy_metrics(new, &ort->dst);
2117
2118                rt->rt6i_idev = in6_dev_get(loopback_dev);
2119                rt->rt6i_gateway = ort->rt6i_gateway;
2120                rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2121
2122                memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2123#ifdef CONFIG_IPV6_SUBTREES
2124                memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2125#endif
2126        }
2127
2128        dst_release(dst_orig);
2129        return new ? new : ERR_PTR(-ENOMEM);
2130}
2131
2132/*
2133 *      Destination cache support functions
2134 */
2135
2136static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2137{
2138        u32 rt_cookie = 0;
2139
2140        if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2141                return false;
2142
2143        if (fib6_check_expired(f6i))
2144                return false;
2145
2146        return true;
2147}
2148
2149static struct dst_entry *rt6_check(struct rt6_info *rt,
2150                                   struct fib6_info *from,
2151                                   u32 cookie)
2152{
2153        u32 rt_cookie = 0;
2154
2155        if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2156            rt_cookie != cookie)
2157                return NULL;
2158
2159        if (rt6_check_expired(rt))
2160                return NULL;
2161
2162        return &rt->dst;
2163}
2164
2165static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2166                                            struct fib6_info *from,
2167                                            u32 cookie)
2168{
2169        if (!__rt6_check_expired(rt) &&
2170            rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2171            fib6_check(from, cookie))
2172                return &rt->dst;
2173        else
2174                return NULL;
2175}
2176
2177static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2178{
2179        struct dst_entry *dst_ret;
2180        struct fib6_info *from;
2181        struct rt6_info *rt;
2182
2183        rt = container_of(dst, struct rt6_info, dst);
2184
2185        rcu_read_lock();
2186
2187        /* All IPV6 dsts are created with ->obsolete set to the value
2188         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2189         * into this function always.
2190         */
2191
2192        from = rcu_dereference(rt->from);
2193
2194        if (from && (rt->rt6i_flags & RTF_PCPU ||
2195            unlikely(!list_empty(&rt->rt6i_uncached))))
2196                dst_ret = rt6_dst_from_check(rt, from, cookie);
2197        else
2198                dst_ret = rt6_check(rt, from, cookie);
2199
2200        rcu_read_unlock();
2201
2202        return dst_ret;
2203}
2204
2205static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2206{
2207        struct rt6_info *rt = (struct rt6_info *) dst;
2208
2209        if (rt) {
2210                if (rt->rt6i_flags & RTF_CACHE) {
2211                        rcu_read_lock();
2212                        if (rt6_check_expired(rt)) {
2213                                rt6_remove_exception_rt(rt);
2214                                dst = NULL;
2215                        }
2216                        rcu_read_unlock();
2217                } else {
2218                        dst_release(dst);
2219                        dst = NULL;
2220                }
2221        }
2222        return dst;
2223}
2224
2225static void ip6_link_failure(struct sk_buff *skb)
2226{
2227        struct rt6_info *rt;
2228
2229        icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2230
2231        rt = (struct rt6_info *) skb_dst(skb);
2232        if (rt) {
2233                rcu_read_lock();
2234                if (rt->rt6i_flags & RTF_CACHE) {
2235                        rt6_remove_exception_rt(rt);
2236                } else {
2237                        struct fib6_info *from;
2238                        struct fib6_node *fn;
2239
2240                        from = rcu_dereference(rt->from);
2241                        if (from) {
2242                                fn = rcu_dereference(from->fib6_node);
2243                                if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2244                                        fn->fn_sernum = -1;
2245                        }
2246                }
2247                rcu_read_unlock();
2248        }
2249}
2250
2251static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2252{
2253        if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2254                struct fib6_info *from;
2255
2256                rcu_read_lock();
2257                from = rcu_dereference(rt0->from);
2258                if (from)
2259                        rt0->dst.expires = from->expires;
2260                rcu_read_unlock();
2261        }
2262
2263        dst_set_expires(&rt0->dst, timeout);
2264        rt0->rt6i_flags |= RTF_EXPIRES;
2265}
2266
2267static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2268{
2269        struct net *net = dev_net(rt->dst.dev);
2270
2271        dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2272        rt->rt6i_flags |= RTF_MODIFIED;
2273        rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2274}
2275
2276static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2277{
2278        bool from_set;
2279
2280        rcu_read_lock();
2281        from_set = !!rcu_dereference(rt->from);
2282        rcu_read_unlock();
2283
2284        return !(rt->rt6i_flags & RTF_CACHE) &&
2285                (rt->rt6i_flags & RTF_PCPU || from_set);
2286}
2287
2288static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2289                                 const struct ipv6hdr *iph, u32 mtu)
2290{
2291        const struct in6_addr *daddr, *saddr;
2292        struct rt6_info *rt6 = (struct rt6_info *)dst;
2293
2294        if (dst_metric_locked(dst, RTAX_MTU))
2295                return;
2296
2297        if (iph) {
2298                daddr = &iph->daddr;
2299                saddr = &iph->saddr;
2300        } else if (sk) {
2301                daddr = &sk->sk_v6_daddr;
2302                saddr = &inet6_sk(sk)->saddr;
2303        } else {
2304                daddr = NULL;
2305                saddr = NULL;
2306        }
2307        dst_confirm_neigh(dst, daddr);
2308        mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2309        if (mtu >= dst_mtu(dst))
2310                return;
2311
2312        if (!rt6_cache_allowed_for_pmtu(rt6)) {
2313                rt6_do_update_pmtu(rt6, mtu);
2314                /* update rt6_ex->stamp for cache */
2315                if (rt6->rt6i_flags & RTF_CACHE)
2316                        rt6_update_exception_stamp_rt(rt6);
2317        } else if (daddr) {
2318                struct fib6_info *from;
2319                struct rt6_info *nrt6;
2320
2321                rcu_read_lock();
2322                from = rcu_dereference(rt6->from);
2323                nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2324                if (nrt6) {
2325                        rt6_do_update_pmtu(nrt6, mtu);
2326                        if (rt6_insert_exception(nrt6, from))
2327                                dst_release_immediate(&nrt6->dst);
2328                }
2329                rcu_read_unlock();
2330        }
2331}
2332
2333static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2334                               struct sk_buff *skb, u32 mtu)
2335{
2336        __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2337}
2338
2339void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2340                     int oif, u32 mark, kuid_t uid)
2341{
2342        const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2343        struct dst_entry *dst;
2344        struct flowi6 fl6 = {
2345                .flowi6_oif = oif,
2346                .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2347                .daddr = iph->daddr,
2348                .saddr = iph->saddr,
2349                .flowlabel = ip6_flowinfo(iph),
2350                .flowi6_uid = uid,
2351        };
2352
2353        dst = ip6_route_output(net, NULL, &fl6);
2354        if (!dst->error)
2355                __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2356        dst_release(dst);
2357}
2358EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2359
2360void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2361{
2362        int oif = sk->sk_bound_dev_if;
2363        struct dst_entry *dst;
2364
2365        if (!oif && skb->dev)
2366                oif = l3mdev_master_ifindex(skb->dev);
2367
2368        ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2369
2370        dst = __sk_dst_get(sk);
2371        if (!dst || !dst->obsolete ||
2372            dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2373                return;
2374
2375        bh_lock_sock(sk);
2376        if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2377                ip6_datagram_dst_update(sk, false);
2378        bh_unlock_sock(sk);
2379}
2380EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2381
2382void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2383                           const struct flowi6 *fl6)
2384{
2385#ifdef CONFIG_IPV6_SUBTREES
2386        struct ipv6_pinfo *np = inet6_sk(sk);
2387#endif
2388
2389        ip6_dst_store(sk, dst,
2390                      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2391                      &sk->sk_v6_daddr : NULL,
2392#ifdef CONFIG_IPV6_SUBTREES
2393                      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2394                      &np->saddr :
2395#endif
2396                      NULL);
2397}
2398
2399/* Handle redirects */
2400struct ip6rd_flowi {
2401        struct flowi6 fl6;
2402        struct in6_addr gateway;
2403};
2404
2405static struct rt6_info *__ip6_route_redirect(struct net *net,
2406                                             struct fib6_table *table,
2407                                             struct flowi6 *fl6,
2408                                             const struct sk_buff *skb,
2409                                             int flags)
2410{
2411        struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2412        struct rt6_info *ret = NULL, *rt_cache;
2413        struct fib6_info *rt;
2414        struct fib6_node *fn;
2415
2416        /* Get the "current" route for this destination and
2417         * check if the redirect has come from appropriate router.
2418         *
2419         * RFC 4861 specifies that redirects should only be
2420         * accepted if they come from the nexthop to the target.
2421         * Due to the way the routes are chosen, this notion
2422         * is a bit fuzzy and one might need to check all possible
2423         * routes.
2424         */
2425
2426        rcu_read_lock();
2427        fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2428restart:
2429        for_each_fib6_node_rt_rcu(fn) {
2430                if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2431                        continue;
2432                if (fib6_check_expired(rt))
2433                        continue;
2434                if (rt->fib6_flags & RTF_REJECT)
2435                        break;
2436                if (!(rt->fib6_flags & RTF_GATEWAY))
2437                        continue;
2438                if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2439                        continue;
2440                /* rt_cache's gateway might be different from its 'parent'
2441                 * in the case of an ip redirect.
2442                 * So we keep searching in the exception table if the gateway
2443                 * is different.
2444                 */
2445                if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2446                        rt_cache = rt6_find_cached_rt(rt,
2447                                                      &fl6->daddr,
2448                                                      &fl6->saddr);
2449                        if (rt_cache &&
2450                            ipv6_addr_equal(&rdfl->gateway,
2451                                            &rt_cache->rt6i_gateway)) {
2452                                ret = rt_cache;
2453                                break;
2454                        }
2455                        continue;
2456                }
2457                break;
2458        }
2459
2460        if (!rt)
2461                rt = net->ipv6.fib6_null_entry;
2462        else if (rt->fib6_flags & RTF_REJECT) {
2463                ret = net->ipv6.ip6_null_entry;
2464                goto out;
2465        }
2466
2467        if (rt == net->ipv6.fib6_null_entry) {
2468                fn = fib6_backtrack(fn, &fl6->saddr);
2469                if (fn)
2470                        goto restart;
2471        }
2472
2473out:
2474        if (ret)
2475                ip6_hold_safe(net, &ret, true);
2476        else
2477                ret = ip6_create_rt_rcu(rt);
2478
2479        rcu_read_unlock();
2480
2481        trace_fib6_table_lookup(net, rt, table, fl6);
2482        return ret;
2483};
2484
2485static struct dst_entry *ip6_route_redirect(struct net *net,
2486                                            const struct flowi6 *fl6,
2487                                            const struct sk_buff *skb,
2488                                            const struct in6_addr *gateway)
2489{
2490        int flags = RT6_LOOKUP_F_HAS_SADDR;
2491        struct ip6rd_flowi rdfl;
2492
2493        rdfl.fl6 = *fl6;
2494        rdfl.gateway = *gateway;
2495
2496        return fib6_rule_lookup(net, &rdfl.fl6, skb,
2497                                flags, __ip6_route_redirect);
2498}
2499
2500void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2501                  kuid_t uid)
2502{
2503        const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2504        struct dst_entry *dst;
2505        struct flowi6 fl6 = {
2506                .flowi6_iif = LOOPBACK_IFINDEX,
2507                .flowi6_oif = oif,
2508                .flowi6_mark = mark,
2509                .daddr = iph->daddr,
2510                .saddr = iph->saddr,
2511                .flowlabel = ip6_flowinfo(iph),
2512                .flowi6_uid = uid,
2513        };
2514
2515        dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2516        rt6_do_redirect(dst, NULL, skb);
2517        dst_release(dst);
2518}
2519EXPORT_SYMBOL_GPL(ip6_redirect);
2520
2521void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2522{
2523        const struct ipv6hdr *iph = ipv6_hdr(skb);
2524        const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2525        struct dst_entry *dst;
2526        struct flowi6 fl6 = {
2527                .flowi6_iif = LOOPBACK_IFINDEX,
2528                .flowi6_oif = oif,
2529                .daddr = msg->dest,
2530                .saddr = iph->daddr,
2531                .flowi6_uid = sock_net_uid(net, NULL),
2532        };
2533
2534        dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2535        rt6_do_redirect(dst, NULL, skb);
2536        dst_release(dst);
2537}
2538
2539void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2540{
2541        ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2542                     sk->sk_uid);
2543}
2544EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2545
2546static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2547{
2548        struct net_device *dev = dst->dev;
2549        unsigned int mtu = dst_mtu(dst);
2550        struct net *net = dev_net(dev);
2551
2552        mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2553
2554        if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2555                mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2556
2557        /*
2558         * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2559         * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2560         * IPV6_MAXPLEN is also valid and means: "any MSS,
2561         * rely only on pmtu discovery"
2562         */
2563        if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2564                mtu = IPV6_MAXPLEN;
2565        return mtu;
2566}
2567
2568static unsigned int ip6_mtu(const struct dst_entry *dst)
2569{
2570        struct inet6_dev *idev;
2571        unsigned int mtu;
2572
2573        mtu = dst_metric_raw(dst, RTAX_MTU);
2574        if (mtu)
2575                goto out;
2576
2577        mtu = IPV6_MIN_MTU;
2578
2579        rcu_read_lock();
2580        idev = __in6_dev_get(dst->dev);
2581        if (idev)
2582                mtu = idev->cnf.mtu6;
2583        rcu_read_unlock();
2584
2585out:
2586        mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2587
2588        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2589}
2590
2591/* MTU selection:
2592 * 1. mtu on route is locked - use it
2593 * 2. mtu from nexthop exception
2594 * 3. mtu from egress device
2595 *
2596 * based on ip6_dst_mtu_forward and exception logic of
2597 * rt6_find_cached_rt; called with rcu_read_lock
2598 */
2599u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2600                      struct in6_addr *saddr)
2601{
2602        struct rt6_exception_bucket *bucket;
2603        struct rt6_exception *rt6_ex;
2604        struct in6_addr *src_key;
2605        struct inet6_dev *idev;
2606        u32 mtu = 0;
2607
2608        if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2609                mtu = f6i->fib6_pmtu;
2610                if (mtu)
2611                        goto out;
2612        }
2613
2614        src_key = NULL;
2615#ifdef CONFIG_IPV6_SUBTREES
2616        if (f6i->fib6_src.plen)
2617                src_key = saddr;
2618#endif
2619
2620        bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2621        rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2622        if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2623                mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2624
2625        if (likely(!mtu)) {
2626                struct net_device *dev = fib6_info_nh_dev(f6i);
2627
2628                mtu = IPV6_MIN_MTU;
2629                idev = __in6_dev_get(dev);
2630                if (idev && idev->cnf.mtu6 > mtu)
2631                        mtu = idev->cnf.mtu6;
2632        }
2633
2634        mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2635out:
2636        return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2637}
2638
2639struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2640                                  struct flowi6 *fl6)
2641{
2642        struct dst_entry *dst;
2643        struct rt6_info *rt;
2644        struct inet6_dev *idev = in6_dev_get(dev);
2645        struct net *net = dev_net(dev);
2646
2647        if (unlikely(!idev))
2648                return ERR_PTR(-ENODEV);
2649
2650        rt = ip6_dst_alloc(net, dev, 0);
2651        if (unlikely(!rt)) {
2652                in6_dev_put(idev);
2653                dst = ERR_PTR(-ENOMEM);
2654                goto out;
2655        }
2656
2657        rt->dst.flags |= DST_HOST;
2658        rt->dst.input = ip6_input;
2659        rt->dst.output  = ip6_output;
2660        rt->rt6i_gateway  = fl6->daddr;
2661        rt->rt6i_dst.addr = fl6->daddr;
2662        rt->rt6i_dst.plen = 128;
2663        rt->rt6i_idev     = idev;
2664        dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2665
2666        /* Add this dst into uncached_list so that rt6_disable_ip() can
2667         * do proper release of the net_device
2668         */
2669        rt6_uncached_list_add(rt);
2670        atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2671
2672        dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2673
2674out:
2675        return dst;
2676}
2677
2678static int ip6_dst_gc(struct dst_ops *ops)
2679{
2680        struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2681        int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2682        int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2683        int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2684        int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2685        unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2686        int entries;
2687
2688        entries = dst_entries_get_fast(ops);
2689        if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2690            entries <= rt_max_size)
2691                goto out;
2692
2693        net->ipv6.ip6_rt_gc_expire++;
2694        fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2695        entries = dst_entries_get_slow(ops);
2696        if (entries < ops->gc_thresh)
2697                net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2698out:
2699        net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2700        return entries > rt_max_size;
2701}
2702
2703static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2704                                            struct fib6_config *cfg,
2705                                            const struct in6_addr *gw_addr,
2706                                            u32 tbid, int flags)
2707{
2708        struct flowi6 fl6 = {
2709                .flowi6_oif = cfg->fc_ifindex,
2710                .daddr = *gw_addr,
2711                .saddr = cfg->fc_prefsrc,
2712        };
2713        struct fib6_table *table;
2714        struct rt6_info *rt;
2715
2716        table = fib6_get_table(net, tbid);
2717        if (!table)
2718                return NULL;
2719
2720        if (!ipv6_addr_any(&cfg->fc_prefsrc))
2721                flags |= RT6_LOOKUP_F_HAS_SADDR;
2722
2723        flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2724        rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2725
2726        /* if table lookup failed, fall back to full lookup */
2727        if (rt == net->ipv6.ip6_null_entry) {
2728                ip6_rt_put(rt);
2729                rt = NULL;
2730        }
2731
2732        return rt;
2733}
2734
2735static int ip6_route_check_nh_onlink(struct net *net,
2736                                     struct fib6_config *cfg,
2737                                     const struct net_device *dev,
2738                                     struct netlink_ext_ack *extack)
2739{
2740        u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2741        const struct in6_addr *gw_addr = &cfg->fc_gateway;
2742        u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2743        struct rt6_info *grt;
2744        int err;
2745
2746        err = 0;
2747        grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2748        if (grt) {
2749                if (!grt->dst.error &&
2750                    /* ignore match if it is the default route */
2751                    grt->from && !ipv6_addr_any(&grt->from->fib6_dst.addr) &&
2752                    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2753                        NL_SET_ERR_MSG(extack,
2754                                       "Nexthop has invalid gateway or device mismatch");
2755                        err = -EINVAL;
2756                }
2757
2758                ip6_rt_put(grt);
2759        }
2760
2761        return err;
2762}
2763
2764static int ip6_route_check_nh(struct net *net,
2765                              struct fib6_config *cfg,
2766                              struct net_device **_dev,
2767                              struct inet6_dev **idev)
2768{
2769        const struct in6_addr *gw_addr = &cfg->fc_gateway;
2770        struct net_device *dev = _dev ? *_dev : NULL;
2771        struct rt6_info *grt = NULL;
2772        int err = -EHOSTUNREACH;
2773
2774        if (cfg->fc_table) {
2775                int flags = RT6_LOOKUP_F_IFACE;
2776
2777                grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2778                                          cfg->fc_table, flags);
2779                if (grt) {
2780                        if (grt->rt6i_flags & RTF_GATEWAY ||
2781                            (dev && dev != grt->dst.dev)) {
2782                                ip6_rt_put(grt);
2783                                grt = NULL;
2784                        }
2785                }
2786        }
2787
2788        if (!grt)
2789                grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2790
2791        if (!grt)
2792                goto out;
2793
2794        if (dev) {
2795                if (dev != grt->dst.dev) {
2796                        ip6_rt_put(grt);
2797                        goto out;
2798                }
2799        } else {
2800                *_dev = dev = grt->dst.dev;
2801                *idev = grt->rt6i_idev;
2802                dev_hold(dev);
2803                in6_dev_hold(grt->rt6i_idev);
2804        }
2805
2806        if (!(grt->rt6i_flags & RTF_GATEWAY))
2807                err = 0;
2808
2809        ip6_rt_put(grt);
2810
2811out:
2812        return err;
2813}
2814
2815static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2816                           struct net_device **_dev, struct inet6_dev **idev,
2817                           struct netlink_ext_ack *extack)
2818{
2819        const struct in6_addr *gw_addr = &cfg->fc_gateway;
2820        int gwa_type = ipv6_addr_type(gw_addr);
2821        bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2822        const struct net_device *dev = *_dev;
2823        bool need_addr_check = !dev;
2824        int err = -EINVAL;
2825
2826        /* if gw_addr is local we will fail to detect this in case
2827         * address is still TENTATIVE (DAD in progress). rt6_lookup()
2828         * will return already-added prefix route via interface that
2829         * prefix route was assigned to, which might be non-loopback.
2830         */
2831        if (dev &&
2832            ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2833                NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2834                goto out;
2835        }
2836
2837        if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2838                /* IPv6 strictly inhibits using not link-local
2839                 * addresses as nexthop address.
2840                 * Otherwise, router will not able to send redirects.
2841                 * It is very good, but in some (rare!) circumstances
2842                 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2843                 * some exceptions. --ANK
2844                 * We allow IPv4-mapped nexthops to support RFC4798-type
2845                 * addressing
2846                 */
2847                if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2848                        NL_SET_ERR_MSG(extack, "Invalid gateway address");
2849                        goto out;
2850                }
2851
2852                if (cfg->fc_flags & RTNH_F_ONLINK)
2853                        err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2854                else
2855                        err = ip6_route_check_nh(net, cfg, _dev, idev);
2856
2857                if (err)
2858                        goto out;
2859        }
2860
2861        /* reload in case device was changed */
2862        dev = *_dev;
2863
2864        err = -EINVAL;
2865        if (!dev) {
2866                NL_SET_ERR_MSG(extack, "Egress device not specified");
2867                goto out;
2868        } else if (dev->flags & IFF_LOOPBACK) {
2869                NL_SET_ERR_MSG(extack,
2870                               "Egress device can not be loopback device for this route");
2871                goto out;
2872        }
2873
2874        /* if we did not check gw_addr above, do so now that the
2875         * egress device has been resolved.
2876         */
2877        if (need_addr_check &&
2878            ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2879                NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2880                goto out;
2881        }
2882
2883        err = 0;
2884out:
2885        return err;
2886}
2887
/*
 * ip6_route_info_create - validate @cfg and build a fib6_info from it.
 *
 * @cfg:       route description; fc_metric and fc_protocol are rewritten
 *             in place when they were left at 0 / RTPROT_UNSPEC.
 * @gfp_flags: allocation flags passed to fib6_info_alloc().
 * @extack:    receives an error message for most validation failures.
 *
 * Returns the new fib6_info or ERR_PTR(-errno).  The entry is not linked
 * into the table here; ip6_route_add() does that with __ip6_ins_rt().
 *
 * Reference handling: on success the net_device reference obtained in this
 * function (or handed back through ip6_validate_gw()) is left in place and
 * the device is stored in rt->fib6_nh.nh_dev; only the inet6_dev reference
 * is dropped.  On failure both references are dropped and rt is released.
 */
2888static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2889                                              gfp_t gfp_flags,
2890                                              struct netlink_ext_ack *extack)
2891{
2892        struct net *net = cfg->fc_nlinfo.nl_net;
2893        struct fib6_info *rt = NULL;
2894        struct net_device *dev = NULL;
2895        struct inet6_dev *idev = NULL;
2896        struct fib6_table *table;
2897        int addr_type;
2898        int err = -EINVAL;
2899
2900        /* RTF_PCPU is an internal flag; can not be set by userspace */
2901        if (cfg->fc_flags & RTF_PCPU) {
2902                NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2903                goto out;
2904        }
2905
2906        /* RTF_CACHE is an internal flag; can not be set by userspace */
2907        if (cfg->fc_flags & RTF_CACHE) {
2908                NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2909                goto out;
2910        }
2911
2912        if (cfg->fc_type > RTN_MAX) {
2913                NL_SET_ERR_MSG(extack, "Invalid route type");
2914                goto out;
2915        }
2916
2917        if (cfg->fc_dst_len > 128) {
2918                NL_SET_ERR_MSG(extack, "Invalid prefix length");
2919                goto out;
2920        }
2921        if (cfg->fc_src_len > 128) {
2922                NL_SET_ERR_MSG(extack, "Invalid source address length");
2923                goto out;
2924        }
2925#ifndef CONFIG_IPV6_SUBTREES
2926        if (cfg->fc_src_len) {
2927                NL_SET_ERR_MSG(extack,
2928                               "Specifying source address requires IPV6_SUBTREES to be enabled");
2929                goto out;
2930        }
2931#endif
            /* An explicit ifindex pins both the device and its inet6_dev;
             * each lookup takes a reference that is dropped at "out".
             */
2932        if (cfg->fc_ifindex) {
2933                err = -ENODEV;
2934                dev = dev_get_by_index(net, cfg->fc_ifindex);
2935                if (!dev)
2936                        goto out;
2937                idev = in6_dev_get(dev);
2938                if (!idev)
2939                        goto out;
2940        }
2941
2942        if (cfg->fc_metric == 0)
2943                cfg->fc_metric = IP6_RT_PRIO_USER;
2944
2945        if (cfg->fc_flags & RTNH_F_ONLINK) {
2946                if (!dev) {
2947                        NL_SET_ERR_MSG(extack,
2948                                       "Nexthop device required for onlink");
2949                        err = -ENODEV;
2950                        goto out;
2951                }
2952
2953                if (!(dev->flags & IFF_UP)) {
2954                        NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2955                        err = -ENETDOWN;
2956                        goto out;
2957                }
2958        }
2959
            /* A netlink request without NLM_F_CREATE first looks for an
             * existing table; a missing one only triggers a warning and
             * fib6_new_table() is still called for it.
             */
2960        err = -ENOBUFS;
2961        if (cfg->fc_nlinfo.nlh &&
2962            !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2963                table = fib6_get_table(net, cfg->fc_table);
2964                if (!table) {
2965                        pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2966                        table = fib6_new_table(net, cfg->fc_table);
2967                }
2968        } else {
2969                table = fib6_new_table(net, cfg->fc_table);
2970        }
2971
2972        if (!table)
2973                goto out;
2974
2975        err = -ENOMEM;
2976        rt = fib6_info_alloc(gfp_flags);
2977        if (!rt)
2978                goto out;
2979
2980        rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len);
2981        if (IS_ERR(rt->fib6_metrics)) {
2982                err = PTR_ERR(rt->fib6_metrics);
2983                /* Do not leave garbage there. */
2984                rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
2985                goto out;
2986        }
2987
2988        if (cfg->fc_flags & RTF_ADDRCONF)
2989                rt->dst_nocount = true;
2990
2991        if (cfg->fc_flags & RTF_EXPIRES)
2992                fib6_set_expires(rt, jiffies +
2993                                clock_t_to_jiffies(cfg->fc_expires));
2994        else
2995                fib6_clean_expires(rt);
2996
2997        if (cfg->fc_protocol == RTPROT_UNSPEC)
2998                cfg->fc_protocol = RTPROT_BOOT;
2999        rt->fib6_protocol = cfg->fc_protocol;
3000
3001        addr_type = ipv6_addr_type(&cfg->fc_dst);
3002
3003        if (cfg->fc_encap) {
3004                struct lwtunnel_state *lwtstate;
3005
3006                err = lwtunnel_build_state(cfg->fc_encap_type,
3007                                           cfg->fc_encap, AF_INET6, cfg,
3008                                           &lwtstate, extack);
3009                if (err)
3010                        goto out;
3011                rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3012        }
3013
3014        ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3015        rt->fib6_dst.plen = cfg->fc_dst_len;
3016        if (rt->fib6_dst.plen == 128)
3017                rt->dst_host = true;
3018
3019#ifdef CONFIG_IPV6_SUBTREES
3020        ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3021        rt->fib6_src.plen = cfg->fc_src_len;
3022#endif
3023
3024        rt->fib6_metric = cfg->fc_metric;
3025        rt->fib6_nh.nh_weight = 1;
3026
3027        rt->fib6_type = cfg->fc_type;
3028
3029        /* We cannot add true routes via loopback here,
3030           they would result in kernel looping; promote them to reject routes
3031         */
3032        if ((cfg->fc_flags & RTF_REJECT) ||
3033            (dev && (dev->flags & IFF_LOOPBACK) &&
3034             !(addr_type & IPV6_ADDR_LOOPBACK) &&
3035             !(cfg->fc_flags & RTF_LOCAL))) {
3036                /* hold loopback dev/idev if we haven't done so. */
3037                if (dev != net->loopback_dev) {
                            /* Trade the references on the requested device
                             * for references on the loopback device.
                             */
3038                        if (dev) {
3039                                dev_put(dev);
3040                                in6_dev_put(idev);
3041                        }
3042                        dev = net->loopback_dev;
3043                        dev_hold(dev);
3044                        idev = in6_dev_get(dev);
3045                        if (!idev) {
3046                                err = -ENODEV;
3047                                goto out;
3048                        }
3049                }
3050                rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3051                goto install_route;
3052        }
3053
3054        if (cfg->fc_flags & RTF_GATEWAY) {
3055                err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3056                if (err)
3057                        goto out;
3058
3059                rt->fib6_nh.nh_gw = cfg->fc_gateway;
3060        }
3061
3062        err = -ENODEV;
3063        if (!dev)
3064                goto out;
3065
3066        if (idev->cnf.disable_ipv6) {
3067                NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3068                err = -EACCES;
3069                goto out;
3070        }
3071
3072        if (!(dev->flags & IFF_UP)) {
3073                NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3074                err = -ENETDOWN;
3075                goto out;
3076        }
3077
3078        if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3079                if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3080                        NL_SET_ERR_MSG(extack, "Invalid source address");
3081                        err = -EINVAL;
3082                        goto out;
3083                }
3084                rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3085                rt->fib6_prefsrc.plen = 128;
3086        } else
3087                rt->fib6_prefsrc.plen = 0;
3088
3089        rt->fib6_flags = cfg->fc_flags;
3090
    /* Reached by regular routes falling through and by reject routes
     * via the goto above.
     */
3091install_route:
3092        if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3093            !netif_carrier_ok(dev))
3094                rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3095        rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3096        rt->fib6_nh.nh_dev = dev;
3097        rt->fib6_table = table;
3098
            /* The device reference stays with rt->fib6_nh.nh_dev; only the
             * inet6_dev reference is dropped on success.
             */
3099        if (idev)
3100                in6_dev_put(idev);
3101
3102        return rt;
    /* Error exit: rt is still NULL here when the failure happened before
     * fib6_info_alloc().
     */
3103out:
3104        if (dev)
3105                dev_put(dev);
3106        if (idev)
3107                in6_dev_put(idev);
3108
3109        fib6_info_release(rt);
3110        return ERR_PTR(err);
3111}
3112
3113int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3114                  struct netlink_ext_ack *extack)
3115{
3116        struct fib6_info *rt;
3117        int err;
3118
3119        rt = ip6_route_info_create(cfg, gfp_flags, extack);
3120        if (IS_ERR(rt))
3121                return PTR_ERR(rt);
3122
3123        err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3124        fib6_info_release(rt);
3125
3126        return err;
3127}
3128
3129static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3130{
3131        struct net *net = info->nl_net;
3132        struct fib6_table *table;
3133        int err;
3134
3135        if (rt == net->ipv6.fib6_null_entry) {
3136                err = -ENOENT;
3137                goto out;
3138        }
3139
3140        table = rt->fib6_table;
3141        spin_lock_bh(&table->tb6_lock);
3142        err = fib6_del(rt, info);
3143        spin_unlock_bh(&table->tb6_lock);
3144
3145out:
3146        fib6_info_release(rt);
3147        return err;
3148}
3149
3150int ip6_del_rt(struct net *net, struct fib6_info *rt)
3151{
3152        struct nl_info info = { .nl_net = net };
3153
3154        return __ip6_del_rt(rt, &info);
3155}
3156
3157static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3158{
3159        struct nl_info *info = &cfg->fc_nlinfo;
3160        struct net *net = info->nl_net;
3161        struct sk_buff *skb = NULL;
3162        struct fib6_table *table;
3163        int err = -ENOENT;
3164
3165        if (rt == net->ipv6.fib6_null_entry)
3166                goto out_put;
3167        table = rt->fib6_table;
3168        spin_lock_bh(&table->tb6_lock);
3169
3170        if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3171                struct fib6_info *sibling, *next_sibling;
3172
3173                /* prefer to send a single notification with all hops */
3174                skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3175                if (skb) {
3176                        u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3177
3178                        if (rt6_fill_node(net, skb, rt, NULL,
3179                                          NULL, NULL, 0, RTM_DELROUTE,
3180                                          info->portid, seq, 0) < 0) {
3181                                kfree_skb(skb);
3182                                skb = NULL;
3183                        } else
3184                                info->skip_notify = 1;
3185                }
3186
3187                list_for_each_entry_safe(sibling, next_sibling,
3188                                         &rt->fib6_siblings,
3189                                         fib6_siblings) {
3190                        err = fib6_del(sibling, info);
3191                        if (err)
3192                                goto out_unlock;
3193                }
3194        }
3195
3196        err = fib6_del(rt, info);
3197out_unlock:
3198        spin_unlock_bh(&table->tb6_lock);
3199out_put:
3200        fib6_info_release(rt);
3201
3202        if (skb) {
3203                rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3204                            info->nlh, gfp_any());
3205        }
3206        return err;
3207}
3208
3209static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3210{
3211        int rc = -ESRCH;
3212
3213        if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3214                goto out;
3215
3216        if (cfg->fc_flags & RTF_GATEWAY &&
3217            !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3218                goto out;
3219
3220        rc = rt6_remove_exception_rt(rt);
3221out:
3222        return rc;
3223}
3224
3225static int ip6_route_del(struct fib6_config *cfg,
3226                         struct netlink_ext_ack *extack)
3227{
3228        struct rt6_info *rt_cache;
3229        struct fib6_table *table;
3230        struct fib6_info *rt;
3231        struct fib6_node *fn;
3232        int err = -ESRCH;
3233
3234        table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3235        if (!table) {
3236                NL_SET_ERR_MSG(extack, "FIB table does not exist");
3237                return err;
3238        }
3239
3240        rcu_read_lock();
3241
3242        fn = fib6_locate(&table->tb6_root,
3243                         &cfg->fc_dst, cfg->fc_dst_len,
3244                         &cfg->fc_src, cfg->fc_src_len,
3245                         !(cfg->fc_flags & RTF_CACHE));
3246
3247        if (fn) {
3248                for_each_fib6_node_rt_rcu(fn) {
3249                        if (cfg->fc_flags & RTF_CACHE) {
3250                                int rc;
3251
3252                                rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3253                                                              &cfg->fc_src);
3254                                if (rt_cache) {
3255                                        rc = ip6_del_cached_rt(rt_cache, cfg);
3256                                        if (rc != -ESRCH) {
3257                                                rcu_read_unlock();
3258                                                return rc;
3259                                        }
3260                                }
3261                                continue;
3262                        }
3263                        if (cfg->fc_ifindex &&
3264                            (!rt->fib6_nh.nh_dev ||
3265                             rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3266                                continue;
3267                        if (cfg->fc_flags & RTF_GATEWAY &&
3268                            !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3269                                continue;
3270                        if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3271                                continue;
3272                        if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3273                                continue;
3274                        if (!fib6_info_hold_safe(rt))
3275                                continue;
3276                        rcu_read_unlock();
3277
3278                        /* if gateway was specified only delete the one hop */
3279                        if (cfg->fc_flags & RTF_GATEWAY)
3280                                return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3281
3282                        return __ip6_del_rt_siblings(rt, cfg);
3283                }
3284        }
3285        rcu_read_unlock();
3286
3287        return err;
3288}
3289
3290static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3291{
3292        struct netevent_redirect netevent;
3293        struct rt6_info *rt, *nrt = NULL;
3294        struct ndisc_options ndopts;
3295        struct inet6_dev *in6_dev;
3296        struct neighbour *neigh;
3297        struct fib6_info *from;
3298        struct rd_msg *msg;
3299        int optlen, on_link;
3300        u8 *lladdr;
3301
3302        optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3303        optlen -= sizeof(*msg);
3304
3305        if (optlen < 0) {
3306                net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3307                return;
3308        }
3309
3310        msg = (struct rd_msg *)icmp6_hdr(skb);
3311
3312        if (ipv6_addr_is_multicast(&msg->dest)) {
3313                net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3314                return;
3315        }
3316
3317        on_link = 0;
3318        if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3319                on_link = 1;
3320        } else if (ipv6_addr_type(&msg->target) !=
3321                   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3322                net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3323                return;
3324        }
3325
3326        in6_dev = __in6_dev_get(skb->dev);
3327        if (!in6_dev)
3328                return;
3329        if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3330                return;
3331
3332        /* RFC2461 8.1:
3333         *      The IP source address of the Redirect MUST be the same as the current
3334         *      first-hop router for the specified ICMP Destination Address.
3335         */
3336
3337        if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3338                net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3339                return;
3340        }
3341
3342        lladdr = NULL;
3343        if (ndopts.nd_opts_tgt_lladdr) {
3344                lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3345                                             skb->dev);
3346                if (!lladdr) {
3347                        net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3348                        return;
3349                }
3350        }
3351
3352        rt = (struct rt6_info *) dst;
3353        if (rt->rt6i_flags & RTF_REJECT) {
3354                net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3355                return;
3356        }
3357
3358        /* Redirect received -> path was valid.
3359         * Look, redirects are sent only in response to data packets,
3360         * so that this nexthop apparently is reachable. --ANK
3361         */
3362        dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3363
3364        neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3365        if (!neigh)
3366                return;
3367
3368        /*
3369         *      We have finally decided to accept it.
3370         */
3371
3372        ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3373                     NEIGH_UPDATE_F_WEAK_OVERRIDE|
3374                     NEIGH_UPDATE_F_OVERRIDE|
3375                     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3376                                     NEIGH_UPDATE_F_ISROUTER)),
3377                     NDISC_REDIRECT, &ndopts);
3378
3379        rcu_read_lock();
3380        from = rcu_dereference(rt->from);
3381        /* This fib6_info_hold() is safe here because we hold reference to rt
3382         * and rt already holds reference to fib6_info.
3383         */
3384        fib6_info_hold(from);
3385        rcu_read_unlock();
3386
3387        nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3388        if (!nrt)
3389                goto out;
3390
3391        nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3392        if (on_link)
3393                nrt->rt6i_flags &= ~RTF_GATEWAY;
3394
3395        nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3396
3397        /* No need to remove rt from the exception table if rt is
3398         * a cached route because rt6_insert_exception() will
3399         * takes care of it
3400         */
3401        if (rt6_insert_exception(nrt, from)) {
3402                dst_release_immediate(&nrt->dst);
3403                goto out;
3404        }
3405
3406        netevent.old = &rt->dst;
3407        netevent.new = &nrt->dst;
3408        netevent.daddr = &msg->dest;
3409        netevent.neigh = neigh;
3410        call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3411
3412out:
3413        fib6_info_release(from);
3414        neigh_release(neigh);
3415}
3416
3417#ifdef CONFIG_IPV6_ROUTE_INFO
3418static struct fib6_info *rt6_get_route_info(struct net *net,
3419                                           const struct in6_addr *prefix, int prefixlen,
3420                                           const struct in6_addr *gwaddr,
3421                                           struct net_device *dev)
3422{
3423        u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3424        int ifindex = dev->ifindex;
3425        struct fib6_node *fn;
3426        struct fib6_info *rt = NULL;
3427        struct fib6_table *table;
3428
3429        table = fib6_get_table(net, tb_id);
3430        if (!table)
3431                return NULL;
3432
3433        rcu_read_lock();
3434        fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3435        if (!fn)
3436                goto out;
3437
3438        for_each_fib6_node_rt_rcu(fn) {
3439                if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3440                        continue;
3441                if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3442                        continue;
3443                if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3444                        continue;
3445                if (!fib6_info_hold_safe(rt))
3446                        continue;
3447                break;
3448        }
3449out:
3450        rcu_read_unlock();
3451        return rt;
3452}
3453
3454static struct fib6_info *rt6_add_route_info(struct net *net,
3455                                           const struct in6_addr *prefix, int prefixlen,
3456                                           const struct in6_addr *gwaddr,
3457                                           struct net_device *dev,
3458                                           unsigned int pref)
3459{
3460        struct fib6_config cfg = {
3461                .fc_metric      = IP6_RT_PRIO_USER,
3462                .fc_ifindex     = dev->ifindex,
3463                .fc_dst_len     = prefixlen,
3464                .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3465                                  RTF_UP | RTF_PREF(pref),
3466                .fc_protocol = RTPROT_RA,
3467                .fc_type = RTN_UNICAST,
3468                .fc_nlinfo.portid = 0,
3469                .fc_nlinfo.nlh = NULL,
3470                .fc_nlinfo.nl_net = net,
3471        };
3472
3473        cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3474        cfg.fc_dst = *prefix;
3475        cfg.fc_gateway = *gwaddr;
3476
3477        /* We should treat it as a default route if prefix length is 0. */
3478        if (!prefixlen)
3479                cfg.fc_flags |= RTF_DEFAULT;
3480
3481        ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3482
3483        return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3484}
3485#endif
3486
3487struct fib6_info *rt6_get_dflt_router(struct net *net,
3488                                     const struct in6_addr *addr,
3489                                     struct net_device *dev)
3490{
3491        u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3492        struct fib6_info *rt;
3493        struct fib6_table *table;
3494
3495        table = fib6_get_table(net, tb_id);
3496        if (!table)
3497                return NULL;
3498
3499        rcu_read_lock();
3500        for_each_fib6_node_rt_rcu(&table->tb6_root) {
3501                if (dev == rt->fib6_nh.nh_dev &&
3502                    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3503                    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3504                        break;
3505        }
3506        if (rt && !fib6_info_hold_safe(rt))
3507                rt = NULL;
3508        rcu_read_unlock();
3509        return rt;
3510}
3511
3512struct fib6_info *rt6_add_dflt_router(struct net *net,
3513                                     const struct in6_addr *gwaddr,
3514                                     struct net_device *dev,
3515                                     unsigned int pref)
3516{
3517        struct fib6_config cfg = {
3518                .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3519                .fc_metric      = IP6_RT_PRIO_USER,
3520                .fc_ifindex     = dev->ifindex,
3521                .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3522                                  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3523                .fc_protocol = RTPROT_RA,
3524                .fc_type = RTN_UNICAST,
3525                .fc_nlinfo.portid = 0,
3526                .fc_nlinfo.nlh = NULL,
3527                .fc_nlinfo.nl_net = net,
3528        };
3529
3530        cfg.fc_gateway = *gwaddr;
3531
3532        if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3533                struct fib6_table *table;
3534
3535                table = fib6_get_table(dev_net(dev), cfg.fc_table);
3536                if (table)
3537                        table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3538        }
3539
3540        return rt6_get_dflt_router(net, gwaddr, dev);
3541}
3542
3543static void __rt6_purge_dflt_routers(struct net *net,
3544                                     struct fib6_table *table)
3545{
3546        struct fib6_info *rt;
3547
3548restart:
3549        rcu_read_lock();
3550        for_each_fib6_node_rt_rcu(&table->tb6_root) {
3551                struct net_device *dev = fib6_info_nh_dev(rt);
3552                struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3553
3554                if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3555                    (!idev || idev->cnf.accept_ra != 2) &&
3556                    fib6_info_hold_safe(rt)) {
3557                        rcu_read_unlock();
3558                        ip6_del_rt(net, rt);
3559                        goto restart;
3560                }
3561        }
3562        rcu_read_unlock();
3563
3564        table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3565}
3566
3567void rt6_purge_dflt_routers(struct net *net)
3568{
3569        struct fib6_table *table;
3570        struct hlist_head *head;
3571        unsigned int h;
3572
3573        rcu_read_lock();
3574
3575        for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3576                head = &net->ipv6.fib_table_hash[h];
3577                hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3578                        if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3579                                __rt6_purge_dflt_routers(net, table);
3580                }
3581        }
3582
3583        rcu_read_unlock();
3584}
3585
3586static void rtmsg_to_fib6_config(struct net *net,
3587                                 struct in6_rtmsg *rtmsg,
3588                                 struct fib6_config *cfg)
3589{
3590        *cfg = (struct fib6_config){
3591                .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3592                         : RT6_TABLE_MAIN,
3593                .fc_ifindex = rtmsg->rtmsg_ifindex,
3594                .fc_metric = rtmsg->rtmsg_metric,
3595                .fc_expires = rtmsg->rtmsg_info,
3596                .fc_dst_len = rtmsg->rtmsg_dst_len,
3597                .fc_src_len = rtmsg->rtmsg_src_len,
3598                .fc_flags = rtmsg->rtmsg_flags,
3599                .fc_type = rtmsg->rtmsg_type,
3600
3601                .fc_nlinfo.nl_net = net,
3602
3603                .fc_dst = rtmsg->rtmsg_dst,
3604                .fc_src = rtmsg->rtmsg_src,
3605                .fc_gateway = rtmsg->rtmsg_gateway,
3606        };
3607}
3608
3609int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3610{
3611        struct fib6_config cfg;
3612        struct in6_rtmsg rtmsg;
3613        int err;
3614
3615        switch (cmd) {
3616        case SIOCADDRT:         /* Add a route */
3617        case SIOCDELRT:         /* Delete a route */
3618                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3619                        return -EPERM;
3620                err = copy_from_user(&rtmsg, arg,
3621                                     sizeof(struct in6_rtmsg));
3622                if (err)
3623                        return -EFAULT;
3624
3625                rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3626
3627                rtnl_lock();
3628                switch (cmd) {
3629                case SIOCADDRT:
3630                        err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3631                        break;
3632                case SIOCDELRT:
3633                        err = ip6_route_del(&cfg, NULL);
3634                        break;
3635                default:
3636                        err = -EINVAL;
3637                }
3638                rtnl_unlock();
3639
3640                return err;
3641        }
3642
3643        return -EINVAL;
3644}
3645
3646/*
3647 *      Drop the packet on the floor
3648 */
3649
3650static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3651{
3652        int type;
3653        struct dst_entry *dst = skb_dst(skb);
3654        switch (ipstats_mib_noroutes) {
3655        case IPSTATS_MIB_INNOROUTES:
3656                type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3657                if (type == IPV6_ADDR_ANY) {
3658                        IP6_INC_STATS(dev_net(dst->dev),
3659                                      __in6_dev_get_safely(skb->dev),
3660                                      IPSTATS_MIB_INADDRERRORS);
3661                        break;
3662                }
3663                /* FALLTHROUGH */
3664        case IPSTATS_MIB_OUTNOROUTES:
3665                IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3666                              ipstats_mib_noroutes);
3667                break;
3668        }
3669        icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3670        kfree_skb(skb);
3671        return 0;
3672}
3673
/* Drop @skb through ip6_pkt_drop() with ICMPv6 code ICMPV6_NOROUTE,
 * accounted under IPSTATS_MIB_INNOROUTES.
 */
static int ip6_pkt_discard(struct sk_buff *skb)
{
        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3678
/* Output-side variant of ip6_pkt_discard(): drop @skb with ICMPv6 code
 * ICMPV6_NOROUTE, accounted under IPSTATS_MIB_OUTNOROUTES.  @net and @sk
 * are not used in the body.
 */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        /* Point skb->dev at the dst's device before handing the skb on. */
        skb->dev = skb_dst(skb)->dev;
        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3684
/* Drop @skb through ip6_pkt_drop() with ICMPv6 code ICMPV6_ADM_PROHIBITED,
 * accounted under IPSTATS_MIB_INNOROUTES.
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3689
/* Output-side variant of ip6_pkt_prohibit(): drop @skb with ICMPv6 code
 * ICMPV6_ADM_PROHIBITED, accounted under IPSTATS_MIB_OUTNOROUTES.  @net
 * and @sk are not used in the body.
 */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        /* Point skb->dev at the dst's device before handing the skb on. */
        skb->dev = skb_dst(skb)->dev;
        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3695
3696/*
3697 *      Allocate a dst for local (unicast / anycast) address.
3698 */
3699
3700struct fib6_info *addrconf_f6i_alloc(struct net *net,
3701                                     struct inet6_dev *idev,
3702                                     const struct in6_addr *addr,
3703                                     bool anycast, gfp_t gfp_flags)
3704{
3705        u32 tb_id;
3706        struct net_device *dev = idev->dev;
3707        struct fib6_info *f6i;
3708
3709        f6i = fib6_info_alloc(gfp_flags);
3710        if (!f6i)
3711                return ERR_PTR(-ENOMEM);
3712
3713        f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0);
3714        f6i->dst_nocount = true;
3715        f6i->dst_host = true;
3716        f6i->fib6_protocol = RTPROT_KERNEL;
3717        f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3718        if (anycast) {
3719                f6i->fib6_type = RTN_ANYCAST;
3720                f6i->fib6_flags |= RTF_ANYCAST;
3721        } else {
3722                f6i->fib6_type = RTN_LOCAL;
3723                f6i->fib6_flags |= RTF_LOCAL;
3724        }
3725
3726        f6i->fib6_nh.nh_gw = *addr;
3727        dev_hold(dev);
3728        f6i->fib6_nh.nh_dev = dev;
3729        f6i->fib6_dst.addr = *addr;
3730        f6i->fib6_dst.plen = 128;
3731        tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3732        f6i->fib6_table = fib6_get_table(net, tb_id);
3733
3734        return f6i;
3735}
3736
/* remove deleted ip from prefsrc entries
 *
 * Argument bundle that rt6_remove_prefsrc() hands to fib6_remove_prefsrc()
 * through fib6_clean_all().
 */
struct arg_dev_net_ip {
        struct net_device *dev;         /* nexthop device to match; NULL matches any */
        struct net *net;                /* namespace whose fib6_null_entry is skipped */
        struct in6_addr *addr;          /* address compared with fib6_prefsrc.addr */
};
3743
3744static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3745{
3746        struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3747        struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3748        struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3749
3750        if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3751            rt != net->ipv6.fib6_null_entry &&
3752            ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3753                spin_lock_bh(&rt6_exception_lock);
3754                /* remove prefsrc entry */
3755                rt->fib6_prefsrc.plen = 0;
3756                spin_unlock_bh(&rt6_exception_lock);
3757        }
3758        return 0;
3759}
3760
3761void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3762{
3763        struct net *net = dev_net(ifp->idev->dev);
3764        struct arg_dev_net_ip adni = {
3765                .dev = ifp->idev->dev,
3766                .net = net,
3767                .addr = &ifp->addr,
3768        };
3769        fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3770}
3771
3772#define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3773
3774/* Remove routers and update dst entries when gateway turn into host. */
3775static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3776{
3777        struct in6_addr *gateway = (struct in6_addr *)arg;
3778
3779        if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3780            ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3781                return -1;
3782        }
3783
3784        /* Further clean up cached routes in exception table.
3785         * This is needed because cached route may have a different
3786         * gateway than its 'parent' in the case of an ip redirect.
3787         */
3788        rt6_exceptions_clean_tohost(rt, gateway);
3789
3790        return 0;
3791}
3792
/* Run fib6_clean_tohost() over every route in @net, with @gateway as the
 * address of the router that became a host.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
        fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3797
/* Argument passed to the fib6_ifup() and fib6_ifdown() route walkers. */
struct arg_netdev_event {
        const struct net_device *dev;   /* device the walk is about */
        union {
                unsigned int nh_flags;  /* fib6_ifup(): nexthop flags to clear */
                unsigned long event;    /* fib6_ifdown(): NETDEV_* event handled */
        };
};
3805
3806static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3807{
3808        struct fib6_info *iter;
3809        struct fib6_node *fn;
3810
3811        fn = rcu_dereference_protected(rt->fib6_node,
3812                        lockdep_is_held(&rt->fib6_table->tb6_lock));
3813        iter = rcu_dereference_protected(fn->leaf,
3814                        lockdep_is_held(&rt->fib6_table->tb6_lock));
3815        while (iter) {
3816                if (iter->fib6_metric == rt->fib6_metric &&
3817                    rt6_qualify_for_ecmp(iter))
3818                        return iter;
3819                iter = rcu_dereference_protected(iter->fib6_next,
3820                                lockdep_is_held(&rt->fib6_table->tb6_lock));
3821        }
3822
3823        return NULL;
3824}
3825
3826static bool rt6_is_dead(const struct fib6_info *rt)
3827{
3828        if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3829            (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3830             fib6_ignore_linkdown(rt)))
3831                return true;
3832
3833        return false;
3834}
3835
3836static int rt6_multipath_total_weight(const struct fib6_info *rt)
3837{
3838        struct fib6_info *iter;
3839        int total = 0;
3840
3841        if (!rt6_is_dead(rt))
3842                total += rt->fib6_nh.nh_weight;
3843
3844        list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3845                if (!rt6_is_dead(iter))
3846                        total += iter->fib6_nh.nh_weight;
3847        }
3848
3849        return total;
3850}
3851
3852static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3853{
3854        int upper_bound = -1;
3855
3856        if (!rt6_is_dead(rt)) {
3857                *weight += rt->fib6_nh.nh_weight;
3858                upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3859                                                    total) - 1;
3860        }
3861        atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3862}
3863
3864static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3865{
3866        struct fib6_info *iter;
3867        int weight = 0;
3868
3869        rt6_upper_bound_set(rt, &weight, total);
3870
3871        list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3872                rt6_upper_bound_set(iter, &weight, total);
3873}
3874
3875void rt6_multipath_rebalance(struct fib6_info *rt)
3876{
3877        struct fib6_info *first;
3878        int total;
3879
3880        /* In case the entire multipath route was marked for flushing,
3881         * then there is no need to rebalance upon the removal of every
3882         * sibling route.
3883         */
3884        if (!rt->fib6_nsiblings || rt->should_flush)
3885                return;
3886
3887        /* During lookup routes are evaluated in order, so we need to
3888         * make sure upper bounds are assigned from the first sibling
3889         * onwards.
3890         */
3891        first = rt6_multipath_first_sibling(rt);
3892        if (WARN_ON_ONCE(!first))
3893                return;
3894
3895        total = rt6_multipath_total_weight(first);
3896        rt6_multipath_upper_bound_set(first, total);
3897}
3898
3899static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3900{
3901        const struct arg_netdev_event *arg = p_arg;
3902        struct net *net = dev_net(arg->dev);
3903
3904        if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3905                rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3906                fib6_update_sernum_upto_root(net, rt);
3907                rt6_multipath_rebalance(rt);
3908        }
3909
3910        return 0;
3911}
3912
3913void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3914{
3915        struct arg_netdev_event arg = {
3916                .dev = dev,
3917                {
3918                        .nh_flags = nh_flags,
3919                },
3920        };
3921
3922        if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3923                arg.nh_flags |= RTNH_F_LINKDOWN;
3924
3925        fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3926}
3927
3928static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3929                                   const struct net_device *dev)
3930{
3931        struct fib6_info *iter;
3932
3933        if (rt->fib6_nh.nh_dev == dev)
3934                return true;
3935        list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3936                if (iter->fib6_nh.nh_dev == dev)
3937                        return true;
3938
3939        return false;
3940}
3941
3942static void rt6_multipath_flush(struct fib6_info *rt)
3943{
3944        struct fib6_info *iter;
3945
3946        rt->should_flush = 1;
3947        list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3948                iter->should_flush = 1;
3949}
3950
3951static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3952                                             const struct net_device *down_dev)
3953{
3954        struct fib6_info *iter;
3955        unsigned int dead = 0;
3956
3957        if (rt->fib6_nh.nh_dev == down_dev ||
3958            rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3959                dead++;
3960        list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3961                if (iter->fib6_nh.nh_dev == down_dev ||
3962                    iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3963                        dead++;
3964
3965        return dead;
3966}
3967
3968static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3969                                       const struct net_device *dev,
3970                                       unsigned int nh_flags)
3971{
3972        struct fib6_info *iter;
3973
3974        if (rt->fib6_nh.nh_dev == dev)
3975                rt->fib6_nh.nh_flags |= nh_flags;
3976        list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3977                if (iter->fib6_nh.nh_dev == dev)
3978                        iter->fib6_nh.nh_flags |= nh_flags;
3979}
3980
3981/* called with write lock held for table with rt */
3982static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
3983{
3984        const struct arg_netdev_event *arg = p_arg;
3985        const struct net_device *dev = arg->dev;
3986        struct net *net = dev_net(dev);
3987
3988        if (rt == net->ipv6.fib6_null_entry)
3989                return 0;
3990
3991        switch (arg->event) {
3992        case NETDEV_UNREGISTER:
3993                return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3994        case NETDEV_DOWN:
3995                if (rt->should_flush)
3996                        return -1;
3997                if (!rt->fib6_nsiblings)
3998                        return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3999                if (rt6_multipath_uses_dev(rt, dev)) {
4000                        unsigned int count;
4001
4002                        count = rt6_multipath_dead_count(rt, dev);
4003                        if (rt->fib6_nsiblings + 1 == count) {
4004                                rt6_multipath_flush(rt);
4005                                return -1;
4006                        }
4007                        rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4008                                                   RTNH_F_LINKDOWN);
4009                        fib6_update_sernum(net, rt);
4010                        rt6_multipath_rebalance(rt);
4011                }
4012                return -2;
4013        case NETDEV_CHANGE:
4014                if (rt->fib6_nh.nh_dev != dev ||
4015                    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4016                        break;
4017                rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4018                rt6_multipath_rebalance(rt);
4019                break;
4020        }
4021
4022        return 0;
4023}
4024
4025void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4026{
4027        struct arg_netdev_event arg = {
4028                .dev = dev,
4029                {
4030                        .event = event,
4031                },
4032        };
4033        struct net *net = dev_net(dev);
4034
4035        if (net->ipv6.sysctl.skip_notify_on_dev_down)
4036                fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4037        else
4038                fib6_clean_all(net, fib6_ifdown, &arg);
4039}
4040
/* Handle @event for @dev in three steps, in this order: sync the FIB
 * through rt6_sync_down_dev(), flush the device from the uncached route
 * list, and call neigh_ifdown() on the nd_tbl neighbour table.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
        rt6_sync_down_dev(dev, event);
        rt6_uncached_list_flush_dev(dev_net(dev), dev);
        neigh_ifdown(&nd_tbl, dev);
}
4047
/* Argument that rt6_mtu_change() passes to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
        struct net_device *dev;         /* device the MTU change applies to */
        unsigned int mtu;               /* MTU value to propagate */
};
4052
4053static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4054{
4055        struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4056        struct inet6_dev *idev;
4057
4058        /* In IPv6 pmtu discovery is not optional,
4059           so that RTAX_MTU lock cannot disable it.
4060           We still use this lock to block changes
4061           caused by addrconf/ndisc.
4062        */
4063
4064        idev = __in6_dev_get(arg->dev);
4065        if (!idev)
4066                return 0;
4067
4068        /* For administrative MTU increase, there is no way to discover
4069           IPv6 PMTU increase, so PMTU increase should be updated here.
4070           Since RFC 1981 doesn't include administrative MTU increase
4071           update PMTU increase is a MUST. (i.e. jumbo frame)
4072         */
4073        if (rt->fib6_nh.nh_dev == arg->dev &&
4074            !fib6_metric_locked(rt, RTAX_MTU)) {
4075                u32 mtu = rt->fib6_pmtu;
4076
4077                if (mtu >= arg->mtu ||
4078                    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4079                        fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4080
4081                spin_lock_bh(&rt6_exception_lock);
4082                rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4083                spin_unlock_bh(&rt6_exception_lock);
4084        }
4085        return 0;
4086}
4087
/* Run rt6_mtu_change_route() with @dev and @mtu over every route in the
 * device's namespace.
 */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
        struct rt6_mtu_change_arg arg = {
                .dev = dev,
                .mtu = mtu,
        };

        fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
4097
/* Netlink attribute policy for IPv6 route messages, handed to
 * nlmsg_parse() by rtm_to_fib6_config().  Each entry gives either an
 * attribute type or a length.  RTA_DST and RTA_SRC have no entry here;
 * rtm_to_fib6_config() checks their length against the prefix length
 * itself.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
        [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
        [RTA_PREFSRC]           = { .len = sizeof(struct in6_addr) },
        [RTA_OIF]               = { .type = NLA_U32 },
        [RTA_IIF]               = { .type = NLA_U32 },
        [RTA_PRIORITY]          = { .type = NLA_U32 },
        [RTA_METRICS]           = { .type = NLA_NESTED },
        [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
        [RTA_PREF]              = { .type = NLA_U8 },
        [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
        [RTA_ENCAP]             = { .type = NLA_NESTED },
        [RTA_EXPIRES]           = { .type = NLA_U32 },
        [RTA_UID]               = { .type = NLA_U32 },
        [RTA_MARK]              = { .type = NLA_U32 },
        [RTA_TABLE]             = { .type = NLA_U32 },
        [RTA_IP_PROTO]          = { .type = NLA_U8 },
        [RTA_SPORT]             = { .type = NLA_U16 },
        [RTA_DPORT]             = { .type = NLA_U16 },
};
4117
4118static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4119                              struct fib6_config *cfg,
4120                              struct netlink_ext_ack *extack)
4121{
4122        struct rtmsg *rtm;
4123        struct nlattr *tb[RTA_MAX+1];
4124        unsigned int pref;
4125        int err;
4126
4127        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4128                          extack);
4129        if (err < 0)
4130                goto errout;
4131
4132        err = -EINVAL;
4133        rtm = nlmsg_data(nlh);
4134
4135        *cfg = (struct fib6_config){
4136                .fc_table = rtm->rtm_table,
4137                .fc_dst_len = rtm->rtm_dst_len,
4138                .fc_src_len = rtm->rtm_src_len,
4139                .fc_flags = RTF_UP,
4140                .fc_protocol = rtm->rtm_protocol,
4141                .fc_type = rtm->rtm_type,
4142
4143                .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4144                .fc_nlinfo.nlh = nlh,
4145                .fc_nlinfo.nl_net = sock_net(skb->sk),
4146        };
4147
4148        if (rtm->rtm_type == RTN_UNREACHABLE ||
4149            rtm->rtm_type == RTN_BLACKHOLE ||
4150            rtm->rtm_type == RTN_PROHIBIT ||
4151            rtm->rtm_type == RTN_THROW)
4152                cfg->fc_flags |= RTF_REJECT;
4153
4154        if (rtm->rtm_type == RTN_LOCAL)
4155                cfg->fc_flags |= RTF_LOCAL;
4156
4157        if (rtm->rtm_flags & RTM_F_CLONED)
4158                cfg->fc_flags |= RTF_CACHE;
4159
4160        cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4161
4162        if (tb[RTA_GATEWAY]) {
4163                cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4164                cfg->fc_flags |= RTF_GATEWAY;
4165        }
4166
4167        if (tb[RTA_DST]) {
4168                int plen = (rtm->rtm_dst_len + 7) >> 3;
4169
4170                if (nla_len(tb[RTA_DST]) < plen)
4171                        goto errout;
4172
4173                nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4174        }
4175
4176        if (tb[RTA_SRC]) {
4177                int plen = (rtm->rtm_src_len + 7) >> 3;
4178
4179                if (nla_len(tb[RTA_SRC]) < plen)
4180                        goto errout;
4181
4182                nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4183        }
4184
4185        if (tb[RTA_PREFSRC])
4186                cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4187
4188        if (tb[RTA_OIF])
4189                cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4190
4191        if (tb[RTA_PRIORITY])
4192                cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4193
4194        if (tb[RTA_METRICS]) {
4195                cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4196                cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4197        }
4198
4199        if (tb[RTA_TABLE])
4200                cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4201
4202        if (tb[RTA_MULTIPATH]) {
4203                cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4204                cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4205
4206                err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4207                                                     cfg->fc_mp_len, extack);
4208                if (err < 0)
4209                        goto errout;
4210        }
4211
4212        if (tb[RTA_PREF]) {
4213                pref = nla_get_u8(tb[RTA_PREF]);
4214                if (pref != ICMPV6_ROUTER_PREF_LOW &&
4215                    pref != ICMPV6_ROUTER_PREF_HIGH)
4216                        pref = ICMPV6_ROUTER_PREF_MEDIUM;
4217                cfg->fc_flags |= RTF_PREF(pref);
4218        }
4219
4220        if (tb[RTA_ENCAP])
4221                cfg->fc_encap = tb[RTA_ENCAP];
4222
4223        if (tb[RTA_ENCAP_TYPE]) {
4224                cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4225
4226                err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4227                if (err < 0)
4228                        goto errout;
4229        }
4230
4231        if (tb[RTA_EXPIRES]) {
4232                unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4233
4234                if (addrconf_finite_timeout(timeout)) {
4235                        cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4236                        cfg->fc_flags |= RTF_EXPIRES;
4237                }
4238        }
4239
4240        err = 0;
4241errout:
4242        return err;
4243}
4244
/* List element created by ip6_route_info_append(): one route entry
 * together with the configuration it was queued with.
 */
struct rt6_nh {
        struct fib6_info *fib6_info;    /* route entry for this element */
        struct fib6_config r_cfg;       /* copy of the config passed in */
        struct list_head next;          /* link in the rt6_nh_list */
};
4250
4251static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4252{
4253        struct rt6_nh *nh;
4254
4255        list_for_each_entry(nh, rt6_nh_list, next) {
4256                pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4257                        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4258                        nh->r_cfg.fc_ifindex);
4259        }
4260}
4261
4262static int ip6_route_info_append(struct net *net,
4263                                 struct list_head *rt6_nh_list,
4264                                 struct fib6_info *rt,
4265                                 struct fib6_config *r_cfg)
4266{
4267        struct rt6_nh *nh;
4268        int err = -EEXIST;
4269
4270        list_for_each_entry(nh, rt6_nh_list, next) {
4271                /* check if fib6_info already exists */
4272                if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4273                        return err;
4274        }
4275
4276        nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4277        if (!nh)
4278                return -ENOMEM;
4279        nh->fib6_info = rt;
4280        memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4281        list_add_tail(&nh->next, rt6_nh_list);
4282
4283        return 0;
4284}
4285
4286static void ip6_route_mpath_notify(struct fib6_info *rt,
4287                                   struct fib6_info *rt_last,
4288                                   struct nl_info *info,
4289                                   __u16 nlflags)
4290{
4291        /* if this is an APPEND route, then rt points to the first route
4292         * inserted and rt_last points to last route inserted. Userspace
4293         * wants a consistent dump of the route which starts at the first
4294         * nexthop. Since sibling routes are always added at the end of
4295         * the list, find the first sibling of the last route appended
4296         */
4297        if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4298                rt = list_first_entry(&rt_last->fib6_siblings,
4299                                      struct fib6_info,
4300                                      fib6_siblings);
4301        }
4302
4303        if (rt)
4304                inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4305}
4306
4307static int ip6_route_multipath_add(struct fib6_config *cfg,
4308                                   struct netlink_ext_ack *extack)
4309{
4310        struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4311        struct nl_info *info = &cfg->fc_nlinfo;
4312        struct fib6_config r_cfg;
4313        struct rtnexthop *rtnh;
4314        struct fib6_info *rt;
4315        struct rt6_nh *err_nh;
4316        struct rt6_nh *nh, *nh_safe;
4317        __u16 nlflags;
4318        int remaining;
4319        int attrlen;
4320        int err = 1;
4321        int nhn = 0;
4322        int replace = (cfg->fc_nlinfo.nlh &&
4323                       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4324        LIST_HEAD(rt6_nh_list);
4325
4326        nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4327        if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4328                nlflags |= NLM_F_APPEND;
4329
4330        remaining = cfg->fc_mp_len;
4331        rtnh = (struct rtnexthop *)cfg->fc_mp;
4332
4333        /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4334         * fib6_info structs per nexthop
4335         */
4336        while (rtnh_ok(rtnh, remaining)) {
4337                memcpy(&r_cfg, cfg, sizeof(*cfg));
4338                if (rtnh->rtnh_ifindex)
4339                        r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4340
4341                attrlen = rtnh_attrlen(rtnh);
4342                if (attrlen > 0) {
4343                        struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4344
4345                        nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4346                        if (nla) {
4347                                r_cfg.fc_gateway = nla_get_in6_addr(nla);
4348                                r_cfg.fc_flags |= RTF_GATEWAY;
4349                        }
4350                        r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4351                        nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4352                        if (nla)
4353                                r_cfg.fc_encap_type = nla_get_u16(nla);
4354                }
4355
4356                r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4357                rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4358                if (IS_ERR(rt)) {
4359                        err = PTR_ERR(rt);
4360                        rt = NULL;
4361                        goto cleanup;
4362                }
4363                if (!rt6_qualify_for_ecmp(rt)) {
4364                        err = -EINVAL;
4365                        NL_SET_ERR_MSG(extack,
4366                                       "Device only routes can not be added for IPv6 using the multipath API.");
4367                        fib6_info_release(rt);
4368                        goto cleanup;
4369                }
4370
4371                rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4372
4373                err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4374                                            rt, &r_cfg);
4375                if (err) {
4376                        fib6_info_release(rt);
4377                        goto cleanup;
4378                }
4379
4380                rtnh = rtnh_next(rtnh, &remaining);
4381        }
4382
4383        /* for add and replace send one notification with all nexthops.
4384         * Skip the notification in fib6_add_rt2node and send one with
4385         * the full route when done
4386         */
4387        info->skip_notify = 1;
4388
4389        err_nh = NULL;
4390        list_for_each_entry(nh, &rt6_nh_list, next) {
4391                err = __ip6_ins_rt(nh->fib6_info, info, extack);
4392                fib6_info_release(nh->fib6_info);
4393
4394                if (!err) {
4395                        /* save reference to last route successfully inserted */
4396                        rt_last = nh->fib6_info;
4397
4398                        /* save reference to first route for notification */
4399                        if (!rt_notif)
4400                                rt_notif = nh->fib6_info;
4401                }
4402
4403                /* nh->fib6_info is used or freed at this point, reset to NULL*/
4404                nh->fib6_info = NULL;
4405                if (err) {
4406                        if (replace && nhn)
4407                                ip6_print_replace_route_err(&rt6_nh_list);
4408                        err_nh = nh;
4409                        goto add_errout;
4410                }
4411
4412                /* Because each route is added like a single route we remove
4413                 * these flags after the first nexthop: if there is a collision,
4414                 * we have already failed to add the first nexthop:
4415                 * fib6_add_rt2node() has rejected it; when replacing, old
4416                 * nexthops have been replaced by first new, the rest should
4417                 * be added to it.
4418                 */
4419                cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4420                                                     NLM_F_REPLACE);
4421                nhn++;
4422        }
4423
4424        /* success ... tell user about new route */
4425        ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4426        goto cleanup;
4427
4428add_errout:
4429        /* send notification for routes that were added so that
4430         * the delete notifications sent by ip6_route_del are
4431         * coherent
4432         */
4433        if (rt_notif)
4434                ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4435
4436        /* Delete routes that were already added */
4437        list_for_each_entry(nh, &rt6_nh_list, next) {
4438                if (err_nh == nh)
4439                        break;
4440                ip6_route_del(&nh->r_cfg, extack);
4441        }
4442
4443cleanup:
4444        list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4445                if (nh->fib6_info)
4446                        fib6_info_release(nh->fib6_info);
4447                list_del(&nh->next);
4448                kfree(nh);
4449        }
4450
4451        return err;
4452}
4453
4454static int ip6_route_multipath_del(struct fib6_config *cfg,
4455                                   struct netlink_ext_ack *extack)
4456{
4457        struct fib6_config r_cfg;
4458        struct rtnexthop *rtnh;
4459        int remaining;
4460        int attrlen;
4461        int err = 1, last_err = 0;
4462
4463        remaining = cfg->fc_mp_len;
4464        rtnh = (struct rtnexthop *)cfg->fc_mp;
4465
4466        /* Parse a Multipath Entry */
4467        while (rtnh_ok(rtnh, remaining)) {
4468                memcpy(&r_cfg, cfg, sizeof(*cfg));
4469                if (rtnh->rtnh_ifindex)
4470                        r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4471
4472                attrlen = rtnh_attrlen(rtnh);
4473                if (attrlen > 0) {
4474                        struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4475
4476                        nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4477                        if (nla) {
4478                                nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4479                                r_cfg.fc_flags |= RTF_GATEWAY;
4480                        }
4481                }
4482                err = ip6_route_del(&r_cfg, extack);
4483                if (err)
4484                        last_err = err;
4485
4486                rtnh = rtnh_next(rtnh, &remaining);
4487        }
4488
4489        return last_err;
4490}
4491
4492static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4493                              struct netlink_ext_ack *extack)
4494{
4495        struct fib6_config cfg;
4496        int err;
4497
4498        err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4499        if (err < 0)
4500                return err;
4501
4502        if (cfg.fc_mp)
4503                return ip6_route_multipath_del(&cfg, extack);
4504        else {
4505                cfg.fc_delete_all_nh = 1;
4506                return ip6_route_del(&cfg, extack);
4507        }
4508}
4509
4510static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4511                              struct netlink_ext_ack *extack)
4512{
4513        struct fib6_config cfg;
4514        int err;
4515
4516        err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4517        if (err < 0)
4518                return err;
4519
4520        if (cfg.fc_mp)
4521                return ip6_route_multipath_add(&cfg, extack);
4522        else
4523                return ip6_route_add(&cfg, GFP_KERNEL, extack);
4524}
4525
4526static size_t rt6_nlmsg_size(struct fib6_info *rt)
4527{
4528        int nexthop_len = 0;
4529
4530        if (rt->fib6_nsiblings) {
4531                nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4532                            + NLA_ALIGN(sizeof(struct rtnexthop))
4533                            + nla_total_size(16) /* RTA_GATEWAY */
4534                            + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4535
4536                nexthop_len *= rt->fib6_nsiblings;
4537        }
4538
4539        return NLMSG_ALIGN(sizeof(struct rtmsg))
4540               + nla_total_size(16) /* RTA_SRC */
4541               + nla_total_size(16) /* RTA_DST */
4542               + nla_total_size(16) /* RTA_GATEWAY */
4543               + nla_total_size(16) /* RTA_PREFSRC */
4544               + nla_total_size(4) /* RTA_TABLE */
4545               + nla_total_size(4) /* RTA_IIF */
4546               + nla_total_size(4) /* RTA_OIF */
4547               + nla_total_size(4) /* RTA_PRIORITY */
4548               + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4549               + nla_total_size(sizeof(struct rta_cacheinfo))
4550               + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4551               + nla_total_size(1) /* RTA_PREF */
4552               + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4553               + nexthop_len;
4554}
4555
4556static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4557                            unsigned int *flags, bool skip_oif)
4558{
4559        if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4560                *flags |= RTNH_F_DEAD;
4561
4562        if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4563                *flags |= RTNH_F_LINKDOWN;
4564
4565                rcu_read_lock();
4566                if (fib6_ignore_linkdown(rt))
4567                        *flags |= RTNH_F_DEAD;
4568                rcu_read_unlock();
4569        }
4570
4571        if (rt->fib6_flags & RTF_GATEWAY) {
4572                if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4573                        goto nla_put_failure;
4574        }
4575
4576        *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4577        if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4578                *flags |= RTNH_F_OFFLOAD;
4579
4580        /* not needed for multipath encoding b/c it has a rtnexthop struct */
4581        if (!skip_oif && rt->fib6_nh.nh_dev &&
4582            nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4583                goto nla_put_failure;
4584
4585        if (rt->fib6_nh.nh_lwtstate &&
4586            lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4587                goto nla_put_failure;
4588
4589        return 0;
4590
4591nla_put_failure:
4592        return -EMSGSIZE;
4593}
4594
4595/* add multipath next hop */
4596static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4597{
4598        const struct net_device *dev = rt->fib6_nh.nh_dev;
4599        struct rtnexthop *rtnh;
4600        unsigned int flags = 0;
4601
4602        rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4603        if (!rtnh)
4604                goto nla_put_failure;
4605
4606        rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4607        rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4608
4609        if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4610                goto nla_put_failure;
4611
4612        rtnh->rtnh_flags = flags;
4613
4614        /* length of rtnetlink header + attributes */
4615        rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4616
4617        return 0;
4618
4619nla_put_failure:
4620        return -EMSGSIZE;
4621}
4622
4623static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4624                         struct fib6_info *rt, struct dst_entry *dst,
4625                         struct in6_addr *dest, struct in6_addr *src,
4626                         int iif, int type, u32 portid, u32 seq,
4627                         unsigned int flags)
4628{
4629        struct rt6_info *rt6 = (struct rt6_info *)dst;
4630        struct rt6key *rt6_dst, *rt6_src;
4631        u32 *pmetrics, table, rt6_flags;
4632        struct nlmsghdr *nlh;
4633        struct rtmsg *rtm;
4634        long expires = 0;
4635
4636        nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4637        if (!nlh)
4638                return -EMSGSIZE;
4639
4640        if (rt6) {
4641                rt6_dst = &rt6->rt6i_dst;
4642                rt6_src = &rt6->rt6i_src;
4643                rt6_flags = rt6->rt6i_flags;
4644        } else {
4645                rt6_dst = &rt->fib6_dst;
4646                rt6_src = &rt->fib6_src;
4647                rt6_flags = rt->fib6_flags;
4648        }
4649
4650        rtm = nlmsg_data(nlh);
4651        rtm->rtm_family = AF_INET6;
4652        rtm->rtm_dst_len = rt6_dst->plen;
4653        rtm->rtm_src_len = rt6_src->plen;
4654        rtm->rtm_tos = 0;
4655        if (rt->fib6_table)
4656                table = rt->fib6_table->tb6_id;
4657        else
4658                table = RT6_TABLE_UNSPEC;
4659        rtm->rtm_table = table;
4660        if (nla_put_u32(skb, RTA_TABLE, table))
4661                goto nla_put_failure;
4662
4663        rtm->rtm_type = rt->fib6_type;
4664        rtm->rtm_flags = 0;
4665        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4666        rtm->rtm_protocol = rt->fib6_protocol;
4667
4668        if (rt6_flags & RTF_CACHE)
4669                rtm->rtm_flags |= RTM_F_CLONED;
4670
4671        if (dest) {
4672                if (nla_put_in6_addr(skb, RTA_DST, dest))
4673                        goto nla_put_failure;
4674                rtm->rtm_dst_len = 128;
4675        } else if (rtm->rtm_dst_len)
4676                if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4677                        goto nla_put_failure;
4678#ifdef CONFIG_IPV6_SUBTREES
4679        if (src) {
4680                if (nla_put_in6_addr(skb, RTA_SRC, src))
4681                        goto nla_put_failure;
4682                rtm->rtm_src_len = 128;
4683        } else if (rtm->rtm_src_len &&
4684                   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4685                goto nla_put_failure;
4686#endif
4687        if (iif) {
4688#ifdef CONFIG_IPV6_MROUTE
4689                if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4690                        int err = ip6mr_get_route(net, skb, rtm, portid);
4691
4692                        if (err == 0)
4693                                return 0;
4694                        if (err < 0)
4695                                goto nla_put_failure;
4696                } else
4697#endif
4698                        if (nla_put_u32(skb, RTA_IIF, iif))
4699                                goto nla_put_failure;
4700        } else if (dest) {
4701                struct in6_addr saddr_buf;
4702                if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4703                    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4704                        goto nla_put_failure;
4705        }
4706
4707        if (rt->fib6_prefsrc.plen) {
4708                struct in6_addr saddr_buf;
4709                saddr_buf = rt->fib6_prefsrc.addr;
4710                if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4711                        goto nla_put_failure;
4712        }
4713
4714        pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4715        if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4716                goto nla_put_failure;
4717
4718        if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4719                goto nla_put_failure;
4720
4721        /* For multipath routes, walk the siblings list and add
4722         * each as a nexthop within RTA_MULTIPATH.
4723         */
4724        if (rt6) {
4725                if (rt6_flags & RTF_GATEWAY &&
4726                    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4727                        goto nla_put_failure;
4728
4729                if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4730                        goto nla_put_failure;
4731        } else if (rt->fib6_nsiblings) {
4732                struct fib6_info *sibling, *next_sibling;
4733                struct nlattr *mp;
4734
4735                mp = nla_nest_start(skb, RTA_MULTIPATH);
4736                if (!mp)
4737                        goto nla_put_failure;
4738
4739                if (rt6_add_nexthop(skb, rt) < 0)
4740                        goto nla_put_failure;
4741
4742                list_for_each_entry_safe(sibling, next_sibling,
4743                                         &rt->fib6_siblings, fib6_siblings) {
4744                        if (rt6_add_nexthop(skb, sibling) < 0)
4745                                goto nla_put_failure;
4746                }
4747
4748                nla_nest_end(skb, mp);
4749        } else {
4750                if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4751                        goto nla_put_failure;
4752        }
4753
4754        if (rt6_flags & RTF_EXPIRES) {
4755                expires = dst ? dst->expires : rt->expires;
4756                expires -= jiffies;
4757        }
4758
4759        if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4760                goto nla_put_failure;
4761
4762        if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4763                goto nla_put_failure;
4764
4765
4766        nlmsg_end(skb, nlh);
4767        return 0;
4768
4769nla_put_failure:
4770        nlmsg_cancel(skb, nlh);
4771        return -EMSGSIZE;
4772}
4773
4774static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4775                               const struct net_device *dev)
4776{
4777        if (f6i->fib6_nh.nh_dev == dev)
4778                return true;
4779
4780        if (f6i->fib6_nsiblings) {
4781                struct fib6_info *sibling, *next_sibling;
4782
4783                list_for_each_entry_safe(sibling, next_sibling,
4784                                         &f6i->fib6_siblings, fib6_siblings) {
4785                        if (sibling->fib6_nh.nh_dev == dev)
4786                                return true;
4787                }
4788        }
4789
4790        return false;
4791}
4792
4793int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4794{
4795        struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4796        struct fib_dump_filter *filter = &arg->filter;
4797        unsigned int flags = NLM_F_MULTI;
4798        struct net *net = arg->net;
4799
4800        if (rt == net->ipv6.fib6_null_entry)
4801                return 0;
4802
4803        if ((filter->flags & RTM_F_PREFIX) &&
4804            !(rt->fib6_flags & RTF_PREFIX_RT)) {
4805                /* success since this is not a prefix route */
4806                return 1;
4807        }
4808        if (filter->filter_set) {
4809                if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4810                    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4811                    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4812                        return 1;
4813                }
4814                flags |= NLM_F_DUMP_FILTERED;
4815        }
4816
4817        return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4818                             RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4819                             arg->cb->nlh->nlmsg_seq, flags);
4820}
4821
4822static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4823                              struct netlink_ext_ack *extack)
4824{
4825        struct net *net = sock_net(in_skb->sk);
4826        struct nlattr *tb[RTA_MAX+1];
4827        int err, iif = 0, oif = 0;
4828        struct fib6_info *from;
4829        struct dst_entry *dst;
4830        struct rt6_info *rt;
4831        struct sk_buff *skb;
4832        struct rtmsg *rtm;
4833        struct flowi6 fl6 = {};
4834        bool fibmatch;
4835
4836        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4837                          extack);
4838        if (err < 0)
4839                goto errout;
4840
4841        err = -EINVAL;
4842        rtm = nlmsg_data(nlh);
4843        fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4844        fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4845
4846        if (tb[RTA_SRC]) {
4847                if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4848                        goto errout;
4849
4850                fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4851        }
4852
4853        if (tb[RTA_DST]) {
4854                if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4855                        goto errout;
4856
4857                fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4858        }
4859
4860        if (tb[RTA_IIF])
4861                iif = nla_get_u32(tb[RTA_IIF]);
4862
4863        if (tb[RTA_OIF])
4864                oif = nla_get_u32(tb[RTA_OIF]);
4865
4866        if (tb[RTA_MARK])
4867                fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4868
4869        if (tb[RTA_UID])
4870                fl6.flowi6_uid = make_kuid(current_user_ns(),
4871                                           nla_get_u32(tb[RTA_UID]));
4872        else
4873                fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4874
4875        if (tb[RTA_SPORT])
4876                fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4877
4878        if (tb[RTA_DPORT])
4879                fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4880
4881        if (tb[RTA_IP_PROTO]) {
4882                err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4883                                                  &fl6.flowi6_proto, extack);
4884                if (err)
4885                        goto errout;
4886        }
4887
4888        if (iif) {
4889                struct net_device *dev;
4890                int flags = 0;
4891
4892                rcu_read_lock();
4893
4894                dev = dev_get_by_index_rcu(net, iif);
4895                if (!dev) {
4896                        rcu_read_unlock();
4897                        err = -ENODEV;
4898                        goto errout;
4899                }
4900
4901                fl6.flowi6_iif = iif;
4902
4903                if (!ipv6_addr_any(&fl6.saddr))
4904                        flags |= RT6_LOOKUP_F_HAS_SADDR;
4905
4906                dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4907
4908                rcu_read_unlock();
4909        } else {
4910                fl6.flowi6_oif = oif;
4911
4912                dst = ip6_route_output(net, NULL, &fl6);
4913        }
4914
4915
4916        rt = container_of(dst, struct rt6_info, dst);
4917        if (rt->dst.error) {
4918                err = rt->dst.error;
4919                ip6_rt_put(rt);
4920                goto errout;
4921        }
4922
4923        if (rt == net->ipv6.ip6_null_entry) {
4924                err = rt->dst.error;
4925                ip6_rt_put(rt);
4926                goto errout;
4927        }
4928
4929        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4930        if (!skb) {
4931                ip6_rt_put(rt);
4932                err = -ENOBUFS;
4933                goto errout;
4934        }
4935
4936        skb_dst_set(skb, &rt->dst);
4937
4938        rcu_read_lock();
4939        from = rcu_dereference(rt->from);
4940
4941        if (fibmatch)
4942                err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4943                                    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4944                                    nlh->nlmsg_seq, 0);
4945        else
4946                err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4947                                    &fl6.saddr, iif, RTM_NEWROUTE,
4948                                    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4949                                    0);
4950        rcu_read_unlock();
4951
4952        if (err < 0) {
4953                kfree_skb(skb);
4954                goto errout;
4955        }
4956
4957        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4958errout:
4959        return err;
4960}
4961
4962void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4963                     unsigned int nlm_flags)
4964{
4965        struct sk_buff *skb;
4966        struct net *net = info->nl_net;
4967        u32 seq;
4968        int err;
4969
4970        err = -ENOBUFS;
4971        seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4972
4973        skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4974        if (!skb)
4975                goto errout;
4976
4977        err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4978                            event, info->portid, seq, nlm_flags);
4979        if (err < 0) {
4980                /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4981                WARN_ON(err == -EMSGSIZE);
4982                kfree_skb(skb);
4983                goto errout;
4984        }
4985        rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4986                    info->nlh, gfp_any());
4987        return;
4988errout:
4989        if (err < 0)
4990                rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4991}
4992
4993static int ip6_route_dev_notify(struct notifier_block *this,
4994                                unsigned long event, void *ptr)
4995{
4996        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4997        struct net *net = dev_net(dev);
4998
4999        if (!(dev->flags & IFF_LOOPBACK))
5000                return NOTIFY_OK;
5001
5002        if (event == NETDEV_REGISTER) {
5003                net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5004                net->ipv6.ip6_null_entry->dst.dev = dev;
5005                net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5006#ifdef CONFIG_IPV6_MULTIPLE_TABLES
5007                net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5008                net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5009                net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5010                net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5011#endif
5012         } else if (event == NETDEV_UNREGISTER &&
5013                    dev->reg_state != NETREG_UNREGISTERED) {
5014                /* NETDEV_UNREGISTER could be fired for multiple times by
5015                 * netdev_wait_allrefs(). Make sure we only call this once.
5016                 */
5017                in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5018#ifdef CONFIG_IPV6_MULTIPLE_TABLES
5019                in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5020                in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5021#endif
5022        }
5023
5024        return NOTIFY_OK;
5025}
5026
5027/*
5028 *      /proc
5029 */
5030
5031#ifdef CONFIG_PROC_FS
5032static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5033{
5034        struct net *net = (struct net *)seq->private;
5035        seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5036                   net->ipv6.rt6_stats->fib_nodes,
5037                   net->ipv6.rt6_stats->fib_route_nodes,
5038                   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5039                   net->ipv6.rt6_stats->fib_rt_entries,
5040                   net->ipv6.rt6_stats->fib_rt_cache,
5041                   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5042                   net->ipv6.rt6_stats->fib_discarded_routes);
5043
5044        return 0;
5045}
5046#endif  /* CONFIG_PROC_FS */
5047
5048#ifdef CONFIG_SYSCTL
5049
5050static
5051int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5052                              void __user *buffer, size_t *lenp, loff_t *ppos)
5053{
5054        struct net *net;
5055        int delay;
5056        if (!write)
5057                return -EINVAL;
5058
5059        net = (struct net *)ctl->extra1;
5060        delay = net->ipv6.sysctl.flush_delay;
5061        proc_dointvec(ctl, write, buffer, lenp, ppos);
5062        fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5063        return 0;
5064}
5065
5066static int zero;
5067static int one = 1;
5068
5069static struct ctl_table ipv6_route_table_template[] = {
5070        {
5071                .procname       =       "flush",
5072                .data           =       &init_net.ipv6.sysctl.flush_delay,
5073                .maxlen         =       sizeof(int),
5074                .mode           =       0200,
5075                .proc_handler   =       ipv6_sysctl_rtcache_flush
5076        },
5077        {
5078                .procname       =       "gc_thresh",
5079                .data           =       &ip6_dst_ops_template.gc_thresh,
5080                .maxlen         =       sizeof(int),
5081                .mode           =       0644,
5082                .proc_handler   =       proc_dointvec,
5083        },
5084        {
5085                .procname       =       "max_size",
5086                .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
5087                .maxlen         =       sizeof(int),
5088                .mode           =       0644,
5089                .proc_handler   =       proc_dointvec,
5090        },
5091        {
5092                .procname       =       "gc_min_interval",
5093                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5094                .maxlen         =       sizeof(int),
5095                .mode           =       0644,
5096                .proc_handler   =       proc_dointvec_jiffies,
5097        },
5098        {
5099                .procname       =       "gc_timeout",
5100                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5101                .maxlen         =       sizeof(int),
5102                .mode           =       0644,
5103                .proc_handler   =       proc_dointvec_jiffies,
5104        },
5105        {
5106                .procname       =       "gc_interval",
5107                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5108                .maxlen         =       sizeof(int),
5109                .mode           =       0644,
5110                .proc_handler   =       proc_dointvec_jiffies,
5111        },
5112        {
5113                .procname       =       "gc_elasticity",
5114                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5115                .maxlen         =       sizeof(int),
5116                .mode           =       0644,
5117                .proc_handler   =       proc_dointvec,
5118        },
5119        {
5120                .procname       =       "mtu_expires",
5121                .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5122                .maxlen         =       sizeof(int),
5123                .mode           =       0644,
5124                .proc_handler   =       proc_dointvec_jiffies,
5125        },
5126        {
5127                .procname       =       "min_adv_mss",
5128                .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5129                .maxlen         =       sizeof(int),
5130                .mode           =       0644,
5131                .proc_handler   =       proc_dointvec,
5132        },
5133        {
5134                .procname       =       "gc_min_interval_ms",
5135                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5136                .maxlen         =       sizeof(int),
5137                .mode           =       0644,
5138                .proc_handler   =       proc_dointvec_ms_jiffies,
5139        },
5140        {
5141                .procname       =       "skip_notify_on_dev_down",
5142                .data           =       &init_net.ipv6.sysctl.skip_notify_on_dev_down,
5143                .maxlen         =       sizeof(int),
5144                .mode           =       0644,
5145                .proc_handler   =       proc_dointvec,
5146                .extra1         =       &zero,
5147                .extra2         =       &one,
5148        },
5149        { }
5150};
5151
5152struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5153{
5154        struct ctl_table *table;
5155
5156        table = kmemdup(ipv6_route_table_template,
5157                        sizeof(ipv6_route_table_template),
5158                        GFP_KERNEL);
5159
5160        if (table) {
5161                table[0].data = &net->ipv6.sysctl.flush_delay;
5162                table[0].extra1 = net;
5163                table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5164                table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5165                table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5166                table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5167                table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5168                table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5169                table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5170                table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5171                table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5172                table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5173
5174                /* Don't export sysctls to unprivileged users */
5175                if (net->user_ns != &init_user_ns)
5176                        table[0].procname = NULL;
5177        }
5178
5179        return table;
5180}
5181#endif
5182
5183static int __net_init ip6_route_net_init(struct net *net)
5184{
5185        int ret = -ENOMEM;
5186
5187        memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5188               sizeof(net->ipv6.ip6_dst_ops));
5189
5190        if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5191                goto out_ip6_dst_ops;
5192
5193        net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5194                                            sizeof(*net->ipv6.fib6_null_entry),
5195                                            GFP_KERNEL);
5196        if (!net->ipv6.fib6_null_entry)
5197                goto out_ip6_dst_entries;
5198
5199        net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5200                                           sizeof(*net->ipv6.ip6_null_entry),
5201                                           GFP_KERNEL);
5202        if (!net->ipv6.ip6_null_entry)
5203                goto out_fib6_null_entry;
5204        net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5205        dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5206                         ip6_template_metrics, true);
5207
5208#ifdef CONFIG_IPV6_MULTIPLE_TABLES
5209        net->ipv6.fib6_has_custom_rules = false;
5210        net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5211                                               sizeof(*net->ipv6.ip6_prohibit_entry),
5212                                               GFP_KERNEL);
5213        if (!net->ipv6.ip6_prohibit_entry)
5214                goto out_ip6_null_entry;
5215        net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5216        dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5217                         ip6_template_metrics, true);
5218
5219        net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5220                                               sizeof(*net->ipv6.ip6_blk_hole_entry),
5221                                               GFP_KERNEL);
5222        if (!net->ipv6.ip6_blk_hole_entry)
5223                goto out_ip6_prohibit_entry;
5224        net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5225        dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5226                         ip6_template_metrics, true);
5227#endif
5228
5229        net->ipv6.sysctl.flush_delay = 0;
5230        net->ipv6.sysctl.ip6_rt_max_size = 4096;
5231        net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5232        net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5233        net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5234        net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5235        net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5236        net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5237        net->ipv6.sysctl.skip_notify_on_dev_down = 0;
5238
5239        net->ipv6.ip6_rt_gc_expire = 30*HZ;
5240
5241        ret = 0;
5242out:
5243        return ret;
5244
5245#ifdef CONFIG_IPV6_MULTIPLE_TABLES
5246out_ip6_prohibit_entry:
5247        kfree(net->ipv6.ip6_prohibit_entry);
5248out_ip6_null_entry:
5249        kfree(net->ipv6.ip6_null_entry);
5250#endif
5251out_fib6_null_entry:
5252        kfree(net->ipv6.fib6_null_entry);
5253out_ip6_dst_entries:
5254        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5255out_ip6_dst_ops:
5256        goto out;
5257}
5258
5259static void __net_exit ip6_route_net_exit(struct net *net)
5260{
5261        kfree(net->ipv6.fib6_null_entry);
5262        kfree(net->ipv6.ip6_null_entry);
5263#ifdef CONFIG_IPV6_MULTIPLE_TABLES
5264        kfree(net->ipv6.ip6_prohibit_entry);
5265        kfree(net->ipv6.ip6_blk_hole_entry);
5266#endif
5267        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5268}
5269
5270static int __net_init ip6_route_net_init_late(struct net *net)
5271{
5272#ifdef CONFIG_PROC_FS
5273        proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5274                        sizeof(struct ipv6_route_iter));
5275        proc_create_net_single("rt6_stats", 0444, net->proc_net,
5276                        rt6_stats_seq_show, NULL);
5277#endif
5278        return 0;
5279}
5280
5281static void __net_exit ip6_route_net_exit_late(struct net *net)
5282{
5283#ifdef CONFIG_PROC_FS
5284        remove_proc_entry("ipv6_route", net->proc_net);
5285        remove_proc_entry("rt6_stats", net->proc_net);
5286#endif
5287}
5288
5289static struct pernet_operations ip6_route_net_ops = {
5290        .init = ip6_route_net_init,
5291        .exit = ip6_route_net_exit,
5292};
5293
5294static int __net_init ipv6_inetpeer_init(struct net *net)
5295{
5296        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5297
5298        if (!bp)
5299                return -ENOMEM;
5300        inet_peer_base_init(bp);
5301        net->ipv6.peers = bp;
5302        return 0;
5303}
5304
5305static void __net_exit ipv6_inetpeer_exit(struct net *net)
5306{
5307        struct inet_peer_base *bp = net->ipv6.peers;
5308
5309        net->ipv6.peers = NULL;
5310        inetpeer_invalidate_tree(bp);
5311        kfree(bp);
5312}
5313
5314static struct pernet_operations ipv6_inetpeer_ops = {
5315        .init   =       ipv6_inetpeer_init,
5316        .exit   =       ipv6_inetpeer_exit,
5317};
5318
5319static struct pernet_operations ip6_route_net_late_ops = {
5320        .init = ip6_route_net_init_late,
5321        .exit = ip6_route_net_exit_late,
5322};
5323
5324static struct notifier_block ip6_route_dev_notifier = {
5325        .notifier_call = ip6_route_dev_notify,
5326        .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5327};
5328
5329void __init ip6_route_init_special_entries(void)
5330{
5331        /* Registering of the loopback is done before this portion of code,
5332         * the loopback reference in rt6_info will not be taken, do it
5333         * manually for init_net */
5334        init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5335        init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5336        init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5337  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5338        init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5339        init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5340        init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5341        init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5342  #endif
5343}
5344
5345int __init ip6_route_init(void)
5346{
5347        int ret;
5348        int cpu;
5349
5350        ret = -ENOMEM;
5351        ip6_dst_ops_template.kmem_cachep =
5352                kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5353                                  SLAB_HWCACHE_ALIGN, NULL);
5354        if (!ip6_dst_ops_template.kmem_cachep)
5355                goto out;
5356
5357        ret = dst_entries_init(&ip6_dst_blackhole_ops);
5358        if (ret)
5359                goto out_kmem_cache;
5360
5361        ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5362        if (ret)
5363                goto out_dst_entries;
5364
5365        ret = register_pernet_subsys(&ip6_route_net_ops);
5366        if (ret)
5367                goto out_register_inetpeer;
5368
5369        ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5370
5371        ret = fib6_init();
5372        if (ret)
5373                goto out_register_subsys;
5374
5375        ret = xfrm6_init();
5376        if (ret)
5377                goto out_fib6_init;
5378
5379        ret = fib6_rules_init();
5380        if (ret)
5381                goto xfrm6_init;
5382
5383        ret = register_pernet_subsys(&ip6_route_net_late_ops);
5384        if (ret)
5385                goto fib6_rules_init;
5386
5387        ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5388                                   inet6_rtm_newroute, NULL, 0);
5389        if (ret < 0)
5390                goto out_register_late_subsys;
5391
5392        ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5393                                   inet6_rtm_delroute, NULL, 0);
5394        if (ret < 0)
5395                goto out_register_late_subsys;
5396
5397        ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5398                                   inet6_rtm_getroute, NULL,
5399                                   RTNL_FLAG_DOIT_UNLOCKED);
5400        if (ret < 0)
5401                goto out_register_late_subsys;
5402
5403        ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5404        if (ret)
5405                goto out_register_late_subsys;
5406
5407        for_each_possible_cpu(cpu) {
5408                struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5409
5410                INIT_LIST_HEAD(&ul->head);
5411                spin_lock_init(&ul->lock);
5412        }
5413
5414out:
5415        return ret;
5416
5417out_register_late_subsys:
5418        rtnl_unregister_all(PF_INET6);
5419        unregister_pernet_subsys(&ip6_route_net_late_ops);
5420fib6_rules_init:
5421        fib6_rules_cleanup();
5422xfrm6_init:
5423        xfrm6_fini();
5424out_fib6_init:
5425        fib6_gc_cleanup();
5426out_register_subsys:
5427        unregister_pernet_subsys(&ip6_route_net_ops);
5428out_register_inetpeer:
5429        unregister_pernet_subsys(&ipv6_inetpeer_ops);
5430out_dst_entries:
5431        dst_entries_destroy(&ip6_dst_blackhole_ops);
5432out_kmem_cache:
5433        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5434        goto out;
5435}
5436
5437void ip6_route_cleanup(void)
5438{
5439        unregister_netdevice_notifier(&ip6_route_dev_notifier);
5440        unregister_pernet_subsys(&ip6_route_net_late_ops);
5441        fib6_rules_cleanup();
5442        xfrm6_fini();
5443        fib6_gc_cleanup();
5444        unregister_pernet_subsys(&ipv6_inetpeer_ops);
5445        unregister_pernet_subsys(&ip6_route_net_ops);
5446        dst_entries_destroy(&ip6_dst_blackhole_ops);
5447        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5448}
5449