linux/net/ipv4/route.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/memblock.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;

/*
 *      Interface to generic destination cache.
 */

INDIRECT_CALLABLE_SCOPE
struct dst_entry        *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
INDIRECT_CALLABLE_SCOPE
unsigned int            ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu,
                                           bool confirm_neigh);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
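
/* This table is indexed by the IPv4 TOS bits shifted right by one,
 * typically via rt_tos2priority() from <net/route.h>: e.g. a TOS of 0x10
 * (IPTOS_LOWDELAY) yields index 8 and thus TC_PRIO_INTERACTIVE. The
 * ECN_OR_COST() alias marks the odd slots, where the historic
 * "minimize cost" TOS bit (since reused by ECN) is set.
 */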

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};
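
/* The IPv4 routing cache this file once maintained is long gone;
 * rt_cache_seq_show() above emits only the legacy header line, so existing
 * userspace that reads /proc/net/rt_cache keeps working against an empty
 * table.
 */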

static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        (*pos)++;
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create_seq("rt_cache", 0444, net->proc_net,
                              &rt_cache_seq_ops);
        if (!pde)
                goto err1;

        pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat,
                              &rt_cpu_seq_ops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create_single("rt_acct", 0, net->proc_net,
                        rt_acct_proc_show);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        struct neighbour *n;

        rcu_read_lock_bh();

        if (likely(rt->rt_gw_family == AF_INET)) {
                n = ip_neigh_gw4(dev, rt->rt_gw4);
        } else if (rt->rt_gw_family == AF_INET6) {
                n = ip_neigh_gw6(dev, &rt->rt_gw6);
        } else {
                __be32 pkey;

                pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
                n = ip_neigh_gw4(dev, pkey);
        }

        if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
                n = NULL;

        rcu_read_unlock_bh();

        return n;
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;

        if (rt->rt_gw_family == AF_INET) {
                pkey = (const __be32 *)&rt->rt_gw4;
        } else if (rt->rt_gw_family == AF_INET6) {
                return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
        } else if (!daddr ||
                 (rt->rt_flags &
                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
                return;
        }
        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

/* Hash tables of size 2048..262144 depending on RAM size.
 * Each bucket uses 8 bytes.
 */
static u32 ip_idents_mask __read_mostly;
static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;
/* In order to protect privacy, we add a random perturbation to an
 * identifier sequence whenever its generator has been idle for a while.
 * This makes it hard for an attacker to infer how many packets were
 * sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 bucket, old, now = (u32)jiffies;
        atomic_t *p_id;
        u32 *p_tstamp;
        u32 delta = 0;

        bucket = hash & ip_idents_mask;
        p_tstamp = ip_tstamps + bucket;
        p_id = ip_idents + bucket;
        old = READ_ONCE(*p_tstamp);

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* If UBSAN reports an error here, first make sure your compiler
         * supports -fno-strict-overflow: the report was a UBSAN bug,
         * fixed in GCC 8.
         */
        return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
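
/* Worked example of the perturbation above: if a bucket was last touched
 * 100 jiffies ago, delta is drawn uniformly from [0, 100), so an observer
 * sampling IDs from a rarely used bucket cannot tell how much of the gap
 * between two IDs is real traffic and how much is noise.
 */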

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        u32 hash, id;

        /* Note: the lazy key initialization below is racy, but that is
         * acceptable here.
         */
        if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
                get_random_bytes(&net->ipv4.ip_id_key,
                                 sizeof(net->ipv4.ip_id_key));

        hash = siphash_3u32((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            &net->ipv4.ip_id_key);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
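
/* The bucket is chosen by a siphash of (daddr, saddr, protocol) under a
 * per-netns secret key, so all flows sharing that triple draw from the
 * same ID sequence while off-path hosts, not knowing ip_id_key, cannot
 * predict which bucket a given flow uses.
 */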

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static u32 fnhe_hashfun(__be32 daddr)
{
        static siphash_key_t fnhe_hash_key __read_mostly;
        u64 hval;

        net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
        hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
        return hash_64(hval, FNHE_HASH_SHIFT);
}
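
/* Using a keyed siphash here (rather than an unkeyed hash of the address)
 * keeps a remote sender from deliberately colliding many destinations into
 * one exception chain and forcing the FNHE_RECLAIM_DEPTH eviction path in
 * update_or_create_fnhe() below.
 */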

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_uses_gateway = 1;
                rt->rt_gw_family = AF_INET;
                rt->rt_gw4 = fnhe->fnhe_gw;
        }
}

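/* Record or refresh a nexthop exception for daddr: a redirect gateway
 * (gw != 0), a learned PMTU (pmtu != 0), or both. Entries live in a small
 * per-nexthop hash; rather than letting a chain grow past
 * FNHE_RECLAIM_DEPTH, the oldest entry in the bucket is recycled.
 */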
static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
                                  __be32 gw, u32 pmtu, bool lock,
                                  unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        u32 genid, hval;
        unsigned int i;
        int depth;

        genid = fnhe_genid(dev_net(nhc->nhc_dev));
        hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nhc->nhc_exceptions);
        if (!hash) {
                hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nhc->nhc_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (fnhe->fnhe_genid != genid)
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_mtu_locked = lock;
                }
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = genid;
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_mtu_locked = lock;
                fnhe->fnhe_expires = max(1UL, expires);

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so that anyone caching them rechecks whether this
                 * exception applies.
                 */
                rt = rcu_dereference(nhc->nhc_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;

                        prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh_common *nhc;

                                fib_select_path(net, &res, fl4, skb);
                                nhc = FIB_RES_NHC(res);
                                update_or_create_fnhe(nhc, fl4->daddr, new_gw,
                                                0, false,
                                                jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct net *net = dev_net(skb->dev);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent with
 *         exponential backoff; after that we stop sending them altogether,
 *         assuming that the host ignores our redirects.
 *      2. If we have seen no packets requiring redirects for
 *         ip_rt_redirect_silence, we assume that the host has forgotten
 *         the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE: do not forget to inhibit load limiting for redirects (it would be
 * redundant) and for "fragmentation needed" (it breaks PMTU discovery)
 * in icmp.c.
 */
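
/* A sketch with the defaults above: ip_rt_redirect_load is HZ/50, so the
 * n-th redirect to a peer becomes eligible only after
 * rate_last + (HZ/50 << n), i.e. the delay doubles from about 20 ms
 * upward; after ip_rt_redirect_number (9) unanswered redirects we stay
 * silent until ip_rt_redirect_silence (about 20 s) has passed.
 */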

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
                peer->rate_tokens = 0;
                peer->n_redirects = 0;
        }

        /* Too many ignored redirects; do not send anything.
         * Set rate_last to the time of the last seen redirected packet.
         */
        if (peer->n_redirects >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->n_redirects == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->n_redirects)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->n_redirects == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        if (netif_is_l3_master(skb->dev)) {
                dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
                if (!dev)
                        goto out;
        }

        in_dev = __in_dev_get_rcu(dev);

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}
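
/* The limiter above is a plain token bucket: tokens accrue one per jiffy
 * up to ip_rt_error_burst (5 * HZ), and each ICMP error sent costs
 * ip_rt_error_cost (HZ), which works out to a sustained rate of about one
 * error per second per peer, with bursts of up to five.
 */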

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct net *net = dev_net(dst->dev);
        struct fib_result res;
        bool lock = false;
        u32 old_mtu;

        if (ip_mtu_locked(dst))
                return;

        old_mtu = ipv4_mtu(dst);
        if (old_mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu) {
                lock = true;
                mtu = min(old_mtu, ip_rt_min_pmtu);
        }

        if (rt->rt_pmtu == mtu && !lock &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(net, fl4, &res, 0) == 0) {
                struct fib_nh_common *nhc;

                fib_select_path(net, &res, fl4, NULL);
                nhc = FIB_RES_NHC(res);
                update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}
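
/* Note the clamp above: a reported PMTU below ip_rt_min_pmtu (512 + 20 +
 * 20 = 552 by default) is raised to min(old_mtu, ip_rt_min_pmtu) and the
 * resulting exception is created locked, so a forged "fragmentation
 * needed" cannot drive the effective path MTU arbitrarily low.
 */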

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu,
                              bool confirm_neigh)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);

        /* Don't make lookup fail for bridged encapsulations */
        if (skb && netif_is_any_bridge_port(skb->dev))
                fl4.flowi4_oif = 0;

        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        u32 mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
                                                         u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD.
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}
EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);

static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
        struct ip_options opt;
        int res;

        /* Recompile ip options since IPCB may not be valid anymore.
         * Also check we have a reasonable ipv4 header.
         */
        if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
            ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
                return;

        memset(&opt, 0, sizeof(opt));
        if (ip_hdr(skb)->ihl > 5) {
                if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
                        return;
                opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

                rcu_read_lock();
                res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
                rcu_read_unlock();

                if (res)
                        return;
        }
        __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        ipv4_send_dest_unreach(skb);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by the IP RR, TS and SRR options,
 * so it stays out of the fast path.
 *
 * Also remember: "addr" may be unaligned when it comes from
 * IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct iphdr *iph = ip_hdr(skb);
                struct flowi4 fl4 = {
                        .daddr = iph->daddr,
                        .saddr = iph->saddr,
                        .flowi4_tos = RT_TOS(iph->tos),
                        .flowi4_oif = rt->dst.dev->ifindex,
                        .flowi4_iif = skb->dev->ifindex,
                        .flowi4_mark = skb->mark,
                };

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
                                    ip_rt_min_advmss);

        return min(advmss, IPV4_MAX_PMTU - header_size);
}

INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *)dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                goto out;

        mtu = READ_ONCE(dst->dev->mtu);

        if (unlikely(ip_mtu_locked(dst))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

out:
        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
EXPORT_INDIRECT_CALLABLE(ipv4_mtu);
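
/* MTU precedence in ipv4_mtu(): an unexpired rt_pmtu (nexthop exception)
 * wins, then the RTAX_MTU route metric, then the egress device MTU, which
 * on locked routes that use a gateway is further capped at the classic
 * 576 bytes; the result is bounded by IP_MAX_MTU and shrunk by any
 * lwtunnel encapsulation headroom.
 */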

static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference_protected(nhc->nhc_exceptions,
                                         lockdep_is_held(&fnhe_lock));
        hash += hval;

        fnhe_p = &hash->chain;
        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
        while (fnhe) {
                if (fnhe->fnhe_daddr == daddr) {
                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
                        /* set fnhe_daddr to 0 to ensure it won't bind with
                         * new dsts in rt_bind_exception().
                         */
                        fnhe->fnhe_daddr = 0;
                        fnhe_flush_routes(fnhe);
                        kfree_rcu(fnhe, rcu);
                        break;
                }
                fnhe_p = &fnhe->fnhe_next;
                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
                                                 lockdep_is_held(&fnhe_lock));
        }

        spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
                                               __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr) {
                        if (fnhe->fnhe_expires &&
                            time_after(jiffies, fnhe->fnhe_expires)) {
                                ip_del_fnhe(nhc, daddr);
                                break;
                        }
                        return fnhe;
                }
        }
        return NULL;
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
        struct fib_nh_common *nhc = res->nhc;
        struct net_device *dev = nhc->nhc_dev;
        struct fib_info *fi = res->fi;
        u32 mtu = 0;

        if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
            fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
                mtu = fi->fib_mtu;

        if (likely(!mtu)) {
                struct fib_nh_exception *fnhe;

                fnhe = find_exception(nhc, daddr);
                if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
                        mtu = fnhe->fnhe_pmtu;
        }

        if (likely(!mtu))
                mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

        return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr, const bool do_cache)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe->fnhe_mtu_locked = false;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gw4) {
                        rt->rt_gw4 = daddr;
                        rt->rt_gw_family = AF_INET;
                }

                if (do_cache) {
                        dst_hold(&rt->dst);
                        rcu_assign_pointer(*porig, rt);
                        if (orig) {
                                dst_dev_put(&orig->dst);
                                dst_release(&orig->dst);
                        }
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nhc->nhc_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
        }
        orig = *p;

        /* hold dst before doing cmpxchg() to avoid race condition
         * on this dst
         */
        dst_hold(&rt->dst);
        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig) {
                        rt_add_uncached_list(orig);
                        dst_release(&orig->dst);
                }
        } else {
                dst_release(&rt->dst);
                ret = false;
        }

        return ret;
}
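
/* Publication in rt_cache_route() is lockless: a reference is taken before
 * the cmpxchg(), dropped again if another CPU won the race, and a
 * displaced old route is moved to the uncached list so rt_flush_dev() can
 * still find and retire it.
 */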

struct uncached_list {
        spinlock_t              lock;
        struct list_head        head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

void rt_add_uncached_list(struct rtable *rt)
{
        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

        rt->rt_uncached_list = ul;

        spin_lock_bh(&ul->lock);
        list_add_tail(&rt->rt_uncached, &ul->head);
        spin_unlock_bh(&ul->lock);
}

void rt_del_uncached_list(struct rtable *rt)
{
        if (!list_empty(&rt->rt_uncached)) {
                struct uncached_list *ul = rt->rt_uncached_list;

                spin_lock_bh(&ul->lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&ul->lock);
        }
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;

        ip_dst_metrics_put(dst);
        rt_del_uncached_list(rt);
}

void rt_flush_dev(struct net_device *dev)
{
        struct rtable *rt;
        int cpu;

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                spin_lock_bh(&ul->lock);
                list_for_each_entry(rt, &ul->head, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = blackhole_netdev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&ul->lock);
        }
}
1552
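    /* A cached route is only reusable while it still carries the
     * DST_OBSOLETE_FORCE_CHK marker it was allocated with and its
     * generation id matches the namespace's (see rt_is_expired()).
     */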
1553static bool rt_cache_valid(const struct rtable *rt)
1554{
1555        return  rt &&
1556                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1557                !rt_is_expired(rt);
1558}
1559
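    /* Copy the nexthop attributes (gateway, metrics, classid, lwtunnel
     * state) into the freshly built route, then try to cache it in the
     * matching nexthop exception or in the nexthop itself; if caching
     * fails, the route goes on the uncached list instead.
     */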
1560static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1561                           const struct fib_result *res,
1562                           struct fib_nh_exception *fnhe,
1563                           struct fib_info *fi, u16 type, u32 itag,
1564                           const bool do_cache)
1565{
1566        bool cached = false;
1567
1568        if (fi) {
1569                struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1570
1571                if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1572                        rt->rt_uses_gateway = 1;
1573                        rt->rt_gw_family = nhc->nhc_gw_family;
1574                        /* only INET and INET6 are supported */
1575                        if (likely(nhc->nhc_gw_family == AF_INET))
1576                                rt->rt_gw4 = nhc->nhc_gw.ipv4;
1577                        else
1578                                rt->rt_gw6 = nhc->nhc_gw.ipv6;
1579                }
1580
1581                ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1582
1583#ifdef CONFIG_IP_ROUTE_CLASSID
1584                if (nhc->nhc_family == AF_INET) {
1585                        struct fib_nh *nh;
1586
1587                        nh = container_of(nhc, struct fib_nh, nh_common);
1588                        rt->dst.tclassid = nh->nh_tclassid;
1589                }
1590#endif
1591                rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1592                if (unlikely(fnhe))
1593                        cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1594                else if (do_cache)
1595                        cached = rt_cache_route(nhc, rt);
1596                if (unlikely(!cached)) {
1597                        /* Routes we intend to cache in the nexthop exception
1598                         * or FIB nexthop have the DST_NOCACHE bit clear.
1599                         * However, if we are unsuccessful at storing this
1600                         * route into the cache, we really need to set it.
1601                         */
1602                        if (!rt->rt_gw4) {
1603                                rt->rt_gw_family = AF_INET;
1604                                rt->rt_gw4 = daddr;
1605                        }
1606                        rt_add_uncached_list(rt);
1607                }
1608        } else
1609                rt_add_uncached_list(rt);
1610
1611#ifdef CONFIG_IP_ROUTE_CLASSID
1612#ifdef CONFIG_IP_MULTIPLE_TABLES
1613        set_class_tag(rt, res->tclassid);
1614#endif
1615        set_class_tag(rt, itag);
1616#endif
1617}
1618
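    /* Allocate and minimally initialize an IPv4 rtable.  The dst is
     * created with DST_OBSOLETE_FORCE_CHK so that cached copies are
     * re-validated through rt_cache_valid() before reuse.
     *
     * Illustrative sketch only (not lifted from a real caller): building
     * a local-delivery route might look like
     *
     *        rt = rt_dst_alloc(net->loopback_dev, RTCF_LOCAL, RTN_LOCAL,
     *                          false, false);
     *        if (!rt)
     *                return -ENOBUFS;
     *
     * mirroring the local_input path of ip_route_input_slow().
     */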
1619struct rtable *rt_dst_alloc(struct net_device *dev,
1620                            unsigned int flags, u16 type,
1621                            bool nopolicy, bool noxfrm)
1622{
1623        struct rtable *rt;
1624
1625        rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1626                       (nopolicy ? DST_NOPOLICY : 0) |
1627                       (noxfrm ? DST_NOXFRM : 0));
1628
1629        if (rt) {
1630                rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1631                rt->rt_flags = flags;
1632                rt->rt_type = type;
1633                rt->rt_is_input = 0;
1634                rt->rt_iif = 0;
1635                rt->rt_pmtu = 0;
1636                rt->rt_mtu_locked = 0;
1637                rt->rt_uses_gateway = 0;
1638                rt->rt_gw_family = 0;
1639                rt->rt_gw4 = 0;
1640                INIT_LIST_HEAD(&rt->rt_uncached);
1641
1642                rt->dst.output = ip_output;
1643                if (flags & RTCF_LOCAL)
1644                        rt->dst.input = ip_local_deliver;
1645        }
1646
1647        return rt;
1648}
1649EXPORT_SYMBOL(rt_dst_alloc);
1650
1651struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1652{
1653        struct rtable *new_rt;
1654
1655        new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1656                           rt->dst.flags);
1657
1658        if (new_rt) {
1659                new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1660                new_rt->rt_flags = rt->rt_flags;
1661                new_rt->rt_type = rt->rt_type;
1662                new_rt->rt_is_input = rt->rt_is_input;
1663                new_rt->rt_iif = rt->rt_iif;
1664                new_rt->rt_pmtu = rt->rt_pmtu;
1665                new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1666                new_rt->rt_gw_family = rt->rt_gw_family;
1667                if (rt->rt_gw_family == AF_INET)
1668                        new_rt->rt_gw4 = rt->rt_gw4;
1669                else if (rt->rt_gw_family == AF_INET6)
1670                        new_rt->rt_gw6 = rt->rt_gw6;
1671                INIT_LIST_HEAD(&new_rt->rt_uncached);
1672
1673                new_rt->dst.input = rt->dst.input;
1674                new_rt->dst.output = rt->dst.output;
1675                new_rt->dst.error = rt->dst.error;
1676                new_rt->dst.lastuse = jiffies;
1677                new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1678        }
1679        return new_rt;
1680}
1681EXPORT_SYMBOL(rt_dst_clone);
1682
1683/* called in rcu_read_lock() section */
1684int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1685                          u8 tos, struct net_device *dev,
1686                          struct in_device *in_dev, u32 *itag)
1687{
1688        int err;
1689
1690        /* Primary sanity checks. */
1691        if (!in_dev)
1692                return -EINVAL;
1693
1694        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1695            skb->protocol != htons(ETH_P_IP))
1696                return -EINVAL;
1697
1698        if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1699                return -EINVAL;
1700
1701        if (ipv4_is_zeronet(saddr)) {
1702                if (!ipv4_is_local_multicast(daddr) &&
1703                    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1704                        return -EINVAL;
1705        } else {
1706                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1707                                          in_dev, itag);
1708                if (err < 0)
1709                        return err;
1710        }
1711        return 0;
1712}
1713
1714/* called in rcu_read_lock() section */
1715static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1716                             u8 tos, struct net_device *dev, int our)
1717{
1718        struct in_device *in_dev = __in_dev_get_rcu(dev);
1719        unsigned int flags = RTCF_MULTICAST;
1720        struct rtable *rth;
1721        u32 itag = 0;
1722        int err;
1723
1724        err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1725        if (err)
1726                return err;
1727
1728        if (our)
1729                flags |= RTCF_LOCAL;
1730
1731        rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1732                           IN_DEV_ORCONF(in_dev, NOPOLICY), false);
1733        if (!rth)
1734                return -ENOBUFS;
1735
1736#ifdef CONFIG_IP_ROUTE_CLASSID
1737        rth->dst.tclassid = itag;
1738#endif
1739        rth->dst.output = ip_rt_bug;
1740        rth->rt_is_input = 1;
1741
1742#ifdef CONFIG_IP_MROUTE
1743        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1744                rth->dst.input = ip_mr_input;
1745#endif
1746        RT_CACHE_STAT_INC(in_slow_mc);
1747
1748        skb_dst_set(skb, &rth->dst);
1749        return 0;
1750}
1751
1752
1753static void ip_handle_martian_source(struct net_device *dev,
1754                                     struct in_device *in_dev,
1755                                     struct sk_buff *skb,
1756                                     __be32 daddr,
1757                                     __be32 saddr)
1758{
1759        RT_CACHE_STAT_INC(in_martian_src);
1760#ifdef CONFIG_IP_ROUTE_VERBOSE
1761        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1762                /*
1763                 *      RFC 1812 recommendation: if the source is martian,
1764                 *      the only hint is the MAC header.
1765                 */
1766                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1767                        &daddr, &saddr, dev->name);
1768                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1769                        print_hex_dump(KERN_WARNING, "ll header: ",
1770                                       DUMP_PREFIX_OFFSET, 16, 1,
1771                                       skb_mac_header(skb),
1772                                       dev->hard_header_len, false);
1773                }
1774        }
1775#endif
1776}
1777
1778/* called in rcu_read_lock() section */
1779static int __mkroute_input(struct sk_buff *skb,
1780                           const struct fib_result *res,
1781                           struct in_device *in_dev,
1782                           __be32 daddr, __be32 saddr, u32 tos)
1783{
1784        struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1785        struct net_device *dev = nhc->nhc_dev;
1786        struct fib_nh_exception *fnhe;
1787        struct rtable *rth;
1788        int err;
1789        struct in_device *out_dev;
1790        bool do_cache;
1791        u32 itag = 0;
1792
1793        /* get a working reference to the output device */
1794        out_dev = __in_dev_get_rcu(dev);
1795        if (!out_dev) {
1796                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1797                return -EINVAL;
1798        }
1799
1800        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1801                                  in_dev->dev, in_dev, &itag);
1802        if (err < 0) {
1803                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1804                                         saddr);
1805
1806                goto cleanup;
1807        }
1808
1809        do_cache = res->fi && !itag;
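            /* If the packet would be forwarded back out the interface it
             * arrived on, and the sender could have reached the next hop
             * directly (shared media or on-link gateway), mark it so that
             * ip_forward() will send an ICMP redirect.
             */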
1810        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1811            skb->protocol == htons(ETH_P_IP)) {
1812                __be32 gw;
1813
1814                gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1815                if (IN_DEV_SHARED_MEDIA(out_dev) ||
1816                    inet_addr_onlink(out_dev, saddr, gw))
1817                        IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1818        }
1819
1820        if (skb->protocol != htons(ETH_P_IP)) {
1821                /* Not IP (i.e. ARP). Do not create a route if it is
1822                 * invalid for proxy arp. DNAT routes are always valid.
1823                 *
1824                 * The proxy arp feature has been extended to allow ARP
1825                 * replies back out the same interface, to support
1826                 * Private VLAN switch technologies. See arp.c.
1827                 */
1828                if (out_dev == in_dev &&
1829                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1830                        err = -EINVAL;
1831                        goto cleanup;
1832                }
1833        }
1834
1835        fnhe = find_exception(nhc, daddr);
1836        if (do_cache) {
1837                if (fnhe)
1838                        rth = rcu_dereference(fnhe->fnhe_rth_input);
1839                else
1840                        rth = rcu_dereference(nhc->nhc_rth_input);
1841                if (rt_cache_valid(rth)) {
1842                        skb_dst_set_noref(skb, &rth->dst);
1843                        goto out;
1844                }
1845        }
1846
1847        rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1848                           IN_DEV_ORCONF(in_dev, NOPOLICY),
1849                           IN_DEV_ORCONF(out_dev, NOXFRM));
1850        if (!rth) {
1851                err = -ENOBUFS;
1852                goto cleanup;
1853        }
1854
1855        rth->rt_is_input = 1;
1856        RT_CACHE_STAT_INC(in_slow_tot);
1857
1858        rth->dst.input = ip_forward;
1859
1860        rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1861                       do_cache);
1862        lwtunnel_set_redirect(&rth->dst);
1863        skb_dst_set(skb, &rth->dst);
1864out:
1865        err = 0;
1866 cleanup:
1867        return err;
1868}
1869
1870#ifdef CONFIG_IP_ROUTE_MULTIPATH
1871/* To make ICMP packets follow the right flow, the multipath hash is
1872 * calculated from the inner IP addresses.
1873 */
1874static void ip_multipath_l3_keys(const struct sk_buff *skb,
1875                                 struct flow_keys *hash_keys)
1876{
1877        const struct iphdr *outer_iph = ip_hdr(skb);
1878        const struct iphdr *key_iph = outer_iph;
1879        const struct iphdr *inner_iph;
1880        const struct icmphdr *icmph;
1881        struct iphdr _inner_iph;
1882        struct icmphdr _icmph;
1883
1884        if (likely(outer_iph->protocol != IPPROTO_ICMP))
1885                goto out;
1886
1887        if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1888                goto out;
1889
1890        icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1891                                   &_icmph);
1892        if (!icmph)
1893                goto out;
1894
1895        if (!icmp_is_err(icmph->type))
1896                goto out;
1897
1898        inner_iph = skb_header_pointer(skb,
1899                                       outer_iph->ihl * 4 + sizeof(_icmph),
1900                                       sizeof(_inner_iph), &_inner_iph);
1901        if (!inner_iph)
1902                goto out;
1903
1904        key_iph = inner_iph;
1905out:
1906        hash_keys->addrs.v4addrs.src = key_iph->saddr;
1907        hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1908}
1909
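    /* The "custom" policy hashes only the header fields selected by the
     * net.ipv4.fib_multipath_hash_fields bitmask; outer-header bits are
     * consumed here, inner-header bits in the _inner() helper below.
     * For example (bit values per Documentation/networking/ip-sysctl.rst):
     *
     *        # hash on outer source/destination IP, protocol and ports
     *        sysctl -w net.ipv4.fib_multipath_hash_fields=0x0037
     */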
1910static u32 fib_multipath_custom_hash_outer(const struct net *net,
1911                                           const struct sk_buff *skb,
1912                                           bool *p_has_inner)
1913{
1914        u32 hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields;
1915        struct flow_keys keys, hash_keys;
1916
1917        if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
1918                return 0;
1919
1920        memset(&hash_keys, 0, sizeof(hash_keys));
1921        skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);
1922
1923        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1924        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
1925                hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1926        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
1927                hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1928        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
1929                hash_keys.basic.ip_proto = keys.basic.ip_proto;
1930        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
1931                hash_keys.ports.src = keys.ports.src;
1932        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
1933                hash_keys.ports.dst = keys.ports.dst;
1934
1935        *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
1936        return flow_hash_from_keys(&hash_keys);
1937}
1938
1939static u32 fib_multipath_custom_hash_inner(const struct net *net,
1940                                           const struct sk_buff *skb,
1941                                           bool has_inner)
1942{
1943        u32 hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields;
1944        struct flow_keys keys, hash_keys;
1945
1946        /* We assume the packet carries an encapsulation, but if none was
1947         * encountered during dissection of the outer flow, then there is no
1948         * point in calling the flow dissector again.
1949         */
1950        if (!has_inner)
1951                return 0;
1952
1953        if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
1954                return 0;
1955
1956        memset(&hash_keys, 0, sizeof(hash_keys));
1957        skb_flow_dissect_flow_keys(skb, &keys, 0);
1958
1959        if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION))
1960                return 0;
1961
1962        if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1963                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1964                if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
1965                        hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1966                if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
1967                        hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1968        } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1969                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1970                if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
1971                        hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1972                if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
1973                        hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1974                if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
1975                        hash_keys.tags.flow_label = keys.tags.flow_label;
1976        }
1977
1978        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
1979                hash_keys.basic.ip_proto = keys.basic.ip_proto;
1980        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
1981                hash_keys.ports.src = keys.ports.src;
1982        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
1983                hash_keys.ports.dst = keys.ports.dst;
1984
1985        return flow_hash_from_keys(&hash_keys);
1986}
1987
1988static u32 fib_multipath_custom_hash_skb(const struct net *net,
1989                                         const struct sk_buff *skb)
1990{
1991        u32 mhash, mhash_inner;
1992        bool has_inner = true;
1993
1994        mhash = fib_multipath_custom_hash_outer(net, skb, &has_inner);
1995        mhash_inner = fib_multipath_custom_hash_inner(net, skb, has_inner);
1996
1997        return jhash_2words(mhash, mhash_inner, 0);
1998}
1999
2000static u32 fib_multipath_custom_hash_fl4(const struct net *net,
2001                                         const struct flowi4 *fl4)
2002{
2003        u32 hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields;
2004        struct flow_keys hash_keys;
2005
2006        if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
2007                return 0;
2008
2009        memset(&hash_keys, 0, sizeof(hash_keys));
2010        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2011        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
2012                hash_keys.addrs.v4addrs.src = fl4->saddr;
2013        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
2014                hash_keys.addrs.v4addrs.dst = fl4->daddr;
2015        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
2016                hash_keys.basic.ip_proto = fl4->flowi4_proto;
2017        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
2018                hash_keys.ports.src = fl4->fl4_sport;
2019        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
2020                hash_keys.ports.dst = fl4->fl4_dport;
2021
2022        return flow_hash_from_keys(&hash_keys);
2023}
2024
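    /* The hash policy is chosen by the net.ipv4.fib_multipath_hash_policy
     * sysctl:
     *
     *        0 - L3 (source and destination addresses)
     *        1 - L4 (five-tuple)
     *        2 - L3, or the inner L3 of an encapsulated packet when forwarding
     *        3 - custom field set, see the fib_multipath_custom_hash_*() helpers
     *
     * e.g. "sysctl -w net.ipv4.fib_multipath_hash_policy=1" switches a
     * router to L4 hashing.
     */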
2025/* if skb is set it will be used and fl4 can be NULL */
2026int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
2027                       const struct sk_buff *skb, struct flow_keys *flkeys)
2028{
2029        u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
2030        struct flow_keys hash_keys;
2031        u32 mhash = 0;
2032
2033        switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
2034        case 0:
2035                memset(&hash_keys, 0, sizeof(hash_keys));
2036                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2037                if (skb) {
2038                        ip_multipath_l3_keys(skb, &hash_keys);
2039                } else {
2040                        hash_keys.addrs.v4addrs.src = fl4->saddr;
2041                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
2042                }
2043                mhash = flow_hash_from_keys(&hash_keys);
2044                break;
2045        case 1:
2046                /* skb is currently provided only when forwarding */
2047                if (skb) {
2048                        unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2049                        struct flow_keys keys;
2050
2051                        /* short-circuit if we already have L4 hash present */
2052                        if (skb->l4_hash)
2053                                return skb_get_hash_raw(skb) >> 1;
2054
2055                        memset(&hash_keys, 0, sizeof(hash_keys));
2056
2057                        if (!flkeys) {
2058                                skb_flow_dissect_flow_keys(skb, &keys, flag);
2059                                flkeys = &keys;
2060                        }
2061
2062                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2063                        hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
2064                        hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
2065                        hash_keys.ports.src = flkeys->ports.src;
2066                        hash_keys.ports.dst = flkeys->ports.dst;
2067                        hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2068                } else {
2069                        memset(&hash_keys, 0, sizeof(hash_keys));
2070                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2071                        hash_keys.addrs.v4addrs.src = fl4->saddr;
2072                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
2073                        hash_keys.ports.src = fl4->fl4_sport;
2074                        hash_keys.ports.dst = fl4->fl4_dport;
2075                        hash_keys.basic.ip_proto = fl4->flowi4_proto;
2076                }
2077                mhash = flow_hash_from_keys(&hash_keys);
2078                break;
2079        case 2:
2080                memset(&hash_keys, 0, sizeof(hash_keys));
2081                /* skb is currently provided only when forwarding */
2082                if (skb) {
2083                        struct flow_keys keys;
2084
2085                        skb_flow_dissect_flow_keys(skb, &keys, 0);
2086                        /* Inner can be v4 or v6 */
2087                        if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2088                                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2089                                hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2090                                hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2091                        } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2092                                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2093                                hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2094                                hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2095                                hash_keys.tags.flow_label = keys.tags.flow_label;
2096                                hash_keys.basic.ip_proto = keys.basic.ip_proto;
2097                        } else {
2098                                /* Same as case 0 */
2099                                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2100                                ip_multipath_l3_keys(skb, &hash_keys);
2101                        }
2102                } else {
2103                        /* Same as case 0 */
2104                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2105                        hash_keys.addrs.v4addrs.src = fl4->saddr;
2106                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
2107                }
2108                mhash = flow_hash_from_keys(&hash_keys);
2109                break;
2110        case 3:
2111                if (skb)
2112                        mhash = fib_multipath_custom_hash_skb(net, skb);
2113                else
2114                        mhash = fib_multipath_custom_hash_fl4(net, fl4);
2115                break;
2116        }
2117
2118        if (multipath_hash)
2119                mhash = jhash_2words(mhash, multipath_hash, 0);
2120
2121        return mhash >> 1;
2122}
2123#endif /* CONFIG_IP_ROUTE_MULTIPATH */
2124
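    /* For a multipath route, pick the nexthop from the flow hash before
     * building the input route, so every packet of a given flow keeps
     * taking the same path.
     */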
2125static int ip_mkroute_input(struct sk_buff *skb,
2126                            struct fib_result *res,
2127                            struct in_device *in_dev,
2128                            __be32 daddr, __be32 saddr, u32 tos,
2129                            struct flow_keys *hkeys)
2130{
2131#ifdef CONFIG_IP_ROUTE_MULTIPATH
2132        if (res->fi && fib_info_num_path(res->fi) > 1) {
2133                int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2134
2135                fib_select_multipath(res, h);
2136        }
2137#endif
2138
2139        /* create a routing cache entry */
2140        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2141}
2142
2143/* Implements the same saddr-related checks as ip_route_input_slow(),
2144 * assuming daddr is valid and the destination is not a local broadcast
2145 * address. Uses the provided hint instead of performing a route lookup.
2146 */
2147int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2148                      u8 tos, struct net_device *dev,
2149                      const struct sk_buff *hint)
2150{
2151        struct in_device *in_dev = __in_dev_get_rcu(dev);
2152        struct rtable *rt = skb_rtable(hint);
2153        struct net *net = dev_net(dev);
2154        int err = -EINVAL;
2155        u32 tag = 0;
2156
2157        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2158                goto martian_source;
2159
2160        if (ipv4_is_zeronet(saddr))
2161                goto martian_source;
2162
2163        if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2164                goto martian_source;
2165
2166        if (rt->rt_type != RTN_LOCAL)
2167                goto skip_validate_source;
2168
2169        tos &= IPTOS_RT_MASK;
2170        err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2171        if (err < 0)
2172                goto martian_source;
2173
2174skip_validate_source:
2175        skb_dst_copy(skb, hint);
2176        return 0;
2177
2178martian_source:
2179        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2180        return err;
2181}
2182
2183/* get device for dst_alloc with local routes */
2184static struct net_device *ip_rt_get_dev(struct net *net,
2185                                        const struct fib_result *res)
2186{
2187        struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2188        struct net_device *dev = NULL;
2189
2190        if (nhc)
2191                dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2192
2193        return dev ? : net->loopback_dev;
2194}
2195
2196/*
2197 *      NOTE. We drop all packets that have a local source
2198 *      address, because every properly looped-back packet
2199 *      must have the correct destination already attached by the
2200 *      output routine. Changes in the enforced policies must also
2201 *      be applied to ip_route_use_hint().
2202 *
2203 *      This approach solves two big problems:
2204 *      1. Non-simplex devices are handled properly.
2205 *      2. IP spoofing attempts are filtered with a 100% guarantee.
2206 *      called with rcu_read_lock()
2207 */
2208
2209static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2210                               u8 tos, struct net_device *dev,
2211                               struct fib_result *res)
2212{
2213        struct in_device *in_dev = __in_dev_get_rcu(dev);
2214        struct flow_keys *flkeys = NULL, _flkeys;
2215        struct net    *net = dev_net(dev);
2216        struct ip_tunnel_info *tun_info;
2217        int             err = -EINVAL;
2218        unsigned int    flags = 0;
2219        u32             itag = 0;
2220        struct rtable   *rth;
2221        struct flowi4   fl4;
2222        bool do_cache = true;
2223
2224        /* IP on this device is disabled. */
2225
2226        if (!in_dev)
2227                goto out;
2228
2229        /* Check for the weirdest martians, which cannot be detected
2230         * by fib_lookup().
2231         */
2232
2233        tun_info = skb_tunnel_info(skb);
2234        if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2235                fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2236        else
2237                fl4.flowi4_tun_key.tun_id = 0;
2238        skb_dst_drop(skb);
2239
2240        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2241                goto martian_source;
2242
2243        res->fi = NULL;
2244        res->table = NULL;
2245        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2246                goto brd_input;
2247
2248        /* Accept zero addresses only for limited broadcast;
2249         * I do not even know whether to fix this or not. Waiting for complaints :-)
2250         */
2251        if (ipv4_is_zeronet(saddr))
2252                goto martian_source;
2253
2254        if (ipv4_is_zeronet(daddr))
2255                goto martian_destination;
2256
2257        /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2258         * calling it at most once when daddr and/or saddr are loopback addresses.
2259         */
2260        if (ipv4_is_loopback(daddr)) {
2261                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2262                        goto martian_destination;
2263        } else if (ipv4_is_loopback(saddr)) {
2264                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2265                        goto martian_source;
2266        }
2267
2268        /*
2269         *      Now we are ready to route packet.
2270         */
2271        fl4.flowi4_oif = 0;
2272        fl4.flowi4_iif = dev->ifindex;
2273        fl4.flowi4_mark = skb->mark;
2274        fl4.flowi4_tos = tos;
2275        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2276        fl4.flowi4_flags = 0;
2277        fl4.daddr = daddr;
2278        fl4.saddr = saddr;
2279        fl4.flowi4_uid = sock_net_uid(net, NULL);
2280        fl4.flowi4_multipath_hash = 0;
2281
2282        if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2283                flkeys = &_flkeys;
2284        } else {
2285                fl4.flowi4_proto = 0;
2286                fl4.fl4_sport = 0;
2287                fl4.fl4_dport = 0;
2288        }
2289
2290        err = fib_lookup(net, &fl4, res, 0);
2291        if (err != 0) {
2292                if (!IN_DEV_FORWARD(in_dev))
2293                        err = -EHOSTUNREACH;
2294                goto no_route;
2295        }
2296
2297        if (res->type == RTN_BROADCAST) {
2298                if (IN_DEV_BFORWARD(in_dev))
2299                        goto make_route;
2300                /* do not cache if bc_forwarding is enabled */
2301                if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2302                        do_cache = false;
2303                goto brd_input;
2304        }
2305
2306        if (res->type == RTN_LOCAL) {
2307                err = fib_validate_source(skb, saddr, daddr, tos,
2308                                          0, dev, in_dev, &itag);
2309                if (err < 0)
2310                        goto martian_source;
2311                goto local_input;
2312        }
2313
2314        if (!IN_DEV_FORWARD(in_dev)) {
2315                err = -EHOSTUNREACH;
2316                goto no_route;
2317        }
2318        if (res->type != RTN_UNICAST)
2319                goto martian_destination;
2320
2321make_route:
2322        err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2323out:    return err;
2324
2325brd_input:
2326        if (skb->protocol != htons(ETH_P_IP))
2327                goto e_inval;
2328
2329        if (!ipv4_is_zeronet(saddr)) {
2330                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2331                                          in_dev, &itag);
2332                if (err < 0)
2333                        goto martian_source;
2334        }
2335        flags |= RTCF_BROADCAST;
2336        res->type = RTN_BROADCAST;
2337        RT_CACHE_STAT_INC(in_brd);
2338
2339local_input:
2340        do_cache &= res->fi && !itag;
2341        if (do_cache) {
2342                struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2343
2344                rth = rcu_dereference(nhc->nhc_rth_input);
2345                if (rt_cache_valid(rth)) {
2346                        skb_dst_set_noref(skb, &rth->dst);
2347                        err = 0;
2348                        goto out;
2349                }
2350        }
2351
2352        rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2353                           flags | RTCF_LOCAL, res->type,
2354                           IN_DEV_ORCONF(in_dev, NOPOLICY), false);
2355        if (!rth)
2356                goto e_nobufs;
2357
2358        rth->dst.output = ip_rt_bug;
2359#ifdef CONFIG_IP_ROUTE_CLASSID
2360        rth->dst.tclassid = itag;
2361#endif
2362        rth->rt_is_input = 1;
2363
2364        RT_CACHE_STAT_INC(in_slow_tot);
2365        if (res->type == RTN_UNREACHABLE) {
2366                rth->dst.input = ip_error;
2367                rth->dst.error = -err;
2368                rth->rt_flags &= ~RTCF_LOCAL;
2369        }
2370
2371        if (do_cache) {
2372                struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2373
2374                rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2375                if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2376                        WARN_ON(rth->dst.input == lwtunnel_input);
2377                        rth->dst.lwtstate->orig_input = rth->dst.input;
2378                        rth->dst.input = lwtunnel_input;
2379                }
2380
2381                if (unlikely(!rt_cache_route(nhc, rth)))
2382                        rt_add_uncached_list(rth);
2383        }
2384        skb_dst_set(skb, &rth->dst);
2385        err = 0;
2386        goto out;
2387
2388no_route:
2389        RT_CACHE_STAT_INC(in_no_route);
2390        res->type = RTN_UNREACHABLE;
2391        res->fi = NULL;
2392        res->table = NULL;
2393        goto local_input;
2394
2395        /*
2396         *      Do not cache martian addresses: they should be logged (RFC1812)
2397         */
2398martian_destination:
2399        RT_CACHE_STAT_INC(in_martian_dst);
2400#ifdef CONFIG_IP_ROUTE_VERBOSE
2401        if (IN_DEV_LOG_MARTIANS(in_dev))
2402                net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2403                                     &daddr, &saddr, dev->name);
2404#endif
2405
2406e_inval:
2407        err = -EINVAL;
2408        goto out;
2409
2410e_nobufs:
2411        err = -ENOBUFS;
2412        goto out;
2413
2414martian_source:
2415        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2416        goto out;
2417}
2418
2419int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2420                         u8 tos, struct net_device *dev)
2421{
2422        struct fib_result res;
2423        int err;
2424
2425        tos &= IPTOS_RT_MASK;
2426        rcu_read_lock();
2427        err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2428        rcu_read_unlock();
2429
2430        return err;
2431}
2432EXPORT_SYMBOL(ip_route_input_noref);
2433
2434/* called with rcu_read_lock held */
2435int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2436                       u8 tos, struct net_device *dev, struct fib_result *res)
2437{
2438        /* Multicast recognition logic was moved from the route cache to here.
2439         * The problem was that too many Ethernet cards have broken/missing
2440         * hardware multicast filters :-( As a result, a host on a multicast
2441         * network acquires a lot of useless route cache entries, e.g. for
2442         * SDR messages from all over the world. Now we try to get rid of them.
2443         * Really, provided the software IP multicast filter is organized
2444         * reasonably (at least, hashed), this does not cause a slowdown
2445         * compared with route cache reject entries.
2446         * Note that multicast routers are not affected, because a
2447         * route cache entry is created eventually.
2448         */
2449        if (ipv4_is_multicast(daddr)) {
2450                struct in_device *in_dev = __in_dev_get_rcu(dev);
2451                int our = 0;
2452                int err = -EINVAL;
2453
2454                if (!in_dev)
2455                        return err;
2456                our = ip_check_mc_rcu(in_dev, daddr, saddr,
2457                                      ip_hdr(skb)->protocol);
2458
2459                /* check l3 master if no match yet */
2460                if (!our && netif_is_l3_slave(dev)) {
2461                        struct in_device *l3_in_dev;
2462
2463                        l3_in_dev = __in_dev_get_rcu(skb->dev);
2464                        if (l3_in_dev)
2465                                our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2466                                                      ip_hdr(skb)->protocol);
2467                }
2468
2469                if (our
2470#ifdef CONFIG_IP_MROUTE
2471                        ||
2472                    (!ipv4_is_local_multicast(daddr) &&
2473                     IN_DEV_MFORWARD(in_dev))
2474#endif
2475                   ) {
2476                        err = ip_route_input_mc(skb, daddr, saddr,
2477                                                tos, dev, our);
2478                }
2479                return err;
2480        }
2481
2482        return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2483}
2484
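    /* Build, or fetch from the cache, the rtable for an output lookup
     * result.  Broadcast, multicast and local destinations get their
     * RTCF_* flags here; unicast results may be satisfied from the
     * nexthop exception or the per-cpu output cache before a new dst is
     * allocated.
     */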
2485/* called with rcu_read_lock() */
2486static struct rtable *__mkroute_output(const struct fib_result *res,
2487                                       const struct flowi4 *fl4, int orig_oif,
2488                                       struct net_device *dev_out,
2489                                       unsigned int flags)
2490{
2491        struct fib_info *fi = res->fi;
2492        struct fib_nh_exception *fnhe;
2493        struct in_device *in_dev;
2494        u16 type = res->type;
2495        struct rtable *rth;
2496        bool do_cache;
2497
2498        in_dev = __in_dev_get_rcu(dev_out);
2499        if (!in_dev)
2500                return ERR_PTR(-EINVAL);
2501
2502        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2503                if (ipv4_is_loopback(fl4->saddr) &&
2504                    !(dev_out->flags & IFF_LOOPBACK) &&
2505                    !netif_is_l3_master(dev_out))
2506                        return ERR_PTR(-EINVAL);
2507
2508        if (ipv4_is_lbcast(fl4->daddr))
2509                type = RTN_BROADCAST;
2510        else if (ipv4_is_multicast(fl4->daddr))
2511                type = RTN_MULTICAST;
2512        else if (ipv4_is_zeronet(fl4->daddr))
2513                return ERR_PTR(-EINVAL);
2514
2515        if (dev_out->flags & IFF_LOOPBACK)
2516                flags |= RTCF_LOCAL;
2517
2518        do_cache = true;
2519        if (type == RTN_BROADCAST) {
2520                flags |= RTCF_BROADCAST | RTCF_LOCAL;
2521                fi = NULL;
2522        } else if (type == RTN_MULTICAST) {
2523                flags |= RTCF_MULTICAST | RTCF_LOCAL;
2524                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2525                                     fl4->flowi4_proto))
2526                        flags &= ~RTCF_LOCAL;
2527                else
2528                        do_cache = false;
2529                /* If a multicast route does not exist, use
2530                 * the default one, but do not gateway in this case.
2531                 * Yes, it is a hack.
2532                 */
2533                if (fi && res->prefixlen < 4)
2534                        fi = NULL;
2535        } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2536                   (orig_oif != dev_out->ifindex)) {
2537                /* For local routes that require a particular output interface
2538                 * we do not want to cache the result.  Caching the result
2539                 * causes incorrect behaviour when there are multiple source
2540                 * addresses on the interface, the end result being that if the
2541                 * intended recipient is waiting on that interface for the
2542                 * packet he won't receive it because it will be delivered on
2543                 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2544                 * be set to the loopback interface as well.
2545                 */
2546                do_cache = false;
2547        }
2548
2549        fnhe = NULL;
2550        do_cache &= fi != NULL;
2551        if (fi) {
2552                struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2553                struct rtable __rcu **prth;
2554
2555                fnhe = find_exception(nhc, fl4->daddr);
2556                if (!do_cache)
2557                        goto add;
2558                if (fnhe) {
2559                        prth = &fnhe->fnhe_rth_output;
2560                } else {
2561                        if (unlikely(fl4->flowi4_flags &
2562                                     FLOWI_FLAG_KNOWN_NH &&
2563                                     !(nhc->nhc_gw_family &&
2564                                       nhc->nhc_scope == RT_SCOPE_LINK))) {
2565                                do_cache = false;
2566                                goto add;
2567                        }
2568                        prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2569                }
2570                rth = rcu_dereference(*prth);
2571                if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2572                        return rth;
2573        }
2574
2575add:
2576        rth = rt_dst_alloc(dev_out, flags, type,
2577                           IN_DEV_ORCONF(in_dev, NOPOLICY),
2578                           IN_DEV_ORCONF(in_dev, NOXFRM));
2579        if (!rth)
2580                return ERR_PTR(-ENOBUFS);
2581
2582        rth->rt_iif = orig_oif;
2583
2584        RT_CACHE_STAT_INC(out_slow_tot);
2585
2586        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2587                if (flags & RTCF_LOCAL &&
2588                    !(dev_out->flags & IFF_LOOPBACK)) {
2589                        rth->dst.output = ip_mc_output;
2590                        RT_CACHE_STAT_INC(out_slow_mc);
2591                }
2592#ifdef CONFIG_IP_MROUTE
2593                if (type == RTN_MULTICAST) {
2594                        if (IN_DEV_MFORWARD(in_dev) &&
2595                            !ipv4_is_local_multicast(fl4->daddr)) {
2596                                rth->dst.input = ip_mr_input;
2597                                rth->dst.output = ip_mc_output;
2598                        }
2599                }
2600#endif
2601        }
2602
2603        rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2604        lwtunnel_set_redirect(&rth->dst);
2605
2606        return rth;
2607}
2608
2609/*
2610 * Major route resolver routine.
2611 */
2612
2613struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2614                                        const struct sk_buff *skb)
2615{
2616        __u8 tos = RT_FL_TOS(fl4);
2617        struct fib_result res = {
2618                .type           = RTN_UNSPEC,
2619                .fi             = NULL,
2620                .table          = NULL,
2621                .tclassid       = 0,
2622        };
2623        struct rtable *rth;
2624
2625        fl4->flowi4_iif = LOOPBACK_IFINDEX;
2626        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2627        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2628                         RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2629
2630        rcu_read_lock();
2631        rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2632        rcu_read_unlock();
2633
2634        return rth;
2635}
2636EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2637
2638struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2639                                            struct fib_result *res,
2640                                            const struct sk_buff *skb)
2641{
2642        struct net_device *dev_out = NULL;
2643        int orig_oif = fl4->flowi4_oif;
2644        unsigned int flags = 0;
2645        struct rtable *rth;
2646        int err;
2647
2648        if (fl4->saddr) {
2649                if (ipv4_is_multicast(fl4->saddr) ||
2650                    ipv4_is_lbcast(fl4->saddr) ||
2651                    ipv4_is_zeronet(fl4->saddr)) {
2652                        rth = ERR_PTR(-EINVAL);
2653                        goto out;
2654                }
2655
2656                rth = ERR_PTR(-ENETUNREACH);
2657
2658                /* I removed the check for oif == dev_out->oif here.
2659                 * It was wrong for two reasons:
2660                 * 1. ip_dev_find(net, saddr) can return the wrong iface, if
2661                 *    saddr is assigned to multiple interfaces.
2662                 * 2. Moreover, we are allowed to send packets with the saddr
2663                 *    of another iface. --ANK
2664                 */
2665
2666                if (fl4->flowi4_oif == 0 &&
2667                    (ipv4_is_multicast(fl4->daddr) ||
2668                     ipv4_is_lbcast(fl4->daddr))) {
2669                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2670                        dev_out = __ip_dev_find(net, fl4->saddr, false);
2671                        if (!dev_out)
2672                                goto out;
2673
2674                        /* Special hack: the user can direct multicasts
2675                         * and limited broadcast via the necessary interface
2676                         * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2677                         * This hack is not just for fun, it allows
2678                         * vic, vat and friends to work.
2679                         * They bind the socket to loopback, set ttl to zero
2680                         * and expect that it will work.
2681                         * From the viewpoint of the routing cache they are broken,
2682                         * because we are not allowed to build a multicast path
2683                         * with a loopback source addr (look, the routing cache
2684                         * cannot know that ttl is zero, so the packet
2685                         * will not leave this host and the route is valid).
2686                         * Luckily, this hack is a good workaround.
2687                         */
2688
2689                        fl4->flowi4_oif = dev_out->ifindex;
2690                        goto make_route;
2691                }
2692
2693                if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2694                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2695                        if (!__ip_dev_find(net, fl4->saddr, false))
2696                                goto out;
2697                }
2698        }
2699
2700
2701        if (fl4->flowi4_oif) {
2702                dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2703                rth = ERR_PTR(-ENODEV);
2704                if (!dev_out)
2705                        goto out;
2706
2707                /* RACE: Check return value of inet_select_addr instead. */
2708                if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2709                        rth = ERR_PTR(-ENETUNREACH);
2710                        goto out;
2711                }
2712                if (ipv4_is_local_multicast(fl4->daddr) ||
2713                    ipv4_is_lbcast(fl4->daddr) ||
2714                    fl4->flowi4_proto == IPPROTO_IGMP) {
2715                        if (!fl4->saddr)
2716                                fl4->saddr = inet_select_addr(dev_out, 0,
2717                                                              RT_SCOPE_LINK);
2718                        goto make_route;
2719                }
2720                if (!fl4->saddr) {
2721                        if (ipv4_is_multicast(fl4->daddr))
2722                                fl4->saddr = inet_select_addr(dev_out, 0,
2723                                                              fl4->flowi4_scope);
2724                        else if (!fl4->daddr)
2725                                fl4->saddr = inet_select_addr(dev_out, 0,
2726                                                              RT_SCOPE_HOST);
2727                }
2728        }
2729
2730        if (!fl4->daddr) {
2731                fl4->daddr = fl4->saddr;
2732                if (!fl4->daddr)
2733                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2734                dev_out = net->loopback_dev;
2735                fl4->flowi4_oif = LOOPBACK_IFINDEX;
2736                res->type = RTN_LOCAL;
2737                flags |= RTCF_LOCAL;
2738                goto make_route;
2739        }
2740
2741        err = fib_lookup(net, fl4, res, 0);
2742        if (err) {
2743                res->fi = NULL;
2744                res->table = NULL;
2745                if (fl4->flowi4_oif &&
2746                    (ipv4_is_multicast(fl4->daddr) ||
2747                    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2748                        /* Apparently, the routing tables are wrong. Assume
2749                         * that the destination is on-link.
2750                         *
2751                         * WHY? DW.
2752                         * Because we are allowed to send to an iface
2753                         * even if it has NO routes and NO assigned
2754                         * addresses. When oif is specified, the routing
2755                         * tables are looked up with only one purpose:
2756                         * to catch whether the destination is gatewayed,
2757                         * rather than direct. Moreover, if MSG_DONTROUTE is
2758                         * set, we send the packet, ignoring both the routing
2759                         * tables and the ifaddr state. --ANK
2760                         *
2761                         *
2762                         * We could do this even if oif is unknown
2763                         * (as IPv6 likely does), but we do not.
2764                         */
2765
2766                        if (fl4->saddr == 0)
2767                                fl4->saddr = inet_select_addr(dev_out, 0,
2768                                                              RT_SCOPE_LINK);
2769                        res->type = RTN_UNICAST;
2770                        goto make_route;
2771                }
2772                rth = ERR_PTR(err);
2773                goto out;
2774        }
2775
2776        if (res->type == RTN_LOCAL) {
2777                if (!fl4->saddr) {
2778                        if (res->fi->fib_prefsrc)
2779                                fl4->saddr = res->fi->fib_prefsrc;
2780                        else
2781                                fl4->saddr = fl4->daddr;
2782                }
2783
2784                /* L3 master device is the loopback for that domain */
2785                dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2786                        net->loopback_dev;
2787
2788                /* make sure orig_oif points to fib result device even
2789                 * though packet rx/tx happens over loopback or l3mdev
2790                 */
2791                orig_oif = FIB_RES_OIF(*res);
2792
2793                fl4->flowi4_oif = dev_out->ifindex;
2794                flags |= RTCF_LOCAL;
2795                goto make_route;
2796        }
2797
2798        fib_select_path(net, res, fl4, skb);
2799
2800        dev_out = FIB_RES_DEV(*res);
2801
2802make_route:
2803        rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2804
2805out:
2806        return rth;
2807}
2808
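    /* A blackhole route silently discards everything while still looking
     * like a valid dst to its holders.  ipv4_blackhole_route() below is
     * used (e.g. by the xfrm code through xfrm_lookup_route()) to neuter
     * an existing route without breaking sockets that already hold a
     * reference to it.
     */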
2809static struct dst_ops ipv4_dst_blackhole_ops = {
2810        .family                 = AF_INET,
2811        .default_advmss         = ipv4_default_advmss,
2812        .neigh_lookup           = ipv4_neigh_lookup,
2813        .check                  = dst_blackhole_check,
2814        .cow_metrics            = dst_blackhole_cow_metrics,
2815        .update_pmtu            = dst_blackhole_update_pmtu,
2816        .redirect               = dst_blackhole_redirect,
2817        .mtu                    = dst_blackhole_mtu,
2818};
2819
2820struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2821{
2822        struct rtable *ort = (struct rtable *) dst_orig;
2823        struct rtable *rt;
2824
2825        rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2826        if (rt) {
2827                struct dst_entry *new = &rt->dst;
2828
2829                new->__use = 1;
2830                new->input = dst_discard;
2831                new->output = dst_discard_out;
2832
2833                new->dev = net->loopback_dev;
2834                if (new->dev)
2835                        dev_hold(new->dev);
2836
2837                rt->rt_is_input = ort->rt_is_input;
2838                rt->rt_iif = ort->rt_iif;
2839                rt->rt_pmtu = ort->rt_pmtu;
2840                rt->rt_mtu_locked = ort->rt_mtu_locked;
2841
2842                rt->rt_genid = rt_genid_ipv4(net);
2843                rt->rt_flags = ort->rt_flags;
2844                rt->rt_type = ort->rt_type;
2845                rt->rt_uses_gateway = ort->rt_uses_gateway;
2846                rt->rt_gw_family = ort->rt_gw_family;
2847                if (rt->rt_gw_family == AF_INET)
2848                        rt->rt_gw4 = ort->rt_gw4;
2849                else if (rt->rt_gw_family == AF_INET6)
2850                        rt->rt_gw6 = ort->rt_gw6;
2851
2852                INIT_LIST_HEAD(&rt->rt_uncached);
2853        }
2854
2855        dst_release(dst_orig);
2856
2857        return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2858}
2859
2860struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2861                                    const struct sock *sk)
2862{
2863        struct rtable *rt = __ip_route_output_key(net, flp4);
2864
2865        if (IS_ERR(rt))
2866                return rt;
2867
2868        if (flp4->flowi4_proto) {
2869                flp4->flowi4_oif = rt->dst.dev->ifindex;
2870                rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2871                                                        flowi4_to_flowi(flp4),
2872                                                        sk, 0);
2873        }
2874
2875        return rt;
2876}
2877EXPORT_SYMBOL_GPL(ip_route_output_flow);
2878
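    /* Resolve an output route for a tunnel transmit.  When @use_cache is
     * set the per-tunnel dst_cache is consulted first and refreshed on a
     * miss; the selected source address is returned through @saddr, and
     * routes that would loop straight back into the tunnel device are
     * rejected with -ELOOP.
     */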
2879struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2880                                      struct net_device *dev,
2881                                      struct net *net, __be32 *saddr,
2882                                      const struct ip_tunnel_info *info,
2883                                      u8 protocol, bool use_cache)
2884{
2885#ifdef CONFIG_DST_CACHE
2886        struct dst_cache *dst_cache;
2887#endif
2888        struct rtable *rt = NULL;
2889        struct flowi4 fl4;
2890        __u8 tos;
2891
2892#ifdef CONFIG_DST_CACHE
2893        dst_cache = (struct dst_cache *)&info->dst_cache;
2894        if (use_cache) {
2895                rt = dst_cache_get_ip4(dst_cache, saddr);
2896                if (rt)
2897                        return rt;
2898        }
2899#endif
2900        memset(&fl4, 0, sizeof(fl4));
2901        fl4.flowi4_mark = skb->mark;
2902        fl4.flowi4_proto = protocol;
2903        fl4.daddr = info->key.u.ipv4.dst;
2904        fl4.saddr = info->key.u.ipv4.src;
2905        tos = info->key.tos;
2906        fl4.flowi4_tos = RT_TOS(tos);
2907
2908        rt = ip_route_output_key(net, &fl4);
2909        if (IS_ERR(rt)) {
2910                netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2911                return ERR_PTR(-ENETUNREACH);
2912        }
2913        if (rt->dst.dev == dev) { /* reject routes that loop back out the tunnel dev */
2914                netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2915                ip_rt_put(rt);
2916                return ERR_PTR(-ELOOP);
2917        }
2918#ifdef CONFIG_DST_CACHE
2919        if (use_cache)
2920                dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2921#endif
2922        *saddr = fl4.saddr;
2923        return rt;
2924}
2925EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
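/* Sketch of the expected calling pattern for a UDP-based tunnel driver
 * (names are illustrative; assumes info->dst_cache was initialized if
 * use_cache is true).  The chosen source address comes back via @saddr.
 */
static __maybe_unused struct rtable *
example_tunnel_route(struct sk_buff *skb, struct net_device *dev,
		     const struct ip_tunnel_info *info, __be32 *saddr)
{
	return ip_route_output_tunnel(skb, dev, dev_net(dev), saddr,
				      info, IPPROTO_UDP, true);
}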
2926
2927/* called with rcu_read_lock held */
2928static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2929                        struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2930                        struct sk_buff *skb, u32 portid, u32 seq,
2931                        unsigned int flags)
2932{
2933        struct rtmsg *r;
2934        struct nlmsghdr *nlh;
2935        unsigned long expires = 0;
2936        u32 error;
2937        u32 metrics[RTAX_MAX];
2938
2939        nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2940        if (!nlh)
2941                return -EMSGSIZE;
2942
2943        r = nlmsg_data(nlh);
2944        r->rtm_family    = AF_INET;
2945        r->rtm_dst_len  = 32;
2946        r->rtm_src_len  = 0;
2947        r->rtm_tos      = fl4 ? fl4->flowi4_tos : 0;
2948        r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2949        if (nla_put_u32(skb, RTA_TABLE, table_id))
2950                goto nla_put_failure;
2951        r->rtm_type     = rt->rt_type;
2952        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2953        r->rtm_protocol = RTPROT_UNSPEC;
2954        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2955        if (rt->rt_flags & RTCF_NOTIFY)
2956                r->rtm_flags |= RTM_F_NOTIFY;
2957        if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2958                r->rtm_flags |= RTCF_DOREDIRECT;
2959
2960        if (nla_put_in_addr(skb, RTA_DST, dst))
2961                goto nla_put_failure;
2962        if (src) {
2963                r->rtm_src_len = 32;
2964                if (nla_put_in_addr(skb, RTA_SRC, src))
2965                        goto nla_put_failure;
2966        }
2967        if (rt->dst.dev &&
2968            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2969                goto nla_put_failure;
2970        if (rt->dst.lwtstate &&
2971            lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
2972                goto nla_put_failure;
2973#ifdef CONFIG_IP_ROUTE_CLASSID
2974        if (rt->dst.tclassid &&
2975            nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2976                goto nla_put_failure;
2977#endif
2978        if (fl4 && !rt_is_input_route(rt) &&
2979            fl4->saddr != src) {
2980                if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2981                        goto nla_put_failure;
2982        }
2983        if (rt->rt_uses_gateway) {
2984                if (rt->rt_gw_family == AF_INET &&
2985                    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2986                        goto nla_put_failure;
2987                } else if (rt->rt_gw_family == AF_INET6) {
2988                        int alen = sizeof(struct in6_addr);
2989                        struct nlattr *nla;
2990                        struct rtvia *via;
2991
2992                        nla = nla_reserve(skb, RTA_VIA, alen + 2);
2993                        if (!nla)
2994                                goto nla_put_failure;
2995
2996                        via = nla_data(nla);
2997                        via->rtvia_family = AF_INET6;
2998                        memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2999                }
3000        }
3001
3002        expires = rt->dst.expires;
3003        if (expires) {
3004                unsigned long now = jiffies;
3005
3006                if (time_before(now, expires))
3007                        expires -= now;
3008                else
3009                        expires = 0;
3010        }
3011
3012        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3013        if (rt->rt_pmtu && expires)
3014                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
3015        if (rt->rt_mtu_locked && expires)
3016                metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
3017        if (rtnetlink_put_metrics(skb, metrics) < 0)
3018                goto nla_put_failure;
3019
3020        if (fl4) {
3021                if (fl4->flowi4_mark &&
3022                    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
3023                        goto nla_put_failure;
3024
3025                if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
3026                    nla_put_u32(skb, RTA_UID,
3027                                from_kuid_munged(current_user_ns(),
3028                                                 fl4->flowi4_uid)))
3029                        goto nla_put_failure;
3030
3031                if (rt_is_input_route(rt)) {
3032#ifdef CONFIG_IP_MROUTE
3033                        if (ipv4_is_multicast(dst) &&
3034                            !ipv4_is_local_multicast(dst) &&
3035                            IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3036                                int err = ipmr_get_route(net, skb,
3037                                                         fl4->saddr, fl4->daddr,
3038                                                         r, portid);
3039
3040                                if (err <= 0) {
3041                                        if (err == 0)
3042                                                return 0;
3043                                        goto nla_put_failure;
3044                                }
3045                        } else
3046#endif
3047                                if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
3048                                        goto nla_put_failure;
3049                }
3050        }
3051
3052        error = rt->dst.error;
3053
3054        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
3055                goto nla_put_failure;
3056
3057        nlmsg_end(skb, nlh);
3058        return 0;
3059
3060nla_put_failure:
3061        nlmsg_cancel(skb, nlh);
3062        return -EMSGSIZE;
3063}
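/* For orientation, a successful rt_fill_info() emits one RTM_NEWROUTE
 * message laid out roughly as follows (optional attributes appear only
 * when the corresponding state exists):
 *
 *	nlmsghdr | rtmsg | RTA_TABLE | RTA_DST [RTA_SRC] [RTA_OIF]
 *	[RTA_ENCAP_TYPE RTA_ENCAP] [RTA_FLOW] [RTA_PREFSRC]
 *	[RTA_GATEWAY | RTA_VIA] [RTA_METRICS] [RTA_MARK] [RTA_UID]
 *	[RTA_IIF] RTA_CACHEINFO
 */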
3064
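/* Walk one nexthop-exception hash bucket under RCU, emitting an
 * RTM_NEWROUTE message for every exception that is still current
 * (matching genid, not expired, with a cached route) and that lies at
 * or beyond fa_start; *fa_index tracks progress across dump resumes.
 */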
3065static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
3066                            struct netlink_callback *cb, u32 table_id,
3067                            struct fnhe_hash_bucket *bucket, int genid,
3068                            int *fa_index, int fa_start, unsigned int flags)
3069{
3070        int i;
3071
3072        for (i = 0; i < FNHE_HASH_SIZE; i++) {
3073                struct fib_nh_exception *fnhe;
3074
3075                for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
3076                     fnhe = rcu_dereference(fnhe->fnhe_next)) {
3077                        struct rtable *rt;
3078                        int err;
3079
3080                        if (*fa_index < fa_start)
3081                                goto next;
3082
3083                        if (fnhe->fnhe_genid != genid)
3084                                goto next;
3085
3086                        if (fnhe->fnhe_expires &&
3087                            time_after(jiffies, fnhe->fnhe_expires))
3088                                goto next;
3089
3090                        rt = rcu_dereference(fnhe->fnhe_rth_input);
3091                        if (!rt)
3092                                rt = rcu_dereference(fnhe->fnhe_rth_output);
3093                        if (!rt)
3094                                goto next;
3095
3096                        err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3097                                           table_id, NULL, skb,
3098                                           NETLINK_CB(cb->skb).portid,
3099                                           cb->nlh->nlmsg_seq, flags);
3100                        if (err)
3101                                return err;
3102next:
3103                        (*fa_index)++;
3104                }
3105        }
3106
3107        return 0;
3108}
3109
3110int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3111                       u32 table_id, struct fib_info *fi,
3112                       int *fa_index, int fa_start, unsigned int flags)
3113{
3114        struct net *net = sock_net(cb->skb->sk);
3115        int nhsel, genid = fnhe_genid(net);
3116
3117        for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3118                struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3119                struct fnhe_hash_bucket *bucket;
3120                int err;
3121
3122                if (nhc->nhc_flags & RTNH_F_DEAD)
3123                        continue;
3124
3125                rcu_read_lock();
3126                bucket = rcu_dereference(nhc->nhc_exceptions);
3127                err = 0;
3128                if (bucket)
3129                        err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3130                                               genid, fa_index, fa_start,
3131                                               flags);
3132                rcu_read_unlock();
3133                if (err)
3134                        return err;
3135        }
3136
3137        return 0;
3138}
3139
3140static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3141                                                   u8 ip_proto, __be16 sport,
3142                                                   __be16 dport)
3143{
3144        struct sk_buff *skb;
3145        struct iphdr *iph;
3146
3147        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3148        if (!skb)
3149                return NULL;
3150
3151        /* Reserve room for dummy headers; this skb can pass
3152         * through a good chunk of the routing engine.
3153         */
3154        skb_reset_mac_header(skb);
3155        skb_reset_network_header(skb);
3156        skb->protocol = htons(ETH_P_IP);
3157        iph = skb_put(skb, sizeof(struct iphdr));
3158        iph->protocol = ip_proto;
3159        iph->saddr = src;
3160        iph->daddr = dst;
3161        iph->version = 0x4;
3162        iph->frag_off = 0;
3163        iph->ihl = 0x5;
3164        skb_set_transport_header(skb, skb->len);
3165
3166        switch (iph->protocol) {
3167        case IPPROTO_UDP: {
3168                struct udphdr *udph;
3169
3170                udph = skb_put_zero(skb, sizeof(struct udphdr));
3171                udph->source = sport;
3172                udph->dest = dport;
3173                udph->len = sizeof(struct udphdr);
3174                udph->check = 0;
3175                break;
3176        }
3177        case IPPROTO_TCP: {
3178                struct tcphdr *tcph;
3179
3180                tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3181                tcph->source    = sport;
3182                tcph->dest      = dport;
3183                tcph->doff      = sizeof(struct tcphdr) / 4;
3184                tcph->rst = 1;
3185                tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3186                                            src, dst, 0);
3187                break;
3188        }
3189        case IPPROTO_ICMP: {
3190                struct icmphdr *icmph;
3191
3192                icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3193                icmph->type = ICMP_ECHO;
3194                icmph->code = 0;
3195        }
3196        }
3197
3198        return skb;
3199}
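/* The dummy transport header built above lets a route get request that
 * carries sport/dport/ip_proto exercise the same lookup paths as real
 * traffic (notably layer-4 multipath hashing), so only the fields
 * those paths actually read are populated.
 */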
3200
3201static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3202                                       const struct nlmsghdr *nlh,
3203                                       struct nlattr **tb,
3204                                       struct netlink_ext_ack *extack)
3205{
3206        struct rtmsg *rtm;
3207        int i, err;
3208
3209        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3210                NL_SET_ERR_MSG(extack,
3211                               "ipv4: Invalid header for route get request");
3212                return -EINVAL;
3213        }
3214
3215        if (!netlink_strict_get_check(skb))
3216                return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3217                                              rtm_ipv4_policy, extack);
3218
3219        rtm = nlmsg_data(nlh);
3220        if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3221            (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3222            rtm->rtm_table || rtm->rtm_protocol ||
3223            rtm->rtm_scope || rtm->rtm_type) {
3224                NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3225                return -EINVAL;
3226        }
3227
3228        if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3229                               RTM_F_LOOKUP_TABLE |
3230                               RTM_F_FIB_MATCH)) {
3231                NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3232                return -EINVAL;
3233        }
3234
3235        err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3236                                            rtm_ipv4_policy, extack);
3237        if (err)
3238                return err;
3239
3240        if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3241            (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3242                NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3243                return -EINVAL;
3244        }
3245
3246        for (i = 0; i <= RTA_MAX; i++) {
3247                if (!tb[i])
3248                        continue;
3249
3250                switch (i) {
3251                case RTA_IIF:
3252                case RTA_OIF:
3253                case RTA_SRC:
3254                case RTA_DST:
3255                case RTA_IP_PROTO:
3256                case RTA_SPORT:
3257                case RTA_DPORT:
3258                case RTA_MARK:
3259                case RTA_UID:
3260                        break;
3261                default:
3262                        NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3263                        return -EINVAL;
3264                }
3265        }
3266
3267        return 0;
3268}
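/* For reference, a strict-mode request that passes the validation above
 * could be built in userspace roughly like this (sketch only; socket
 * setup, alignment padding and error handling are omitted):
 *
 *	struct {
 *		struct nlmsghdr nlh;
 *		struct rtmsg rtm;
 *		struct nlattr dst_attr;
 *		struct in_addr dst;
 *	} req = {
 *		.nlh.nlmsg_len   = sizeof(req),
 *		.nlh.nlmsg_type  = RTM_GETROUTE,
 *		.nlh.nlmsg_flags = NLM_F_REQUEST,
 *		.rtm.rtm_family  = AF_INET,
 *		.rtm.rtm_dst_len = 32,		// must pair with RTA_DST
 *		.dst_attr.nla_len  = NLA_HDRLEN + sizeof(struct in_addr),
 *		.dst_attr.nla_type = RTA_DST,
 *		.dst = { .s_addr = ... },	// destination to query
 *	};
 */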
3269
3270static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3271                             struct netlink_ext_ack *extack)
3272{
3273        struct net *net = sock_net(in_skb->sk);
3274        struct nlattr *tb[RTA_MAX+1];
3275        u32 table_id = RT_TABLE_MAIN;
3276        __be16 sport = 0, dport = 0;
3277        struct fib_result res = {};
3278        u8 ip_proto = IPPROTO_UDP;
3279        struct rtable *rt = NULL;
3280        struct sk_buff *skb;
3281        struct rtmsg *rtm;
3282        struct flowi4 fl4 = {};
3283        __be32 dst = 0;
3284        __be32 src = 0;
3285        kuid_t uid;
3286        u32 iif;
3287        int err;
3288        int mark;
3289
3290        err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3291        if (err < 0)
3292                return err;
3293
3294        rtm = nlmsg_data(nlh);
3295        src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3296        dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3297        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3298        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3299        if (tb[RTA_UID])
3300                uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3301        else
3302                uid = (iif ? INVALID_UID : current_uid());
3303
3304        if (tb[RTA_IP_PROTO]) {
3305                err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3306                                                  &ip_proto, AF_INET, extack);
3307                if (err)
3308                        return err;
3309        }
3310
3311        if (tb[RTA_SPORT])
3312                sport = nla_get_be16(tb[RTA_SPORT]);
3313
3314        if (tb[RTA_DPORT])
3315                dport = nla_get_be16(tb[RTA_DPORT]);
3316
3317        skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3318        if (!skb)
3319                return -ENOBUFS;
3320
3321        fl4.daddr = dst;
3322        fl4.saddr = src;
3323        fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3324        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3325        fl4.flowi4_mark = mark;
3326        fl4.flowi4_uid = uid;
3327        if (sport)
3328                fl4.fl4_sport = sport;
3329        if (dport)
3330                fl4.fl4_dport = dport;
3331        fl4.flowi4_proto = ip_proto;
3332
3333        rcu_read_lock();
3334
3335        if (iif) {
3336                struct net_device *dev;
3337
3338                dev = dev_get_by_index_rcu(net, iif);
3339                if (!dev) {
3340                        err = -ENODEV;
3341                        goto errout_rcu;
3342                }
3343
3344                fl4.flowi4_iif = iif; /* for rt_fill_info */
3345                skb->dev        = dev;
3346                skb->mark       = mark;
3347                err = ip_route_input_rcu(skb, dst, src,
3348                                         rtm->rtm_tos & IPTOS_RT_MASK, dev,
3349                                         &res);
3350
3351                rt = skb_rtable(skb);
3352                if (err == 0 && rt->dst.error)
3353                        err = -rt->dst.error;
3354        } else {
3355                fl4.flowi4_iif = LOOPBACK_IFINDEX;
3356                skb->dev = net->loopback_dev;
3357                rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3358                err = 0;
3359                if (IS_ERR(rt))
3360                        err = PTR_ERR(rt);
3361                else
3362                        skb_dst_set(skb, &rt->dst);
3363        }
3364
3365        if (err)
3366                goto errout_rcu;
3367
3368        if (rtm->rtm_flags & RTM_F_NOTIFY)
3369                rt->rt_flags |= RTCF_NOTIFY;
3370
3371        if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3372                table_id = res.table ? res.table->tb_id : 0;
3373
3374        /* reset skb for netlink reply msg */
3375        skb_trim(skb, 0);
3376        skb_reset_network_header(skb);
3377        skb_reset_transport_header(skb);
3378        skb_reset_mac_header(skb);
3379
3380        if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3381                struct fib_rt_info fri;
3382
3383                if (!res.fi) {
3384                        err = fib_props[res.type].error;
3385                        if (!err)
3386                                err = -EHOSTUNREACH;
3387                        goto errout_rcu;
3388                }
3389                fri.fi = res.fi;
3390                fri.tb_id = table_id;
3391                fri.dst = res.prefix;
3392                fri.dst_len = res.prefixlen;
3393                fri.tos = fl4.flowi4_tos;
3394                fri.type = rt->rt_type;
3395                fri.offload = 0;
3396                fri.trap = 0;
3397                fri.offload_failed = 0;
3398                if (res.fa_head) {
3399                        struct fib_alias *fa;
3400
3401                        hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3402                                u8 slen = 32 - fri.dst_len;
3403
3404                                if (fa->fa_slen == slen &&
3405                                    fa->tb_id == fri.tb_id &&
3406                                    fa->fa_tos == fri.tos &&
3407                                    fa->fa_info == res.fi &&
3408                                    fa->fa_type == fri.type) {
3409                                        fri.offload = fa->offload;
3410                                        fri.trap = fa->trap;
3411                                        break;
3412                                }
3413                        }
3414                }
3415                err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3416                                    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3417        } else {
3418                err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3419                                   NETLINK_CB(in_skb).portid,
3420                                   nlh->nlmsg_seq, 0);
3421        }
3422        if (err < 0)
3423                goto errout_rcu;
3424
3425        rcu_read_unlock();
3426
3427        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3428
3429errout_free:
3430        return err;
3431errout_rcu:
3432        rcu_read_unlock();
3433        kfree_skb(skb);
3434        goto errout_free;
3435}
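/* This is the handler behind "ip route get"; for example
 * "ip route get 192.0.2.1 mark 7" arrives as RTM_GETROUTE with RTA_DST
 * and RTA_MARK, and is answered with a single RTM_NEWROUTE built by
 * rt_fill_info(), or by fib_dump_info() when RTM_F_FIB_MATCH asks for
 * the matching FIB entry instead of the resulting route.
 */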
3436
3437void ip_rt_multicast_event(struct in_device *in_dev)
3438{
3439        rt_cache_flush(dev_net(in_dev->dev));
3440}
3441
3442#ifdef CONFIG_SYSCTL
3443static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3444static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
3445static int ip_rt_gc_elasticity __read_mostly    = 8;
3446static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
3447
3448static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3449                void *buffer, size_t *lenp, loff_t *ppos)
3450{
3451        struct net *net = (struct net *)__ctl->extra1;
3452
3453        if (write) {
3454                rt_cache_flush(net);
3455                fnhe_genid_bump(net);
3456                return 0;
3457        }
3458
3459        return -EINVAL;
3460}
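/* The handler above backs the write-only (mode 0200) file
 * /proc/sys/net/ipv4/route/flush; e.g.
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * bumps both generation counters for the owning netns, invalidating
 * cached routes and next-hop exceptions.  Reads return -EINVAL.
 */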
3461
3462static struct ctl_table ipv4_route_table[] = {
3463        {
3464                .procname       = "gc_thresh",
3465                .data           = &ipv4_dst_ops.gc_thresh,
3466                .maxlen         = sizeof(int),
3467                .mode           = 0644,
3468                .proc_handler   = proc_dointvec,
3469        },
3470        {
3471                .procname       = "max_size",
3472                .data           = &ip_rt_max_size,
3473                .maxlen         = sizeof(int),
3474                .mode           = 0644,
3475                .proc_handler   = proc_dointvec,
3476        },
3477        {
3478                /* Deprecated. Use gc_min_interval_ms */
3479
3480                .procname       = "gc_min_interval",
3481                .data           = &ip_rt_gc_min_interval,
3482                .maxlen         = sizeof(int),
3483                .mode           = 0644,
3484                .proc_handler   = proc_dointvec_jiffies,
3485        },
3486        {
3487                .procname       = "gc_min_interval_ms",
3488                .data           = &ip_rt_gc_min_interval,
3489                .maxlen         = sizeof(int),
3490                .mode           = 0644,
3491                .proc_handler   = proc_dointvec_ms_jiffies,
3492        },
3493        {
3494                .procname       = "gc_timeout",
3495                .data           = &ip_rt_gc_timeout,
3496                .maxlen         = sizeof(int),
3497                .mode           = 0644,
3498                .proc_handler   = proc_dointvec_jiffies,
3499        },
3500        {
3501                .procname       = "gc_interval",
3502                .data           = &ip_rt_gc_interval,
3503                .maxlen         = sizeof(int),
3504                .mode           = 0644,
3505                .proc_handler   = proc_dointvec_jiffies,
3506        },
3507        {
3508                .procname       = "redirect_load",
3509                .data           = &ip_rt_redirect_load,
3510                .maxlen         = sizeof(int),
3511                .mode           = 0644,
3512                .proc_handler   = proc_dointvec,
3513        },
3514        {
3515                .procname       = "redirect_number",
3516                .data           = &ip_rt_redirect_number,
3517                .maxlen         = sizeof(int),
3518                .mode           = 0644,
3519                .proc_handler   = proc_dointvec,
3520        },
3521        {
3522                .procname       = "redirect_silence",
3523                .data           = &ip_rt_redirect_silence,
3524                .maxlen         = sizeof(int),
3525                .mode           = 0644,
3526                .proc_handler   = proc_dointvec,
3527        },
3528        {
3529                .procname       = "error_cost",
3530                .data           = &ip_rt_error_cost,
3531                .maxlen         = sizeof(int),
3532                .mode           = 0644,
3533                .proc_handler   = proc_dointvec,
3534        },
3535        {
3536                .procname       = "error_burst",
3537                .data           = &ip_rt_error_burst,
3538                .maxlen         = sizeof(int),
3539                .mode           = 0644,
3540                .proc_handler   = proc_dointvec,
3541        },
3542        {
3543                .procname       = "gc_elasticity",
3544                .data           = &ip_rt_gc_elasticity,
3545                .maxlen         = sizeof(int),
3546                .mode           = 0644,
3547                .proc_handler   = proc_dointvec,
3548        },
3549        {
3550                .procname       = "mtu_expires",
3551                .data           = &ip_rt_mtu_expires,
3552                .maxlen         = sizeof(int),
3553                .mode           = 0644,
3554                .proc_handler   = proc_dointvec_jiffies,
3555        },
3556        {
3557                .procname       = "min_pmtu",
3558                .data           = &ip_rt_min_pmtu,
3559                .maxlen         = sizeof(int),
3560                .mode           = 0644,
3561                .proc_handler   = proc_dointvec_minmax,
3562                .extra1         = &ip_min_valid_pmtu,
3563        },
3564        {
3565                .procname       = "min_adv_mss",
3566                .data           = &ip_rt_min_advmss,
3567                .maxlen         = sizeof(int),
3568                .mode           = 0644,
3569                .proc_handler   = proc_dointvec,
3570        },
3571        { }
3572};
3573
3574static const char ipv4_route_flush_procname[] = "flush";
3575
3576static struct ctl_table ipv4_route_flush_table[] = {
3577        {
3578                .procname       = ipv4_route_flush_procname,
3579                .maxlen         = sizeof(int),
3580                .mode           = 0200,
3581                .proc_handler   = ipv4_sysctl_rtcache_flush,
3582        },
3583        { },
3584};
3585
3586static __net_init int sysctl_route_net_init(struct net *net)
3587{
3588        struct ctl_table *tbl;
3589
3590        tbl = ipv4_route_flush_table;
3591        if (!net_eq(net, &init_net)) {
3592                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3593                if (!tbl)
3594                        goto err_dup;
3595
3596                /* Don't export non-whitelisted sysctls to unprivileged users */
3597                if (net->user_ns != &init_user_ns) {
3598                        if (tbl[0].procname != ipv4_route_flush_procname)
3599                                tbl[0].procname = NULL;
3600                }
3601        }
3602        tbl[0].extra1 = net;
3603
3604        net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3605        if (!net->ipv4.route_hdr)
3606                goto err_reg;
3607        return 0;
3608
3609err_reg:
3610        if (tbl != ipv4_route_flush_table)
3611                kfree(tbl);
3612err_dup:
3613        return -ENOMEM;
3614}
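/* Every netns other than init_net registers a private copy of the
 * flush table so that tbl[0].extra1 can point back at that netns;
 * "flush" is also the one entry deliberately left visible when the
 * netns is owned by a non-initial user namespace.
 */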
3615
3616static __net_exit void sysctl_route_net_exit(struct net *net)
3617{
3618        struct ctl_table *tbl;
3619
3620        tbl = net->ipv4.route_hdr->ctl_table_arg;
3621        unregister_net_sysctl_table(net->ipv4.route_hdr);
3622        BUG_ON(tbl == ipv4_route_flush_table);
3623        kfree(tbl);
3624}
3625
3626static __net_initdata struct pernet_operations sysctl_route_ops = {
3627        .init = sysctl_route_net_init,
3628        .exit = sysctl_route_net_exit,
3629};
3630#endif
3631
3632static __net_init int rt_genid_init(struct net *net)
3633{
3634        atomic_set(&net->ipv4.rt_genid, 0);
3635        atomic_set(&net->fnhe_genid, 0);
3636        atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3637        return 0;
3638}
3639
3640static __net_initdata struct pernet_operations rt_genid_ops = {
3641        .init = rt_genid_init,
3642};
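/* The generation counters set up above are this file's invalidation
 * mechanism: bumping net->ipv4.rt_genid (as rt_cache_flush() does)
 * makes every cached rtable fail its genid comparison and be rebuilt
 * on next use, with no need to walk or free anything eagerly.
 */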
3643
3644static int __net_init ipv4_inetpeer_init(struct net *net)
3645{
3646        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3647
3648        if (!bp)
3649                return -ENOMEM;
3650        inet_peer_base_init(bp);
3651        net->ipv4.peers = bp;
3652        return 0;
3653}
3654
3655static void __net_exit ipv4_inetpeer_exit(struct net *net)
3656{
3657        struct inet_peer_base *bp = net->ipv4.peers;
3658
3659        net->ipv4.peers = NULL;
3660        inetpeer_invalidate_tree(bp);
3661        kfree(bp);
3662}
3663
3664static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3665        .init   =       ipv4_inetpeer_init,
3666        .exit   =       ipv4_inetpeer_exit,
3667};
3668
3669#ifdef CONFIG_IP_ROUTE_CLASSID
3670struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3671#endif /* CONFIG_IP_ROUTE_CLASSID */
3672
3673int __init ip_rt_init(void)
3674{
3675        void *idents_hash;
3676        int cpu;
3677
3678        /* For modern hosts, this will use 2 MB of memory */
3679        idents_hash = alloc_large_system_hash("IP idents",
3680                                              sizeof(*ip_idents) + sizeof(*ip_tstamps),
3681                                              0,
3682                                              16, /* one bucket per 64 KB */
3683                                              HASH_ZERO,
3684                                              NULL,
3685                                              &ip_idents_mask,
3686                                              2048,
3687                                              256*1024);
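	/* Worst case from the bounds above: 256K slots, each holding a
	 * 4-byte ident plus a 4-byte timestamp, i.e. 256K * 8 B = 2 MB,
	 * which is where the "2 MB" figure in the comment comes from.
	 */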
3688
3689        ip_idents = idents_hash;
3690
3691        prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3692
3693        ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3694
3695        for_each_possible_cpu(cpu) {
3696                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3697
3698                INIT_LIST_HEAD(&ul->head);
3699                spin_lock_init(&ul->lock);
3700        }
3701#ifdef CONFIG_IP_ROUTE_CLASSID
3702        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3703        if (!ip_rt_acct)
3704                panic("IP: failed to allocate ip_rt_acct\n");
3705#endif
3706
3707        ipv4_dst_ops.kmem_cachep =
3708                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3709                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3710
3711        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3712
3713        if (dst_entries_init(&ipv4_dst_ops) < 0)
3714                panic("IP: failed to allocate ipv4_dst_ops counter\n");
3715
3716        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3717                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3718
3719        ipv4_dst_ops.gc_thresh = ~0;
3720        ip_rt_max_size = INT_MAX;
3721
3722        devinet_init();
3723        ip_fib_init();
3724
3725        if (ip_rt_proc_init())
3726                pr_err("Unable to create route proc files\n");
3727#ifdef CONFIG_XFRM
3728        xfrm_init();
3729        xfrm4_init();
3730#endif
3731        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3732                      RTNL_FLAG_DOIT_UNLOCKED);
3733
3734#ifdef CONFIG_SYSCTL
3735        register_pernet_subsys(&sysctl_route_ops);
3736#endif
3737        register_pernet_subsys(&rt_genid_ops);
3738        register_pernet_subsys(&ipv4_inetpeer_ops);
3739        return 0;
3740}
3741
3742#ifdef CONFIG_SYSCTL
3743/*
3744 * We really need to sanitize the IPv4 init order; then all of
3745 * this nonsense will go away.
3746 */
3747void __init ip_static_sysctl_init(void)
3748{
3749        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3750}
3751#endif
3752