linux/net/ipv4/route.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
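
/* Example: rt_tos2priority() in include/net/route.h indexes this table
 * with (IPTOS_TOS(tos) >> 1); e.g. a TOS byte of 0x10 (IPTOS_LOWDELAY)
 * yields index 8 and thus TC_PRIO_INTERACTIVE.
 */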

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;

        rt = (const struct rtable *)dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *)&rt->rt_gateway;
        else if (!daddr ||
                 (rt->rt_flags &
                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
                return;

        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = READ_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 new, delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* Do not use atomic_add_return() as it makes UBSAN unhappy */
        do {
                old = (u32)atomic_read(p_id);
                new = old + delta + segs;
        } while (atomic_cmpxchg(p_id, old, new) != old);

        return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
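
/* A caller asking for "segs" consecutive IP ids gets back the first id of
 * the block; the bucket's counter ends up advanced by delta + segs, where
 * delta is a random perturbation added only when the bucket has been idle
 * for at least one jiffy.
 */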

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        static u32 ip_idents_hashrnd __read_mostly;
        u32 hash, id;

        net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

        hash = jhash_3words((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol ^ net_hash_mix(net),
                            ip_idents_hashrnd);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}
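
/* The value returned above is a bucket index into the per-nexthop exception
 * table (nh->nh_exceptions), which has FNHE_HASH_SIZE (1 << FNHE_HASH_SHIFT)
 * buckets; see update_or_create_fnhe() below.
 */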

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, bool lock, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        u32 genid, hval;
        unsigned int i;
        int depth;

        genid = fnhe_genid(dev_net(nh->nh_dev));
        hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nh->nh_exceptions);
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nh->nh_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (fnhe->fnhe_genid != genid)
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_mtu_locked = lock;
                }
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = genid;
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_mtu_locked = lock;
                fnhe->fnhe_expires = expires;

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                0, false,
                                                jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct net *net = dev_net(skb->dev);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
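
/* Worked example, assuming HZ=1000 and the defaults above: the n-th
 * consecutive redirect to a peer is only sent once rate_last +
 * (ip_rt_redirect_load << rate_tokens) has passed, i.e. 40ms, 80ms,
 * 160ms, ... after the previous one; once ip_rt_redirect_number (9)
 * redirects have been ignored we go quiet until ip_rt_redirect_silence
 * (~20.5s) has elapsed.
 */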

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        if (netif_is_l3_master(skb->dev)) {
                dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
                if (!dev)
                        goto out;
        }

        in_dev = __in_dev_get_rcu(dev);

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

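        /* Classic token bucket in jiffies: the peer accrues one token per
         * jiffy, capped at ip_rt_error_burst (5 * HZ); each ICMP error sent
         * costs ip_rt_error_cost (HZ), i.e. roughly one message per second
         * with bursts of up to five.
         */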
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;
        bool lock = false;

        if (ip_mtu_locked(dst))
                return;

        if (ipv4_mtu(dst) < mtu)
                return;

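        /* Clamp the learned PMTU to ip_rt_min_pmtu (552 = 512 + 20 + 20 by
         * default) and lock it there, guarding against absurdly small values
         * from e.g. forged ICMP "fragmentation needed" messages.
         */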
        if (mtu < ip_rt_min_pmtu) {
                lock = true;
                mtu = ip_rt_min_pmtu;
        }

        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        if (!mark)
                mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
                                    ip_rt_min_advmss);

        return min(advmss, IPV4_MAX_PMTU - header_size);
}

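/* MTU resolution order below: a still-valid PMTU learned via ICMP or a
 * nexthop exception (rt_pmtu) wins; otherwise the route metric (RTAX_MTU);
 * otherwise the device MTU, capped at IP_MAX_MTU and reduced by any
 * lwtunnel encapsulation headroom.
 */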
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = READ_ONCE(dst->dev->mtu);

        if (unlikely(ip_mtu_locked(dst))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr, const bool do_cache)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (do_cache) {
                        dst_hold(&rt->dst);
                        rcu_assign_pointer(*porig, rt);
                        if (orig) {
                                dst_dev_put(&orig->dst);
                                dst_release(&orig->dst);
                        }
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        /* hold dst before doing cmpxchg() to avoid race condition
         * on this dst
         */
        dst_hold(&rt->dst);
        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig) {
                        dst_dev_put(&orig->dst);
                        dst_release(&orig->dst);
                }
        } else {
                dst_release(&rt->dst);
                ret = false;
        }

        return ret;
}

struct uncached_list {
        spinlock_t              lock;
        struct list_head        head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
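
/* Routes that are not (or could not be) cached in a nexthop are kept on
 * these per-cpu lists, so that rt_flush_dev() below can still find them
 * and re-point their device reference when a netdevice goes away.
 */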
1403
1404void rt_add_uncached_list(struct rtable *rt)
1405{
1406        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1407
1408        rt->rt_uncached_list = ul;
1409
1410        spin_lock_bh(&ul->lock);
1411        list_add_tail(&rt->rt_uncached, &ul->head);
1412        spin_unlock_bh(&ul->lock);
1413}
1414
1415void rt_del_uncached_list(struct rtable *rt)
1416{
1417        if (!list_empty(&rt->rt_uncached)) {
1418                struct uncached_list *ul = rt->rt_uncached_list;
1419
1420                spin_lock_bh(&ul->lock);
1421                list_del(&rt->rt_uncached);
1422                spin_unlock_bh(&ul->lock);
1423        }
1424}
1425
1426static void ipv4_dst_destroy(struct dst_entry *dst)
1427{
1428        struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1429        struct rtable *rt = (struct rtable *)dst;
1430
1431        if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1432                kfree(p);
1433
1434        rt_del_uncached_list(rt);
1435}
1436
1437void rt_flush_dev(struct net_device *dev)
1438{
1439        struct net *net = dev_net(dev);
1440        struct rtable *rt;
1441        int cpu;
1442
1443        for_each_possible_cpu(cpu) {
1444                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1445
1446                spin_lock_bh(&ul->lock);
1447                list_for_each_entry(rt, &ul->head, rt_uncached) {
1448                        if (rt->dst.dev != dev)
1449                                continue;
1450                        rt->dst.dev = net->loopback_dev;
1451                        dev_hold(rt->dst.dev);
1452                        dev_put(dev);
1453                }
1454                spin_unlock_bh(&ul->lock);
1455        }
1456}
1457
1458static bool rt_cache_valid(const struct rtable *rt)
1459{
1460        return  rt &&
1461                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1462                !rt_is_expired(rt);
1463}
1464
1465static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1466                           const struct fib_result *res,
1467                           struct fib_nh_exception *fnhe,
1468                           struct fib_info *fi, u16 type, u32 itag,
1469                           const bool do_cache)
1470{
1471        bool cached = false;
1472
1473        if (fi) {
1474                struct fib_nh *nh = &FIB_RES_NH(*res);
1475
1476                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1477                        rt->rt_gateway = nh->nh_gw;
1478                        rt->rt_uses_gateway = 1;
1479                }
1480                dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1481                if (fi->fib_metrics != &dst_default_metrics) {
1482                        rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1483                        refcount_inc(&fi->fib_metrics->refcnt);
1484                }
1485#ifdef CONFIG_IP_ROUTE_CLASSID
1486                rt->dst.tclassid = nh->nh_tclassid;
1487#endif
1488                rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1489                if (unlikely(fnhe))
1490                        cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1491                else if (do_cache)
1492                        cached = rt_cache_route(nh, rt);
1493                if (unlikely(!cached)) {
1494                        /* Routes we intend to cache in nexthop exception or
1495                         * FIB nexthop have the DST_NOCACHE bit clear.
1496                         * However, if we are unsuccessful at storing this
1497                         * route into the cache we really need to set it.
1498                         */
1499                        if (!rt->rt_gateway)
1500                                rt->rt_gateway = daddr;
1501                        rt_add_uncached_list(rt);
1502                }
1503        } else
1504                rt_add_uncached_list(rt);
1505
1506#ifdef CONFIG_IP_ROUTE_CLASSID
1507#ifdef CONFIG_IP_MULTIPLE_TABLES
1508        set_class_tag(rt, res->tclassid);
1509#endif
1510        set_class_tag(rt, itag);
1511#endif
1512}
1513
1514struct rtable *rt_dst_alloc(struct net_device *dev,
1515                            unsigned int flags, u16 type,
1516                            bool nopolicy, bool noxfrm, bool will_cache)
1517{
1518        struct rtable *rt;
1519
1520        rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1521                       (will_cache ? 0 : DST_HOST) |
1522                       (nopolicy ? DST_NOPOLICY : 0) |
1523                       (noxfrm ? DST_NOXFRM : 0));
1524
1525        if (rt) {
1526                rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1527                rt->rt_flags = flags;
1528                rt->rt_type = type;
1529                rt->rt_is_input = 0;
1530                rt->rt_iif = 0;
1531                rt->rt_pmtu = 0;
1532                rt->rt_mtu_locked = 0;
1533                rt->rt_gateway = 0;
1534                rt->rt_uses_gateway = 0;
1535                rt->rt_table_id = 0;
1536                INIT_LIST_HEAD(&rt->rt_uncached);
1537
1538                rt->dst.output = ip_output;
1539                if (flags & RTCF_LOCAL)
1540                        rt->dst.input = ip_local_deliver;
1541        }
1542
1543        return rt;
1544}
1545EXPORT_SYMBOL(rt_dst_alloc);
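
/* A minimal usage sketch (hypothetical caller, for illustration only):
 * allocate a local output route.  Because RTCF_LOCAL is set, rt_dst_alloc()
 * presets dst.input to ip_local_deliver; dst.output defaults to ip_output:
 *
 *        struct rtable *rt;
 *
 *        rt = rt_dst_alloc(dev, RTCF_LOCAL, RTN_LOCAL, false, false, false);
 *        if (!rt)
 *                return -ENOBUFS;
 */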
1546
1547/* called in rcu_read_lock() section */
1548int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1549                          u8 tos, struct net_device *dev,
1550                          struct in_device *in_dev, u32 *itag)
1551{
1552        int err;
1553
1554        /* Primary sanity checks. */
1555        if (!in_dev)
1556                return -EINVAL;
1557
1558        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1559            skb->protocol != htons(ETH_P_IP))
1560                return -EINVAL;
1561
1562        if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1563                return -EINVAL;
1564
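        /* A zero source address is tolerated only for link-local multicast
         * (224.0.0.0/24), e.g. IGMP sent by a host that has not yet acquired
         * an address; everything else must pass fib_validate_source().
         */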
1565        if (ipv4_is_zeronet(saddr)) {
1566                if (!ipv4_is_local_multicast(daddr))
1567                        return -EINVAL;
1568        } else {
1569                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1570                                          in_dev, itag);
1571                if (err < 0)
1572                        return err;
1573        }
1574        return 0;
1575}
1576
1577/* called in rcu_read_lock() section */
1578static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1579                             u8 tos, struct net_device *dev, int our)
1580{
1581        struct in_device *in_dev = __in_dev_get_rcu(dev);
1582        unsigned int flags = RTCF_MULTICAST;
1583        struct rtable *rth;
1584        u32 itag = 0;
1585        int err;
1586
1587        err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1588        if (err)
1589                return err;
1590
1591        if (our)
1592                flags |= RTCF_LOCAL;
1593
1594        rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1595                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1596        if (!rth)
1597                return -ENOBUFS;
1598
1599#ifdef CONFIG_IP_ROUTE_CLASSID
1600        rth->dst.tclassid = itag;
1601#endif
1602        rth->dst.output = ip_rt_bug;
1603        rth->rt_is_input = 1;
1604
1605#ifdef CONFIG_IP_MROUTE
1606        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1607                rth->dst.input = ip_mr_input;
1608#endif
1609        RT_CACHE_STAT_INC(in_slow_mc);
1610
1611        skb_dst_set(skb, &rth->dst);
1612        return 0;
1613}
1614
1615
1616static void ip_handle_martian_source(struct net_device *dev,
1617                                     struct in_device *in_dev,
1618                                     struct sk_buff *skb,
1619                                     __be32 daddr,
1620                                     __be32 saddr)
1621{
1622        RT_CACHE_STAT_INC(in_martian_src);
1623#ifdef CONFIG_IP_ROUTE_VERBOSE
1624        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1625                /*
1626                 *      RFC1812 recommendation: if the source is martian,
1627                 *      the only hint is the MAC header.
1628                 */
1629                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1630                        &daddr, &saddr, dev->name);
1631                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1632                        print_hex_dump(KERN_WARNING, "ll header: ",
1633                                       DUMP_PREFIX_OFFSET, 16, 1,
1634                                       skb_mac_header(skb),
1635                                       dev->hard_header_len, true);
1636                }
1637        }
1638#endif
1639}
1640
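/* Unlink and free one cached nexthop exception.  Writers serialize on
 * fnhe_lock; lockless readers walking the chain remain safe because the
 * entry is unlinked with rcu_assign_pointer() and freed only after a grace
 * period (kfree_rcu).
 */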
1641static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1642{
1643        struct fnhe_hash_bucket *hash;
1644        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1645        u32 hval = fnhe_hashfun(daddr);
1646
1647        spin_lock_bh(&fnhe_lock);
1648
1649        hash = rcu_dereference_protected(nh->nh_exceptions,
1650                                         lockdep_is_held(&fnhe_lock));
1651        hash += hval;
1652
1653        fnhe_p = &hash->chain;
1654        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1655        while (fnhe) {
1656                if (fnhe->fnhe_daddr == daddr) {
1657                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1658                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1659                        fnhe_flush_routes(fnhe);
1660                        kfree_rcu(fnhe, rcu);
1661                        break;
1662                }
1663                fnhe_p = &fnhe->fnhe_next;
1664                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1665                                                 lockdep_is_held(&fnhe_lock));
1666        }
1667
1668        spin_unlock_bh(&fnhe_lock);
1669}
1670
1671static void set_lwt_redirect(struct rtable *rth)
1672{
1673        if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1674                rth->dst.lwtstate->orig_output = rth->dst.output;
1675                rth->dst.output = lwtunnel_output;
1676        }
1677
1678        if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1679                rth->dst.lwtstate->orig_input = rth->dst.input;
1680                rth->dst.input = lwtunnel_input;
1681        }
1682}
1683
1684/* called in rcu_read_lock() section */
1685static int __mkroute_input(struct sk_buff *skb,
1686                           const struct fib_result *res,
1687                           struct in_device *in_dev,
1688                           __be32 daddr, __be32 saddr, u32 tos)
1689{
1690        struct fib_nh_exception *fnhe;
1691        struct rtable *rth;
1692        int err;
1693        struct in_device *out_dev;
1694        bool do_cache;
1695        u32 itag = 0;
1696
1697        /* get a working reference to the output device */
1698        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1699        if (!out_dev) {
1700                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1701                return -EINVAL;
1702        }
1703
1704        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1705                                  in_dev->dev, in_dev, &itag);
1706        if (err < 0) {
1707                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1708                                         saddr);
1709
1710                goto cleanup;
1711        }
1712
1713        do_cache = res->fi && !itag;
1714        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1715            skb->protocol == htons(ETH_P_IP) &&
1716            (IN_DEV_SHARED_MEDIA(out_dev) ||
1717             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1718                IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1719
1720        if (skb->protocol != htons(ETH_P_IP)) {
1721                /* Not IP (i.e. ARP). Do not create a route if it is
1722                 * invalid for proxy arp. DNAT routes are always valid.
1723                 *
1724                 * The proxy arp feature has been extended to allow ARP
1725                 * replies back on the same interface, to support
1726                 * private VLAN switch technologies. See arp.c.
1727                 */
1728                if (out_dev == in_dev &&
1729                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1730                        err = -EINVAL;
1731                        goto cleanup;
1732                }
1733        }
1734
1735        fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1736        if (do_cache) {
1737                if (fnhe) {
1738                        rth = rcu_dereference(fnhe->fnhe_rth_input);
1739                        if (rth && rth->dst.expires &&
1740                            time_after(jiffies, rth->dst.expires)) {
1741                                ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1742                                fnhe = NULL;
1743                        } else {
1744                                goto rt_cache;
1745                        }
1746                }
1747
1748                rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1749
1750rt_cache:
1751                if (rt_cache_valid(rth)) {
1752                        skb_dst_set_noref(skb, &rth->dst);
1753                        goto out;
1754                }
1755        }
1756
1757        rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1758                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
1759                           IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1760        if (!rth) {
1761                err = -ENOBUFS;
1762                goto cleanup;
1763        }
1764
1765        rth->rt_is_input = 1;
1766        if (res->table)
1767                rth->rt_table_id = res->table->tb_id;
1768        RT_CACHE_STAT_INC(in_slow_tot);
1769
1770        rth->dst.input = ip_forward;
1771
1772        rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1773                       do_cache);
1774        set_lwt_redirect(rth);
1775        skb_dst_set(skb, &rth->dst);
1776out:
1777        err = 0;
1778 cleanup:
1779        return err;
1780}
1781
1782#ifdef CONFIG_IP_ROUTE_MULTIPATH
1783/* To make ICMP packets follow the right flow, the multipath hash is
1784 * calculated from the inner IP addresses.
1785 */
1786static void ip_multipath_l3_keys(const struct sk_buff *skb,
1787                                 struct flow_keys *hash_keys)
1788{
1789        const struct iphdr *outer_iph = ip_hdr(skb);
1790        const struct iphdr *inner_iph;
1791        const struct icmphdr *icmph;
1792        struct iphdr _inner_iph;
1793        struct icmphdr _icmph;
1794
1795        hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1796        hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1797        if (likely(outer_iph->protocol != IPPROTO_ICMP))
1798                return;
1799
1800        if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1801                return;
1802
1803        icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1804                                   &_icmph);
1805        if (!icmph)
1806                return;
1807
1808        if (icmph->type != ICMP_DEST_UNREACH &&
1809            icmph->type != ICMP_REDIRECT &&
1810            icmph->type != ICMP_TIME_EXCEEDED &&
1811            icmph->type != ICMP_PARAMETERPROB)
1812                return;
1813
1814        inner_iph = skb_header_pointer(skb,
1815                                       outer_iph->ihl * 4 + sizeof(_icmph),
1816                                       sizeof(_inner_iph), &_inner_iph);
1817        if (!inner_iph)
1818                return;
1819        hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1820        hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1821}
1822
1823/* If skb is set it will be used, and fl4 can then be NULL. */
1824int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1825                       const struct sk_buff *skb)
1826{
1827        struct net *net = fi->fib_net;
1828        struct flow_keys hash_keys;
1829        u32 mhash;
1830
1831        switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1832        case 0:
1833                memset(&hash_keys, 0, sizeof(hash_keys));
1834                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1835                if (skb) {
1836                        ip_multipath_l3_keys(skb, &hash_keys);
1837                } else {
1838                        hash_keys.addrs.v4addrs.src = fl4->saddr;
1839                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
1840                }
1841                break;
1842        case 1:
1843                /* skb is currently provided only when forwarding */
1844                if (skb) {
1845                        unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1846                        struct flow_keys keys;
1847
1848                        /* short-circuit if we already have L4 hash present */
1849                        if (skb->l4_hash)
1850                                return skb_get_hash_raw(skb) >> 1;
1851                        memset(&hash_keys, 0, sizeof(hash_keys));
1852                        skb_flow_dissect_flow_keys(skb, &keys, flag);
1853
1854                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1855                        hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1856                        hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1857                        hash_keys.ports.src = keys.ports.src;
1858                        hash_keys.ports.dst = keys.ports.dst;
1859                        hash_keys.basic.ip_proto = keys.basic.ip_proto;
1860                } else {
1861                        memset(&hash_keys, 0, sizeof(hash_keys));
1862                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1863                        hash_keys.addrs.v4addrs.src = fl4->saddr;
1864                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
1865                        hash_keys.ports.src = fl4->fl4_sport;
1866                        hash_keys.ports.dst = fl4->fl4_dport;
1867                        hash_keys.basic.ip_proto = fl4->flowi4_proto;
1868                }
1869                break;
1870        }
1871        mhash = flow_hash_from_keys(&hash_keys);
1872
1873        return mhash >> 1;
1874}
1875EXPORT_SYMBOL_GPL(fib_multipath_hash);
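
/* A usage sketch (assuming @res comes from a successful fib_lookup() with
 * res->fi->fib_nhs > 1, and @fl4 describes a locally generated flow, so no
 * skb is available):
 *
 *        int h = fib_multipath_hash(res->fi, fl4, NULL);
 *
 *        fib_select_multipath(res, h);
 *
 * The ">> 1" above keeps the hash within the positive range of an int,
 * which fib_select_multipath() compares against the per-nexthop upper
 * bounds.
 */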
1876#endif /* CONFIG_IP_ROUTE_MULTIPATH */
1877
1878static int ip_mkroute_input(struct sk_buff *skb,
1879                            struct fib_result *res,
1880                            struct in_device *in_dev,
1881                            __be32 daddr, __be32 saddr, u32 tos)
1882{
1883#ifdef CONFIG_IP_ROUTE_MULTIPATH
1884        if (res->fi && res->fi->fib_nhs > 1) {
1885                int h = fib_multipath_hash(res->fi, NULL, skb);
1886
1887                fib_select_multipath(res, h);
1888        }
1889#endif
1890
1891        /* create a routing cache entry */
1892        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1893}
1894
1895/*
1896 *      NOTE. We drop all packets that have a local source
1897 *      address, because every properly looped back packet
1898 *      must already have the correct destination attached by the output routine.
1899 *
1900 *      This approach solves two big problems:
1901 *      1. Non-simplex devices are handled properly.
1902 *      2. IP spoofing attempts are filtered with a 100% guarantee.
1903 *      Called with rcu_read_lock().
1904 */
1905
1906static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1907                               u8 tos, struct net_device *dev,
1908                               struct fib_result *res)
1909{
1910        struct in_device *in_dev = __in_dev_get_rcu(dev);
1911        struct ip_tunnel_info *tun_info;
1912        struct flowi4   fl4;
1913        unsigned int    flags = 0;
1914        u32             itag = 0;
1915        struct rtable   *rth;
1916        int             err = -EINVAL;
1917        struct net    *net = dev_net(dev);
1918        bool do_cache;
1919
1920        /* IP on this device is disabled. */
1921
1922        if (!in_dev)
1923                goto out;
1924
1925        /* Check for the weirdest martians, which cannot be detected
1926           by fib_lookup.
1927         */
1928
1929        tun_info = skb_tunnel_info(skb);
1930        if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1931                fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1932        else
1933                fl4.flowi4_tun_key.tun_id = 0;
1934        skb_dst_drop(skb);
1935
1936        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1937                goto martian_source;
1938
1939        res->fi = NULL;
1940        res->table = NULL;
1941        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1942                goto brd_input;
1943
1944        /* Accept zero addresses only for the limited broadcast;
1945         * I do not even know whether to fix this or not. Waiting for complaints :-)
1946         */
1947        if (ipv4_is_zeronet(saddr))
1948                goto martian_source;
1949
1950        if (ipv4_is_zeronet(daddr))
1951                goto martian_destination;
1952
1953        /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
1954         * and calls it at most once when daddr and/or saddr is a loopback address.
1955         */
1956        if (ipv4_is_loopback(daddr)) {
1957                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1958                        goto martian_destination;
1959        } else if (ipv4_is_loopback(saddr)) {
1960                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1961                        goto martian_source;
1962        }
1963
1964        /*
1965         *      Now we are ready to route packet.
1966         */
1967        fl4.flowi4_oif = 0;
1968        fl4.flowi4_iif = dev->ifindex;
1969        fl4.flowi4_mark = skb->mark;
1970        fl4.flowi4_tos = tos;
1971        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1972        fl4.flowi4_flags = 0;
1973        fl4.daddr = daddr;
1974        fl4.saddr = saddr;
1975        fl4.flowi4_uid = sock_net_uid(net, NULL);
1976        err = fib_lookup(net, &fl4, res, 0);
1977        if (err != 0) {
1978                if (!IN_DEV_FORWARD(in_dev))
1979                        err = -EHOSTUNREACH;
1980                goto no_route;
1981        }
1982
1983        if (res->type == RTN_BROADCAST)
1984                goto brd_input;
1985
1986        if (res->type == RTN_LOCAL) {
1987                err = fib_validate_source(skb, saddr, daddr, tos,
1988                                          0, dev, in_dev, &itag);
1989                if (err < 0)
1990                        goto martian_source;
1991                goto local_input;
1992        }
1993
1994        if (!IN_DEV_FORWARD(in_dev)) {
1995                err = -EHOSTUNREACH;
1996                goto no_route;
1997        }
1998        if (res->type != RTN_UNICAST)
1999                goto martian_destination;
2000
2001        err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2002out:    return err;
2003
2004brd_input:
2005        if (skb->protocol != htons(ETH_P_IP))
2006                goto e_inval;
2007
2008        if (!ipv4_is_zeronet(saddr)) {
2009                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2010                                          in_dev, &itag);
2011                if (err < 0)
2012                        goto martian_source;
2013        }
2014        flags |= RTCF_BROADCAST;
2015        res->type = RTN_BROADCAST;
2016        RT_CACHE_STAT_INC(in_brd);
2017
2018local_input:
2019        do_cache = false;
2020        if (res->fi) {
2021                if (!itag) {
2022                        rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2023                        if (rt_cache_valid(rth)) {
2024                                skb_dst_set_noref(skb, &rth->dst);
2025                                err = 0;
2026                                goto out;
2027                        }
2028                        do_cache = true;
2029                }
2030        }
2031
2032        rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2033                           flags | RTCF_LOCAL, res->type,
2034                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2035        if (!rth)
2036                goto e_nobufs;
2037
2038        rth->dst.output = ip_rt_bug;
2039#ifdef CONFIG_IP_ROUTE_CLASSID
2040        rth->dst.tclassid = itag;
2041#endif
2042        rth->rt_is_input = 1;
2043        if (res->table)
2044                rth->rt_table_id = res->table->tb_id;
2045
2046        RT_CACHE_STAT_INC(in_slow_tot);
2047        if (res->type == RTN_UNREACHABLE) {
2048                rth->dst.input = ip_error;
2049                rth->dst.error = -err;
2050                rth->rt_flags &= ~RTCF_LOCAL;
2051        }
2052
2053        if (do_cache) {
2054                struct fib_nh *nh = &FIB_RES_NH(*res);
2055
2056                rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2057                if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2058                        WARN_ON(rth->dst.input == lwtunnel_input);
2059                        rth->dst.lwtstate->orig_input = rth->dst.input;
2060                        rth->dst.input = lwtunnel_input;
2061                }
2062
2063                if (unlikely(!rt_cache_route(nh, rth)))
2064                        rt_add_uncached_list(rth);
2065        }
2066        skb_dst_set(skb, &rth->dst);
2067        err = 0;
2068        goto out;
2069
2070no_route:
2071        RT_CACHE_STAT_INC(in_no_route);
2072        res->type = RTN_UNREACHABLE;
2073        res->fi = NULL;
2074        res->table = NULL;
2075        goto local_input;
2076
2077        /*
2078         *      Do not cache martian addresses: they should be logged (RFC1812)
2079         */
2080martian_destination:
2081        RT_CACHE_STAT_INC(in_martian_dst);
2082#ifdef CONFIG_IP_ROUTE_VERBOSE
2083        if (IN_DEV_LOG_MARTIANS(in_dev))
2084                net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2085                                     &daddr, &saddr, dev->name);
2086#endif
2087
2088e_inval:
2089        err = -EINVAL;
2090        goto out;
2091
2092e_nobufs:
2093        err = -ENOBUFS;
2094        goto out;
2095
2096martian_source:
2097        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2098        goto out;
2099}
2100
2101int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2102                         u8 tos, struct net_device *dev)
2103{
2104        struct fib_result res;
2105        int err;
2106
2107        tos &= IPTOS_RT_MASK;
2108        rcu_read_lock();
2109        err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2110        rcu_read_unlock();
2111
2112        return err;
2113}
2114EXPORT_SYMBOL(ip_route_input_noref);
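
/* Sketch of the canonical caller (cf. ip_rcv_finish() in ip_input.c):
 *
 *        const struct iphdr *iph = ip_hdr(skb);
 *        int err;
 *
 *        err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *                                   iph->tos, skb->dev);
 *        if (unlikely(err))
 *                goto drop;
 */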
2115
2116/* called with rcu_read_lock held */
2117int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2118                       u8 tos, struct net_device *dev, struct fib_result *res)
2119{
2120        /* Multicast recognition logic was moved from the route cache to here.
2121           The problem was that too many Ethernet cards have broken/missing
2122           hardware multicast filters :-( As a result, a host on a multicast
2123           network would acquire a lot of useless route cache entries, e.g. for
2124           SDR messages from all over the world. Now we try to get rid of them.
2125           Really, provided the software IP multicast filter is organized
2126           reasonably (at least, hashed), this does not result in a slowdown
2127           compared with route cache reject entries.
2128           Note that multicast routers are not affected, because a
2129           route cache entry is created eventually.
2130         */
2131        if (ipv4_is_multicast(daddr)) {
2132                struct in_device *in_dev = __in_dev_get_rcu(dev);
2133                int our = 0;
2134                int err = -EINVAL;
2135
2136                if (in_dev)
2137                        our = ip_check_mc_rcu(in_dev, daddr, saddr,
2138                                              ip_hdr(skb)->protocol);
2139
2140                /* check l3 master if no match yet */
2141                if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2142                        struct in_device *l3_in_dev;
2143
2144                        l3_in_dev = __in_dev_get_rcu(skb->dev);
2145                        if (l3_in_dev)
2146                                our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2147                                                      ip_hdr(skb)->protocol);
2148                }
2149
2150                if (our
2151#ifdef CONFIG_IP_MROUTE
2152                        ||
2153                    (!ipv4_is_local_multicast(daddr) &&
2154                     IN_DEV_MFORWARD(in_dev))
2155#endif
2156                   ) {
2157                        err = ip_route_input_mc(skb, daddr, saddr,
2158                                                tos, dev, our);
2159                }
2160                return err;
2161        }
2162
2163        return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2164}
2165
2166/* called with rcu_read_lock() */
2167static struct rtable *__mkroute_output(const struct fib_result *res,
2168                                       const struct flowi4 *fl4, int orig_oif,
2169                                       struct net_device *dev_out,
2170                                       unsigned int flags)
2171{
2172        struct fib_info *fi = res->fi;
2173        struct fib_nh_exception *fnhe;
2174        struct in_device *in_dev;
2175        u16 type = res->type;
2176        struct rtable *rth;
2177        bool do_cache;
2178
2179        in_dev = __in_dev_get_rcu(dev_out);
2180        if (!in_dev)
2181                return ERR_PTR(-EINVAL);
2182
2183        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2184                if (ipv4_is_loopback(fl4->saddr) &&
2185                    !(dev_out->flags & IFF_LOOPBACK) &&
2186                    !netif_is_l3_master(dev_out))
2187                        return ERR_PTR(-EINVAL);
2188
2189        if (ipv4_is_lbcast(fl4->daddr))
2190                type = RTN_BROADCAST;
2191        else if (ipv4_is_multicast(fl4->daddr))
2192                type = RTN_MULTICAST;
2193        else if (ipv4_is_zeronet(fl4->daddr))
2194                return ERR_PTR(-EINVAL);
2195
2196        if (dev_out->flags & IFF_LOOPBACK)
2197                flags |= RTCF_LOCAL;
2198
2199        do_cache = true;
2200        if (type == RTN_BROADCAST) {
2201                flags |= RTCF_BROADCAST | RTCF_LOCAL;
2202                fi = NULL;
2203        } else if (type == RTN_MULTICAST) {
2204                flags |= RTCF_MULTICAST | RTCF_LOCAL;
2205                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2206                                     fl4->flowi4_proto))
2207                        flags &= ~RTCF_LOCAL;
2208                else
2209                        do_cache = false;
2210                /* If a multicast route does not exist, use
2211                 * the default one, but do not gateway in this case.
2212                 * Yes, it is a hack.
2213                 */
2214                if (fi && res->prefixlen < 4)
2215                        fi = NULL;
2216        } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2217                   (orig_oif != dev_out->ifindex)) {
2218                /* For local routes that require a particular output interface
2219                 * we do not want to cache the result.  Caching the result
2220                 * causes incorrect behaviour when there are multiple source
2221                 * addresses on the interface: the end result is that if the
2222                 * intended recipient is waiting on that interface for the
2223                 * packet, it won't be received, because it will be delivered on
2224                 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2225                 * be set to the loopback interface as well.
2226                 */
2227                fi = NULL;
2228        }
2229
2230        fnhe = NULL;
2231        do_cache &= fi != NULL;
2232        if (do_cache) {
2233                struct rtable __rcu **prth;
2234                struct fib_nh *nh = &FIB_RES_NH(*res);
2235
2236                fnhe = find_exception(nh, fl4->daddr);
2237                if (fnhe) {
2238                        prth = &fnhe->fnhe_rth_output;
2239                        rth = rcu_dereference(*prth);
2240                        if (rth && rth->dst.expires &&
2241                            time_after(jiffies, rth->dst.expires)) {
2242                                ip_del_fnhe(nh, fl4->daddr);
2243                                fnhe = NULL;
2244                        } else {
2245                                goto rt_cache;
2246                        }
2247                }
2248
2249                if (unlikely(fl4->flowi4_flags &
2250                             FLOWI_FLAG_KNOWN_NH &&
2251                             !(nh->nh_gw &&
2252                               nh->nh_scope == RT_SCOPE_LINK))) {
2253                        do_cache = false;
2254                        goto add;
2255                }
2256                prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2257                rth = rcu_dereference(*prth);
2258
2259rt_cache:
2260                if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2261                        return rth;
2262        }
2263
2264add:
2265        rth = rt_dst_alloc(dev_out, flags, type,
2266                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
2267                           IN_DEV_CONF_GET(in_dev, NOXFRM),
2268                           do_cache);
2269        if (!rth)
2270                return ERR_PTR(-ENOBUFS);
2271
2272        rth->rt_iif = orig_oif;
2273        if (res->table)
2274                rth->rt_table_id = res->table->tb_id;
2275
2276        RT_CACHE_STAT_INC(out_slow_tot);
2277
2278        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2279                if (flags & RTCF_LOCAL &&
2280                    !(dev_out->flags & IFF_LOOPBACK)) {
2281                        rth->dst.output = ip_mc_output;
2282                        RT_CACHE_STAT_INC(out_slow_mc);
2283                }
2284#ifdef CONFIG_IP_MROUTE
2285                if (type == RTN_MULTICAST) {
2286                        if (IN_DEV_MFORWARD(in_dev) &&
2287                            !ipv4_is_local_multicast(fl4->daddr)) {
2288                                rth->dst.input = ip_mr_input;
2289                                rth->dst.output = ip_mc_output;
2290                        }
2291                }
2292#endif
2293        }
2294
2295        rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2296        set_lwt_redirect(rth);
2297
2298        return rth;
2299}
2300
2301/*
2302 * Major route resolver routine.
2303 */
2304
2305struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2306                                        const struct sk_buff *skb)
2307{
2308        __u8 tos = RT_FL_TOS(fl4);
2309        struct fib_result res;
2310        struct rtable *rth;
2311
2312        res.tclassid    = 0;
2313        res.fi          = NULL;
2314        res.table       = NULL;
2315
2316        fl4->flowi4_iif = LOOPBACK_IFINDEX;
2317        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2318        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2319                         RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2320
2321        rcu_read_lock();
2322        rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2323        rcu_read_unlock();
2324
2325        return rth;
2326}
2327EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2328
2329struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2330                                            struct fib_result *res,
2331                                            const struct sk_buff *skb)
2332{
2333        struct net_device *dev_out = NULL;
2334        int orig_oif = fl4->flowi4_oif;
2335        unsigned int flags = 0;
2336        struct rtable *rth;
2337        int err = -ENETUNREACH;
2338
2339        if (fl4->saddr) {
2340                rth = ERR_PTR(-EINVAL);
2341                if (ipv4_is_multicast(fl4->saddr) ||
2342                    ipv4_is_lbcast(fl4->saddr) ||
2343                    ipv4_is_zeronet(fl4->saddr))
2344                        goto out;
2345
2346                /* I removed the check for oif == dev_out->oif here.
2347                   It was wrong for two reasons:
2348                   1. ip_dev_find(net, saddr) can return the wrong iface if
2349                      saddr is assigned to multiple interfaces.
2350                   2. Moreover, we are allowed to send packets with the saddr
2351                      of another iface. --ANK
2352                 */
2353
2354                if (fl4->flowi4_oif == 0 &&
2355                    (ipv4_is_multicast(fl4->daddr) ||
2356                     ipv4_is_lbcast(fl4->daddr))) {
2357                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2358                        dev_out = __ip_dev_find(net, fl4->saddr, false);
2359                        if (!dev_out)
2360                                goto out;
2361
2362                        /* Special hack: the user can direct multicasts
2363                           and limited broadcasts via the necessary interface
2364                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2365                           This hack is not just for fun, it allows
2366                           vic, vat and friends to work.
2367                           They bind a socket to the loopback, set the ttl
2368                           to zero and expect that it will work.
2369                           From the viewpoint of the routing cache they are
2370                           broken: we are not allowed to build a multicast
2371                           path with a loopback source addr (look, the routing
2372                           cache cannot know that the ttl is zero, so the packet
2373                           will not leave this host and the route is valid).
2374                           Luckily, this hack is a good workaround.
2375                         */
2376
2377                        fl4->flowi4_oif = dev_out->ifindex;
2378                        goto make_route;
2379                }
2380
2381                if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2382                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2383                        if (!__ip_dev_find(net, fl4->saddr, false))
2384                                goto out;
2385                }
2386        }
2387
2388
2389        if (fl4->flowi4_oif) {
2390                dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2391                rth = ERR_PTR(-ENODEV);
2392                if (!dev_out)
2393                        goto out;
2394
2395                /* RACE: Check return value of inet_select_addr instead. */
2396                if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2397                        rth = ERR_PTR(-ENETUNREACH);
2398                        goto out;
2399                }
2400                if (ipv4_is_local_multicast(fl4->daddr) ||
2401                    ipv4_is_lbcast(fl4->daddr) ||
2402                    fl4->flowi4_proto == IPPROTO_IGMP) {
2403                        if (!fl4->saddr)
2404                                fl4->saddr = inet_select_addr(dev_out, 0,
2405                                                              RT_SCOPE_LINK);
2406                        goto make_route;
2407                }
2408                if (!fl4->saddr) {
2409                        if (ipv4_is_multicast(fl4->daddr))
2410                                fl4->saddr = inet_select_addr(dev_out, 0,
2411                                                              fl4->flowi4_scope);
2412                        else if (!fl4->daddr)
2413                                fl4->saddr = inet_select_addr(dev_out, 0,
2414                                                              RT_SCOPE_HOST);
2415                }
2416        }
2417
2418        if (!fl4->daddr) {
2419                fl4->daddr = fl4->saddr;
2420                if (!fl4->daddr)
2421                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2422                dev_out = net->loopback_dev;
2423                fl4->flowi4_oif = LOOPBACK_IFINDEX;
2424                res->type = RTN_LOCAL;
2425                flags |= RTCF_LOCAL;
2426                goto make_route;
2427        }
2428
2429        err = fib_lookup(net, fl4, res, 0);
2430        if (err) {
2431                res->fi = NULL;
2432                res->table = NULL;
2433                if (fl4->flowi4_oif &&
2434                    (ipv4_is_multicast(fl4->daddr) ||
2435                    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2436                        /* Apparently, the routing tables are wrong. Assume
2437                           that the destination is on-link.
2438
2439                           WHY? DW.
2440                           Because we are allowed to send to an iface
2441                           even if it has NO routes and NO assigned
2442                           addresses. When oif is specified, the routing
2443                           tables are looked up with only one purpose:
2444                           to check whether the destination is gatewayed
2445                           rather than direct. Moreover, if MSG_DONTROUTE is
2446                           set, we send the packet, ignoring both the routing
2447                           tables and the ifaddr state. --ANK
2448
2449
2450                           We could do this even if oif is unknown,
2451                           likely as IPv6 does, but we do not.
2452                         */
2453
2454                        if (fl4->saddr == 0)
2455                                fl4->saddr = inet_select_addr(dev_out, 0,
2456                                                              RT_SCOPE_LINK);
2457                        res->type = RTN_UNICAST;
2458                        goto make_route;
2459                }
2460                rth = ERR_PTR(err);
2461                goto out;
2462        }
2463
2464        if (res->type == RTN_LOCAL) {
2465                if (!fl4->saddr) {
2466                        if (res->fi->fib_prefsrc)
2467                                fl4->saddr = res->fi->fib_prefsrc;
2468                        else
2469                                fl4->saddr = fl4->daddr;
2470                }
2471
2472                /* L3 master device is the loopback for that domain */
2473                dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2474                        net->loopback_dev;
2475
2476                /* make sure orig_oif points to fib result device even
2477                 * though packet rx/tx happens over loopback or l3mdev
2478                 */
2479                orig_oif = FIB_RES_OIF(*res);
2480
2481                fl4->flowi4_oif = dev_out->ifindex;
2482                flags |= RTCF_LOCAL;
2483                goto make_route;
2484        }
2485
2486        fib_select_path(net, res, fl4, skb);
2487
2488        dev_out = FIB_RES_DEV(*res);
2489        fl4->flowi4_oif = dev_out->ifindex;
2490
2491
2492make_route:
2493        rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2494
2495out:
2496        return rth;
2497}
2498
2499static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2500{
2501        return NULL;
2502}
2503
2504static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2505{
2506        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2507
2508        return mtu ? : dst->dev->mtu;
2509}
2510
2511static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2512                                          struct sk_buff *skb, u32 mtu)
2513{
2514}
2515
2516static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2517                                       struct sk_buff *skb)
2518{
2519}
2520
2521static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2522                                          unsigned long old)
2523{
2524        return NULL;
2525}
2526
2527static struct dst_ops ipv4_dst_blackhole_ops = {
2528        .family                 =       AF_INET,
2529        .check                  =       ipv4_blackhole_dst_check,
2530        .mtu                    =       ipv4_blackhole_mtu,
2531        .default_advmss         =       ipv4_default_advmss,
2532        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2533        .redirect               =       ipv4_rt_blackhole_redirect,
2534        .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2535        .neigh_lookup           =       ipv4_neigh_lookup,
2536};
2537
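/* Clone @dst_orig into a "blackhole" route: every dst_ops hook above is a
 * no-op and both dst.input and dst.output simply discard packets, while the
 * flow-relevant fields of the original route are preserved.  Used e.g. as
 * the xfrm blackhole_route callback, when a route object is needed but
 * nothing may actually be transmitted.
 */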
2538struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2539{
2540        struct rtable *ort = (struct rtable *) dst_orig;
2541        struct rtable *rt;
2542
2543        rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2544        if (rt) {
2545                struct dst_entry *new = &rt->dst;
2546
2547                new->__use = 1;
2548                new->input = dst_discard;
2549                new->output = dst_discard_out;
2550
2551                new->dev = net->loopback_dev;
2552                if (new->dev)
2553                        dev_hold(new->dev);
2554
2555                rt->rt_is_input = ort->rt_is_input;
2556                rt->rt_iif = ort->rt_iif;
2557                rt->rt_pmtu = ort->rt_pmtu;
2558                rt->rt_mtu_locked = ort->rt_mtu_locked;
2559
2560                rt->rt_genid = rt_genid_ipv4(net);
2561                rt->rt_flags = ort->rt_flags;
2562                rt->rt_type = ort->rt_type;
2563                rt->rt_gateway = ort->rt_gateway;
2564                rt->rt_uses_gateway = ort->rt_uses_gateway;
2565
2566                INIT_LIST_HEAD(&rt->rt_uncached);
2567        }
2568
2569        dst_release(dst_orig);
2570
2571        return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2572}
2573
2574struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2575                                    const struct sock *sk)
2576{
2577        struct rtable *rt = __ip_route_output_key(net, flp4);
2578
2579        if (IS_ERR(rt))
2580                return rt;
2581
2582        if (flp4->flowi4_proto)
2583                rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2584                                                        flowi4_to_flowi(flp4),
2585                                                        sk, 0);
2586
2587        return rt;
2588}
2589EXPORT_SYMBOL_GPL(ip_route_output_flow);
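
/* A hypothetical usage sketch (all values below are made up): resolve an
 * output route for a kernel-originated UDP flow, then release it:
 *
 *        struct flowi4 fl4;
 *        struct rtable *rt;
 *
 *        memset(&fl4, 0, sizeof(fl4));
 *        fl4.daddr = htonl(0xc0a80001);        // 192.168.0.1, made-up address
 *        fl4.flowi4_proto = IPPROTO_UDP;
 *
 *        rt = ip_route_output_flow(net, &fl4, sk);
 *        if (IS_ERR(rt))
 *                return PTR_ERR(rt);
 *        ...
 *        ip_rt_put(rt);
 */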
2590
2591/* called with rcu_read_lock held */
2592static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2593                        struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2594                        u32 seq)
2595{
2596        struct rtable *rt = skb_rtable(skb);
2597        struct rtmsg *r;
2598        struct nlmsghdr *nlh;
2599        unsigned long expires = 0;
2600        u32 error;
2601        u32 metrics[RTAX_MAX];
2602
2603        nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2604        if (!nlh)
2605                return -EMSGSIZE;
2606
2607        r = nlmsg_data(nlh);
2608        r->rtm_family    = AF_INET;
2609        r->rtm_dst_len  = 32;
2610        r->rtm_src_len  = 0;
2611        r->rtm_tos      = fl4->flowi4_tos;
2612        r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2613        if (nla_put_u32(skb, RTA_TABLE, table_id))
2614                goto nla_put_failure;
2615        r->rtm_type     = rt->rt_type;
2616        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2617        r->rtm_protocol = RTPROT_UNSPEC;
2618        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2619        if (rt->rt_flags & RTCF_NOTIFY)
2620                r->rtm_flags |= RTM_F_NOTIFY;
2621        if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2622                r->rtm_flags |= RTCF_DOREDIRECT;
2623
2624        if (nla_put_in_addr(skb, RTA_DST, dst))
2625                goto nla_put_failure;
2626        if (src) {
2627                r->rtm_src_len = 32;
2628                if (nla_put_in_addr(skb, RTA_SRC, src))
2629                        goto nla_put_failure;
2630        }
2631        if (rt->dst.dev &&
2632            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2633                goto nla_put_failure;
2634#ifdef CONFIG_IP_ROUTE_CLASSID
2635        if (rt->dst.tclassid &&
2636            nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2637                goto nla_put_failure;
2638#endif
2639        if (!rt_is_input_route(rt) &&
2640            fl4->saddr != src) {
2641                if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2642                        goto nla_put_failure;
2643        }
2644        if (rt->rt_uses_gateway &&
2645            nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2646                goto nla_put_failure;
2647
2648        expires = rt->dst.expires;
2649        if (expires) {
2650                unsigned long now = jiffies;
2651
2652                if (time_before(now, expires))
2653                        expires -= now;
2654                else
2655                        expires = 0;
2656        }
2657
2658        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2659        if (rt->rt_pmtu && expires)
2660                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2661        if (rt->rt_mtu_locked && expires)
2662                metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2663        if (rtnetlink_put_metrics(skb, metrics) < 0)
2664                goto nla_put_failure;
2665
2666        if (fl4->flowi4_mark &&
2667            nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2668                goto nla_put_failure;
2669
2670        if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2671            nla_put_u32(skb, RTA_UID,
2672                        from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2673                goto nla_put_failure;
2674
2675        error = rt->dst.error;
2676
2677        if (rt_is_input_route(rt)) {
2678#ifdef CONFIG_IP_MROUTE
2679                if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2680                    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2681                        int err = ipmr_get_route(net, skb,
2682                                                 fl4->saddr, fl4->daddr,
2683                                                 r, portid);
2684
2685                        if (err <= 0) {
2686                                if (err == 0)
2687                                        return 0;
2688                                goto nla_put_failure;
2689                        }
2690                } else
2691#endif
2692                        if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2693                                goto nla_put_failure;
2694        }
2695
2696        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2697                goto nla_put_failure;
2698
2699        nlmsg_end(skb, nlh);
2700        return 0;
2701
2702nla_put_failure:
2703        nlmsg_cancel(skb, nlh);
2704        return -EMSGSIZE;
2705}
2706
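/* RTM_GETROUTE handler, exercised e.g. by "ip route get <addr>".  With
 * RTM_F_FIB_MATCH set, the reply describes the matching FIB entry
 * (fib_dump_info) rather than the resolved route (rt_fill_info).
 */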
2707static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2708                             struct netlink_ext_ack *extack)
2709{
2710        struct net *net = sock_net(in_skb->sk);
2711        struct rtmsg *rtm;
2712        struct nlattr *tb[RTA_MAX+1];
2713        struct fib_result res = {};
2714        struct rtable *rt = NULL;
2715        struct flowi4 fl4;
2716        __be32 dst = 0;
2717        __be32 src = 0;
2718        u32 iif;
2719        int err;
2720        int mark;
2721        struct sk_buff *skb;
2722        u32 table_id = RT_TABLE_MAIN;
2723        kuid_t uid;
2724
2725        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2726                          extack);
2727        if (err < 0)
2728                goto errout;
2729
2730        rtm = nlmsg_data(nlh);
2731
2732        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2733        if (!skb) {
2734                err = -ENOBUFS;
2735                goto errout;
2736        }
2737
2738        /* Reserve room for dummy headers; this skb can pass
2739           through a good chunk of the routing engine.
2740         */
2741        skb_reset_mac_header(skb);
2742        skb_reset_network_header(skb);
2743
2744        src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2745        dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2746        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2747        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2748        if (tb[RTA_UID])
2749                uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2750        else
2751                uid = (iif ? INVALID_UID : current_uid());
2752
2753        /* Bugfix: need to give ip_route_input enough of an IP header to
2754         * not gag.
2755         */
2756        ip_hdr(skb)->protocol = IPPROTO_UDP;
2757        ip_hdr(skb)->saddr = src;
2758        ip_hdr(skb)->daddr = dst;
2759
2760        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2761
2762        memset(&fl4, 0, sizeof(fl4));
2763        fl4.daddr = dst;
2764        fl4.saddr = src;
2765        fl4.flowi4_tos = rtm->rtm_tos;
2766        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2767        fl4.flowi4_mark = mark;
2768        fl4.flowi4_uid = uid;
2769
2770        rcu_read_lock();
2771
2772        if (iif) {
2773                struct net_device *dev;
2774
2775                dev = dev_get_by_index_rcu(net, iif);
2776                if (!dev) {
2777                        err = -ENODEV;
2778                        goto errout_free;
2779                }
2780
2781                skb->protocol   = htons(ETH_P_IP);
2782                skb->dev        = dev;
2783                skb->mark       = mark;
2784                err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2785                                         dev, &res);
2786
2787                rt = skb_rtable(skb);
2788                if (err == 0 && rt->dst.error)
2789                        err = -rt->dst.error;
2790        } else {
2791                fl4.flowi4_iif = LOOPBACK_IFINDEX;
2792                rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2793                err = 0;
2794                if (IS_ERR(rt))
2795                        err = PTR_ERR(rt);
2796                else
2797                        skb_dst_set(skb, &rt->dst);
2798        }
2799
2800        if (err)
2801                goto errout_free;
2802
2803        if (rtm->rtm_flags & RTM_F_NOTIFY)
2804                rt->rt_flags |= RTCF_NOTIFY;
2805
2806        if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2807                table_id = rt->rt_table_id;
2808
2809        if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2810                if (!res.fi) {
2811                        err = fib_props[res.type].error;
2812                        if (!err)
2813                                err = -EHOSTUNREACH;
2814                        goto errout_free;
2815                }
2816                err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2817                                    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2818                                    rt->rt_type, res.prefix, res.prefixlen,
2819                                    fl4.flowi4_tos, res.fi, 0);
2820        } else {
2821                err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2822                                   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2823        }
2824        if (err < 0)
2825                goto errout_free;
2826
2827        rcu_read_unlock();
2828
2829        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2830errout:
2831        return err;
2832
2833errout_free:
2834        rcu_read_unlock();
2835        kfree_skb(skb);
2836        goto errout;
2837}
2838
2839void ip_rt_multicast_event(struct in_device *in_dev)
2840{
2841        rt_cache_flush(dev_net(in_dev->dev));
2842}
2843
2844#ifdef CONFIG_SYSCTL
2845static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2846static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2847static int ip_rt_gc_elasticity __read_mostly    = 8;
2848static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
2849
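/* Writing any value to /proc/sys/net/ipv4/route/flush, e.g.
 *
 *        echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * invalidates the cache for this netns: rt_cache_flush() bumps the ipv4
 * rt_genid, so rt_cache_valid() fails for every existing route, and
 * fnhe_genid_bump() likewise invalidates cached nexthop exceptions.
 */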
2850static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2851                                        void __user *buffer,
2852                                        size_t *lenp, loff_t *ppos)
2853{
2854        struct net *net = (struct net *)__ctl->extra1;
2855
2856        if (write) {
2857                rt_cache_flush(net);
2858                fnhe_genid_bump(net);
2859                return 0;
2860        }
2861
2862        return -EINVAL;
2863}
2864
2865static struct ctl_table ipv4_route_table[] = {
2866        {
2867                .procname       = "gc_thresh",
2868                .data           = &ipv4_dst_ops.gc_thresh,
2869                .maxlen         = sizeof(int),
2870                .mode           = 0644,
2871                .proc_handler   = proc_dointvec,
2872        },
2873        {
2874                .procname       = "max_size",
2875                .data           = &ip_rt_max_size,
2876                .maxlen         = sizeof(int),
2877                .mode           = 0644,
2878                .proc_handler   = proc_dointvec,
2879        },
2880        {
2881                /* Deprecated. Use gc_min_interval_ms */
2882
2883                .procname       = "gc_min_interval",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "gc_min_interval_ms",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_ms_jiffies,
        },
        {
                .procname       = "gc_timeout",
                .data           = &ip_rt_gc_timeout,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "gc_interval",
                .data           = &ip_rt_gc_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "redirect_load",
                .data           = &ip_rt_redirect_load,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "redirect_number",
                .data           = &ip_rt_redirect_number,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "redirect_silence",
                .data           = &ip_rt_redirect_silence,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "error_cost",
                .data           = &ip_rt_error_cost,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "error_burst",
                .data           = &ip_rt_error_burst,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "gc_elasticity",
                .data           = &ip_rt_gc_elasticity,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "mtu_expires",
                .data           = &ip_rt_mtu_expires,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "min_pmtu",
                .data           = &ip_rt_min_pmtu,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &ip_min_valid_pmtu,
        },
        {
                .procname       = "min_adv_mss",
                .data           = &ip_rt_min_advmss,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        { }
};
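
/*
 * Writing any integer to this node invalidates all cached routes in the
 * owning namespace: ipv4_sysctl_rtcache_flush() (defined earlier in this
 * file) takes the struct net from ->extra1 and calls rt_cache_flush(),
 * which bumps the per-netns route generation count.  The node is
 * write-only (mode 0200) and carries no ->data.
 *
 * Typical use from userspace, e.g. after reconfiguring addressing:
 *
 *      echo 1 > /proc/sys/net/ipv4/route/flush
 */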
static struct ctl_table ipv4_route_flush_table[] = {
        {
                .procname       = "flush",
                .maxlen         = sizeof(int),
                .mode           = 0200,
                .proc_handler   = ipv4_sysctl_rtcache_flush,
        },
        { },
};
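
/*
 * Per-namespace sysctl setup.  init_net registers the static template
 * directly; every other namespace gets its own kmemdup()'d copy so that
 * ->extra1 can point at that namespace.  If the netns belongs to an
 * unprivileged user namespace, the "flush" entry is hidden by clearing
 * its procname, which also terminates this single-entry table.
 */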
static __net_init int sysctl_route_net_init(struct net *net)
{
        struct ctl_table *tbl;

        tbl = ipv4_route_flush_table;
        if (!net_eq(net, &init_net)) {
                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
                if (!tbl)
                        goto err_dup;

                /* Don't export sysctls to unprivileged users */
                if (net->user_ns != &init_user_ns)
                        tbl[0].procname = NULL;
        }
        tbl[0].extra1 = net;

        net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
        if (!net->ipv4.route_hdr)
                goto err_reg;
        return 0;

err_reg:
        if (tbl != ipv4_route_flush_table)
                kfree(tbl);
err_dup:
        return -ENOMEM;
}
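
/*
 * Teardown mirrors the init path.  Since this subsystem is registered
 * once at boot and never unregistered, ->exit should only run for dying
 * non-init namespaces, whose table is always a private copy; the BUG_ON()
 * catches any attempt to kfree() the static template.
 */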
static __net_exit void sysctl_route_net_exit(struct net *net)
{
        struct ctl_table *tbl;

        tbl = net->ipv4.route_hdr->ctl_table_arg;
        unregister_net_sysctl_table(net->ipv4.route_hdr);
        BUG_ON(tbl == ipv4_route_flush_table);
        kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
        .init = sysctl_route_net_init,
        .exit = sysctl_route_net_exit,
};
#endif
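
/*
 * Cached dsts record the generation numbers that were current when they
 * were created; rt_is_expired() compares against these counters, so
 * bumping rt_genid (or fnhe_genid for next-hop exceptions) invalidates
 * cached entries in O(1) without walking them.  dev_addr_genid is seeded
 * with a random value so it is unpredictable across boots.
 */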
static __net_init int rt_genid_init(struct net *net)
{
        atomic_set(&net->ipv4.rt_genid, 0);
        atomic_set(&net->fnhe_genid, 0);
        atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
        return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
        .init = rt_genid_init,
};
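
/*
 * Each namespace owns an inetpeer tree, keyed by remote IP address,
 * holding long-lived per-peer state (e.g. ICMP rate limiting).  Only the
 * base is allocated here; peer entries are created on demand.
 */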
static int __net_init ipv4_inetpeer_init(struct net *net)
{
        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

        if (!bp)
                return -ENOMEM;
        inet_peer_base_init(bp);
        net->ipv4.peers = bp;
        return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
        struct inet_peer_base *bp = net->ipv4.peers;

        net->ipv4.peers = NULL;
        inetpeer_invalidate_tree(bp);
        kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
        .init   =       ipv4_inetpeer_init,
        .exit   =       ipv4_inetpeer_exit,
};
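
/*
 * Per-cpu accounting for routing realms, exported via /proc/net/rt_acct.
 * One slot per possible 8-bit realm value, hence the fixed size of 256
 * in the allocation below.
 */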
#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
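
/*
 * Boot-time initialization for the IPv4 routing layer, called from
 * ip_init() during inet setup.  Allocation failures here are fatal
 * (panic), since IPv4 cannot operate without these structures.
 */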
int __init ip_rt_init(void)
{
        int cpu;
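
        /*
         * ip_idents/ip_tstamps back the IP ID generator (see
         * ip_idents_reserve() earlier in this file).  The identifier
         * array is filled with random bytes so that initial IDs are
         * unpredictable.
         */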
        ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
        if (!ip_idents)
                panic("IP: failed to allocate ip_idents\n");

        prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

        ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
        if (!ip_tstamps)
                panic("IP: failed to allocate ip_tstamps\n");
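
        /*
         * Uncached rtables are tracked on these per-cpu lists (see
         * rt_add_uncached_list()/rt_flush_dev() earlier in this file) so
         * their device pointers can be retargeted when a netdevice is
         * unregistered.
         */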
        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
                spin_lock_init(&ul->lock);
        }
#ifdef CONFIG_IP_ROUTE_CLASSID
        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
#endif

        ipv4_dst_ops.kmem_cachep =
                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

        if (dst_entries_init(&ipv4_dst_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_ops counter\n");

        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
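
        /*
         * With the old routing cache gone, dst garbage collection is
         * effectively disabled: the threshold is unreachable and the
         * size limit is INT_MAX.
         */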
        ipv4_dst_ops.gc_thresh = ~0;
        ip_rt_max_size = INT_MAX;

        devinet_init();
        ip_fib_init();

        if (ip_rt_proc_init())
                pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init();
#endif
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
                      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
        register_pernet_subsys(&sysctl_route_ops);
#endif
        register_pernet_subsys(&rt_genid_ops);
        register_pernet_subsys(&ipv4_inetpeer_ops);
        return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif