linux/net/ipv4/route.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly    = 8;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             cpu_to_be16(ETH_P_IP),
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
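
/* For reference: the table above is indexed by the four RFC 1349 TOS bits
 * shifted right by one, e.g. via the rt_tos2priority() helper in
 * <net/route.h>:
 *
 *      static inline char rt_tos2priority(u8 tos)
 *      {
 *              return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *      }
 *
 * As a quick sanity check: "low delay" TOS 0x10 yields index 8, i.e.
 * TC_PRIO_INTERACTIVE. The ECN_OR_COST() entries fill the odd indices
 * selected when the low-order "cost" (now ECN) bit is set.
 */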

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
                        &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chance to
 * select an ID that is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static DEFINE_SPINLOCK(ip_fb_id_lock);
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct net *net = dev_net(dst->dev);
        struct inet_peer *peer;

        peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
        if (peer) {
                iph->id = htons(inet_getid(peer, more));
                inet_putpeer(peer);
                return;
        }

        ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
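
/* A sketch of the intended calling convention (cf. ip_select_ident() in
 * <net/ip.h> of this era): atomic datagrams with DF set can use a cheap
 * per-socket counter or zero, while anything that may be fragmented
 * should go through the inetpeer-backed generator above, roughly:
 *
 *      if (iph->frag_off & htons(IP_DF))
 *              iph->id = ...;                  // per-socket counter
 *      else
 *              __ip_select_ident(iph, dst, 0);
 *
 * The "more" argument reserves that many additional IDs beyond the one
 * assigned, for callers that build several packets back to back.
 */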

static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;
        struct rtable *orig;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        orig = rcu_dereference(oldest->fnhe_rth);
        if (orig) {
                RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
                rt_free(orig);
        }
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        u32 hval;

        hval = (__force u32) daddr;
        hval ^= (hval >> 11) ^ (hval >> 22);

        return hval & (FNHE_HASH_SIZE - 1);
}
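
/* A minimal sketch of the fold above, assuming FNHE_HASH_SIZE is a
 * power of two: the two shifted XORs mix the high bits of the
 * destination into the low bits before masking, so addresses that
 * differ only in their network part still spread across buckets:
 *
 *      hval   = daddr ^ (daddr >> 11) ^ (daddr >> 22);
 *      bucket = hval & (FNHE_HASH_SIZE - 1);
 */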

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = nh->nh_exceptions;
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                nh->nh_exceptions = hash;
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = expires;
                }
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
        return;
}
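
/* Both consumers of the exception cache funnel through the function
 * above: __ip_do_redirect() records a learned gateway (gw != 0, no
 * expiry) and __ip_rt_update_pmtu() records a learned path MTU with a
 * jiffies deadline. An existing entry for daddr is updated in place;
 * once a chain grows past FNHE_RECLAIM_DEPTH, the stalest entry in the
 * bucket is recycled instead of extending the chain.
 */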

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (n) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                      0, 0);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;

        rt = (struct rtable *) dst;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them altogether,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
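
/* A worked example of the backoff, assuming HZ = 1000:
 * ip_rt_redirect_load is HZ/50 = 20 ms, and a redirect is sent only
 * once (20 ms << rate_tokens) has passed since the previous one, i.e.
 * gaps of 20 ms, 40 ms, ... up to 20 ms << 9 ~ 10.2 s. After
 * ip_rt_redirect_number (9) ignored redirects we stop entirely;
 * ip_rt_redirect_silence ((HZ/50) << 10 ~ 20.5 s) without offending
 * packets resets rate_tokens and restarts the sequence.
 */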

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &rt->rt_gateway);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}
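
/* The peer->rate_tokens arithmetic above is a token bucket counted in
 * jiffies: tokens accrue with elapsed time, are capped at
 * ip_rt_error_burst (5 * HZ), and each ICMP error sent spends
 * ip_rt_error_cost (HZ). With HZ = 1000 that permits a burst of five
 * back-to-back errors per source, then at most one per second.
 */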

static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct fib_result res;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        rcu_read_lock();
        if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
        return mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        mtu = __ip_rt_update_pmtu(rt, &fl4, mtu);

        if (!rt->rt_pmtu) {
                dst->obsolete = DST_OBSOLETE_KILL;
        } else {
                rt->rt_pmtu = mtu;
                rt->dst.expires = max(1UL, jiffies + ip_rt_mtu_expires);
        }
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a
         * route, this is indicated by setting obsolete to
         * DST_OBSOLETE_KILL.
         */
        if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is not necessarily aligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}
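
/* A quick check of the default above: a 1500-byte MTU device yields an
 * advertised MSS of 1500 - 40 = 1460 (room for minimal IPv4 + TCP
 * headers), clamped between ip_rt_min_advmss (256) and 65535 - 40.
 */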

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (mtu && time_after_eq(jiffies, rt->dst.expires))
                mtu = 0;

        if (!mtu)
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu && rt_is_output_route(rt))
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_gateway && mtu > 576)
                        mtu = 576;
        }

        if (mtu > IP_MAX_MTU)
                mtu = IP_MAX_MTU;

        return mtu;
}
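
/* Precedence sketch for ipv4_mtu(): an unexpired learned PMTU wins,
 * then an explicit RTAX_MTU metric (honoured as-is only on output
 * routes), then the device MTU, which on a locked-MTU route via a
 * gateway is capped at the classic 576-byte minimum; everything is
 * bounded by IP_MAX_MTU.
 */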

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = nh->nh_exceptions;
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable *orig;

                if (fnhe->fnhe_pmtu) {
                        unsigned long expires = fnhe->fnhe_expires;
                        unsigned long diff = expires - jiffies;

                        if (time_before(jiffies, expires)) {
                                rt->rt_pmtu = fnhe->fnhe_pmtu;
                                dst_set_expires(&rt->dst, diff);
                        }
                }
                if (fnhe->fnhe_gw) {
                        rt->rt_flags |= RTCF_REDIRECTED;
                        rt->rt_gateway = fnhe->fnhe_gw;
                }

                orig = rcu_dereference(fnhe->fnhe_rth);
                rcu_assign_pointer(fnhe->fnhe_rth, rt);
                if (orig)
                        rt_free(orig);

                fnhe->fnhe_stamp = jiffies;
                ret = true;
        } else {
                /* Routes we intend to cache in nexthop exception have
                 * the DST_NOCACHE bit clear.  However, if we are
                 * unsuccessful at storing this route into the cache
                 * we really need to set it.
                 */
                rt->dst.flags |= DST_NOCACHE;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                if (!nh->nh_pcpu_rth_output)
                        goto nocache;
                p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else {
                /* Routes we intend to cache in the FIB nexthop have
                 * the DST_NOCACHE bit clear.  However, if we are
                 * unsuccessful at storing this route into the cache
                 * we really need to set it.
                 */
nocache:
                rt->dst.flags |= DST_NOCACHE;
                ret = false;
        }

        return ret;
}
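
/* rt_cache_route() publishes the new route with a single cmpxchg(): if
 * another CPU won the race and changed *p since we loaded it, our store
 * is abandoned and the route is simply demoted to DST_NOCACHE rather
 * than retried, keeping the slot update lock-free:
 *
 *      orig = *p;
 *      prev = cmpxchg(p, orig, rt);
 *      if (prev == orig)       // we won; release the displaced route
 *              rt_free(orig);
 */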

static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        spin_lock_bh(&rt_uncached_lock);
        list_add_tail(&rt->rt_uncached, &rt_uncached_list);
        spin_unlock_bh(&rt_uncached_lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (!list_empty(&rt->rt_uncached)) {
                spin_lock_bh(&rt_uncached_lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&rt_uncached_lock);
        }
}

void rt_flush_dev(struct net_device *dev)
{
        if (!list_empty(&rt_uncached_list)) {
                struct net *net = dev_net(dev);
                struct rtable *rt;

                spin_lock_bh(&rt_uncached_lock);
                list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&rt_uncached_lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
                        rt->rt_gateway = nh->nh_gw;
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
        }
        if (unlikely(!cached))
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
                                   bool nopolicy, bool noxfrm, bool will_cache)
{
        return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                         (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
                         (nopolicy ? DST_NOPOLICY : 0) |
                         (noxfrm ? DST_NOXFRM : 0));
}
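
/* dst flag policy is concentrated in rt_dst_alloc(): routes we intend
 * to park in the FIB nexthop (will_cache) are created without
 * DST_NOCACHE so rt_cache_route()/rt_bind_exception() may keep them,
 * while one-off routes carry DST_HOST | DST_NOCACHE and are tracked on
 * rt_uncached_list until their last reference is dropped.
 */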

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        struct rtable *rth;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        u32 itag = 0;
        int err;

        /* Primary sanity checks. */

        if (in_dev == NULL)
                return -EINVAL;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
                if (ipv4_is_loopback(saddr))
                        goto e_inval;

        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr))
                        goto e_inval;
        } else {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto e_err;
        }
        rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
        if (!rth)
                goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->dst.output = ip_rt_bug;

        rth->rt_genid   = rt_genid(dev_net(dev));
        rth->rt_flags   = RTCF_MULTICAST;
        rth->rt_type    = RTN_MULTICAST;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        if (our) {
                rth->dst.input = ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        skb_dst_set(skb, &rth->dst);
        return 0;

e_nobufs:
        return -ENOBUFS;
e_inval:
        return -EINVAL;
e_err:
        return err;
}


static void ip_handle_martian_source(struct net_device *dev,
                                     struct in_device *in_dev,
                                     struct sk_buff *skb,
                                     __be32 daddr,
                                     __be32 saddr)
{
        RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 *      RFC 1812 recommendation: if the source is martian,
                 *      the only hint we have is the MAC header.
                 */
                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
                        &daddr, &saddr, dev->name);
                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
                        print_hex_dump(KERN_WARNING, "ll header: ",
                                       DUMP_PREFIX_OFFSET, 16, 1,
                                       skb_mac_header(skb),
                                       dev->hard_header_len, true);
                }
        }
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
                           const struct fib_result *res,
                           struct in_device *in_dev,
                           __be32 daddr, __be32 saddr, u32 tos)
{
        struct rtable *rth;
        int err;
        struct in_device *out_dev;
        unsigned int flags = 0;
        bool do_cache;
        u32 itag;

        /* get a working reference to the output device */
        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
        if (out_dev == NULL) {
                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
                return -EINVAL;
        }

        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
                                  in_dev->dev, in_dev, &itag);
        if (err < 0) {
                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
                                         saddr);

                goto cleanup;
        }

        if (out_dev == in_dev && err &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
                flags |= RTCF_DOREDIRECT;

        if (skb->protocol != htons(ETH_P_IP)) {
                /* Not IP (i.e. ARP). Do not create a route if it is
                 * invalid for proxy arp. DNAT routes are always valid.
                 *
                 * The proxy arp feature has been extended to allow ARP
                 * replies back out the same interface, to support
                 * Private VLAN switch technologies. See arp.c.
                 */
                if (out_dev == in_dev &&
                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
                        err = -EINVAL;
                        goto cleanup;
                }
        }

        do_cache = false;
        if (res->fi) {
                if (!itag) {
                        rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
                        if (rt_cache_valid(rth)) {
                                skb_dst_set_noref(skb, &rth->dst);
                                goto out;
                        }
                        do_cache = true;
                }
        }

        rth = rt_dst_alloc(out_dev->dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
                           IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
        if (!rth) {
                err = -ENOBUFS;
                goto cleanup;
        }

        rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
        rth->rt_flags = flags;
        rth->rt_type = res->type;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);

        rth->dst.input = ip_forward;
        rth->dst.output = ip_output;

        rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
        skb_dst_set(skb, &rth->dst);
out:
        err = 0;
 cleanup:
        return err;
}

static int ip_mkroute_input(struct sk_buff *skb,
                            struct fib_result *res,
                            const struct flowi4 *fl4,
                            struct in_device *in_dev,
                            __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res->fi && res->fi->fib_nhs > 1)
                fib_select_multipath(res);
#endif

        /* create a routing cache entry */
        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}

/*
 *      NOTE. We drop all packets that have a local source
 *      address, because every properly looped back packet
 *      must already have the correct destination attached by the output routine.
 *
 *      Such an approach solves two big problems:
 *      1. Non-simplex devices are handled properly.
 *      2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 *      called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                               u8 tos, struct net_device *dev)
{
        struct fib_result res;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        struct flowi4   fl4;
        unsigned int    flags = 0;
        u32             itag = 0;
        struct rtable   *rth;
        int             err = -EINVAL;
        struct net      *net = dev_net(dev);
        bool do_cache;

        /* IP on this device is disabled. */

        if (!in_dev)
                goto out;

        /* Check for the most weird martians, which cannot be detected
           by fib_lookup.
         */
1552
1553        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1554                goto martian_source;
1555
1556        res.fi = NULL;
1557        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1558                goto brd_input;
1559
1560        /* Accept zero addresses only for the limited broadcast;
1561         * I do not even know whether to fix this or not. Waiting for complaints :-)
1562         */
1563        if (ipv4_is_zeronet(saddr))
1564                goto martian_source;
1565
1566        if (ipv4_is_zeronet(daddr))
1567                goto martian_destination;
1568
1569        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
1570                if (ipv4_is_loopback(daddr))
1571                        goto martian_destination;
1572
1573                if (ipv4_is_loopback(saddr))
1574                        goto martian_source;
1575        }
1576
1577        /*
1578         *      Now we are ready to route the packet.
1579         */
1580        fl4.flowi4_oif = 0;
1581        fl4.flowi4_iif = dev->ifindex;
1582        fl4.flowi4_mark = skb->mark;
1583        fl4.flowi4_tos = tos;
1584        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1585        fl4.daddr = daddr;
1586        fl4.saddr = saddr;
1587        err = fib_lookup(net, &fl4, &res);
1588        if (err != 0)
1589                goto no_route;
1590
1591        RT_CACHE_STAT_INC(in_slow_tot);
1592
1593        if (res.type == RTN_BROADCAST)
1594                goto brd_input;
1595
1596        if (res.type == RTN_LOCAL) {
1597                err = fib_validate_source(skb, saddr, daddr, tos,
1598                                          net->loopback_dev->ifindex,
1599                                          dev, in_dev, &itag);
1600                if (err < 0)
1601                        goto martian_source_keep_err;
1602                goto local_input;
1603        }
1604
1605        if (!IN_DEV_FORWARD(in_dev))
1606                goto no_route;
1607        if (res.type != RTN_UNICAST)
1608                goto martian_destination;
1609
1610        err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1611out:    return err;
1612
1613brd_input:
1614        if (skb->protocol != htons(ETH_P_IP))
1615                goto e_inval;
1616
1617        if (!ipv4_is_zeronet(saddr)) {
1618                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1619                                          in_dev, &itag);
1620                if (err < 0)
1621                        goto martian_source_keep_err;
1622        }
1623        flags |= RTCF_BROADCAST;
1624        res.type = RTN_BROADCAST;
1625        RT_CACHE_STAT_INC(in_brd);
1626
1627local_input:
1628        do_cache = false;
1629        if (res.fi) {
1630                if (!itag) {
1631                        rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1632                        if (rt_cache_valid(rth)) {
1633                                skb_dst_set_noref(skb, &rth->dst);
1634                                err = 0;
1635                                goto out;
1636                        }
1637                        do_cache = true;
1638                }
1639        }
1640
1641        rth = rt_dst_alloc(net->loopback_dev,
1642                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1643        if (!rth)
1644                goto e_nobufs;
1645
1646        rth->dst.input = ip_local_deliver;
1647        rth->dst.output = ip_rt_bug;
1648#ifdef CONFIG_IP_ROUTE_CLASSID
1649        rth->dst.tclassid = itag;
1650#endif
1651
1652        rth->rt_genid = rt_genid(net);
1653        rth->rt_flags   = flags|RTCF_LOCAL;
1654        rth->rt_type    = res.type;
1655        rth->rt_is_input = 1;
1656        rth->rt_iif     = 0;
1657        rth->rt_pmtu    = 0;
1658        rth->rt_gateway = 0;
1659        INIT_LIST_HEAD(&rth->rt_uncached);
1660        if (res.type == RTN_UNREACHABLE) {
1661                rth->dst.input = ip_error;
1662                rth->dst.error = -err;
1663                rth->rt_flags   &= ~RTCF_LOCAL;
1664        }
1665        if (do_cache)
1666                rt_cache_route(&FIB_RES_NH(res), rth);
1667        skb_dst_set(skb, &rth->dst);
1668        err = 0;
1669        goto out;
1670
1671no_route:
1672        RT_CACHE_STAT_INC(in_no_route);
1673        res.type = RTN_UNREACHABLE;
1674        if (err == -ESRCH)
1675                err = -ENETUNREACH;
1676        goto local_input;
1677
1678        /*
1679         *      Do not cache martian addresses: they should be logged (RFC1812)
1680         */
1681martian_destination:
1682        RT_CACHE_STAT_INC(in_martian_dst);
1683#ifdef CONFIG_IP_ROUTE_VERBOSE
1684        if (IN_DEV_LOG_MARTIANS(in_dev))
1685                net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1686                                     &daddr, &saddr, dev->name);
1687#endif
1688
1689e_inval:
1690        err = -EINVAL;
1691        goto out;
1692
1693e_nobufs:
1694        err = -ENOBUFS;
1695        goto out;
1696
1697martian_source:
1698        err = -EINVAL;
1699martian_source_keep_err:
1700        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1701        goto out;
1702}
1703
1704int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1705                         u8 tos, struct net_device *dev)
1706{
1707        int res;
1708
1709        rcu_read_lock();
1710
1711        /* Multicast recognition logic was moved from the route cache to here.
1712           The problem was that too many Ethernet cards have broken/missing
1713           hardware multicast filters :-( As a result, a host on a multicast
1714           network acquires a lot of useless route cache entries, e.g. for
1715           SDR messages from all over the world. Now we try to get rid of them.
1716           Really, provided the software IP multicast filter is organized
1717           reasonably (at least, hashed), it does not cause a slowdown
1718           compared with route cache reject entries.
1719           Note that multicast routers are not affected, because a
1720           route cache entry is created eventually.
1721         */
1722        if (ipv4_is_multicast(daddr)) {
1723                struct in_device *in_dev = __in_dev_get_rcu(dev);
1724
1725                if (in_dev) {
1726                        int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1727                                                  ip_hdr(skb)->protocol);
1728                        if (our
1729#ifdef CONFIG_IP_MROUTE
1730                                ||
1731                            (!ipv4_is_local_multicast(daddr) &&
1732                             IN_DEV_MFORWARD(in_dev))
1733#endif
1734                           ) {
1735                                int res = ip_route_input_mc(skb, daddr, saddr,
1736                                                            tos, dev, our);
1737                                rcu_read_unlock();
1738                                return res;
1739                        }
1740                }
1741                rcu_read_unlock();
1742                return -EINVAL;
1743        }
1744        res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1745        rcu_read_unlock();
1746        return res;
1747}
1748EXPORT_SYMBOL(ip_route_input_noref);
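
/*
 * Editor's note: the receive path is the main user of this entry point;
 * ip_rcv_finish() in net/ipv4/ip_input.c does roughly the following.
 * This is an abbreviated, hedged sketch (demo_rcv_finish is a made-up
 * name), not the exact source of that function.
 */
#if 0	/* illustration only */
static int demo_rcv_finish(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	/* Attach a route to the skb unless one is already set. */
	if (!skb_dst(skb) &&
	    ip_route_input_noref(skb, iph->daddr, iph->saddr,
				 iph->tos, skb->dev)) {
		kfree_skb(skb);
		return NET_RX_DROP;
	}

	/* dst.input is now ip_local_deliver, ip_forward, ... */
	return dst_input(skb);
}
#endif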
1749
1750/* called with rcu_read_lock() */
1751static struct rtable *__mkroute_output(const struct fib_result *res,
1752                                       const struct flowi4 *fl4, int orig_oif,
1753                                       struct net_device *dev_out,
1754                                       unsigned int flags)
1755{
1756        struct fib_info *fi = res->fi;
1757        struct fib_nh_exception *fnhe;
1758        struct in_device *in_dev;
1759        u16 type = res->type;
1760        struct rtable *rth;
1761
1762        in_dev = __in_dev_get_rcu(dev_out);
1763        if (!in_dev)
1764                return ERR_PTR(-EINVAL);
1765
1766        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1767                if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1768                        return ERR_PTR(-EINVAL);
1769
1770        if (ipv4_is_lbcast(fl4->daddr))
1771                type = RTN_BROADCAST;
1772        else if (ipv4_is_multicast(fl4->daddr))
1773                type = RTN_MULTICAST;
1774        else if (ipv4_is_zeronet(fl4->daddr))
1775                return ERR_PTR(-EINVAL);
1776
1777        if (dev_out->flags & IFF_LOOPBACK)
1778                flags |= RTCF_LOCAL;
1779
1780        if (type == RTN_BROADCAST) {
1781                flags |= RTCF_BROADCAST | RTCF_LOCAL;
1782                fi = NULL;
1783        } else if (type == RTN_MULTICAST) {
1784                flags |= RTCF_MULTICAST | RTCF_LOCAL;
1785                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1786                                     fl4->flowi4_proto))
1787                        flags &= ~RTCF_LOCAL;
1788                /* If a multicast route does not exist, use
1789                 * the default one, but do not use a gateway in this case.
1790                 * Yes, it is a hack.
1791                 */
1792                if (fi && res->prefixlen < 4)
1793                        fi = NULL;
1794        }
1795
1796        fnhe = NULL;
1797        if (fi) {
1798                struct rtable __rcu **prth;
1799
1800                fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
1801                if (fnhe)
1802                        prth = &fnhe->fnhe_rth;
1803                else
1804                        prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
1805                rth = rcu_dereference(*prth);
1806                if (rt_cache_valid(rth)) {
1807                        dst_hold(&rth->dst);
1808                        return rth;
1809                }
1810        }
1811        rth = rt_dst_alloc(dev_out,
1812                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
1813                           IN_DEV_CONF_GET(in_dev, NOXFRM),
1814                           fi);
1815        if (!rth)
1816                return ERR_PTR(-ENOBUFS);
1817
1818        rth->dst.output = ip_output;
1819
1820        rth->rt_genid = rt_genid(dev_net(dev_out));
1821        rth->rt_flags   = flags;
1822        rth->rt_type    = type;
1823        rth->rt_is_input = 0;
1824        rth->rt_iif     = orig_oif ? : 0;
1825        rth->rt_pmtu    = 0;
1826        rth->rt_gateway = 0;
1827        INIT_LIST_HEAD(&rth->rt_uncached);
1828
1829        RT_CACHE_STAT_INC(out_slow_tot);
1830
1831        if (flags & RTCF_LOCAL)
1832                rth->dst.input = ip_local_deliver;
1833        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1834                if (flags & RTCF_LOCAL &&
1835                    !(dev_out->flags & IFF_LOOPBACK)) {
1836                        rth->dst.output = ip_mc_output;
1837                        RT_CACHE_STAT_INC(out_slow_mc);
1838                }
1839#ifdef CONFIG_IP_MROUTE
1840                if (type == RTN_MULTICAST) {
1841                        if (IN_DEV_MFORWARD(in_dev) &&
1842                            !ipv4_is_local_multicast(fl4->daddr)) {
1843                                rth->dst.input = ip_mr_input;
1844                                rth->dst.output = ip_mc_output;
1845                        }
1846                }
1847#endif
1848        }
1849
1850        rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1851
1852        return rth;
1853}
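
/*
 * Editor's note on the lookup order above: a cached output route is taken
 * first from a per-destination exception (fnhe_rth, created e.g. when a
 * PMTU or redirect event arrives for that destination) and otherwise from
 * the next hop's per-cpu nh_pcpu_rth_output slot; only when neither holds
 * a valid entry does the code fall back to rt_dst_alloc().
 */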
1854
1855/*
1856 * Major route resolver routine.
1857 */
1858
1859struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1860{
1861        struct net_device *dev_out = NULL;
1862        __u8 tos = RT_FL_TOS(fl4);
1863        unsigned int flags = 0;
1864        struct fib_result res;
1865        struct rtable *rth;
1866        int orig_oif;
1867
1868        res.tclassid    = 0;
1869        res.fi          = NULL;
1870        res.table       = NULL;
1871
1872        orig_oif = fl4->flowi4_oif;
1873
1874        fl4->flowi4_iif = net->loopback_dev->ifindex;
1875        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1876        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1877                         RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1878
1879        rcu_read_lock();
1880        if (fl4->saddr) {
1881                rth = ERR_PTR(-EINVAL);
1882                if (ipv4_is_multicast(fl4->saddr) ||
1883                    ipv4_is_lbcast(fl4->saddr) ||
1884                    ipv4_is_zeronet(fl4->saddr))
1885                        goto out;
1886
1887                /* I removed the check for oif == dev_out->oif here.
1888                   It was wrong for two reasons:
1889                   1. ip_dev_find(net, saddr) can return the wrong iface, if
1890                      saddr is assigned to multiple interfaces.
1891                   2. Moreover, we are allowed to send packets with the saddr
1892                      of another iface. --ANK
1893                 */
1894
1895                if (fl4->flowi4_oif == 0 &&
1896                    (ipv4_is_multicast(fl4->daddr) ||
1897                     ipv4_is_lbcast(fl4->daddr))) {
1898                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1899                        dev_out = __ip_dev_find(net, fl4->saddr, false);
1900                        if (dev_out == NULL)
1901                                goto out;
1902
1903                        /* Special hack: the user can direct multicasts
1904                           and limited broadcast via the necessary interface
1905                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1906                           This hack is not just for fun, it allows
1907                           vic, vat and friends to work.
1908                           They bind a socket to loopback, set ttl to zero
1909                           and expect that it will work.
1910                           From the viewpoint of the routing cache they are
1911                           broken, because we are not allowed to build a multicast
1912                           path with a loopback source addr (look, the routing
1913                           cache cannot know that ttl is zero, so the packet
1914                           will not leave this host and the route is valid).
1915                           Luckily, this hack is a good workaround.
1916                         */
1917
1918                        fl4->flowi4_oif = dev_out->ifindex;
1919                        goto make_route;
1920                }
1921
1922                if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1923                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1924                        if (!__ip_dev_find(net, fl4->saddr, false))
1925                                goto out;
1926                }
1927        }
1928
1929
1930        if (fl4->flowi4_oif) {
1931                dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1932                rth = ERR_PTR(-ENODEV);
1933                if (dev_out == NULL)
1934                        goto out;
1935
1936                /* RACE: Check return value of inet_select_addr instead. */
1937                if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
1938                        rth = ERR_PTR(-ENETUNREACH);
1939                        goto out;
1940                }
1941                if (ipv4_is_local_multicast(fl4->daddr) ||
1942                    ipv4_is_lbcast(fl4->daddr)) {
1943                        if (!fl4->saddr)
1944                                fl4->saddr = inet_select_addr(dev_out, 0,
1945                                                              RT_SCOPE_LINK);
1946                        goto make_route;
1947                }
1948                if (fl4->saddr) {
1949                        if (ipv4_is_multicast(fl4->daddr))
1950                                fl4->saddr = inet_select_addr(dev_out, 0,
1951                                                              fl4->flowi4_scope);
1952                        else if (!fl4->daddr)
1953                                fl4->saddr = inet_select_addr(dev_out, 0,
1954                                                              RT_SCOPE_HOST);
1955                }
1956        }
1957
1958        if (!fl4->daddr) {
1959                fl4->daddr = fl4->saddr;
1960                if (!fl4->daddr)
1961                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1962                dev_out = net->loopback_dev;
1963                fl4->flowi4_oif = net->loopback_dev->ifindex;
1964                res.type = RTN_LOCAL;
1965                flags |= RTCF_LOCAL;
1966                goto make_route;
1967        }
1968
1969        if (fib_lookup(net, fl4, &res)) {
1970                res.fi = NULL;
1971                res.table = NULL;
1972                if (fl4->flowi4_oif) {
1973                        /* Apparently, the routing tables are wrong. Assume
1974                           that the destination is on-link.
1975
1976                           WHY? DW.
1977                           Because we are allowed to send to an iface
1978                           even if it has NO routes and NO assigned
1979                           addresses. When oif is specified, the routing
1980                           tables are looked up with only one purpose:
1981                           to catch whether the destination is gatewayed
1982                           rather than direct. Moreover, if MSG_DONTROUTE is
1983                           set, we send the packet, ignoring both the routing
1984                           tables and the ifaddr state. --ANK
1985
1986
1987                           We could do this even if oif is unknown
1988                           (IPv6 likely does), but we do not.
1989                         */
1990
1991                        if (fl4->saddr == 0)
1992                                fl4->saddr = inet_select_addr(dev_out, 0,
1993                                                              RT_SCOPE_LINK);
1994                        res.type = RTN_UNICAST;
1995                        goto make_route;
1996                }
1997                rth = ERR_PTR(-ENETUNREACH);
1998                goto out;
1999        }
2000
2001        if (res.type == RTN_LOCAL) {
2002                if (!fl4->saddr) {
2003                        if (res.fi->fib_prefsrc)
2004                                fl4->saddr = res.fi->fib_prefsrc;
2005                        else
2006                                fl4->saddr = fl4->daddr;
2007                }
2008                dev_out = net->loopback_dev;
2009                fl4->flowi4_oif = dev_out->ifindex;
2010                flags |= RTCF_LOCAL;
2011                goto make_route;
2012        }
2013
2014#ifdef CONFIG_IP_ROUTE_MULTIPATH
2015        if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2016                fib_select_multipath(&res);
2017        else
2018#endif
2019        if (!res.prefixlen &&
2020            res.table->tb_num_default > 1 &&
2021            res.type == RTN_UNICAST && !fl4->flowi4_oif)
2022                fib_select_default(&res);
2023
2024        if (!fl4->saddr)
2025                fl4->saddr = FIB_RES_PREFSRC(net, res);
2026
2027        dev_out = FIB_RES_DEV(res);
2028        fl4->flowi4_oif = dev_out->ifindex;
2029
2030
2031make_route:
2032        rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2033
2034out:
2035        rcu_read_unlock();
2036        return rth;
2037}
2038EXPORT_SYMBOL_GPL(__ip_route_output_key);
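
/*
 * Editor's note: a minimal, hedged usage sketch of the resolver above via
 * the ip_route_output_key() wrapper from <net/route.h>. The destination
 * address is a placeholder and demo_output_lookup is a made-up name.
 */
#if 0	/* illustration only */
static void demo_output_lookup(struct net *net)
{
	struct flowi4 fl4;
	struct rtable *rt;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = htonl(0xc0a80001);	/* 192.168.0.1, placeholder */

	rt = ip_route_output_key(net, &fl4);
	if (IS_ERR(rt))
		return;			/* e.g. -ENETUNREACH */

	pr_info("route via dev %s\n", rt->dst.dev->name);
	ip_rt_put(rt);			/* drop the reference we hold */
}
#endif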
2039
2040static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2041{
2042        return NULL;
2043}
2044
2045static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2046{
2047        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2048
2049        return mtu ? : dst->dev->mtu;
2050}
2051
2052static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2053                                          struct sk_buff *skb, u32 mtu)
2054{
2055}
2056
2057static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2058                                       struct sk_buff *skb)
2059{
2060}
2061
2062static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2063                                          unsigned long old)
2064{
2065        return NULL;
2066}
2067
2068static struct dst_ops ipv4_dst_blackhole_ops = {
2069        .family                 =       AF_INET,
2070        .protocol               =       cpu_to_be16(ETH_P_IP),
2071        .check                  =       ipv4_blackhole_dst_check,
2072        .mtu                    =       ipv4_blackhole_mtu,
2073        .default_advmss         =       ipv4_default_advmss,
2074        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2075        .redirect               =       ipv4_rt_blackhole_redirect,
2076        .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2077        .neigh_lookup           =       ipv4_neigh_lookup,
2078};
2079
2080struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2081{
2082        struct rtable *ort = (struct rtable *) dst_orig;
2083        struct rtable *rt;
2084
2085        rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2086        if (rt) {
2087                struct dst_entry *new = &rt->dst;
2088
2089                new->__use = 1;
2090                new->input = dst_discard;
2091                new->output = dst_discard;
2092
2093                new->dev = ort->dst.dev;
2094                if (new->dev)
2095                        dev_hold(new->dev);
2096
2097                rt->rt_is_input = ort->rt_is_input;
2098                rt->rt_iif = ort->rt_iif;
2099                rt->rt_pmtu = ort->rt_pmtu;
2100
2101                rt->rt_genid = rt_genid(net);
2102                rt->rt_flags = ort->rt_flags;
2103                rt->rt_type = ort->rt_type;
2104                rt->rt_gateway = ort->rt_gateway;
2105
2106                INIT_LIST_HEAD(&rt->rt_uncached);
2107
2108                dst_free(new);
2109        }
2110
2111        dst_release(dst_orig);
2112
2113        return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2114}
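
/*
 * Editor's note: as far as one can tell, the blackhole route above is used
 * by the xfrm code (via the per-family ->blackhole_route hook) when a
 * lookup cannot complete yet: the caller gets back a harmless dst whose
 * input/output handlers discard packets and whose ops are all no-ops,
 * instead of an error.
 */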
2115
2116struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2117                                    struct sock *sk)
2118{
2119        struct rtable *rt = __ip_route_output_key(net, flp4);
2120
2121        if (IS_ERR(rt))
2122                return rt;
2123
2124        if (flp4->flowi4_proto)
2125                rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2126                                                   flowi4_to_flowi(flp4),
2127                                                   sk, 0);
2128
2129        return rt;
2130}
2131EXPORT_SYMBOL_GPL(ip_route_output_flow);
2132
2133static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2134                        struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
2135                        u32 seq, int event, int nowait, unsigned int flags)
2136{
2137        struct rtable *rt = skb_rtable(skb);
2138        struct rtmsg *r;
2139        struct nlmsghdr *nlh;
2140        unsigned long expires = 0;
2141        u32 error;
2142        u32 metrics[RTAX_MAX];
2143
2144        nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2145        if (nlh == NULL)
2146                return -EMSGSIZE;
2147
2148        r = nlmsg_data(nlh);
2149        r->rtm_family    = AF_INET;
2150        r->rtm_dst_len  = 32;
2151        r->rtm_src_len  = 0;
2152        r->rtm_tos      = fl4->flowi4_tos;
2153        r->rtm_table    = RT_TABLE_MAIN;
2154        if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2155                goto nla_put_failure;
2156        r->rtm_type     = rt->rt_type;
2157        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2158        r->rtm_protocol = RTPROT_UNSPEC;
2159        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2160        if (rt->rt_flags & RTCF_NOTIFY)
2161                r->rtm_flags |= RTM_F_NOTIFY;
2162
2163        if (nla_put_be32(skb, RTA_DST, dst))
2164                goto nla_put_failure;
2165        if (src) {
2166                r->rtm_src_len = 32;
2167                if (nla_put_be32(skb, RTA_SRC, src))
2168                        goto nla_put_failure;
2169        }
2170        if (rt->dst.dev &&
2171            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2172                goto nla_put_failure;
2173#ifdef CONFIG_IP_ROUTE_CLASSID
2174        if (rt->dst.tclassid &&
2175            nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2176                goto nla_put_failure;
2177#endif
2178        if (!rt_is_input_route(rt) &&
2179            fl4->saddr != src) {
2180                if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2181                        goto nla_put_failure;
2182        }
2183        if (rt->rt_gateway &&
2184            nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2185                goto nla_put_failure;
2186
2187        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2188        if (rt->rt_pmtu)
2189                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2190        if (rtnetlink_put_metrics(skb, metrics) < 0)
2191                goto nla_put_failure;
2192
2193        if (fl4->flowi4_mark &&
2194            nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
2195                goto nla_put_failure;
2196
2197        error = rt->dst.error;
2198        expires = rt->dst.expires;
2199        if (expires) {
2200                if (time_before(jiffies, expires))
2201                        expires -= jiffies;
2202                else
2203                        expires = 0;
2204        }
2205
2206        if (rt_is_input_route(rt)) {
2207                if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2208                        goto nla_put_failure;
2209        }
2210
2211        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2212                goto nla_put_failure;
2213
2214        return nlmsg_end(skb, nlh);
2215
2216nla_put_failure:
2217        nlmsg_cancel(skb, nlh);
2218        return -EMSGSIZE;
2219}
2220
2221static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2222{
2223        struct net *net = sock_net(in_skb->sk);
2224        struct rtmsg *rtm;
2225        struct nlattr *tb[RTA_MAX+1];
2226        struct rtable *rt = NULL;
2227        struct flowi4 fl4;
2228        __be32 dst = 0;
2229        __be32 src = 0;
2230        u32 iif;
2231        int err;
2232        int mark;
2233        struct sk_buff *skb;
2234
2235        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2236        if (err < 0)
2237                goto errout;
2238
2239        rtm = nlmsg_data(nlh);
2240
2241        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2242        if (skb == NULL) {
2243                err = -ENOBUFS;
2244                goto errout;
2245        }
2246
2247        /* Reserve room for dummy headers; this skb can pass
2248           through a good chunk of the routing engine.
2249         */
2250        skb_reset_mac_header(skb);
2251        skb_reset_network_header(skb);
2252
2253        /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2254        ip_hdr(skb)->protocol = IPPROTO_ICMP;
2255        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2256
2257        src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2258        dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2259        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2260        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2261
2262        memset(&fl4, 0, sizeof(fl4));
2263        fl4.daddr = dst;
2264        fl4.saddr = src;
2265        fl4.flowi4_tos = rtm->rtm_tos;
2266        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2267        fl4.flowi4_mark = mark;
2268
2269        if (iif) {
2270                struct net_device *dev;
2271
2272                dev = __dev_get_by_index(net, iif);
2273                if (dev == NULL) {
2274                        err = -ENODEV;
2275                        goto errout_free;
2276                }
2277
2278                skb->protocol   = htons(ETH_P_IP);
2279                skb->dev        = dev;
2280                skb->mark       = mark;
2281                local_bh_disable();
2282                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2283                local_bh_enable();
2284
2285                rt = skb_rtable(skb);
2286                if (err == 0 && rt->dst.error)
2287                        err = -rt->dst.error;
2288        } else {
2289                rt = ip_route_output_key(net, &fl4);
2290
2291                err = 0;
2292                if (IS_ERR(rt))
2293                        err = PTR_ERR(rt);
2294        }
2295
2296        if (err)
2297                goto errout_free;
2298
2299        skb_dst_set(skb, &rt->dst);
2300        if (rtm->rtm_flags & RTM_F_NOTIFY)
2301                rt->rt_flags |= RTCF_NOTIFY;
2302
2303        err = rt_fill_info(net, dst, src, &fl4, skb,
2304                           NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2305                           RTM_NEWROUTE, 0, 0);
2306        if (err <= 0)
2307                goto errout_free;
2308
2309        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2310errout:
2311        return err;
2312
2313errout_free:
2314        kfree_skb(skb);
2315        goto errout;
2316}
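
/*
 * Editor's note: inet_rtm_getroute() above is what answers "ip route get".
 * Below is a minimal, hedged userspace sketch of such an RTM_GETROUTE
 * query (demo_route_get is a made-up name; a real client would also
 * recv() and parse the RTM_NEWROUTE reply built by rt_fill_info()).
 */
#if 0	/* illustration only: userspace, not kernel code */
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int demo_route_get(in_addr_t dst)
{
	struct {
		struct nlmsghdr	nlh;
		struct rtmsg	rtm;
		struct rtattr	rta;
		in_addr_t	addr;
	} req;
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = sizeof(req);
	req.nlh.nlmsg_type = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family = AF_INET;
	req.rta.rta_type = RTA_DST;
	req.rta.rta_len = RTA_LENGTH(sizeof(dst));
	req.addr = dst;			/* address to resolve */

	if (sendto(fd, &req, sizeof(req), 0,
		   (struct sockaddr *)&sa, sizeof(sa)) < 0) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}
#endif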
2317
2318int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2319{
2320        return skb->len;
2321}
2322
2323void ip_rt_multicast_event(struct in_device *in_dev)
2324{
2325        rt_cache_flush(dev_net(in_dev->dev));
2326}
2327
2328#ifdef CONFIG_SYSCTL
2329static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2330                                        void __user *buffer,
2331                                        size_t *lenp, loff_t *ppos)
2332{
2333        if (write) {
2334                rt_cache_flush((struct net *)__ctl->extra1);
2335                return 0;
2336        }
2337
2338        return -EINVAL;
2339}
2340
2341static ctl_table ipv4_route_table[] = {
2342        {
2343                .procname       = "gc_thresh",
2344                .data           = &ipv4_dst_ops.gc_thresh,
2345                .maxlen         = sizeof(int),
2346                .mode           = 0644,
2347                .proc_handler   = proc_dointvec,
2348        },
2349        {
2350                .procname       = "max_size",
2351                .data           = &ip_rt_max_size,
2352                .maxlen         = sizeof(int),
2353                .mode           = 0644,
2354                .proc_handler   = proc_dointvec,
2355        },
2356        {
2357                /*  Deprecated. Use gc_min_interval_ms */
2358
2359                .procname       = "gc_min_interval",
2360                .data           = &ip_rt_gc_min_interval,
2361                .maxlen         = sizeof(int),
2362                .mode           = 0644,
2363                .proc_handler   = proc_dointvec_jiffies,
2364        },
2365        {
2366                .procname       = "gc_min_interval_ms",
2367                .data           = &ip_rt_gc_min_interval,
2368                .maxlen         = sizeof(int),
2369                .mode           = 0644,
2370                .proc_handler   = proc_dointvec_ms_jiffies,
2371        },
2372        {
2373                .procname       = "gc_timeout",
2374                .data           = &ip_rt_gc_timeout,
2375                .maxlen         = sizeof(int),
2376                .mode           = 0644,
2377                .proc_handler   = proc_dointvec_jiffies,
2378        },
2379        {
2380                .procname       = "gc_interval",
2381                .data           = &ip_rt_gc_interval,
2382                .maxlen         = sizeof(int),
2383                .mode           = 0644,
2384                .proc_handler   = proc_dointvec_jiffies,
2385        },
2386        {
2387                .procname       = "redirect_load",
2388                .data           = &ip_rt_redirect_load,
2389                .maxlen         = sizeof(int),
2390                .mode           = 0644,
2391                .proc_handler   = proc_dointvec,
2392        },
2393        {
2394                .procname       = "redirect_number",
2395                .data           = &ip_rt_redirect_number,
2396                .maxlen         = sizeof(int),
2397                .mode           = 0644,
2398                .proc_handler   = proc_dointvec,
2399        },
2400        {
2401                .procname       = "redirect_silence",
2402                .data           = &ip_rt_redirect_silence,
2403                .maxlen         = sizeof(int),
2404                .mode           = 0644,
2405                .proc_handler   = proc_dointvec,
2406        },
2407        {
2408                .procname       = "error_cost",
2409                .data           = &ip_rt_error_cost,
2410                .maxlen         = sizeof(int),
2411                .mode           = 0644,
2412                .proc_handler   = proc_dointvec,
2413        },
2414        {
2415                .procname       = "error_burst",
2416                .data           = &ip_rt_error_burst,
2417                .maxlen         = sizeof(int),
2418                .mode           = 0644,
2419                .proc_handler   = proc_dointvec,
2420        },
2421        {
2422                .procname       = "gc_elasticity",
2423                .data           = &ip_rt_gc_elasticity,
2424                .maxlen         = sizeof(int),
2425                .mode           = 0644,
2426                .proc_handler   = proc_dointvec,
2427        },
2428        {
2429                .procname       = "mtu_expires",
2430                .data           = &ip_rt_mtu_expires,
2431                .maxlen         = sizeof(int),
2432                .mode           = 0644,
2433                .proc_handler   = proc_dointvec_jiffies,
2434        },
2435        {
2436                .procname       = "min_pmtu",
2437                .data           = &ip_rt_min_pmtu,
2438                .maxlen         = sizeof(int),
2439                .mode           = 0644,
2440                .proc_handler   = proc_dointvec,
2441        },
2442        {
2443                .procname       = "min_adv_mss",
2444                .data           = &ip_rt_min_advmss,
2445                .maxlen         = sizeof(int),
2446                .mode           = 0644,
2447                .proc_handler   = proc_dointvec,
2448        },
2449        { }
2450};
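
/*
 * Editor's note: the table above is exposed under /proc/sys/net/ipv4/route/
 * (e.g. /proc/sys/net/ipv4/route/gc_timeout), so each 0644 knob can be read
 * and written at runtime; the per-netns "flush" entry registered below is
 * write-only (0200) and invalidates the cached routes via rt_cache_flush().
 */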
2451
2452static struct ctl_table ipv4_route_flush_table[] = {
2453        {
2454                .procname       = "flush",
2455                .maxlen         = sizeof(int),
2456                .mode           = 0200,
2457                .proc_handler   = ipv4_sysctl_rtcache_flush,
2458        },
2459        { },
2460};
2461
2462static __net_init int sysctl_route_net_init(struct net *net)
2463{
2464        struct ctl_table *tbl;
2465
2466        tbl = ipv4_route_flush_table;
2467        if (!net_eq(net, &init_net)) {
2468                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2469                if (tbl == NULL)
2470                        goto err_dup;
2471        }
2472        tbl[0].extra1 = net;
2473
2474        net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2475        if (net->ipv4.route_hdr == NULL)
2476                goto err_reg;
2477        return 0;
2478
2479err_reg:
2480        if (tbl != ipv4_route_flush_table)
2481                kfree(tbl);
2482err_dup:
2483        return -ENOMEM;
2484}
2485
2486static __net_exit void sysctl_route_net_exit(struct net *net)
2487{
2488        struct ctl_table *tbl;
2489
2490        tbl = net->ipv4.route_hdr->ctl_table_arg;
2491        unregister_net_sysctl_table(net->ipv4.route_hdr);
2492        BUG_ON(tbl == ipv4_route_flush_table);
2493        kfree(tbl);
2494}
2495
2496static __net_initdata struct pernet_operations sysctl_route_ops = {
2497        .init = sysctl_route_net_init,
2498        .exit = sysctl_route_net_exit,
2499};
2500#endif
2501
2502static __net_init int rt_genid_init(struct net *net)
2503{
2504        atomic_set(&net->rt_genid, 0);
2505        get_random_bytes(&net->ipv4.dev_addr_genid,
2506                         sizeof(net->ipv4.dev_addr_genid));
2507        return 0;
2508}
2509
2510static __net_initdata struct pernet_operations rt_genid_ops = {
2511        .init = rt_genid_init,
2512};
2513
2514static int __net_init ipv4_inetpeer_init(struct net *net)
2515{
2516        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2517
2518        if (!bp)
2519                return -ENOMEM;
2520        inet_peer_base_init(bp);
2521        net->ipv4.peers = bp;
2522        return 0;
2523}
2524
2525static void __net_exit ipv4_inetpeer_exit(struct net *net)
2526{
2527        struct inet_peer_base *bp = net->ipv4.peers;
2528
2529        net->ipv4.peers = NULL;
2530        inetpeer_invalidate_tree(bp);
2531        kfree(bp);
2532}
2533
2534static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2535        .init   =       ipv4_inetpeer_init,
2536        .exit   =       ipv4_inetpeer_exit,
2537};
2538
2539#ifdef CONFIG_IP_ROUTE_CLASSID
2540struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2541#endif /* CONFIG_IP_ROUTE_CLASSID */
2542
2543int __init ip_rt_init(void)
2544{
2545        int rc = 0;
2546
2547#ifdef CONFIG_IP_ROUTE_CLASSID
2548        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2549        if (!ip_rt_acct)
2550                panic("IP: failed to allocate ip_rt_acct\n");
2551#endif
2552
2553        ipv4_dst_ops.kmem_cachep =
2554                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2555                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2556
2557        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2558
2559        if (dst_entries_init(&ipv4_dst_ops) < 0)
2560                panic("IP: failed to allocate ipv4_dst_ops counter\n");
2561
2562        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2563                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2564
2565        ipv4_dst_ops.gc_thresh = ~0;
2566        ip_rt_max_size = INT_MAX;
2567
2568        devinet_init();
2569        ip_fib_init();
2570
2571        if (ip_rt_proc_init())
2572                pr_err("Unable to create route proc files\n");
2573#ifdef CONFIG_XFRM
2574        xfrm_init();
2575        xfrm4_init(ip_rt_max_size);
2576#endif
2577        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2578
2579#ifdef CONFIG_SYSCTL
2580        register_pernet_subsys(&sysctl_route_ops);
2581#endif
2582        register_pernet_subsys(&rt_genid_ops);
2583        register_pernet_subsys(&ipv4_inetpeer_ops);
2584        return rc;
2585}
2586
2587#ifdef CONFIG_SYSCTL
2588/*
2589 * We really need to sanitize the damn ipv4 init order, then all
2590 * this nonsense will go away.
2591 */
2592void __init ip_static_sysctl_init(void)
2593{
2594        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2595}
2596#endif
2597