linux/net/ipv4/route.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              ROUTE - implementation of the IP router.
   7 *
   8 * Authors:     Ross Biro
   9 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13 *
  14 * Fixes:
  15 *              Alan Cox        :       Verify area fixes.
  16 *              Alan Cox        :       cli() protects routing changes
  17 *              Rui Oliveira    :       ICMP routing table updates
  18 *              (rco@di.uminho.pt)      Routing table insertion and update
  19 *              Linus Torvalds  :       Rewrote bits to be sensible
  20 *              Alan Cox        :       Added BSD route gw semantics
  21 *              Alan Cox        :       Super /proc >4K
  22 *              Alan Cox        :       MTU in route table
  23 *              Alan Cox        :       MSS actually. Also added the window
  24 *                                      clamper.
  25 *              Sam Lantinga    :       Fixed route matching in rt_del()
  26 *              Alan Cox        :       Routing cache support.
  27 *              Alan Cox        :       Removed compatibility cruft.
  28 *              Alan Cox        :       RTF_REJECT support.
  29 *              Alan Cox        :       TCP irtt support.
  30 *              Jonathan Naylor :       Added Metric support.
  31 *      Miquel van Smoorenburg  :       BSD API fixes.
  32 *      Miquel van Smoorenburg  :       Metrics.
  33 *              Alan Cox        :       Use __u32 properly
  34 *              Alan Cox        :       Aligned routing errors more closely with BSD
  35 *                                      our system is still very different.
  36 *              Alan Cox        :       Faster /proc handling
  37 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38 *                                      routing caches and better behaviour.
  39 *
  40 *              Olaf Erb        :       irtt wasn't being copied right.
  41 *              Bjorn Ekwall    :       Kerneld route support.
  42 *              Alan Cox        :       Multicast fixed (I hope)
  43 *              Pavel Krauz     :       Limited broadcast fixed
  44 *              Mike McLagan    :       Routing by source
  45 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46 *                                      route.c and rewritten from scratch.
  47 *              Andi Kleen      :       Load-limit warning messages.
  48 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52 *              Marc Boucher    :       routing by fwmark
  53 *      Robert Olsson           :       Added rt_cache statistics
  54 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57 *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58 *
  59 *              This program is free software; you can redistribute it and/or
  60 *              modify it under the terms of the GNU General Public License
  61 *              as published by the Free Software Foundation; either version
  62 *              2 of the License, or (at your option) any later version.
  63 */
  64
  65#include <linux/module.h>
  66#include <asm/uaccess.h>
  67#include <asm/system.h>
  68#include <linux/bitops.h>
  69#include <linux/types.h>
  70#include <linux/kernel.h>
  71#include <linux/mm.h>
  72#include <linux/bootmem.h>
  73#include <linux/string.h>
  74#include <linux/socket.h>
  75#include <linux/sockios.h>
  76#include <linux/errno.h>
  77#include <linux/in.h>
  78#include <linux/inet.h>
  79#include <linux/netdevice.h>
  80#include <linux/proc_fs.h>
  81#include <linux/init.h>
  82#include <linux/workqueue.h>
  83#include <linux/skbuff.h>
  84#include <linux/inetdevice.h>
  85#include <linux/igmp.h>
  86#include <linux/pkt_sched.h>
  87#include <linux/mroute.h>
  88#include <linux/netfilter_ipv4.h>
  89#include <linux/random.h>
  90#include <linux/jhash.h>
  91#include <linux/rcupdate.h>
  92#include <linux/times.h>
  93#include <net/dst.h>
  94#include <net/net_namespace.h>
  95#include <net/protocol.h>
  96#include <net/ip.h>
  97#include <net/route.h>
  98#include <net/inetpeer.h>
  99#include <net/sock.h>
 100#include <net/ip_fib.h>
 101#include <net/arp.h>
 102#include <net/tcp.h>
 103#include <net/icmp.h>
 104#include <net/xfrm.h>
 105#include <net/netevent.h>
 106#include <net/rtnetlink.h>
 107#ifdef CONFIG_SYSCTL
 108#include <linux/sysctl.h>
 109#endif
 110
 111#define RT_FL_TOS(oldflp) \
 112    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 113
 114#define IP_MAX_MTU      0xFFF0
 115
 116#define RT_GC_TIMEOUT (300*HZ)
 117
 118static int ip_rt_max_size;
 119static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 120static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
 121static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 122static int ip_rt_redirect_number __read_mostly  = 9;
 123static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 124static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 125static int ip_rt_error_cost __read_mostly       = HZ;
 126static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 127static int ip_rt_gc_elasticity __read_mostly    = 8;
 128static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 129static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 130static int ip_rt_min_advmss __read_mostly       = 256;
 131static int ip_rt_secret_interval __read_mostly  = 10 * 60 * HZ;
 132static int rt_chain_length_max __read_mostly    = 20;
 133
 134static struct delayed_work expires_work;
 135static unsigned long expires_ljiffies;
 136
 137/*
 138 *      Interface to generic destination cache.
 139 */
 140
 141static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 142static void              ipv4_dst_destroy(struct dst_entry *dst);
 143static void              ipv4_dst_ifdown(struct dst_entry *dst,
 144                                         struct net_device *dev, int how);
 145static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 146static void              ipv4_link_failure(struct sk_buff *skb);
 147static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 148static int rt_garbage_collect(struct dst_ops *ops);
 149static void rt_emergency_hash_rebuild(struct net *net);
 150
 151
 152static struct dst_ops ipv4_dst_ops = {
 153        .family =               AF_INET,
 154        .protocol =             cpu_to_be16(ETH_P_IP),
 155        .gc =                   rt_garbage_collect,
 156        .check =                ipv4_dst_check,
 157        .destroy =              ipv4_dst_destroy,
 158        .ifdown =               ipv4_dst_ifdown,
 159        .negative_advice =      ipv4_negative_advice,
 160        .link_failure =         ipv4_link_failure,
 161        .update_pmtu =          ip_rt_update_pmtu,
 162        .local_out =            __ip_local_out,
 163        .entries =              ATOMIC_INIT(0),
 164};
 165
 166#define ECN_OR_COST(class)      TC_PRIO_##class
 167
 168const __u8 ip_tos2prio[16] = {
 169        TC_PRIO_BESTEFFORT,
 170        ECN_OR_COST(FILLER),
 171        TC_PRIO_BESTEFFORT,
 172        ECN_OR_COST(BESTEFFORT),
 173        TC_PRIO_BULK,
 174        ECN_OR_COST(BULK),
 175        TC_PRIO_BULK,
 176        ECN_OR_COST(BULK),
 177        TC_PRIO_INTERACTIVE,
 178        ECN_OR_COST(INTERACTIVE),
 179        TC_PRIO_INTERACTIVE,
 180        ECN_OR_COST(INTERACTIVE),
 181        TC_PRIO_INTERACTIVE_BULK,
 182        ECN_OR_COST(INTERACTIVE_BULK),
 183        TC_PRIO_INTERACTIVE_BULK,
 184        ECN_OR_COST(INTERACTIVE_BULK)
 185};
 186
 187
 188/*
 189 * Route cache.
 190 */
 191
 192/* The locking scheme is rather straightforward:
 193 *
 194 * 1) Read-Copy Update protects the buckets of the central route hash.
 195 * 2) Only writers remove entries, and they hold the lock
 196 *    as they look at rtable reference counts.
 197 * 3) Only readers acquire references to rtable entries;
 198 *    they do so with atomic increments and with the
 199 *    lock held.
 200 */
 201
 202struct rt_hash_bucket {
 203        struct rtable   *chain;
 204};
 205
 206#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 207        defined(CONFIG_PROVE_LOCKING)
 208/*
 209 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
 210 * The size of this table is a power of two and depends on the number of CPUs.
 211 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 212 */
 213#ifdef CONFIG_LOCKDEP
 214# define RT_HASH_LOCK_SZ        256
 215#else
 216# if NR_CPUS >= 32
 217#  define RT_HASH_LOCK_SZ       4096
 218# elif NR_CPUS >= 16
 219#  define RT_HASH_LOCK_SZ       2048
 220# elif NR_CPUS >= 8
 221#  define RT_HASH_LOCK_SZ       1024
 222# elif NR_CPUS >= 4
 223#  define RT_HASH_LOCK_SZ       512
 224# else
 225#  define RT_HASH_LOCK_SZ       256
 226# endif
 227#endif
 228
 229static spinlock_t       *rt_hash_locks;
 230# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 231
 232static __init void rt_hash_lock_init(void)
 233{
 234        int i;
 235
 236        rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 237                        GFP_KERNEL);
 238        if (!rt_hash_locks)
 239                panic("IP: failed to allocate rt_hash_locks\n");
 240
 241        for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 242                spin_lock_init(&rt_hash_locks[i]);
 243}
 244#else
 245# define rt_hash_lock_addr(slot) NULL
 246
 247static inline void rt_hash_lock_init(void)
 248{
 249}
 250#endif
 251
 252static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 253static unsigned                 rt_hash_mask __read_mostly;
 254static unsigned int             rt_hash_log  __read_mostly;
 255
 256static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 257#define RT_CACHE_STAT_INC(field) \
 258        (__raw_get_cpu_var(rt_cache_stat).field++)
 259
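    /*
     * Route cache hash function: mixes the destination, source, interface
     * index and the per-namespace generation id through jhash_3words() and
     * masks the result to the current table size.  Because the genid is part
     * of the hash, bumping it effectively invalidates all cached entries.
     */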
 260static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 261                int genid)
 262{
 263        return jhash_3words((__force u32)(__be32)(daddr),
 264                            (__force u32)(__be32)(saddr),
 265                            idx, genid)
 266                & rt_hash_mask;
 267}
 268
 269static inline int rt_genid(struct net *net)
 270{
 271        return atomic_read(&net->ipv4.rt_genid);
 272}
 273
 274#ifdef CONFIG_PROC_FS
 275struct rt_cache_iter_state {
 276        struct seq_net_private p;
 277        int bucket;
 278        int genid;
 279};
 280
 281static struct rtable *rt_cache_get_first(struct seq_file *seq)
 282{
 283        struct rt_cache_iter_state *st = seq->private;
 284        struct rtable *r = NULL;
 285
 286        for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 287                if (!rt_hash_table[st->bucket].chain)
 288                        continue;
 289                rcu_read_lock_bh();
 290                r = rcu_dereference(rt_hash_table[st->bucket].chain);
 291                while (r) {
 292                        if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
 293                            r->rt_genid == st->genid)
 294                                return r;
 295                        r = rcu_dereference(r->u.dst.rt_next);
 296                }
 297                rcu_read_unlock_bh();
 298        }
 299        return r;
 300}
 301
 302static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 303                                          struct rtable *r)
 304{
 305        struct rt_cache_iter_state *st = seq->private;
 306
 307        r = r->u.dst.rt_next;
 308        while (!r) {
 309                rcu_read_unlock_bh();
 310                do {
 311                        if (--st->bucket < 0)
 312                                return NULL;
 313                } while (!rt_hash_table[st->bucket].chain);
 314                rcu_read_lock_bh();
 315                r = rt_hash_table[st->bucket].chain;
 316        }
 317        return rcu_dereference(r);
 318}
 319
 320static struct rtable *rt_cache_get_next(struct seq_file *seq,
 321                                        struct rtable *r)
 322{
 323        struct rt_cache_iter_state *st = seq->private;
 324        while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 325                if (dev_net(r->u.dst.dev) != seq_file_net(seq))
 326                        continue;
 327                if (r->rt_genid == st->genid)
 328                        break;
 329        }
 330        return r;
 331}
 332
 333static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 334{
 335        struct rtable *r = rt_cache_get_first(seq);
 336
 337        if (r)
 338                while (pos && (r = rt_cache_get_next(seq, r)))
 339                        --pos;
 340        return pos ? NULL : r;
 341}
 342
 343static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 344{
 345        struct rt_cache_iter_state *st = seq->private;
 346        if (*pos)
 347                return rt_cache_get_idx(seq, *pos - 1);
 348        st->genid = rt_genid(seq_file_net(seq));
 349        return SEQ_START_TOKEN;
 350}
 351
 352static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 353{
 354        struct rtable *r;
 355
 356        if (v == SEQ_START_TOKEN)
 357                r = rt_cache_get_first(seq);
 358        else
 359                r = rt_cache_get_next(seq, v);
 360        ++*pos;
 361        return r;
 362}
 363
 364static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 365{
 366        if (v && v != SEQ_START_TOKEN)
 367                rcu_read_unlock_bh();
 368}
 369
 370static int rt_cache_seq_show(struct seq_file *seq, void *v)
 371{
 372        if (v == SEQ_START_TOKEN)
 373                seq_printf(seq, "%-127s\n",
 374                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 375                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 376                           "HHUptod\tSpecDst");
 377        else {
 378                struct rtable *r = v;
 379                int len;
 380
 381                seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 382                              "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 383                        r->u.dst.dev ? r->u.dst.dev->name : "*",
 384                        (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 385                        r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 386                        r->u.dst.__use, 0, (unsigned long)r->rt_src,
 387                        (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 388                             (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 389                        dst_metric(&r->u.dst, RTAX_WINDOW),
 390                        (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 391                              dst_metric(&r->u.dst, RTAX_RTTVAR)),
 392                        r->fl.fl4_tos,
 393                        r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 394                        r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 395                                       dev_queue_xmit) : 0,
 396                        r->rt_spec_dst, &len);
 397
 398                seq_printf(seq, "%*s\n", 127 - len, "");
 399        }
 400        return 0;
 401}
 402
 403static const struct seq_operations rt_cache_seq_ops = {
 404        .start  = rt_cache_seq_start,
 405        .next   = rt_cache_seq_next,
 406        .stop   = rt_cache_seq_stop,
 407        .show   = rt_cache_seq_show,
 408};
 409
 410static int rt_cache_seq_open(struct inode *inode, struct file *file)
 411{
 412        return seq_open_net(inode, file, &rt_cache_seq_ops,
 413                        sizeof(struct rt_cache_iter_state));
 414}
 415
 416static const struct file_operations rt_cache_seq_fops = {
 417        .owner   = THIS_MODULE,
 418        .open    = rt_cache_seq_open,
 419        .read    = seq_read,
 420        .llseek  = seq_lseek,
 421        .release = seq_release_net,
 422};
 423
 424
 425static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 426{
 427        int cpu;
 428
 429        if (*pos == 0)
 430                return SEQ_START_TOKEN;
 431
 432        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 433                if (!cpu_possible(cpu))
 434                        continue;
 435                *pos = cpu+1;
 436                return &per_cpu(rt_cache_stat, cpu);
 437        }
 438        return NULL;
 439}
 440
 441static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 442{
 443        int cpu;
 444
 445        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 446                if (!cpu_possible(cpu))
 447                        continue;
 448                *pos = cpu+1;
 449                return &per_cpu(rt_cache_stat, cpu);
 450        }
 451        return NULL;
 452
 453}
 454
 455static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 456{
 457
 458}
 459
 460static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 461{
 462        struct rt_cache_stat *st = v;
 463
 464        if (v == SEQ_START_TOKEN) {
 465                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 466                return 0;
 467        }
 468
 469        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 470                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 471                   atomic_read(&ipv4_dst_ops.entries),
 472                   st->in_hit,
 473                   st->in_slow_tot,
 474                   st->in_slow_mc,
 475                   st->in_no_route,
 476                   st->in_brd,
 477                   st->in_martian_dst,
 478                   st->in_martian_src,
 479
 480                   st->out_hit,
 481                   st->out_slow_tot,
 482                   st->out_slow_mc,
 483
 484                   st->gc_total,
 485                   st->gc_ignored,
 486                   st->gc_goal_miss,
 487                   st->gc_dst_overflow,
 488                   st->in_hlist_search,
 489                   st->out_hlist_search
 490                );
 491        return 0;
 492}
 493
 494static const struct seq_operations rt_cpu_seq_ops = {
 495        .start  = rt_cpu_seq_start,
 496        .next   = rt_cpu_seq_next,
 497        .stop   = rt_cpu_seq_stop,
 498        .show   = rt_cpu_seq_show,
 499};
 500
 501
 502static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 503{
 504        return seq_open(file, &rt_cpu_seq_ops);
 505}
 506
 507static const struct file_operations rt_cpu_seq_fops = {
 508        .owner   = THIS_MODULE,
 509        .open    = rt_cpu_seq_open,
 510        .read    = seq_read,
 511        .llseek  = seq_lseek,
 512        .release = seq_release,
 513};
 514
 515#ifdef CONFIG_NET_CLS_ROUTE
 516static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
 517                           int length, int *eof, void *data)
 518{
 519        unsigned int i;
 520
 521        if ((offset & 3) || (length & 3))
 522                return -EIO;
 523
 524        if (offset >= sizeof(struct ip_rt_acct) * 256) {
 525                *eof = 1;
 526                return 0;
 527        }
 528
 529        if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
 530                length = sizeof(struct ip_rt_acct) * 256 - offset;
 531                *eof = 1;
 532        }
 533
 534        offset /= sizeof(u32);
 535
 536        if (length > 0) {
 537                u32 *dst = (u32 *) buffer;
 538
 539                *start = buffer;
 540                memset(dst, 0, length);
 541
 542                for_each_possible_cpu(i) {
 543                        unsigned int j;
 544                        u32 *src;
 545
 546                        src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
 547                        for (j = 0; j < length/4; j++)
 548                                dst[j] += src[j];
 549                }
 550        }
 551        return length;
 552}
 553#endif
 554
 555static int __net_init ip_rt_do_proc_init(struct net *net)
 556{
 557        struct proc_dir_entry *pde;
 558
 559        pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 560                        &rt_cache_seq_fops);
 561        if (!pde)
 562                goto err1;
 563
 564        pde = proc_create("rt_cache", S_IRUGO,
 565                          net->proc_net_stat, &rt_cpu_seq_fops);
 566        if (!pde)
 567                goto err2;
 568
 569#ifdef CONFIG_NET_CLS_ROUTE
 570        pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
 571                        ip_rt_acct_read, NULL);
 572        if (!pde)
 573                goto err3;
 574#endif
 575        return 0;
 576
 577#ifdef CONFIG_NET_CLS_ROUTE
 578err3:
 579        remove_proc_entry("rt_cache", net->proc_net_stat);
 580#endif
 581err2:
 582        remove_proc_entry("rt_cache", net->proc_net);
 583err1:
 584        return -ENOMEM;
 585}
 586
 587static void __net_exit ip_rt_do_proc_exit(struct net *net)
 588{
 589        remove_proc_entry("rt_cache", net->proc_net_stat);
 590        remove_proc_entry("rt_cache", net->proc_net);
 591        remove_proc_entry("rt_acct", net->proc_net);
 592}
 593
 594static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 595        .init = ip_rt_do_proc_init,
 596        .exit = ip_rt_do_proc_exit,
 597};
 598
 599static int __init ip_rt_proc_init(void)
 600{
 601        return register_pernet_subsys(&ip_rt_proc_ops);
 602}
 603
 604#else
 605static inline int ip_rt_proc_init(void)
 606{
 607        return 0;
 608}
 609#endif /* CONFIG_PROC_FS */
 610
 611static inline void rt_free(struct rtable *rt)
 612{
 613        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 614}
 615
 616static inline void rt_drop(struct rtable *rt)
 617{
 618        ip_rt_put(rt);
 619        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 620}
 621
 622static inline int rt_fast_clean(struct rtable *rth)
 623{
 624        /* Kill broadcast/multicast entries very aggressively, if they
 625           collide in the hash table with more useful entries */
 626        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 627                rth->fl.iif && rth->u.dst.rt_next;
 628}
 629
 630static inline int rt_valuable(struct rtable *rth)
 631{
 632        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 633                rth->u.dst.expires;
 634}
 635
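    /*
     * Decide whether an unreferenced cache entry may be reclaimed.
     * Hard-expired entries always may; otherwise an entry survives if it is
     * younger than tmo1 (unless it is a fast-clean broadcast/multicast
     * entry) or younger than tmo2 and still valuable.
     */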
 636static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 637{
 638        unsigned long age;
 639        int ret = 0;
 640
 641        if (atomic_read(&rth->u.dst.__refcnt))
 642                goto out;
 643
 644        ret = 1;
 645        if (rth->u.dst.expires &&
 646            time_after_eq(jiffies, rth->u.dst.expires))
 647                goto out;
 648
 649        age = jiffies - rth->u.dst.lastuse;
 650        ret = 0;
 651        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 652            (age <= tmo2 && rt_valuable(rth)))
 653                goto out;
 654        ret = 1;
 655out:    return ret;
 656}
 657
 658/* Bits of score are:
 659 * 31: very valuable
 660 * 30: not quite useless
 661 * 29..0: usage counter
 662 */
 663static inline u32 rt_score(struct rtable *rt)
 664{
 665        u32 score = jiffies - rt->u.dst.lastuse;
 666
 667        score = ~score & ~(3<<30);
 668
 669        if (rt_valuable(rt))
 670                score |= (1<<31);
 671
 672        if (!rt->fl.iif ||
 673            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 674                score |= (1<<30);
 675
 676        return score;
 677}
 678
 679static inline bool rt_caching(const struct net *net)
 680{
 681        return net->ipv4.current_rt_cache_rebuild_count <=
 682                net->ipv4.sysctl_rt_cache_rebuild_count;
 683}
 684
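    /*
     * compare_hash_inputs() checks only the fields that feed rt_hash()
     * (daddr, saddr and iif); compare_keys() below is the full flow-key
     * comparison used to decide whether two cache entries are
     * interchangeable.
     */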
 685static inline bool compare_hash_inputs(const struct flowi *fl1,
 686                                        const struct flowi *fl2)
 687{
 688        return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 689                (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
 690                (fl1->iif ^ fl2->iif)) == 0);
 691}
 692
 693static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 694{
 695        return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 696                (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
 697                (fl1->mark ^ fl2->mark) |
 698                (*(u16 *)&fl1->nl_u.ip4_u.tos ^
 699                 *(u16 *)&fl2->nl_u.ip4_u.tos) |
 700                (fl1->oif ^ fl2->oif) |
 701                (fl1->iif ^ fl2->iif)) == 0;
 702}
 703
 704static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 705{
 706        return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
 707}
 708
 709static inline int rt_is_expired(struct rtable *rth)
 710{
 711        return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
 712}
 713
 714/*
 715 * Perform a full scan of the hash table and free all entries.
 716 * Can be called by a softirq or a process.
 717 * In the latter case, we want to reschedule if necessary.
 718 */
 719static void rt_do_flush(int process_context)
 720{
 721        unsigned int i;
 722        struct rtable *rth, *next;
 723        struct rtable * tail;
 724
 725        for (i = 0; i <= rt_hash_mask; i++) {
 726                if (process_context && need_resched())
 727                        cond_resched();
 728                rth = rt_hash_table[i].chain;
 729                if (!rth)
 730                        continue;
 731
 732                spin_lock_bh(rt_hash_lock_addr(i));
 733#ifdef CONFIG_NET_NS
 734                {
 735                struct rtable ** prev, * p;
 736
 737                rth = rt_hash_table[i].chain;
 738
 739                /* defer releasing the head of the list until after spin_unlock */
 740                for (tail = rth; tail; tail = tail->u.dst.rt_next)
 741                        if (!rt_is_expired(tail))
 742                                break;
 743                if (rth != tail)
 744                        rt_hash_table[i].chain = tail;
 745
 746                /* call rt_free on entries after the tail requiring flush */
 747                prev = &rt_hash_table[i].chain;
 748                for (p = *prev; p; p = next) {
 749                        next = p->u.dst.rt_next;
 750                        if (!rt_is_expired(p)) {
 751                                prev = &p->u.dst.rt_next;
 752                        } else {
 753                                *prev = next;
 754                                rt_free(p);
 755                        }
 756                }
 757                }
 758#else
 759                rth = rt_hash_table[i].chain;
 760                rt_hash_table[i].chain = NULL;
 761                tail = NULL;
 762#endif
 763                spin_unlock_bh(rt_hash_lock_addr(i));
 764
 765                for (; rth != tail; rth = next) {
 766                        next = rth->u.dst.rt_next;
 767                        rt_free(rth);
 768                }
 769        }
 770}
 771
 772/*
 773 * While freeing expired entries, we compute average chain length
 774 * and standard deviation, using fixed-point arithmetic.
 775 * This gives an estimate of rt_chain_length_max:
 776 *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 777 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
 778 */
 779
 780#define FRACT_BITS 3
 781#define ONE (1UL << FRACT_BITS)
 782
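    /*
     * With FRACT_BITS = 3 each counted entry adds ONE = 8 to the per-bucket
     * length, so the average and standard deviation computed below are kept
     * in eighths of an entry; the final ">> FRACT_BITS" converts avg + 4*sd
     * back to whole entries before it is compared with ip_rt_gc_elasticity.
     */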
 783static void rt_check_expire(void)
 784{
 785        static unsigned int rover;
 786        unsigned int i = rover, goal;
 787        struct rtable *rth, *aux, **rthp;
 788        unsigned long samples = 0;
 789        unsigned long sum = 0, sum2 = 0;
 790        unsigned long delta;
 791        u64 mult;
 792
 793        delta = jiffies - expires_ljiffies;
 794        expires_ljiffies = jiffies;
 795        mult = ((u64)delta) << rt_hash_log;
 796        if (ip_rt_gc_timeout > 1)
 797                do_div(mult, ip_rt_gc_timeout);
 798        goal = (unsigned int)mult;
 799        if (goal > rt_hash_mask)
 800                goal = rt_hash_mask + 1;
 801        for (; goal > 0; goal--) {
 802                unsigned long tmo = ip_rt_gc_timeout;
 803                unsigned long length;
 804
 805                i = (i + 1) & rt_hash_mask;
 806                rthp = &rt_hash_table[i].chain;
 807
 808                if (need_resched())
 809                        cond_resched();
 810
 811                samples++;
 812
 813                if (*rthp == NULL)
 814                        continue;
 815                length = 0;
 816                spin_lock_bh(rt_hash_lock_addr(i));
 817                while ((rth = *rthp) != NULL) {
 818                        prefetch(rth->u.dst.rt_next);
 819                        if (rt_is_expired(rth)) {
 820                                *rthp = rth->u.dst.rt_next;
 821                                rt_free(rth);
 822                                continue;
 823                        }
 824                        if (rth->u.dst.expires) {
 825                                /* Entry is expired even if it is in use */
 826                                if (time_before_eq(jiffies, rth->u.dst.expires)) {
 827nofree:
 828                                        tmo >>= 1;
 829                                        rthp = &rth->u.dst.rt_next;
 830                                        /*
 831                                         * We only count entries on
 832                                         * a chain with equal hash inputs once
 833                                         * so that entries for different QOS
 834                                         * levels, and other non-hash input
 835                                         * attributes don't unfairly skew
 836                                         * the length computation
 837                                         */
 838                                        for (aux = rt_hash_table[i].chain;;) {
 839                                                if (aux == rth) {
 840                                                        length += ONE;
 841                                                        break;
 842                                                }
 843                                                if (compare_hash_inputs(&aux->fl, &rth->fl))
 844                                                        break;
 845                                                aux = aux->u.dst.rt_next;
 846                                        }
 847                                        continue;
 848                                }
 849                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
 850                                goto nofree;
 851
 852                        /* Cleanup aged off entries. */
 853                        *rthp = rth->u.dst.rt_next;
 854                        rt_free(rth);
 855                }
 856                spin_unlock_bh(rt_hash_lock_addr(i));
 857                sum += length;
 858                sum2 += length*length;
 859        }
 860        if (samples) {
 861                unsigned long avg = sum / samples;
 862                unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 863                rt_chain_length_max = max_t(unsigned long,
 864                                        ip_rt_gc_elasticity,
 865                                        (avg + 4*sd) >> FRACT_BITS);
 866        }
 867        rover = i;
 868}
 869
 870/*
 871 * rt_worker_func() is run in process context.
 872 * We call rt_check_expire() to scan part of the hash table.
 873 */
 874static void rt_worker_func(struct work_struct *work)
 875{
 876        rt_check_expire();
 877        schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 878}
 879
 880/*
 881 * Perturbation of rt_genid by a small quantity [1..256].
 882 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 883 * many times (2^24) without reusing a recent rt_genid.
 884 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
 885 */
 886static void rt_cache_invalidate(struct net *net)
 887{
 888        unsigned char shuffle;
 889
 890        get_random_bytes(&shuffle, sizeof(shuffle));
 891        atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 892}
 893
 894/*
 895 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 896 * delay >= 0 : invalidate & flush cache (can be long)
 897 */
 898void rt_cache_flush(struct net *net, int delay)
 899{
 900        rt_cache_invalidate(net);
 901        if (delay >= 0)
 902                rt_do_flush(!in_softirq());
 903}
 904
 905/*
 906 * We change rt_genid and let gc do the cleanup
 907 */
 908static void rt_secret_rebuild(unsigned long __net)
 909{
 910        struct net *net = (struct net *)__net;
 911        rt_cache_invalidate(net);
 912        mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
 913}
 914
 915static void rt_secret_rebuild_oneshot(struct net *net)
 916{
 917        del_timer_sync(&net->ipv4.rt_secret_timer);
 918        rt_cache_invalidate(net);
 919        if (ip_rt_secret_interval) {
 920                net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
 921                add_timer(&net->ipv4.rt_secret_timer);
 922        }
 923}
 924
 925static void rt_emergency_hash_rebuild(struct net *net)
 926{
 927        if (net_ratelimit()) {
 928                printk(KERN_WARNING "Route hash chain too long!\n");
 929                printk(KERN_WARNING "Adjust your secret_interval!\n");
 930        }
 931
 932        rt_secret_rebuild_oneshot(net);
 933}
 934
 935/*
 936   Short description of GC goals.
 937
 938   We want to build an algorithm which keeps the routing cache
 939   at some equilibrium point, where the number of aged-off entries
 940   is kept approximately equal to the number of newly generated ones.
 941
 942   The current expiration strength is the variable "expire".
 943   We try to adjust it dynamically, so that when networking
 944   is idle "expire" is large enough to keep enough warm entries,
 945   and when load increases it shrinks to limit the cache size.
 946 */
 947
 948static int rt_garbage_collect(struct dst_ops *ops)
 949{
 950        static unsigned long expire = RT_GC_TIMEOUT;
 951        static unsigned long last_gc;
 952        static int rover;
 953        static int equilibrium;
 954        struct rtable *rth, **rthp;
 955        unsigned long now = jiffies;
 956        int goal;
 957
 958        /*
 959         * Garbage collection is pretty expensive,
 960         * do not run it too frequently.
 961         */
 962
 963        RT_CACHE_STAT_INC(gc_total);
 964
 965        if (now - last_gc < ip_rt_gc_min_interval &&
 966            atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 967                RT_CACHE_STAT_INC(gc_ignored);
 968                goto out;
 969        }
 970
 971        /* Calculate the number of entries we want to expire now. */
 972        goal = atomic_read(&ipv4_dst_ops.entries) -
 973                (ip_rt_gc_elasticity << rt_hash_log);
 974        if (goal <= 0) {
 975                if (equilibrium < ipv4_dst_ops.gc_thresh)
 976                        equilibrium = ipv4_dst_ops.gc_thresh;
 977                goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 978                if (goal > 0) {
 979                        equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 980                        goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 981                }
 982        } else {
 983                /* We are in a dangerous area. Try to reduce the cache really
 984                 * aggressively.
 985                 */
 986                goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 987                equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 988        }
 989
 990        if (now - last_gc >= ip_rt_gc_min_interval)
 991                last_gc = now;
 992
 993        if (goal <= 0) {
 994                equilibrium += goal;
 995                goto work_done;
 996        }
 997
 998        do {
 999                int i, k;
1000
1001                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1002                        unsigned long tmo = expire;
1003
1004                        k = (k + 1) & rt_hash_mask;
1005                        rthp = &rt_hash_table[k].chain;
1006                        spin_lock_bh(rt_hash_lock_addr(k));
1007                        while ((rth = *rthp) != NULL) {
1008                                if (!rt_is_expired(rth) &&
1009                                        !rt_may_expire(rth, tmo, expire)) {
1010                                        tmo >>= 1;
1011                                        rthp = &rth->u.dst.rt_next;
1012                                        continue;
1013                                }
1014                                *rthp = rth->u.dst.rt_next;
1015                                rt_free(rth);
1016                                goal--;
1017                        }
1018                        spin_unlock_bh(rt_hash_lock_addr(k));
1019                        if (goal <= 0)
1020                                break;
1021                }
1022                rover = k;
1023
1024                if (goal <= 0)
1025                        goto work_done;
1026
1027                /* Goal is not achieved. We stop the process if:
1028
1029                   - expire has been reduced to zero; otherwise expire is halved.
1030                   - the table is not full.
1031                   - we are called from interrupt.
1032                   - the jiffies check is just a fallback/debug loop breaker;
1033                     we will not spin here for a long time in any case.
1034                 */
1035
1036                RT_CACHE_STAT_INC(gc_goal_miss);
1037
1038                if (expire == 0)
1039                        break;
1040
1041                expire >>= 1;
1042#if RT_CACHE_DEBUG >= 2
1043                printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1044                                atomic_read(&ipv4_dst_ops.entries), goal, i);
1045#endif
1046
1047                if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1048                        goto out;
1049        } while (!in_softirq() && time_before_eq(jiffies, now));
1050
1051        if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1052                goto out;
1053        if (net_ratelimit())
1054                printk(KERN_WARNING "dst cache overflow\n");
1055        RT_CACHE_STAT_INC(gc_dst_overflow);
1056        return 1;
1057
1058work_done:
1059        expire += ip_rt_gc_min_interval;
1060        if (expire > ip_rt_gc_timeout ||
1061            atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1062                expire = ip_rt_gc_timeout;
1063#if RT_CACHE_DEBUG >= 2
1064        printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1065                        atomic_read(&ipv4_dst_ops.entries), goal, rover);
1066#endif
1067out:    return 0;
1068}
1069
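    /*
     * Insert rt into the cache chain selected by 'hash'.  If an equivalent
     * entry already exists it is moved to the front and reused and the new
     * route is dropped; otherwise the lowest-scoring unreferenced entry is
     * evicted once the chain grows past ip_rt_gc_elasticity, and an
     * over-long chain with nothing to evict triggers an emergency rebuild.
     * Output and unicast-forward routes are also bound to an ARP neighbour,
     * retrying after a forced garbage collection if the neighbour table
     * is full.
     */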
1070static int rt_intern_hash(unsigned hash, struct rtable *rt,
1071                          struct rtable **rp, struct sk_buff *skb)
1072{
1073        struct rtable   *rth, **rthp;
1074        unsigned long   now;
1075        struct rtable *cand, **candp;
1076        u32             min_score;
1077        int             chain_length;
1078        int attempts = !in_softirq();
1079
1080restart:
1081        chain_length = 0;
1082        min_score = ~(u32)0;
1083        cand = NULL;
1084        candp = NULL;
1085        now = jiffies;
1086
1087        if (!rt_caching(dev_net(rt->u.dst.dev))) {
1088                /*
1089                 * If we're not caching, just tell the caller we
1090                 * were successful and don't touch the route.  The
1091                 * caller holds the sole reference to the cache entry, and
1092                 * it will be released when the caller is done with it.
1093                 * If we drop it here, the callers have no way to resolve routes
1094                 * when we're not caching.  Instead, just point *rp at rt, so
1095                 * the caller gets a single use out of the route.
1096                 * Note that we do rt_free on this new route entry, so that
1097                 * once its refcount hits zero, we are still able to reap it
1098                 * (Thanks Alexey)
1099                 * Note also that rt_free uses call_rcu.  We don't actually
1100                 * need rcu protection here, this is just our path to get
1101                 * on the route gc list.
1102                 */
1103
1104                if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1105                        int err = arp_bind_neighbour(&rt->u.dst);
1106                        if (err) {
1107                                if (net_ratelimit())
1108                                        printk(KERN_WARNING
1109                                            "Neighbour table failure & not caching routes.\n");
1110                                rt_drop(rt);
1111                                return err;
1112                        }
1113                }
1114
1115                rt_free(rt);
1116                goto skip_hashing;
1117        }
1118
1119        rthp = &rt_hash_table[hash].chain;
1120
1121        spin_lock_bh(rt_hash_lock_addr(hash));
1122        while ((rth = *rthp) != NULL) {
1123                if (rt_is_expired(rth)) {
1124                        *rthp = rth->u.dst.rt_next;
1125                        rt_free(rth);
1126                        continue;
1127                }
1128                if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1129                        /* Put it first */
1130                        *rthp = rth->u.dst.rt_next;
1131                        /*
1132                         * Since lookup is lockfree, the deletion
1133                         * must be visible to another weakly ordered CPU before
1134                         * the insertion at the start of the hash chain.
1135                         */
1136                        rcu_assign_pointer(rth->u.dst.rt_next,
1137                                           rt_hash_table[hash].chain);
1138                        /*
1139                         * Since lookup is lockfree, the update writes
1140                         * must be ordered for consistency on SMP.
1141                         */
1142                        rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1143
1144                        dst_use(&rth->u.dst, now);
1145                        spin_unlock_bh(rt_hash_lock_addr(hash));
1146
1147                        rt_drop(rt);
1148                        if (rp)
1149                                *rp = rth;
1150                        else
1151                                skb_dst_set(skb, &rth->u.dst);
1152                        return 0;
1153                }
1154
1155                if (!atomic_read(&rth->u.dst.__refcnt)) {
1156                        u32 score = rt_score(rth);
1157
1158                        if (score <= min_score) {
1159                                cand = rth;
1160                                candp = rthp;
1161                                min_score = score;
1162                        }
1163                }
1164
1165                chain_length++;
1166
1167                rthp = &rth->u.dst.rt_next;
1168        }
1169
1170        if (cand) {
1171                /* ip_rt_gc_elasticity used to be the average chain length;
1172                 * when it is exceeded, gc becomes really aggressive.
1173                 *
1174                 * The second limit is less certain. At the moment it allows
1175                 * only 2 entries per bucket. We will see.
1176                 */
1177                if (chain_length > ip_rt_gc_elasticity) {
1178                        *candp = cand->u.dst.rt_next;
1179                        rt_free(cand);
1180                }
1181        } else {
1182                if (chain_length > rt_chain_length_max) {
1183                        struct net *net = dev_net(rt->u.dst.dev);
1184                        int num = ++net->ipv4.current_rt_cache_rebuild_count;
1185                        if (!rt_caching(dev_net(rt->u.dst.dev))) {
1186                                printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1187                                        rt->u.dst.dev->name, num);
1188                        }
1189                        rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
1190                }
1191        }
1192
1193        /* Try to bind the route to an ARP neighbour only if it is an
1194           output route or on the unicast forwarding path.
1195         */
1196        if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1197                int err = arp_bind_neighbour(&rt->u.dst);
1198                if (err) {
1199                        spin_unlock_bh(rt_hash_lock_addr(hash));
1200
1201                        if (err != -ENOBUFS) {
1202                                rt_drop(rt);
1203                                return err;
1204                        }
1205
1206                        /* The neighbour table is full and nothing
1207                           can be released. Try to shrink the route cache;
1208                           it most likely holds some neighbour records.
1209                         */
1210                        if (attempts-- > 0) {
1211                                int saved_elasticity = ip_rt_gc_elasticity;
1212                                int saved_int = ip_rt_gc_min_interval;
1213                                ip_rt_gc_elasticity     = 1;
1214                                ip_rt_gc_min_interval   = 0;
1215                                rt_garbage_collect(&ipv4_dst_ops);
1216                                ip_rt_gc_min_interval   = saved_int;
1217                                ip_rt_gc_elasticity     = saved_elasticity;
1218                                goto restart;
1219                        }
1220
1221                        if (net_ratelimit())
1222                                printk(KERN_WARNING "Neighbour table overflow.\n");
1223                        rt_drop(rt);
1224                        return -ENOBUFS;
1225                }
1226        }
1227
1228        rt->u.dst.rt_next = rt_hash_table[hash].chain;
1229
1230#if RT_CACHE_DEBUG >= 2
1231        if (rt->u.dst.rt_next) {
1232                struct rtable *trt;
1233                printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1234                       hash, &rt->rt_dst);
1235                for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1236                        printk(" . %pI4", &trt->rt_dst);
1237                printk("\n");
1238        }
1239#endif
1240        /*
1241         * Since lookup is lockfree, we must make sure
1242         * previous writes to rt are committed to memory
1243         * before making rt visible to other CPUs.
1244         */
1245        rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1246
1247        spin_unlock_bh(rt_hash_lock_addr(hash));
1248
1249skip_hashing:
1250        if (rp)
1251                *rp = rt;
1252        else
1253                skb_dst_set(skb, &rt->u.dst);
1254        return 0;
1255}
1256
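    /*
     * Attach the inet_peer entry for rt->rt_dst to the route.  A global
     * spinlock resolves the race between two CPUs binding the same route;
     * the loser drops its extra peer reference.
     */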
1257void rt_bind_peer(struct rtable *rt, int create)
1258{
1259        static DEFINE_SPINLOCK(rt_peer_lock);
1260        struct inet_peer *peer;
1261
1262        peer = inet_getpeer(rt->rt_dst, create);
1263
1264        spin_lock_bh(&rt_peer_lock);
1265        if (rt->peer == NULL) {
1266                rt->peer = peer;
1267                peer = NULL;
1268        }
1269        spin_unlock_bh(&rt_peer_lock);
1270        if (peer)
1271                inet_putpeer(peer);
1272}
1273
1274/*
1275 * Peer allocation may fail only in serious out-of-memory conditions.  However
1276 * we can still generate some output.
1277 * Random ID selection looks a bit dangerous because we have no chance of
1278 * selecting an ID that is unique within a reasonable period of time.
1279 * But a broken packet identifier may be better than no packet at all.
1280 */
1281static void ip_select_fb_ident(struct iphdr *iph)
1282{
1283        static DEFINE_SPINLOCK(ip_fb_id_lock);
1284        static u32 ip_fallback_id;
1285        u32 salt;
1286
1287        spin_lock_bh(&ip_fb_id_lock);
1288        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1289        iph->id = htons(salt & 0xFFFF);
1290        ip_fallback_id = salt;
1291        spin_unlock_bh(&ip_fb_id_lock);
1292}
1293
1294void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1295{
1296        struct rtable *rt = (struct rtable *) dst;
1297
1298        if (rt) {
1299                if (rt->peer == NULL)
1300                        rt_bind_peer(rt, 1);
1301
1302                /* If a peer is attached to the destination, it is never detached,
1303                   so we need not grab a lock to dereference it.
1304                 */
1305                if (rt->peer) {
1306                        iph->id = htons(inet_getid(rt->peer, more));
1307                        return;
1308                }
1309        } else
1310                printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1311                       __builtin_return_address(0));
1312
1313        ip_select_fb_ident(iph);
1314}
1315
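    /*
     * Remove rt from the chain selected by 'hash', dropping the caller's
     * reference; any expired entries met during the walk are freed as well.
     */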
1316static void rt_del(unsigned hash, struct rtable *rt)
1317{
1318        struct rtable **rthp, *aux;
1319
1320        rthp = &rt_hash_table[hash].chain;
1321        spin_lock_bh(rt_hash_lock_addr(hash));
1322        ip_rt_put(rt);
1323        while ((aux = *rthp) != NULL) {
1324                if (aux == rt || rt_is_expired(aux)) {
1325                        *rthp = aux->u.dst.rt_next;
1326                        rt_free(aux);
1327                        continue;
1328                }
1329                rthp = &aux->u.dst.rt_next;
1330        }
1331        spin_unlock_bh(rt_hash_lock_addr(hash));
1332}
1333
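    /*
     * Handle an ICMP redirect from old_gw: for every cached output route to
     * daddr via the old gateway on this device, clone the entry with
     * rt_gateway set to new_gw, bind it to a valid neighbour and replace the
     * old entry in the hash table.  Obviously bogus redirects (multicast,
     * broadcast or zero gateways, or gateways that are not on-link) are
     * rejected.
     */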
1334void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1335                    __be32 saddr, struct net_device *dev)
1336{
1337        int i, k;
1338        struct in_device *in_dev = in_dev_get(dev);
1339        struct rtable *rth, **rthp;
1340        __be32  skeys[2] = { saddr, 0 };
1341        int  ikeys[2] = { dev->ifindex, 0 };
1342        struct netevent_redirect netevent;
1343        struct net *net;
1344
1345        if (!in_dev)
1346                return;
1347
1348        net = dev_net(dev);
1349        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1350            || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1351            || ipv4_is_zeronet(new_gw))
1352                goto reject_redirect;
1353
1354        if (!rt_caching(net))
1355                goto reject_redirect;
1356
1357        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1358                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1359                        goto reject_redirect;
1360                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1361                        goto reject_redirect;
1362        } else {
1363                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1364                        goto reject_redirect;
1365        }
1366
1367        for (i = 0; i < 2; i++) {
1368                for (k = 0; k < 2; k++) {
1369                        unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1370                                                rt_genid(net));
1371
1372                        rthp=&rt_hash_table[hash].chain;
1373
1374                        rcu_read_lock();
1375                        while ((rth = rcu_dereference(*rthp)) != NULL) {
1376                                struct rtable *rt;
1377
1378                                if (rth->fl.fl4_dst != daddr ||
1379                                    rth->fl.fl4_src != skeys[i] ||
1380                                    rth->fl.oif != ikeys[k] ||
1381                                    rth->fl.iif != 0 ||
1382                                    rt_is_expired(rth) ||
1383                                    !net_eq(dev_net(rth->u.dst.dev), net)) {
1384                                        rthp = &rth->u.dst.rt_next;
1385                                        continue;
1386                                }
1387
1388                                if (rth->rt_dst != daddr ||
1389                                    rth->rt_src != saddr ||
1390                                    rth->u.dst.error ||
1391                                    rth->rt_gateway != old_gw ||
1392                                    rth->u.dst.dev != dev)
1393                                        break;
1394
1395                                dst_hold(&rth->u.dst);
1396                                rcu_read_unlock();
1397
1398                                rt = dst_alloc(&ipv4_dst_ops);
1399                                if (rt == NULL) {
1400                                        ip_rt_put(rth);
1401                                        in_dev_put(in_dev);
1402                                        return;
1403                                }
1404
1405                                /* Copy all the information. */
1406                                *rt = *rth;
1407                                rt->u.dst.__use         = 1;
1408                                atomic_set(&rt->u.dst.__refcnt, 1);
1409                                rt->u.dst.child         = NULL;
1410                                if (rt->u.dst.dev)
1411                                        dev_hold(rt->u.dst.dev);
1412                                if (rt->idev)
1413                                        in_dev_hold(rt->idev);
1414                                rt->u.dst.obsolete      = 0;
1415                                rt->u.dst.lastuse       = jiffies;
1416                                rt->u.dst.path          = &rt->u.dst;
1417                                rt->u.dst.neighbour     = NULL;
1418                                rt->u.dst.hh            = NULL;
1419#ifdef CONFIG_XFRM
1420                                rt->u.dst.xfrm          = NULL;
1421#endif
1422                                rt->rt_genid            = rt_genid(net);
1423                                rt->rt_flags            |= RTCF_REDIRECTED;
1424
1425                                /* Gateway is different ... */
1426                                rt->rt_gateway          = new_gw;
1427
1428                                /* Redirect received -> path was valid */
1429                                dst_confirm(&rth->u.dst);
1430
1431                                if (rt->peer)
1432                                        atomic_inc(&rt->peer->refcnt);
1433
1434                                if (arp_bind_neighbour(&rt->u.dst) ||
1435                                    !(rt->u.dst.neighbour->nud_state &
1436                                            NUD_VALID)) {
1437                                        if (rt->u.dst.neighbour)
1438                                                neigh_event_send(rt->u.dst.neighbour, NULL);
1439                                        ip_rt_put(rth);
1440                                        rt_drop(rt);
1441                                        goto do_next;
1442                                }
1443
1444                                netevent.old = &rth->u.dst;
1445                                netevent.new = &rt->u.dst;
1446                                call_netevent_notifiers(NETEVENT_REDIRECT,
1447                                                        &netevent);
1448
1449                                rt_del(hash, rth);
1450                                if (!rt_intern_hash(hash, rt, &rt, NULL))
1451                                        ip_rt_put(rt);
1452                                goto do_next;
1453                        }
1454                        rcu_read_unlock();
1455                do_next:
1456                        ;
1457                }
1458        }
1459        in_dev_put(in_dev);
1460        return;
1461
1462reject_redirect:
1463#ifdef CONFIG_IP_ROUTE_VERBOSE
1464        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1465                printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1466                        "  Advised path = %pI4 -> %pI4\n",
1467                       &old_gw, dev->name, &new_gw,
1468                       &saddr, &daddr);
1469#endif
1470        in_dev_put(in_dev);
1471}
1472
1473static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1474{
1475        struct rtable *rt = (struct rtable *)dst;
1476        struct dst_entry *ret = dst;
1477
1478        if (rt) {
1479                if (dst->obsolete) {
1480                        ip_rt_put(rt);
1481                        ret = NULL;
1482                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1483                           rt->u.dst.expires) {
1484                        unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1485                                                rt->fl.oif,
1486                                                rt_genid(dev_net(dst->dev)));
1487#if RT_CACHE_DEBUG >= 1
1488                        printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1489                                &rt->rt_dst, rt->fl.fl4_tos);
1490#endif
1491                        rt_del(hash, rt);
1492                        ret = NULL;
1493                }
1494        }
1495        return ret;
1496}
1497
1498/*
1499 * Algorithm:
1500 *      1. The first ip_rt_redirect_number redirects are sent
1501 *         with exponential backoff, then we stop sending them at all,
1502 *         assuming that the host ignores our redirects.
1503 *      2. If we did not see packets requiring redirects
1504 *         during ip_rt_redirect_silence, we assume that the host
1505 *         has forgotten the redirected route and start sending redirects again.
1506 *
1507 * This algorithm is much cheaper and more intelligent than dumb load limiting
1508 * in icmp.c.
1509 *
1510 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1511 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1512 */
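/*
 * Editorial sketch (not part of route.c): a minimal user-space model of the
 * backoff described above, mirroring the rate_last/rate_tokens bookkeeping
 * that ip_rt_send_redirect() keeps in the dst.  The load/number/silence
 * parameters stand in for the ip_rt_redirect_* sysctls (illustrative names,
 * assumed semantics).
 */
#include <stdbool.h>

struct redirect_state {
        unsigned long rate_last;        /* time the last redirect was sent */
        unsigned long rate_tokens;      /* redirects sent and (so far) ignored */
};

static bool should_send_redirect(struct redirect_state *st, unsigned long now,
                                 unsigned long load, unsigned long number,
                                 unsigned long silence)
{
        if (now - st->rate_last > silence)      /* host seems to have forgotten */
                st->rate_tokens = 0;

        if (st->rate_tokens >= number) {        /* host ignores us; stop entirely */
                st->rate_last = now;
                return false;
        }

        /* exponential backoff: load, 2*load, 4*load, ... between redirects */
        if (st->rate_tokens == 0 ||
            now - st->rate_last > (load << st->rate_tokens)) {
                st->rate_last = now;
                st->rate_tokens++;
                return true;
        }
        return false;
}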
1513
1514void ip_rt_send_redirect(struct sk_buff *skb)
1515{
1516        struct rtable *rt = skb_rtable(skb);
1517        struct in_device *in_dev;
1518        int log_martians;
1519
1520        rcu_read_lock();
1521        in_dev = __in_dev_get_rcu(rt->u.dst.dev);
1522        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1523                rcu_read_unlock();
1524                return;
1525        }
1526        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1527        rcu_read_unlock();
1528
1529        /* No redirected packets during ip_rt_redirect_silence;
1530         * reset the algorithm.
1531         */
1532        if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1533                rt->u.dst.rate_tokens = 0;
1534
1535        /* Too many ignored redirects; do not send anything.
1536         * Set u.dst.rate_last to the last seen redirected packet.
1537         */
1538        if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1539                rt->u.dst.rate_last = jiffies;
1540                return;
1541        }
1542
1543        /* Check for load limit; set rate_last to the latest sent
1544         * redirect.
1545         */
1546        if (rt->u.dst.rate_tokens == 0 ||
1547            time_after(jiffies,
1548                       (rt->u.dst.rate_last +
1549                        (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1550                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1551                rt->u.dst.rate_last = jiffies;
1552                ++rt->u.dst.rate_tokens;
1553#ifdef CONFIG_IP_ROUTE_VERBOSE
1554                if (log_martians &&
1555                    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1556                    net_ratelimit())
1557                        printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1558                                &rt->rt_src, rt->rt_iif,
1559                                &rt->rt_dst, &rt->rt_gateway);
1560#endif
1561        }
1562}
1563
1564static int ip_error(struct sk_buff *skb)
1565{
1566        struct rtable *rt = skb_rtable(skb);
1567        unsigned long now;
1568        int code;
1569
1570        switch (rt->u.dst.error) {
1571                case EINVAL:
1572                default:
1573                        goto out;
1574                case EHOSTUNREACH:
1575                        code = ICMP_HOST_UNREACH;
1576                        break;
1577                case ENETUNREACH:
1578                        code = ICMP_NET_UNREACH;
1579                        IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1580                                        IPSTATS_MIB_INNOROUTES);
1581                        break;
1582                case EACCES:
1583                        code = ICMP_PKT_FILTERED;
1584                        break;
1585        }
1586
1587        now = jiffies;
1588        rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1589        if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1590                rt->u.dst.rate_tokens = ip_rt_error_burst;
1591        rt->u.dst.rate_last = now;
1592        if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1593                rt->u.dst.rate_tokens -= ip_rt_error_cost;
1594                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1595        }
1596
1597out:    kfree_skb(skb);
1598        return 0;
1599}
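/*
 * Editorial sketch (not part of route.c): the limiter above is a token bucket
 * in which elapsed jiffies are the tokens.  A self-contained model; burst and
 * cost stand in for the ip_rt_error_burst and ip_rt_error_cost sysctls.
 */
#include <stdbool.h>

struct err_bucket {
        unsigned long tokens;
        unsigned long last;
};

static bool may_send_icmp_error(struct err_bucket *b, unsigned long now,
                                unsigned long burst, unsigned long cost)
{
        b->tokens += now - b->last;     /* earn one token per elapsed tick */
        if (b->tokens > burst)          /* never accumulate more than a burst */
                b->tokens = burst;
        b->last = now;
        if (b->tokens >= cost) {        /* enough credit to emit one error */
                b->tokens -= cost;
                return true;
        }
        return false;
}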
1600
1601/*
1602 *      The last two values are not from the RFC but
1603 *      are needed for AMPRnet AX.25 paths.
1604 */
1605
1606static const unsigned short mtu_plateau[] =
1607{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1608
1609static inline unsigned short guess_mtu(unsigned short old_mtu)
1610{
1611        int i;
1612
1613        for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1614                if (old_mtu > mtu_plateau[i])
1615                        return mtu_plateau[i];
1616        return 68;
1617}
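/*
 * Editorial illustration (not part of route.c): guess_mtu() picks the first
 * plateau strictly below the failing packet length, e.g.:
 *
 *      guess_mtu(1500) == 1492         (Ethernet packet squeezed by a PPPoE-sized hop)
 *      guess_mtu(1492) == 576
 *      guess_mtu(576)  == 296
 *      guess_mtu(100)  == 68           (floor: the minimal IPv4 MTU)
 */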
1618
1619unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1620                                 unsigned short new_mtu,
1621                                 struct net_device *dev)
1622{
1623        int i, k;
1624        unsigned short old_mtu = ntohs(iph->tot_len);
1625        struct rtable *rth;
1626        int  ikeys[2] = { dev->ifindex, 0 };
1627        __be32  skeys[2] = { iph->saddr, 0, };
1628        __be32  daddr = iph->daddr;
1629        unsigned short est_mtu = 0;
1630
1631        if (ipv4_config.no_pmtu_disc)
1632                return 0;
1633
1634        for (k = 0; k < 2; k++) {
1635                for (i = 0; i < 2; i++) {
1636                        unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1637                                                rt_genid(net));
1638
1639                        rcu_read_lock();
1640                        for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1641                             rth = rcu_dereference(rth->u.dst.rt_next)) {
1642                                unsigned short mtu = new_mtu;
1643
1644                                if (rth->fl.fl4_dst != daddr ||
1645                                    rth->fl.fl4_src != skeys[i] ||
1646                                    rth->rt_dst != daddr ||
1647                                    rth->rt_src != iph->saddr ||
1648                                    rth->fl.oif != ikeys[k] ||
1649                                    rth->fl.iif != 0 ||
1650                                    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1651                                    !net_eq(dev_net(rth->u.dst.dev), net) ||
1652                                    rt_is_expired(rth))
1653                                        continue;
1654
1655                                if (new_mtu < 68 || new_mtu >= old_mtu) {
1656
1657                                        /* BSD 4.2 compatibility hack :-( */
1658                                        if (mtu == 0 &&
1659                                            old_mtu >= dst_mtu(&rth->u.dst) &&
1660                                            old_mtu >= 68 + (iph->ihl << 2))
1661                                                old_mtu -= iph->ihl << 2;
1662
1663                                        mtu = guess_mtu(old_mtu);
1664                                }
1665                                if (mtu <= dst_mtu(&rth->u.dst)) {
1666                                        if (mtu < dst_mtu(&rth->u.dst)) {
1667                                                dst_confirm(&rth->u.dst);
1668                                                if (mtu < ip_rt_min_pmtu) {
1669                                                        mtu = ip_rt_min_pmtu;
1670                                                        rth->u.dst.metrics[RTAX_LOCK-1] |=
1671                                                                (1 << RTAX_MTU);
1672                                                }
1673                                                rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1674                                                dst_set_expires(&rth->u.dst,
1675                                                        ip_rt_mtu_expires);
1676                                        }
1677                                        est_mtu = mtu;
1678                                }
1679                        }
1680                        rcu_read_unlock();
1681                }
1682        }
1683        return est_mtu ? : new_mtu;
1684}
1685
1686static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1687{
1688        if (dst_mtu(dst) > mtu && mtu >= 68 &&
1689            !(dst_metric_locked(dst, RTAX_MTU))) {
1690                if (mtu < ip_rt_min_pmtu) {
1691                        mtu = ip_rt_min_pmtu;
1692                        dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1693                }
1694                dst->metrics[RTAX_MTU-1] = mtu;
1695                dst_set_expires(dst, ip_rt_mtu_expires);
1696                call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1697        }
1698}
1699
1700static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1701{
1702        return NULL;
1703}
1704
1705static void ipv4_dst_destroy(struct dst_entry *dst)
1706{
1707        struct rtable *rt = (struct rtable *) dst;
1708        struct inet_peer *peer = rt->peer;
1709        struct in_device *idev = rt->idev;
1710
1711        if (peer) {
1712                rt->peer = NULL;
1713                inet_putpeer(peer);
1714        }
1715
1716        if (idev) {
1717                rt->idev = NULL;
1718                in_dev_put(idev);
1719        }
1720}
1721
1722static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1723                            int how)
1724{
1725        struct rtable *rt = (struct rtable *) dst;
1726        struct in_device *idev = rt->idev;
1727        if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1728                struct in_device *loopback_idev =
1729                        in_dev_get(dev_net(dev)->loopback_dev);
1730                if (loopback_idev) {
1731                        rt->idev = loopback_idev;
1732                        in_dev_put(idev);
1733                }
1734        }
1735}
1736
1737static void ipv4_link_failure(struct sk_buff *skb)
1738{
1739        struct rtable *rt;
1740
1741        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1742
1743        rt = skb_rtable(skb);
1744        if (rt)
1745                dst_set_expires(&rt->u.dst, 0);
1746}
1747
1748static int ip_rt_bug(struct sk_buff *skb)
1749{
1750        printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1751                &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1752                skb->dev ? skb->dev->name : "?");
1753        kfree_skb(skb);
1754        return 0;
1755}
1756
1757/*
1758   We do not cache the source address of the outgoing interface,
1759   because it is used only by the IP RR, TS and SRR options,
1760   so it is out of the fast path.
1761
1762   BTW remember: "addr" is allowed to be unaligned
1763   in IP options!
1764 */
1765
1766void ip_rt_get_source(u8 *addr, struct rtable *rt)
1767{
1768        __be32 src;
1769        struct fib_result res;
1770
1771        if (rt->fl.iif == 0)
1772                src = rt->rt_src;
1773        else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1774                src = FIB_RES_PREFSRC(res);
1775                fib_res_put(&res);
1776        } else
1777                src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1778                                        RT_SCOPE_UNIVERSE);
1779        memcpy(addr, &src, 4);
1780}
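/*
 * Editorial sketch (not part of route.c): why the memcpy() above matters.
 * The option pointer is byte-granular, so a direct 32-bit store through a
 * cast pointer could be an unaligned access; copying byte-wise is safe at
 * any alignment.  The helper name is purely illustrative.
 */
#include <string.h>

static void store_addr_unaligned(unsigned char *addr, unsigned int val)
{
        /* *(unsigned int *)addr = val;  <-- may fault: addr can be unaligned */
        memcpy(addr, &val, sizeof(val)); /* safe regardless of alignment */
}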
1781
1782#ifdef CONFIG_NET_CLS_ROUTE
1783static void set_class_tag(struct rtable *rt, u32 tag)
1784{
1785        if (!(rt->u.dst.tclassid & 0xFFFF))
1786                rt->u.dst.tclassid |= tag & 0xFFFF;
1787        if (!(rt->u.dst.tclassid & 0xFFFF0000))
1788                rt->u.dst.tclassid |= tag & 0xFFFF0000;
1789}
1790#endif
1791
1792static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1793{
1794        struct fib_info *fi = res->fi;
1795
1796        if (fi) {
1797                if (FIB_RES_GW(*res) &&
1798                    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1799                        rt->rt_gateway = FIB_RES_GW(*res);
1800                memcpy(rt->u.dst.metrics, fi->fib_metrics,
1801                       sizeof(rt->u.dst.metrics));
1802                if (fi->fib_mtu == 0) {
1803                        rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1804                        if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1805                            rt->rt_gateway != rt->rt_dst &&
1806                            rt->u.dst.dev->mtu > 576)
1807                                rt->u.dst.metrics[RTAX_MTU-1] = 576;
1808                }
1809#ifdef CONFIG_NET_CLS_ROUTE
1810                rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1811#endif
1812        } else
1813                rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1814
1815        if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1816                rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1817        if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1818                rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1819        if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1820                rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1821                                       ip_rt_min_advmss);
1822        if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1823                rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1824
1825#ifdef CONFIG_NET_CLS_ROUTE
1826#ifdef CONFIG_IP_MULTIPLE_TABLES
1827        set_class_tag(rt, fib_rules_tclass(res));
1828#endif
1829        set_class_tag(rt, itag);
1830#endif
1831        rt->rt_type = res->type;
1832}
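/*
 * Editorial sketch (not part of route.c): the advertised-MSS default applied
 * by rt_set_nexthop() above, written as a pure function.  The 40 is a minimal
 * IPv4 + TCP header pair; min_advmss stands in for the ip_rt_min_advmss
 * sysctl.  A 1500-byte Ethernet MTU yields 1460, for example.
 */
static unsigned int default_advmss(unsigned int dev_mtu, unsigned int min_advmss)
{
        unsigned int advmss = dev_mtu - 40;

        if (advmss < min_advmss)
                advmss = min_advmss;            /* never advertise a tiny MSS */
        if (advmss > 65535 - 40)
                advmss = 65535 - 40;            /* cap at the maximum IPv4 payload */
        return advmss;
}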
1833
1834static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1835                                u8 tos, struct net_device *dev, int our)
1836{
1837        unsigned hash;
1838        struct rtable *rth;
1839        __be32 spec_dst;
1840        struct in_device *in_dev = in_dev_get(dev);
1841        u32 itag = 0;
1842
1843        /* Primary sanity checks. */
1844
1845        if (in_dev == NULL)
1846                return -EINVAL;
1847
1848        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1849            ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1850                goto e_inval;
1851
1852        if (ipv4_is_zeronet(saddr)) {
1853                if (!ipv4_is_local_multicast(daddr))
1854                        goto e_inval;
1855                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1856        } else if (fib_validate_source(saddr, 0, tos, 0,
1857                                        dev, &spec_dst, &itag, 0) < 0)
1858                goto e_inval;
1859
1860        rth = dst_alloc(&ipv4_dst_ops);
1861        if (!rth)
1862                goto e_nobufs;
1863
1864        rth->u.dst.output= ip_rt_bug;
1865
1866        atomic_set(&rth->u.dst.__refcnt, 1);
1867        rth->u.dst.flags= DST_HOST;
1868        if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1869                rth->u.dst.flags |= DST_NOPOLICY;
1870        rth->fl.fl4_dst = daddr;
1871        rth->rt_dst     = daddr;
1872        rth->fl.fl4_tos = tos;
1873        rth->fl.mark    = skb->mark;
1874        rth->fl.fl4_src = saddr;
1875        rth->rt_src     = saddr;
1876#ifdef CONFIG_NET_CLS_ROUTE
1877        rth->u.dst.tclassid = itag;
1878#endif
1879        rth->rt_iif     =
1880        rth->fl.iif     = dev->ifindex;
1881        rth->u.dst.dev  = init_net.loopback_dev;
1882        dev_hold(rth->u.dst.dev);
1883        rth->idev       = in_dev_get(rth->u.dst.dev);
1884        rth->fl.oif     = 0;
1885        rth->rt_gateway = daddr;
1886        rth->rt_spec_dst= spec_dst;
1887        rth->rt_genid   = rt_genid(dev_net(dev));
1888        rth->rt_flags   = RTCF_MULTICAST;
1889        rth->rt_type    = RTN_MULTICAST;
1890        if (our) {
1891                rth->u.dst.input= ip_local_deliver;
1892                rth->rt_flags |= RTCF_LOCAL;
1893        }
1894
1895#ifdef CONFIG_IP_MROUTE
1896        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1897                rth->u.dst.input = ip_mr_input;
1898#endif
1899        RT_CACHE_STAT_INC(in_slow_mc);
1900
1901        in_dev_put(in_dev);
1902        hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1903        return rt_intern_hash(hash, rth, NULL, skb);
1904
1905e_nobufs:
1906        in_dev_put(in_dev);
1907        return -ENOBUFS;
1908
1909e_inval:
1910        in_dev_put(in_dev);
1911        return -EINVAL;
1912}
1913
1914
1915static void ip_handle_martian_source(struct net_device *dev,
1916                                     struct in_device *in_dev,
1917                                     struct sk_buff *skb,
1918                                     __be32 daddr,
1919                                     __be32 saddr)
1920{
1921        RT_CACHE_STAT_INC(in_martian_src);
1922#ifdef CONFIG_IP_ROUTE_VERBOSE
1923        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1924                /*
1925                 *      RFC 1812 recommendation: if the source is martian,
1926                 *      the only hint is the MAC header.
1927                 */
1928                printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1929                        &daddr, &saddr, dev->name);
1930                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1931                        int i;
1932                        const unsigned char *p = skb_mac_header(skb);
1933                        printk(KERN_WARNING "ll header: ");
1934                        for (i = 0; i < dev->hard_header_len; i++, p++) {
1935                                printk("%02x", *p);
1936                                if (i < (dev->hard_header_len - 1))
1937                                        printk(":");
1938                        }
1939                        printk("\n");
1940                }
1941        }
1942#endif
1943}
1944
1945static int __mkroute_input(struct sk_buff *skb,
1946                           struct fib_result *res,
1947                           struct in_device *in_dev,
1948                           __be32 daddr, __be32 saddr, u32 tos,
1949                           struct rtable **result)
1950{
1951
1952        struct rtable *rth;
1953        int err;
1954        struct in_device *out_dev;
1955        unsigned flags = 0;
1956        __be32 spec_dst;
1957        u32 itag;
1958
1959        /* get a working reference to the output device */
1960        out_dev = in_dev_get(FIB_RES_DEV(*res));
1961        if (out_dev == NULL) {
1962                if (net_ratelimit())
1963                        printk(KERN_CRIT "Bug in ip_route_input" \
1964                               "_slow(). Please report.\n");
1965                return -EINVAL;
1966        }
1967
1968
1969        err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1970                                  in_dev->dev, &spec_dst, &itag, skb->mark);
1971        if (err < 0) {
1972                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1973                                         saddr);
1974
1975                err = -EINVAL;
1976                goto cleanup;
1977        }
1978
1979        if (err)
1980                flags |= RTCF_DIRECTSRC;
1981
1982        if (out_dev == in_dev && err &&
1983            (IN_DEV_SHARED_MEDIA(out_dev) ||
1984             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1985                flags |= RTCF_DOREDIRECT;
1986
1987        if (skb->protocol != htons(ETH_P_IP)) {
1988                /* Not IP (i.e. ARP). Do not create a route if it is
1989                 * invalid for proxy ARP. DNAT routes are always valid.
1990                 */
1991                if (out_dev == in_dev) {
1992                        err = -EINVAL;
1993                        goto cleanup;
1994                }
1995        }
1996
1997
1998        rth = dst_alloc(&ipv4_dst_ops);
1999        if (!rth) {
2000                err = -ENOBUFS;
2001                goto cleanup;
2002        }
2003
2004        atomic_set(&rth->u.dst.__refcnt, 1);
2005        rth->u.dst.flags= DST_HOST;
2006        if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2007                rth->u.dst.flags |= DST_NOPOLICY;
2008        if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2009                rth->u.dst.flags |= DST_NOXFRM;
2010        rth->fl.fl4_dst = daddr;
2011        rth->rt_dst     = daddr;
2012        rth->fl.fl4_tos = tos;
2013        rth->fl.mark    = skb->mark;
2014        rth->fl.fl4_src = saddr;
2015        rth->rt_src     = saddr;
2016        rth->rt_gateway = daddr;
2017        rth->rt_iif     =
2018                rth->fl.iif     = in_dev->dev->ifindex;
2019        rth->u.dst.dev  = (out_dev)->dev;
2020        dev_hold(rth->u.dst.dev);
2021        rth->idev       = in_dev_get(rth->u.dst.dev);
2022        rth->fl.oif     = 0;
2023        rth->rt_spec_dst= spec_dst;
2024
2025        rth->u.dst.input = ip_forward;
2026        rth->u.dst.output = ip_output;
2027        rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
2028
2029        rt_set_nexthop(rth, res, itag);
2030
2031        rth->rt_flags = flags;
2032
2033        *result = rth;
2034        err = 0;
2035 cleanup:
2036        /* release the working reference to the output device */
2037        in_dev_put(out_dev);
2038        return err;
2039}
2040
2041static int ip_mkroute_input(struct sk_buff *skb,
2042                            struct fib_result *res,
2043                            const struct flowi *fl,
2044                            struct in_device *in_dev,
2045                            __be32 daddr, __be32 saddr, u32 tos)
2046{
2047        struct rtable* rth = NULL;
2048        int err;
2049        unsigned hash;
2050
2051#ifdef CONFIG_IP_ROUTE_MULTIPATH
2052        if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2053                fib_select_multipath(fl, res);
2054#endif
2055
2056        /* create a routing cache entry */
2057        err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2058        if (err)
2059                return err;
2060
2061        /* put it into the cache */
2062        hash = rt_hash(daddr, saddr, fl->iif,
2063                       rt_genid(dev_net(rth->u.dst.dev)));
2064        return rt_intern_hash(hash, rth, NULL, skb);
2065}
2066
2067/*
2068 *      NOTE. We drop all packets that have local source
2069 *      addresses, because every properly looped-back packet
2070 *      must already have the correct destination attached by the output routine.
2071 *
2072 *      Such an approach solves two big problems:
2073 *      1. Non-simplex devices are handled properly.
2074 *      2. IP spoofing attempts are filtered with a 100% guarantee.
2075 */
2076
2077static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2078                               u8 tos, struct net_device *dev)
2079{
2080        struct fib_result res;
2081        struct in_device *in_dev = in_dev_get(dev);
2082        struct flowi fl = { .nl_u = { .ip4_u =
2083                                      { .daddr = daddr,
2084                                        .saddr = saddr,
2085                                        .tos = tos,
2086                                        .scope = RT_SCOPE_UNIVERSE,
2087                                      } },
2088                            .mark = skb->mark,
2089                            .iif = dev->ifindex };
2090        unsigned        flags = 0;
2091        u32             itag = 0;
2092        struct rtable * rth;
2093        unsigned        hash;
2094        __be32          spec_dst;
2095        int             err = -EINVAL;
2096        int             free_res = 0;
2097        struct net    * net = dev_net(dev);
2098
2099        /* IP on this device is disabled. */
2100
2101        if (!in_dev)
2102                goto out;
2103
2104        /* Check for the weirdest martians, which cannot be detected
2105           by fib_lookup.
2106         */
2107
2108        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2109            ipv4_is_loopback(saddr))
2110                goto martian_source;
2111
2112        if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2113                goto brd_input;
2114
2115        /* Accept zero addresses only for the limited broadcast;
2116         * I do not even know whether to fix this or not. Waiting for complaints :-)
2117         */
2118        if (ipv4_is_zeronet(saddr))
2119                goto martian_source;
2120
2121        if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2122            ipv4_is_loopback(daddr))
2123                goto martian_destination;
2124
2125        /*
2126         *      Now we are ready to route the packet.
2127         */
2128        if ((err = fib_lookup(net, &fl, &res)) != 0) {
2129                if (!IN_DEV_FORWARD(in_dev))
2130                        goto e_hostunreach;
2131                goto no_route;
2132        }
2133        free_res = 1;
2134
2135        RT_CACHE_STAT_INC(in_slow_tot);
2136
2137        if (res.type == RTN_BROADCAST)
2138                goto brd_input;
2139
2140        if (res.type == RTN_LOCAL) {
2141                int result;
2142                result = fib_validate_source(saddr, daddr, tos,
2143                                             net->loopback_dev->ifindex,
2144                                             dev, &spec_dst, &itag, skb->mark);
2145                if (result < 0)
2146                        goto martian_source;
2147                if (result)
2148                        flags |= RTCF_DIRECTSRC;
2149                spec_dst = daddr;
2150                goto local_input;
2151        }
2152
2153        if (!IN_DEV_FORWARD(in_dev))
2154                goto e_hostunreach;
2155        if (res.type != RTN_UNICAST)
2156                goto martian_destination;
2157
2158        err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2159done:
2160        in_dev_put(in_dev);
2161        if (free_res)
2162                fib_res_put(&res);
2163out:    return err;
2164
2165brd_input:
2166        if (skb->protocol != htons(ETH_P_IP))
2167                goto e_inval;
2168
2169        if (ipv4_is_zeronet(saddr))
2170                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2171        else {
2172                err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2173                                          &itag, skb->mark);
2174                if (err < 0)
2175                        goto martian_source;
2176                if (err)
2177                        flags |= RTCF_DIRECTSRC;
2178        }
2179        flags |= RTCF_BROADCAST;
2180        res.type = RTN_BROADCAST;
2181        RT_CACHE_STAT_INC(in_brd);
2182
2183local_input:
2184        rth = dst_alloc(&ipv4_dst_ops);
2185        if (!rth)
2186                goto e_nobufs;
2187
2188        rth->u.dst.output= ip_rt_bug;
2189        rth->rt_genid = rt_genid(net);
2190
2191        atomic_set(&rth->u.dst.__refcnt, 1);
2192        rth->u.dst.flags= DST_HOST;
2193        if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2194                rth->u.dst.flags |= DST_NOPOLICY;
2195        rth->fl.fl4_dst = daddr;
2196        rth->rt_dst     = daddr;
2197        rth->fl.fl4_tos = tos;
2198        rth->fl.mark    = skb->mark;
2199        rth->fl.fl4_src = saddr;
2200        rth->rt_src     = saddr;
2201#ifdef CONFIG_NET_CLS_ROUTE
2202        rth->u.dst.tclassid = itag;
2203#endif
2204        rth->rt_iif     =
2205        rth->fl.iif     = dev->ifindex;
2206        rth->u.dst.dev  = net->loopback_dev;
2207        dev_hold(rth->u.dst.dev);
2208        rth->idev       = in_dev_get(rth->u.dst.dev);
2209        rth->rt_gateway = daddr;
2210        rth->rt_spec_dst= spec_dst;
2211        rth->u.dst.input= ip_local_deliver;
2212        rth->rt_flags   = flags|RTCF_LOCAL;
2213        if (res.type == RTN_UNREACHABLE) {
2214                rth->u.dst.input= ip_error;
2215                rth->u.dst.error= -err;
2216                rth->rt_flags   &= ~RTCF_LOCAL;
2217        }
2218        rth->rt_type    = res.type;
2219        hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2220        err = rt_intern_hash(hash, rth, NULL, skb);
2221        goto done;
2222
2223no_route:
2224        RT_CACHE_STAT_INC(in_no_route);
2225        spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2226        res.type = RTN_UNREACHABLE;
2227        if (err == -ESRCH)
2228                err = -ENETUNREACH;
2229        goto local_input;
2230
2231        /*
2232         *      Do not cache martian addresses: they should be logged (RFC1812)
2233         */
2234martian_destination:
2235        RT_CACHE_STAT_INC(in_martian_dst);
2236#ifdef CONFIG_IP_ROUTE_VERBOSE
2237        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2238                printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2239                        &daddr, &saddr, dev->name);
2240#endif
2241
2242e_hostunreach:
2243        err = -EHOSTUNREACH;
2244        goto done;
2245
2246e_inval:
2247        err = -EINVAL;
2248        goto done;
2249
2250e_nobufs:
2251        err = -ENOBUFS;
2252        goto done;
2253
2254martian_source:
2255        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2256        goto e_inval;
2257}
2258
2259int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2260                   u8 tos, struct net_device *dev)
2261{
2262        struct rtable * rth;
2263        unsigned        hash;
2264        int iif = dev->ifindex;
2265        struct net *net;
2266
2267        net = dev_net(dev);
2268
2269        if (!rt_caching(net))
2270                goto skip_cache;
2271
2272        tos &= IPTOS_RT_MASK;
2273        hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2274
2275        rcu_read_lock();
2276        for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2277             rth = rcu_dereference(rth->u.dst.rt_next)) {
2278                if (((rth->fl.fl4_dst ^ daddr) |
2279                     (rth->fl.fl4_src ^ saddr) |
2280                     (rth->fl.iif ^ iif) |
2281                     rth->fl.oif |
2282                     (rth->fl.fl4_tos ^ tos)) == 0 &&
2283                    rth->fl.mark == skb->mark &&
2284                    net_eq(dev_net(rth->u.dst.dev), net) &&
2285                    !rt_is_expired(rth)) {
2286                        dst_use(&rth->u.dst, jiffies);
2287                        RT_CACHE_STAT_INC(in_hit);
2288                        rcu_read_unlock();
2289                        skb_dst_set(skb, &rth->u.dst);
2290                        return 0;
2291                }
2292                RT_CACHE_STAT_INC(in_hlist_search);
2293        }
2294        rcu_read_unlock();
2295
2296skip_cache:
2297        /* Multicast recognition logic was moved from the route cache to here.
2298           The problem was that too many Ethernet cards have broken/missing
2299           hardware multicast filters :-( As a result, a host on a multicast
2300           network acquires a lot of useless route cache entries, e.g. from
2301           SDR messages from all over the world. Now we try to get rid of them.
2302           Really, provided the software IP multicast filter is organized
2303           reasonably (at least, hashed), it does not result in a slowdown
2304           compared with route cache reject entries.
2305           Note that multicast routers are not affected, because a
2306           route cache entry is created eventually.
2307         */
2308        if (ipv4_is_multicast(daddr)) {
2309                struct in_device *in_dev;
2310
2311                rcu_read_lock();
2312                if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2313                        int our = ip_check_mc(in_dev, daddr, saddr,
2314                                ip_hdr(skb)->protocol);
2315                        if (our
2316#ifdef CONFIG_IP_MROUTE
2317                            || (!ipv4_is_local_multicast(daddr) &&
2318                                IN_DEV_MFORWARD(in_dev))
2319#endif
2320                            ) {
2321                                rcu_read_unlock();
2322                                return ip_route_input_mc(skb, daddr, saddr,
2323                                                         tos, dev, our);
2324                        }
2325                }
2326                rcu_read_unlock();
2327                return -EINVAL;
2328        }
2329        return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2330}
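/*
 * Editorial sketch (not part of route.c): the cache lookup above folds every
 * key comparison into a single branch.  XOR of equal values is zero, so OR-ing
 * the per-field XORs is zero only when all fields match; the input path also
 * ORs in rth->fl.oif on its own, which additionally requires oif == 0.
 */
static int keys_match(unsigned int dst1, unsigned int dst2,
                      unsigned int src1, unsigned int src2,
                      unsigned int iif1, unsigned int iif2,
                      unsigned int tos1, unsigned int tos2)
{
        return ((dst1 ^ dst2) |
                (src1 ^ src2) |
                (iif1 ^ iif2) |
                (tos1 ^ tos2)) == 0;
}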
2331
2332static int __mkroute_output(struct rtable **result,
2333                            struct fib_result *res,
2334                            const struct flowi *fl,
2335                            const struct flowi *oldflp,
2336                            struct net_device *dev_out,
2337                            unsigned flags)
2338{
2339        struct rtable *rth;
2340        struct in_device *in_dev;
2341        u32 tos = RT_FL_TOS(oldflp);
2342        int err = 0;
2343
2344        if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2345                return -EINVAL;
2346
2347        if (fl->fl4_dst == htonl(0xFFFFFFFF))
2348                res->type = RTN_BROADCAST;
2349        else if (ipv4_is_multicast(fl->fl4_dst))
2350                res->type = RTN_MULTICAST;
2351        else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2352                return -EINVAL;
2353
2354        if (dev_out->flags & IFF_LOOPBACK)
2355                flags |= RTCF_LOCAL;
2356
2357        /* get a working reference to the inet device */
2358        in_dev = in_dev_get(dev_out);
2359        if (!in_dev)
2360                return -EINVAL;
2361
2362        if (res->type == RTN_BROADCAST) {
2363                flags |= RTCF_BROADCAST | RTCF_LOCAL;
2364                if (res->fi) {
2365                        fib_info_put(res->fi);
2366                        res->fi = NULL;
2367                }
2368        } else if (res->type == RTN_MULTICAST) {
2369                flags |= RTCF_MULTICAST|RTCF_LOCAL;
2370                if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2371                                 oldflp->proto))
2372                        flags &= ~RTCF_LOCAL;
2373                /* If a multicast route does not exist, use
2374                   the default one, but do not gateway in this case.
2375                   Yes, it is a hack.
2376                 */
2377                if (res->fi && res->prefixlen < 4) {
2378                        fib_info_put(res->fi);
2379                        res->fi = NULL;
2380                }
2381        }
2382
2383
2384        rth = dst_alloc(&ipv4_dst_ops);
2385        if (!rth) {
2386                err = -ENOBUFS;
2387                goto cleanup;
2388        }
2389
2390        atomic_set(&rth->u.dst.__refcnt, 1);
2391        rth->u.dst.flags= DST_HOST;
2392        if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2393                rth->u.dst.flags |= DST_NOXFRM;
2394        if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2395                rth->u.dst.flags |= DST_NOPOLICY;
2396
2397        rth->fl.fl4_dst = oldflp->fl4_dst;
2398        rth->fl.fl4_tos = tos;
2399        rth->fl.fl4_src = oldflp->fl4_src;
2400        rth->fl.oif     = oldflp->oif;
2401        rth->fl.mark    = oldflp->mark;
2402        rth->rt_dst     = fl->fl4_dst;
2403        rth->rt_src     = fl->fl4_src;
2404        rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2405        /* get references to the devices that are to be held by the routing
2406           cache entry */
2407        rth->u.dst.dev  = dev_out;
2408        dev_hold(dev_out);
2409        rth->idev       = in_dev_get(dev_out);
2410        rth->rt_gateway = fl->fl4_dst;
2411        rth->rt_spec_dst= fl->fl4_src;
2412
2413        rth->u.dst.output=ip_output;
2414        rth->rt_genid = rt_genid(dev_net(dev_out));
2415
2416        RT_CACHE_STAT_INC(out_slow_tot);
2417
2418        if (flags & RTCF_LOCAL) {
2419                rth->u.dst.input = ip_local_deliver;
2420                rth->rt_spec_dst = fl->fl4_dst;
2421        }
2422        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2423                rth->rt_spec_dst = fl->fl4_src;
2424                if (flags & RTCF_LOCAL &&
2425                    !(dev_out->flags & IFF_LOOPBACK)) {
2426                        rth->u.dst.output = ip_mc_output;
2427                        RT_CACHE_STAT_INC(out_slow_mc);
2428                }
2429#ifdef CONFIG_IP_MROUTE
2430                if (res->type == RTN_MULTICAST) {
2431                        if (IN_DEV_MFORWARD(in_dev) &&
2432                            !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2433                                rth->u.dst.input = ip_mr_input;
2434                                rth->u.dst.output = ip_mc_output;
2435                        }
2436                }
2437#endif
2438        }
2439
2440        rt_set_nexthop(rth, res, 0);
2441
2442        rth->rt_flags = flags;
2443
2444        *result = rth;
2445 cleanup:
2446        /* release the working reference to the inet device */
2447        in_dev_put(in_dev);
2448
2449        return err;
2450}
2451
2452static int ip_mkroute_output(struct rtable **rp,
2453                             struct fib_result *res,
2454                             const struct flowi *fl,
2455                             const struct flowi *oldflp,
2456                             struct net_device *dev_out,
2457                             unsigned flags)
2458{
2459        struct rtable *rth = NULL;
2460        int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2461        unsigned hash;
2462        if (err == 0) {
2463                hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2464                               rt_genid(dev_net(dev_out)));
2465                err = rt_intern_hash(hash, rth, rp, NULL);
2466        }
2467
2468        return err;
2469}
2470
2471/*
2472 * Major route resolver routine.
2473 */
2474
2475static int ip_route_output_slow(struct net *net, struct rtable **rp,
2476                                const struct flowi *oldflp)
2477{
2478        u32 tos = RT_FL_TOS(oldflp);
2479        struct flowi fl = { .nl_u = { .ip4_u =
2480                                      { .daddr = oldflp->fl4_dst,
2481                                        .saddr = oldflp->fl4_src,
2482                                        .tos = tos & IPTOS_RT_MASK,
2483                                        .scope = ((tos & RTO_ONLINK) ?
2484                                                  RT_SCOPE_LINK :
2485                                                  RT_SCOPE_UNIVERSE),
2486                                      } },
2487                            .mark = oldflp->mark,
2488                            .iif = net->loopback_dev->ifindex,
2489                            .oif = oldflp->oif };
2490        struct fib_result res;
2491        unsigned flags = 0;
2492        struct net_device *dev_out = NULL;
2493        int free_res = 0;
2494        int err;
2495
2496
2497        res.fi          = NULL;
2498#ifdef CONFIG_IP_MULTIPLE_TABLES
2499        res.r           = NULL;
2500#endif
2501
2502        if (oldflp->fl4_src) {
2503                err = -EINVAL;
2504                if (ipv4_is_multicast(oldflp->fl4_src) ||
2505                    ipv4_is_lbcast(oldflp->fl4_src) ||
2506                    ipv4_is_zeronet(oldflp->fl4_src))
2507                        goto out;
2508
2509                /* I removed the check for oif == dev_out->oif here.
2510                   It was wrong for two reasons:
2511                   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2512                      is assigned to multiple interfaces.
2513                   2. Moreover, we are allowed to send packets with the saddr
2514                      of another iface. --ANK
2515                 */
2516
2517                if (oldflp->oif == 0
2518                    && (ipv4_is_multicast(oldflp->fl4_dst) ||
2519                        oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2520                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2521                        dev_out = ip_dev_find(net, oldflp->fl4_src);
2522                        if (dev_out == NULL)
2523                                goto out;
2524
2525                        /* Special hack: the user can direct multicasts
2526                           and limited broadcast via the necessary interface
2527                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2528                           This hack is not just for fun, it allows
2529                           vic, vat and friends to work.
2530                           They bind the socket to loopback, set the ttl to zero
2531                           and expect that it will work.
2532                           From the viewpoint of the routing cache they are broken,
2533                           because we are not allowed to build a multicast path
2534                           with a loopback source addr (look, the routing cache
2535                           cannot know that the ttl is zero, so the packet
2536                           will not leave this host and the route is valid).
2537                           Luckily, this hack is a good workaround.
2538                         */
2539
2540                        fl.oif = dev_out->ifindex;
2541                        goto make_route;
2542                }
2543
2544                if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2545                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2546                        dev_out = ip_dev_find(net, oldflp->fl4_src);
2547                        if (dev_out == NULL)
2548                                goto out;
2549                        dev_put(dev_out);
2550                        dev_out = NULL;
2551                }
2552        }
2553
2554
2555        if (oldflp->oif) {
2556                dev_out = dev_get_by_index(net, oldflp->oif);
2557                err = -ENODEV;
2558                if (dev_out == NULL)
2559                        goto out;
2560
2561                /* RACE: Check return value of inet_select_addr instead. */
2562                if (__in_dev_get_rtnl(dev_out) == NULL) {
2563                        dev_put(dev_out);
2564                        goto out;       /* Wrong error code */
2565                }
2566
2567                if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2568                    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2569                        if (!fl.fl4_src)
2570                                fl.fl4_src = inet_select_addr(dev_out, 0,
2571                                                              RT_SCOPE_LINK);
2572                        goto make_route;
2573                }
2574                if (!fl.fl4_src) {
2575                        if (ipv4_is_multicast(oldflp->fl4_dst))
2576                                fl.fl4_src = inet_select_addr(dev_out, 0,
2577                                                              fl.fl4_scope);
2578                        else if (!oldflp->fl4_dst)
2579                                fl.fl4_src = inet_select_addr(dev_out, 0,
2580                                                              RT_SCOPE_HOST);
2581                }
2582        }
2583
2584        if (!fl.fl4_dst) {
2585                fl.fl4_dst = fl.fl4_src;
2586                if (!fl.fl4_dst)
2587                        fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2588                if (dev_out)
2589                        dev_put(dev_out);
2590                dev_out = net->loopback_dev;
2591                dev_hold(dev_out);
2592                fl.oif = net->loopback_dev->ifindex;
2593                res.type = RTN_LOCAL;
2594                flags |= RTCF_LOCAL;
2595                goto make_route;
2596        }
2597
2598        if (fib_lookup(net, &fl, &res)) {
2599                res.fi = NULL;
2600                if (oldflp->oif) {
2601                        /* Apparently, the routing tables are wrong. Assume
2602                           that the destination is on-link.
2603
2604                           WHY? DW.
2605                           Because we are allowed to send to an iface
2606                           even if it has NO routes and NO assigned
2607                           addresses. When oif is specified, the routing
2608                           tables are looked up with only one purpose:
2609                           to catch whether the destination is gatewayed rather
2610                           than direct. Moreover, if MSG_DONTROUTE is set,
2611                           we send the packet, ignoring both the routing tables
2612                           and the ifaddr state. --ANK
2613
2614
2615                           We could do this even if oif is unknown
2616                           (as IPv6 likely does), but we do not.
2617                         */
2618
2619                        if (fl.fl4_src == 0)
2620                                fl.fl4_src = inet_select_addr(dev_out, 0,
2621                                                              RT_SCOPE_LINK);
2622                        res.type = RTN_UNICAST;
2623                        goto make_route;
2624                }
2625                if (dev_out)
2626                        dev_put(dev_out);
2627                err = -ENETUNREACH;
2628                goto out;
2629        }
2630        free_res = 1;
2631
2632        if (res.type == RTN_LOCAL) {
2633                if (!fl.fl4_src)
2634                        fl.fl4_src = fl.fl4_dst;
2635                if (dev_out)
2636                        dev_put(dev_out);
2637                dev_out = net->loopback_dev;
2638                dev_hold(dev_out);
2639                fl.oif = dev_out->ifindex;
2640                if (res.fi)
2641                        fib_info_put(res.fi);
2642                res.fi = NULL;
2643                flags |= RTCF_LOCAL;
2644                goto make_route;
2645        }
2646
2647#ifdef CONFIG_IP_ROUTE_MULTIPATH
2648        if (res.fi->fib_nhs > 1 && fl.oif == 0)
2649                fib_select_multipath(&fl, &res);
2650        else
2651#endif
2652        if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2653                fib_select_default(net, &fl, &res);
2654
2655        if (!fl.fl4_src)
2656                fl.fl4_src = FIB_RES_PREFSRC(res);
2657
2658        if (dev_out)
2659                dev_put(dev_out);
2660        dev_out = FIB_RES_DEV(res);
2661        dev_hold(dev_out);
2662        fl.oif = dev_out->ifindex;
2663
2664
2665make_route:
2666        err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2667
2668
2669        if (free_res)
2670                fib_res_put(&res);
2671        if (dev_out)
2672                dev_put(dev_out);
2673out:    return err;
2674}
2675
2676int __ip_route_output_key(struct net *net, struct rtable **rp,
2677                          const struct flowi *flp)
2678{
2679        unsigned hash;
2680        struct rtable *rth;
2681
2682        if (!rt_caching(net))
2683                goto slow_output;
2684
2685        hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2686
2687        rcu_read_lock_bh();
2688        for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2689                rth = rcu_dereference(rth->u.dst.rt_next)) {
2690                if (rth->fl.fl4_dst == flp->fl4_dst &&
2691                    rth->fl.fl4_src == flp->fl4_src &&
2692                    rth->fl.iif == 0 &&
2693                    rth->fl.oif == flp->oif &&
2694                    rth->fl.mark == flp->mark &&
2695                    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2696                            (IPTOS_RT_MASK | RTO_ONLINK)) &&
2697                    net_eq(dev_net(rth->u.dst.dev), net) &&
2698                    !rt_is_expired(rth)) {
2699                        dst_use(&rth->u.dst, jiffies);
2700                        RT_CACHE_STAT_INC(out_hit);
2701                        rcu_read_unlock_bh();
2702                        *rp = rth;
2703                        return 0;
2704                }
2705                RT_CACHE_STAT_INC(out_hlist_search);
2706        }
2707        rcu_read_unlock_bh();
2708
2709slow_output:
2710        return ip_route_output_slow(net, rp, flp);
2711}
2712
2713EXPORT_SYMBOL_GPL(__ip_route_output_key);
2714
2715static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2716{
2717}
2718
2719static struct dst_ops ipv4_dst_blackhole_ops = {
2720        .family                 =       AF_INET,
2721        .protocol               =       cpu_to_be16(ETH_P_IP),
2722        .destroy                =       ipv4_dst_destroy,
2723        .check                  =       ipv4_dst_check,
2724        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2725        .entries                =       ATOMIC_INIT(0),
2726};
2727
2728
2729static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2730{
2731        struct rtable *ort = *rp;
2732        struct rtable *rt = (struct rtable *)
2733                dst_alloc(&ipv4_dst_blackhole_ops);
2734
2735        if (rt) {
2736                struct dst_entry *new = &rt->u.dst;
2737
2738                atomic_set(&new->__refcnt, 1);
2739                new->__use = 1;
2740                new->input = dst_discard;
2741                new->output = dst_discard;
2742                memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2743
2744                new->dev = ort->u.dst.dev;
2745                if (new->dev)
2746                        dev_hold(new->dev);
2747
2748                rt->fl = ort->fl;
2749
2750                rt->idev = ort->idev;
2751                if (rt->idev)
2752                        in_dev_hold(rt->idev);
2753                rt->rt_genid = rt_genid(net);
2754                rt->rt_flags = ort->rt_flags;
2755                rt->rt_type = ort->rt_type;
2756                rt->rt_dst = ort->rt_dst;
2757                rt->rt_src = ort->rt_src;
2758                rt->rt_iif = ort->rt_iif;
2759                rt->rt_gateway = ort->rt_gateway;
2760                rt->rt_spec_dst = ort->rt_spec_dst;
2761                rt->peer = ort->peer;
2762                if (rt->peer)
2763                        atomic_inc(&rt->peer->refcnt);
2764
2765                dst_free(new);
2766        }
2767
2768        dst_release(&(*rp)->u.dst);
2769        *rp = rt;
2770        return (rt ? 0 : -ENOMEM);
2771}
2772
2773int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2774                         struct sock *sk, int flags)
2775{
2776        int err;
2777
2778        if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2779                return err;
2780
2781        if (flp->proto) {
2782                if (!flp->fl4_src)
2783                        flp->fl4_src = (*rp)->rt_src;
2784                if (!flp->fl4_dst)
2785                        flp->fl4_dst = (*rp)->rt_dst;
2786                err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2787                                    flags ? XFRM_LOOKUP_WAIT : 0);
2788                if (err == -EREMOTE)
2789                        err = ipv4_dst_blackhole(net, rp, flp);
2790
2791                return err;
2792        }
2793
2794        return 0;
2795}
2796
2797EXPORT_SYMBOL_GPL(ip_route_output_flow);
2798
2799int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2800{
2801        return ip_route_output_flow(net, rp, flp, NULL, 0);
2802}
2803
2804static int rt_fill_info(struct net *net,
2805                        struct sk_buff *skb, u32 pid, u32 seq, int event,
2806                        int nowait, unsigned int flags)
2807{
2808        struct rtable *rt = skb_rtable(skb);
2809        struct rtmsg *r;
2810        struct nlmsghdr *nlh;
2811        long expires;
2812        u32 id = 0, ts = 0, tsage = 0, error;
2813
2814        nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2815        if (nlh == NULL)
2816                return -EMSGSIZE;
2817
2818        r = nlmsg_data(nlh);
2819        r->rtm_family    = AF_INET;
2820        r->rtm_dst_len  = 32;
2821        r->rtm_src_len  = 0;
2822        r->rtm_tos      = rt->fl.fl4_tos;
2823        r->rtm_table    = RT_TABLE_MAIN;
2824        NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2825        r->rtm_type     = rt->rt_type;
2826        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2827        r->rtm_protocol = RTPROT_UNSPEC;
2828        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2829        if (rt->rt_flags & RTCF_NOTIFY)
2830                r->rtm_flags |= RTM_F_NOTIFY;
2831
2832        NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2833
2834        if (rt->fl.fl4_src) {
2835                r->rtm_src_len = 32;
2836                NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2837        }
2838        if (rt->u.dst.dev)
2839                NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2840#ifdef CONFIG_NET_CLS_ROUTE
2841        if (rt->u.dst.tclassid)
2842                NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2843#endif
2844        if (rt->fl.iif)
2845                NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2846        else if (rt->rt_src != rt->fl.fl4_src)
2847                NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2848
2849        if (rt->rt_dst != rt->rt_gateway)
2850                NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2851
2852        if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2853                goto nla_put_failure;
2854
2855        error = rt->u.dst.error;
2856        expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2857        if (rt->peer) {
2858                id = rt->peer->ip_id_count;
2859                if (rt->peer->tcp_ts_stamp) {
2860                        ts = rt->peer->tcp_ts;
2861                        tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2862                }
2863        }
2864
2865        if (rt->fl.iif) {
2866#ifdef CONFIG_IP_MROUTE
2867                __be32 dst = rt->rt_dst;
2868
2869                if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2870                    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2871                        int err = ipmr_get_route(net, skb, r, nowait);
2872                        if (err <= 0) {
2873                                if (!nowait) {
2874                                        if (err == 0)
2875                                                return 0;
2876                                        goto nla_put_failure;
2877                                } else {
2878                                        if (err == -EMSGSIZE)
2879                                                goto nla_put_failure;
2880                                        error = err;
2881                                }
2882                        }
2883                } else
2884#endif
2885                        NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2886        }
2887
2888        if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2889                               expires, error) < 0)
2890                goto nla_put_failure;
2891
2892        return nlmsg_end(skb, nlh);
2893
2894nla_put_failure:
2895        nlmsg_cancel(skb, nlh);
2896        return -EMSGSIZE;
2897}
2898
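/*
 * RTM_GETROUTE handler.  Parses RTA_SRC/RTA_DST/RTA_IIF/RTA_OIF, resolves
 * the route via ip_route_input() (when an input interface is given) or
 * ip_route_output_key(), and unicasts the result back to the requester
 * through rt_fill_info().
 */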
2899static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2900{
2901        struct net *net = sock_net(in_skb->sk);
2902        struct rtmsg *rtm;
2903        struct nlattr *tb[RTA_MAX+1];
2904        struct rtable *rt = NULL;
2905        __be32 dst = 0;
2906        __be32 src = 0;
2907        u32 iif;
2908        int err;
2909        struct sk_buff *skb;
2910
2911        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2912        if (err < 0)
2913                goto errout;
2914
2915        rtm = nlmsg_data(nlh);
2916
2917        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2918        if (skb == NULL) {
2919                err = -ENOBUFS;
2920                goto errout;
2921        }
2922
2923        /* Reserve room for dummy headers; this skb can pass
2924           through a good chunk of the routing engine.
2925         */
2926        skb_reset_mac_header(skb);
2927        skb_reset_network_header(skb);
2928
2929        /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2930        ip_hdr(skb)->protocol = IPPROTO_ICMP;
2931        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2932
2933        src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2934        dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2935        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2936
2937        if (iif) {
2938                struct net_device *dev;
2939
2940                dev = __dev_get_by_index(net, iif);
2941                if (dev == NULL) {
2942                        err = -ENODEV;
2943                        goto errout_free;
2944                }
2945
2946                skb->protocol   = htons(ETH_P_IP);
2947                skb->dev        = dev;
2948                local_bh_disable();
2949                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2950                local_bh_enable();
2951
2952                rt = skb_rtable(skb);
2953                if (err == 0 && rt->u.dst.error)
2954                        err = -rt->u.dst.error;
2955        } else {
2956                struct flowi fl = {
2957                        .nl_u = {
2958                                .ip4_u = {
2959                                        .daddr = dst,
2960                                        .saddr = src,
2961                                        .tos = rtm->rtm_tos,
2962                                },
2963                        },
2964                        .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2965                };
2966                err = ip_route_output_key(net, &rt, &fl);
2967        }
2968
2969        if (err)
2970                goto errout_free;
2971
2972        skb_dst_set(skb, &rt->u.dst);
2973        if (rtm->rtm_flags & RTM_F_NOTIFY)
2974                rt->rt_flags |= RTCF_NOTIFY;
2975
2976        err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2977                           RTM_NEWROUTE, 0, 0);
2978        if (err <= 0)
2979                goto errout_free;
2980
2981        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2982errout:
2983        return err;
2984
2985errout_free:
2986        kfree_skb(skb);
2987        goto errout;
2988}
2989
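/*
 * Netlink dump callback: walk the route cache hash chains under
 * rcu_read_lock_bh() and emit one RTM_NEWROUTE message per live entry,
 * resuming from the bucket/index saved in cb->args[].
 */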
2990int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2991{
2992        struct rtable *rt;
2993        int h, s_h;
2994        int idx, s_idx;
2995        struct net *net;
2996
2997        net = sock_net(skb->sk);
2998
2999        s_h = cb->args[0];
3000        if (s_h < 0)
3001                s_h = 0;
3002        s_idx = idx = cb->args[1];
3003        for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3004                if (!rt_hash_table[h].chain)
3005                        continue;
3006                rcu_read_lock_bh();
3007                for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
3008                     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
3009                        if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
3010                                continue;
3011                        if (rt_is_expired(rt))
3012                                continue;
3013                        skb_dst_set(skb, dst_clone(&rt->u.dst));
3014                        if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3015                                         cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3016                                         1, NLM_F_MULTI) <= 0) {
3017                                skb_dst_drop(skb);
3018                                rcu_read_unlock_bh();
3019                                goto done;
3020                        }
3021                        skb_dst_drop(skb);
3022                }
3023                rcu_read_unlock_bh();
3024        }
3025
3026done:
3027        cb->args[0] = h;
3028        cb->args[1] = idx;
3029        return skb->len;
3030}
3031
3032void ip_rt_multicast_event(struct in_device *in_dev)
3033{
3034        rt_cache_flush(dev_net(in_dev->dev), 0);
3035}
3036
3037#ifdef CONFIG_SYSCTL
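/*
 * Write-only handler for the per-namespace "flush" sysctl: the integer
 * written is handed to rt_cache_flush() as the flush delay.
 */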
3038static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3039                                        void __user *buffer,
3040                                        size_t *lenp, loff_t *ppos)
3041{
3042        if (write) {
3043                int flush_delay;
3044                ctl_table ctl;
3045                struct net *net;
3046
3047                memcpy(&ctl, __ctl, sizeof(ctl));
3048                ctl.data = &flush_delay;
3049                proc_dointvec(&ctl, write, buffer, lenp, ppos);
3050
3051                net = (struct net *)__ctl->extra1;
3052                rt_cache_flush(net, flush_delay);
3053                return 0;
3054        }
3055
3056        return -EINVAL;
3057}
3058
3059static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
3060                                                void __user *oldval,
3061                                                size_t __user *oldlenp,
3062                                                void __user *newval,
3063                                                size_t newlen)
3064{
3065        int delay;
3066        struct net *net;
3067        if (newlen != sizeof(int))
3068                return -EINVAL;
3069        if (get_user(delay, (int __user *)newval))
3070                return -EFAULT;
3071        net = (struct net *)table->extra1;
3072        rt_cache_flush(net, delay);
3073        return 0;
3074}
3075
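/*
 * Re-arm the per-namespace secret rebuild timers after
 * ip_rt_secret_interval has been changed; a new interval of zero leaves
 * the timers stopped.
 */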
3076static void rt_secret_reschedule(int old)
3077{
3078        struct net *net;
3079        int new = ip_rt_secret_interval;
3080        int diff = new - old;
3081
3082        if (!diff)
3083                return;
3084
3085        rtnl_lock();
3086        for_each_net(net) {
3087                int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3088
3089                if (!new)
3090                        continue;
3091
3092                if (deleted) {
3093                        long time = net->ipv4.rt_secret_timer.expires - jiffies;
3094
3095                        if (time <= 0 || (time += diff) <= 0)
3096                                time = 0;
3097
3098                        net->ipv4.rt_secret_timer.expires = time;
3099                } else
3100                        net->ipv4.rt_secret_timer.expires = new;
3101
3102                net->ipv4.rt_secret_timer.expires += jiffies;
3103                add_timer(&net->ipv4.rt_secret_timer);
3104        }
3105        rtnl_unlock();
3106}
3107
3108static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3109                                          void __user *buffer, size_t *lenp,
3110                                          loff_t *ppos)
3111{
3112        int old = ip_rt_secret_interval;
3113        int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
3114
3115        rt_secret_reschedule(old);
3116
3117        return ret;
3118}
3119
3120static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
3121                                                   void __user *oldval,
3122                                                   size_t __user *oldlenp,
3123                                                   void __user *newval,
3124                                                   size_t newlen)
3125{
3126        int old = ip_rt_secret_interval;
3127        int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);
3128
3129        rt_secret_reschedule(old);
3130
3131        return ret;
3132}
3133
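/* Global route tunables exposed under /proc/sys/net/ipv4/route/ */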
3134static ctl_table ipv4_route_table[] = {
3135        {
3136                .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
3137                .procname       = "gc_thresh",
3138                .data           = &ipv4_dst_ops.gc_thresh,
3139                .maxlen         = sizeof(int),
3140                .mode           = 0644,
3141                .proc_handler   = proc_dointvec,
3142        },
3143        {
3144                .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
3145                .procname       = "max_size",
3146                .data           = &ip_rt_max_size,
3147                .maxlen         = sizeof(int),
3148                .mode           = 0644,
3149                .proc_handler   = proc_dointvec,
3150        },
3151        {
3152                /*  Deprecated. Use gc_min_interval_ms */
3153
3154                .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
3155                .procname       = "gc_min_interval",
3156                .data           = &ip_rt_gc_min_interval,
3157                .maxlen         = sizeof(int),
3158                .mode           = 0644,
3159                .proc_handler   = proc_dointvec_jiffies,
3160                .strategy       = sysctl_jiffies,
3161        },
3162        {
3163                .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
3164                .procname       = "gc_min_interval_ms",
3165                .data           = &ip_rt_gc_min_interval,
3166                .maxlen         = sizeof(int),
3167                .mode           = 0644,
3168                .proc_handler   = proc_dointvec_ms_jiffies,
3169                .strategy       = sysctl_ms_jiffies,
3170        },
3171        {
3172                .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
3173                .procname       = "gc_timeout",
3174                .data           = &ip_rt_gc_timeout,
3175                .maxlen         = sizeof(int),
3176                .mode           = 0644,
3177                .proc_handler   = proc_dointvec_jiffies,
3178                .strategy       = sysctl_jiffies,
3179        },
3180        {
3181                .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
3182                .procname       = "gc_interval",
3183                .data           = &ip_rt_gc_interval,
3184                .maxlen         = sizeof(int),
3185                .mode           = 0644,
3186                .proc_handler   = proc_dointvec_jiffies,
3187                .strategy       = sysctl_jiffies,
3188        },
3189        {
3190                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
3191                .procname       = "redirect_load",
3192                .data           = &ip_rt_redirect_load,
3193                .maxlen         = sizeof(int),
3194                .mode           = 0644,
3195                .proc_handler   = proc_dointvec,
3196        },
3197        {
3198                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
3199                .procname       = "redirect_number",
3200                .data           = &ip_rt_redirect_number,
3201                .maxlen         = sizeof(int),
3202                .mode           = 0644,
3203                .proc_handler   = proc_dointvec,
3204        },
3205        {
3206                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
3207                .procname       = "redirect_silence",
3208                .data           = &ip_rt_redirect_silence,
3209                .maxlen         = sizeof(int),
3210                .mode           = 0644,
3211                .proc_handler   = proc_dointvec,
3212        },
3213        {
3214                .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
3215                .procname       = "error_cost",
3216                .data           = &ip_rt_error_cost,
3217                .maxlen         = sizeof(int),
3218                .mode           = 0644,
3219                .proc_handler   = proc_dointvec,
3220        },
3221        {
3222                .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
3223                .procname       = "error_burst",
3224                .data           = &ip_rt_error_burst,
3225                .maxlen         = sizeof(int),
3226                .mode           = 0644,
3227                .proc_handler   = proc_dointvec,
3228        },
3229        {
3230                .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
3231                .procname       = "gc_elasticity",
3232                .data           = &ip_rt_gc_elasticity,
3233                .maxlen         = sizeof(int),
3234                .mode           = 0644,
3235                .proc_handler   = proc_dointvec,
3236        },
3237        {
3238                .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
3239                .procname       = "mtu_expires",
3240                .data           = &ip_rt_mtu_expires,
3241                .maxlen         = sizeof(int),
3242                .mode           = 0644,
3243                .proc_handler   = proc_dointvec_jiffies,
3244                .strategy       = sysctl_jiffies,
3245        },
3246        {
3247                .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
3248                .procname       = "min_pmtu",
3249                .data           = &ip_rt_min_pmtu,
3250                .maxlen         = sizeof(int),
3251                .mode           = 0644,
3252                .proc_handler   = proc_dointvec,
3253        },
3254        {
3255                .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
3256                .procname       = "min_adv_mss",
3257                .data           = &ip_rt_min_advmss,
3258                .maxlen         = sizeof(int),
3259                .mode           = 0644,
3260                .proc_handler   = proc_dointvec,
3261        },
3262        {
3263                .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3264                .procname       = "secret_interval",
3265                .data           = &ip_rt_secret_interval,
3266                .maxlen         = sizeof(int),
3267                .mode           = 0644,
3268                .proc_handler   = ipv4_sysctl_rt_secret_interval,
3269                .strategy       = ipv4_sysctl_rt_secret_interval_strategy,
3270        },
3271        { .ctl_name = 0 }
3272};
3273
3274static struct ctl_table empty[1];
3275
3276static struct ctl_table ipv4_skeleton[] =
3277{
3278        { .procname = "route", .ctl_name = NET_IPV4_ROUTE,
3279          .mode = 0555, .child = ipv4_route_table},
3280        { .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
3281          .mode = 0555, .child = empty},
3282        { }
3283};
3284
3285static __net_initdata struct ctl_path ipv4_path[] = {
3286        { .procname = "net", .ctl_name = CTL_NET, },
3287        { .procname = "ipv4", .ctl_name = NET_IPV4, },
3288        { },
3289};
3290
3291static struct ctl_table ipv4_route_flush_table[] = {
3292        {
3293                .ctl_name       = NET_IPV4_ROUTE_FLUSH,
3294                .procname       = "flush",
3295                .maxlen         = sizeof(int),
3296                .mode           = 0200,
3297                .proc_handler   = ipv4_sysctl_rtcache_flush,
3298                .strategy       = ipv4_sysctl_rtcache_flush_strategy,
3299        },
3300        { .ctl_name = 0 },
3301};
3302
3303static __net_initdata struct ctl_path ipv4_route_path[] = {
3304        { .procname = "net", .ctl_name = CTL_NET, },
3305        { .procname = "ipv4", .ctl_name = NET_IPV4, },
3306        { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
3307        { },
3308};
3309
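/*
 * Register the per-namespace "flush" sysctl.  Namespaces other than
 * init_net get their own copy of the table so that extra1 can point at
 * the owning struct net.
 */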
3310static __net_init int sysctl_route_net_init(struct net *net)
3311{
3312        struct ctl_table *tbl;
3313
3314        tbl = ipv4_route_flush_table;
3315        if (net != &init_net) {
3316                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3317                if (tbl == NULL)
3318                        goto err_dup;
3319        }
3320        tbl[0].extra1 = net;
3321
3322        net->ipv4.route_hdr =
3323                register_net_sysctl_table(net, ipv4_route_path, tbl);
3324        if (net->ipv4.route_hdr == NULL)
3325                goto err_reg;
3326        return 0;
3327
3328err_reg:
3329        if (tbl != ipv4_route_flush_table)
3330                kfree(tbl);
3331err_dup:
3332        return -ENOMEM;
3333}
3334
3335static __net_exit void sysctl_route_net_exit(struct net *net)
3336{
3337        struct ctl_table *tbl;
3338
3339        tbl = net->ipv4.route_hdr->ctl_table_arg;
3340        unregister_net_sysctl_table(net->ipv4.route_hdr);
3341        BUG_ON(tbl == ipv4_route_flush_table);
3342        kfree(tbl);
3343}
3344
3345static __net_initdata struct pernet_operations sysctl_route_ops = {
3346        .init = sysctl_route_net_init,
3347        .exit = sysctl_route_net_exit,
3348};
3349#endif
3350
3351
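/*
 * Per-namespace setup: seed rt_genid and, if ip_rt_secret_interval is
 * non-zero, arm the deferrable timer that periodically rebuilds the
 * route hash secret (rt_secret_rebuild).
 */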
3352static __net_init int rt_secret_timer_init(struct net *net)
3353{
3354        atomic_set(&net->ipv4.rt_genid,
3355                        (int) ((num_physpages ^ (num_physpages>>8)) ^
3356                        (jiffies ^ (jiffies >> 7))));
3357
3358        net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3359        net->ipv4.rt_secret_timer.data = (unsigned long)net;
3360        init_timer_deferrable(&net->ipv4.rt_secret_timer);
3361
3362        if (ip_rt_secret_interval) {
3363                net->ipv4.rt_secret_timer.expires =
3364                        jiffies + net_random() % ip_rt_secret_interval +
3365                        ip_rt_secret_interval;
3366                add_timer(&net->ipv4.rt_secret_timer);
3367        }
3368        return 0;
3369}
3370
3371static __net_exit void rt_secret_timer_exit(struct net *net)
3372{
3373        del_timer_sync(&net->ipv4.rt_secret_timer);
3374}
3375
3376static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3377        .init = rt_secret_timer_init,
3378        .exit = rt_secret_timer_exit,
3379};
3380
3381
3382#ifdef CONFIG_NET_CLS_ROUTE
3383struct ip_rt_acct *ip_rt_acct __read_mostly;
3384#endif /* CONFIG_NET_CLS_ROUTE */
3385
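/*
 * "rhash_entries=" boot parameter: override the route cache hash table
 * size used by ip_rt_init() below.
 */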
3386static __initdata unsigned long rhash_entries;
3387static int __init set_rhash_entries(char *str)
3388{
3389        if (!str)
3390                return 0;
3391        rhash_entries = simple_strtoul(str, &str, 0);
3392        return 1;
3393}
3394__setup("rhash_entries=", set_rhash_entries);
3395
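/*
 * Boot-time initialization: allocate the dst slab cache and the route
 * hash table, initialize devinet and fib, start the periodic GC worker
 * and the secret timer, create the /proc entries and register the
 * RTM_GETROUTE handler and sysctls.
 */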
3396int __init ip_rt_init(void)
3397{
3398        int rc = 0;
3399
3400#ifdef CONFIG_NET_CLS_ROUTE
3401        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3402        if (!ip_rt_acct)
3403                panic("IP: failed to allocate ip_rt_acct\n");
3404#endif
3405
3406        ipv4_dst_ops.kmem_cachep =
3407                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3408                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3409
3410        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3411
3412        rt_hash_table = (struct rt_hash_bucket *)
3413                alloc_large_system_hash("IP route cache",
3414                                        sizeof(struct rt_hash_bucket),
3415                                        rhash_entries,
3416                                        (totalram_pages >= 128 * 1024) ?
3417                                        15 : 17,
3418                                        0,
3419                                        &rt_hash_log,
3420                                        &rt_hash_mask,
3421                                        rhash_entries ? 0 : 512 * 1024);
3422        memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3423        rt_hash_lock_init();
3424
3425        ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3426        ip_rt_max_size = (rt_hash_mask + 1) * 16;
3427
3428        devinet_init();
3429        ip_fib_init();
3430
3431        /* All the timers started at system startup tend
3432           to synchronize. Perturb them a bit.
3433         */
3434        INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3435        expires_ljiffies = jiffies;
3436        schedule_delayed_work(&expires_work,
3437                net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3438
3439        if (register_pernet_subsys(&rt_secret_timer_ops))
3440                printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3441
3442        if (ip_rt_proc_init())
3443                printk(KERN_ERR "Unable to create route proc files\n");
3444#ifdef CONFIG_XFRM
3445        xfrm_init();
3446        xfrm4_init(ip_rt_max_size);
3447#endif
3448        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3449
3450#ifdef CONFIG_SYSCTL
3451        register_pernet_subsys(&sysctl_route_ops);
3452#endif
3453        return rc;
3454}
3455
3456#ifdef CONFIG_SYSCTL
3457/*
3458 * We really need to sanitize the damn ipv4 init order; then all
3459 * this nonsense will go away.
3460 */
3461void __init ip_static_sysctl_init(void)
3462{
3463        register_sysctl_paths(ipv4_path, ipv4_skeleton);
3464}
3465#endif
3466
3467EXPORT_SYMBOL(__ip_select_ident);
3468EXPORT_SYMBOL(ip_route_input);
3469EXPORT_SYMBOL(ip_route_output_key);
3470