linux/net/ipv4/route.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              ROUTE - implementation of the IP router.
   7 *
   8 * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
   9 *
  10 * Authors:     Ross Biro
  11 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  13 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  14 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  15 *
  16 * Fixes:
  17 *              Alan Cox        :       Verify area fixes.
  18 *              Alan Cox        :       cli() protects routing changes
  19 *              Rui Oliveira    :       ICMP routing table updates
  20 *              (rco@di.uminho.pt)      Routing table insertion and update
  21 *              Linus Torvalds  :       Rewrote bits to be sensible
  22 *              Alan Cox        :       Added BSD route gw semantics
  23 *              Alan Cox        :       Super /proc >4K
  24 *              Alan Cox        :       MTU in route table
  25 *              Alan Cox        :       MSS actually. Also added the window
  26 *                                      clamper.
  27 *              Sam Lantinga    :       Fixed route matching in rt_del()
  28 *              Alan Cox        :       Routing cache support.
  29 *              Alan Cox        :       Removed compatibility cruft.
  30 *              Alan Cox        :       RTF_REJECT support.
  31 *              Alan Cox        :       TCP irtt support.
  32 *              Jonathan Naylor :       Added Metric support.
  33 *      Miquel van Smoorenburg  :       BSD API fixes.
  34 *      Miquel van Smoorenburg  :       Metrics.
  35 *              Alan Cox        :       Use __u32 properly
   36 *              Alan Cox        :       Aligned routing errors more closely with BSD;
  37 *                                      our system is still very different.
  38 *              Alan Cox        :       Faster /proc handling
  39 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  40 *                                      routing caches and better behaviour.
  41 *
  42 *              Olaf Erb        :       irtt wasn't being copied right.
  43 *              Bjorn Ekwall    :       Kerneld route support.
  44 *              Alan Cox        :       Multicast fixed (I hope)
  45 *              Pavel Krauz     :       Limited broadcast fixed
  46 *              Mike McLagan    :       Routing by source
  47 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  48 *                                      route.c and rewritten from scratch.
  49 *              Andi Kleen      :       Load-limit warning messages.
  50 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  51 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  52 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  53 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  54 *              Marc Boucher    :       routing by fwmark
  55 *      Robert Olsson           :       Added rt_cache statistics
  56 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  57 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  58 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  59 *      Ilia Sotnikov           :       Removed TOS from hash calculations
  60 *
  61 *              This program is free software; you can redistribute it and/or
  62 *              modify it under the terms of the GNU General Public License
  63 *              as published by the Free Software Foundation; either version
  64 *              2 of the License, or (at your option) any later version.
  65 */
  66
  67#include <linux/module.h>
  68#include <asm/uaccess.h>
  69#include <asm/system.h>
  70#include <linux/bitops.h>
  71#include <linux/types.h>
  72#include <linux/kernel.h>
  73#include <linux/mm.h>
  74#include <linux/bootmem.h>
  75#include <linux/string.h>
  76#include <linux/socket.h>
  77#include <linux/sockios.h>
  78#include <linux/errno.h>
  79#include <linux/in.h>
  80#include <linux/inet.h>
  81#include <linux/netdevice.h>
  82#include <linux/proc_fs.h>
  83#include <linux/init.h>
  84#include <linux/workqueue.h>
  85#include <linux/skbuff.h>
  86#include <linux/inetdevice.h>
  87#include <linux/igmp.h>
  88#include <linux/pkt_sched.h>
  89#include <linux/mroute.h>
  90#include <linux/netfilter_ipv4.h>
  91#include <linux/random.h>
  92#include <linux/jhash.h>
  93#include <linux/rcupdate.h>
  94#include <linux/times.h>
  95#include <net/net_namespace.h>
  96#include <net/protocol.h>
  97#include <net/ip.h>
  98#include <net/route.h>
  99#include <net/inetpeer.h>
 100#include <net/sock.h>
 101#include <net/ip_fib.h>
 102#include <net/arp.h>
 103#include <net/tcp.h>
 104#include <net/icmp.h>
 105#include <net/xfrm.h>
 106#include <net/netevent.h>
 107#include <net/rtnetlink.h>
 108#ifdef CONFIG_SYSCTL
 109#include <linux/sysctl.h>
 110#endif
 111
 112#define RT_FL_TOS(oldflp) \
 113    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 114
 115#define IP_MAX_MTU      0xFFF0
 116
 117#define RT_GC_TIMEOUT (300*HZ)
 118
 119static int ip_rt_min_delay              = 2 * HZ;
 120static int ip_rt_max_delay              = 10 * HZ;
 121static int ip_rt_max_size;
 122static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
 123static int ip_rt_gc_interval            = 60 * HZ;
 124static int ip_rt_gc_min_interval        = HZ / 2;
 125static int ip_rt_redirect_number        = 9;
 126static int ip_rt_redirect_load          = HZ / 50;
 127static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
 128static int ip_rt_error_cost             = HZ;
 129static int ip_rt_error_burst            = 5 * HZ;
 130static int ip_rt_gc_elasticity          = 8;
 131static int ip_rt_mtu_expires            = 10 * 60 * HZ;
 132static int ip_rt_min_pmtu               = 512 + 20 + 20;
 133static int ip_rt_min_advmss             = 256;
 134static int ip_rt_secret_interval        = 10 * 60 * HZ;
 135static unsigned long rt_deadline;
 136
 137#define RTprint(a...)   printk(KERN_DEBUG a)
 138
 139static struct timer_list rt_flush_timer;
 140static void rt_check_expire(struct work_struct *work);
 141static DECLARE_DELAYED_WORK(expires_work, rt_check_expire);
 142static struct timer_list rt_secret_timer;
 143
 144/*
 145 *      Interface to generic destination cache.
 146 */
 147
 148static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 149static void              ipv4_dst_destroy(struct dst_entry *dst);
 150static void              ipv4_dst_ifdown(struct dst_entry *dst,
 151                                         struct net_device *dev, int how);
 152static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 153static void              ipv4_link_failure(struct sk_buff *skb);
 154static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 155static int rt_garbage_collect(void);
 156
 157
 158static struct dst_ops ipv4_dst_ops = {
 159        .family =               AF_INET,
 160        .protocol =             __constant_htons(ETH_P_IP),
 161        .gc =                   rt_garbage_collect,
 162        .check =                ipv4_dst_check,
 163        .destroy =              ipv4_dst_destroy,
 164        .ifdown =               ipv4_dst_ifdown,
 165        .negative_advice =      ipv4_negative_advice,
 166        .link_failure =         ipv4_link_failure,
 167        .update_pmtu =          ip_rt_update_pmtu,
 168        .entry_size =           sizeof(struct rtable),
 169};
 170
 171#define ECN_OR_COST(class)      TC_PRIO_##class
 172
 173const __u8 ip_tos2prio[16] = {
 174        TC_PRIO_BESTEFFORT,
 175        ECN_OR_COST(FILLER),
 176        TC_PRIO_BESTEFFORT,
 177        ECN_OR_COST(BESTEFFORT),
 178        TC_PRIO_BULK,
 179        ECN_OR_COST(BULK),
 180        TC_PRIO_BULK,
 181        ECN_OR_COST(BULK),
 182        TC_PRIO_INTERACTIVE,
 183        ECN_OR_COST(INTERACTIVE),
 184        TC_PRIO_INTERACTIVE,
 185        ECN_OR_COST(INTERACTIVE),
 186        TC_PRIO_INTERACTIVE_BULK,
 187        ECN_OR_COST(INTERACTIVE_BULK),
 188        TC_PRIO_INTERACTIVE_BULK,
 189        ECN_OR_COST(INTERACTIVE_BULK)
 190};
 191
 192
 193/*
 194 * Route cache.
 195 */
 196
  197/* The locking scheme is rather straightforward:
 198 *
 199 * 1) Read-Copy Update protects the buckets of the central route hash.
 200 * 2) Only writers remove entries, and they hold the lock
 201 *    as they look at rtable reference counts.
 202 * 3) Only readers acquire references to rtable entries,
 203 *    they do so with atomic increments and with the
 204 *    lock held.
 205 */
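/*
 * A minimal sketch of the read side described above, for illustration only.
 * It assumes the caller has already computed a bucket index "hash" and a
 * flow key "key"; the real lookup paths later in this file follow the same
 * pattern.
 *
 *	struct rtable *rth;
 *
 *	rcu_read_lock();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->u.dst.rt_next)) {
 *		if (compare_keys(&rth->fl, &key)) {
 *			dst_hold(&rth->u.dst);	   <- atomic reference grab
 *			break;
 *		}
 *	}
 *	rcu_read_unlock();
 */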
 206
 207struct rt_hash_bucket {
 208        struct rtable   *chain;
 209};
 210#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 211        defined(CONFIG_PROVE_LOCKING)
 212/*
  213 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
  214 * The size of this table is a power of two and depends on the number of CPUs.
  215 * (with lockdep the spinlock_t is quite big, so keep the table small there)
 216 */
 217#ifdef CONFIG_LOCKDEP
 218# define RT_HASH_LOCK_SZ        256
 219#else
 220# if NR_CPUS >= 32
 221#  define RT_HASH_LOCK_SZ       4096
 222# elif NR_CPUS >= 16
 223#  define RT_HASH_LOCK_SZ       2048
 224# elif NR_CPUS >= 8
 225#  define RT_HASH_LOCK_SZ       1024
 226# elif NR_CPUS >= 4
 227#  define RT_HASH_LOCK_SZ       512
 228# else
 229#  define RT_HASH_LOCK_SZ       256
 230# endif
 231#endif
 232
 233static spinlock_t       *rt_hash_locks;
 234# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 235# define rt_hash_lock_init()    { \
 236                int i; \
 237                rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
 238                if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
 239                for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
 240                        spin_lock_init(&rt_hash_locks[i]); \
 241                }
 242#else
 243# define rt_hash_lock_addr(slot) NULL
 244# define rt_hash_lock_init()
 245#endif
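/*
 * Illustrative use of the lock table (a sketch, assuming a bucket index "i"
 * supplied by the caller): writers pair every chain modification with the
 * per-bucket lock,
 *
 *	spin_lock_bh(rt_hash_lock_addr(i));
 *	... unlink or insert entries in rt_hash_table[i].chain ...
 *	spin_unlock_bh(rt_hash_lock_addr(i));
 *
 * On builds where rt_hash_lock_addr() is NULL (no SMP, no lock debugging)
 * the spin_*_bh() calls still disable bottom halves, which is all the
 * exclusion needed there.
 */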
 246
 247static struct rt_hash_bucket    *rt_hash_table;
 248static unsigned                 rt_hash_mask;
 249static unsigned int             rt_hash_log;
 250static unsigned int             rt_hash_rnd;
 251
 252static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 253#define RT_CACHE_STAT_INC(field) \
 254        (__raw_get_cpu_var(rt_cache_stat).field++)
 255
 256static int rt_intern_hash(unsigned hash, struct rtable *rth,
 257                                struct rtable **res);
 258
 259static unsigned int rt_hash_code(u32 daddr, u32 saddr)
 260{
 261        return (jhash_2words(daddr, saddr, rt_hash_rnd)
 262                & rt_hash_mask);
 263}
 264
 265#define rt_hash(daddr, saddr, idx) \
 266        rt_hash_code((__force u32)(__be32)(daddr),\
 267                     (__force u32)(__be32)(saddr) ^ ((idx) << 5))
 268
 269#ifdef CONFIG_PROC_FS
 270struct rt_cache_iter_state {
 271        int bucket;
 272};
 273
 274static struct rtable *rt_cache_get_first(struct seq_file *seq)
 275{
 276        struct rtable *r = NULL;
 277        struct rt_cache_iter_state *st = seq->private;
 278
 279        for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 280                rcu_read_lock_bh();
 281                r = rt_hash_table[st->bucket].chain;
 282                if (r)
 283                        break;
 284                rcu_read_unlock_bh();
 285        }
 286        return rcu_dereference(r);
 287}
 288
 289static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
 290{
 291        struct rt_cache_iter_state *st = seq->private;
 292
 293        r = r->u.dst.rt_next;
 294        while (!r) {
 295                rcu_read_unlock_bh();
 296                if (--st->bucket < 0)
 297                        break;
 298                rcu_read_lock_bh();
 299                r = rt_hash_table[st->bucket].chain;
 300        }
 301        return rcu_dereference(r);
 302}
 303
 304static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 305{
 306        struct rtable *r = rt_cache_get_first(seq);
 307
 308        if (r)
 309                while (pos && (r = rt_cache_get_next(seq, r)))
 310                        --pos;
 311        return pos ? NULL : r;
 312}
 313
 314static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 315{
 316        return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 317}
 318
 319static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 320{
 321        struct rtable *r = NULL;
 322
 323        if (v == SEQ_START_TOKEN)
 324                r = rt_cache_get_first(seq);
 325        else
 326                r = rt_cache_get_next(seq, v);
 327        ++*pos;
 328        return r;
 329}
 330
 331static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 332{
 333        if (v && v != SEQ_START_TOKEN)
 334                rcu_read_unlock_bh();
 335}
 336
 337static int rt_cache_seq_show(struct seq_file *seq, void *v)
 338{
 339        if (v == SEQ_START_TOKEN)
 340                seq_printf(seq, "%-127s\n",
 341                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 342                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 343                           "HHUptod\tSpecDst");
 344        else {
 345                struct rtable *r = v;
 346                char temp[256];
 347
 348                sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 349                              "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
 350                        r->u.dst.dev ? r->u.dst.dev->name : "*",
 351                        (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 352                        r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 353                        r->u.dst.__use, 0, (unsigned long)r->rt_src,
 354                        (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 355                             (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 356                        dst_metric(&r->u.dst, RTAX_WINDOW),
 357                        (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 358                              dst_metric(&r->u.dst, RTAX_RTTVAR)),
 359                        r->fl.fl4_tos,
 360                        r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 361                        r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 362                                       dev_queue_xmit) : 0,
 363                        r->rt_spec_dst);
 364                seq_printf(seq, "%-127s\n", temp);
 365        }
 366        return 0;
 367}
 368
 369static const struct seq_operations rt_cache_seq_ops = {
 370        .start  = rt_cache_seq_start,
 371        .next   = rt_cache_seq_next,
 372        .stop   = rt_cache_seq_stop,
 373        .show   = rt_cache_seq_show,
 374};
 375
 376static int rt_cache_seq_open(struct inode *inode, struct file *file)
 377{
 378        return seq_open_private(file, &rt_cache_seq_ops,
 379                        sizeof(struct rt_cache_iter_state));
 380}
 381
 382static const struct file_operations rt_cache_seq_fops = {
 383        .owner   = THIS_MODULE,
 384        .open    = rt_cache_seq_open,
 385        .read    = seq_read,
 386        .llseek  = seq_lseek,
 387        .release = seq_release_private,
 388};
 389
 390
 391static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 392{
 393        int cpu;
 394
 395        if (*pos == 0)
 396                return SEQ_START_TOKEN;
 397
 398        for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
 399                if (!cpu_possible(cpu))
 400                        continue;
 401                *pos = cpu+1;
 402                return &per_cpu(rt_cache_stat, cpu);
 403        }
 404        return NULL;
 405}
 406
 407static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 408{
 409        int cpu;
 410
 411        for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
 412                if (!cpu_possible(cpu))
 413                        continue;
 414                *pos = cpu+1;
 415                return &per_cpu(rt_cache_stat, cpu);
 416        }
 417        return NULL;
 418
 419}
 420
 421static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 422{
 423
 424}
 425
 426static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 427{
 428        struct rt_cache_stat *st = v;
 429
 430        if (v == SEQ_START_TOKEN) {
 431                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 432                return 0;
 433        }
 434
 435        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 436                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 437                   atomic_read(&ipv4_dst_ops.entries),
 438                   st->in_hit,
 439                   st->in_slow_tot,
 440                   st->in_slow_mc,
 441                   st->in_no_route,
 442                   st->in_brd,
 443                   st->in_martian_dst,
 444                   st->in_martian_src,
 445
 446                   st->out_hit,
 447                   st->out_slow_tot,
 448                   st->out_slow_mc,
 449
 450                   st->gc_total,
 451                   st->gc_ignored,
 452                   st->gc_goal_miss,
 453                   st->gc_dst_overflow,
 454                   st->in_hlist_search,
 455                   st->out_hlist_search
 456                );
 457        return 0;
 458}
 459
 460static const struct seq_operations rt_cpu_seq_ops = {
 461        .start  = rt_cpu_seq_start,
 462        .next   = rt_cpu_seq_next,
 463        .stop   = rt_cpu_seq_stop,
 464        .show   = rt_cpu_seq_show,
 465};
 466
 467
 468static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 469{
 470        return seq_open(file, &rt_cpu_seq_ops);
 471}
 472
 473static const struct file_operations rt_cpu_seq_fops = {
 474        .owner   = THIS_MODULE,
 475        .open    = rt_cpu_seq_open,
 476        .read    = seq_read,
 477        .llseek  = seq_lseek,
 478        .release = seq_release,
 479};
 480
 481#endif /* CONFIG_PROC_FS */
 482
 483static __inline__ void rt_free(struct rtable *rt)
 484{
 485        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 486}
 487
 488static __inline__ void rt_drop(struct rtable *rt)
 489{
 490        ip_rt_put(rt);
 491        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 492}
 493
 494static __inline__ int rt_fast_clean(struct rtable *rth)
 495{
  496        /* Kill broadcast/multicast entries very aggressively if they
  497           collide in the hash table with more useful entries */
 498        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 499                rth->fl.iif && rth->u.dst.rt_next;
 500}
 501
 502static __inline__ int rt_valuable(struct rtable *rth)
 503{
 504        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 505                rth->u.dst.expires;
 506}
 507
 508static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 509{
 510        unsigned long age;
 511        int ret = 0;
 512
 513        if (atomic_read(&rth->u.dst.__refcnt))
 514                goto out;
 515
 516        ret = 1;
 517        if (rth->u.dst.expires &&
 518            time_after_eq(jiffies, rth->u.dst.expires))
 519                goto out;
 520
 521        age = jiffies - rth->u.dst.lastuse;
 522        ret = 0;
 523        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 524            (age <= tmo2 && rt_valuable(rth)))
 525                goto out;
 526        ret = 1;
 527out:    return ret;
 528}
 529
 530/* Bits of score are:
 531 * 31: very valuable
 532 * 30: not quite useless
  533 * 29..0: inverted age (fresher entries score higher)
 534 */
 535static inline u32 rt_score(struct rtable *rt)
 536{
 537        u32 score = jiffies - rt->u.dst.lastuse;
 538
 539        score = ~score & ~(3<<30);
 540
 541        if (rt_valuable(rt))
 542                score |= (1<<31);
 543
 544        if (!rt->fl.iif ||
 545            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 546                score |= (1<<30);
 547
 548        return score;
 549}
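/*
 * Worked example (illustrative): for an unreferenced output route last used
 * 100 jiffies ago, score = ~100 & ~(3<<30) = 0x3fffff9b, and bit 30 is set
 * because fl.iif == 0, giving 0x7fffff9b.  An input multicast entry of the
 * same age only scores 0x3fffff9b, so rt_intern_hash() below prefers it as
 * the eviction candidate (it keeps the lowest-scoring unreferenced entry).
 */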
 550
 551static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 552{
 553        return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 554                (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
 555                (fl1->mark ^ fl2->mark) |
 556                (*(u16 *)&fl1->nl_u.ip4_u.tos ^
 557                 *(u16 *)&fl2->nl_u.ip4_u.tos) |
 558                (fl1->oif ^ fl2->oif) |
 559                (fl1->iif ^ fl2->iif)) == 0;
 560}
 561
 562static void rt_check_expire(struct work_struct *work)
 563{
 564        static unsigned int rover;
 565        unsigned int i = rover, goal;
 566        struct rtable *rth, **rthp;
 567        u64 mult;
 568
 569        mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
 570        if (ip_rt_gc_timeout > 1)
 571                do_div(mult, ip_rt_gc_timeout);
 572        goal = (unsigned int)mult;
 573        if (goal > rt_hash_mask)
 574                goal = rt_hash_mask + 1;
 575        for (; goal > 0; goal--) {
 576                unsigned long tmo = ip_rt_gc_timeout;
 577
 578                i = (i + 1) & rt_hash_mask;
 579                rthp = &rt_hash_table[i].chain;
 580
 581                if (need_resched())
 582                        cond_resched();
 583
 584                if (*rthp == NULL)
 585                        continue;
 586                spin_lock_bh(rt_hash_lock_addr(i));
 587                while ((rth = *rthp) != NULL) {
 588                        if (rth->u.dst.expires) {
 589                                /* Entry is expired even if it is in use */
 590                                if (time_before_eq(jiffies, rth->u.dst.expires)) {
 591                                        tmo >>= 1;
 592                                        rthp = &rth->u.dst.rt_next;
 593                                        continue;
 594                                }
 595                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 596                                tmo >>= 1;
 597                                rthp = &rth->u.dst.rt_next;
 598                                continue;
 599                        }
 600
 601                        /* Cleanup aged off entries. */
 602                        *rthp = rth->u.dst.rt_next;
 603                        rt_free(rth);
 604                }
 605                spin_unlock_bh(rt_hash_lock_addr(i));
 606        }
 607        rover = i;
 608        schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 609}
 610
 611/* This can run from both BH and non-BH contexts, the latter
 612 * in the case of a forced flush event.
 613 */
 614static void rt_run_flush(unsigned long dummy)
 615{
 616        int i;
 617        struct rtable *rth, *next;
 618
 619        rt_deadline = 0;
 620
 621        get_random_bytes(&rt_hash_rnd, 4);
 622
 623        for (i = rt_hash_mask; i >= 0; i--) {
 624                spin_lock_bh(rt_hash_lock_addr(i));
 625                rth = rt_hash_table[i].chain;
 626                if (rth)
 627                        rt_hash_table[i].chain = NULL;
 628                spin_unlock_bh(rt_hash_lock_addr(i));
 629
 630                for (; rth; rth = next) {
 631                        next = rth->u.dst.rt_next;
 632                        rt_free(rth);
 633                }
 634        }
 635}
 636
 637static DEFINE_SPINLOCK(rt_flush_lock);
 638
 639void rt_cache_flush(int delay)
 640{
 641        unsigned long now = jiffies;
 642        int user_mode = !in_softirq();
 643
 644        if (delay < 0)
 645                delay = ip_rt_min_delay;
 646
 647        spin_lock_bh(&rt_flush_lock);
 648
 649        if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
 650                long tmo = (long)(rt_deadline - now);
 651
 652                /* If flush timer is already running
 653                   and flush request is not immediate (delay > 0):
 654
  655                   if the deadline has not been reached, extend the timer to "delay",
  656                   otherwise fire it at the deadline.
 657                 */
 658
 659                if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
 660                        tmo = 0;
 661
 662                if (delay > tmo)
 663                        delay = tmo;
 664        }
 665
 666        if (delay <= 0) {
 667                spin_unlock_bh(&rt_flush_lock);
 668                rt_run_flush(0);
 669                return;
 670        }
 671
 672        if (rt_deadline == 0)
 673                rt_deadline = now + ip_rt_max_delay;
 674
 675        mod_timer(&rt_flush_timer, now+delay);
 676        spin_unlock_bh(&rt_flush_lock);
 677}
 678
 679static void rt_secret_rebuild(unsigned long dummy)
 680{
 681        unsigned long now = jiffies;
 682
 683        rt_cache_flush(0);
 684        mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
 685}
 686
 687/*
 688   Short description of GC goals.
 689
  690   We want an algorithm that keeps the routing cache at an equilibrium
  691   point, where the number of entries aged off is approximately equal
  692   to the number of newly generated ones.
  693
  694   The current expiration strength is the variable "expire".
  695   We try to adjust it dynamically, so that when the network is idle
  696   "expire" is large enough to keep plenty of warm entries, and when
  697   load increases it shrinks to limit the cache size.
 698 */
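/*
 * Worked example (illustrative, using the defaults above and assuming a
 * hash table of 2^16 buckets): the soft limit is
 * ip_rt_gc_elasticity << rt_hash_log = 8 * 65536 = 524288 entries.  Only
 * when ipv4_dst_ops.entries exceeds that does rt_garbage_collect() enter
 * its "dangerous area" branch, aiming to free at least half of the excess
 * (and at least as many entries as there are buckets) per run, halving
 * "expire" on every pass that misses its goal.
 */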
 699
 700static int rt_garbage_collect(void)
 701{
 702        static unsigned long expire = RT_GC_TIMEOUT;
 703        static unsigned long last_gc;
 704        static int rover;
 705        static int equilibrium;
 706        struct rtable *rth, **rthp;
 707        unsigned long now = jiffies;
 708        int goal;
 709
 710        /*
 711         * Garbage collection is pretty expensive,
  712         * do not run it too frequently.
 713         */
 714
 715        RT_CACHE_STAT_INC(gc_total);
 716
 717        if (now - last_gc < ip_rt_gc_min_interval &&
 718            atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 719                RT_CACHE_STAT_INC(gc_ignored);
 720                goto out;
 721        }
 722
  723        /* Calculate the number of entries we want to expire now. */
 724        goal = atomic_read(&ipv4_dst_ops.entries) -
 725                (ip_rt_gc_elasticity << rt_hash_log);
 726        if (goal <= 0) {
 727                if (equilibrium < ipv4_dst_ops.gc_thresh)
 728                        equilibrium = ipv4_dst_ops.gc_thresh;
 729                goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 730                if (goal > 0) {
 731                        equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
 732                        goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 733                }
 734        } else {
  735                /* We are in a dangerous area. Try to reduce the cache really
 736                 * aggressively.
 737                 */
 738                goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
 739                equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 740        }
 741
 742        if (now - last_gc >= ip_rt_gc_min_interval)
 743                last_gc = now;
 744
 745        if (goal <= 0) {
 746                equilibrium += goal;
 747                goto work_done;
 748        }
 749
 750        do {
 751                int i, k;
 752
 753                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 754                        unsigned long tmo = expire;
 755
 756                        k = (k + 1) & rt_hash_mask;
 757                        rthp = &rt_hash_table[k].chain;
 758                        spin_lock_bh(rt_hash_lock_addr(k));
 759                        while ((rth = *rthp) != NULL) {
 760                                if (!rt_may_expire(rth, tmo, expire)) {
 761                                        tmo >>= 1;
 762                                        rthp = &rth->u.dst.rt_next;
 763                                        continue;
 764                                }
 765                                *rthp = rth->u.dst.rt_next;
 766                                rt_free(rth);
 767                                goal--;
 768                        }
 769                        spin_unlock_bh(rt_hash_lock_addr(k));
 770                        if (goal <= 0)
 771                                break;
 772                }
 773                rover = k;
 774
 775                if (goal <= 0)
 776                        goto work_done;
 777
  778                /* The goal was not achieved. We stop the process if:

  780                   - expire has been reduced to zero (otherwise expire is halved);
  781                   - the table is not full;
  782                   - we are called from softirq context;
  783                   - the jiffies check is just a fallback/debug loop breaker;
  784                     we will not spin here for a long time in any case.
  785                 */
 786
 787                RT_CACHE_STAT_INC(gc_goal_miss);
 788
 789                if (expire == 0)
 790                        break;
 791
 792                expire >>= 1;
 793#if RT_CACHE_DEBUG >= 2
 794                printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
 795                                atomic_read(&ipv4_dst_ops.entries), goal, i);
 796#endif
 797
 798                if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 799                        goto out;
 800        } while (!in_softirq() && time_before_eq(jiffies, now));
 801
 802        if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 803                goto out;
 804        if (net_ratelimit())
 805                printk(KERN_WARNING "dst cache overflow\n");
 806        RT_CACHE_STAT_INC(gc_dst_overflow);
 807        return 1;
 808
 809work_done:
 810        expire += ip_rt_gc_min_interval;
 811        if (expire > ip_rt_gc_timeout ||
 812            atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
 813                expire = ip_rt_gc_timeout;
 814#if RT_CACHE_DEBUG >= 2
 815        printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
 816                        atomic_read(&ipv4_dst_ops.entries), goal, rover);
 817#endif
 818out:    return 0;
 819}
 820
 821static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 822{
 823        struct rtable   *rth, **rthp;
 824        unsigned long   now;
 825        struct rtable *cand, **candp;
 826        u32             min_score;
 827        int             chain_length;
 828        int attempts = !in_softirq();
 829
 830restart:
 831        chain_length = 0;
 832        min_score = ~(u32)0;
 833        cand = NULL;
 834        candp = NULL;
 835        now = jiffies;
 836
 837        rthp = &rt_hash_table[hash].chain;
 838
 839        spin_lock_bh(rt_hash_lock_addr(hash));
 840        while ((rth = *rthp) != NULL) {
 841                if (compare_keys(&rth->fl, &rt->fl)) {
 842                        /* Put it first */
 843                        *rthp = rth->u.dst.rt_next;
 844                        /*
 845                         * Since lookup is lockfree, the deletion
 846                         * must be visible to another weakly ordered CPU before
 847                         * the insertion at the start of the hash chain.
 848                         */
 849                        rcu_assign_pointer(rth->u.dst.rt_next,
 850                                           rt_hash_table[hash].chain);
 851                        /*
 852                         * Since lookup is lockfree, the update writes
 853                         * must be ordered for consistency on SMP.
 854                         */
 855                        rcu_assign_pointer(rt_hash_table[hash].chain, rth);
 856
 857                        dst_use(&rth->u.dst, now);
 858                        spin_unlock_bh(rt_hash_lock_addr(hash));
 859
 860                        rt_drop(rt);
 861                        *rp = rth;
 862                        return 0;
 863                }
 864
 865                if (!atomic_read(&rth->u.dst.__refcnt)) {
 866                        u32 score = rt_score(rth);
 867
 868                        if (score <= min_score) {
 869                                cand = rth;
 870                                candp = rthp;
 871                                min_score = score;
 872                        }
 873                }
 874
 875                chain_length++;
 876
 877                rthp = &rth->u.dst.rt_next;
 878        }
 879
 880        if (cand) {
  881                /* ip_rt_gc_elasticity used to be the average chain
  882                 * length; when it is exceeded, gc becomes really aggressive.
 883                 *
 884                 * The second limit is less certain. At the moment it allows
 885                 * only 2 entries per bucket. We will see.
 886                 */
 887                if (chain_length > ip_rt_gc_elasticity) {
 888                        *candp = cand->u.dst.rt_next;
 889                        rt_free(cand);
 890                }
 891        }
 892
  893        /* Try to bind the route to an ARP neighbour only if it is an
  894           output route or on the unicast forwarding path.
  895         */
 896        if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
 897                int err = arp_bind_neighbour(&rt->u.dst);
 898                if (err) {
 899                        spin_unlock_bh(rt_hash_lock_addr(hash));
 900
 901                        if (err != -ENOBUFS) {
 902                                rt_drop(rt);
 903                                return err;
 904                        }
 905
  906                        /* The neighbour tables are full and nothing
  907                           can be released. Try to shrink the route cache;
  908                           it most likely holds some neighbour records.
  909                         */
 910                        if (attempts-- > 0) {
 911                                int saved_elasticity = ip_rt_gc_elasticity;
 912                                int saved_int = ip_rt_gc_min_interval;
 913                                ip_rt_gc_elasticity     = 1;
 914                                ip_rt_gc_min_interval   = 0;
 915                                rt_garbage_collect();
 916                                ip_rt_gc_min_interval   = saved_int;
 917                                ip_rt_gc_elasticity     = saved_elasticity;
 918                                goto restart;
 919                        }
 920
 921                        if (net_ratelimit())
 922                                printk(KERN_WARNING "Neighbour table overflow.\n");
 923                        rt_drop(rt);
 924                        return -ENOBUFS;
 925                }
 926        }
 927
 928        rt->u.dst.rt_next = rt_hash_table[hash].chain;
 929#if RT_CACHE_DEBUG >= 2
 930        if (rt->u.dst.rt_next) {
 931                struct rtable *trt;
 932                printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
 933                       NIPQUAD(rt->rt_dst));
 934                for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
 935                        printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
 936                printk("\n");
 937        }
 938#endif
 939        rt_hash_table[hash].chain = rt;
 940        spin_unlock_bh(rt_hash_lock_addr(hash));
 941        *rp = rt;
 942        return 0;
 943}
 944
 945void rt_bind_peer(struct rtable *rt, int create)
 946{
 947        static DEFINE_SPINLOCK(rt_peer_lock);
 948        struct inet_peer *peer;
 949
 950        peer = inet_getpeer(rt->rt_dst, create);
 951
 952        spin_lock_bh(&rt_peer_lock);
 953        if (rt->peer == NULL) {
 954                rt->peer = peer;
 955                peer = NULL;
 956        }
 957        spin_unlock_bh(&rt_peer_lock);
 958        if (peer)
 959                inet_putpeer(peer);
 960}
 961
 962/*
  963 * Peer allocation may fail only in serious out-of-memory conditions.  However
  964 * we can still generate some output.
  965 * Random ID selection looks a bit dangerous because we have no chance of
  966 * selecting an ID that stays unique for a reasonable period of time.
  967 * But a broken packet identifier is better than no packet at all.
 968 */
 969static void ip_select_fb_ident(struct iphdr *iph)
 970{
 971        static DEFINE_SPINLOCK(ip_fb_id_lock);
 972        static u32 ip_fallback_id;
 973        u32 salt;
 974
 975        spin_lock_bh(&ip_fb_id_lock);
 976        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
 977        iph->id = htons(salt & 0xFFFF);
 978        ip_fallback_id = salt;
 979        spin_unlock_bh(&ip_fb_id_lock);
 980}
 981
 982void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 983{
 984        struct rtable *rt = (struct rtable *) dst;
 985
 986        if (rt) {
 987                if (rt->peer == NULL)
 988                        rt_bind_peer(rt, 1);
 989
  990                /* Once a peer is attached to a destination it is never detached,
  991                   so we need not grab a lock to dereference it.
  992                 */
 993                if (rt->peer) {
 994                        iph->id = htons(inet_getid(rt->peer, more));
 995                        return;
 996                }
 997        } else
 998                printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
 999                       __builtin_return_address(0));
1000
1001        ip_select_fb_ident(iph);
1002}
1003
1004static void rt_del(unsigned hash, struct rtable *rt)
1005{
1006        struct rtable **rthp;
1007
1008        spin_lock_bh(rt_hash_lock_addr(hash));
1009        ip_rt_put(rt);
1010        for (rthp = &rt_hash_table[hash].chain; *rthp;
1011             rthp = &(*rthp)->u.dst.rt_next)
1012                if (*rthp == rt) {
1013                        *rthp = rt->u.dst.rt_next;
1014                        rt_free(rt);
1015                        break;
1016                }
1017        spin_unlock_bh(rt_hash_lock_addr(hash));
1018}
1019
1020void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1021                    __be32 saddr, struct net_device *dev)
1022{
1023        int i, k;
1024        struct in_device *in_dev = in_dev_get(dev);
1025        struct rtable *rth, **rthp;
1026        __be32  skeys[2] = { saddr, 0 };
1027        int  ikeys[2] = { dev->ifindex, 0 };
1028        struct netevent_redirect netevent;
1029
1030        if (!in_dev)
1031                return;
1032
1033        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1034            || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1035                goto reject_redirect;
1036
1037        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1038                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1039                        goto reject_redirect;
1040                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1041                        goto reject_redirect;
1042        } else {
1043                if (inet_addr_type(new_gw) != RTN_UNICAST)
1044                        goto reject_redirect;
1045        }
1046
1047        for (i = 0; i < 2; i++) {
1048                for (k = 0; k < 2; k++) {
1049                        unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1050
1051                        rthp=&rt_hash_table[hash].chain;
1052
1053                        rcu_read_lock();
1054                        while ((rth = rcu_dereference(*rthp)) != NULL) {
1055                                struct rtable *rt;
1056
1057                                if (rth->fl.fl4_dst != daddr ||
1058                                    rth->fl.fl4_src != skeys[i] ||
1059                                    rth->fl.oif != ikeys[k] ||
1060                                    rth->fl.iif != 0) {
1061                                        rthp = &rth->u.dst.rt_next;
1062                                        continue;
1063                                }
1064
1065                                if (rth->rt_dst != daddr ||
1066                                    rth->rt_src != saddr ||
1067                                    rth->u.dst.error ||
1068                                    rth->rt_gateway != old_gw ||
1069                                    rth->u.dst.dev != dev)
1070                                        break;
1071
1072                                dst_hold(&rth->u.dst);
1073                                rcu_read_unlock();
1074
1075                                rt = dst_alloc(&ipv4_dst_ops);
1076                                if (rt == NULL) {
1077                                        ip_rt_put(rth);
1078                                        in_dev_put(in_dev);
1079                                        return;
1080                                }
1081
1082                                /* Copy all the information. */
1083                                *rt = *rth;
1084                                INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1085                                rt->u.dst.__use         = 1;
1086                                atomic_set(&rt->u.dst.__refcnt, 1);
1087                                rt->u.dst.child         = NULL;
1088                                if (rt->u.dst.dev)
1089                                        dev_hold(rt->u.dst.dev);
1090                                if (rt->idev)
1091                                        in_dev_hold(rt->idev);
1092                                rt->u.dst.obsolete      = 0;
1093                                rt->u.dst.lastuse       = jiffies;
1094                                rt->u.dst.path          = &rt->u.dst;
1095                                rt->u.dst.neighbour     = NULL;
1096                                rt->u.dst.hh            = NULL;
1097                                rt->u.dst.xfrm          = NULL;
1098
1099                                rt->rt_flags            |= RTCF_REDIRECTED;
1100
1101                                /* Gateway is different ... */
1102                                rt->rt_gateway          = new_gw;
1103
1104                                /* Redirect received -> path was valid */
1105                                dst_confirm(&rth->u.dst);
1106
1107                                if (rt->peer)
1108                                        atomic_inc(&rt->peer->refcnt);
1109
1110                                if (arp_bind_neighbour(&rt->u.dst) ||
1111                                    !(rt->u.dst.neighbour->nud_state &
1112                                            NUD_VALID)) {
1113                                        if (rt->u.dst.neighbour)
1114                                                neigh_event_send(rt->u.dst.neighbour, NULL);
1115                                        ip_rt_put(rth);
1116                                        rt_drop(rt);
1117                                        goto do_next;
1118                                }
1119
1120                                netevent.old = &rth->u.dst;
1121                                netevent.new = &rt->u.dst;
1122                                call_netevent_notifiers(NETEVENT_REDIRECT,
1123                                                        &netevent);
1124
1125                                rt_del(hash, rth);
1126                                if (!rt_intern_hash(hash, rt, &rt))
1127                                        ip_rt_put(rt);
1128                                goto do_next;
1129                        }
1130                        rcu_read_unlock();
1131                do_next:
1132                        ;
1133                }
1134        }
1135        in_dev_put(in_dev);
1136        return;
1137
1138reject_redirect:
1139#ifdef CONFIG_IP_ROUTE_VERBOSE
1140        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1141                printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1142                        "%u.%u.%u.%u ignored.\n"
1143                        "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1144                       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1145                       NIPQUAD(saddr), NIPQUAD(daddr));
1146#endif
1147        in_dev_put(in_dev);
1148}
1149
1150static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1151{
1152        struct rtable *rt = (struct rtable*)dst;
1153        struct dst_entry *ret = dst;
1154
1155        if (rt) {
1156                if (dst->obsolete) {
1157                        ip_rt_put(rt);
1158                        ret = NULL;
1159                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1160                           rt->u.dst.expires) {
1161                        unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1162                                                rt->fl.oif);
1163#if RT_CACHE_DEBUG >= 1
1164                        printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1165                                          "%u.%u.%u.%u/%02x dropped\n",
1166                                NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1167#endif
1168                        rt_del(hash, rt);
1169                        ret = NULL;
1170                }
1171        }
1172        return ret;
1173}
1174
1175/*
1176 * Algorithm:
1177 *      1. The first ip_rt_redirect_number redirects are sent
1178 *         with exponential backoff, then we stop sending them at all,
1179 *         assuming that the host ignores our redirects.
1180 *      2. If we did not see packets requiring redirects
1181 *         during ip_rt_redirect_silence, we assume that the host
 1182 *         forgot the redirected route and start sending redirects again.
1183 *
1184 * This algorithm is much cheaper and more intelligent than dumb load limiting
1185 * in icmp.c.
1186 *
1187 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1188 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1189 */
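/*
 * Worked example (illustrative, assuming HZ = 1000 and the defaults above):
 * the load unit is ip_rt_redirect_load = HZ/50 = 20 jiffies.  After the
 * first redirect, the (k+1)-th is sent only once 20 << k jiffies have passed
 * since the previous one: 40 ms, 80 ms, ... up to 5.12 s before the ninth.
 * After ip_rt_redirect_number (9) unanswered redirects we go silent until
 * ip_rt_redirect_silence = 20 << 10 = 20480 jiffies (~20 s) pass without
 * packets that would need a redirect.
 */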
1190
1191void ip_rt_send_redirect(struct sk_buff *skb)
1192{
1193        struct rtable *rt = (struct rtable*)skb->dst;
1194        struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1195
1196        if (!in_dev)
1197                return;
1198
1199        if (!IN_DEV_TX_REDIRECTS(in_dev))
1200                goto out;
1201
1202        /* No redirected packets during ip_rt_redirect_silence;
1203         * reset the algorithm.
1204         */
1205        if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1206                rt->u.dst.rate_tokens = 0;
1207
 1208        /* Too many ignored redirects; do not send anything.
 1209         * Just set u.dst.rate_last to the last seen redirected packet.
1210         */
1211        if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1212                rt->u.dst.rate_last = jiffies;
1213                goto out;
1214        }
1215
1216        /* Check for load limit; set rate_last to the latest sent
1217         * redirect.
1218         */
1219        if (rt->u.dst.rate_tokens == 0 ||
1220            time_after(jiffies,
1221                       (rt->u.dst.rate_last +
1222                        (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1223                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1224                rt->u.dst.rate_last = jiffies;
1225                ++rt->u.dst.rate_tokens;
1226#ifdef CONFIG_IP_ROUTE_VERBOSE
1227                if (IN_DEV_LOG_MARTIANS(in_dev) &&
1228                    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1229                    net_ratelimit())
1230                        printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1231                                "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1232                                NIPQUAD(rt->rt_src), rt->rt_iif,
1233                                NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1234#endif
1235        }
1236out:
1237        in_dev_put(in_dev);
1238}
1239
1240static int ip_error(struct sk_buff *skb)
1241{
1242        struct rtable *rt = (struct rtable*)skb->dst;
1243        unsigned long now;
1244        int code;
1245
1246        switch (rt->u.dst.error) {
1247                case EINVAL:
1248                default:
1249                        goto out;
1250                case EHOSTUNREACH:
1251                        code = ICMP_HOST_UNREACH;
1252                        break;
1253                case ENETUNREACH:
1254                        code = ICMP_NET_UNREACH;
1255                        IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1256                        break;
1257                case EACCES:
1258                        code = ICMP_PKT_FILTERED;
1259                        break;
1260        }
1261
1262        now = jiffies;
1263        rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1264        if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1265                rt->u.dst.rate_tokens = ip_rt_error_burst;
1266        rt->u.dst.rate_last = now;
1267        if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1268                rt->u.dst.rate_tokens -= ip_rt_error_cost;
1269                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1270        }
1271
1272out:    kfree_skb(skb);
1273        return 0;
1274}
1275
1276/*
1277 *      The last two values are not from the RFC but
1278 *      are needed for AMPRnet AX.25 paths.
1279 */
1280
1281static const unsigned short mtu_plateau[] =
1282{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1283
1284static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1285{
1286        int i;
1287
1288        for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1289                if (old_mtu > mtu_plateau[i])
1290                        return mtu_plateau[i];
1291        return 68;
1292}
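/*
 * Example (illustrative): guess_mtu(1500) steps down to the next plateau,
 * 1492; guess_mtu(576) returns 296; anything at or below the smallest
 * plateau (128) falls back to 68, the minimum IPv4 MTU.
 */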
1293
1294unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1295{
1296        int i;
1297        unsigned short old_mtu = ntohs(iph->tot_len);
1298        struct rtable *rth;
1299        __be32  skeys[2] = { iph->saddr, 0, };
1300        __be32  daddr = iph->daddr;
1301        unsigned short est_mtu = 0;
1302
1303        if (ipv4_config.no_pmtu_disc)
1304                return 0;
1305
1306        for (i = 0; i < 2; i++) {
1307                unsigned hash = rt_hash(daddr, skeys[i], 0);
1308
1309                rcu_read_lock();
1310                for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1311                     rth = rcu_dereference(rth->u.dst.rt_next)) {
1312                        if (rth->fl.fl4_dst == daddr &&
1313                            rth->fl.fl4_src == skeys[i] &&
1314                            rth->rt_dst  == daddr &&
1315                            rth->rt_src  == iph->saddr &&
1316                            rth->fl.iif == 0 &&
1317                            !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1318                                unsigned short mtu = new_mtu;
1319
1320                                if (new_mtu < 68 || new_mtu >= old_mtu) {
1321
1322                                        /* BSD 4.2 compatibility hack :-( */
1323                                        if (mtu == 0 &&
1324                                            old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1325                                            old_mtu >= 68 + (iph->ihl << 2))
1326                                                old_mtu -= iph->ihl << 2;
1327
1328                                        mtu = guess_mtu(old_mtu);
1329                                }
1330                                if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1331                                        if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1332                                                dst_confirm(&rth->u.dst);
1333                                                if (mtu < ip_rt_min_pmtu) {
1334                                                        mtu = ip_rt_min_pmtu;
1335                                                        rth->u.dst.metrics[RTAX_LOCK-1] |=
1336                                                                (1 << RTAX_MTU);
1337                                                }
1338                                                rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1339                                                dst_set_expires(&rth->u.dst,
1340                                                        ip_rt_mtu_expires);
1341                                        }
1342                                        est_mtu = mtu;
1343                                }
1344                        }
1345                }
1346                rcu_read_unlock();
1347        }
1348        return est_mtu ? : new_mtu;
1349}
1350
1351static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1352{
1353        if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1354            !(dst_metric_locked(dst, RTAX_MTU))) {
1355                if (mtu < ip_rt_min_pmtu) {
1356                        mtu = ip_rt_min_pmtu;
1357                        dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1358                }
1359                dst->metrics[RTAX_MTU-1] = mtu;
1360                dst_set_expires(dst, ip_rt_mtu_expires);
1361                call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1362        }
1363}
1364
1365static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1366{
1367        return NULL;
1368}
1369
1370static void ipv4_dst_destroy(struct dst_entry *dst)
1371{
1372        struct rtable *rt = (struct rtable *) dst;
1373        struct inet_peer *peer = rt->peer;
1374        struct in_device *idev = rt->idev;
1375
1376        if (peer) {
1377                rt->peer = NULL;
1378                inet_putpeer(peer);
1379        }
1380
1381        if (idev) {
1382                rt->idev = NULL;
1383                in_dev_put(idev);
1384        }
1385}
1386
1387static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1388                            int how)
1389{
1390        struct rtable *rt = (struct rtable *) dst;
1391        struct in_device *idev = rt->idev;
1392        if (dev != init_net.loopback_dev && idev && idev->dev == dev) {
1393                struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev);
1394                if (loopback_idev) {
1395                        rt->idev = loopback_idev;
1396                        in_dev_put(idev);
1397                }
1398        }
1399}
1400
1401static void ipv4_link_failure(struct sk_buff *skb)
1402{
1403        struct rtable *rt;
1404
1405        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1406
1407        rt = (struct rtable *) skb->dst;
1408        if (rt)
1409                dst_set_expires(&rt->u.dst, 0);
1410}
1411
1412static int ip_rt_bug(struct sk_buff *skb)
1413{
1414        printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1415                NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1416                skb->dev ? skb->dev->name : "?");
1417        kfree_skb(skb);
1418        return 0;
1419}
1420
1421/*
 1422   We do not cache the source address of the outgoing interface,
 1423   because it is used only by the IP RR, TS and SRR options,
 1424   so it is out of the fast path.
 1425
 1426   BTW remember: "addr" is allowed to be unaligned
 1427   in IP options!
1428 */
1429
1430void ip_rt_get_source(u8 *addr, struct rtable *rt)
1431{
1432        __be32 src;
1433        struct fib_result res;
1434
1435        if (rt->fl.iif == 0)
1436                src = rt->rt_src;
1437        else if (fib_lookup(&rt->fl, &res) == 0) {
1438                src = FIB_RES_PREFSRC(res);
1439                fib_res_put(&res);
1440        } else
1441                src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1442                                        RT_SCOPE_UNIVERSE);
1443        memcpy(addr, &src, 4);
1444}
1445
1446#ifdef CONFIG_NET_CLS_ROUTE
1447static void set_class_tag(struct rtable *rt, u32 tag)
1448{
1449        if (!(rt->u.dst.tclassid & 0xFFFF))
1450                rt->u.dst.tclassid |= tag & 0xFFFF;
1451        if (!(rt->u.dst.tclassid & 0xFFFF0000))
1452                rt->u.dst.tclassid |= tag & 0xFFFF0000;
1453}
1454#endif
1455
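    /*
     * Fill in the nexthop-derived fields of a new cache entry: copy the FIB
     * metrics, pick up an on-link gateway, and apply the usual defaults and
     * clamps (device MTU, the classic 576 fallback for locked gatewayed
     * routes, sysctl_ip_default_ttl for the hop limit, bounded advmss),
     * plus the CONFIG_NET_CLS_ROUTE class tag and the route type.
     */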
1456static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1457{
1458        struct fib_info *fi = res->fi;
1459
1460        if (fi) {
1461                if (FIB_RES_GW(*res) &&
1462                    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1463                        rt->rt_gateway = FIB_RES_GW(*res);
1464                memcpy(rt->u.dst.metrics, fi->fib_metrics,
1465                       sizeof(rt->u.dst.metrics));
1466                if (fi->fib_mtu == 0) {
1467                        rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1468                        if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1469                            rt->rt_gateway != rt->rt_dst &&
1470                            rt->u.dst.dev->mtu > 576)
1471                                rt->u.dst.metrics[RTAX_MTU-1] = 576;
1472                }
1473#ifdef CONFIG_NET_CLS_ROUTE
1474                rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1475#endif
1476        } else
1477                rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1478
1479        if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1480                rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1481        if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1482                rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1483        if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1484                rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1485                                       ip_rt_min_advmss);
1486        if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1487                rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1488
1489#ifdef CONFIG_NET_CLS_ROUTE
1490#ifdef CONFIG_IP_MULTIPLE_TABLES
1491        set_class_tag(rt, fib_rules_tclass(res));
1492#endif
1493        set_class_tag(rt, itag);
1494#endif
1495        rt->rt_type = res->type;
1496}
1497
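    /*
     * Build a cache entry for received multicast: validate the source,
     * allocate a dst that is delivered locally (and/or handed to ip_mr_input
     * when multicast forwarding is enabled), and insert it into the hash.
     */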
1498static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1499                                u8 tos, struct net_device *dev, int our)
1500{
1501        unsigned hash;
1502        struct rtable *rth;
1503        __be32 spec_dst;
1504        struct in_device *in_dev = in_dev_get(dev);
1505        u32 itag = 0;
1506
1507        /* Primary sanity checks. */
1508
1509        if (in_dev == NULL)
1510                return -EINVAL;
1511
1512        if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1513            skb->protocol != htons(ETH_P_IP))
1514                goto e_inval;
1515
1516        if (ZERONET(saddr)) {
1517                if (!LOCAL_MCAST(daddr))
1518                        goto e_inval;
1519                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1520        } else if (fib_validate_source(saddr, 0, tos, 0,
1521                                        dev, &spec_dst, &itag) < 0)
1522                goto e_inval;
1523
1524        rth = dst_alloc(&ipv4_dst_ops);
1525        if (!rth)
1526                goto e_nobufs;
1527
1528        rth->u.dst.output= ip_rt_bug;
1529
1530        atomic_set(&rth->u.dst.__refcnt, 1);
1531        rth->u.dst.flags= DST_HOST;
1532        if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1533                rth->u.dst.flags |= DST_NOPOLICY;
1534        rth->fl.fl4_dst = daddr;
1535        rth->rt_dst     = daddr;
1536        rth->fl.fl4_tos = tos;
1537        rth->fl.mark    = skb->mark;
1538        rth->fl.fl4_src = saddr;
1539        rth->rt_src     = saddr;
1540#ifdef CONFIG_NET_CLS_ROUTE
1541        rth->u.dst.tclassid = itag;
1542#endif
1543        rth->rt_iif     =
1544        rth->fl.iif     = dev->ifindex;
1545        rth->u.dst.dev  = init_net.loopback_dev;
1546        dev_hold(rth->u.dst.dev);
1547        rth->idev       = in_dev_get(rth->u.dst.dev);
1548        rth->fl.oif     = 0;
1549        rth->rt_gateway = daddr;
1550        rth->rt_spec_dst= spec_dst;
1551        rth->rt_type    = RTN_MULTICAST;
1552        rth->rt_flags   = RTCF_MULTICAST;
1553        if (our) {
1554                rth->u.dst.input= ip_local_deliver;
1555                rth->rt_flags |= RTCF_LOCAL;
1556        }
1557
1558#ifdef CONFIG_IP_MROUTE
1559        if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1560                rth->u.dst.input = ip_mr_input;
1561#endif
1562        RT_CACHE_STAT_INC(in_slow_mc);
1563
1564        in_dev_put(in_dev);
1565        hash = rt_hash(daddr, saddr, dev->ifindex);
1566        return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1567
1568e_nobufs:
1569        in_dev_put(in_dev);
1570        return -ENOBUFS;
1571
1572e_inval:
1573        in_dev_put(in_dev);
1574        return -EINVAL;
1575}
1576
1577
1578static void ip_handle_martian_source(struct net_device *dev,
1579                                     struct in_device *in_dev,
1580                                     struct sk_buff *skb,
1581                                     __be32 daddr,
1582                                     __be32 saddr)
1583{
1584        RT_CACHE_STAT_INC(in_martian_src);
1585#ifdef CONFIG_IP_ROUTE_VERBOSE
1586        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1587                /*
1588                 *      RFC1812 recommendation: if the source is martian,
1589                 *      the only hint is the MAC header.
1590                 */
1591                printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1592                        "%u.%u.%u.%u, on dev %s\n",
1593                        NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1594                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1595                        int i;
1596                        const unsigned char *p = skb_mac_header(skb);
1597                        printk(KERN_WARNING "ll header: ");
1598                        for (i = 0; i < dev->hard_header_len; i++, p++) {
1599                                printk("%02x", *p);
1600                                if (i < (dev->hard_header_len - 1))
1601                                        printk(":");
1602                        }
1603                        printk("\n");
1604                }
1605        }
1606#endif
1607}
1608
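    /*
     * Core of input route creation for forwarded traffic: validate the
     * source address against the FIB, decide whether a redirect should be
     * suggested (shared media or on-link source), then allocate the rtable
     * with dst.input = ip_forward and dst.output = ip_output.
     */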
1609static inline int __mkroute_input(struct sk_buff *skb,
1610                                  struct fib_result* res,
1611                                  struct in_device *in_dev,
1612                                  __be32 daddr, __be32 saddr, u32 tos,
1613                                  struct rtable **result)
1614{
1615
1616        struct rtable *rth;
1617        int err;
1618        struct in_device *out_dev;
1619        unsigned flags = 0;
1620        __be32 spec_dst;
1621        u32 itag;
1622
1623        /* get a working reference to the output device */
1624        out_dev = in_dev_get(FIB_RES_DEV(*res));
1625        if (out_dev == NULL) {
1626                if (net_ratelimit())
1627                        printk(KERN_CRIT "Bug in ip_route_input"
1628                               "_slow(). Please report.\n");
1629                return -EINVAL;
1630        }
1631
1632
1633        err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1634                                  in_dev->dev, &spec_dst, &itag);
1635        if (err < 0) {
1636                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1637                                         saddr);
1638
1639                err = -EINVAL;
1640                goto cleanup;
1641        }
1642
1643        if (err)
1644                flags |= RTCF_DIRECTSRC;
1645
1646        if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1647            (IN_DEV_SHARED_MEDIA(out_dev) ||
1648             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1649                flags |= RTCF_DOREDIRECT;
1650
1651        if (skb->protocol != htons(ETH_P_IP)) {
1652                /* Not IP (i.e. ARP). Do not create a route if it is
1653                 * invalid for proxy ARP. DNAT routes are always valid.
1654                 */
1655                if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1656                        err = -EINVAL;
1657                        goto cleanup;
1658                }
1659        }
1660
1661
1662        rth = dst_alloc(&ipv4_dst_ops);
1663        if (!rth) {
1664                err = -ENOBUFS;
1665                goto cleanup;
1666        }
1667
1668        atomic_set(&rth->u.dst.__refcnt, 1);
1669        rth->u.dst.flags= DST_HOST;
1670        if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1671                rth->u.dst.flags |= DST_NOPOLICY;
1672        if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1673                rth->u.dst.flags |= DST_NOXFRM;
1674        rth->fl.fl4_dst = daddr;
1675        rth->rt_dst     = daddr;
1676        rth->fl.fl4_tos = tos;
1677        rth->fl.mark    = skb->mark;
1678        rth->fl.fl4_src = saddr;
1679        rth->rt_src     = saddr;
1680        rth->rt_gateway = daddr;
1681        rth->rt_iif     =
1682                rth->fl.iif     = in_dev->dev->ifindex;
1683        rth->u.dst.dev  = (out_dev)->dev;
1684        dev_hold(rth->u.dst.dev);
1685        rth->idev       = in_dev_get(rth->u.dst.dev);
1686        rth->fl.oif     = 0;
1687        rth->rt_spec_dst= spec_dst;
1688
1689        rth->u.dst.input = ip_forward;
1690        rth->u.dst.output = ip_output;
1691
1692        rt_set_nexthop(rth, res, itag);
1693
1694        rth->rt_flags = flags;
1695
1696        *result = rth;
1697        err = 0;
1698 cleanup:
1699        /* release the working reference to the output device */
1700        in_dev_put(out_dev);
1701        return err;
1702}
1703
1704static inline int ip_mkroute_input(struct sk_buff *skb,
1705                                   struct fib_result* res,
1706                                   const struct flowi *fl,
1707                                   struct in_device *in_dev,
1708                                   __be32 daddr, __be32 saddr, u32 tos)
1709{
1710        struct rtable* rth = NULL;
1711        int err;
1712        unsigned hash;
1713
1714#ifdef CONFIG_IP_ROUTE_MULTIPATH
1715        if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1716                fib_select_multipath(fl, res);
1717#endif
1718
1719        /* create a routing cache entry */
1720        err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1721        if (err)
1722                return err;
1723
1724        /* put it into the cache */
1725        hash = rt_hash(daddr, saddr, fl->iif);
1726        return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1727}
1728
1729/*
1730 *      NOTE. We drop all packets that have a local source
1731 *      address, because every properly looped-back packet
1732 *      must already have the correct destination attached by the output routine.
1733 *
1734 *      This approach solves two big problems:
1735 *      1. Non-simplex devices are handled properly.
1736 *      2. IP spoofing attempts are filtered with a 100% guarantee.
1737 */
1738
1739static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1740                               u8 tos, struct net_device *dev)
1741{
1742        struct fib_result res;
1743        struct in_device *in_dev = in_dev_get(dev);
1744        struct flowi fl = { .nl_u = { .ip4_u =
1745                                      { .daddr = daddr,
1746                                        .saddr = saddr,
1747                                        .tos = tos,
1748                                        .scope = RT_SCOPE_UNIVERSE,
1749                                      } },
1750                            .mark = skb->mark,
1751                            .iif = dev->ifindex };
1752        unsigned        flags = 0;
1753        u32             itag = 0;
1754        struct rtable * rth;
1755        unsigned        hash;
1756        __be32          spec_dst;
1757        int             err = -EINVAL;
1758        int             free_res = 0;
1759
1760        /* IP on this device is disabled. */
1761
1762        if (!in_dev)
1763                goto out;
1764
1765        /* Check for the weirdest martians, which cannot be detected
1766           by fib_lookup.
1767         */
1768
1769        if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1770                goto martian_source;
1771
1772        if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1773                goto brd_input;
1774
1775        /* Accept zero addresses only for limited broadcast;
1776         * I do not even know whether to fix it or not. Waiting for complaints :-)
1777         */
1778        if (ZERONET(saddr))
1779                goto martian_source;
1780
1781        if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1782                goto martian_destination;
1783
1784        /*
1785         *      Now we are ready to route packet.
1786         */
1787        if ((err = fib_lookup(&fl, &res)) != 0) {
1788                if (!IN_DEV_FORWARD(in_dev))
1789                        goto e_hostunreach;
1790                goto no_route;
1791        }
1792        free_res = 1;
1793
1794        RT_CACHE_STAT_INC(in_slow_tot);
1795
1796        if (res.type == RTN_BROADCAST)
1797                goto brd_input;
1798
1799        if (res.type == RTN_LOCAL) {
1800                int result;
1801                result = fib_validate_source(saddr, daddr, tos,
1802                                             init_net.loopback_dev->ifindex,
1803                                             dev, &spec_dst, &itag);
1804                if (result < 0)
1805                        goto martian_source;
1806                if (result)
1807                        flags |= RTCF_DIRECTSRC;
1808                spec_dst = daddr;
1809                goto local_input;
1810        }
1811
1812        if (!IN_DEV_FORWARD(in_dev))
1813                goto e_hostunreach;
1814        if (res.type != RTN_UNICAST)
1815                goto martian_destination;
1816
1817        err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1818done:
1819        in_dev_put(in_dev);
1820        if (free_res)
1821                fib_res_put(&res);
1822out:    return err;
1823
1824brd_input:
1825        if (skb->protocol != htons(ETH_P_IP))
1826                goto e_inval;
1827
1828        if (ZERONET(saddr))
1829                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1830        else {
1831                err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1832                                          &itag);
1833                if (err < 0)
1834                        goto martian_source;
1835                if (err)
1836                        flags |= RTCF_DIRECTSRC;
1837        }
1838        flags |= RTCF_BROADCAST;
1839        res.type = RTN_BROADCAST;
1840        RT_CACHE_STAT_INC(in_brd);
1841
1842local_input:
1843        rth = dst_alloc(&ipv4_dst_ops);
1844        if (!rth)
1845                goto e_nobufs;
1846
1847        rth->u.dst.output= ip_rt_bug;
1848
1849        atomic_set(&rth->u.dst.__refcnt, 1);
1850        rth->u.dst.flags= DST_HOST;
1851        if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1852                rth->u.dst.flags |= DST_NOPOLICY;
1853        rth->fl.fl4_dst = daddr;
1854        rth->rt_dst     = daddr;
1855        rth->fl.fl4_tos = tos;
1856        rth->fl.mark    = skb->mark;
1857        rth->fl.fl4_src = saddr;
1858        rth->rt_src     = saddr;
1859#ifdef CONFIG_NET_CLS_ROUTE
1860        rth->u.dst.tclassid = itag;
1861#endif
1862        rth->rt_iif     =
1863        rth->fl.iif     = dev->ifindex;
1864        rth->u.dst.dev  = init_net.loopback_dev;
1865        dev_hold(rth->u.dst.dev);
1866        rth->idev       = in_dev_get(rth->u.dst.dev);
1867        rth->rt_gateway = daddr;
1868        rth->rt_spec_dst= spec_dst;
1869        rth->u.dst.input= ip_local_deliver;
1870        rth->rt_flags   = flags|RTCF_LOCAL;
1871        if (res.type == RTN_UNREACHABLE) {
1872                rth->u.dst.input= ip_error;
1873                rth->u.dst.error= -err;
1874                rth->rt_flags   &= ~RTCF_LOCAL;
1875        }
1876        rth->rt_type    = res.type;
1877        hash = rt_hash(daddr, saddr, fl.iif);
1878        err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1879        goto done;
1880
1881no_route:
1882        RT_CACHE_STAT_INC(in_no_route);
1883        spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1884        res.type = RTN_UNREACHABLE;
1885        if (err == -ESRCH)
1886                err = -ENETUNREACH;
1887        goto local_input;
1888
1889        /*
1890         *      Do not cache martian addresses: they should be logged (RFC1812)
1891         */
1892martian_destination:
1893        RT_CACHE_STAT_INC(in_martian_dst);
1894#ifdef CONFIG_IP_ROUTE_VERBOSE
1895        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1896                printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1897                        "%u.%u.%u.%u, dev %s\n",
1898                        NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1899#endif
1900
1901e_hostunreach:
1902        err = -EHOSTUNREACH;
1903        goto done;
1904
1905e_inval:
1906        err = -EINVAL;
1907        goto done;
1908
1909e_nobufs:
1910        err = -ENOBUFS;
1911        goto done;
1912
1913martian_source:
1914        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1915        goto e_inval;
1916}
1917
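    /*
     * Input routing fast path: look the (daddr, saddr, iif, tos, mark) tuple
     * up in the route cache under RCU.  On a miss, multicast destinations
     * are screened here (see the comment below) and everything else falls
     * through to ip_route_input_slow().
     */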
1918int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1919                   u8 tos, struct net_device *dev)
1920{
1921        struct rtable * rth;
1922        unsigned        hash;
1923        int iif = dev->ifindex;
1924
1925        tos &= IPTOS_RT_MASK;
1926        hash = rt_hash(daddr, saddr, iif);
1927
1928        rcu_read_lock();
1929        for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1930             rth = rcu_dereference(rth->u.dst.rt_next)) {
1931                if (rth->fl.fl4_dst == daddr &&
1932                    rth->fl.fl4_src == saddr &&
1933                    rth->fl.iif == iif &&
1934                    rth->fl.oif == 0 &&
1935                    rth->fl.mark == skb->mark &&
1936                    rth->fl.fl4_tos == tos) {
1937                        dst_use(&rth->u.dst, jiffies);
1938                        RT_CACHE_STAT_INC(in_hit);
1939                        rcu_read_unlock();
1940                        skb->dst = (struct dst_entry*)rth;
1941                        return 0;
1942                }
1943                RT_CACHE_STAT_INC(in_hlist_search);
1944        }
1945        rcu_read_unlock();
1946
1947        /* Multicast recognition logic is moved from the route cache to here.
1948           The problem was that too many Ethernet cards have broken/missing
1949           hardware multicast filters :-( As a result, a host on a multicast
1950           network acquires a lot of useless route cache entries, e.g. for
1951           SDR messages from all over the world. Now we try to get rid of them.
1952           Really, provided the software IP multicast filter is organized
1953           reasonably (at least, hashed), it does not result in a slowdown
1954           compared with route cache reject entries.
1955           Note that multicast routers are not affected, because a
1956           route cache entry is created eventually.
1957         */
1958        if (MULTICAST(daddr)) {
1959                struct in_device *in_dev;
1960
1961                rcu_read_lock();
1962                if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
1963                        int our = ip_check_mc(in_dev, daddr, saddr,
1964                                ip_hdr(skb)->protocol);
1965                        if (our
1966#ifdef CONFIG_IP_MROUTE
1967                            || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1968#endif
1969                            ) {
1970                                rcu_read_unlock();
1971                                return ip_route_input_mc(skb, daddr, saddr,
1972                                                         tos, dev, our);
1973                        }
1974                }
1975                rcu_read_unlock();
1976                return -EINVAL;
1977        }
1978        return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1979}
1980
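    /*
     * Output-side counterpart of __mkroute_input(): classify the destination
     * (broadcast, multicast or unicast), allocate the rtable and wire up the
     * handlers - ip_output normally, ip_local_deliver for local delivery,
     * ip_mc_output/ip_mr_input for multicast.
     */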
1981static inline int __mkroute_output(struct rtable **result,
1982                                   struct fib_result* res,
1983                                   const struct flowi *fl,
1984                                   const struct flowi *oldflp,
1985                                   struct net_device *dev_out,
1986                                   unsigned flags)
1987{
1988        struct rtable *rth;
1989        struct in_device *in_dev;
1990        u32 tos = RT_FL_TOS(oldflp);
1991        int err = 0;
1992
1993        if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
1994                return -EINVAL;
1995
1996        if (fl->fl4_dst == htonl(0xFFFFFFFF))
1997                res->type = RTN_BROADCAST;
1998        else if (MULTICAST(fl->fl4_dst))
1999                res->type = RTN_MULTICAST;
2000        else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2001                return -EINVAL;
2002
2003        if (dev_out->flags & IFF_LOOPBACK)
2004                flags |= RTCF_LOCAL;
2005
2006        /* get a working reference to the inet device */
2007        in_dev = in_dev_get(dev_out);
2008        if (!in_dev)
2009                return -EINVAL;
2010
2011        if (res->type == RTN_BROADCAST) {
2012                flags |= RTCF_BROADCAST | RTCF_LOCAL;
2013                if (res->fi) {
2014                        fib_info_put(res->fi);
2015                        res->fi = NULL;
2016                }
2017        } else if (res->type == RTN_MULTICAST) {
2018                flags |= RTCF_MULTICAST|RTCF_LOCAL;
2019                if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2020                                 oldflp->proto))
2021                        flags &= ~RTCF_LOCAL;
2022                /* If a multicast route does not exist, use
2023                   the default one, but do not gateway in this case.
2024                   Yes, it is a hack.
2025                 */
2026                if (res->fi && res->prefixlen < 4) {
2027                        fib_info_put(res->fi);
2028                        res->fi = NULL;
2029                }
2030        }
2031
2032
2033        rth = dst_alloc(&ipv4_dst_ops);
2034        if (!rth) {
2035                err = -ENOBUFS;
2036                goto cleanup;
2037        }
2038
2039        atomic_set(&rth->u.dst.__refcnt, 1);
2040        rth->u.dst.flags= DST_HOST;
2041        if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2042                rth->u.dst.flags |= DST_NOXFRM;
2043        if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2044                rth->u.dst.flags |= DST_NOPOLICY;
2045
2046        rth->fl.fl4_dst = oldflp->fl4_dst;
2047        rth->fl.fl4_tos = tos;
2048        rth->fl.fl4_src = oldflp->fl4_src;
2049        rth->fl.oif     = oldflp->oif;
2050        rth->fl.mark    = oldflp->mark;
2051        rth->rt_dst     = fl->fl4_dst;
2052        rth->rt_src     = fl->fl4_src;
2053        rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2054        /* get references to the devices that are to be held by the routing
2055           cache entry */
2056        rth->u.dst.dev  = dev_out;
2057        dev_hold(dev_out);
2058        rth->idev       = in_dev_get(dev_out);
2059        rth->rt_gateway = fl->fl4_dst;
2060        rth->rt_spec_dst= fl->fl4_src;
2061
2062        rth->u.dst.output=ip_output;
2063
2064        RT_CACHE_STAT_INC(out_slow_tot);
2065
2066        if (flags & RTCF_LOCAL) {
2067                rth->u.dst.input = ip_local_deliver;
2068                rth->rt_spec_dst = fl->fl4_dst;
2069        }
2070        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2071                rth->rt_spec_dst = fl->fl4_src;
2072                if (flags & RTCF_LOCAL &&
2073                    !(dev_out->flags & IFF_LOOPBACK)) {
2074                        rth->u.dst.output = ip_mc_output;
2075                        RT_CACHE_STAT_INC(out_slow_mc);
2076                }
2077#ifdef CONFIG_IP_MROUTE
2078                if (res->type == RTN_MULTICAST) {
2079                        if (IN_DEV_MFORWARD(in_dev) &&
2080                            !LOCAL_MCAST(oldflp->fl4_dst)) {
2081                                rth->u.dst.input = ip_mr_input;
2082                                rth->u.dst.output = ip_mc_output;
2083                        }
2084                }
2085#endif
2086        }
2087
2088        rt_set_nexthop(rth, res, 0);
2089
2090        rth->rt_flags = flags;
2091
2092        *result = rth;
2093 cleanup:
2094        /* release the working reference to the inet device */
2095        in_dev_put(in_dev);
2096
2097        return err;
2098}
2099
2100static inline int ip_mkroute_output(struct rtable **rp,
2101                                    struct fib_result* res,
2102                                    const struct flowi *fl,
2103                                    const struct flowi *oldflp,
2104                                    struct net_device *dev_out,
2105                                    unsigned flags)
2106{
2107        struct rtable *rth = NULL;
2108        int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2109        unsigned hash;
2110        if (err == 0) {
2111                hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2112                err = rt_intern_hash(hash, rth, rp);
2113        }
2114
2115        return err;
2116}
2117
2118/*
2119 * Major route resolver routine.
2120 */
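    /*
     * Roughly: validate any requested source address, resolve the output
     * device from oif, short-circuit to the loopback device for local or
     * unspecified destinations, consult the FIB (handling multipath and
     * default-route selection), and finally build and hash the cache entry
     * in ip_mkroute_output().
     */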
2121
2122static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2123{
2124        u32 tos = RT_FL_TOS(oldflp);
2125        struct flowi fl = { .nl_u = { .ip4_u =
2126                                      { .daddr = oldflp->fl4_dst,
2127                                        .saddr = oldflp->fl4_src,
2128                                        .tos = tos & IPTOS_RT_MASK,
2129                                        .scope = ((tos & RTO_ONLINK) ?
2130                                                  RT_SCOPE_LINK :
2131                                                  RT_SCOPE_UNIVERSE),
2132                                      } },
2133                            .mark = oldflp->mark,
2134                            .iif = init_net.loopback_dev->ifindex,
2135                            .oif = oldflp->oif };
2136        struct fib_result res;
2137        unsigned flags = 0;
2138        struct net_device *dev_out = NULL;
2139        int free_res = 0;
2140        int err;
2141
2142
2143        res.fi          = NULL;
2144#ifdef CONFIG_IP_MULTIPLE_TABLES
2145        res.r           = NULL;
2146#endif
2147
2148        if (oldflp->fl4_src) {
2149                err = -EINVAL;
2150                if (MULTICAST(oldflp->fl4_src) ||
2151                    BADCLASS(oldflp->fl4_src) ||
2152                    ZERONET(oldflp->fl4_src))
2153                        goto out;
2154
2155                /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2156                dev_out = ip_dev_find(oldflp->fl4_src);
2157                if (dev_out == NULL)
2158                        goto out;
2159
2160                /* I removed check for oif == dev_out->oif here.
2161                   It was wrong for two reasons:
2162                   1. ip_dev_find(saddr) can return the wrong iface, if saddr is
2163                      assigned to multiple interfaces.
2164                   2. Moreover, we are allowed to send packets with saddr
2165                      of another iface. --ANK
2166                 */
2167
2168                if (oldflp->oif == 0
2169                    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2170                        /* Special hack: the user can direct multicasts
2171                           and limited broadcast via the necessary interface
2172                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2173                           This hack is not just for fun, it allows
2174                           vic, vat and friends to work.
2175                           They bind a socket to loopback, set ttl to zero
2176                           and expect that it will work.
2177                           From the viewpoint of the routing cache they are broken,
2178                           because we are not allowed to build a multicast path
2179                           with a loopback source addr (look, the routing cache
2180                           cannot know that ttl is zero, so that the packet
2181                           will not leave this host and the route is valid).
2182                           Luckily, this hack is a good workaround.
2183                         */
2184
2185                        fl.oif = dev_out->ifindex;
2186                        goto make_route;
2187                }
2188                if (dev_out)
2189                        dev_put(dev_out);
2190                dev_out = NULL;
2191        }
2192
2193
2194        if (oldflp->oif) {
2195                dev_out = dev_get_by_index(&init_net, oldflp->oif);
2196                err = -ENODEV;
2197                if (dev_out == NULL)
2198                        goto out;
2199
2200                /* RACE: Check return value of inet_select_addr instead. */
2201                if (__in_dev_get_rtnl(dev_out) == NULL) {
2202                        dev_put(dev_out);
2203                        goto out;       /* Wrong error code */
2204                }
2205
2206                if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2207                        if (!fl.fl4_src)
2208                                fl.fl4_src = inet_select_addr(dev_out, 0,
2209                                                              RT_SCOPE_LINK);
2210                        goto make_route;
2211                }
2212                if (!fl.fl4_src) {
2213                        if (MULTICAST(oldflp->fl4_dst))
2214                                fl.fl4_src = inet_select_addr(dev_out, 0,
2215                                                              fl.fl4_scope);
2216                        else if (!oldflp->fl4_dst)
2217                                fl.fl4_src = inet_select_addr(dev_out, 0,
2218                                                              RT_SCOPE_HOST);
2219                }
2220        }
2221
2222        if (!fl.fl4_dst) {
2223                fl.fl4_dst = fl.fl4_src;
2224                if (!fl.fl4_dst)
2225                        fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2226                if (dev_out)
2227                        dev_put(dev_out);
2228                dev_out = init_net.loopback_dev;
2229                dev_hold(dev_out);
2230                fl.oif = init_net.loopback_dev->ifindex;
2231                res.type = RTN_LOCAL;
2232                flags |= RTCF_LOCAL;
2233                goto make_route;
2234        }
2235
2236        if (fib_lookup(&fl, &res)) {
2237                res.fi = NULL;
2238                if (oldflp->oif) {
2239                        /* Apparently, the routing tables are wrong. Assume
2240                           that the destination is on-link.
2241
2242                           WHY? DW.
2243                           Because we are allowed to send to an iface
2244                           even if it has NO routes and NO assigned
2245                           addresses. When oif is specified, the routing
2246                           tables are looked up with only one purpose:
2247                           to catch whether the destination is gatewayed, rather than
2248                           direct. Moreover, if MSG_DONTROUTE is set,
2249                           we send the packet, ignoring both the routing tables
2250                           and the ifaddr state. --ANK
2251
2252
2253                           We could do it even if oif is unknown,
2254                           likely IPv6, but we do not.
2255                         */
2256
2257                        if (fl.fl4_src == 0)
2258                                fl.fl4_src = inet_select_addr(dev_out, 0,
2259                                                              RT_SCOPE_LINK);
2260                        res.type = RTN_UNICAST;
2261                        goto make_route;
2262                }
2263                if (dev_out)
2264                        dev_put(dev_out);
2265                err = -ENETUNREACH;
2266                goto out;
2267        }
2268        free_res = 1;
2269
2270        if (res.type == RTN_LOCAL) {
2271                if (!fl.fl4_src)
2272                        fl.fl4_src = fl.fl4_dst;
2273                if (dev_out)
2274                        dev_put(dev_out);
2275                dev_out = init_net.loopback_dev;
2276                dev_hold(dev_out);
2277                fl.oif = dev_out->ifindex;
2278                if (res.fi)
2279                        fib_info_put(res.fi);
2280                res.fi = NULL;
2281                flags |= RTCF_LOCAL;
2282                goto make_route;
2283        }
2284
2285#ifdef CONFIG_IP_ROUTE_MULTIPATH
2286        if (res.fi->fib_nhs > 1 && fl.oif == 0)
2287                fib_select_multipath(&fl, &res);
2288        else
2289#endif
2290        if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2291                fib_select_default(&fl, &res);
2292
2293        if (!fl.fl4_src)
2294                fl.fl4_src = FIB_RES_PREFSRC(res);
2295
2296        if (dev_out)
2297                dev_put(dev_out);
2298        dev_out = FIB_RES_DEV(res);
2299        dev_hold(dev_out);
2300        fl.oif = dev_out->ifindex;
2301
2302
2303make_route:
2304        err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2305
2306
2307        if (free_res)
2308                fib_res_put(&res);
2309        if (dev_out)
2310                dev_put(dev_out);
2311out:    return err;
2312}
2313
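    /*
     * Output routing fast path: scan the cache bucket for a matching
     * (daddr, saddr, oif, mark, tos) key under rcu_read_lock_bh(); fall
     * back to ip_route_output_slow() on a miss.
     */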
2314int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2315{
2316        unsigned hash;
2317        struct rtable *rth;
2318
2319        hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2320
2321        rcu_read_lock_bh();
2322        for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2323                rth = rcu_dereference(rth->u.dst.rt_next)) {
2324                if (rth->fl.fl4_dst == flp->fl4_dst &&
2325                    rth->fl.fl4_src == flp->fl4_src &&
2326                    rth->fl.iif == 0 &&
2327                    rth->fl.oif == flp->oif &&
2328                    rth->fl.mark == flp->mark &&
2329                    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2330                            (IPTOS_RT_MASK | RTO_ONLINK))) {
2331                        dst_use(&rth->u.dst, jiffies);
2332                        RT_CACHE_STAT_INC(out_hit);
2333                        rcu_read_unlock_bh();
2334                        *rp = rth;
2335                        return 0;
2336                }
2337                RT_CACHE_STAT_INC(out_hlist_search);
2338        }
2339        rcu_read_unlock_bh();
2340
2341        return ip_route_output_slow(rp, flp);
2342}
2343
2344EXPORT_SYMBOL_GPL(__ip_route_output_key);
2345
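    /*
     * "Blackhole" routes: when __xfrm_lookup() returns -EREMOTE (typically
     * while an IPsec state is still being negotiated), ip_route_output_flow()
     * below replaces the resolved route with a copy whose input and output
     * handlers simply free the skb, so packets are silently discarded rather
     * than sent out.
     */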
2346static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2347{
2348}
2349
2350static struct dst_ops ipv4_dst_blackhole_ops = {
2351        .family                 =       AF_INET,
2352        .protocol               =       __constant_htons(ETH_P_IP),
2353        .destroy                =       ipv4_dst_destroy,
2354        .check                  =       ipv4_dst_check,
2355        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2356        .entry_size             =       sizeof(struct rtable),
2357};
2358
2359
2360static int ipv4_blackhole_output(struct sk_buff *skb)
2361{
2362        kfree_skb(skb);
2363        return 0;
2364}
2365
2366static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2367{
2368        struct rtable *ort = *rp;
2369        struct rtable *rt = (struct rtable *)
2370                dst_alloc(&ipv4_dst_blackhole_ops);
2371
2372        if (rt) {
2373                struct dst_entry *new = &rt->u.dst;
2374
2375                atomic_set(&new->__refcnt, 1);
2376                new->__use = 1;
2377                new->input = ipv4_blackhole_output;
2378                new->output = ipv4_blackhole_output;
2379                memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2380
2381                new->dev = ort->u.dst.dev;
2382                if (new->dev)
2383                        dev_hold(new->dev);
2384
2385                rt->fl = ort->fl;
2386
2387                rt->idev = ort->idev;
2388                if (rt->idev)
2389                        in_dev_hold(rt->idev);
2390                rt->rt_flags = ort->rt_flags;
2391                rt->rt_type = ort->rt_type;
2392                rt->rt_dst = ort->rt_dst;
2393                rt->rt_src = ort->rt_src;
2394                rt->rt_iif = ort->rt_iif;
2395                rt->rt_gateway = ort->rt_gateway;
2396                rt->rt_spec_dst = ort->rt_spec_dst;
2397                rt->peer = ort->peer;
2398                if (rt->peer)
2399                        atomic_inc(&rt->peer->refcnt);
2400
2401                dst_free(new);
2402        }
2403
2404        dst_release(&(*rp)->u.dst);
2405        *rp = rt;
2406        return (rt ? 0 : -ENOMEM);
2407}
2408
2409int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2410{
2411        int err;
2412
2413        if ((err = __ip_route_output_key(rp, flp)) != 0)
2414                return err;
2415
2416        if (flp->proto) {
2417                if (!flp->fl4_src)
2418                        flp->fl4_src = (*rp)->rt_src;
2419                if (!flp->fl4_dst)
2420                        flp->fl4_dst = (*rp)->rt_dst;
2421                err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2422                if (err == -EREMOTE)
2423                        err = ipv4_dst_blackhole(rp, flp, sk);
2424
2425                return err;
2426        }
2427
2428        return 0;
2429}
2430
2431EXPORT_SYMBOL_GPL(ip_route_output_flow);
2432
2433int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2434{
2435        return ip_route_output_flow(rp, flp, NULL, 0);
2436}
2437
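    /*
     * Serialize one cached route into an RTM_NEWROUTE netlink message:
     * the rtmsg header plus RTA_DST/RTA_SRC/RTA_OIF/RTA_GATEWAY, metrics,
     * and the cache information (id, timestamps, expiry, error).
     */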
2438static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2439                        int nowait, unsigned int flags)
2440{
2441        struct rtable *rt = (struct rtable*)skb->dst;
2442        struct rtmsg *r;
2443        struct nlmsghdr *nlh;
2444        long expires;
2445        u32 id = 0, ts = 0, tsage = 0, error;
2446
2447        nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2448        if (nlh == NULL)
2449                return -EMSGSIZE;
2450
2451        r = nlmsg_data(nlh);
2452        r->rtm_family    = AF_INET;
2453        r->rtm_dst_len  = 32;
2454        r->rtm_src_len  = 0;
2455        r->rtm_tos      = rt->fl.fl4_tos;
2456        r->rtm_table    = RT_TABLE_MAIN;
2457        NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2458        r->rtm_type     = rt->rt_type;
2459        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2460        r->rtm_protocol = RTPROT_UNSPEC;
2461        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2462        if (rt->rt_flags & RTCF_NOTIFY)
2463                r->rtm_flags |= RTM_F_NOTIFY;
2464
2465        NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2466
2467        if (rt->fl.fl4_src) {
2468                r->rtm_src_len = 32;
2469                NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2470        }
2471        if (rt->u.dst.dev)
2472                NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2473#ifdef CONFIG_NET_CLS_ROUTE
2474        if (rt->u.dst.tclassid)
2475                NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2476#endif
2477        if (rt->fl.iif)
2478                NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2479        else if (rt->rt_src != rt->fl.fl4_src)
2480                NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2481
2482        if (rt->rt_dst != rt->rt_gateway)
2483                NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2484
2485        if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2486                goto nla_put_failure;
2487
2488        error = rt->u.dst.error;
2489        expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2490        if (rt->peer) {
2491                id = rt->peer->ip_id_count;
2492                if (rt->peer->tcp_ts_stamp) {
2493                        ts = rt->peer->tcp_ts;
2494                        tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2495                }
2496        }
2497
2498        if (rt->fl.iif) {
2499#ifdef CONFIG_IP_MROUTE
2500                __be32 dst = rt->rt_dst;
2501
2502                if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2503                    IPV4_DEVCONF_ALL(MC_FORWARDING)) {
2504                        int err = ipmr_get_route(skb, r, nowait);
2505                        if (err <= 0) {
2506                                if (!nowait) {
2507                                        if (err == 0)
2508                                                return 0;
2509                                        goto nla_put_failure;
2510                                } else {
2511                                        if (err == -EMSGSIZE)
2512                                                goto nla_put_failure;
2513                                        error = err;
2514                                }
2515                        }
2516                } else
2517#endif
2518                        NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2519        }
2520
2521        if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2522                               expires, error) < 0)
2523                goto nla_put_failure;
2524
2525        return nlmsg_end(skb, nlh);
2526
2527nla_put_failure:
2528        nlmsg_cancel(skb, nlh);
2529        return -EMSGSIZE;
2530}
2531
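    /*
     * RTM_GETROUTE handler: build a dummy skb, resolve the route either via
     * ip_route_input() (when RTA_IIF is given) or ip_route_output_key(), and
     * unicast the result back to the requester through rt_fill_info().
     */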
2532static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2533{
2534        struct rtmsg *rtm;
2535        struct nlattr *tb[RTA_MAX+1];
2536        struct rtable *rt = NULL;
2537        __be32 dst = 0;
2538        __be32 src = 0;
2539        u32 iif;
2540        int err;
2541        struct sk_buff *skb;
2542
2543        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2544        if (err < 0)
2545                goto errout;
2546
2547        rtm = nlmsg_data(nlh);
2548
2549        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2550        if (skb == NULL) {
2551                err = -ENOBUFS;
2552                goto errout;
2553        }
2554
2555        /* Reserve room for dummy headers; this skb can pass
2556           through a good chunk of the routing engine.
2557         */
2558        skb_reset_mac_header(skb);
2559        skb_reset_network_header(skb);
2560
2561        /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2562        ip_hdr(skb)->protocol = IPPROTO_ICMP;
2563        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2564
2565        src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2566        dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2567        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2568
2569        if (iif) {
2570                struct net_device *dev;
2571
2572                dev = __dev_get_by_index(&init_net, iif);
2573                if (dev == NULL) {
2574                        err = -ENODEV;
2575                        goto errout_free;
2576                }
2577
2578                skb->protocol   = htons(ETH_P_IP);
2579                skb->dev        = dev;
2580                local_bh_disable();
2581                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2582                local_bh_enable();
2583
2584                rt = (struct rtable*) skb->dst;
2585                if (err == 0 && rt->u.dst.error)
2586                        err = -rt->u.dst.error;
2587        } else {
2588                struct flowi fl = {
2589                        .nl_u = {
2590                                .ip4_u = {
2591                                        .daddr = dst,
2592                                        .saddr = src,
2593                                        .tos = rtm->rtm_tos,
2594                                },
2595                        },
2596                        .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2597                };
2598                err = ip_route_output_key(&rt, &fl);
2599        }
2600
2601        if (err)
2602                goto errout_free;
2603
2604        skb->dst = &rt->u.dst;
2605        if (rtm->rtm_flags & RTM_F_NOTIFY)
2606                rt->rt_flags |= RTCF_NOTIFY;
2607
2608        err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2609                                RTM_NEWROUTE, 0, 0);
2610        if (err <= 0)
2611                goto errout_free;
2612
2613        err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2614errout:
2615        return err;
2616
2617errout_free:
2618        kfree_skb(skb);
2619        goto errout;
2620}
2621
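    /*
     * Netlink dump of the whole route cache: walk every hash bucket under
     * rcu_read_lock_bh(), resuming from cb->args[] across callbacks.
     */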
2622int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2623{
2624        struct rtable *rt;
2625        int h, s_h;
2626        int idx, s_idx;
2627
2628        s_h = cb->args[0];
2629        if (s_h < 0)
2630                s_h = 0;
2631        s_idx = idx = cb->args[1];
2632        for (h = s_h; h <= rt_hash_mask; h++) {
2633                rcu_read_lock_bh();
2634                for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2635                     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2636                        if (idx < s_idx)
2637                                continue;
2638                        skb->dst = dst_clone(&rt->u.dst);
2639                        if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2640                                         cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2641                                         1, NLM_F_MULTI) <= 0) {
2642                                dst_release(xchg(&skb->dst, NULL));
2643                                rcu_read_unlock_bh();
2644                                goto done;
2645                        }
2646                        dst_release(xchg(&skb->dst, NULL));
2647                }
2648                rcu_read_unlock_bh();
2649                s_idx = 0;
2650        }
2651
2652done:
2653        cb->args[0] = h;
2654        cb->args[1] = idx;
2655        return skb->len;
2656}
2657
2658void ip_rt_multicast_event(struct in_device *in_dev)
2659{
2660        rt_cache_flush(0);
2661}
2662
2663#ifdef CONFIG_SYSCTL
2664static int flush_delay;
2665
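    /*
     * The "flush" sysctl is write-only (mode 0200): writing a delay value
     * triggers rt_cache_flush().  For example (assuming the usual sysctl
     * mount point):
     *
     *      echo 0 > /proc/sys/net/ipv4/route/flush
     *
     * flushes the routing cache right away.
     */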
2666static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2667                                        struct file *filp, void __user *buffer,
2668                                        size_t *lenp, loff_t *ppos)
2669{
2670        if (write) {
2671                proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2672                rt_cache_flush(flush_delay);
2673                return 0;
2674        }
2675
2676        return -EINVAL;
2677}
2678
2679static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2680                                                int __user *name,
2681                                                int nlen,
2682                                                void __user *oldval,
2683                                                size_t __user *oldlenp,
2684                                                void __user *newval,
2685                                                size_t newlen)
2686{
2687        int delay;
2688        if (newlen != sizeof(int))
2689                return -EINVAL;
2690        if (get_user(delay, (int __user *)newval))
2691                return -EFAULT;
2692        rt_cache_flush(delay);
2693        return 0;
2694}
2695
2696ctl_table ipv4_route_table[] = {
2697        {
2698                .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2699                .procname       = "flush",
2700                .data           = &flush_delay,
2701                .maxlen         = sizeof(int),
2702                .mode           = 0200,
2703                .proc_handler   = &ipv4_sysctl_rtcache_flush,
2704                .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2705        },
2706        {
2707                .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2708                .procname       = "min_delay",
2709                .data           = &ip_rt_min_delay,
2710                .maxlen         = sizeof(int),
2711                .mode           = 0644,
2712                .proc_handler   = &proc_dointvec_jiffies,
2713                .strategy       = &sysctl_jiffies,
2714        },
2715        {
2716                .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2717                .procname       = "max_delay",
2718                .data           = &ip_rt_max_delay,
2719                .maxlen         = sizeof(int),
2720                .mode           = 0644,
2721                .proc_handler   = &proc_dointvec_jiffies,
2722                .strategy       = &sysctl_jiffies,
2723        },
2724        {
2725                .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2726                .procname       = "gc_thresh",
2727                .data           = &ipv4_dst_ops.gc_thresh,
2728                .maxlen         = sizeof(int),
2729                .mode           = 0644,
2730                .proc_handler   = &proc_dointvec,
2731        },
2732        {
2733                .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2734                .procname       = "max_size",
2735                .data           = &ip_rt_max_size,
2736                .maxlen         = sizeof(int),
2737                .mode           = 0644,
2738                .proc_handler   = &proc_dointvec,
2739        },
2740        {
2741                /*  Deprecated. Use gc_min_interval_ms */
2742
2743                .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2744                .procname       = "gc_min_interval",
2745                .data           = &ip_rt_gc_min_interval,
2746                .maxlen         = sizeof(int),
2747                .mode           = 0644,
2748                .proc_handler   = &proc_dointvec_jiffies,
2749                .strategy       = &sysctl_jiffies,
2750        },
2751        {
2752                .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2753                .procname       = "gc_min_interval_ms",
2754                .data           = &ip_rt_gc_min_interval,
2755                .maxlen         = sizeof(int),
2756                .mode           = 0644,
2757                .proc_handler   = &proc_dointvec_ms_jiffies,
2758                .strategy       = &sysctl_ms_jiffies,
2759        },
2760        {
2761                .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2762                .procname       = "gc_timeout",
2763                .data           = &ip_rt_gc_timeout,
2764                .maxlen         = sizeof(int),
2765                .mode           = 0644,
2766                .proc_handler   = &proc_dointvec_jiffies,
2767                .strategy       = &sysctl_jiffies,
2768        },
2769        {
2770                .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2771                .procname       = "gc_interval",
2772                .data           = &ip_rt_gc_interval,
2773                .maxlen         = sizeof(int),
2774                .mode           = 0644,
2775                .proc_handler   = &proc_dointvec_jiffies,
2776                .strategy       = &sysctl_jiffies,
2777        },
2778        {
2779                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2780                .procname       = "redirect_load",
2781                .data           = &ip_rt_redirect_load,
2782                .maxlen         = sizeof(int),
2783                .mode           = 0644,
2784                .proc_handler   = &proc_dointvec,
2785        },
2786        {
2787                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2788                .procname       = "redirect_number",
2789                .data           = &ip_rt_redirect_number,
2790                .maxlen         = sizeof(int),
2791                .mode           = 0644,
2792                .proc_handler   = &proc_dointvec,
2793        },
2794        {
2795                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2796                .procname       = "redirect_silence",
2797                .data           = &ip_rt_redirect_silence,
2798                .maxlen         = sizeof(int),
2799                .mode           = 0644,
2800                .proc_handler   = &proc_dointvec,
2801        },
2802        {
2803                .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2804                .procname       = "error_cost",
2805                .data           = &ip_rt_error_cost,
2806                .maxlen         = sizeof(int),
2807                .mode           = 0644,
2808                .proc_handler   = &proc_dointvec,
2809        },
2810        {
2811                .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2812                .procname       = "error_burst",
2813                .data           = &ip_rt_error_burst,
2814                .maxlen         = sizeof(int),
2815                .mode           = 0644,
2816                .proc_handler   = &proc_dointvec,
2817        },
2818        {
2819                .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2820                .procname       = "gc_elasticity",
2821                .data           = &ip_rt_gc_elasticity,
2822                .maxlen         = sizeof(int),
2823                .mode           = 0644,
2824                .proc_handler   = &proc_dointvec,
2825        },
2826        {
2827                .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2828                .procname       = "mtu_expires",
2829                .data           = &ip_rt_mtu_expires,
2830                .maxlen         = sizeof(int),
2831                .mode           = 0644,
2832                .proc_handler   = &proc_dointvec_jiffies,
2833                .strategy       = &sysctl_jiffies,
2834        },
2835        {
2836                .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2837                .procname       = "min_pmtu",
2838                .data           = &ip_rt_min_pmtu,
2839                .maxlen         = sizeof(int),
2840                .mode           = 0644,
2841                .proc_handler   = &proc_dointvec,
2842        },
2843        {
2844                .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2845                .procname       = "min_adv_mss",
2846                .data           = &ip_rt_min_advmss,
2847                .maxlen         = sizeof(int),
2848                .mode           = 0644,
2849                .proc_handler   = &proc_dointvec,
2850        },
2851        {
2852                .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2853                .procname       = "secret_interval",
2854                .data           = &ip_rt_secret_interval,
2855                .maxlen         = sizeof(int),
2856                .mode           = 0644,
2857                .proc_handler   = &proc_dointvec_jiffies,
2858                .strategy       = &sysctl_jiffies,
2859        },
2860        { .ctl_name = 0 }
2861};
2862#endif
2863
2864#ifdef CONFIG_NET_CLS_ROUTE
2865struct ip_rt_acct *ip_rt_acct;
2866
2867/* This code sucks.  But you should have seen it before! --RR */
2868
2869/* IP route accounting ptr for this logical cpu number. */
2870#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
2871
2872#ifdef CONFIG_PROC_FS
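    /*
     * /proc read handler for the route accounting data: the per-CPU
     * counters selected by IP_RT_ACCT_CPU() are summed into the caller's
     * buffer.  Offset and length must be 32-bit aligned.
     */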
2873static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2874                           int length, int *eof, void *data)
2875{
2876        unsigned int i;
2877
2878        if ((offset & 3) || (length & 3))
2879                return -EIO;
2880
2881        if (offset >= sizeof(struct ip_rt_acct) * 256) {
2882                *eof = 1;
2883                return 0;
2884        }
2885
2886        if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2887                length = sizeof(struct ip_rt_acct) * 256 - offset;
2888                *eof = 1;
2889        }
2890
2891        offset /= sizeof(u32);
2892
2893        if (length > 0) {
2894                u32 *dst = (u32 *) buffer;
2895
2896                *start = buffer;
2897                memset(dst, 0, length);
2898
2899                for_each_possible_cpu(i) {
2900                        unsigned int j;
2901                        u32 *src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2902
2903                        for (j = 0; j < length/4; j++)
2904                                dst[j] += src[j];
2905                }
2906        }
2907        return length;
2908}
2909#endif /* CONFIG_PROC_FS */
2910#endif /* CONFIG_NET_CLS_ROUTE */
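/*
 * ip_rt_acct_read() above sums the per-CPU accounting tables into the
 * caller's buffer, so /proc/net/rt_acct is a flat binary array of 256
 * struct ip_rt_acct records (one per routing realm), each holding four u32
 * counters.  The field order assumed below (o_bytes, o_packets, i_bytes,
 * i_packets) follows struct ip_rt_acct in include/net/route.h; treat it as
 * an assumption and check the header in your tree.  Illustrative
 * user-space sketch only, not part of this file.
 */
#if 0	/* illustrative user-space sketch, never compiled here */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t rec[4];
	unsigned int realm = 0;
	FILE *f = fopen("/proc/net/rt_acct", "rb");

	if (!f)
		return 1;
	while (realm < 256 && fread(rec, sizeof(rec), 1, f) == 1) {
		if (rec[0] || rec[1] || rec[2] || rec[3])
			printf("realm %3u: out %u bytes/%u pkts, in %u bytes/%u pkts\n",
			       realm, rec[0], rec[1], rec[2], rec[3]);
		realm++;
	}
	fclose(f);
	return 0;
}
#endif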
2911
2912static __initdata unsigned long rhash_entries;
2913static int __init set_rhash_entries(char *str)
2914{
2915        if (!str)
2916                return 0;
2917        rhash_entries = simple_strtoul(str, &str, 0);
2918        return 1;
2919}
2920__setup("rhash_entries=", set_rhash_entries);
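/*
 * "rhash_entries" is a kernel command-line parameter: booting with e.g.
 * "rhash_entries=262144" (an arbitrary example value) hands that string to
 * set_rhash_entries(), and the parsed number is later passed to
 * alloc_large_system_hash() below to pre-size the route cache hash table
 * instead of letting it be sized from available memory.
 */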
2921
2922int __init ip_rt_init(void)
2923{
2924        int rc = 0;
2925
2926        rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2927                             (jiffies ^ (jiffies >> 7)));
2928
2929#ifdef CONFIG_NET_CLS_ROUTE
2930        {
2931        int order;
2932        for (order = 0;
2933             (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2934                /* NOTHING */;
2935        ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2936        if (!ip_rt_acct)
2937                panic("IP: failed to allocate ip_rt_acct\n");
2938        memset(ip_rt_acct, 0, PAGE_SIZE << order);
2939        }
2940#endif
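/*
 * The loop above picks the smallest page order whose allocation covers
 * 256 * sizeof(struct ip_rt_acct) * NR_CPUS bytes.  As a worked example,
 * assuming 4 KiB pages and a 16-byte struct ip_rt_acct (four u32 counters):
 * one CPU needs 256 * 16 = 4096 bytes, so with NR_CPUS = 8 the target is
 * 32 KiB and the loop stops at order 3 (PAGE_SIZE << 3 = 32 KiB).
 */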
2941
2942        ipv4_dst_ops.kmem_cachep =
2943                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2944                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2945
2946        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2947
2948        rt_hash_table = (struct rt_hash_bucket *)
2949                alloc_large_system_hash("IP route cache",
2950                                        sizeof(struct rt_hash_bucket),
2951                                        rhash_entries,
2952                                        (num_physpages >= 128 * 1024) ?
2953                                        15 : 17,
2954                                        0,
2955                                        &rt_hash_log,
2956                                        &rt_hash_mask,
2957                                        0);
2958        memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
2959        rt_hash_lock_init();
2960
2961        ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2962        ip_rt_max_size = (rt_hash_mask + 1) * 16;
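	/*
	 * rt_hash_mask + 1 is the number of hash buckets chosen by
	 * alloc_large_system_hash() (or forced via rhash_entries=).  The GC
	 * threshold is one cached route per bucket on average, the hard cap
	 * sixteen per bucket: with 2^17 = 131072 buckets, for example, that
	 * gives gc_thresh = 131072 and ip_rt_max_size = 2097152 entries; the
	 * actual bucket count depends on system memory.
	 */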
2963
2964        devinet_init();
2965        ip_fib_init();
2966
2967        init_timer(&rt_flush_timer);
2968        rt_flush_timer.function = rt_run_flush;
2969        init_timer(&rt_secret_timer);
2970        rt_secret_timer.function = rt_secret_rebuild;
2971
2972        /* All the timers started at system startup tend
2973           to synchronize.  Perturb them a bit.
2974         */
2975        schedule_delayed_work(&expires_work,
2976                net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
2977
2978        rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
2979                ip_rt_secret_interval;
2980        add_timer(&rt_secret_timer);
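	/*
	 * Both delays above are drawn uniformly from [interval, 2 * interval),
	 * so with the default secret_interval of 10 minutes (assumed here; see
	 * the ip_rt_secret_interval initializer) the first hash-secret rebuild
	 * fires somewhere between 10 and 20 minutes after boot rather than at
	 * a fixed, easily synchronized instant.
	 */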
2981
2982#ifdef CONFIG_PROC_FS
2983        {
2984        struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
2985        if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
2986            !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
2987                                             init_net.proc_net_stat))) {
2988                return -ENOMEM;
2989        }
2990        rtstat_pde->proc_fops = &rt_cpu_seq_fops;
2991        }
2992#ifdef CONFIG_NET_CLS_ROUTE
2993        create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL);
2994#endif
2995#endif
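/*
 * With CONFIG_PROC_FS the block above exposes the cache itself as
 * /proc/net/rt_cache (one text line per cached route), per-CPU cache
 * statistics as /proc/net/stat/rt_cache, and, with CONFIG_NET_CLS_ROUTE,
 * the per-realm counters as /proc/net/rt_acct read by the helper earlier
 * in this file.  "cat /proc/net/stat/rt_cache", for instance, prints a
 * header followed by one hex-formatted line of counters per CPU.
 */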
2996#ifdef CONFIG_XFRM
2997        xfrm_init();
2998        xfrm4_init();
2999#endif
3000        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
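	/* Registering inet_rtm_getroute here is what answers RTM_GETROUTE
	 * netlink requests, e.g. the lookups issued by "ip route get". */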
3001
3002        return rc;
3003}
3004
3005EXPORT_SYMBOL(__ip_select_ident);
3006EXPORT_SYMBOL(ip_route_input);
3007EXPORT_SYMBOL(ip_route_output_key);
3008