linux/net/ipv4/route.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              ROUTE - implementation of the IP router.
   7 *
   8 * Authors:     Ross Biro
   9 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13 *
  14 * Fixes:
  15 *              Alan Cox        :       Verify area fixes.
  16 *              Alan Cox        :       cli() protects routing changes
  17 *              Rui Oliveira    :       ICMP routing table updates
  18 *              (rco@di.uminho.pt)      Routing table insertion and update
  19 *              Linus Torvalds  :       Rewrote bits to be sensible
  20 *              Alan Cox        :       Added BSD route gw semantics
  21 *              Alan Cox        :       Super /proc >4K
  22 *              Alan Cox        :       MTU in route table
  23 *              Alan Cox        :       MSS actually. Also added the window
  24 *                                      clamper.
  25 *              Sam Lantinga    :       Fixed route matching in rt_del()
  26 *              Alan Cox        :       Routing cache support.
  27 *              Alan Cox        :       Removed compatibility cruft.
  28 *              Alan Cox        :       RTF_REJECT support.
  29 *              Alan Cox        :       TCP irtt support.
  30 *              Jonathan Naylor :       Added Metric support.
  31 *      Miquel van Smoorenburg  :       BSD API fixes.
  32 *      Miquel van Smoorenburg  :       Metrics.
  33 *              Alan Cox        :       Use __u32 properly
  34 *              Alan Cox        :       Aligned routing errors more closely with BSD
  35 *                                      our system is still very different.
  36 *              Alan Cox        :       Faster /proc handling
  37 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38 *                                      routing caches and better behaviour.
  39 *
  40 *              Olaf Erb        :       irtt wasn't being copied right.
  41 *              Bjorn Ekwall    :       Kerneld route support.
  42 *              Alan Cox        :       Multicast fixed (I hope)
  43 *              Pavel Krauz     :       Limited broadcast fixed
  44 *              Mike McLagan    :       Routing by source
  45 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46 *                                      route.c and rewritten from scratch.
  47 *              Andi Kleen      :       Load-limit warning messages.
  48 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52 *              Marc Boucher    :       routing by fwmark
  53 *      Robert Olsson           :       Added rt_cache statistics
  54 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57 *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58 *
  59 *              This program is free software; you can redistribute it and/or
  60 *              modify it under the terms of the GNU General Public License
  61 *              as published by the Free Software Foundation; either version
  62 *              2 of the License, or (at your option) any later version.
  63 */
  64
  65#define pr_fmt(fmt) "IPv4: " fmt
  66
  67#include <linux/module.h>
  68#include <asm/uaccess.h>
  69#include <linux/bitops.h>
  70#include <linux/types.h>
  71#include <linux/kernel.h>
  72#include <linux/mm.h>
  73#include <linux/string.h>
  74#include <linux/socket.h>
  75#include <linux/sockios.h>
  76#include <linux/errno.h>
  77#include <linux/in.h>
  78#include <linux/inet.h>
  79#include <linux/netdevice.h>
  80#include <linux/proc_fs.h>
  81#include <linux/init.h>
  82#include <linux/skbuff.h>
  83#include <linux/inetdevice.h>
  84#include <linux/igmp.h>
  85#include <linux/pkt_sched.h>
  86#include <linux/mroute.h>
  87#include <linux/netfilter_ipv4.h>
  88#include <linux/random.h>
  89#include <linux/rcupdate.h>
  90#include <linux/times.h>
  91#include <linux/slab.h>
  92#include <net/dst.h>
  93#include <net/net_namespace.h>
  94#include <net/protocol.h>
  95#include <net/ip.h>
  96#include <net/route.h>
  97#include <net/inetpeer.h>
  98#include <net/sock.h>
  99#include <net/ip_fib.h>
 100#include <net/arp.h>
 101#include <net/tcp.h>
 102#include <net/icmp.h>
 103#include <net/xfrm.h>
 104#include <net/netevent.h>
 105#include <net/rtnetlink.h>
 106#ifdef CONFIG_SYSCTL
 107#include <linux/sysctl.h>
 108#include <linux/kmemleak.h>
 109#endif
 110#include <net/secure_seq.h>
 111
 112#define RT_FL_TOS(oldflp4) \
 113        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 114
 115#define IP_MAX_MTU      0xFFF0
 116
 117#define RT_GC_TIMEOUT (300*HZ)
 118
 119static int ip_rt_max_size;
 120static int ip_rt_redirect_number __read_mostly  = 9;
 121static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 122static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 123static int ip_rt_error_cost __read_mostly       = HZ;
 124static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 125static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 126static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 127static int ip_rt_min_advmss __read_mostly       = 256;
 128
 129/*
 130 *      Interface to generic destination cache.
 131 */
 132
 133static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 134static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 135static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 136static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 137static void              ipv4_link_failure(struct sk_buff *skb);
 138static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 139                                           struct sk_buff *skb, u32 mtu);
 140static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 141                                        struct sk_buff *skb);
 142static void             ipv4_dst_destroy(struct dst_entry *dst);
 143
 144static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 145                            int how)
 146{
 147}
 148
 149static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 150{
 151        WARN_ON(1);
 152        return NULL;
 153}
 154
 155static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 156                                           struct sk_buff *skb,
 157                                           const void *daddr);
 158
 159static struct dst_ops ipv4_dst_ops = {
 160        .family =               AF_INET,
 161        .protocol =             cpu_to_be16(ETH_P_IP),
 162        .check =                ipv4_dst_check,
 163        .default_advmss =       ipv4_default_advmss,
 164        .mtu =                  ipv4_mtu,
 165        .cow_metrics =          ipv4_cow_metrics,
 166        .destroy =              ipv4_dst_destroy,
 167        .ifdown =               ipv4_dst_ifdown,
 168        .negative_advice =      ipv4_negative_advice,
 169        .link_failure =         ipv4_link_failure,
 170        .update_pmtu =          ip_rt_update_pmtu,
 171        .redirect =             ip_do_redirect,
 172        .local_out =            __ip_local_out,
 173        .neigh_lookup =         ipv4_neigh_lookup,
 174};
 175
 176#define ECN_OR_COST(class)      TC_PRIO_##class
 177
 178const __u8 ip_tos2prio[16] = {
 179        TC_PRIO_BESTEFFORT,
 180        ECN_OR_COST(BESTEFFORT),
 181        TC_PRIO_BESTEFFORT,
 182        ECN_OR_COST(BESTEFFORT),
 183        TC_PRIO_BULK,
 184        ECN_OR_COST(BULK),
 185        TC_PRIO_BULK,
 186        ECN_OR_COST(BULK),
 187        TC_PRIO_INTERACTIVE,
 188        ECN_OR_COST(INTERACTIVE),
 189        TC_PRIO_INTERACTIVE,
 190        ECN_OR_COST(INTERACTIVE),
 191        TC_PRIO_INTERACTIVE_BULK,
 192        ECN_OR_COST(INTERACTIVE_BULK),
 193        TC_PRIO_INTERACTIVE_BULK,
 194        ECN_OR_COST(INTERACTIVE_BULK)
 195};
 196EXPORT_SYMBOL(ip_tos2prio);
 197
 198static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 199#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 200
 201#ifdef CONFIG_PROC_FS
 202static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 203{
 204        if (*pos)
 205                return NULL;
 206        return SEQ_START_TOKEN;
 207}
 208
 209static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 210{
 211        ++*pos;
 212        return NULL;
 213}
 214
 215static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 216{
 217}
 218
 219static int rt_cache_seq_show(struct seq_file *seq, void *v)
 220{
 221        if (v == SEQ_START_TOKEN)
 222                seq_printf(seq, "%-127s\n",
 223                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 224                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 225                           "HHUptod\tSpecDst");
 226        return 0;
 227}
 228
 229static const struct seq_operations rt_cache_seq_ops = {
 230        .start  = rt_cache_seq_start,
 231        .next   = rt_cache_seq_next,
 232        .stop   = rt_cache_seq_stop,
 233        .show   = rt_cache_seq_show,
 234};
 235
 236static int rt_cache_seq_open(struct inode *inode, struct file *file)
 237{
 238        return seq_open(file, &rt_cache_seq_ops);
 239}
 240
 241static const struct file_operations rt_cache_seq_fops = {
 242        .owner   = THIS_MODULE,
 243        .open    = rt_cache_seq_open,
 244        .read    = seq_read,
 245        .llseek  = seq_lseek,
 246        .release = seq_release,
 247};
 248
 249
 250static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 251{
 252        int cpu;
 253
 254        if (*pos == 0)
 255                return SEQ_START_TOKEN;
 256
 257        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 258                if (!cpu_possible(cpu))
 259                        continue;
 260                *pos = cpu+1;
 261                return &per_cpu(rt_cache_stat, cpu);
 262        }
 263        return NULL;
 264}
 265
 266static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 267{
 268        int cpu;
 269
 270        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 271                if (!cpu_possible(cpu))
 272                        continue;
 273                *pos = cpu+1;
 274                return &per_cpu(rt_cache_stat, cpu);
 275        }
 276        return NULL;
 277
 278}
 279
 280static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 281{
 282
 283}
 284
 285static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 286{
 287        struct rt_cache_stat *st = v;
 288
 289        if (v == SEQ_START_TOKEN) {
 290                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 291                return 0;
 292        }
 293
 294        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 295                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 296                   dst_entries_get_slow(&ipv4_dst_ops),
 297                   st->in_hit,
 298                   st->in_slow_tot,
 299                   st->in_slow_mc,
 300                   st->in_no_route,
 301                   st->in_brd,
 302                   st->in_martian_dst,
 303                   st->in_martian_src,
 304
 305                   st->out_hit,
 306                   st->out_slow_tot,
 307                   st->out_slow_mc,
 308
 309                   st->gc_total,
 310                   st->gc_ignored,
 311                   st->gc_goal_miss,
 312                   st->gc_dst_overflow,
 313                   st->in_hlist_search,
 314                   st->out_hlist_search
 315                );
 316        return 0;
 317}
 318
 319static const struct seq_operations rt_cpu_seq_ops = {
 320        .start  = rt_cpu_seq_start,
 321        .next   = rt_cpu_seq_next,
 322        .stop   = rt_cpu_seq_stop,
 323        .show   = rt_cpu_seq_show,
 324};
 325
 326
 327static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 328{
 329        return seq_open(file, &rt_cpu_seq_ops);
 330}
 331
 332static const struct file_operations rt_cpu_seq_fops = {
 333        .owner   = THIS_MODULE,
 334        .open    = rt_cpu_seq_open,
 335        .read    = seq_read,
 336        .llseek  = seq_lseek,
 337        .release = seq_release,
 338};
 339
 340#ifdef CONFIG_IP_ROUTE_CLASSID
 341static int rt_acct_proc_show(struct seq_file *m, void *v)
 342{
 343        struct ip_rt_acct *dst, *src;
 344        unsigned int i, j;
 345
 346        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 347        if (!dst)
 348                return -ENOMEM;
 349
 350        for_each_possible_cpu(i) {
 351                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 352                for (j = 0; j < 256; j++) {
 353                        dst[j].o_bytes   += src[j].o_bytes;
 354                        dst[j].o_packets += src[j].o_packets;
 355                        dst[j].i_bytes   += src[j].i_bytes;
 356                        dst[j].i_packets += src[j].i_packets;
 357                }
 358        }
 359
 360        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 361        kfree(dst);
 362        return 0;
 363}
 364
 365static int rt_acct_proc_open(struct inode *inode, struct file *file)
 366{
 367        return single_open(file, rt_acct_proc_show, NULL);
 368}
 369
 370static const struct file_operations rt_acct_proc_fops = {
 371        .owner          = THIS_MODULE,
 372        .open           = rt_acct_proc_open,
 373        .read           = seq_read,
 374        .llseek         = seq_lseek,
 375        .release        = single_release,
 376};
 377#endif
 378
 379static int __net_init ip_rt_do_proc_init(struct net *net)
 380{
 381        struct proc_dir_entry *pde;
 382
 383        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
 384                          &rt_cache_seq_fops);
 385        if (!pde)
 386                goto err1;
 387
 388        pde = proc_create("rt_cache", S_IRUGO,
 389                          net->proc_net_stat, &rt_cpu_seq_fops);
 390        if (!pde)
 391                goto err2;
 392
 393#ifdef CONFIG_IP_ROUTE_CLASSID
 394        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 395        if (!pde)
 396                goto err3;
 397#endif
 398        return 0;
 399
 400#ifdef CONFIG_IP_ROUTE_CLASSID
 401err3:
 402        remove_proc_entry("rt_cache", net->proc_net_stat);
 403#endif
 404err2:
 405        remove_proc_entry("rt_cache", net->proc_net);
 406err1:
 407        return -ENOMEM;
 408}
 409
 410static void __net_exit ip_rt_do_proc_exit(struct net *net)
 411{
 412        remove_proc_entry("rt_cache", net->proc_net_stat);
 413        remove_proc_entry("rt_cache", net->proc_net);
 414#ifdef CONFIG_IP_ROUTE_CLASSID
 415        remove_proc_entry("rt_acct", net->proc_net);
 416#endif
 417}
 418
 419static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 420        .init = ip_rt_do_proc_init,
 421        .exit = ip_rt_do_proc_exit,
 422};
 423
 424static int __init ip_rt_proc_init(void)
 425{
 426        return register_pernet_subsys(&ip_rt_proc_ops);
 427}
 428
 429#else
 430static inline int ip_rt_proc_init(void)
 431{
 432        return 0;
 433}
 434#endif /* CONFIG_PROC_FS */
 435
 436static inline bool rt_is_expired(const struct rtable *rth)
 437{
 438        return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 439}
 440
 441void rt_cache_flush(struct net *net)
 442{
 443        rt_genid_bump(net);
 444}
 445
 446static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 447                                           struct sk_buff *skb,
 448                                           const void *daddr)
 449{
 450        struct net_device *dev = dst->dev;
 451        const __be32 *pkey = daddr;
 452        const struct rtable *rt;
 453        struct neighbour *n;
 454
 455        rt = (const struct rtable *) dst;
 456        if (rt->rt_gateway)
 457                pkey = (const __be32 *) &rt->rt_gateway;
 458        else if (skb)
 459                pkey = &ip_hdr(skb)->daddr;
 460
 461        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 462        if (n)
 463                return n;
 464        return neigh_create(&arp_tbl, pkey, dev);
 465}
 466
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However,
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because there is no chance of
 * selecting an ID that stays unique for a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
 474static void ip_select_fb_ident(struct iphdr *iph)
 475{
 476        static DEFINE_SPINLOCK(ip_fb_id_lock);
 477        static u32 ip_fallback_id;
 478        u32 salt;
 479
 480        spin_lock_bh(&ip_fb_id_lock);
 481        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
 482        iph->id = htons(salt & 0xFFFF);
 483        ip_fallback_id = salt;
 484        spin_unlock_bh(&ip_fb_id_lock);
 485}
 486
 487void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 488{
 489        struct net *net = dev_net(dst->dev);
 490        struct inet_peer *peer;
 491
 492        peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
 493        if (peer) {
 494                iph->id = htons(inet_getid(peer, more));
 495                inet_putpeer(peer);
 496                return;
 497        }
 498
 499        ip_select_fb_ident(iph);
 500}
 501EXPORT_SYMBOL(__ip_select_ident);
 502
 503static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 504                             const struct iphdr *iph,
 505                             int oif, u8 tos,
 506                             u8 prot, u32 mark, int flow_flags)
 507{
 508        if (sk) {
 509                const struct inet_sock *inet = inet_sk(sk);
 510
 511                oif = sk->sk_bound_dev_if;
 512                mark = sk->sk_mark;
 513                tos = RT_CONN_FLAGS(sk);
 514                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 515        }
 516        flowi4_init_output(fl4, oif, mark, tos,
 517                           RT_SCOPE_UNIVERSE, prot,
 518                           flow_flags,
 519                           iph->daddr, iph->saddr, 0, 0);
 520}
 521
 522static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 523                               const struct sock *sk)
 524{
 525        const struct iphdr *iph = ip_hdr(skb);
 526        int oif = skb->dev->ifindex;
 527        u8 tos = RT_TOS(iph->tos);
 528        u8 prot = iph->protocol;
 529        u32 mark = skb->mark;
 530
 531        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
 532}
 533
 534static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 535{
 536        const struct inet_sock *inet = inet_sk(sk);
 537        const struct ip_options_rcu *inet_opt;
 538        __be32 daddr = inet->inet_daddr;
 539
 540        rcu_read_lock();
 541        inet_opt = rcu_dereference(inet->inet_opt);
 542        if (inet_opt && inet_opt->opt.srr)
 543                daddr = inet_opt->opt.faddr;
 544        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 545                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 546                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 547                           inet_sk_flowi_flags(sk),
 548                           daddr, inet->inet_saddr, 0, 0);
 549        rcu_read_unlock();
 550}
 551
 552static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 553                                 const struct sk_buff *skb)
 554{
 555        if (skb)
 556                build_skb_flow_key(fl4, skb, sk);
 557        else
 558                build_sk_flow_key(fl4, sk);
 559}
 560
 561static inline void rt_free(struct rtable *rt)
 562{
 563        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
 564}
 565
 566static DEFINE_SPINLOCK(fnhe_lock);
 567
 568static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 569{
 570        struct fib_nh_exception *fnhe, *oldest;
 571        struct rtable *orig;
 572
 573        oldest = rcu_dereference(hash->chain);
 574        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 575             fnhe = rcu_dereference(fnhe->fnhe_next)) {
 576                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 577                        oldest = fnhe;
 578        }
 579        orig = rcu_dereference(oldest->fnhe_rth);
 580        if (orig) {
 581                RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
 582                rt_free(orig);
 583        }
 584        return oldest;
 585}
 586
 587static inline u32 fnhe_hashfun(__be32 daddr)
 588{
 589        u32 hval;
 590
 591        hval = (__force u32) daddr;
 592        hval ^= (hval >> 11) ^ (hval >> 22);
 593
 594        return hval & (FNHE_HASH_SIZE - 1);
 595}
 596
 597static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 598                                  u32 pmtu, unsigned long expires)
 599{
 600        struct fnhe_hash_bucket *hash;
 601        struct fib_nh_exception *fnhe;
 602        int depth;
 603        u32 hval = fnhe_hashfun(daddr);
 604
 605        spin_lock_bh(&fnhe_lock);
 606
 607        hash = nh->nh_exceptions;
 608        if (!hash) {
 609                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 610                if (!hash)
 611                        goto out_unlock;
 612                nh->nh_exceptions = hash;
 613        }
 614
 615        hash += hval;
 616
 617        depth = 0;
 618        for (fnhe = rcu_dereference(hash->chain); fnhe;
 619             fnhe = rcu_dereference(fnhe->fnhe_next)) {
 620                if (fnhe->fnhe_daddr == daddr)
 621                        break;
 622                depth++;
 623        }
 624
 625        if (fnhe) {
 626                if (gw)
 627                        fnhe->fnhe_gw = gw;
 628                if (pmtu) {
 629                        fnhe->fnhe_pmtu = pmtu;
 630                        fnhe->fnhe_expires = expires;
 631                }
 632        } else {
 633                if (depth > FNHE_RECLAIM_DEPTH)
 634                        fnhe = fnhe_oldest(hash);
 635                else {
 636                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 637                        if (!fnhe)
 638                                goto out_unlock;
 639
 640                        fnhe->fnhe_next = hash->chain;
 641                        rcu_assign_pointer(hash->chain, fnhe);
 642                }
 643                fnhe->fnhe_daddr = daddr;
 644                fnhe->fnhe_gw = gw;
 645                fnhe->fnhe_pmtu = pmtu;
 646                fnhe->fnhe_expires = expires;
 647        }
 648
 649        fnhe->fnhe_stamp = jiffies;
 650
 651out_unlock:
 652        spin_unlock_bh(&fnhe_lock);
 653        return;
 654}
 655
 656static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 657                             bool kill_route)
 658{
 659        __be32 new_gw = icmp_hdr(skb)->un.gateway;
 660        __be32 old_gw = ip_hdr(skb)->saddr;
 661        struct net_device *dev = skb->dev;
 662        struct in_device *in_dev;
 663        struct fib_result res;
 664        struct neighbour *n;
 665        struct net *net;
 666
 667        switch (icmp_hdr(skb)->code & 7) {
 668        case ICMP_REDIR_NET:
 669        case ICMP_REDIR_NETTOS:
 670        case ICMP_REDIR_HOST:
 671        case ICMP_REDIR_HOSTTOS:
 672                break;
 673
 674        default:
 675                return;
 676        }
 677
 678        if (rt->rt_gateway != old_gw)
 679                return;
 680
 681        in_dev = __in_dev_get_rcu(dev);
 682        if (!in_dev)
 683                return;
 684
 685        net = dev_net(dev);
 686        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 687            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 688            ipv4_is_zeronet(new_gw))
 689                goto reject_redirect;
 690
 691        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 692                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 693                        goto reject_redirect;
 694                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 695                        goto reject_redirect;
 696        } else {
 697                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 698                        goto reject_redirect;
 699        }
 700
 701        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
 702        if (n) {
 703                if (!(n->nud_state & NUD_VALID)) {
 704                        neigh_event_send(n, NULL);
 705                } else {
 706                        if (fib_lookup(net, fl4, &res) == 0) {
 707                                struct fib_nh *nh = &FIB_RES_NH(res);
 708
 709                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
 710                                                      0, 0);
 711                        }
 712                        if (kill_route)
 713                                rt->dst.obsolete = DST_OBSOLETE_KILL;
 714                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 715                }
 716                neigh_release(n);
 717        }
 718        return;
 719
 720reject_redirect:
 721#ifdef CONFIG_IP_ROUTE_VERBOSE
 722        if (IN_DEV_LOG_MARTIANS(in_dev)) {
 723                const struct iphdr *iph = (const struct iphdr *) skb->data;
 724                __be32 daddr = iph->daddr;
 725                __be32 saddr = iph->saddr;
 726
 727                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 728                                     "  Advised path = %pI4 -> %pI4\n",
 729                                     &old_gw, dev->name, &new_gw,
 730                                     &saddr, &daddr);
 731        }
 732#endif
 733        ;
 734}
 735
 736static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 737{
 738        struct rtable *rt;
 739        struct flowi4 fl4;
 740        const struct iphdr *iph = (const struct iphdr *) skb->data;
 741        int oif = skb->dev->ifindex;
 742        u8 tos = RT_TOS(iph->tos);
 743        u8 prot = iph->protocol;
 744        u32 mark = skb->mark;
 745
 746        rt = (struct rtable *) dst;
 747
 748        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
 749        __ip_do_redirect(rt, skb, &fl4, true);
 750}
 751
 752static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 753{
 754        struct rtable *rt = (struct rtable *)dst;
 755        struct dst_entry *ret = dst;
 756
 757        if (rt) {
 758                if (dst->obsolete > 0) {
 759                        ip_rt_put(rt);
 760                        ret = NULL;
 761                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 762                           rt->dst.expires) {
 763                        ip_rt_put(rt);
 764                        ret = NULL;
 765                }
 766        }
 767        return ret;
 768}
 769
 770/*
 771 * Algorithm:
 772 *      1. The first ip_rt_redirect_number redirects are sent
 773 *         with exponential backoff, then we stop sending them at all,
 774 *         assuming that the host ignores our redirects.
 775 *      2. If we did not see packets requiring redirects
 776 *         during ip_rt_redirect_silence, we assume that the host
 777 *         forgot redirected route and start to send redirects again.
 778 *
 779 * This algorithm is much cheaper and more intelligent than dumb load limiting
 780 * in icmp.c.
 781 *
 782 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 783 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 784 */
 785
 786void ip_rt_send_redirect(struct sk_buff *skb)
 787{
 788        struct rtable *rt = skb_rtable(skb);
 789        struct in_device *in_dev;
 790        struct inet_peer *peer;
 791        struct net *net;
 792        int log_martians;
 793
 794        rcu_read_lock();
 795        in_dev = __in_dev_get_rcu(rt->dst.dev);
 796        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 797                rcu_read_unlock();
 798                return;
 799        }
 800        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 801        rcu_read_unlock();
 802
 803        net = dev_net(rt->dst.dev);
 804        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 805        if (!peer) {
 806                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 807                          rt_nexthop(rt, ip_hdr(skb)->daddr));
 808                return;
 809        }
 810
 811        /* No redirected packets during ip_rt_redirect_silence;
 812         * reset the algorithm.
 813         */
 814        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 815                peer->rate_tokens = 0;
 816
 817        /* Too many ignored redirects; do not send anything
 818         * set dst.rate_last to the last seen redirected packet.
 819         */
 820        if (peer->rate_tokens >= ip_rt_redirect_number) {
 821                peer->rate_last = jiffies;
 822                goto out_put_peer;
 823        }
 824
 825        /* Check for load limit; set rate_last to the latest sent
 826         * redirect.
 827         */
 828        if (peer->rate_tokens == 0 ||
 829            time_after(jiffies,
 830                       (peer->rate_last +
 831                        (ip_rt_redirect_load << peer->rate_tokens)))) {
 832                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 833
 834                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 835                peer->rate_last = jiffies;
 836                ++peer->rate_tokens;
 837#ifdef CONFIG_IP_ROUTE_VERBOSE
 838                if (log_martians &&
 839                    peer->rate_tokens == ip_rt_redirect_number)
 840                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 841                                             &ip_hdr(skb)->saddr, inet_iif(skb),
 842                                             &ip_hdr(skb)->daddr, &gw);
 843#endif
 844        }
 845out_put_peer:
 846        inet_putpeer(peer);
 847}
 848
 849static int ip_error(struct sk_buff *skb)
 850{
 851        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 852        struct rtable *rt = skb_rtable(skb);
 853        struct inet_peer *peer;
 854        unsigned long now;
 855        struct net *net;
 856        bool send;
 857        int code;
 858
 859        net = dev_net(rt->dst.dev);
 860        if (!IN_DEV_FORWARD(in_dev)) {
 861                switch (rt->dst.error) {
 862                case EHOSTUNREACH:
 863                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
 864                        break;
 865
 866                case ENETUNREACH:
 867                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 868                        break;
 869                }
 870                goto out;
 871        }
 872
 873        switch (rt->dst.error) {
 874        case EINVAL:
 875        default:
 876                goto out;
 877        case EHOSTUNREACH:
 878                code = ICMP_HOST_UNREACH;
 879                break;
 880        case ENETUNREACH:
 881                code = ICMP_NET_UNREACH;
 882                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 883                break;
 884        case EACCES:
 885                code = ICMP_PKT_FILTERED;
 886                break;
 887        }
 888
 889        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 890
 891        send = true;
 892        if (peer) {
 893                now = jiffies;
 894                peer->rate_tokens += now - peer->rate_last;
 895                if (peer->rate_tokens > ip_rt_error_burst)
 896                        peer->rate_tokens = ip_rt_error_burst;
 897                peer->rate_last = now;
 898                if (peer->rate_tokens >= ip_rt_error_cost)
 899                        peer->rate_tokens -= ip_rt_error_cost;
 900                else
 901                        send = false;
 902                inet_putpeer(peer);
 903        }
 904        if (send)
 905                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 906
 907out:    kfree_skb(skb);
 908        return 0;
 909}
 910
 911static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 912{
 913        struct dst_entry *dst = &rt->dst;
 914        struct fib_result res;
 915
 916        if (dst_metric_locked(dst, RTAX_MTU))
 917                return;
 918
 919        if (dst->dev->mtu < mtu)
 920                return;
 921
 922        if (mtu < ip_rt_min_pmtu)
 923                mtu = ip_rt_min_pmtu;
 924
 925        if (!rt->rt_pmtu) {
 926                dst->obsolete = DST_OBSOLETE_KILL;
 927        } else {
 928                rt->rt_pmtu = mtu;
 929                dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
 930        }
 931
 932        rcu_read_lock();
 933        if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
 934                struct fib_nh *nh = &FIB_RES_NH(res);
 935
 936                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
 937                                      jiffies + ip_rt_mtu_expires);
 938        }
 939        rcu_read_unlock();
 940}
 941
 942static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 943                              struct sk_buff *skb, u32 mtu)
 944{
 945        struct rtable *rt = (struct rtable *) dst;
 946        struct flowi4 fl4;
 947
 948        ip_rt_build_flow_key(&fl4, sk, skb);
 949        __ip_rt_update_pmtu(rt, &fl4, mtu);
 950}
 951
 952void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
 953                      int oif, u32 mark, u8 protocol, int flow_flags)
 954{
 955        const struct iphdr *iph = (const struct iphdr *) skb->data;
 956        struct flowi4 fl4;
 957        struct rtable *rt;
 958
 959        __build_flow_key(&fl4, NULL, iph, oif,
 960                         RT_TOS(iph->tos), protocol, mark, flow_flags);
 961        rt = __ip_route_output_key(net, &fl4);
 962        if (!IS_ERR(rt)) {
 963                __ip_rt_update_pmtu(rt, &fl4, mtu);
 964                ip_rt_put(rt);
 965        }
 966}
 967EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
 968
 969static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 970{
 971        const struct iphdr *iph = (const struct iphdr *) skb->data;
 972        struct flowi4 fl4;
 973        struct rtable *rt;
 974
 975        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
 976        rt = __ip_route_output_key(sock_net(sk), &fl4);
 977        if (!IS_ERR(rt)) {
 978                __ip_rt_update_pmtu(rt, &fl4, mtu);
 979                ip_rt_put(rt);
 980        }
 981}
 982
/*
 * Apply a PMTU update to the route cached on socket @sk.
 *
 * Runs with the socket bh-locked.  If the socket is owned by user
 * context or has no cached dst, the update is applied to a freshly
 * looked up route only (__ipv4_sk_update_pmtu()) and the socket's dst
 * is left alone.  Otherwise the cached route is revalidated, updated,
 * revalidated again, and replaced on the socket when a new route had
 * to be looked up along the way.
 */
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *dst;
        bool new = false;       /* true once rt came from a lookup done here */

        bh_lock_sock(sk);
        rt = (struct rtable *) __sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !rt) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        /* Cached dst no longer valid: work on a freshly looked up route. */
        if (!__sk_dst_check(sk, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        /* The update above may have marked the route obsolete
         * (DST_OBSOLETE_KILL); if so, drop the reference taken by our
         * own lookup, if any, and look the route up once more.
         */
        dst = dst_check(&rt->dst, 0);
        if (!dst) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        /* Install the replacement route on the socket. */
        if (new)
                __sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1030
1031void ipv4_redirect(struct sk_buff *skb, struct net *net,
1032                   int oif, u32 mark, u8 protocol, int flow_flags)
1033{
1034        const struct iphdr *iph = (const struct iphdr *) skb->data;
1035        struct flowi4 fl4;
1036        struct rtable *rt;
1037
1038        __build_flow_key(&fl4, NULL, iph, oif,
1039                         RT_TOS(iph->tos), protocol, mark, flow_flags);
1040        rt = __ip_route_output_key(net, &fl4);
1041        if (!IS_ERR(rt)) {
1042                __ip_do_redirect(rt, skb, &fl4, false);
1043                ip_rt_put(rt);
1044        }
1045}
1046EXPORT_SYMBOL_GPL(ipv4_redirect);
1047
1048void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1049{
1050        const struct iphdr *iph = (const struct iphdr *) skb->data;
1051        struct flowi4 fl4;
1052        struct rtable *rt;
1053
1054        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1055        rt = __ip_route_output_key(sock_net(sk), &fl4);
1056        if (!IS_ERR(rt)) {
1057                __ip_do_redirect(rt, skb, &fl4, false);
1058                ip_rt_put(rt);
1059        }
1060}
1061EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1062
1063static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1064{
1065        struct rtable *rt = (struct rtable *) dst;
1066
1067        /* All IPV4 dsts are created with ->obsolete set to the value
1068         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1069         * into this function always.
1070         *
1071         * When a PMTU/redirect information update invalidates a
1072         * route, this is indicated by setting obsolete to
1073         * DST_OBSOLETE_KILL.
1074         */
1075        if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1076                return NULL;
1077        return dst;
1078}
1079
1080static void ipv4_link_failure(struct sk_buff *skb)
1081{
1082        struct rtable *rt;
1083
1084        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1085
1086        rt = skb_rtable(skb);
1087        if (rt)
1088                dst_set_expires(&rt->dst, 0);
1089}
1090
1091static int ip_rt_bug(struct sk_buff *skb)
1092{
1093        pr_debug("%s: %pI4 -> %pI4, %s\n",
1094                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1095                 skb->dev ? skb->dev->name : "?");
1096        kfree_skb(skb);
1097        WARN_ON(1);
1098        return 0;
1099}
1100
1101/*
1102   We do not cache source address of outgoing interface,
1103   because it is used only by IP RR, TS and SRR options,
1104   so that it out of fast path.
1105
1106   BTW remember: "addr" is allowed to be not aligned
1107   in IP options!
1108 */
1109
1110void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1111{
1112        __be32 src;
1113
1114        if (rt_is_output_route(rt))
1115                src = ip_hdr(skb)->saddr;
1116        else {
1117                struct fib_result res;
1118                struct flowi4 fl4;
1119                struct iphdr *iph;
1120
1121                iph = ip_hdr(skb);
1122
1123                memset(&fl4, 0, sizeof(fl4));
1124                fl4.daddr = iph->daddr;
1125                fl4.saddr = iph->saddr;
1126                fl4.flowi4_tos = RT_TOS(iph->tos);
1127                fl4.flowi4_oif = rt->dst.dev->ifindex;
1128                fl4.flowi4_iif = skb->dev->ifindex;
1129                fl4.flowi4_mark = skb->mark;
1130
1131                rcu_read_lock();
1132                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1133                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1134                else
1135                        src = inet_select_addr(rt->dst.dev,
1136                                               rt_nexthop(rt, iph->daddr),
1137                                               RT_SCOPE_UNIVERSE);
1138                rcu_read_unlock();
1139        }
1140        memcpy(addr, &src, 4);
1141}
1142
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's classid.  The low and the high 16 bits
 * are handled independently; each half is filled from @tag only while
 * it is still zero, so values set earlier take precedence.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        const u32 low_half = 0xFFFF;
        const u32 high_half = 0xFFFF0000;

        if ((rt->dst.tclassid & low_half) == 0)
                rt->dst.tclassid |= tag & low_half;
        if ((rt->dst.tclassid & high_half) == 0)
                rt->dst.tclassid |= tag & high_half;
}
#endif
1152
1153static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1154{
1155        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1156
1157        if (advmss == 0) {
1158                advmss = max_t(unsigned int, dst->dev->mtu - 40,
1159                               ip_rt_min_advmss);
1160                if (advmss > 65535 - 40)
1161                        advmss = 65535 - 40;
1162        }
1163        return advmss;
1164}
1165
1166static unsigned int ipv4_mtu(const struct dst_entry *dst)
1167{
1168        const struct rtable *rt = (const struct rtable *) dst;
1169        unsigned int mtu = rt->rt_pmtu;
1170
1171        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1172                mtu = dst_metric_raw(dst, RTAX_MTU);
1173
1174        if (mtu)
1175                return mtu;
1176
1177        mtu = dst->dev->mtu;
1178
1179        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1180                if (rt->rt_uses_gateway && mtu > 576)
1181                        mtu = 576;
1182        }
1183
1184        if (mtu > IP_MAX_MTU)
1185                mtu = IP_MAX_MTU;
1186
1187        return mtu;
1188}
1189
1190static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1191{
1192        struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1193        struct fib_nh_exception *fnhe;
1194        u32 hval;
1195
1196        if (!hash)
1197                return NULL;
1198
1199        hval = fnhe_hashfun(daddr);
1200
1201        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1202             fnhe = rcu_dereference(fnhe->fnhe_next)) {
1203                if (fnhe->fnhe_daddr == daddr)
1204                        return fnhe;
1205        }
1206        return NULL;
1207}
1208
1209static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1210                              __be32 daddr)
1211{
1212        bool ret = false;
1213
1214        spin_lock_bh(&fnhe_lock);
1215
1216        if (daddr == fnhe->fnhe_daddr) {
1217                struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
1218                if (orig && rt_is_expired(orig)) {
1219                        fnhe->fnhe_gw = 0;
1220                        fnhe->fnhe_pmtu = 0;
1221                        fnhe->fnhe_expires = 0;
1222                }
1223                if (fnhe->fnhe_pmtu) {
1224                        unsigned long expires = fnhe->fnhe_expires;
1225                        unsigned long diff = expires - jiffies;
1226
1227                        if (time_before(jiffies, expires)) {
1228                                rt->rt_pmtu = fnhe->fnhe_pmtu;
1229                                dst_set_expires(&rt->dst, diff);
1230                        }
1231                }
1232                if (fnhe->fnhe_gw) {
1233                        rt->rt_flags |= RTCF_REDIRECTED;
1234                        rt->rt_gateway = fnhe->fnhe_gw;
1235                        rt->rt_uses_gateway = 1;
1236                } else if (!rt->rt_gateway)
1237                        rt->rt_gateway = daddr;
1238
1239                rcu_assign_pointer(fnhe->fnhe_rth, rt);
1240                if (orig)
1241                        rt_free(orig);
1242
1243                fnhe->fnhe_stamp = jiffies;
1244                ret = true;
1245        }
1246        spin_unlock_bh(&fnhe_lock);
1247
1248        return ret;
1249}
1250
1251static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1252{
1253        struct rtable *orig, *prev, **p;
1254        bool ret = true;
1255
1256        if (rt_is_input_route(rt)) {
1257                p = (struct rtable **)&nh->nh_rth_input;
1258        } else {
1259                p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1260        }
1261        orig = *p;
1262
1263        prev = cmpxchg(p, orig, rt);
1264        if (prev == orig) {
1265                if (orig)
1266                        rt_free(orig);
1267        } else
1268                ret = false;
1269
1270        return ret;
1271}
1272
1273static DEFINE_SPINLOCK(rt_uncached_lock);
1274static LIST_HEAD(rt_uncached_list);
1275
1276static void rt_add_uncached_list(struct rtable *rt)
1277{
1278        spin_lock_bh(&rt_uncached_lock);
1279        list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1280        spin_unlock_bh(&rt_uncached_lock);
1281}
1282
1283static void ipv4_dst_destroy(struct dst_entry *dst)
1284{
1285        struct rtable *rt = (struct rtable *) dst;
1286
1287        if (!list_empty(&rt->rt_uncached)) {
1288                spin_lock_bh(&rt_uncached_lock);
1289                list_del(&rt->rt_uncached);
1290                spin_unlock_bh(&rt_uncached_lock);
1291        }
1292}
1293
1294void rt_flush_dev(struct net_device *dev)
1295{
1296        if (!list_empty(&rt_uncached_list)) {
1297                struct net *net = dev_net(dev);
1298                struct rtable *rt;
1299
1300                spin_lock_bh(&rt_uncached_lock);
1301                list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1302                        if (rt->dst.dev != dev)
1303                                continue;
1304                        rt->dst.dev = net->loopback_dev;
1305                        dev_hold(rt->dst.dev);
1306                        dev_put(dev);
1307                }
1308                spin_unlock_bh(&rt_uncached_lock);
1309        }
1310}
1311
1312static bool rt_cache_valid(const struct rtable *rt)
1313{
1314        return  rt &&
1315                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1316                !rt_is_expired(rt);
1317}
1318
1319static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1320                           const struct fib_result *res,
1321                           struct fib_nh_exception *fnhe,
1322                           struct fib_info *fi, u16 type, u32 itag)
1323{
1324        bool cached = false;
1325
1326        if (fi) {
1327                struct fib_nh *nh = &FIB_RES_NH(*res);
1328
1329                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1330                        rt->rt_gateway = nh->nh_gw;
1331                        rt->rt_uses_gateway = 1;
1332                }
1333                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1334#ifdef CONFIG_IP_ROUTE_CLASSID
1335                rt->dst.tclassid = nh->nh_tclassid;
1336#endif
1337                if (unlikely(fnhe))
1338                        cached = rt_bind_exception(rt, fnhe, daddr);
1339                else if (!(rt->dst.flags & DST_NOCACHE))
1340                        cached = rt_cache_route(nh, rt);
1341                if (unlikely(!cached)) {
1342                        /* Routes we intend to cache in nexthop exception or
1343                         * FIB nexthop have the DST_NOCACHE bit clear.
1344                         * However, if we are unsuccessful at storing this
1345                         * route into the cache we really need to set it.
1346                         */
1347                        rt->dst.flags |= DST_NOCACHE;
1348                        if (!rt->rt_gateway)
1349                                rt->rt_gateway = daddr;
1350                        rt_add_uncached_list(rt);
1351                }
1352        } else
1353                rt_add_uncached_list(rt);
1354
1355#ifdef CONFIG_IP_ROUTE_CLASSID
1356#ifdef CONFIG_IP_MULTIPLE_TABLES
1357        set_class_tag(rt, res->tclassid);
1358#endif
1359        set_class_tag(rt, itag);
1360#endif
1361}
1362
1363static struct rtable *rt_dst_alloc(struct net_device *dev,
1364                                   bool nopolicy, bool noxfrm, bool will_cache)
1365{
1366        return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1367                         (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1368                         (nopolicy ? DST_NOPOLICY : 0) |
1369                         (noxfrm ? DST_NOXFRM : 0));
1370}
1371
1372/* called in rcu_read_lock() section */
1373static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1374                                u8 tos, struct net_device *dev, int our)
1375{
1376        struct rtable *rth;
1377        struct in_device *in_dev = __in_dev_get_rcu(dev);
1378        u32 itag = 0;
1379        int err;
1380
1381        /* Primary sanity checks. */
1382
1383        if (in_dev == NULL)
1384                return -EINVAL;
1385
1386        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1387            skb->protocol != htons(ETH_P_IP))
1388                goto e_inval;
1389
1390        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1391                if (ipv4_is_loopback(saddr))
1392                        goto e_inval;
1393
1394        if (ipv4_is_zeronet(saddr)) {
1395                if (!ipv4_is_local_multicast(daddr))
1396                        goto e_inval;
1397        } else {
1398                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1399                                          in_dev, &itag);
1400                if (err < 0)
1401                        goto e_err;
1402        }
1403        rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1404                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1405        if (!rth)
1406                goto e_nobufs;
1407
1408#ifdef CONFIG_IP_ROUTE_CLASSID
1409        rth->dst.tclassid = itag;
1410#endif
1411        rth->dst.output = ip_rt_bug;
1412
1413        rth->rt_genid   = rt_genid(dev_net(dev));
1414        rth->rt_flags   = RTCF_MULTICAST;
1415        rth->rt_type    = RTN_MULTICAST;
1416        rth->rt_is_input= 1;
1417        rth->rt_iif     = 0;
1418        rth->rt_pmtu    = 0;
1419        rth->rt_gateway = 0;
1420        rth->rt_uses_gateway = 0;
1421        INIT_LIST_HEAD(&rth->rt_uncached);
1422        if (our) {
1423                rth->dst.input= ip_local_deliver;
1424                rth->rt_flags |= RTCF_LOCAL;
1425        }
1426
1427#ifdef CONFIG_IP_MROUTE
1428        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1429                rth->dst.input = ip_mr_input;
1430#endif
1431        RT_CACHE_STAT_INC(in_slow_mc);
1432
1433        skb_dst_set(skb, &rth->dst);
1434        return 0;
1435
1436e_nobufs:
1437        return -ENOBUFS;
1438e_inval:
1439        return -EINVAL;
1440e_err:
1441        return err;
1442}
1443
1444
1445static void ip_handle_martian_source(struct net_device *dev,
1446                                     struct in_device *in_dev,
1447                                     struct sk_buff *skb,
1448                                     __be32 daddr,
1449                                     __be32 saddr)
1450{
1451        RT_CACHE_STAT_INC(in_martian_src);
1452#ifdef CONFIG_IP_ROUTE_VERBOSE
1453        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1454                /*
1455                 *      RFC1812 recommendation, if source is martian,
1456                 *      the only hint is MAC header.
1457                 */
1458                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1459                        &daddr, &saddr, dev->name);
1460                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1461                        print_hex_dump(KERN_WARNING, "ll header: ",
1462                                       DUMP_PREFIX_OFFSET, 16, 1,
1463                                       skb_mac_header(skb),
1464                                       dev->hard_header_len, true);
1465                }
1466        }
1467#endif
1468}
1469
/*
 * Create (or reuse from the nexthop cache) the input route for a packet
 * that is to be forwarded, and attach it to @skb.
 * Called in rcu_read_lock() section.
 *
 * Returns 0 on success, -EINVAL when the output device has no
 * in_device or the route is not valid for proxy ARP, -ENOBUFS when
 * allocation fails, or the negative result of fib_validate_source().
 */
static int __mkroute_input(struct sk_buff *skb,
                           const struct fib_result *res,
                           struct in_device *in_dev,
                           __be32 daddr, __be32 saddr, u32 tos)
{
        struct rtable *rth;
        int err;
        struct in_device *out_dev;
        unsigned int flags = 0;
        bool do_cache;
        u32 itag;

        /* get a working reference to the output device */
        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
        if (out_dev == NULL) {
                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
                return -EINVAL;
        }

        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
                                  in_dev->dev, in_dev, &itag);
        if (err < 0) {
                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
                                         saddr);

                goto cleanup;
        }

        /* Only routes with a fib_info and no class tag are cacheable. */
        do_cache = res->fi && !itag;
        /* Packet would leave on the interface it arrived on: flag the
         * route for an ICMP redirect and keep it out of the cache.
         * NOTE(review): a non-zero err here is a positive return from
         * fib_validate_source(); its exact meaning is defined there.
         */
        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
                flags |= RTCF_DOREDIRECT;
                do_cache = false;
        }

        if (skb->protocol != htons(ETH_P_IP)) {
                /* Not IP (i.e. ARP). Do not create a route if it is
                 * invalid for proxy ARP. DNAT routes are always valid.
                 *
                 * The proxy ARP feature has been extended to allow ARP
                 * replies back to the same interface, to support
                 * Private VLAN switch technologies. See arp.c.
                 */
                if (out_dev == in_dev &&
                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
                        err = -EINVAL;
                        goto cleanup;
                }
        }

        /* Fast path: reuse the route cached on the nexthop, without
         * taking a reference on it.
         */
        if (do_cache) {
                rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
                if (rt_cache_valid(rth)) {
                        skb_dst_set_noref(skb, &rth->dst);
                        goto out;
                }
        }

        rth = rt_dst_alloc(out_dev->dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
                           IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
        if (!rth) {
                err = -ENOBUFS;
                goto cleanup;
        }

        rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
        rth->rt_flags = flags;
        rth->rt_type = res->type;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);

        rth->dst.input = ip_forward;
        rth->dst.output = ip_output;

        /* Fills in gateway/metrics and tries to cache the route. */
        rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
        skb_dst_set(skb, &rth->dst);
out:
        err = 0;
 cleanup:
        return err;
}
1558
1559static int ip_mkroute_input(struct sk_buff *skb,
1560                            struct fib_result *res,
1561                            const struct flowi4 *fl4,
1562                            struct in_device *in_dev,
1563                            __be32 daddr, __be32 saddr, u32 tos)
1564{
1565#ifdef CONFIG_IP_ROUTE_MULTIPATH
1566        if (res->fi && res->fi->fib_nhs > 1)
1567                fib_select_multipath(res);
1568#endif
1569
1570        /* create a routing cache entry */
1571        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1572}
1573
/*
 *      NOTE. We drop all packets that have a local source address,
 *      because every properly looped back packet must already have
 *      the correct destination attached by the output routine.
 *
 *      This approach solves two big problems:
 *      1. Non-simplex devices are handled properly.
 *      2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 *      Called with rcu_read_lock().
 *
 *      Classifies the packet (martian, broadcast, local, forwarded,
 *      unroutable), builds or reuses the matching input route and
 *      attaches it to @skb.  Returns 0 on success or a negative errno.
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                               u8 tos, struct net_device *dev)
{
        struct fib_result res;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        struct flowi4   fl4;
        unsigned int    flags = 0;
        u32             itag = 0;
        struct rtable   *rth;
        int             err = -EINVAL;
        struct net    *net = dev_net(dev);
        bool do_cache;

        /* IP on this device is disabled. */

        if (!in_dev)
                goto out;

        /* Check for the weirdest martians, which cannot be detected
           by fib_lookup.
         */

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
                goto martian_source;

        /* local_input tests res.fi, and brd_input is reached below
         * without a FIB lookup having filled res in.
         */
        res.fi = NULL;
        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
                goto brd_input;

        /* Accept zero addresses only to limited broadcast;
         * I do not even know whether to fix it or not. Waiting for complaints :-)
         */
        if (ipv4_is_zeronet(saddr))
                goto martian_source;

        if (ipv4_is_zeronet(daddr))
                goto martian_destination;

        /* The following code tries to avoid calling
         * IN_DEV_NET_ROUTE_LOCALNET(), and calls it only once if daddr
         * and/or saddr are loopback addresses.
         */
        if (ipv4_is_loopback(daddr)) {
                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
                        goto martian_destination;
        } else if (ipv4_is_loopback(saddr)) {
                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
                        goto martian_source;
        }

        /*
         *      Now we are ready to route the packet.
         */
        fl4.flowi4_oif = 0;
        fl4.flowi4_iif = dev->ifindex;
        fl4.flowi4_mark = skb->mark;
        fl4.flowi4_tos = tos;
        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
        fl4.daddr = daddr;
        fl4.saddr = saddr;
        err = fib_lookup(net, &fl4, &res);
        if (err != 0)
                goto no_route;

        RT_CACHE_STAT_INC(in_slow_tot);

        if (res.type == RTN_BROADCAST)
                goto brd_input;

        if (res.type == RTN_LOCAL) {
                err = fib_validate_source(skb, saddr, daddr, tos,
                                          LOOPBACK_IFINDEX,
                                          dev, in_dev, &itag);
                if (err < 0)
                        goto martian_source_keep_err;
                goto local_input;
        }

        /* NOTE(review): err is still 0 here (successful fib_lookup), so
         * the unreachable route built via no_route gets dst.error == 0.
         */
        if (!IN_DEV_FORWARD(in_dev))
                goto no_route;
        if (res.type != RTN_UNICAST)
                goto martian_destination;

        err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:    return err;

brd_input:
        if (skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (!ipv4_is_zeronet(saddr)) {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto martian_source_keep_err;
        }
        flags |= RTCF_BROADCAST;
        res.type = RTN_BROADCAST;
        RT_CACHE_STAT_INC(in_brd);

local_input:
        /* Reuse the route cached on the nexthop when there is a
         * fib_info and no class tag; otherwise build a new one and
         * cache it only under the same conditions.
         */
        do_cache = false;
        if (res.fi) {
                if (!itag) {
                        rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
                        if (rt_cache_valid(rth)) {
                                skb_dst_set_noref(skb, &rth->dst);
                                err = 0;
                                goto out;
                        }
                        do_cache = true;
                }
        }

        rth = rt_dst_alloc(net->loopback_dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
        if (!rth)
                goto e_nobufs;

        rth->dst.input= ip_local_deliver;
        rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif

        rth->rt_genid = rt_genid(net);
        rth->rt_flags   = flags|RTCF_LOCAL;
        rth->rt_type    = res.type;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        /* Unroutable packets get an error route handled by ip_error();
         * dst.error stores the positive errno.
         */
        if (res.type == RTN_UNREACHABLE) {
                rth->dst.input= ip_error;
                rth->dst.error= -err;
                rth->rt_flags   &= ~RTCF_LOCAL;
        }
        if (do_cache)
                rt_cache_route(&FIB_RES_NH(res), rth);
        skb_dst_set(skb, &rth->dst);
        err = 0;
        goto out;

no_route:
        RT_CACHE_STAT_INC(in_no_route);
        res.type = RTN_UNREACHABLE;
        if (err == -ESRCH)
                err = -ENETUNREACH;
        goto local_input;

        /*
         *      Do not cache martian addresses: they should be logged (RFC1812)
         */
martian_destination:
        RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev))
                net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
                                     &daddr, &saddr, dev->name);
#endif

e_inval:
        err = -EINVAL;
        goto out;

e_nobufs:
        err = -ENOBUFS;
        goto out;

martian_source:
        err = -EINVAL;
martian_source_keep_err:
        /* err already holds the value to return to the caller. */
        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
        goto out;
}
1761
1762int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1763                         u8 tos, struct net_device *dev)
1764{
1765        int res;
1766
1767        rcu_read_lock();
1768
1769        /* Multicast recognition logic is moved from route cache to here.
1770           The problem was that too many Ethernet cards have broken/missing
1771           hardware multicast filters :-( As result the host on multicasting
1772           network acquires a lot of useless route cache entries, sort of
1773           SDR messages from all the world. Now we try to get rid of them.
1774           Really, provided software IP multicast filter is organized
1775           reasonably (at least, hashed), it does not result in a slowdown
1776           comparing with route cache reject entries.
1777           Note, that multicast routers are not affected, because
1778           route cache entry is created eventually.
1779         */
1780        if (ipv4_is_multicast(daddr)) {
1781                struct in_device *in_dev = __in_dev_get_rcu(dev);
1782
1783                if (in_dev) {
1784                        int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1785                                                  ip_hdr(skb)->protocol);
1786                        if (our
1787#ifdef CONFIG_IP_MROUTE
1788                                ||
1789                            (!ipv4_is_local_multicast(daddr) &&
1790                             IN_DEV_MFORWARD(in_dev))
1791#endif
1792                           ) {
1793                                int res = ip_route_input_mc(skb, daddr, saddr,
1794                                                            tos, dev, our);
1795                                rcu_read_unlock();
1796                                return res;
1797                        }
1798                }
1799                rcu_read_unlock();
1800                return -EINVAL;
1801        }
1802        res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1803        rcu_read_unlock();
1804        return res;
1805}
1806EXPORT_SYMBOL(ip_route_input_noref);
1807
1808/* called with rcu_read_lock() */
/*
 * Produce the output route for flow @fl4: either a still-valid cached
 * route (returned with an extra reference) or a newly allocated one.
 *
 * @res:      FIB lookup result; res->fi and res->type seed the route.
 * @fl4:      flow being routed (daddr, saddr, flags and proto are read).
 * @orig_oif: value stored in rt_iif of a newly built route.
 * @dev_out:  device the route sends through.
 * @flags:    initial RTCF_* flags; more are OR-ed in below.
 *
 * Returns the rtable, ERR_PTR(-EINVAL) when the device/addresses are
 * rejected, or ERR_PTR(-ENOBUFS) when rt_dst_alloc() fails.
 */
1809static struct rtable *__mkroute_output(const struct fib_result *res,
1810                                       const struct flowi4 *fl4, int orig_oif,
1811                                       struct net_device *dev_out,
1812                                       unsigned int flags)
1813{
1814        struct fib_info *fi = res->fi;
1815        struct fib_nh_exception *fnhe;
1816        struct in_device *in_dev;
1817        u16 type = res->type;
1818        struct rtable *rth;
1819        bool do_cache;
1820
            /* The output device must have an in_device attached. */
1821        in_dev = __in_dev_get_rcu(dev_out);
1822        if (!in_dev)
1823                return ERR_PTR(-EINVAL);
1824
            /*
             * Unless IN_DEV_ROUTE_LOCALNET() holds for the device, refuse a
             * loopback source address on a non-loopback output device.
             */
1825        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1826                if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1827                        return ERR_PTR(-EINVAL);
1828
            /* The destination address can override the type from the FIB result. */
1829        if (ipv4_is_lbcast(fl4->daddr))
1830                type = RTN_BROADCAST;
1831        else if (ipv4_is_multicast(fl4->daddr))
1832                type = RTN_MULTICAST;
1833        else if (ipv4_is_zeronet(fl4->daddr))
1834                return ERR_PTR(-EINVAL);
1835
1836        if (dev_out->flags & IFF_LOOPBACK)
1837                flags |= RTCF_LOCAL;
1838
            /*
             * Caching is on by default.  It is switched off below for
             * multicast destinations where ip_check_mc_rcu() returns
             * non-zero, when no fib_info is left, and in the KNOWN_NH case.
             */
1839        do_cache = true;
1840        if (type == RTN_BROADCAST) {
1841                flags |= RTCF_BROADCAST | RTCF_LOCAL;
1842                fi = NULL;
1843        } else if (type == RTN_MULTICAST) {
1844                flags |= RTCF_MULTICAST | RTCF_LOCAL;
1845                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1846                                     fl4->flowi4_proto))
1847                        flags &= ~RTCF_LOCAL;
1848                else
1849                        do_cache = false;
1850                /* If multicast route do not exist use
1851                 * default one, but do not gateway in this case.
1852                 * Yes, it is hack.
1853                 */
1854                if (fi && res->prefixlen < 4)
1855                        fi = NULL;
1856        }
1857
1858        fnhe = NULL;
1859        do_cache &= fi != NULL;
1860        if (do_cache) {
1861                struct rtable __rcu **prth;
1862                struct fib_nh *nh = &FIB_RES_NH(*res);
1863
                    /*
                     * Pick the cache slot: the nexthop exception for this
                     * destination if one exists, otherwise this CPU's
                     * output slot of the nexthop.
                     */
1864                fnhe = find_exception(nh, fl4->daddr);
1865                if (fnhe)
1866                        prth = &fnhe->fnhe_rth;
1867                else {
                            /*
                             * FLOWI_FLAG_KNOWN_NH is set and the nexthop is not a
                             * link-scope gateway: build an uncached route.
                             */
1868                        if (unlikely(fl4->flowi4_flags &
1869                                     FLOWI_FLAG_KNOWN_NH &&
1870                                     !(nh->nh_gw &&
1871                                       nh->nh_scope == RT_SCOPE_LINK))) {
1872                                do_cache = false;
1873                                goto add;
1874                        }
1875                        prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1876                }
                    /* Cache hit: return the cached route with a reference held. */
1877                rth = rcu_dereference(*prth);
1878                if (rt_cache_valid(rth)) {
1879                        dst_hold(&rth->dst);
1880                        return rth;
1881                }
1882        }
1883
            /* No usable cached route: allocate and initialise a new one. */
1884add:
1885        rth = rt_dst_alloc(dev_out,
1886                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
1887                           IN_DEV_CONF_GET(in_dev, NOXFRM),
1888                           do_cache);
1889        if (!rth)
1890                return ERR_PTR(-ENOBUFS);
1891
1892        rth->dst.output = ip_output;
1893
1894        rth->rt_genid = rt_genid(dev_net(dev_out));
1895        rth->rt_flags   = flags;
1896        rth->rt_type    = type;
1897        rth->rt_is_input = 0;
1898        rth->rt_iif     = orig_oif ? : 0;
1899        rth->rt_pmtu    = 0;
1900        rth->rt_gateway = 0;
1901        rth->rt_uses_gateway = 0;
1902        INIT_LIST_HEAD(&rth->rt_uncached);
1903
1904        RT_CACHE_STAT_INC(out_slow_tot);
1905
            /* Routes flagged local also get a local-delivery input handler. */
1906        if (flags & RTCF_LOCAL)
1907                rth->dst.input = ip_local_deliver;
1908        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1909                if (flags & RTCF_LOCAL &&
1910                    !(dev_out->flags & IFF_LOOPBACK)) {
1911                        rth->dst.output = ip_mc_output;
1912                        RT_CACHE_STAT_INC(out_slow_mc);
1913                }
1914#ifdef CONFIG_IP_MROUTE
1915                if (type == RTN_MULTICAST) {
1916                        if (IN_DEV_MFORWARD(in_dev) &&
1917                            !ipv4_is_local_multicast(fl4->daddr)) {
1918                                rth->dst.input = ip_mr_input;
1919                                rth->dst.output = ip_mc_output;
1920                        }
1921                }
1922#endif
1923        }
1924
1925        rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1926
1927        return rth;
1928}
1929
1930/*
1931 * Major route resolver routine.
1932 */
1933
1934struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1935{
1936        struct net_device *dev_out = NULL;
1937        __u8 tos = RT_FL_TOS(fl4);
1938        unsigned int flags = 0;
1939        struct fib_result res;
1940        struct rtable *rth;
1941        int orig_oif;
1942
1943        res.tclassid    = 0;
1944        res.fi          = NULL;
1945        res.table       = NULL;
1946
1947        orig_oif = fl4->flowi4_oif;
1948
1949        fl4->flowi4_iif = LOOPBACK_IFINDEX;
1950        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1951        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1952                         RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1953
1954        rcu_read_lock();
1955        if (fl4->saddr) {
1956                rth = ERR_PTR(-EINVAL);
1957                if (ipv4_is_multicast(fl4->saddr) ||
1958                    ipv4_is_lbcast(fl4->saddr) ||
1959                    ipv4_is_zeronet(fl4->saddr))
1960                        goto out;
1961
1962                /* I removed check for oif == dev_out->oif here.
1963                   It was wrong for two reasons:
1964                   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1965                      is assigned to multiple interfaces.
1966                   2. Moreover, we are allowed to send packets with saddr
1967                      of another iface. --ANK
1968                 */
1969
1970                if (fl4->flowi4_oif == 0 &&
1971                    (ipv4_is_multicast(fl4->daddr) ||
1972                     ipv4_is_lbcast(fl4->daddr))) {
1973                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1974                        dev_out = __ip_dev_find(net, fl4->saddr, false);
1975                        if (dev_out == NULL)
1976                                goto out;
1977
1978                        /* Special hack: user can direct multicasts
1979                           and limited broadcast via necessary interface
1980                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1981                           This hack is not just for fun, it allows
1982                           vic,vat and friends to work.
1983                           They bind socket to loopback, set ttl to zero
1984                           and expect that it will work.
1985                           From the viewpoint of routing cache they are broken,
1986                           because we are not allowed to build multicast path
1987                           with loopback source addr (look, routing cache
1988                           cannot know, that ttl is zero, so that packet
1989                           will not leave this host and route is valid).
1990                           Luckily, this hack is good workaround.
1991                         */
1992
1993                        fl4->flowi4_oif = dev_out->ifindex;
1994                        goto make_route;
1995                }
1996
1997                if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1998                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1999                        if (!__ip_dev_find(net, fl4->saddr, false))
2000                                goto out;
2001                }
2002        }
2003
2004
2005        if (fl4->flowi4_oif) {
2006                dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2007                rth = ERR_PTR(-ENODEV);
2008                if (dev_out == NULL)
2009                        goto out;
2010
2011                /* RACE: Check return value of inet_select_addr instead. */
2012                if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2013                        rth = ERR_PTR(-ENETUNREACH);
2014                        goto out;
2015                }
2016                if (ipv4_is_local_multicast(fl4->daddr) ||
2017                    ipv4_is_lbcast(fl4->daddr)) {
2018                        if (!fl4->saddr)
2019                                fl4->saddr = inet_select_addr(dev_out, 0,
2020                                                              RT_SCOPE_LINK);
2021                        goto make_route;
2022                }
2023                if (fl4->saddr) {
2024                        if (ipv4_is_multicast(fl4->daddr))
2025                                fl4->saddr = inet_select_addr(dev_out, 0,
2026                                                              fl4->flowi4_scope);
2027                        else if (!fl4->daddr)
2028                                fl4->saddr = inet_select_addr(dev_out, 0,
2029                                                              RT_SCOPE_HOST);
2030                }
2031        }
2032
2033        if (!fl4->daddr) {
2034                fl4->daddr = fl4->saddr;
2035                if (!fl4->daddr)
2036                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2037                dev_out = net->loopback_dev;
2038                fl4->flowi4_oif = LOOPBACK_IFINDEX;
2039                res.type = RTN_LOCAL;
2040                flags |= RTCF_LOCAL;
2041                goto make_route;
2042        }
2043
2044        if (fib_lookup(net, fl4, &res)) {
2045                res.fi = NULL;
2046                res.table = NULL;
2047                if (fl4->flowi4_oif) {
2048                        /* Apparently, routing tables are wrong. Assume,
2049                           that the destination is on link.
2050
2051                           WHY? DW.
2052                           Because we are allowed to send to iface
2053                           even if it has NO routes and NO assigned
2054                           addresses. When oif is specified, routing
2055                           tables are looked up with only one purpose:
2056                           to catch if destination is gatewayed, rather than
2057                           direct. Moreover, if MSG_DONTROUTE is set,
2058                           we send packet, ignoring both routing tables
2059                           and ifaddr state. --ANK
2060
2061
2062                           We could make it even if oif is unknown,
2063                           likely IPv6, but we do not.
2064                         */
2065
2066                        if (fl4->saddr == 0)
2067                                fl4->saddr = inet_select_addr(dev_out, 0,
2068                                                              RT_SCOPE_LINK);
2069                        res.type = RTN_UNICAST;
2070                        goto make_route;
2071                }
2072                rth = ERR_PTR(-ENETUNREACH);
2073                goto out;
2074        }
2075
2076        if (res.type == RTN_LOCAL) {
2077                if (!fl4->saddr) {
2078                        if (res.fi->fib_prefsrc)
2079                                fl4->saddr = res.fi->fib_prefsrc;
2080                        else
2081                                fl4->saddr = fl4->daddr;
2082                }
2083                dev_out = net->loopback_dev;
2084                fl4->flowi4_oif = dev_out->ifindex;
2085                flags |= RTCF_LOCAL;
2086                goto make_route;
2087        }
2088
2089#ifdef CONFIG_IP_ROUTE_MULTIPATH
2090        if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2091                fib_select_multipath(&res);
2092        else
2093#endif
2094        if (!res.prefixlen &&
2095            res.table->tb_num_default > 1 &&
2096            res.type == RTN_UNICAST && !fl4->flowi4_oif)
2097                fib_select_default(&res);
2098
2099        if (!fl4->saddr)
2100                fl4->saddr = FIB_RES_PREFSRC(net, res);
2101
2102        dev_out = FIB_RES_DEV(res);
2103        fl4->flowi4_oif = dev_out->ifindex;
2104
2105
2106make_route:
2107        rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2108
2109out:
2110        rcu_read_unlock();
2111        return rth;
2112}
2113EXPORT_SYMBOL_GPL(__ip_route_output_key);
2114
/*
 * .check callback of ipv4_dst_blackhole_ops: always returns NULL,
 * regardless of @dst and @cookie.
 */
2115static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2116{
2117        return NULL;
2118}
2119
2120static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2121{
2122        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2123
2124        return mtu ? : dst->dev->mtu;
2125}
2126
/* .update_pmtu callback of ipv4_dst_blackhole_ops: intentionally a no-op. */
2127static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2128                                          struct sk_buff *skb, u32 mtu)
2129{
2130}
2131
/* .redirect callback of ipv4_dst_blackhole_ops: intentionally a no-op. */
2132static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2133                                       struct sk_buff *skb)
2134{
2135}
2136
/*
 * .cow_metrics callback of ipv4_dst_blackhole_ops: always returns NULL,
 * ignoring @dst and @old.
 */
2137static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2138                                          unsigned long old)
2139{
2140        return NULL;
2141}
2142
/*
 * dst_ops used for the entries allocated by ipv4_blackhole_route().
 * The check/update_pmtu/redirect/cow_metrics hooks are the stub
 * callbacks defined just above; default_advmss and neigh_lookup reuse the
 * regular IPv4 handlers.
 */
2143static struct dst_ops ipv4_dst_blackhole_ops = {
2144        .family                 =       AF_INET,
2145        .protocol               =       cpu_to_be16(ETH_P_IP),
2146        .check                  =       ipv4_blackhole_dst_check,
2147        .mtu                    =       ipv4_blackhole_mtu,
2148        .default_advmss         =       ipv4_default_advmss,
2149        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2150        .redirect               =       ipv4_rt_blackhole_redirect,
2151        .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2152        .neigh_lookup           =       ipv4_neigh_lookup,
2153};
2154
2155struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2156{
2157        struct rtable *ort = (struct rtable *) dst_orig;
2158        struct rtable *rt;
2159
2160        rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2161        if (rt) {
2162                struct dst_entry *new = &rt->dst;
2163
2164                new->__use = 1;
2165                new->input = dst_discard;
2166                new->output = dst_discard;
2167
2168                new->dev = ort->dst.dev;
2169                if (new->dev)
2170                        dev_hold(new->dev);
2171
2172                rt->rt_is_input = ort->rt_is_input;
2173                rt->rt_iif = ort->rt_iif;
2174                rt->rt_pmtu = ort->rt_pmtu;
2175
2176                rt->rt_genid = rt_genid(net);
2177                rt->rt_flags = ort->rt_flags;
2178                rt->rt_type = ort->rt_type;
2179                rt->rt_gateway = ort->rt_gateway;
2180                rt->rt_uses_gateway = ort->rt_uses_gateway;
2181
2182                INIT_LIST_HEAD(&rt->rt_uncached);
2183
2184                dst_free(new);
2185        }
2186
2187        dst_release(dst_orig);
2188
2189        return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2190}
2191
2192struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2193                                    struct sock *sk)
2194{
2195        struct rtable *rt = __ip_route_output_key(net, flp4);
2196
2197        if (IS_ERR(rt))
2198                return rt;
2199
2200        if (flp4->flowi4_proto)
2201                rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2202                                                   flowi4_to_flowi(flp4),
2203                                                   sk, 0);
2204
2205        return rt;
2206}
2207EXPORT_SYMBOL_GPL(ip_route_output_flow);
2208
/*
 * Append a netlink route message describing the route attached to @skb
 * (skb_rtable(skb)) to that same @skb.
 *
 * @net:    namespace, used for the multicast-forwarding check.
 * @dst:    destination address reported as RTA_DST.
 * @src:    source address; reported as RTA_SRC when non-zero.
 * @fl4:    flow used for the lookup (tos, saddr, daddr and mark are read).
 * @skb:    message buffer, which also carries the route.
 * @portid, @seq, @event, @flags: passed through to nlmsg_put().
 * @nowait: passed to ipmr_get_route(); also selects how its failure is
 *          handled (see below).
 *
 * Returns the result of nlmsg_end() on success, -EMSGSIZE when the header
 * or an attribute does not fit (the partial message is cancelled), or 0
 * when !nowait and ipmr_get_route() returned 0.
 */
2209static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2210                        struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2211                        u32 seq, int event, int nowait, unsigned int flags)
2212{
2213        struct rtable *rt = skb_rtable(skb);
2214        struct rtmsg *r;
2215        struct nlmsghdr *nlh;
2216        unsigned long expires = 0;
2217        u32 error;
2218        u32 metrics[RTAX_MAX];
2219
2220        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2221        if (nlh == NULL)
2222                return -EMSGSIZE;
2223
            /* Fixed rtmsg header: always a /32 destination in RT_TABLE_MAIN. */
2224        r = nlmsg_data(nlh);
2225        r->rtm_family    = AF_INET;
2226        r->rtm_dst_len  = 32;
2227        r->rtm_src_len  = 0;
2228        r->rtm_tos      = fl4->flowi4_tos;
2229        r->rtm_table    = RT_TABLE_MAIN;
2230        if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2231                goto nla_put_failure;
2232        r->rtm_type     = rt->rt_type;
2233        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2234        r->rtm_protocol = RTPROT_UNSPEC;
2235        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2236        if (rt->rt_flags & RTCF_NOTIFY)
2237                r->rtm_flags |= RTM_F_NOTIFY;
2238
2239        if (nla_put_be32(skb, RTA_DST, dst))
2240                goto nla_put_failure;
2241        if (src) {
2242                r->rtm_src_len = 32;
2243                if (nla_put_be32(skb, RTA_SRC, src))
2244                        goto nla_put_failure;
2245        }
2246        if (rt->dst.dev &&
2247            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2248                goto nla_put_failure;
2249#ifdef CONFIG_IP_ROUTE_CLASSID
2250        if (rt->dst.tclassid &&
2251            nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2252                goto nla_put_failure;
2253#endif
            /* For output routes, report the chosen source if it differs from @src. */
2254        if (!rt_is_input_route(rt) &&
2255            fl4->saddr != src) {
2256                if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2257                        goto nla_put_failure;
2258        }
2259        if (rt->rt_uses_gateway &&
2260            nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2261                goto nla_put_failure;
2262
            /*
             * Turn the absolute expiry time into time remaining from now;
             * an expiry that is not in the future becomes 0.
             */
2263        expires = rt->dst.expires;
2264        if (expires) {
2265                unsigned long now = jiffies;
2266
2267                if (time_before(now, expires))
2268                        expires -= now;
2269                else
2270                        expires = 0;
2271        }
2272
            /*
             * Report metrics from a local copy so that an unexpired rt_pmtu
             * can replace the MTU metric in the reply only.
             */
2273        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2274        if (rt->rt_pmtu && expires)
2275                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2276        if (rtnetlink_put_metrics(skb, metrics) < 0)
2277                goto nla_put_failure;
2278
2279        if (fl4->flowi4_mark &&
2280            nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2281                goto nla_put_failure;
2282
2283        error = rt->dst.error;
2284
            /*
             * Input routes: with CONFIG_IP_MROUTE, a non-local multicast
             * destination in a namespace with MC_FORWARDING is described by
             * ipmr_get_route(); every other input route just gets RTA_IIF.
             */
2285        if (rt_is_input_route(rt)) {
2286#ifdef CONFIG_IP_MROUTE
2287                if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2288                    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2289                        int err = ipmr_get_route(net, skb,
2290                                                 fl4->saddr, fl4->daddr,
2291                                                 r, nowait);
2292                        if (err <= 0) {
2293                                if (!nowait) {
2294                                        if (err == 0)
2295                                                return 0;
2296                                        goto nla_put_failure;
2297                                } else {
2298                                        if (err == -EMSGSIZE)
2299                                                goto nla_put_failure;
                                            /* Reported through the cacheinfo below. */
2300                                        error = err;
2301                                }
2302                        }
2303                } else
2304#endif
2305                        if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2306                                goto nla_put_failure;
2307        }
2308
2309        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2310                goto nla_put_failure;
2311
2312        return nlmsg_end(skb, nlh);
2313
2314nla_put_failure:
2315        nlmsg_cancel(skb, nlh);
2316        return -EMSGSIZE;
2317}
2318
/*
 * Answer a netlink route query carried in @nlh (request skb @in_skb).
 * Presumably registered as the RTM_GETROUTE handler -- the registration
 * is outside this chunk, confirm there.
 *
 * A scratch skb is allocated and used twice: as the dummy packet for the
 * route lookup and as the reply buffer filled by rt_fill_info().  With
 * RTA_IIF present the lookup is an input lookup via ip_route_input() on
 * that device; otherwise it is an output lookup via ip_route_output_key().
 * The reply is sent back to the requester with rtnl_unicast().
 *
 * Returns the rtnl_unicast() result on success, or an error from
 * attribute parsing, allocation (-ENOBUFS), device lookup (-ENODEV), the
 * route lookup, or rt_fill_info(); the scratch skb is freed on failure.
 */
2319static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2320{
2321        struct net *net = sock_net(in_skb->sk);
2322        struct rtmsg *rtm;
2323        struct nlattr *tb[RTA_MAX+1];
2324        struct rtable *rt = NULL;
2325        struct flowi4 fl4;
2326        __be32 dst = 0;
2327        __be32 src = 0;
2328        u32 iif;
2329        int err;
2330        int mark;
2331        struct sk_buff *skb;
2332
2333        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2334        if (err < 0)
2335                goto errout;
2336
2337        rtm = nlmsg_data(nlh);
2338
2339        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2340        if (skb == NULL) {
2341                err = -ENOBUFS;
2342                goto errout;
2343        }
2344
2345        /* Reserve room for dummy headers, this skb can pass
2346           through good chunk of routing engine.
2347         */
2348        skb_reset_mac_header(skb);
2349        skb_reset_network_header(skb);
2350
2351        /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2352        ip_hdr(skb)->protocol = IPPROTO_ICMP;
2353        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2354
            /* Absent attributes default to 0. */
2355        src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2356        dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2357        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2358        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2359
2360        memset(&fl4, 0, sizeof(fl4));
2361        fl4.daddr = dst;
2362        fl4.saddr = src;
2363        fl4.flowi4_tos = rtm->rtm_tos;
2364        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2365        fl4.flowi4_mark = mark;
2366
            /*
             * Input interface given: do an input route lookup as if the dummy
             * skb had arrived on that device.  Otherwise do an output lookup.
             */
2367        if (iif) {
2368                struct net_device *dev;
2369
2370                dev = __dev_get_by_index(net, iif);
2371                if (dev == NULL) {
2372                        err = -ENODEV;
2373                        goto errout_free;
2374                }
2375
2376                skb->protocol   = htons(ETH_P_IP);
2377                skb->dev        = dev;
2378                skb->mark       = mark;
                    /* ip_route_input() is called with bottom halves disabled. */
2379                local_bh_disable();
2380                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2381                local_bh_enable();
2382
                    /* A successful lookup can still yield a route carrying an error. */
2383                rt = skb_rtable(skb);
2384                if (err == 0 && rt->dst.error)
2385                        err = -rt->dst.error;
2386        } else {
2387                rt = ip_route_output_key(net, &fl4);
2388
2389                err = 0;
2390                if (IS_ERR(rt))
2391                        err = PTR_ERR(rt);
2392        }
2393
2394        if (err)
2395                goto errout_free;
2396
            /* Attach the route to the skb; rt_fill_info() reads it from there. */
2397        skb_dst_set(skb, &rt->dst);
2398        if (rtm->rtm_flags & RTM_F_NOTIFY)
2399                rt->rt_flags |= RTCF_NOTIFY;
2400
2401        err = rt_fill_info(net, dst, src, &fl4, skb,
2402                           NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2403                           RTM_NEWROUTE, 0, 0);
2404        if (err <= 0)
2405                goto errout_free;
2406
2407        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2408errout:
2409        return err;
2410
            /* Failure after the scratch skb was allocated: free it, then return. */
2411errout_free:
2412        kfree_skb(skb);
2413        goto errout;
2414}
2415
/*
 * Dump callback stub: adds nothing to @skb, ignores @cb and returns the
 * current length of @skb.
 */
2416int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2417{
2418        return skb->len;
2419}
2420
2421void ip_rt_multicast_event(struct in_device *in_dev)
2422{
2423        rt_cache_flush(dev_net(in_dev->dev));
2424}
2425
2426#ifdef CONFIG_SYSCTL
/*
 * Route GC tunables.  Each is exported through the matching entry
 * (gc_timeout, gc_interval, gc_min_interval[_ms], gc_elasticity) of
 * ipv4_route_table below.
 */
2427static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2428static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2429static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2430static int ip_rt_gc_elasticity __read_mostly    = 8;
2431
2432static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2433                                        void __user *buffer,
2434                                        size_t *lenp, loff_t *ppos)
2435{
2436        if (write) {
2437                rt_cache_flush((struct net *)__ctl->extra1);
2438                return 0;
2439        }
2440
2441        return -EINVAL;
2442}
2443
2444static ctl_table ipv4_route_table[] = {
2445        {
2446                .procname       = "gc_thresh",
2447                .data           = &ipv4_dst_ops.gc_thresh,
2448                .maxlen         = sizeof(int),
2449                .mode           = 0644,
2450                .proc_handler   = proc_dointvec,
2451        },
2452        {
2453                .procname       = "max_size",
2454                .data           = &ip_rt_max_size,
2455                .maxlen         = sizeof(int),
2456                .mode           = 0644,
2457                .proc_handler   = proc_dointvec,
2458        },
2459        {
2460                /*  Deprecated. Use gc_min_interval_ms */
2461
2462                .procname       = "gc_min_interval",
2463                .data           = &ip_rt_gc_min_interval,
2464                .maxlen         = sizeof(int),
2465                .mode           = 0644,
2466                .proc_handler   = proc_dointvec_jiffies,
2467        },
2468        {
2469                .procname       = "gc_min_interval_ms",
2470                .data           = &ip_rt_gc_min_interval,
2471                .maxlen         = sizeof(int),
2472                .mode           = 0644,
2473                .proc_handler   = proc_dointvec_ms_jiffies,
2474        },
2475        {
2476                .procname       = "gc_timeout",
2477                .data           = &ip_rt_gc_timeout,
2478                .maxlen         = sizeof(int),
2479                .mode           = 0644,
2480                .proc_handler   = proc_dointvec_jiffies,
2481        },
2482        {
2483                .procname       = "gc_interval",
2484                .data           = &ip_rt_gc_interval,
2485                .maxlen         = sizeof(int),
2486                .mode           = 0644,
2487                .proc_handler   = proc_dointvec_jiffies,
2488        },
2489        {
2490                .procname       = "redirect_load",
2491                .data           = &ip_rt_redirect_load,
2492                .maxlen         = sizeof(int),
2493                .mode           = 0644,
2494                .proc_handler   = proc_dointvec,
2495        },
2496        {
2497                .procname       = "redirect_number",
2498                .data           = &ip_rt_redirect_number,
2499                .maxlen         = sizeof(int),
2500                .mode           = 0644,
2501                .proc_handler   = proc_dointvec,
2502        },
2503        {
2504                .procname       = "redirect_silence",
2505                .data           = &ip_rt_redirect_silence,
2506                .maxlen         = sizeof(int),
2507                .mode           = 0644,
2508                .proc_handler   = proc_dointvec,
2509        },
2510        {
2511                .procname       = "error_cost",
2512                .data           = &ip_rt_error_cost,
2513                .maxlen         = sizeof(int),
2514                .mode           = 0644,
2515                .proc_handler   = proc_dointvec,
2516        },
2517        {
2518                .procname       = "error_burst",
2519                .data           = &ip_rt_error_burst,
2520                .maxlen         = sizeof(int),
2521                .mode           = 0644,
2522                .proc_handler   = proc_dointvec,
2523        },
2524        {
2525                .procname       = "gc_elasticity",
2526                .data           = &ip_rt_gc_elasticity,
2527                .maxlen         = sizeof(int),
2528                .mode           = 0644,
2529                .proc_handler   = proc_dointvec,
2530        },
2531        {
2532                .procname       = "mtu_expires",
2533                .data           = &ip_rt_mtu_expires,
2534                .maxlen         = sizeof(int),
2535                .mode           = 0644,
2536                .proc_handler   = proc_dointvec_jiffies,
2537        },
2538        {
2539                .procname       = "min_pmtu",
2540                .data           = &ip_rt_min_pmtu,
2541                .maxlen         = sizeof(int),
2542                .mode           = 0644,
2543                .proc_handler   = proc_dointvec,
2544        },
2545        {
2546                .procname       = "min_adv_mss",
2547                .data           = &ip_rt_min_advmss,
2548                .maxlen         = sizeof(int),
2549                .mode           = 0644,
2550                .proc_handler   = proc_dointvec,
2551        },
2552        { }
2553};
2554
/*
 * Template for the per-namespace "flush" sysctl (write-only, mode 0200).
 * sysctl_route_net_init() registers it -- directly for init_net, as a
 * kmemdup() copy for other namespaces -- with extra1 pointing at the
 * namespace.
 */
2555static struct ctl_table ipv4_route_flush_table[] = {
2556        {
2557                .procname       = "flush",
2558                .maxlen         = sizeof(int),
2559                .mode           = 0200,
2560                .proc_handler   = ipv4_sysctl_rtcache_flush,
2561        },
2562        { },
2563};
2564
2565static __net_init int sysctl_route_net_init(struct net *net)
2566{
2567        struct ctl_table *tbl;
2568
2569        tbl = ipv4_route_flush_table;
2570        if (!net_eq(net, &init_net)) {
2571                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2572                if (tbl == NULL)
2573                        goto err_dup;
2574
2575                /* Don't export sysctls to unprivileged users */
2576                if (net->user_ns != &init_user_ns)
2577                        tbl[0].procname = NULL;
2578        }
2579        tbl[0].extra1 = net;
2580
2581        net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2582        if (net->ipv4.route_hdr == NULL)
2583                goto err_reg;
2584        return 0;
2585
2586err_reg:
2587        if (tbl != ipv4_route_flush_table)
2588                kfree(tbl);
2589err_dup:
2590        return -ENOMEM;
2591}
2592
2593static __net_exit void sysctl_route_net_exit(struct net *net)
2594{
2595        struct ctl_table *tbl;
2596
2597        tbl = net->ipv4.route_hdr->ctl_table_arg;
2598        unregister_net_sysctl_table(net->ipv4.route_hdr);
2599        BUG_ON(tbl == ipv4_route_flush_table);
2600        kfree(tbl);
2601}
2602
2603static __net_initdata struct pernet_operations sysctl_route_ops = {
2604        .init = sysctl_route_net_init,
2605        .exit = sysctl_route_net_exit,
2606};
2607#endif
2608
2609static __net_init int rt_genid_init(struct net *net)
2610{
2611        atomic_set(&net->rt_genid, 0);
2612        get_random_bytes(&net->ipv4.dev_addr_genid,
2613                         sizeof(net->ipv4.dev_addr_genid));
2614        return 0;
2615}
2616
2617static __net_initdata struct pernet_operations rt_genid_ops = {
2618        .init = rt_genid_init,
2619};
2620
2621static int __net_init ipv4_inetpeer_init(struct net *net)
2622{
2623        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2624
2625        if (!bp)
2626                return -ENOMEM;
2627        inet_peer_base_init(bp);
2628        net->ipv4.peers = bp;
2629        return 0;
2630}
2631
2632static void __net_exit ipv4_inetpeer_exit(struct net *net)
2633{
2634        struct inet_peer_base *bp = net->ipv4.peers;
2635
2636        net->ipv4.peers = NULL;
2637        inetpeer_invalidate_tree(bp);
2638        kfree(bp);
2639}
2640
2641static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2642        .init   =       ipv4_inetpeer_init,
2643        .exit   =       ipv4_inetpeer_exit,
2644};
2645
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU accounting area; allocated by ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2649
2650int __init ip_rt_init(void)
2651{
2652        int rc = 0;
2653
2654#ifdef CONFIG_IP_ROUTE_CLASSID
2655        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2656        if (!ip_rt_acct)
2657                panic("IP: failed to allocate ip_rt_acct\n");
2658#endif
2659
2660        ipv4_dst_ops.kmem_cachep =
2661                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2662                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2663
2664        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2665
2666        if (dst_entries_init(&ipv4_dst_ops) < 0)
2667                panic("IP: failed to allocate ipv4_dst_ops counter\n");
2668
2669        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2670                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2671
2672        ipv4_dst_ops.gc_thresh = ~0;
2673        ip_rt_max_size = INT_MAX;
2674
2675        devinet_init();
2676        ip_fib_init();
2677
2678        if (ip_rt_proc_init())
2679                pr_err("Unable to create route proc files\n");
2680#ifdef CONFIG_XFRM
2681        xfrm_init();
2682        xfrm4_init();
2683#endif
2684        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2685
2686#ifdef CONFIG_SYSCTL
2687        register_pernet_subsys(&sysctl_route_ops);
2688#endif
2689        register_pernet_subsys(&rt_genid_ops);
2690        register_pernet_subsys(&ipv4_inetpeer_ops);
2691        return rc;
2692}
2693
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 *
 * Registers ipv4_route_table under net/ipv4/route for init_net.  The
 * header returned by register_net_sysctl() is not kept.
 */
void __init ip_static_sysctl_init(void)
{
        (void)register_net_sysctl(&init_net, "net/ipv4/route",
                                  ipv4_route_table);
}
#endif /* CONFIG_SYSCTL */
2704