linux/net/ipv4/route.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after a year in a coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             cpu_to_be16(ETH_P_IP),
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
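
/* For illustration: this table is indexed with the four TOS bits shifted
 * down by one, ip_tos2prio[IPTOS_TOS(tos) >> 1] (see rt_tos2priority()
 * in <net/route.h>), so e.g. a TOS byte of 0x10 ("minimize delay") maps
 * to index 8, TC_PRIO_INTERACTIVE.  Adjacent entries are identical
 * because ECN_OR_COST(class) expands to TC_PRIO_##class.
 */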

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

#define IP_IDENTS_SZ 2048u
struct ip_ident_bucket {
        atomic_t        id;
        u32             stamp32;
};

static struct ip_ident_bucket *ip_idents __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = ACCESS_ONCE(bucket->stamp32);
        u32 now = (u32)jiffies;
        u32 delta = 0;

        if (old != now && cmpxchg(&bucket->stamp32, old, now) == old)
                delta = prandom_u32_max(now - old);

        return atomic_add_return(segs + delta, &bucket->id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
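
/* Rough sketch of the scheme: each bucket is an independent ID generator.
 * A caller asking for "segs" identifiers is handed the first of "segs"
 * consecutive counter values.  When a bucket has been idle (stamp32 is
 * stale), the counter is additionally advanced by a random amount bounded
 * by the idle time in jiffies, so sampling IDs does not reveal how many
 * packets were generated in between.
 */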

void __ip_select_ident(struct iphdr *iph, int segs)
{
        static u32 ip_idents_hashrnd __read_mostly;
        u32 hash, id;

        net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

        hash = jhash_3words((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            ip_idents_hashrnd);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                rt_free(rt);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                rt_free(rt);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}
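
/* The hash is salted with a boot-time random value so that remote hosts
 * cannot aim at a particular exception chain; hash_32() then folds the
 * jhash result down to FNHE_HASH_SHIFT bits, the bucket index.
 */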

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        unsigned int i;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nh->nh_exceptions);
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nh->nh_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = max(1UL, expires);
                }
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching them rechecks whether this
                 * exception applies.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}
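
/* Note on sizing: once a chain is deeper than FNHE_RECLAIM_DEPTH,
 * fnhe_oldest() recycles the stalest entry in the bucket instead of
 * allocating, so a flood of redirects or PMTU updates for many
 * destinations cannot exhaust memory through this path.
 */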

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                      0, 0);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
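
/* Worked example with the defaults above: ip_rt_redirect_load is HZ/50
 * (20ms worth of jiffies), so after the first immediate redirect the
 * gaps grow as 20ms << 1, << 2, ... i.e. 40ms, 80ms, ..., ~5.1s for the
 * ninth and final one.  ip_rt_redirect_silence is 20ms << 10, about
 * 20.5s; one quiet period of that length resets rate_tokens to zero.
 */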

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}
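
/* The limiter above is a plain token bucket: tokens accrue one per jiffy
 * since rate_last, capped at ip_rt_error_burst (5 * HZ), and each ICMP
 * error spends ip_rt_error_cost (HZ) of them; with the defaults a peer
 * can trigger a burst of five errors, then roughly one per second.
 */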

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (dst->dev->mtu < mtu)
                return;

        if (rt->rt_pmtu && rt->rt_pmtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}
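
/* A learned PMTU is not kept in the rtable itself; it is pushed into a
 * nexthop exception via update_or_create_fnhe(), clamped below by
 * ip_rt_min_pmtu and expiring after ip_rt_mtu_expires (ten minutes by
 * default), after which lookups fall back to the normal MTU metrics.
 */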

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        if (!mark)
                mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && odst->ops->check(odst, 0) == NULL) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPv4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK, which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        return min_t(unsigned int, mtu, IP_MAX_MTU);
}
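
/* Rough precedence in ipv4_mtu() above: an unexpired PMTU exception
 * wins, then an explicit RTAX_MTU metric, then the output device MTU;
 * in the device-MTU fallback a locked MTU metric caps gateway routes at
 * the classic 576 bytes, and the result never exceeds IP_MAX_MTU.
 */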

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (!(rt->dst.flags & DST_NOCACHE)) {
                        rcu_assign_pointer(*porig, rt);
                        if (orig)
                                rt_free(orig);
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}
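
/* Publication here is lockless: cmpxchg() installs the new route only if
 * the slot still holds the value we sampled.  When it does not, the
 * caller (see rt_set_nexthop() below) marks the route DST_NOCACHE and
 * parks it on the per-cpu uncached list instead.
 */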

struct uncached_list {
        spinlock_t              lock;
        struct list_head        head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

        rt->rt_uncached_list = ul;

        spin_lock_bh(&ul->lock);
        list_add_tail(&rt->rt_uncached, &ul->head);
        spin_unlock_bh(&ul->lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (!list_empty(&rt->rt_uncached)) {
                struct uncached_list *ul = rt->rt_uncached_list;

                spin_lock_bh(&ul->lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&ul->lock);
        }
}

void rt_flush_dev(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct rtable *rt;
        int cpu;

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                spin_lock_bh(&ul->lock);
                list_for_each_entry(rt, &ul->head, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&ul->lock);
        }
}
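
/* Uncached routes pin their output device; when that device is going
 * away, rt_flush_dev() walks every cpu's list and re-points such routes
 * at the loopback device so the reference on the vanishing device can
 * be dropped.
 */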

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in nexthop exception or
                         * FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache we really need to set it.
                         */
                        rt->dst.flags |= DST_NOCACHE;
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
                                   bool nopolicy, bool noxfrm, bool will_cache)
{
        return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                         (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
                         (nopolicy ? DST_NOPOLICY : 0) |
                         (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        struct rtable *rth;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        u32 itag = 0;
        int err;

        /* Primary sanity checks. */

        if (in_dev == NULL)
                return -EINVAL;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
                if (ipv4_is_loopback(saddr))
                        goto e_inval;

        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr))
                        goto e_inval;
        } else {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto e_err;
        }
        rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
        if (!rth)
                goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->dst.output = ip_rt_bug;

        rth->rt_genid   = rt_genid_ipv4(dev_net(dev));
        rth->rt_flags   = RTCF_MULTICAST;
        rth->rt_type    = RTN_MULTICAST;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        if (our) {
                rth->dst.input = ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        skb_dst_set(skb, &rth->dst);
        return 0;

e_nobufs:
        return -ENOBUFS;
e_inval:
        return -EINVAL;
e_err:
        return err;
}


static void ip_handle_martian_source(struct net_device *dev,
                                     struct in_device *in_dev,
                                     struct sk_buff *skb,
                                     __be32 daddr,
                                     __be32 saddr)
{
        RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 *      RFC 1812 recommendation: if the source is martian,
                 *      the only hint we can give is the MAC header.
                 */
                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
                        &daddr, &saddr, dev->name);
                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
                        print_hex_dump(KERN_WARNING, "ll header: ",
                                       DUMP_PREFIX_OFFSET, 16, 1,
                                       skb_mac_header(skb),
                                       dev->hard_header_len, true);
                }
        }
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
                           const struct fib_result *res,
                           struct in_device *in_dev,
                           __be32 daddr, __be32 saddr, u32 tos)
{
        struct fib_nh_exception *fnhe;
        struct rtable *rth;
        int err;
        struct in_device *out_dev;
        unsigned int flags = 0;
        bool do_cache;
        u32 itag = 0;

        /* get a working reference to the output device */
        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
        if (out_dev == NULL) {
                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
                return -EINVAL;
        }

        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
                                  in_dev->dev, in_dev, &itag);
        if (err < 0) {
                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
                                         saddr);

                goto cleanup;
        }

        do_cache = res->fi && !itag;
        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
            skb->protocol == htons(ETH_P_IP) &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
                IPCB(skb)->flags |= IPSKB_DOREDIRECT;

        if (skb->protocol != htons(ETH_P_IP)) {
1579                /* Not IP (i.e. ARP). Do not create a route if it is
1580                 * invalid for proxy arp. DNAT routes are always valid.
1581                 *
1582                 * The proxy arp feature has been extended to allow ARP
1583                 * replies back on the same interface, to support
1584                 * Private VLAN switch technologies. See arp.c.
1585                 */
1586                if (out_dev == in_dev &&
1587                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1588                        err = -EINVAL;
1589                        goto cleanup;
1590                }
1591        }
1592
1593        fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1594        if (do_cache) {
1595                if (fnhe != NULL)
1596                        rth = rcu_dereference(fnhe->fnhe_rth_input);
1597                else
1598                        rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1599
1600                if (rt_cache_valid(rth)) {
1601                        skb_dst_set_noref(skb, &rth->dst);
1602                        goto out;
1603                }
1604        }
1605
1606        rth = rt_dst_alloc(out_dev->dev,
1607                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
1608                           IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1609        if (!rth) {
1610                err = -ENOBUFS;
1611                goto cleanup;
1612        }
1613
1614        rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
1615        rth->rt_flags = flags;
1616        rth->rt_type = res->type;
1617        rth->rt_is_input = 1;
1618        rth->rt_iif     = 0;
1619        rth->rt_pmtu    = 0;
1620        rth->rt_gateway = 0;
1621        rth->rt_uses_gateway = 0;
1622        INIT_LIST_HEAD(&rth->rt_uncached);
1623        RT_CACHE_STAT_INC(in_slow_tot);
1624
1625        rth->dst.input = ip_forward;
1626        rth->dst.output = ip_output;
1627
1628        rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1629        skb_dst_set(skb, &rth->dst);
1630out:
1631        err = 0;
1632 cleanup:
1633        return err;
1634}
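
/* Example: the two skb_dst setters used above differ in reference
 * semantics. A sketch of the pattern (hypothetical caller, inside an
 * rcu_read_lock() section, as __mkroute_input() is):
 *
 *	rth = rcu_dereference(nh->nh_rth_input);
 *	if (rt_cache_valid(rth))
 *		skb_dst_set_noref(skb, &rth->dst);	/* borrow, no ref taken */
 *	else
 *		skb_dst_set(skb, &new_rth->dst);	/* consumes rt_dst_alloc()'s ref */
 */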
1635
1636static int ip_mkroute_input(struct sk_buff *skb,
1637                            struct fib_result *res,
1638                            const struct flowi4 *fl4,
1639                            struct in_device *in_dev,
1640                            __be32 daddr, __be32 saddr, u32 tos)
1641{
1642#ifdef CONFIG_IP_ROUTE_MULTIPATH
1643        if (res->fi && res->fi->fib_nhs > 1)
1644                fib_select_multipath(res);
1645#endif
1646
1647        /* create a routing cache entry */
1648        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1649}
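
/* Example: with CONFIG_IP_ROUTE_MULTIPATH, fib_select_multipath() records
 * its choice in res->nh_sel, and the FIB_RES_*() accessors used by
 * __mkroute_input() resolve through it. Roughly (from include/net/ip_fib.h):
 *
 *	#define FIB_RES_NH(res)		((res).fi->fib_nh[(res).nh_sel])
 */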
1650
1651/*
1652 *      NOTE. We drop all packets that have a local source
1653 *      address, because every properly looped-back packet
1654 *      must have the correct destination already attached by the output routine.
1655 *
1656 *      Such an approach solves two big problems:
1657 *      1. Non-simplex devices are handled properly.
1658 *      2. IP spoofing attempts are filtered with a 100% guarantee.
1659 *      Called with rcu_read_lock().
1660 */
1661
1662static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1663                               u8 tos, struct net_device *dev)
1664{
1665        struct fib_result res;
1666        struct in_device *in_dev = __in_dev_get_rcu(dev);
1667        struct flowi4   fl4;
1668        unsigned int    flags = 0;
1669        u32             itag = 0;
1670        struct rtable   *rth;
1671        int             err = -EINVAL;
1672        struct net    *net = dev_net(dev);
1673        bool do_cache;
1674
1675        /* IP on this device is disabled. */
1676
1677        if (!in_dev)
1678                goto out;
1679
1680        /* Check for the weirdest martians, which cannot be detected
1681           by fib_lookup.
1682         */
1683
1684        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1685                goto martian_source;
1686
1687        res.fi = NULL;
1688        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1689                goto brd_input;
1690
1691        /* Accept zero addresses only for limited broadcast;
1692         * I do not even know whether to fix this or not. Waiting for complaints :-)
1693         */
1694        if (ipv4_is_zeronet(saddr))
1695                goto martian_source;
1696
1697        if (ipv4_is_zeronet(daddr))
1698                goto martian_destination;
1699
1700        /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1701         * and calls it at most once, when daddr and/or saddr is a loopback address.
1702         */
1703        if (ipv4_is_loopback(daddr)) {
1704                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1705                        goto martian_destination;
1706        } else if (ipv4_is_loopback(saddr)) {
1707                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1708                        goto martian_source;
1709        }
1710
1711        /*
1712         *      Now we are ready to route the packet.
1713         */
1714        fl4.flowi4_oif = 0;
1715        fl4.flowi4_iif = dev->ifindex;
1716        fl4.flowi4_mark = skb->mark;
1717        fl4.flowi4_tos = tos;
1718        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1719        fl4.daddr = daddr;
1720        fl4.saddr = saddr;
1721        err = fib_lookup(net, &fl4, &res);
1722        if (err != 0) {
1723                if (!IN_DEV_FORWARD(in_dev))
1724                        err = -EHOSTUNREACH;
1725                goto no_route;
1726        }
1727
1728        if (res.type == RTN_BROADCAST)
1729                goto brd_input;
1730
1731        if (res.type == RTN_LOCAL) {
1732                err = fib_validate_source(skb, saddr, daddr, tos,
1733                                          0, dev, in_dev, &itag);
1734                if (err < 0)
1735                        goto martian_source_keep_err;
1736                goto local_input;
1737        }
1738
1739        if (!IN_DEV_FORWARD(in_dev)) {
1740                err = -EHOSTUNREACH;
1741                goto no_route;
1742        }
1743        if (res.type != RTN_UNICAST)
1744                goto martian_destination;
1745
1746        err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1747out:    return err;
1748
1749brd_input:
1750        if (skb->protocol != htons(ETH_P_IP))
1751                goto e_inval;
1752
1753        if (!ipv4_is_zeronet(saddr)) {
1754                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1755                                          in_dev, &itag);
1756                if (err < 0)
1757                        goto martian_source_keep_err;
1758        }
1759        flags |= RTCF_BROADCAST;
1760        res.type = RTN_BROADCAST;
1761        RT_CACHE_STAT_INC(in_brd);
1762
1763local_input:
1764        do_cache = false;
1765        if (res.fi) {
1766                if (!itag) {
1767                        rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1768                        if (rt_cache_valid(rth)) {
1769                                skb_dst_set_noref(skb, &rth->dst);
1770                                err = 0;
1771                                goto out;
1772                        }
1773                        do_cache = true;
1774                }
1775        }
1776
1777        rth = rt_dst_alloc(net->loopback_dev,
1778                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1779        if (!rth)
1780                goto e_nobufs;
1781
1782        rth->dst.input= ip_local_deliver;
1783        rth->dst.output= ip_rt_bug;
1784#ifdef CONFIG_IP_ROUTE_CLASSID
1785        rth->dst.tclassid = itag;
1786#endif
1787
1788        rth->rt_genid = rt_genid_ipv4(net);
1789        rth->rt_flags   = flags|RTCF_LOCAL;
1790        rth->rt_type    = res.type;
1791        rth->rt_is_input = 1;
1792        rth->rt_iif     = 0;
1793        rth->rt_pmtu    = 0;
1794        rth->rt_gateway = 0;
1795        rth->rt_uses_gateway = 0;
1796        INIT_LIST_HEAD(&rth->rt_uncached);
1797        RT_CACHE_STAT_INC(in_slow_tot);
1798        if (res.type == RTN_UNREACHABLE) {
1799                rth->dst.input= ip_error;
1800                rth->dst.error= -err;
1801                rth->rt_flags   &= ~RTCF_LOCAL;
1802        }
1803        if (do_cache) {
1804                if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1805                        rth->dst.flags |= DST_NOCACHE;
1806                        rt_add_uncached_list(rth);
1807                }
1808        }
1809        skb_dst_set(skb, &rth->dst);
1810        err = 0;
1811        goto out;
1812
1813no_route:
1814        RT_CACHE_STAT_INC(in_no_route);
1815        res.type = RTN_UNREACHABLE;
1816        res.fi = NULL;
1817        goto local_input;
1818
1819        /*
1820         *      Do not cache martian addresses: they should be logged (RFC1812)
1821         */
1822martian_destination:
1823        RT_CACHE_STAT_INC(in_martian_dst);
1824#ifdef CONFIG_IP_ROUTE_VERBOSE
1825        if (IN_DEV_LOG_MARTIANS(in_dev))
1826                net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1827                                     &daddr, &saddr, dev->name);
1828#endif
1829
1830e_inval:
1831        err = -EINVAL;
1832        goto out;
1833
1834e_nobufs:
1835        err = -ENOBUFS;
1836        goto out;
1837
1838martian_source:
1839        err = -EINVAL;
1840martian_source_keep_err:
1841        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1842        goto out;
1843}
1844
1845int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1846                         u8 tos, struct net_device *dev)
1847{
1848        int res;
1849
1850        rcu_read_lock();
1851
1852        /* Multicast recognition logic was moved from the route cache to here.
1853           The problem was that too many Ethernet cards have broken/missing
1854           hardware multicast filters :-( As a result, a host on a multicast
1855           network acquires a lot of useless route cache entries, e.g. for
1856           SDR messages from all over the world. Now we try to get rid of them.
1857           Really, provided the software IP multicast filter is organized
1858           reasonably (at least, hashed), it does not result in a slowdown
1859           compared with route cache reject entries.
1860           Note that multicast routers are not affected, because a
1861           route cache entry is created eventually.
1862         */
1863        if (ipv4_is_multicast(daddr)) {
1864                struct in_device *in_dev = __in_dev_get_rcu(dev);
1865
1866                if (in_dev) {
1867                        int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1868                                                  ip_hdr(skb)->protocol);
1869                        if (our
1870#ifdef CONFIG_IP_MROUTE
1871                                ||
1872                            (!ipv4_is_local_multicast(daddr) &&
1873                             IN_DEV_MFORWARD(in_dev))
1874#endif
1875                           ) {
1876                                int res = ip_route_input_mc(skb, daddr, saddr,
1877                                                            tos, dev, our);
1878                                rcu_read_unlock();
1879                                return res;
1880                        }
1881                }
1882                rcu_read_unlock();
1883                return -EINVAL;
1884        }
1885        res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1886        rcu_read_unlock();
1887        return res;
1888}
1889EXPORT_SYMBOL(ip_route_input_noref);
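
/* Example: the typical caller is the receive path (e.g. ip_rcv_finish() in
 * ip_input.c), which routes a freshly received skb by its IP header. A
 * minimal sketch, assuming iph points at a validated IPv4 header and
 * "drop" is the caller's error label:
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err;
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 *	return dst_input(skb);	/* dispatches via the dst.input set above */
 */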
1890
1891/* called with rcu_read_lock() */
1892static struct rtable *__mkroute_output(const struct fib_result *res,
1893                                       const struct flowi4 *fl4, int orig_oif,
1894                                       struct net_device *dev_out,
1895                                       unsigned int flags)
1896{
1897        struct fib_info *fi = res->fi;
1898        struct fib_nh_exception *fnhe;
1899        struct in_device *in_dev;
1900        u16 type = res->type;
1901        struct rtable *rth;
1902        bool do_cache;
1903
1904        in_dev = __in_dev_get_rcu(dev_out);
1905        if (!in_dev)
1906                return ERR_PTR(-EINVAL);
1907
1908        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1909                if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1910                        return ERR_PTR(-EINVAL);
1911
1912        if (ipv4_is_lbcast(fl4->daddr))
1913                type = RTN_BROADCAST;
1914        else if (ipv4_is_multicast(fl4->daddr))
1915                type = RTN_MULTICAST;
1916        else if (ipv4_is_zeronet(fl4->daddr))
1917                return ERR_PTR(-EINVAL);
1918
1919        if (dev_out->flags & IFF_LOOPBACK)
1920                flags |= RTCF_LOCAL;
1921
1922        do_cache = true;
1923        if (type == RTN_BROADCAST) {
1924                flags |= RTCF_BROADCAST | RTCF_LOCAL;
1925                fi = NULL;
1926        } else if (type == RTN_MULTICAST) {
1927                flags |= RTCF_MULTICAST | RTCF_LOCAL;
1928                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1929                                     fl4->flowi4_proto))
1930                        flags &= ~RTCF_LOCAL;
1931                else
1932                        do_cache = false;
1933                /* If a multicast route does not exist, use the
1934                 * default one, but do not gateway in this case.
1935                 * Yes, it is a hack.
1936                 */
1937                if (fi && res->prefixlen < 4)
1938                        fi = NULL;
1939        }
1940
1941        fnhe = NULL;
1942        do_cache &= fi != NULL;
1943        if (do_cache) {
1944                struct rtable __rcu **prth;
1945                struct fib_nh *nh = &FIB_RES_NH(*res);
1946
1947                fnhe = find_exception(nh, fl4->daddr);
1948                if (fnhe)
1949                        prth = &fnhe->fnhe_rth_output;
1950                else {
1951                        if (unlikely(fl4->flowi4_flags &
1952                                     FLOWI_FLAG_KNOWN_NH &&
1953                                     !(nh->nh_gw &&
1954                                       nh->nh_scope == RT_SCOPE_LINK))) {
1955                                do_cache = false;
1956                                goto add;
1957                        }
1958                        prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
1959                }
1960                rth = rcu_dereference(*prth);
1961                if (rt_cache_valid(rth)) {
1962                        dst_hold(&rth->dst);
1963                        return rth;
1964                }
1965        }
1966
1967add:
1968        rth = rt_dst_alloc(dev_out,
1969                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
1970                           IN_DEV_CONF_GET(in_dev, NOXFRM),
1971                           do_cache);
1972        if (!rth)
1973                return ERR_PTR(-ENOBUFS);
1974
1975        rth->dst.output = ip_output;
1976
1977        rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
1978        rth->rt_flags   = flags;
1979        rth->rt_type    = type;
1980        rth->rt_is_input = 0;
1981        rth->rt_iif     = orig_oif ? : 0;
1982        rth->rt_pmtu    = 0;
1983        rth->rt_gateway = 0;
1984        rth->rt_uses_gateway = 0;
1985        INIT_LIST_HEAD(&rth->rt_uncached);
1986
1987        RT_CACHE_STAT_INC(out_slow_tot);
1988
1989        if (flags & RTCF_LOCAL)
1990                rth->dst.input = ip_local_deliver;
1991        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1992                if (flags & RTCF_LOCAL &&
1993                    !(dev_out->flags & IFF_LOOPBACK)) {
1994                        rth->dst.output = ip_mc_output;
1995                        RT_CACHE_STAT_INC(out_slow_mc);
1996                }
1997#ifdef CONFIG_IP_MROUTE
1998                if (type == RTN_MULTICAST) {
1999                        if (IN_DEV_MFORWARD(in_dev) &&
2000                            !ipv4_is_local_multicast(fl4->daddr)) {
2001                                rth->dst.input = ip_mr_input;
2002                                rth->dst.output = ip_mc_output;
2003                        }
2004                }
2005#endif
2006        }
2007
2008        rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2009
2010        return rth;
2011}
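
/* Example: unlike the input path, which keeps a single nh_rth_input per
 * nexthop, output routes are cached per CPU, hence raw_cpu_ptr() above.
 * Roughly (from struct fib_nh in include/net/ip_fib.h):
 *
 *	struct rtable __rcu * __percpu *nh_pcpu_rth_output;
 *	...
 *	prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
 *	rth = rcu_dereference(*prth);
 */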
2012
2013/*
2014 * Major route resolver routine.
2015 */
2016
2017struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
2018{
2019        struct net_device *dev_out = NULL;
2020        __u8 tos = RT_FL_TOS(fl4);
2021        unsigned int flags = 0;
2022        struct fib_result res;
2023        struct rtable *rth;
2024        int orig_oif;
2025
2026        res.tclassid    = 0;
2027        res.fi          = NULL;
2028        res.table       = NULL;
2029
2030        orig_oif = fl4->flowi4_oif;
2031
2032        fl4->flowi4_iif = LOOPBACK_IFINDEX;
2033        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2034        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2035                         RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2036
2037        rcu_read_lock();
2038        if (fl4->saddr) {
2039                rth = ERR_PTR(-EINVAL);
2040                if (ipv4_is_multicast(fl4->saddr) ||
2041                    ipv4_is_lbcast(fl4->saddr) ||
2042                    ipv4_is_zeronet(fl4->saddr))
2043                        goto out;
2044
2045                /* I removed the check for oif == dev_out->oif here.
2046                   It was wrong for two reasons:
2047                   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2048                      is assigned to multiple interfaces.
2049                   2. Moreover, we are allowed to send packets with the saddr
2050                      of another iface. --ANK
2051                 */
2052
2053                if (fl4->flowi4_oif == 0 &&
2054                    (ipv4_is_multicast(fl4->daddr) ||
2055                     ipv4_is_lbcast(fl4->daddr))) {
2056                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2057                        dev_out = __ip_dev_find(net, fl4->saddr, false);
2058                        if (dev_out == NULL)
2059                                goto out;
2060
2061                        /* Special hack: the user can direct multicasts
2062                           and limited broadcast via the necessary interface
2063                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2064                           This hack is not just for fun; it allows
2065                           vic, vat and friends to work.
2066                           They bind a socket to loopback, set the ttl to zero
2067                           and expect that it will work.
2068                           From the viewpoint of the routing cache they are broken,
2069                           because we are not allowed to build a multicast path
2070                           with a loopback source addr (look, the routing cache
2071                           cannot know that the ttl is zero, so the packet
2072                           will not leave this host and the route is valid).
2073                           Luckily, this hack is a good workaround.
2074                         */
2075
2076                        fl4->flowi4_oif = dev_out->ifindex;
2077                        goto make_route;
2078                }
2079
2080                if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2081                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2082                        if (!__ip_dev_find(net, fl4->saddr, false))
2083                                goto out;
2084                }
2085        }
2086
2087
2088        if (fl4->flowi4_oif) {
2089                dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2090                rth = ERR_PTR(-ENODEV);
2091                if (dev_out == NULL)
2092                        goto out;
2093
2094                /* RACE: Check return value of inet_select_addr instead. */
2095                if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2096                        rth = ERR_PTR(-ENETUNREACH);
2097                        goto out;
2098                }
2099                if (ipv4_is_local_multicast(fl4->daddr) ||
2100                    ipv4_is_lbcast(fl4->daddr)) {
2101                        if (!fl4->saddr)
2102                                fl4->saddr = inet_select_addr(dev_out, 0,
2103                                                              RT_SCOPE_LINK);
2104                        goto make_route;
2105                }
2106                if (!fl4->saddr) {
2107                        if (ipv4_is_multicast(fl4->daddr))
2108                                fl4->saddr = inet_select_addr(dev_out, 0,
2109                                                              fl4->flowi4_scope);
2110                        else if (!fl4->daddr)
2111                                fl4->saddr = inet_select_addr(dev_out, 0,
2112                                                              RT_SCOPE_HOST);
2113                }
2114        }
2115
2116        if (!fl4->daddr) {
2117                fl4->daddr = fl4->saddr;
2118                if (!fl4->daddr)
2119                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2120                dev_out = net->loopback_dev;
2121                fl4->flowi4_oif = LOOPBACK_IFINDEX;
2122                res.type = RTN_LOCAL;
2123                flags |= RTCF_LOCAL;
2124                goto make_route;
2125        }
2126
2127        if (fib_lookup(net, fl4, &res)) {
2128                res.fi = NULL;
2129                res.table = NULL;
2130                if (fl4->flowi4_oif) {
2131                        /* Apparently, the routing tables are wrong. Assume
2132                           that the destination is on-link.
2133
2134                           WHY? DW.
2135                           Because we are allowed to send to an iface
2136                           even if it has NO routes and NO assigned
2137                           addresses. When oif is specified, the routing
2138                           tables are looked up with only one purpose:
2139                           to catch whether the destination is gatewayed, rather
2140                           than direct. Moreover, if MSG_DONTROUTE is set,
2141                           we send the packet, ignoring both the routing tables
2142                           and the ifaddr state. --ANK
2143
2144
2145                           We could do this even if oif is unknown
2146                           (as IPv6 likely does), but we do not.
2147                         */
2148
2149                        if (fl4->saddr == 0)
2150                                fl4->saddr = inet_select_addr(dev_out, 0,
2151                                                              RT_SCOPE_LINK);
2152                        res.type = RTN_UNICAST;
2153                        goto make_route;
2154                }
2155                rth = ERR_PTR(-ENETUNREACH);
2156                goto out;
2157        }
2158
2159        if (res.type == RTN_LOCAL) {
2160                if (!fl4->saddr) {
2161                        if (res.fi->fib_prefsrc)
2162                                fl4->saddr = res.fi->fib_prefsrc;
2163                        else
2164                                fl4->saddr = fl4->daddr;
2165                }
2166                dev_out = net->loopback_dev;
2167                fl4->flowi4_oif = dev_out->ifindex;
2168                flags |= RTCF_LOCAL;
2169                goto make_route;
2170        }
2171
2172#ifdef CONFIG_IP_ROUTE_MULTIPATH
2173        if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2174                fib_select_multipath(&res);
2175        else
2176#endif
2177        if (!res.prefixlen &&
2178            res.table->tb_num_default > 1 &&
2179            res.type == RTN_UNICAST && !fl4->flowi4_oif)
2180                fib_select_default(&res);
2181
2182        if (!fl4->saddr)
2183                fl4->saddr = FIB_RES_PREFSRC(net, res);
2184
2185        dev_out = FIB_RES_DEV(res);
2186        fl4->flowi4_oif = dev_out->ifindex;
2187
2188
2189make_route:
2190        rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2191
2192out:
2193        rcu_read_unlock();
2194        return rth;
2195}
2196EXPORT_SYMBOL_GPL(__ip_route_output_key);
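
/* Example: a minimal output-lookup sketch (illustrative values; real
 * callers normally use the ip_route_output_key()/ip_route_output_flow()
 * wrappers from include/net/route.h rather than calling this directly):
 *
 *	struct flowi4 fl4;
 *	struct rtable *rt;
 *
 *	memset(&fl4, 0, sizeof(fl4));
 *	fl4.daddr = htonl(0x08080808);	/* 8.8.8.8, purely illustrative */
 *	rt = __ip_route_output_key(net, &fl4);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);			/* release the reference when done */
 */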
2197
2198static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2199{
2200        return NULL;
2201}
2202
2203static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2204{
2205        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2206
2207        return mtu ? : dst->dev->mtu;
2208}
2209
2210static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2211                                          struct sk_buff *skb, u32 mtu)
2212{
2213}
2214
2215static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2216                                       struct sk_buff *skb)
2217{
2218}
2219
2220static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2221                                          unsigned long old)
2222{
2223        return NULL;
2224}
2225
2226static struct dst_ops ipv4_dst_blackhole_ops = {
2227        .family                 =       AF_INET,
2228        .protocol               =       cpu_to_be16(ETH_P_IP),
2229        .check                  =       ipv4_blackhole_dst_check,
2230        .mtu                    =       ipv4_blackhole_mtu,
2231        .default_advmss         =       ipv4_default_advmss,
2232        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2233        .redirect               =       ipv4_rt_blackhole_redirect,
2234        .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2235        .neigh_lookup           =       ipv4_neigh_lookup,
2236};
2237
2238struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2239{
2240        struct rtable *ort = (struct rtable *) dst_orig;
2241        struct rtable *rt;
2242
2243        rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2244        if (rt) {
2245                struct dst_entry *new = &rt->dst;
2246
2247                new->__use = 1;
2248                new->input = dst_discard;
2249                new->output = dst_discard_sk;
2250
2251                new->dev = ort->dst.dev;
2252                if (new->dev)
2253                        dev_hold(new->dev);
2254
2255                rt->rt_is_input = ort->rt_is_input;
2256                rt->rt_iif = ort->rt_iif;
2257                rt->rt_pmtu = ort->rt_pmtu;
2258
2259                rt->rt_genid = rt_genid_ipv4(net);
2260                rt->rt_flags = ort->rt_flags;
2261                rt->rt_type = ort->rt_type;
2262                rt->rt_gateway = ort->rt_gateway;
2263                rt->rt_uses_gateway = ort->rt_uses_gateway;
2264
2265                INIT_LIST_HEAD(&rt->rt_uncached);
2266
2267                dst_free(new);
2268        }
2269
2270        dst_release(dst_orig);
2271
2272        return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2273}
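
/* Example: every method on the blackhole dst is a no-op or a constant, so
 * packets routed through it are silently discarded while the caller still
 * holds a valid-looking dst. xfrm uses this (via its blackhole_route hook)
 * when it must not hand back the real route. The discard wiring above:
 *
 *	new->input  = dst_discard;	// RX path: drop
 *	new->output = dst_discard_sk;	// TX path: drop
 */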
2274
2275struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2276                                    struct sock *sk)
2277{
2278        struct rtable *rt = __ip_route_output_key(net, flp4);
2279
2280        if (IS_ERR(rt))
2281                return rt;
2282
2283        if (flp4->flowi4_proto)
2284                rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2285                                                        flowi4_to_flowi(flp4),
2286                                                        sk, 0);
2287
2288        return rt;
2289}
2290EXPORT_SYMBOL_GPL(ip_route_output_flow);
2291
2292static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2293                        struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2294                        u32 seq, int event, int nowait, unsigned int flags)
2295{
2296        struct rtable *rt = skb_rtable(skb);
2297        struct rtmsg *r;
2298        struct nlmsghdr *nlh;
2299        unsigned long expires = 0;
2300        u32 error;
2301        u32 metrics[RTAX_MAX];
2302
2303        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2304        if (nlh == NULL)
2305                return -EMSGSIZE;
2306
2307        r = nlmsg_data(nlh);
2308        r->rtm_family    = AF_INET;
2309        r->rtm_dst_len  = 32;
2310        r->rtm_src_len  = 0;
2311        r->rtm_tos      = fl4->flowi4_tos;
2312        r->rtm_table    = RT_TABLE_MAIN;
2313        if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2314                goto nla_put_failure;
2315        r->rtm_type     = rt->rt_type;
2316        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2317        r->rtm_protocol = RTPROT_UNSPEC;
2318        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2319        if (rt->rt_flags & RTCF_NOTIFY)
2320                r->rtm_flags |= RTM_F_NOTIFY;
2321        if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2322                r->rtm_flags |= RTCF_DOREDIRECT;
2323
2324        if (nla_put_be32(skb, RTA_DST, dst))
2325                goto nla_put_failure;
2326        if (src) {
2327                r->rtm_src_len = 32;
2328                if (nla_put_be32(skb, RTA_SRC, src))
2329                        goto nla_put_failure;
2330        }
2331        if (rt->dst.dev &&
2332            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2333                goto nla_put_failure;
2334#ifdef CONFIG_IP_ROUTE_CLASSID
2335        if (rt->dst.tclassid &&
2336            nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2337                goto nla_put_failure;
2338#endif
2339        if (!rt_is_input_route(rt) &&
2340            fl4->saddr != src) {
2341                if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2342                        goto nla_put_failure;
2343        }
2344        if (rt->rt_uses_gateway &&
2345            nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2346                goto nla_put_failure;
2347
2348        expires = rt->dst.expires;
2349        if (expires) {
2350                unsigned long now = jiffies;
2351
2352                if (time_before(now, expires))
2353                        expires -= now;
2354                else
2355                        expires = 0;
2356        }
2357
2358        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2359        if (rt->rt_pmtu && expires)
2360                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2361        if (rtnetlink_put_metrics(skb, metrics) < 0)
2362                goto nla_put_failure;
2363
2364        if (fl4->flowi4_mark &&
2365            nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2366                goto nla_put_failure;
2367
2368        error = rt->dst.error;
2369
2370        if (rt_is_input_route(rt)) {
2371#ifdef CONFIG_IP_MROUTE
2372                if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2373                    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2374                        int err = ipmr_get_route(net, skb,
2375                                                 fl4->saddr, fl4->daddr,
2376                                                 r, nowait);
2377                        if (err <= 0) {
2378                                if (!nowait) {
2379                                        if (err == 0)
2380                                                return 0;
2381                                        goto nla_put_failure;
2382                                } else {
2383                                        if (err == -EMSGSIZE)
2384                                                goto nla_put_failure;
2385                                        error = err;
2386                                }
2387                        }
2388                } else
2389#endif
2390                        if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2391                                goto nla_put_failure;
2392        }
2393
2394        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2395                goto nla_put_failure;
2396
2397        nlmsg_end(skb, nlh);
2398        return 0;
2399
2400nla_put_failure:
2401        nlmsg_cancel(skb, nlh);
2402        return -EMSGSIZE;
2403}
2404
2405static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2406{
2407        struct net *net = sock_net(in_skb->sk);
2408        struct rtmsg *rtm;
2409        struct nlattr *tb[RTA_MAX+1];
2410        struct rtable *rt = NULL;
2411        struct flowi4 fl4;
2412        __be32 dst = 0;
2413        __be32 src = 0;
2414        u32 iif;
2415        int err;
2416        int mark;
2417        struct sk_buff *skb;
2418
2419        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2420        if (err < 0)
2421                goto errout;
2422
2423        rtm = nlmsg_data(nlh);
2424
2425        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2426        if (skb == NULL) {
2427                err = -ENOBUFS;
2428                goto errout;
2429        }
2430
2431        /* Reserve room for dummy headers; this skb can pass
2432           through a good chunk of the routing engine.
2433         */
2434        skb_reset_mac_header(skb);
2435        skb_reset_network_header(skb);
2436
2437        /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2438        ip_hdr(skb)->protocol = IPPROTO_ICMP;
2439        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2440
2441        src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2442        dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2443        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2444        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2445
2446        memset(&fl4, 0, sizeof(fl4));
2447        fl4.daddr = dst;
2448        fl4.saddr = src;
2449        fl4.flowi4_tos = rtm->rtm_tos;
2450        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2451        fl4.flowi4_mark = mark;
2452
2453        if (iif) {
2454                struct net_device *dev;
2455
2456                dev = __dev_get_by_index(net, iif);
2457                if (dev == NULL) {
2458                        err = -ENODEV;
2459                        goto errout_free;
2460                }
2461
2462                skb->protocol   = htons(ETH_P_IP);
2463                skb->dev        = dev;
2464                skb->mark       = mark;
2465                local_bh_disable();
2466                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2467                local_bh_enable();
2468
2469                rt = skb_rtable(skb);
2470                if (err == 0 && rt->dst.error)
2471                        err = -rt->dst.error;
2472        } else {
2473                rt = ip_route_output_key(net, &fl4);
2474
2475                err = 0;
2476                if (IS_ERR(rt))
2477                        err = PTR_ERR(rt);
2478        }
2479
2480        if (err)
2481                goto errout_free;
2482
2483        skb_dst_set(skb, &rt->dst);
2484        if (rtm->rtm_flags & RTM_F_NOTIFY)
2485                rt->rt_flags |= RTCF_NOTIFY;
2486
2487        err = rt_fill_info(net, dst, src, &fl4, skb,
2488                           NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2489                           RTM_NEWROUTE, 0, 0);
2490        if (err < 0)
2491                goto errout_free;
2492
2493        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2494errout:
2495        return err;
2496
2497errout_free:
2498        kfree_skb(skb);
2499        goto errout;
2500}
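
/* Example: this handler backs "ip route get" from iproute2. The request is
 * an RTM_GETROUTE message whose RTA_DST/RTA_SRC/RTA_IIF/RTA_OIF/RTA_MARK
 * attributes fill fl4 above; the reply is a single RTM_NEWROUTE built by
 * rt_fill_info(). A userspace request sketch (illustrative layout, sent
 * over a NETLINK_ROUTE socket):
 *
 *	struct {
 *		struct nlmsghdr	nh;
 *		struct rtmsg	rtm;
 *		struct rtattr	rta;
 *		__be32		dst;
 *	} req = {
 *		.nh.nlmsg_len	= sizeof(req),
 *		.nh.nlmsg_type	= RTM_GETROUTE,
 *		.nh.nlmsg_flags	= NLM_F_REQUEST,
 *		.rtm.rtm_family	= AF_INET,
 *		.rta.rta_type	= RTA_DST,
 *		.rta.rta_len	= RTA_LENGTH(sizeof(__be32)),
 *		.dst		= htonl(0x08080808),	/* 8.8.8.8, illustrative */
 *	};
 */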
2501
2502void ip_rt_multicast_event(struct in_device *in_dev)
2503{
2504        rt_cache_flush(dev_net(in_dev->dev));
2505}
2506
2507#ifdef CONFIG_SYSCTL
2508static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2509static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2510static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2511static int ip_rt_gc_elasticity __read_mostly    = 8;
2512
2513static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2514                                        void __user *buffer,
2515                                        size_t *lenp, loff_t *ppos)
2516{
2517        struct net *net = (struct net *)__ctl->extra1;
2518
2519        if (write) {
2520                rt_cache_flush(net);
2521                fnhe_genid_bump(net);
2522                return 0;
2523        }
2524
2525        return -EINVAL;
2526}
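
/* Example: the handler above only acts on writes (reads fail with -EINVAL),
 * matching the write-only (0200) "flush" entry registered below. A
 * userspace sketch that forces a cache flush plus an fnhe genid bump
 * (assuming the usual /proc mount):
 *
 *	FILE *f = fopen("/proc/sys/net/ipv4/route/flush", "w");
 *
 *	if (f) {
 *		fputs("1", f);	/* any successful write triggers the flush */
 *		fclose(f);
 *	}
 */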
2527
2528static struct ctl_table ipv4_route_table[] = {
2529        {
2530                .procname       = "gc_thresh",
2531                .data           = &ipv4_dst_ops.gc_thresh,
2532                .maxlen         = sizeof(int),
2533                .mode           = 0644,
2534                .proc_handler   = proc_dointvec,
2535        },
2536        {
2537                .procname       = "max_size",
2538                .data           = &ip_rt_max_size,
2539                .maxlen         = sizeof(int),
2540                .mode           = 0644,
2541                .proc_handler   = proc_dointvec,
2542        },
2543        {
2544                /*  Deprecated. Use gc_min_interval_ms */
2545
2546                .procname       = "gc_min_interval",
2547                .data           = &ip_rt_gc_min_interval,
2548                .maxlen         = sizeof(int),
2549                .mode           = 0644,
2550                .proc_handler   = proc_dointvec_jiffies,
2551        },
2552        {
2553                .procname       = "gc_min_interval_ms",
2554                .data           = &ip_rt_gc_min_interval,
2555                .maxlen         = sizeof(int),
2556                .mode           = 0644,
2557                .proc_handler   = proc_dointvec_ms_jiffies,
2558        },
2559        {
2560                .procname       = "gc_timeout",
2561                .data           = &ip_rt_gc_timeout,
2562                .maxlen         = sizeof(int),
2563                .mode           = 0644,
2564                .proc_handler   = proc_dointvec_jiffies,
2565        },
2566        {
2567                .procname       = "gc_interval",
2568                .data           = &ip_rt_gc_interval,
2569                .maxlen         = sizeof(int),
2570                .mode           = 0644,
2571                .proc_handler   = proc_dointvec_jiffies,
2572        },
2573        {
2574                .procname       = "redirect_load",
2575                .data           = &ip_rt_redirect_load,
2576                .maxlen         = sizeof(int),
2577                .mode           = 0644,
2578                .proc_handler   = proc_dointvec,
2579        },
2580        {
2581                .procname       = "redirect_number",
2582                .data           = &ip_rt_redirect_number,
2583                .maxlen         = sizeof(int),
2584                .mode           = 0644,
2585                .proc_handler   = proc_dointvec,
2586        },
2587        {
2588                .procname       = "redirect_silence",
2589                .data           = &ip_rt_redirect_silence,
2590                .maxlen         = sizeof(int),
2591                .mode           = 0644,
2592                .proc_handler   = proc_dointvec,
2593        },
2594        {
2595                .procname       = "error_cost",
2596                .data           = &ip_rt_error_cost,
2597                .maxlen         = sizeof(int),
2598                .mode           = 0644,
2599                .proc_handler   = proc_dointvec,
2600        },
2601        {
2602                .procname       = "error_burst",
2603                .data           = &ip_rt_error_burst,
2604                .maxlen         = sizeof(int),
2605                .mode           = 0644,
2606                .proc_handler   = proc_dointvec,
2607        },
2608        {
2609                .procname       = "gc_elasticity",
2610                .data           = &ip_rt_gc_elasticity,
2611                .maxlen         = sizeof(int),
2612                .mode           = 0644,
2613                .proc_handler   = proc_dointvec,
2614        },
2615        {
2616                .procname       = "mtu_expires",
2617                .data           = &ip_rt_mtu_expires,
2618                .maxlen         = sizeof(int),
2619                .mode           = 0644,
2620                .proc_handler   = proc_dointvec_jiffies,
2621        },
2622        {
2623                .procname       = "min_pmtu",
2624                .data           = &ip_rt_min_pmtu,
2625                .maxlen         = sizeof(int),
2626                .mode           = 0644,
2627                .proc_handler   = proc_dointvec,
2628        },
2629        {
2630                .procname       = "min_adv_mss",
2631                .data           = &ip_rt_min_advmss,
2632                .maxlen         = sizeof(int),
2633                .mode           = 0644,
2634                .proc_handler   = proc_dointvec,
2635        },
2636        { }
2637};
2638
2639static struct ctl_table ipv4_route_flush_table[] = {
2640        {
2641                .procname       = "flush",
2642                .maxlen         = sizeof(int),
2643                .mode           = 0200,
2644                .proc_handler   = ipv4_sysctl_rtcache_flush,
2645        },
2646        { },
2647};
2648
2649static __net_init int sysctl_route_net_init(struct net *net)
2650{
2651        struct ctl_table *tbl;
2652
2653        tbl = ipv4_route_flush_table;
2654        if (!net_eq(net, &init_net)) {
2655                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2656                if (tbl == NULL)
2657                        goto err_dup;
2658
2659                /* Don't export sysctls to unprivileged users */
2660                if (net->user_ns != &init_user_ns)
2661                        tbl[0].procname = NULL;
2662        }
2663        tbl[0].extra1 = net;
2664
2665        net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2666        if (net->ipv4.route_hdr == NULL)
2667                goto err_reg;
2668        return 0;
2669
2670err_reg:
2671        if (tbl != ipv4_route_flush_table)
2672                kfree(tbl);
2673err_dup:
2674        return -ENOMEM;
2675}
2676
2677static __net_exit void sysctl_route_net_exit(struct net *net)
2678{
2679        struct ctl_table *tbl;
2680
2681        tbl = net->ipv4.route_hdr->ctl_table_arg;
2682        unregister_net_sysctl_table(net->ipv4.route_hdr);
2683        BUG_ON(tbl == ipv4_route_flush_table);
2684        kfree(tbl);
2685}
2686
2687static __net_initdata struct pernet_operations sysctl_route_ops = {
2688        .init = sysctl_route_net_init,
2689        .exit = sysctl_route_net_exit,
2690};
2691#endif
2692
2693static __net_init int rt_genid_init(struct net *net)
2694{
2695        atomic_set(&net->ipv4.rt_genid, 0);
2696        atomic_set(&net->fnhe_genid, 0);
2697        get_random_bytes(&net->ipv4.dev_addr_genid,
2698                         sizeof(net->ipv4.dev_addr_genid));
2699        return 0;
2700}
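
/* Example: cached routes carry the genid sampled at allocation time (the
 * rt_genid assignments above), so bumping the per-netns counter lazily
 * invalidates them all. Roughly how this file checks it (rt_is_expired()):
 *
 *	if (rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev)))
 *		;	/* stale: take the slow path and rebuild */
 */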
2701
2702static __net_initdata struct pernet_operations rt_genid_ops = {
2703        .init = rt_genid_init,
2704};
2705
2706static int __net_init ipv4_inetpeer_init(struct net *net)
2707{
2708        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2709
2710        if (!bp)
2711                return -ENOMEM;
2712        inet_peer_base_init(bp);
2713        net->ipv4.peers = bp;
2714        return 0;
2715}
2716
2717static void __net_exit ipv4_inetpeer_exit(struct net *net)
2718{
2719        struct inet_peer_base *bp = net->ipv4.peers;
2720
2721        net->ipv4.peers = NULL;
2722        inetpeer_invalidate_tree(bp);
2723        kfree(bp);
2724}
2725
2726static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2727        .init   =       ipv4_inetpeer_init,
2728        .exit   =       ipv4_inetpeer_exit,
2729};
2730
2731#ifdef CONFIG_IP_ROUTE_CLASSID
2732struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2733#endif /* CONFIG_IP_ROUTE_CLASSID */
2734
2735int __init ip_rt_init(void)
2736{
2737        int rc = 0;
2738        int cpu;
2739
2740        ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2741        if (!ip_idents)
2742                panic("IP: failed to allocate ip_idents\n");
2743
2744        prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2745
2746        for_each_possible_cpu(cpu) {
2747                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2748
2749                INIT_LIST_HEAD(&ul->head);
2750                spin_lock_init(&ul->lock);
2751        }
2752#ifdef CONFIG_IP_ROUTE_CLASSID
2753        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2754        if (!ip_rt_acct)
2755                panic("IP: failed to allocate ip_rt_acct\n");
2756#endif
2757
2758        ipv4_dst_ops.kmem_cachep =
2759                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2760                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2761
2762        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2763
2764        if (dst_entries_init(&ipv4_dst_ops) < 0)
2765                panic("IP: failed to allocate ipv4_dst_ops counter\n");
2766
2767        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2768                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2769
2770        ipv4_dst_ops.gc_thresh = ~0;
2771        ip_rt_max_size = INT_MAX;
2772
2773        devinet_init();
2774        ip_fib_init();
2775
2776        if (ip_rt_proc_init())
2777                pr_err("Unable to create route proc files\n");
2778#ifdef CONFIG_XFRM
2779        xfrm_init();
2780        xfrm4_init();
2781#endif
2782        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2783
2784#ifdef CONFIG_SYSCTL
2785        register_pernet_subsys(&sysctl_route_ops);
2786#endif
2787        register_pernet_subsys(&rt_genid_ops);
2788        register_pernet_subsys(&ipv4_inetpeer_ops);
2789        return rc;
2790}
2791
2792#ifdef CONFIG_SYSCTL
2793/*
2794 * We really need to sanitize the damn ipv4 init order, then all
2795 * this nonsense will go away.
2796 */
2797void __init ip_static_sysctl_init(void)
2798{
2799        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2800}
2801#endif
2802