linux/net/ipv4/route.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

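/* IPv4 routes reference their metrics directly from the FIB entry
 * (see dst_init_metrics() in rt_set_nexthop()), so the generic
 * copy-on-write metrics path should never be reached; warn if it is.
 */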
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

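/* Map the TOS bits of the IP header (ECN bit masked off) to a packet
 * scheduler priority band; see rt_tos2priority() for the accessor.
 */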
#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

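/* /proc/net/rt_cache is kept for compatibility with old tools; with
 * the routing cache gone it only ever prints the header line.
 */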
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = ACCESS_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        static u32 ip_idents_hashrnd __read_mostly;
        u32 hash, id;

        net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

        hash = jhash_3words((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol ^ net_hash_mix(net),
                            ip_idents_hashrnd);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

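/* Build a flowi4 lookup key from the packet headers; when a socket is
 * supplied, its bound device, mark, TOS and protocol override the
 * values derived from the packet.
 */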
static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                rt_free(rt);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                rt_free(rt);
        }
}

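/* Pick the least recently updated entry in a full exception chain for
 * reuse, flushing its cached routes first.
 */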
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

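/* Hash a destination address into one of the FNHE_HASH_SIZE buckets,
 * mixed with a once-initialised random seed so chain placement is not
 * predictable by remote hosts.
 */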
static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

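/* Record state learned from ICMP (a redirect gateway and/or a path MTU)
 * as a nexthop exception keyed by destination address. An existing
 * entry is updated in place; otherwise a new one is created, recycling
 * the oldest entry when the chain exceeds FNHE_RECLAIM_DEPTH, and the
 * routes cached on the nexthop are marked obsolete so later lookups
 * recheck the exception table.
 */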
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        unsigned int i;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nh->nh_exceptions);
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nh->nh_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = max(1UL, expires);
                }
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

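/* Act on an ICMP redirect: validate the advised gateway against the
 * receiving interface's configuration before recording it as a nexthop
 * exception; rejected redirects are logged when martian logging is on.
 */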
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                      0, 0);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them altogether,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         has forgotten the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything;
         * set peer->rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

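/* Send an ICMP destination-unreachable error for a packet we could not
 * route. Errors are rate limited per source using a token bucket kept
 * in the inet_peer cache: tokens accrue with the jiffies elapsed since
 * rate_last, capped at ip_rt_error_burst, and each message costs
 * ip_rt_error_cost.
 */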
static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

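/* Shrink the path MTU towards a destination in response to ICMP
 * feedback: the new value is clamped at ip_rt_min_pmtu and stored as a
 * nexthop exception that expires after ip_rt_mtu_expires, unless the
 * MTU metric is locked or the route already has a smaller PMTU.
 */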
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (ipv4_mtu(dst) < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        if (!mark)
                mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by the IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * Remember: "addr" is not guaranteed to be aligned
 * in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}

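/* Effective MTU of a route: an unexpired learned path MTU wins, then
 * an MTU from the route metrics, then the device MTU (capped at 576
 * when the metric is locked and the route goes via a gateway), never
 * exceeding IP_MAX_MTU.
 */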
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        return min_t(unsigned int, mtu, IP_MAX_MTU);
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

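/* Bind a new route to a matching nexthop exception under fnhe_lock:
 * the exception's gateway/PMTU data is copied into the route and, for
 * cacheable routes, the route is stored in the exception's input or
 * output slot. A generation-id mismatch wipes the stale exception data
 * first. Returns true if the route was cached in the exception.
 */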
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (!(rt->dst.flags & DST_NOCACHE)) {
                        rcu_assign_pointer(*porig, rt);
                        if (orig)
                                rt_free(orig);
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

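/* Cache a route in its nexthop, in either the shared input slot or
 * this CPU's output slot, using a lockless cmpxchg. Returns false if
 * another CPU won the race, in which case the caller must treat the
 * route as uncached.
 */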
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}

struct uncached_list {
        spinlock_t              lock;
        struct list_head        head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

        rt->rt_uncached_list = ul;

        spin_lock_bh(&ul->lock);
        list_add_tail(&rt->rt_uncached, &ul->head);
        spin_unlock_bh(&ul->lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (!list_empty(&rt->rt_uncached)) {
                struct uncached_list *ul = rt->rt_uncached_list;

                spin_lock_bh(&ul->lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&ul->lock);
        }
}

void rt_flush_dev(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct rtable *rt;
        int cpu;

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                spin_lock_bh(&ul->lock);
                list_for_each_entry(rt, &ul->head, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&ul->lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

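/* Finish initialising a route from a FIB lookup result: inherit the
 * gateway, metrics, classid and lwtunnel state from the nexthop, then
 * either bind the route to a nexthop exception or cache it on the
 * nexthop. Routes that cannot be cached go on the per-CPU uncached
 * list so rt_flush_dev() can still find them.
 */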
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in nexthop exception or
                         * FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache we really need to set it.
                         */
                        rt->dst.flags |= DST_NOCACHE;
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

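/* Allocate a dst_entry for an IPv4 route and reset the rtable fields
 * to safe defaults; routes that will not be cached are flagged
 * DST_HOST | DST_NOCACHE up front.
 */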
1440static struct rtable *rt_dst_alloc(struct net_device *dev,
1441                                   unsigned int flags, u16 type,
1442                                   bool nopolicy, bool noxfrm, bool will_cache)
1443{
1444        struct rtable *rt;
1445
1446        rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1447                       (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1448                       (nopolicy ? DST_NOPOLICY : 0) |
1449                       (noxfrm ? DST_NOXFRM : 0));
1450
1451        if (rt) {
1452                rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1453                rt->rt_flags = flags;
1454                rt->rt_type = type;
1455                rt->rt_is_input = 0;
1456                rt->rt_iif = 0;
1457                rt->rt_pmtu = 0;
1458                rt->rt_gateway = 0;
1459                rt->rt_uses_gateway = 0;
1460                rt->rt_table_id = 0;
1461                INIT_LIST_HEAD(&rt->rt_uncached);
1462
1463                rt->dst.output = ip_output;
1464                if (flags & RTCF_LOCAL)
1465                        rt->dst.input = ip_local_deliver;
1466        }
1467
1468        return rt;
1469}
1470
1471/* called in rcu_read_lock() section */
1472static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1473                                u8 tos, struct net_device *dev, int our)
1474{
1475        struct rtable *rth;
1476        struct in_device *in_dev = __in_dev_get_rcu(dev);
1477        unsigned int flags = RTCF_MULTICAST;
1478        u32 itag = 0;
1479        int err;
1480
1481        /* Primary sanity checks. */
1482
1483        if (!in_dev)
1484                return -EINVAL;
1485
1486        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1487            skb->protocol != htons(ETH_P_IP))
1488                goto e_inval;
1489
1490        if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1491                goto e_inval;
1492
1493        if (ipv4_is_zeronet(saddr)) {
1494                if (!ipv4_is_local_multicast(daddr))
1495                        goto e_inval;
1496        } else {
1497                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1498                                          in_dev, &itag);
1499                if (err < 0)
1500                        goto e_err;
1501        }
1502        if (our)
1503                flags |= RTCF_LOCAL;
1504
1505        rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1506                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1507        if (!rth)
1508                goto e_nobufs;
1509
1510#ifdef CONFIG_IP_ROUTE_CLASSID
1511        rth->dst.tclassid = itag;
1512#endif
1513        rth->dst.output = ip_rt_bug;
1514        rth->rt_is_input= 1;
1515
1516#ifdef CONFIG_IP_MROUTE
1517        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1518                rth->dst.input = ip_mr_input;
1519#endif
1520        RT_CACHE_STAT_INC(in_slow_mc);
1521
1522        skb_dst_set(skb, &rth->dst);
1523        return 0;
1524
1525e_nobufs:
1526        return -ENOBUFS;
1527e_inval:
1528        return -EINVAL;
1529e_err:
1530        return err;
1531}
1532
1533
1534static void ip_handle_martian_source(struct net_device *dev,
1535                                     struct in_device *in_dev,
1536                                     struct sk_buff *skb,
1537                                     __be32 daddr,
1538                                     __be32 saddr)
1539{
1540        RT_CACHE_STAT_INC(in_martian_src);
1541#ifdef CONFIG_IP_ROUTE_VERBOSE
1542        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1543                /*
1544                 *      RFC1812 recommendation, if source is martian,
1545                 *      the only hint is MAC header.
1546                 */
1547                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1548                        &daddr, &saddr, dev->name);
1549                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1550                        print_hex_dump(KERN_WARNING, "ll header: ",
1551                                       DUMP_PREFIX_OFFSET, 16, 1,
1552                                       skb_mac_header(skb),
1553                                       dev->hard_header_len, true);
1554                }
1555        }
1556#endif
1557}
1558
1559/* called in rcu_read_lock() section */
1560static int __mkroute_input(struct sk_buff *skb,
1561                           const struct fib_result *res,
1562                           struct in_device *in_dev,
1563                           __be32 daddr, __be32 saddr, u32 tos)
1564{
1565        struct fib_nh_exception *fnhe;
1566        struct rtable *rth;
1567        int err;
1568        struct in_device *out_dev;
1569        bool do_cache;
1570        u32 itag = 0;
1571
1572        /* get a working reference to the output device */
1573        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1574        if (!out_dev) {
1575                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1576                return -EINVAL;
1577        }
1578
1579        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1580                                  in_dev->dev, in_dev, &itag);
1581        if (err < 0) {
1582                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1583                                         saddr);
1584
1585                goto cleanup;
1586        }
1587
1588        do_cache = res->fi && !itag;
1589        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1590            skb->protocol == htons(ETH_P_IP) &&
1591            (IN_DEV_SHARED_MEDIA(out_dev) ||
1592             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1593                IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1594
1595        if (skb->protocol != htons(ETH_P_IP)) {
1596                /* Not IP (i.e. ARP). Do not create a route if it is
1597                 * invalid for proxy arp. DNAT routes are always valid.
1598                 *
1599                 * The proxy arp feature has been extended to allow ARP
1600                 * replies back out the same interface, to support
1601                 * Private VLAN switch technologies. See arp.c.
1602                 */
1603                if (out_dev == in_dev &&
1604                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1605                        err = -EINVAL;
1606                        goto cleanup;
1607                }
1608        }
1609
1610        fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1611        if (do_cache) {
1612                if (fnhe)
1613                        rth = rcu_dereference(fnhe->fnhe_rth_input);
1614                else
1615                        rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1616
1617                if (rt_cache_valid(rth)) {
1618                        skb_dst_set_noref(skb, &rth->dst);
1619                        goto out;
1620                }
1621        }
1622
1623        rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1624                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
1625                           IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1626        if (!rth) {
1627                err = -ENOBUFS;
1628                goto cleanup;
1629        }
1630
1631        rth->rt_is_input = 1;
1632        if (res->table)
1633                rth->rt_table_id = res->table->tb_id;
1634        RT_CACHE_STAT_INC(in_slow_tot);
1635
1636        rth->dst.input = ip_forward;
1637
1638        rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1639        if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1640                rth->dst.lwtstate->orig_output = rth->dst.output;
1641                rth->dst.output = lwtunnel_output;
1642        }
1643        if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1644                rth->dst.lwtstate->orig_input = rth->dst.input;
1645                rth->dst.input = lwtunnel_input;
1646        }
1647        skb_dst_set(skb, &rth->dst);
1648out:
1649        err = 0;
1650 cleanup:
1651        return err;
1652}
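
/* Caching in __mkroute_input() above, in short: a freshly built input
 * route is remembered (inside rt_set_nexthop()) either in the nexthop
 * exception for daddr (fnhe_rth_input, when a PMTU/redirect exception
 * exists) or in the nexthop itself (nh_rth_input), but only while
 * do_cache held, i.e. the lookup produced a fib_info and
 * fib_validate_source() returned no itag. A route carrying a source
 * classid must stay uncached, since the tag is per-source while the
 * cache slot is per-nexthop.
 */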
1653
1654#ifdef CONFIG_IP_ROUTE_MULTIPATH
1655
1656/* To make ICMP error packets follow the same multipath route as the flow
1657 * they report on, hash the inner (quoted) IP addresses in reverse order.
1658 */
1659static int ip_multipath_icmp_hash(struct sk_buff *skb)
1660{
1661        const struct iphdr *outer_iph = ip_hdr(skb);
1662        struct icmphdr _icmph;
1663        const struct icmphdr *icmph;
1664        struct iphdr _inner_iph;
1665        const struct iphdr *inner_iph;
1666
1667        if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1668                goto standard_hash;
1669
1670        icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1671                                   &_icmph);
1672        if (!icmph)
1673                goto standard_hash;
1674
1675        if (icmph->type != ICMP_DEST_UNREACH &&
1676            icmph->type != ICMP_REDIRECT &&
1677            icmph->type != ICMP_TIME_EXCEEDED &&
1678            icmph->type != ICMP_PARAMETERPROB) {
1679                goto standard_hash;
1680        }
1681
1682        inner_iph = skb_header_pointer(skb,
1683                                       outer_iph->ihl * 4 + sizeof(_icmph),
1684                                       sizeof(_inner_iph), &_inner_iph);
1685        if (!inner_iph)
1686                goto standard_hash;
1687
1688        return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
1689
1690standard_hash:
1691        return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
1692}
1693
1694#endif /* CONFIG_IP_ROUTE_MULTIPATH */
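
/* A worked example of the reversal above, with illustrative addresses:
 * a flow A -> B is normally hashed as fib_multipath_hash(A, B). A router
 * on the path emits an ICMP error back towards A; its outer header reads
 * R -> A, but the quoted inner header is still A -> B. Hashing the inner
 * addresses in reverse order, fib_multipath_hash(B, A), matches the hash
 * of the flow's own B -> A packets, so the error takes the same nexthop
 * as the traffic it reports on rather than one picked by the unrelated
 * R -> A addresses.
 */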
1695
1696static int ip_mkroute_input(struct sk_buff *skb,
1697                            struct fib_result *res,
1698                            const struct flowi4 *fl4,
1699                            struct in_device *in_dev,
1700                            __be32 daddr, __be32 saddr, u32 tos)
1701{
1702#ifdef CONFIG_IP_ROUTE_MULTIPATH
1703        if (res->fi && res->fi->fib_nhs > 1) {
1704                int h;
1705
1706                if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
1707                        h = ip_multipath_icmp_hash(skb);
1708                else
1709                        h = fib_multipath_hash(saddr, daddr);
1710                fib_select_multipath(res, h);
1711        }
1712#endif
1713
1714        /* create a routing cache entry */
1715        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1716}
1717
1718/*
1719 *      NOTE. We drop all packets that have a local source
1720 *      address, because every properly looped-back packet
1721 *      must already have the correct destination attached by the output routine.
1722 *
1723 *      This approach solves two big problems:
1724 *      1. Non-simplex devices are handled properly.
1725 *      2. IP spoofing attempts are filtered with a 100% guarantee.
1726 *      Called with rcu_read_lock().
1727 */
1728
1729static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1730                               u8 tos, struct net_device *dev)
1731{
1732        struct fib_result res;
1733        struct in_device *in_dev = __in_dev_get_rcu(dev);
1734        struct ip_tunnel_info *tun_info;
1735        struct flowi4   fl4;
1736        unsigned int    flags = 0;
1737        u32             itag = 0;
1738        struct rtable   *rth;
1739        int             err = -EINVAL;
1740        struct net    *net = dev_net(dev);
1741        bool do_cache;
1742
1743        /* IP on this device is disabled. */
1744
1745        if (!in_dev)
1746                goto out;
1747
1748        /* Check for the weirdest martians, which may go undetected
1749           by fib_lookup.
1750         */
1751
1752        tun_info = skb_tunnel_info(skb);
1753        if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1754                fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1755        else
1756                fl4.flowi4_tun_key.tun_id = 0;
1757        skb_dst_drop(skb);
1758
1759        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1760                goto martian_source;
1761
1762        res.fi = NULL;
1763        res.table = NULL;
1764        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1765                goto brd_input;
1766
1767        /* Accept zero addresses only for limited broadcast;
1768         * I do not even know whether to fix this or not. Waiting for complaints :-)
1769         */
1770        if (ipv4_is_zeronet(saddr))
1771                goto martian_source;
1772
1773        if (ipv4_is_zeronet(daddr))
1774                goto martian_destination;
1775
1776        /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
1777         * and calls it at most once, if daddr and/or saddr is a loopback address.
1778         */
1779        if (ipv4_is_loopback(daddr)) {
1780                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1781                        goto martian_destination;
1782        } else if (ipv4_is_loopback(saddr)) {
1783                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1784                        goto martian_source;
1785        }
1786
1787        /*
1788         *      Now we are ready to route packet.
1789         */
1790        fl4.flowi4_oif = 0;
1791        fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev);
1792        fl4.flowi4_mark = skb->mark;
1793        fl4.flowi4_tos = tos;
1794        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1795        fl4.flowi4_flags = 0;
1796        fl4.daddr = daddr;
1797        fl4.saddr = saddr;
1798        err = fib_lookup(net, &fl4, &res, 0);
1799        if (err != 0) {
1800                if (!IN_DEV_FORWARD(in_dev))
1801                        err = -EHOSTUNREACH;
1802                goto no_route;
1803        }
1804
1805        if (res.type == RTN_BROADCAST)
1806                goto brd_input;
1807
1808        if (res.type == RTN_LOCAL) {
1809                err = fib_validate_source(skb, saddr, daddr, tos,
1810                                          0, dev, in_dev, &itag);
1811                if (err < 0)
1812                        goto martian_source;
1813                goto local_input;
1814        }
1815
1816        if (!IN_DEV_FORWARD(in_dev)) {
1817                err = -EHOSTUNREACH;
1818                goto no_route;
1819        }
1820        if (res.type != RTN_UNICAST)
1821                goto martian_destination;
1822
1823        err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1824out:    return err;
1825
1826brd_input:
1827        if (skb->protocol != htons(ETH_P_IP))
1828                goto e_inval;
1829
1830        if (!ipv4_is_zeronet(saddr)) {
1831                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1832                                          in_dev, &itag);
1833                if (err < 0)
1834                        goto martian_source;
1835        }
1836        flags |= RTCF_BROADCAST;
1837        res.type = RTN_BROADCAST;
1838        RT_CACHE_STAT_INC(in_brd);
1839
1840local_input:
1841        do_cache = false;
1842        if (res.fi) {
1843                if (!itag) {
1844                        rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1845                        if (rt_cache_valid(rth)) {
1846                                skb_dst_set_noref(skb, &rth->dst);
1847                                err = 0;
1848                                goto out;
1849                        }
1850                        do_cache = true;
1851                }
1852        }
1853
1854        rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
1855                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1856        if (!rth)
1857                goto e_nobufs;
1858
1859        rth->dst.output = ip_rt_bug;
1860#ifdef CONFIG_IP_ROUTE_CLASSID
1861        rth->dst.tclassid = itag;
1862#endif
1863        rth->rt_is_input = 1;
1864        if (res.table)
1865                rth->rt_table_id = res.table->tb_id;
1866
1867        RT_CACHE_STAT_INC(in_slow_tot);
1868        if (res.type == RTN_UNREACHABLE) {
1869                rth->dst.input = ip_error;
1870                rth->dst.error = -err;
1871                rth->rt_flags   &= ~RTCF_LOCAL;
1872        }
1873        if (do_cache) {
1874                if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1875                        rth->dst.flags |= DST_NOCACHE;
1876                        rt_add_uncached_list(rth);
1877                }
1878        }
1879        skb_dst_set(skb, &rth->dst);
1880        err = 0;
1881        goto out;
1882
1883no_route:
1884        RT_CACHE_STAT_INC(in_no_route);
1885        res.type = RTN_UNREACHABLE;
1886        res.fi = NULL;
1887        res.table = NULL;
1888        goto local_input;
1889
1890        /*
1891         *      Do not cache martian addresses: they should be logged (RFC1812)
1892         */
1893martian_destination:
1894        RT_CACHE_STAT_INC(in_martian_dst);
1895#ifdef CONFIG_IP_ROUTE_VERBOSE
1896        if (IN_DEV_LOG_MARTIANS(in_dev))
1897                net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1898                                     &daddr, &saddr, dev->name);
1899#endif
1900
1901e_inval:
1902        err = -EINVAL;
1903        goto out;
1904
1905e_nobufs:
1906        err = -ENOBUFS;
1907        goto out;
1908
1909martian_source:
1910        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1911        goto out;
1912}
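
/* ip_route_input_slow() above, condensed: after the martian screening,
 * the fib_lookup() verdict steers everything. RTN_BROADCAST lands in
 * brd_input, RTN_LOCAL in local_input (a loopback_dev dst whose input is
 * ip_local_deliver), RTN_UNICAST becomes a forwarding entry via
 * ip_mkroute_input(), and anything else is a martian destination.
 * Lookup failures turn into RTN_UNREACHABLE local routes with dst.input
 * set to ip_error, so the unreachable handling runs only when the packet
 * is actually delivered into the stack.
 */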
1913
1914int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1915                         u8 tos, struct net_device *dev)
1916{
1917        int res;
1918
1919        rcu_read_lock();
1920
1921        /* Multicast recognition logic was moved from the route cache to here.
1922           The problem was that too many Ethernet cards have broken/missing
1923           hardware multicast filters :-( As a result, a host on a multicast
1924           network acquires a lot of useless route cache entries, e.g. for
1925           SDR messages from all over the world. Now we try to get rid of them.
1926           Really, provided the software IP multicast filter is organized
1927           reasonably (at least, hashed), it does not result in a slowdown
1928           compared with route cache reject entries.
1929           Note that multicast routers are not affected, because a
1930           route cache entry is created eventually.
1931         */
1932        if (ipv4_is_multicast(daddr)) {
1933                struct in_device *in_dev = __in_dev_get_rcu(dev);
1934
1935                if (in_dev) {
1936                        int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1937                                                  ip_hdr(skb)->protocol);
1938                        if (our
1939#ifdef CONFIG_IP_MROUTE
1940                                ||
1941                            (!ipv4_is_local_multicast(daddr) &&
1942                             IN_DEV_MFORWARD(in_dev))
1943#endif
1944                           ) {
1945                                int res = ip_route_input_mc(skb, daddr, saddr,
1946                                                            tos, dev, our);
1947                                rcu_read_unlock();
1948                                return res;
1949                        }
1950                }
1951                rcu_read_unlock();
1952                return -EINVAL;
1953        }
1954        res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1955        rcu_read_unlock();
1956        return res;
1957}
1958EXPORT_SYMBOL(ip_route_input_noref);
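
/* A minimal sketch of the usual caller pattern for the export above, on
 * the receive path (compare ip_rcv_finish()); iph and dev stand for the
 * packet's IP header and ingress device:
 *
 *	int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				       iph->tos, dev);
 *	if (unlikely(err))
 *		goto drop;
 *
 * The "_noref" contract matters: the dst may have been attached with
 * skb_dst_set_noref(), i.e. without taking a reference, so it is only
 * valid inside the caller's RCU read-side section.
 */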
1959
1960/* called with rcu_read_lock() */
1961static struct rtable *__mkroute_output(const struct fib_result *res,
1962                                       const struct flowi4 *fl4, int orig_oif,
1963                                       struct net_device *dev_out,
1964                                       unsigned int flags)
1965{
1966        struct fib_info *fi = res->fi;
1967        struct fib_nh_exception *fnhe;
1968        struct in_device *in_dev;
1969        u16 type = res->type;
1970        struct rtable *rth;
1971        bool do_cache;
1972
1973        in_dev = __in_dev_get_rcu(dev_out);
1974        if (!in_dev)
1975                return ERR_PTR(-EINVAL);
1976
1977        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1978                if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1979                        return ERR_PTR(-EINVAL);
1980
1981        if (ipv4_is_lbcast(fl4->daddr))
1982                type = RTN_BROADCAST;
1983        else if (ipv4_is_multicast(fl4->daddr))
1984                type = RTN_MULTICAST;
1985        else if (ipv4_is_zeronet(fl4->daddr))
1986                return ERR_PTR(-EINVAL);
1987
1988        if (dev_out->flags & IFF_LOOPBACK)
1989                flags |= RTCF_LOCAL;
1990
1991        do_cache = true;
1992        if (type == RTN_BROADCAST) {
1993                flags |= RTCF_BROADCAST | RTCF_LOCAL;
1994                fi = NULL;
1995        } else if (type == RTN_MULTICAST) {
1996                flags |= RTCF_MULTICAST | RTCF_LOCAL;
1997                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1998                                     fl4->flowi4_proto))
1999                        flags &= ~RTCF_LOCAL;
2000                else
2001                        do_cache = false;
2002                /* If a multicast route does not exist, use
2003                 * the default one, but do not use a gateway in this case.
2004                 * Yes, it is a hack.
2005                 */
2006                if (fi && res->prefixlen < 4)
2007                        fi = NULL;
2008        }
2009
2010        fnhe = NULL;
2011        do_cache &= fi != NULL;
2012        if (do_cache) {
2013                struct rtable __rcu **prth;
2014                struct fib_nh *nh = &FIB_RES_NH(*res);
2015
2016                fnhe = find_exception(nh, fl4->daddr);
2017                if (fnhe)
2018                        prth = &fnhe->fnhe_rth_output;
2019                else {
2020                        if (unlikely(fl4->flowi4_flags &
2021                                     FLOWI_FLAG_KNOWN_NH &&
2022                                     !(nh->nh_gw &&
2023                                       nh->nh_scope == RT_SCOPE_LINK))) {
2024                                do_cache = false;
2025                                goto add;
2026                        }
2027                        prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2028                }
2029                rth = rcu_dereference(*prth);
2030                if (rt_cache_valid(rth)) {
2031                        dst_hold(&rth->dst);
2032                        return rth;
2033                }
2034        }
2035
2036add:
2037        rth = rt_dst_alloc(dev_out, flags, type,
2038                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
2039                           IN_DEV_CONF_GET(in_dev, NOXFRM),
2040                           do_cache);
2041        if (!rth)
2042                return ERR_PTR(-ENOBUFS);
2043
2044        rth->rt_iif     = orig_oif ? : 0;
2045        if (res->table)
2046                rth->rt_table_id = res->table->tb_id;
2047
2048        RT_CACHE_STAT_INC(out_slow_tot);
2049
2050        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2051                if (flags & RTCF_LOCAL &&
2052                    !(dev_out->flags & IFF_LOOPBACK)) {
2053                        rth->dst.output = ip_mc_output;
2054                        RT_CACHE_STAT_INC(out_slow_mc);
2055                }
2056#ifdef CONFIG_IP_MROUTE
2057                if (type == RTN_MULTICAST) {
2058                        if (IN_DEV_MFORWARD(in_dev) &&
2059                            !ipv4_is_local_multicast(fl4->daddr)) {
2060                                rth->dst.input = ip_mr_input;
2061                                rth->dst.output = ip_mc_output;
2062                        }
2063                }
2064#endif
2065        }
2066
2067        rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2068        if (lwtunnel_output_redirect(rth->dst.lwtstate))
2069                rth->dst.output = lwtunnel_output;
2070
2071        return rth;
2072}
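
/* Output-side caching mirrors the input side, with one twist visible
 * above: when no exception applies, the cached dst lives in a per-cpu
 * slot (nh->nh_pcpu_rth_output) rather than in a single per-nexthop
 * field, keeping the hot output path free of cross-CPU cacheline
 * bouncing. FLOWI_FLAG_KNOWN_NH without an on-link gateway skips the
 * cache entirely: such a route gets rt_gateway pointed at the flow's
 * daddr (see rt_set_nexthop()) and is therefore not shareable.
 */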
2073
2074/*
2075 * Major route resolver routine.
2076 */
2077
2078struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2079                                          int mp_hash)
2080{
2081        struct net_device *dev_out = NULL;
2082        __u8 tos = RT_FL_TOS(fl4);
2083        unsigned int flags = 0;
2084        struct fib_result res;
2085        struct rtable *rth;
2086        int orig_oif;
2087        int err = -ENETUNREACH;
2088
2089        res.tclassid    = 0;
2090        res.fi          = NULL;
2091        res.table       = NULL;
2092
2093        orig_oif = fl4->flowi4_oif;
2094
2095        fl4->flowi4_iif = LOOPBACK_IFINDEX;
2096        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2097        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2098                         RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2099
2100        rcu_read_lock();
2101        if (fl4->saddr) {
2102                rth = ERR_PTR(-EINVAL);
2103                if (ipv4_is_multicast(fl4->saddr) ||
2104                    ipv4_is_lbcast(fl4->saddr) ||
2105                    ipv4_is_zeronet(fl4->saddr))
2106                        goto out;
2107
2108                /* I removed the check for oif == dev_out->oif here.
2109                   It was wrong for two reasons:
2110                   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2111                      is assigned to multiple interfaces.
2112                   2. Moreover, we are allowed to send packets with the saddr
2113                      of another iface. --ANK
2114                 */
2115
2116                if (fl4->flowi4_oif == 0 &&
2117                    (ipv4_is_multicast(fl4->daddr) ||
2118                     ipv4_is_lbcast(fl4->daddr))) {
2119                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2120                        dev_out = __ip_dev_find(net, fl4->saddr, false);
2121                        if (!dev_out)
2122                                goto out;
2123
2124                        /* Special hack: the user can direct multicasts
2125                           and limited broadcast via the necessary interface
2126                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2127                           This hack is not just for fun, it allows
2128                           vic, vat and friends to work.
2129                           They bind a socket to loopback, set the ttl to zero
2130                           and expect that it will work.
2131                           From the viewpoint of the routing cache they are broken,
2132                           because we are not allowed to build a multicast path
2133                           with a loopback source addr (look, the routing cache
2134                           cannot know that the ttl is zero, so that the packet
2135                           will not leave this host and the route is valid).
2136                           Luckily, this hack is a good workaround.
2137                         */
2138
2139                        fl4->flowi4_oif = dev_out->ifindex;
2140                        goto make_route;
2141                }
2142
2143                if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2144                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2145                        if (!__ip_dev_find(net, fl4->saddr, false))
2146                                goto out;
2147                }
2148        }
2149
2150
2151        if (fl4->flowi4_oif) {
2152                dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2153                rth = ERR_PTR(-ENODEV);
2154                if (!dev_out)
2155                        goto out;
2156
2157                /* RACE: Check return value of inet_select_addr instead. */
2158                if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2159                        rth = ERR_PTR(-ENETUNREACH);
2160                        goto out;
2161                }
2162                if (ipv4_is_local_multicast(fl4->daddr) ||
2163                    ipv4_is_lbcast(fl4->daddr) ||
2164                    fl4->flowi4_proto == IPPROTO_IGMP) {
2165                        if (!fl4->saddr)
2166                                fl4->saddr = inet_select_addr(dev_out, 0,
2167                                                              RT_SCOPE_LINK);
2168                        goto make_route;
2169                }
2170                if (!fl4->saddr) {
2171                        if (ipv4_is_multicast(fl4->daddr))
2172                                fl4->saddr = inet_select_addr(dev_out, 0,
2173                                                              fl4->flowi4_scope);
2174                        else if (!fl4->daddr)
2175                                fl4->saddr = inet_select_addr(dev_out, 0,
2176                                                              RT_SCOPE_HOST);
2177                }
2178
2179                rth = l3mdev_get_rtable(dev_out, fl4);
2180                if (rth)
2181                        goto out;
2182        }
2183
2184        if (!fl4->daddr) {
2185                fl4->daddr = fl4->saddr;
2186                if (!fl4->daddr)
2187                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2188                dev_out = net->loopback_dev;
2189                fl4->flowi4_oif = LOOPBACK_IFINDEX;
2190                res.type = RTN_LOCAL;
2191                flags |= RTCF_LOCAL;
2192                goto make_route;
2193        }
2194
2195        err = fib_lookup(net, fl4, &res, 0);
2196        if (err) {
2197                res.fi = NULL;
2198                res.table = NULL;
2199                if (fl4->flowi4_oif &&
2200                    !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
2201                        /* Apparently, the routing tables are wrong. Assume
2202                           that the destination is on-link.
2203
2204                           WHY? DW.
2205                           Because we are allowed to send to an iface
2206                           even if it has NO routes and NO assigned
2207                           addresses. When oif is specified, the routing
2208                           tables are looked up with only one purpose:
2209                           to catch whether the destination is gatewayed,
2210                           rather than direct. Moreover, if MSG_DONTROUTE is
2211                           set, we send the packet, ignoring both the routing
2212                           tables and the ifaddr state. --ANK
2213
2214
2215                           We could do this even when oif is unknown,
2216                           as is likely for IPv6, but we do not.
2217                         */
2218
2219                        if (fl4->saddr == 0)
2220                                fl4->saddr = inet_select_addr(dev_out, 0,
2221                                                              RT_SCOPE_LINK);
2222                        res.type = RTN_UNICAST;
2223                        goto make_route;
2224                }
2225                rth = ERR_PTR(err);
2226                goto out;
2227        }
2228
2229        if (res.type == RTN_LOCAL) {
2230                if (!fl4->saddr) {
2231                        if (res.fi->fib_prefsrc)
2232                                fl4->saddr = res.fi->fib_prefsrc;
2233                        else
2234                                fl4->saddr = fl4->daddr;
2235                }
2236                dev_out = net->loopback_dev;
2237                fl4->flowi4_oif = dev_out->ifindex;
2238                flags |= RTCF_LOCAL;
2239                goto make_route;
2240        }
2241
2242        fib_select_path(net, &res, fl4, mp_hash);
2243
2244        dev_out = FIB_RES_DEV(res);
2245        fl4->flowi4_oif = dev_out->ifindex;
2246
2247
2248make_route:
2249        rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2250
2251out:
2252        rcu_read_unlock();
2253        return rth;
2254}
2255EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
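
/* Note for callers of the resolver above: it never returns NULL. The
 * result is either a valid rtable or an ERR_PTR() carrying -EINVAL,
 * -ENODEV, -ENETUNREACH, etc., so the idiomatic check is:
 *
 *	rt = __ip_route_output_key(net, &fl4);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */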
2256
2257static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2258{
2259        return NULL;
2260}
2261
2262static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2263{
2264        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2265
2266        return mtu ? : dst->dev->mtu;
2267}
2268
2269static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2270                                          struct sk_buff *skb, u32 mtu)
2271{
2272}
2273
2274static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2275                                       struct sk_buff *skb)
2276{
2277}
2278
2279static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2280                                          unsigned long old)
2281{
2282        return NULL;
2283}
2284
2285static struct dst_ops ipv4_dst_blackhole_ops = {
2286        .family                 =       AF_INET,
2287        .check                  =       ipv4_blackhole_dst_check,
2288        .mtu                    =       ipv4_blackhole_mtu,
2289        .default_advmss         =       ipv4_default_advmss,
2290        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2291        .redirect               =       ipv4_rt_blackhole_redirect,
2292        .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2293        .neigh_lookup           =       ipv4_neigh_lookup,
2294};
2295
2296struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2297{
2298        struct rtable *ort = (struct rtable *) dst_orig;
2299        struct rtable *rt;
2300
2301        rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2302        if (rt) {
2303                struct dst_entry *new = &rt->dst;
2304
2305                new->__use = 1;
2306                new->input = dst_discard;
2307                new->output = dst_discard_out;
2308
2309                new->dev = ort->dst.dev;
2310                if (new->dev)
2311                        dev_hold(new->dev);
2312
2313                rt->rt_is_input = ort->rt_is_input;
2314                rt->rt_iif = ort->rt_iif;
2315                rt->rt_pmtu = ort->rt_pmtu;
2316
2317                rt->rt_genid = rt_genid_ipv4(net);
2318                rt->rt_flags = ort->rt_flags;
2319                rt->rt_type = ort->rt_type;
2320                rt->rt_gateway = ort->rt_gateway;
2321                rt->rt_uses_gateway = ort->rt_uses_gateway;
2322
2323                INIT_LIST_HEAD(&rt->rt_uncached);
2324                dst_free(new);
2325        }
2326
2327        dst_release(dst_orig);
2328
2329        return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2330}
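
/* The conversion above is for when a dst must keep existing but stop
 * carrying traffic, notably in xfrm when packets have to be discarded
 * while IPsec states are still being resolved. Accordingly, every
 * ipv4_dst_blackhole_ops handler is a no-op: ->check always reports the
 * dst as invalid, update_pmtu/redirect do nothing, and the copy's
 * input/output hooks are pinned to dst_discard/dst_discard_out.
 */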
2331
2332struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2333                                    const struct sock *sk)
2334{
2335        struct rtable *rt = __ip_route_output_key(net, flp4);
2336
2337        if (IS_ERR(rt))
2338                return rt;
2339
2340        if (flp4->flowi4_proto)
2341                rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2342                                                        flowi4_to_flowi(flp4),
2343                                                        sk, 0);
2344
2345        return rt;
2346}
2347EXPORT_SYMBOL_GPL(ip_route_output_flow);
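
/* A minimal sketch of how a connection-oriented caller might drive the
 * lookup above; the flowi4 setup follows the same pattern as
 * inet_rtm_getroute() below and the field values are placeholders:
 *
 *	struct flowi4 fl4;
 *	struct rtable *rt;
 *
 *	memset(&fl4, 0, sizeof(fl4));
 *	fl4.daddr = daddr;
 *	fl4.saddr = saddr;
 *	fl4.flowi4_proto = IPPROTO_TCP;
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *
 * Setting flowi4_proto is what opts the result into the
 * xfrm_lookup_route() pass, so transforms apply to the returned dst.
 */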
2348
2349static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2350                        struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2351                        u32 seq, int event, int nowait, unsigned int flags)
2352{
2353        struct rtable *rt = skb_rtable(skb);
2354        struct rtmsg *r;
2355        struct nlmsghdr *nlh;
2356        unsigned long expires = 0;
2357        u32 error;
2358        u32 metrics[RTAX_MAX];
2359
2360        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2361        if (!nlh)
2362                return -EMSGSIZE;
2363
2364        r = nlmsg_data(nlh);
2365        r->rtm_family    = AF_INET;
2366        r->rtm_dst_len  = 32;
2367        r->rtm_src_len  = 0;
2368        r->rtm_tos      = fl4->flowi4_tos;
2369        r->rtm_table    = table_id;
2370        if (nla_put_u32(skb, RTA_TABLE, table_id))
2371                goto nla_put_failure;
2372        r->rtm_type     = rt->rt_type;
2373        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2374        r->rtm_protocol = RTPROT_UNSPEC;
2375        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2376        if (rt->rt_flags & RTCF_NOTIFY)
2377                r->rtm_flags |= RTM_F_NOTIFY;
2378        if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2379                r->rtm_flags |= RTCF_DOREDIRECT;
2380
2381        if (nla_put_in_addr(skb, RTA_DST, dst))
2382                goto nla_put_failure;
2383        if (src) {
2384                r->rtm_src_len = 32;
2385                if (nla_put_in_addr(skb, RTA_SRC, src))
2386                        goto nla_put_failure;
2387        }
2388        if (rt->dst.dev &&
2389            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2390                goto nla_put_failure;
2391#ifdef CONFIG_IP_ROUTE_CLASSID
2392        if (rt->dst.tclassid &&
2393            nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2394                goto nla_put_failure;
2395#endif
2396        if (!rt_is_input_route(rt) &&
2397            fl4->saddr != src) {
2398                if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2399                        goto nla_put_failure;
2400        }
2401        if (rt->rt_uses_gateway &&
2402            nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2403                goto nla_put_failure;
2404
2405        expires = rt->dst.expires;
2406        if (expires) {
2407                unsigned long now = jiffies;
2408
2409                if (time_before(now, expires))
2410                        expires -= now;
2411                else
2412                        expires = 0;
2413        }
2414
2415        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2416        if (rt->rt_pmtu && expires)
2417                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2418        if (rtnetlink_put_metrics(skb, metrics) < 0)
2419                goto nla_put_failure;
2420
2421        if (fl4->flowi4_mark &&
2422            nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2423                goto nla_put_failure;
2424
2425        error = rt->dst.error;
2426
2427        if (rt_is_input_route(rt)) {
2428#ifdef CONFIG_IP_MROUTE
2429                if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2430                    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2431                        int err = ipmr_get_route(net, skb,
2432                                                 fl4->saddr, fl4->daddr,
2433                                                 r, nowait);
2434                        if (err <= 0) {
2435                                if (!nowait) {
2436                                        if (err == 0)
2437                                                return 0;
2438                                        goto nla_put_failure;
2439                                } else {
2440                                        if (err == -EMSGSIZE)
2441                                                goto nla_put_failure;
2442                                        error = err;
2443                                }
2444                        }
2445                } else
2446#endif
2447                        if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2448                                goto nla_put_failure;
2449        }
2450
2451        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2452                goto nla_put_failure;
2453
2454        nlmsg_end(skb, nlh);
2455        return 0;
2456
2457nla_put_failure:
2458        nlmsg_cancel(skb, nlh);
2459        return -EMSGSIZE;
2460}
2461
2462static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2463{
2464        struct net *net = sock_net(in_skb->sk);
2465        struct rtmsg *rtm;
2466        struct nlattr *tb[RTA_MAX+1];
2467        struct rtable *rt = NULL;
2468        struct flowi4 fl4;
2469        __be32 dst = 0;
2470        __be32 src = 0;
2471        u32 iif;
2472        int err;
2473        int mark;
2474        struct sk_buff *skb;
2475        u32 table_id = RT_TABLE_MAIN;
2476
2477        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2478        if (err < 0)
2479                goto errout;
2480
2481        rtm = nlmsg_data(nlh);
2482
2483        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2484        if (!skb) {
2485                err = -ENOBUFS;
2486                goto errout;
2487        }
2488
2489        /* Reserve room for dummy headers; this skb can pass
2490           through a good chunk of the routing engine.
2491         */
2492        skb_reset_mac_header(skb);
2493        skb_reset_network_header(skb);
2494
2495        /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2496        ip_hdr(skb)->protocol = IPPROTO_ICMP;
2497        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2498
2499        src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2500        dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2501        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2502        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2503
2504        memset(&fl4, 0, sizeof(fl4));
2505        fl4.daddr = dst;
2506        fl4.saddr = src;
2507        fl4.flowi4_tos = rtm->rtm_tos;
2508        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2509        fl4.flowi4_mark = mark;
2510
2511        if (netif_index_is_l3_master(net, fl4.flowi4_oif))
2512                fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF;
2513
2514        if (iif) {
2515                struct net_device *dev;
2516
2517                dev = __dev_get_by_index(net, iif);
2518                if (!dev) {
2519                        err = -ENODEV;
2520                        goto errout_free;
2521                }
2522
2523                skb->protocol   = htons(ETH_P_IP);
2524                skb->dev        = dev;
2525                skb->mark       = mark;
2526                local_bh_disable();
2527                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2528                local_bh_enable();
2529
2530                rt = skb_rtable(skb);
2531                if (err == 0 && rt->dst.error)
2532                        err = -rt->dst.error;
2533        } else {
2534                rt = ip_route_output_key(net, &fl4);
2535
2536                err = 0;
2537                if (IS_ERR(rt))
2538                        err = PTR_ERR(rt);
2539        }
2540
2541        if (err)
2542                goto errout_free;
2543
2544        skb_dst_set(skb, &rt->dst);
2545        if (rtm->rtm_flags & RTM_F_NOTIFY)
2546                rt->rt_flags |= RTCF_NOTIFY;
2547
2548        if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2549                table_id = rt->rt_table_id;
2550
2551        err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2552                           NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2553                           RTM_NEWROUTE, 0, 0);
2554        if (err < 0)
2555                goto errout_free;
2556
2557        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2558errout:
2559        return err;
2560
2561errout_free:
2562        kfree_skb(skb);
2563        goto errout;
2564}
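
/* inet_rtm_getroute() above is the handler behind RTM_GETROUTE (see the
 * rtnl_register() call in ip_rt_init() below). From userspace, e.g.:
 *
 *	$ ip route get 198.51.100.1
 *
 * issues such a request, and the answer is a single RTM_NEWROUTE message
 * built by rt_fill_info(). When RTA_IIF is supplied, the kernel simulates
 * an *input* lookup on that device using the dummy ICMP skb constructed
 * here, instead of an output lookup.
 */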
2565
2566void ip_rt_multicast_event(struct in_device *in_dev)
2567{
2568        rt_cache_flush(dev_net(in_dev->dev));
2569}
2570
2571#ifdef CONFIG_SYSCTL
2572static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2573static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
2574static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2575static int ip_rt_gc_elasticity __read_mostly    = 8;
2576
2577static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2578                                        void __user *buffer,
2579                                        size_t *lenp, loff_t *ppos)
2580{
2581        struct net *net = (struct net *)__ctl->extra1;
2582
2583        if (write) {
2584                rt_cache_flush(net);
2585                fnhe_genid_bump(net);
2586                return 0;
2587        }
2588
2589        return -EINVAL;
2590}
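
/* The flush handler above is write-only (mode 0200 in
 * ipv4_route_flush_table below); any write invalidates all cached routes
 * and nexthop exceptions by bumping the generation counters. From
 * userspace:
 *
 *	# echo 1 > /proc/sys/net/ipv4/route/flush
 */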
2591
2592static struct ctl_table ipv4_route_table[] = {
2593        {
2594                .procname       = "gc_thresh",
2595                .data           = &ipv4_dst_ops.gc_thresh,
2596                .maxlen         = sizeof(int),
2597                .mode           = 0644,
2598                .proc_handler   = proc_dointvec,
2599        },
2600        {
2601                .procname       = "max_size",
2602                .data           = &ip_rt_max_size,
2603                .maxlen         = sizeof(int),
2604                .mode           = 0644,
2605                .proc_handler   = proc_dointvec,
2606        },
2607        {
2608                /*  Deprecated. Use gc_min_interval_ms */
2609
2610                .procname       = "gc_min_interval",
2611                .data           = &ip_rt_gc_min_interval,
2612                .maxlen         = sizeof(int),
2613                .mode           = 0644,
2614                .proc_handler   = proc_dointvec_jiffies,
2615        },
2616        {
2617                .procname       = "gc_min_interval_ms",
2618                .data           = &ip_rt_gc_min_interval,
2619                .maxlen         = sizeof(int),
2620                .mode           = 0644,
2621                .proc_handler   = proc_dointvec_ms_jiffies,
2622        },
2623        {
2624                .procname       = "gc_timeout",
2625                .data           = &ip_rt_gc_timeout,
2626                .maxlen         = sizeof(int),
2627                .mode           = 0644,
2628                .proc_handler   = proc_dointvec_jiffies,
2629        },
2630        {
2631                .procname       = "gc_interval",
2632                .data           = &ip_rt_gc_interval,
2633                .maxlen         = sizeof(int),
2634                .mode           = 0644,
2635                .proc_handler   = proc_dointvec_jiffies,
2636        },
2637        {
2638                .procname       = "redirect_load",
2639                .data           = &ip_rt_redirect_load,
2640                .maxlen         = sizeof(int),
2641                .mode           = 0644,
2642                .proc_handler   = proc_dointvec,
2643        },
2644        {
2645                .procname       = "redirect_number",
2646                .data           = &ip_rt_redirect_number,
2647                .maxlen         = sizeof(int),
2648                .mode           = 0644,
2649                .proc_handler   = proc_dointvec,
2650        },
2651        {
2652                .procname       = "redirect_silence",
2653                .data           = &ip_rt_redirect_silence,
2654                .maxlen         = sizeof(int),
2655                .mode           = 0644,
2656                .proc_handler   = proc_dointvec,
2657        },
2658        {
2659                .procname       = "error_cost",
2660                .data           = &ip_rt_error_cost,
2661                .maxlen         = sizeof(int),
2662                .mode           = 0644,
2663                .proc_handler   = proc_dointvec,
2664        },
2665        {
2666                .procname       = "error_burst",
2667                .data           = &ip_rt_error_burst,
2668                .maxlen         = sizeof(int),
2669                .mode           = 0644,
2670                .proc_handler   = proc_dointvec,
2671        },
2672        {
2673                .procname       = "gc_elasticity",
2674                .data           = &ip_rt_gc_elasticity,
2675                .maxlen         = sizeof(int),
2676                .mode           = 0644,
2677                .proc_handler   = proc_dointvec,
2678        },
2679        {
2680                .procname       = "mtu_expires",
2681                .data           = &ip_rt_mtu_expires,
2682                .maxlen         = sizeof(int),
2683                .mode           = 0644,
2684                .proc_handler   = proc_dointvec_jiffies,
2685        },
2686        {
2687                .procname       = "min_pmtu",
2688                .data           = &ip_rt_min_pmtu,
2689                .maxlen         = sizeof(int),
2690                .mode           = 0644,
2691                .proc_handler   = proc_dointvec,
2692        },
2693        {
2694                .procname       = "min_adv_mss",
2695                .data           = &ip_rt_min_advmss,
2696                .maxlen         = sizeof(int),
2697                .mode           = 0644,
2698                .proc_handler   = proc_dointvec,
2699        },
2700        { }
2701};
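
/* The table above is registered under /proc/sys/net/ipv4/route (see
 * ip_static_sysctl_init() at the bottom of this file), so each entry is
 * a global knob readable with, e.g.:
 *
 *	$ sysctl net.ipv4.route.min_pmtu
 */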
2702
2703static struct ctl_table ipv4_route_flush_table[] = {
2704        {
2705                .procname       = "flush",
2706                .maxlen         = sizeof(int),
2707                .mode           = 0200,
2708                .proc_handler   = ipv4_sysctl_rtcache_flush,
2709        },
2710        { },
2711};
2712
2713static __net_init int sysctl_route_net_init(struct net *net)
2714{
2715        struct ctl_table *tbl;
2716
2717        tbl = ipv4_route_flush_table;
2718        if (!net_eq(net, &init_net)) {
2719                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2720                if (!tbl)
2721                        goto err_dup;
2722
2723                /* Don't export sysctls to unprivileged users */
2724                if (net->user_ns != &init_user_ns)
2725                        tbl[0].procname = NULL;
2726        }
2727        tbl[0].extra1 = net;
2728
2729        net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2730        if (!net->ipv4.route_hdr)
2731                goto err_reg;
2732        return 0;
2733
2734err_reg:
2735        if (tbl != ipv4_route_flush_table)
2736                kfree(tbl);
2737err_dup:
2738        return -ENOMEM;
2739}
2740
2741static __net_exit void sysctl_route_net_exit(struct net *net)
2742{
2743        struct ctl_table *tbl;
2744
2745        tbl = net->ipv4.route_hdr->ctl_table_arg;
2746        unregister_net_sysctl_table(net->ipv4.route_hdr);
2747        BUG_ON(tbl == ipv4_route_flush_table);
2748        kfree(tbl);
2749}
2750
2751static __net_initdata struct pernet_operations sysctl_route_ops = {
2752        .init = sysctl_route_net_init,
2753        .exit = sysctl_route_net_exit,
2754};
2755#endif
2756
2757static __net_init int rt_genid_init(struct net *net)
2758{
2759        atomic_set(&net->ipv4.rt_genid, 0);
2760        atomic_set(&net->fnhe_genid, 0);
2761        get_random_bytes(&net->ipv4.dev_addr_genid,
2762                         sizeof(net->ipv4.dev_addr_genid));
2763        return 0;
2764}
2765
2766static __net_initdata struct pernet_operations rt_genid_ops = {
2767        .init = rt_genid_init,
2768};
2769
2770static int __net_init ipv4_inetpeer_init(struct net *net)
2771{
2772        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2773
2774        if (!bp)
2775                return -ENOMEM;
2776        inet_peer_base_init(bp);
2777        net->ipv4.peers = bp;
2778        return 0;
2779}
2780
2781static void __net_exit ipv4_inetpeer_exit(struct net *net)
2782{
2783        struct inet_peer_base *bp = net->ipv4.peers;
2784
2785        net->ipv4.peers = NULL;
2786        inetpeer_invalidate_tree(bp);
2787        kfree(bp);
2788}
2789
2790static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2791        .init   =       ipv4_inetpeer_init,
2792        .exit   =       ipv4_inetpeer_exit,
2793};
2794
2795#ifdef CONFIG_IP_ROUTE_CLASSID
2796struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2797#endif /* CONFIG_IP_ROUTE_CLASSID */
2798
2799int __init ip_rt_init(void)
2800{
2801        int rc = 0;
2802        int cpu;
2803
2804        ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2805        if (!ip_idents)
2806                panic("IP: failed to allocate ip_idents\n");
2807
2808        prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2809
2810        ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
2811        if (!ip_tstamps)
2812                panic("IP: failed to allocate ip_tstamps\n");
2813
2814        for_each_possible_cpu(cpu) {
2815                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2816
2817                INIT_LIST_HEAD(&ul->head);
2818                spin_lock_init(&ul->lock);
2819        }
2820#ifdef CONFIG_IP_ROUTE_CLASSID
2821        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2822        if (!ip_rt_acct)
2823                panic("IP: failed to allocate ip_rt_acct\n");
2824#endif
2825
2826        ipv4_dst_ops.kmem_cachep =
2827                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2828                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2829
2830        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2831
2832        if (dst_entries_init(&ipv4_dst_ops) < 0)
2833                panic("IP: failed to allocate ipv4_dst_ops counter\n");
2834
2835        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2836                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2837
2838        ipv4_dst_ops.gc_thresh = ~0;
2839        ip_rt_max_size = INT_MAX;
2840
2841        devinet_init();
2842        ip_fib_init();
2843
2844        if (ip_rt_proc_init())
2845                pr_err("Unable to create route proc files\n");
2846#ifdef CONFIG_XFRM
2847        xfrm_init();
2848        xfrm4_init();
2849#endif
2850        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2851
2852#ifdef CONFIG_SYSCTL
2853        register_pernet_subsys(&sysctl_route_ops);
2854#endif
2855        register_pernet_subsys(&rt_genid_ops);
2856        register_pernet_subsys(&ipv4_inetpeer_ops);
2857        return rc;
2858}
2859
2860#ifdef CONFIG_SYSCTL
2861/*
2862 * We really need to sanitize the damn ipv4 init order, then all
2863 * this nonsense will go away.
2864 */
2865void __init ip_static_sysctl_init(void)
2866{
2867        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2868}
2869#endif
2870