linux/net/ipv4/route.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

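/* Tunable defaults for redirect rate limiting, ICMP error cost and PMTU
 * handling; these back the net.ipv4.route.* sysctls, so all of them can
 * be overridden at runtime.
 */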
static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

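/* Map the four IPv4 TOS bits to a packet scheduler priority band.
 * Lookups go through rt_tos2priority() in include/net/route.h, i.e.
 * ip_tos2prio[IPTOS_TOS(tos) >> 1], so for example IPTOS_LOWDELAY
 * (0x10) yields TC_PRIO_INTERACTIVE.
 */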
const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
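/* The IPv4 routing cache was removed in 3.6, so /proc/net/rt_cache is
 * now a stub that emits only the historical header line to keep
 * existing tools parsing.
 */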
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = ACCESS_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 new, delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* Do not use atomic_add_return() as it makes UBSAN unhappy */
        do {
                old = (u32)atomic_read(p_id);
                new = old + delta + segs;
        } while (atomic_cmpxchg(p_id, old, new) != old);

        return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        static u32 ip_idents_hashrnd __read_mostly;
        u32 hash, id;

        net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

        hash = jhash_3words((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol ^ net_hash_mix(net),
                            ip_idents_hashrnd);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

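/* Build a flow key from a packet header.  Any attached socket overrides
 * the interface, mark, TOS and protocol taken from the packet, since a
 * connected socket carries the authoritative routing parameters.
 */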
static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                rt_free(rt);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                rt_free(rt);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

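/* Next-hop exceptions (fnhe) record per-destination state learned from
 * ICMP, such as redirects and PMTU values.  They hang off the FIB
 * nexthop in a small hash table so the FIB entries themselves stay
 * immutable.
 */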
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        unsigned int i;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nh->nh_exceptions);
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nh->nh_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = max(1UL, expires);
                }
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

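/* Handle an ICMP redirect: sanity-check the advertised gateway (it must
 * be unicast, on-link and allowed by the in_device policy), then record
 * it as a nexthop exception instead of touching the FIB.
 */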
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                0, jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}


static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */


void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything and
         * set peer->rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}


static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

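        /* Token bucket: tokens accrue at one per jiffy up to
         * ip_rt_error_burst, and each ICMP error sent costs
         * ip_rt_error_cost tokens; with an empty bucket the error is
         * suppressed.
         */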
        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

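/* Record a newly learned path MTU: clamp it to ip_rt_min_pmtu and store
 * it as a nexthop exception expiring after ip_rt_mtu_expires, rather
 * than modifying the FIB entry.
 */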
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (ipv4_mtu(dst) < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        if (!mark)
                mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

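/* Socket flavour of the PMTU update: if the socket is owned by user
 * context (or has no cached dst) we must not touch its dst cache and
 * fall back to a plain routing lookup; otherwise refresh the socket's
 * cached route in place.
 */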
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);


void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}

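/* Effective MTU of a route: a live (unexpired) PMTU exception wins,
 * then any RTAX_MTU metric, then the device MTU, capped at IP_MAX_MTU
 * and reduced by any lightweight tunnel encapsulation headroom.
 */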
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

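/* Bind a route to its nexthop exception under fnhe_lock.  If the
 * namespace's fnhe genid has changed since the exception was recorded,
 * the stale gateway/PMTU data is wiped first so old state cannot leak
 * into new routes.
 */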
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (!(rt->dst.flags & DST_NOCACHE)) {
                        rcu_assign_pointer(*porig, rt);
                        if (orig)
                                rt_free(orig);
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}

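/* Routes that could not be cached in a FIB nexthop are kept on per-cpu
 * lists so rt_flush_dev() can still find them and re-point them at the
 * loopback device when their device goes away.
 */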
struct uncached_list {
        spinlock_t              lock;
        struct list_head        head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

        rt->rt_uncached_list = ul;

        spin_lock_bh(&ul->lock);
        list_add_tail(&rt->rt_uncached, &ul->head);
        spin_unlock_bh(&ul->lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (!list_empty(&rt->rt_uncached)) {
                struct uncached_list *ul = rt->rt_uncached_list;

                spin_lock_bh(&ul->lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&ul->lock);
        }
}

void rt_flush_dev(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct rtable *rt;
        int cpu;

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                spin_lock_bh(&ul->lock);
                list_for_each_entry(rt, &ul->head, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&ul->lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in nexthop exception or
                         * FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache we really need to set it.
                         */
                        rt->dst.flags |= DST_NOCACHE;
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

struct rtable *rt_dst_alloc(struct net_device *dev,
                            unsigned int flags, u16 type,
                            bool nopolicy, bool noxfrm, bool will_cache)
{
        struct rtable *rt;

        rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                       (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
                       (nopolicy ? DST_NOPOLICY : 0) |
                       (noxfrm ? DST_NOXFRM : 0));

        if (rt) {
                rt->rt_genid = rt_genid_ipv4(dev_net(dev));
                rt->rt_flags = flags;
                rt->rt_type = type;
                rt->rt_is_input = 0;
                rt->rt_iif = 0;
                rt->rt_pmtu = 0;
                rt->rt_gateway = 0;
                rt->rt_uses_gateway = 0;
                rt->rt_table_id = 0;
                INIT_LIST_HEAD(&rt->rt_uncached);

                rt->dst.output = ip_output;
                if (flags & RTCF_LOCAL)
                        rt->dst.input = ip_local_deliver;
        }

        return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        struct rtable *rth;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        unsigned int flags = RTCF_MULTICAST;
        u32 itag = 0;
        int err;

        /* Primary sanity checks. */

        if (!in_dev)
                return -EINVAL;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
                goto e_inval;

        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr))
                        goto e_inval;
        } else {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto e_err;
        }
        if (our)
                flags |= RTCF_LOCAL;

        rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
        if (!rth)
                goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->dst.output = ip_rt_bug;
        rth->rt_is_input = 1;

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        skb_dst_set(skb, &rth->dst);
        return 0;

e_nobufs:
        return -ENOBUFS;
e_inval:
        return -EINVAL;
e_err:
        return err;
}

1544
1545
1546static void ip_handle_martian_source(struct net_device *dev,
1547                                     struct in_device *in_dev,
1548                                     struct sk_buff *skb,
1549                                     __be32 daddr,
1550                                     __be32 saddr)
1551{
1552        RT_CACHE_STAT_INC(in_martian_src);
1553#ifdef CONFIG_IP_ROUTE_VERBOSE
1554        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1555                /*
1556                 *      RFC1812 recommendation: if the source is martian,
1557                 *      the only hint is the MAC header.
1558                 */
1559                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1560                        &daddr, &saddr, dev->name);
1561                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1562                        print_hex_dump(KERN_WARNING, "ll header: ",
1563                                       DUMP_PREFIX_OFFSET, 16, 1,
1564                                       skb_mac_header(skb),
1565                                       dev->hard_header_len, true);
1566                }
1567        }
1568#endif
1569}
1570
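/* A fib_nh_exception caches per-destination state learned at run time
 * (a PMTU value, an ICMP redirect gateway) hanging off a next hop.
 * ip_del_fnhe() is the removal path: it unlinks the exception for
 * daddr from its hash chain under fnhe_lock, flushes the routes cached
 * on it, and frees it after an RCU grace period via kfree_rcu().
 */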
1571static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1572{
1573        struct fnhe_hash_bucket *hash;
1574        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1575        u32 hval = fnhe_hashfun(daddr);
1576
1577        spin_lock_bh(&fnhe_lock);
1578
1579        hash = rcu_dereference_protected(nh->nh_exceptions,
1580                                         lockdep_is_held(&fnhe_lock));
1581        hash += hval;
1582
1583        fnhe_p = &hash->chain;
1584        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1585        while (fnhe) {
1586                if (fnhe->fnhe_daddr == daddr) {
1587                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1588                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1589                        fnhe_flush_routes(fnhe);
1590                        kfree_rcu(fnhe, rcu);
1591                        break;
1592                }
1593                fnhe_p = &fnhe->fnhe_next;
1594                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1595                                                 lockdep_is_held(&fnhe_lock));
1596        }
1597
1598        spin_unlock_bh(&fnhe_lock);
1599}
1600
1601/* called in rcu_read_lock() section */
1602static int __mkroute_input(struct sk_buff *skb,
1603                           const struct fib_result *res,
1604                           struct in_device *in_dev,
1605                           __be32 daddr, __be32 saddr, u32 tos)
1606{
1607        struct fib_nh_exception *fnhe;
1608        struct rtable *rth;
1609        int err;
1610        struct in_device *out_dev;
1611        bool do_cache;
1612        u32 itag = 0;
1613
1614        /* get a working reference to the output device */
1615        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1616        if (!out_dev) {
1617                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1618                return -EINVAL;
1619        }
1620
1621        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1622                                  in_dev->dev, in_dev, &itag);
1623        if (err < 0) {
1624                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1625                                         saddr);
1626
1627                goto cleanup;
1628        }
1629
1630        do_cache = res->fi && !itag;
1631        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1632            skb->protocol == htons(ETH_P_IP) &&
1633            (IN_DEV_SHARED_MEDIA(out_dev) ||
1634             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1635                IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1636
1637        if (skb->protocol != htons(ETH_P_IP)) {
1638                /* Not IP (i.e. ARP). Do not create a route if it is
1639                 * invalid for proxy arp. DNAT routes are always valid.
1640                 *
1641                 * The proxy arp feature has been extended to allow ARP
1642                 * replies back on the same interface, to support
1643                 * Private VLAN switch technologies. See arp.c.
1644                 */
1645                if (out_dev == in_dev &&
1646                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1647                        err = -EINVAL;
1648                        goto cleanup;
1649                }
1650        }
1651
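        /* Reuse a cached input route when possible: prefer one stored on
         * a per-destination exception (fnhe), falling back to the next
         * hop's shared nh_rth_input.  An exception whose cached route has
         * expired is deleted so a fresh one can be learned.
         */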
1652        fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1653        if (do_cache) {
1654                if (fnhe) {
1655                        rth = rcu_dereference(fnhe->fnhe_rth_input);
1656                        if (rth && rth->dst.expires &&
1657                            time_after(jiffies, rth->dst.expires)) {
1658                                ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1659                                fnhe = NULL;
1660                        } else {
1661                                goto rt_cache;
1662                        }
1663                }
1664
1665                rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1666
1667rt_cache:
1668                if (rt_cache_valid(rth)) {
1669                        skb_dst_set_noref(skb, &rth->dst);
1670                        goto out;
1671                }
1672        }
1673
1674        rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1675                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
1676                           IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1677        if (!rth) {
1678                err = -ENOBUFS;
1679                goto cleanup;
1680        }
1681
1682        rth->rt_is_input = 1;
1683        if (res->table)
1684                rth->rt_table_id = res->table->tb_id;
1685        RT_CACHE_STAT_INC(in_slow_tot);
1686
1687        rth->dst.input = ip_forward;
1688
1689        rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1690        if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1691                rth->dst.lwtstate->orig_output = rth->dst.output;
1692                rth->dst.output = lwtunnel_output;
1693        }
1694        if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1695                rth->dst.lwtstate->orig_input = rth->dst.input;
1696                rth->dst.input = lwtunnel_input;
1697        }
1698        skb_dst_set(skb, &rth->dst);
1699out:
1700        err = 0;
1701 cleanup:
1702        return err;
1703}
1704
1705#ifdef CONFIG_IP_ROUTE_MULTIPATH
1706
1707/* To make ICMP packets follow the right flow, the multipath hash is
1708 * calculated from the inner IP addresses in reverse order.
1709 */
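/* Concretely: an ICMP error about an A -> B packet embeds that packet's
 * header, so the inner saddr is A and the inner daddr is B.  Hashing the
 * inner pair in reverse order yields fib_multipath_hash(B, A), the same
 * value ordinary B -> A traffic of that flow gets from the standard
 * hash, so the error is steered onto the same nexthop as the flow it
 * refers to.
 */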
1710static int ip_multipath_icmp_hash(struct sk_buff *skb)
1711{
1712        const struct iphdr *outer_iph = ip_hdr(skb);
1713        struct icmphdr _icmph;
1714        const struct icmphdr *icmph;
1715        struct iphdr _inner_iph;
1716        const struct iphdr *inner_iph;
1717
1718        if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1719                goto standard_hash;
1720
1721        icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1722                                   &_icmph);
1723        if (!icmph)
1724                goto standard_hash;
1725
1726        if (icmph->type != ICMP_DEST_UNREACH &&
1727            icmph->type != ICMP_REDIRECT &&
1728            icmph->type != ICMP_TIME_EXCEEDED &&
1729            icmph->type != ICMP_PARAMETERPROB) {
1730                goto standard_hash;
1731        }
1732
1733        inner_iph = skb_header_pointer(skb,
1734                                       outer_iph->ihl * 4 + sizeof(_icmph),
1735                                       sizeof(_inner_iph), &_inner_iph);
1736        if (!inner_iph)
1737                goto standard_hash;
1738
1739        return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
1740
1741standard_hash:
1742        return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
1743}
1744
1745#endif /* CONFIG_IP_ROUTE_MULTIPATH */
1746
1747static int ip_mkroute_input(struct sk_buff *skb,
1748                            struct fib_result *res,
1749                            const struct flowi4 *fl4,
1750                            struct in_device *in_dev,
1751                            __be32 daddr, __be32 saddr, u32 tos)
1752{
1753#ifdef CONFIG_IP_ROUTE_MULTIPATH
1754        if (res->fi && res->fi->fib_nhs > 1) {
1755                int h;
1756
1757                if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
1758                        h = ip_multipath_icmp_hash(skb);
1759                else
1760                        h = fib_multipath_hash(saddr, daddr);
1761                fib_select_multipath(res, h);
1762        }
1763#endif
1764
1765        /* create a routing cache entry */
1766        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1767}
1768
1769/*
1770 *      NOTE. We drop all packets that have local source
1771 *      addresses, because every properly looped-back packet
1772 *      must already have its correct destination attached by the output routine.
1773 *
1774 *      This approach solves two big problems:
1775 *      1. Non-simplex devices are handled properly.
1776 *      2. IP spoofing attempts are filtered with a 100% guarantee.
1777 *      Called with rcu_read_lock().
1778 */
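/* E.g. a frame arriving on an Ethernet device whose IP source is one of
 * the host's own addresses fails fib_validate_source() below and is
 * logged as a martian source rather than accepted.
 */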
1779
1780static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1781                               u8 tos, struct net_device *dev)
1782{
1783        struct fib_result res;
1784        struct in_device *in_dev = __in_dev_get_rcu(dev);
1785        struct ip_tunnel_info *tun_info;
1786        struct flowi4   fl4;
1787        unsigned int    flags = 0;
1788        u32             itag = 0;
1789        struct rtable   *rth;
1790        int             err = -EINVAL;
1791        struct net    *net = dev_net(dev);
1792        bool do_cache;
1793
1794        /* IP on this device is disabled. */
1795
1796        if (!in_dev)
1797                goto out;
1798
1799        /* Check for the most weird martians, which cannot be detected
1800           by fib_lookup.
1801         */
1802
1803        tun_info = skb_tunnel_info(skb);
1804        if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1805                fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1806        else
1807                fl4.flowi4_tun_key.tun_id = 0;
1808        skb_dst_drop(skb);
1809
1810        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1811                goto martian_source;
1812
1813        res.fi = NULL;
1814        res.table = NULL;
1815        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1816                goto brd_input;
1817
1818        /* Accept zero addresses only for limited broadcast;
1819         * I do not even know whether to fix this or not. Waiting for complaints :-)
1820         */
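        /* The classic case is DHCP discovery: a client that has no
         * address yet sends from 0.0.0.0 to 255.255.255.255, which must
         * be accepted via the brd_input path above.
         */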
1821        if (ipv4_is_zeronet(saddr))
1822                goto martian_source;
1823
1824        if (ipv4_is_zeronet(daddr))
1825                goto martian_destination;
1826
1827        /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1828         * and calls it at most once if daddr and/or saddr are loopback addresses.
1829         */
1830        if (ipv4_is_loopback(daddr)) {
1831                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1832                        goto martian_destination;
1833        } else if (ipv4_is_loopback(saddr)) {
1834                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1835                        goto martian_source;
1836        }
1837
1838        /*
1839         *      Now we are ready to route the packet.
1840         */
1841        fl4.flowi4_oif = 0;
1842        fl4.flowi4_iif = dev->ifindex;
1843        fl4.flowi4_mark = skb->mark;
1844        fl4.flowi4_tos = tos;
1845        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1846        fl4.flowi4_flags = 0;
1847        fl4.daddr = daddr;
1848        fl4.saddr = saddr;
1849        err = fib_lookup(net, &fl4, &res, 0);
1850        if (err != 0) {
1851                if (!IN_DEV_FORWARD(in_dev))
1852                        err = -EHOSTUNREACH;
1853                goto no_route;
1854        }
1855
1856        if (res.type == RTN_BROADCAST)
1857                goto brd_input;
1858
1859        if (res.type == RTN_LOCAL) {
1860                err = fib_validate_source(skb, saddr, daddr, tos,
1861                                          0, dev, in_dev, &itag);
1862                if (err < 0)
1863                        goto martian_source;
1864                goto local_input;
1865        }
1866
1867        if (!IN_DEV_FORWARD(in_dev)) {
1868                err = -EHOSTUNREACH;
1869                goto no_route;
1870        }
1871        if (res.type != RTN_UNICAST)
1872                goto martian_destination;
1873
1874        err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1875out:    return err;
1876
1877brd_input:
1878        if (skb->protocol != htons(ETH_P_IP))
1879                goto e_inval;
1880
1881        if (!ipv4_is_zeronet(saddr)) {
1882                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1883                                          in_dev, &itag);
1884                if (err < 0)
1885                        goto martian_source;
1886        }
1887        flags |= RTCF_BROADCAST;
1888        res.type = RTN_BROADCAST;
1889        RT_CACHE_STAT_INC(in_brd);
1890
1891local_input:
1892        do_cache = false;
1893        if (res.fi) {
1894                if (!itag) {
1895                        rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1896                        if (rt_cache_valid(rth)) {
1897                                skb_dst_set_noref(skb, &rth->dst);
1898                                err = 0;
1899                                goto out;
1900                        }
1901                        do_cache = true;
1902                }
1903        }
1904
1905        rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
1906                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1907        if (!rth)
1908                goto e_nobufs;
1909
1910        rth->dst.output = ip_rt_bug;
1911#ifdef CONFIG_IP_ROUTE_CLASSID
1912        rth->dst.tclassid = itag;
1913#endif
1914        rth->rt_is_input = 1;
1915        if (res.table)
1916                rth->rt_table_id = res.table->tb_id;
1917
1918        RT_CACHE_STAT_INC(in_slow_tot);
1919        if (res.type == RTN_UNREACHABLE) {
1920                rth->dst.input = ip_error;
1921                rth->dst.error = -err;
1922                rth->rt_flags   &= ~RTCF_LOCAL;
1923        }
1924        if (do_cache) {
1925                if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1926                        rth->dst.flags |= DST_NOCACHE;
1927                        rt_add_uncached_list(rth);
1928                }
1929        }
1930        skb_dst_set(skb, &rth->dst);
1931        err = 0;
1932        goto out;
1933
1934no_route:
1935        RT_CACHE_STAT_INC(in_no_route);
1936        res.type = RTN_UNREACHABLE;
1937        res.fi = NULL;
1938        res.table = NULL;
1939        goto local_input;
1940
1941        /*
1942         *      Do not cache martian addresses: they should be logged (RFC1812)
1943         */
1944martian_destination:
1945        RT_CACHE_STAT_INC(in_martian_dst);
1946#ifdef CONFIG_IP_ROUTE_VERBOSE
1947        if (IN_DEV_LOG_MARTIANS(in_dev))
1948                net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1949                                     &daddr, &saddr, dev->name);
1950#endif
1951
1952e_inval:
1953        err = -EINVAL;
1954        goto out;
1955
1956e_nobufs:
1957        err = -ENOBUFS;
1958        goto out;
1959
1960martian_source:
1961        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1962        goto out;
1963}
1964
1965int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1966                         u8 tos, struct net_device *dev)
1967{
1968        int res;
1969
1970        rcu_read_lock();
1971
1972        /* Multicast recognition logic is moved from the route cache to here.
1973           The problem was that too many Ethernet cards have broken/missing
1974           hardware multicast filters :-( As a result, a host on a multicast
1975           network acquires a lot of useless route cache entries, e.g. for
1976           SDR messages from all over the world. Now we try to get rid of them.
1977           Really, provided the software IP multicast filter is organized
1978           reasonably (at least, hashed), this does not result in a slowdown
1979           compared with route cache reject entries.
1980           Note that multicast routers are not affected, because a
1981           route cache entry is created eventually.
1982         */
1983        if (ipv4_is_multicast(daddr)) {
1984                struct in_device *in_dev = __in_dev_get_rcu(dev);
1985
1986                if (in_dev) {
1987                        int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1988                                                  ip_hdr(skb)->protocol);
1989                        if (our
1990#ifdef CONFIG_IP_MROUTE
1991                                ||
1992                            (!ipv4_is_local_multicast(daddr) &&
1993                             IN_DEV_MFORWARD(in_dev))
1994#endif
1995                           ) {
1996                                int res = ip_route_input_mc(skb, daddr, saddr,
1997                                                            tos, dev, our);
1998                                rcu_read_unlock();
1999                                return res;
2000                        }
2001                }
2002                rcu_read_unlock();
2003                return -EINVAL;
2004        }
2005        res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2006        rcu_read_unlock();
2007        return res;
2008}
2009EXPORT_SYMBOL(ip_route_input_noref);
2010
2011/* called with rcu_read_lock() */
2012static struct rtable *__mkroute_output(const struct fib_result *res,
2013                                       const struct flowi4 *fl4, int orig_oif,
2014                                       struct net_device *dev_out,
2015                                       unsigned int flags)
2016{
2017        struct fib_info *fi = res->fi;
2018        struct fib_nh_exception *fnhe;
2019        struct in_device *in_dev;
2020        u16 type = res->type;
2021        struct rtable *rth;
2022        bool do_cache;
2023
2024        in_dev = __in_dev_get_rcu(dev_out);
2025        if (!in_dev)
2026                return ERR_PTR(-EINVAL);
2027
2028        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2029                if (ipv4_is_loopback(fl4->saddr) &&
2030                    !(dev_out->flags & IFF_LOOPBACK) &&
2031                    !netif_is_l3_master(dev_out))
2032                        return ERR_PTR(-EINVAL);
2033
2034        if (ipv4_is_lbcast(fl4->daddr))
2035                type = RTN_BROADCAST;
2036        else if (ipv4_is_multicast(fl4->daddr))
2037                type = RTN_MULTICAST;
2038        else if (ipv4_is_zeronet(fl4->daddr))
2039                return ERR_PTR(-EINVAL);
2040
2041        if (dev_out->flags & IFF_LOOPBACK)
2042                flags |= RTCF_LOCAL;
2043
2044        do_cache = true;
2045        if (type == RTN_BROADCAST) {
2046                flags |= RTCF_BROADCAST | RTCF_LOCAL;
2047                fi = NULL;
2048        } else if (type == RTN_MULTICAST) {
2049                flags |= RTCF_MULTICAST | RTCF_LOCAL;
2050                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2051                                     fl4->flowi4_proto))
2052                        flags &= ~RTCF_LOCAL;
2053                else
2054                        do_cache = false;
2055                /* If a multicast route does not exist, use
2056                 * the default one, but do not use a gateway in this case.
2057                 * Yes, it is a hack.
2058                 */
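                /* E.g. a default route (prefixlen 0) is less specific than
                 * 224.0.0.0/4; fi is dropped below, so the packet goes out
                 * directly instead of toward that route's gateway.
                 */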
2059                if (fi && res->prefixlen < 4)
2060                        fi = NULL;
2061        } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2062                   (orig_oif != dev_out->ifindex)) {
2063                /* For local routes that require a particular output interface
2064                 * we do not want to cache the result.  Caching the result
2065                 * causes incorrect behaviour when there are multiple source
2066                 * addresses on the interface: the end result is that if the
2067                 * intended recipient is waiting on that interface for the
2068                 * packet, it won't receive it, because it will be delivered on
2069                 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2070                 * be set to the loopback interface as well.
2071                 */
2072                fi = NULL;
2073        }
2074
2075        fnhe = NULL;
2076        do_cache &= fi != NULL;
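        /* Output-side counterpart of the caching in __mkroute_input():
         * check for a per-destination exception first, then the per-cpu
         * cached output route.  FLOWI_FLAG_KNOWN_NH forces an uncached
         * route unless the next hop is an on-link gateway.
         */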
2077        if (do_cache) {
2078                struct rtable __rcu **prth;
2079                struct fib_nh *nh = &FIB_RES_NH(*res);
2080
2081                fnhe = find_exception(nh, fl4->daddr);
2082                if (fnhe) {
2083                        prth = &fnhe->fnhe_rth_output;
2084                        rth = rcu_dereference(*prth);
2085                        if (rth && rth->dst.expires &&
2086                            time_after(jiffies, rth->dst.expires)) {
2087                                ip_del_fnhe(nh, fl4->daddr);
2088                                fnhe = NULL;
2089                        } else {
2090                                goto rt_cache;
2091                        }
2092                }
2093
2094                if (unlikely(fl4->flowi4_flags &
2095                             FLOWI_FLAG_KNOWN_NH &&
2096                             !(nh->nh_gw &&
2097                               nh->nh_scope == RT_SCOPE_LINK))) {
2098                        do_cache = false;
2099                        goto add;
2100                }
2101                prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2102                rth = rcu_dereference(*prth);
2103
2104rt_cache:
2105                if (rt_cache_valid(rth)) {
2106                        dst_hold(&rth->dst);
2107                        return rth;
2108                }
2109        }
2110
2111add:
2112        rth = rt_dst_alloc(dev_out, flags, type,
2113                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
2114                           IN_DEV_CONF_GET(in_dev, NOXFRM),
2115                           do_cache);
2116        if (!rth)
2117                return ERR_PTR(-ENOBUFS);
2118
2119        rth->rt_iif     = orig_oif ? : 0;
2120        if (res->table)
2121                rth->rt_table_id = res->table->tb_id;
2122
2123        RT_CACHE_STAT_INC(out_slow_tot);
2124
2125        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2126                if (flags & RTCF_LOCAL &&
2127                    !(dev_out->flags & IFF_LOOPBACK)) {
2128                        rth->dst.output = ip_mc_output;
2129                        RT_CACHE_STAT_INC(out_slow_mc);
2130                }
2131#ifdef CONFIG_IP_MROUTE
2132                if (type == RTN_MULTICAST) {
2133                        if (IN_DEV_MFORWARD(in_dev) &&
2134                            !ipv4_is_local_multicast(fl4->daddr)) {
2135                                rth->dst.input = ip_mr_input;
2136                                rth->dst.output = ip_mc_output;
2137                        }
2138                }
2139#endif
2140        }
2141
2142        rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2143        if (lwtunnel_output_redirect(rth->dst.lwtstate))
2144                rth->dst.output = lwtunnel_output;
2145
2146        return rth;
2147}
2148
2149/*
2150 * Major route resolver routine.
2151 */
2152
2153struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2154                                          int mp_hash)
2155{
2156        struct net_device *dev_out = NULL;
2157        __u8 tos = RT_FL_TOS(fl4);
2158        unsigned int flags = 0;
2159        struct fib_result res;
2160        struct rtable *rth;
2161        int orig_oif;
2162        int err = -ENETUNREACH;
2163
2164        res.tclassid    = 0;
2165        res.fi          = NULL;
2166        res.table       = NULL;
2167
2168        orig_oif = fl4->flowi4_oif;
2169
2170        fl4->flowi4_iif = LOOPBACK_IFINDEX;
2171        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2172        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2173                         RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2174
2175        rcu_read_lock();
2176        if (fl4->saddr) {
2177                rth = ERR_PTR(-EINVAL);
2178                if (ipv4_is_multicast(fl4->saddr) ||
2179                    ipv4_is_lbcast(fl4->saddr) ||
2180                    ipv4_is_zeronet(fl4->saddr))
2181                        goto out;
2182
2183                /* I removed the check for oif == dev_out->oif here.
2184                   It was wrong for two reasons:
2185                   1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
2186                      is assigned to multiple interfaces.
2187                   2. Moreover, we are allowed to send packets with the saddr
2188                      of another iface. --ANK
2189                 */
2190
2191                if (fl4->flowi4_oif == 0 &&
2192                    (ipv4_is_multicast(fl4->daddr) ||
2193                     ipv4_is_lbcast(fl4->daddr))) {
2194                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2195                        dev_out = __ip_dev_find(net, fl4->saddr, false);
2196                        if (!dev_out)
2197                                goto out;
2198
2199                        /* Special hack: the user can direct multicasts
2200                           and limited broadcast via the necessary interface
2201                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2202                           This hack is not just for fun, it allows
2203                           vic, vat and friends to work.
2204                           They bind the socket to loopback, set the ttl to zero
2205                           and expect that it will work.
2206                           From the viewpoint of the routing cache they are broken,
2207                           because we are not allowed to build a multicast path
2208                           with a loopback source addr (look, the routing cache
2209                           cannot know that the ttl is zero, so the packet
2210                           will not leave this host and the route is valid).
2211                           Luckily, this hack is a good workaround.
2212                         */
2213
2214                        fl4->flowi4_oif = dev_out->ifindex;
2215                        goto make_route;
2216                }
2217
2218                if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2219                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2220                        if (!__ip_dev_find(net, fl4->saddr, false))
2221                                goto out;
2222                }
2223        }
2224
2225
2226        if (fl4->flowi4_oif) {
2227                dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2228                rth = ERR_PTR(-ENODEV);
2229                if (!dev_out)
2230                        goto out;
2231
2232                /* RACE: Check return value of inet_select_addr instead. */
2233                if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2234                        rth = ERR_PTR(-ENETUNREACH);
2235                        goto out;
2236                }
2237                if (ipv4_is_local_multicast(fl4->daddr) ||
2238                    ipv4_is_lbcast(fl4->daddr) ||
2239                    fl4->flowi4_proto == IPPROTO_IGMP) {
2240                        if (!fl4->saddr)
2241                                fl4->saddr = inet_select_addr(dev_out, 0,
2242                                                              RT_SCOPE_LINK);
2243                        goto make_route;
2244                }
2245                if (!fl4->saddr) {
2246                        if (ipv4_is_multicast(fl4->daddr))
2247                                fl4->saddr = inet_select_addr(dev_out, 0,
2248                                                              fl4->flowi4_scope);
2249                        else if (!fl4->daddr)
2250                                fl4->saddr = inet_select_addr(dev_out, 0,
2251                                                              RT_SCOPE_HOST);
2252                }
2253        }
2254
2255        if (!fl4->daddr) {
2256                fl4->daddr = fl4->saddr;
2257                if (!fl4->daddr)
2258                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2259                dev_out = net->loopback_dev;
2260                fl4->flowi4_oif = LOOPBACK_IFINDEX;
2261                res.type = RTN_LOCAL;
2262                flags |= RTCF_LOCAL;
2263                goto make_route;
2264        }
2265
2266        err = fib_lookup(net, fl4, &res, 0);
2267        if (err) {
2268                res.fi = NULL;
2269                res.table = NULL;
2270                if (fl4->flowi4_oif &&
2271                    !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
2272                        /* Apparently, the routing tables are wrong. Assume
2273                           that the destination is on-link.
2274
2275                           WHY? DW.
2276                           Because we are allowed to send to an iface
2277                           even if it has NO routes and NO assigned
2278                           addresses. When oif is specified, the routing
2279                           tables are looked up with only one purpose:
2280                           to catch whether the destination is gatewayed,
2281                           rather than direct. Moreover, if MSG_DONTROUTE
2282                           is set, we send the packet, ignoring both the
2283                           routing tables and the ifaddr state. --ANK
2284
2285
2286                           We could do this even if oif is unknown,
2287                           as is likely for IPv6, but we do not.
2288                         */
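                        /* In practice this is what lets, say, a socket bound
                           with SO_BINDTODEVICE reach an on-link peer through
                           an interface that has no routes configured: the
                           oif alone decides, and a link-scope source address
                           is picked just below.
                         */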
2289
2290                        if (fl4->saddr == 0)
2291                                fl4->saddr = inet_select_addr(dev_out, 0,
2292                                                              RT_SCOPE_LINK);
2293                        res.type = RTN_UNICAST;
2294                        goto make_route;
2295                }
2296                rth = ERR_PTR(err);
2297                goto out;
2298        }
2299
2300        if (res.type == RTN_LOCAL) {
2301                if (!fl4->saddr) {
2302                        if (res.fi->fib_prefsrc)
2303                                fl4->saddr = res.fi->fib_prefsrc;
2304                        else
2305                                fl4->saddr = fl4->daddr;
2306                }
2307
2308                /* L3 master device is the loopback for that domain */
2309                dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev;
2310                fl4->flowi4_oif = dev_out->ifindex;
2311                flags |= RTCF_LOCAL;
2312                goto make_route;
2313        }
2314
2315        fib_select_path(net, &res, fl4, mp_hash);
2316
2317        dev_out = FIB_RES_DEV(res);
2318        fl4->flowi4_oif = dev_out->ifindex;
2319
2320
2321make_route:
2322        rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2323
2324out:
2325        rcu_read_unlock();
2326        return rth;
2327}
2328EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
2329
2330static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2331{
2332        return NULL;
2333}
2334
2335static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2336{
2337        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2338
2339        return mtu ? : dst->dev->mtu;
2340}
2341
2342static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2343                                          struct sk_buff *skb, u32 mtu)
2344{
2345}
2346
2347static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2348                                       struct sk_buff *skb)
2349{
2350}
2351
2352static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2353                                          unsigned long old)
2354{
2355        return NULL;
2356}
2357
2358static struct dst_ops ipv4_dst_blackhole_ops = {
2359        .family                 =       AF_INET,
2360        .check                  =       ipv4_blackhole_dst_check,
2361        .mtu                    =       ipv4_blackhole_mtu,
2362        .default_advmss         =       ipv4_default_advmss,
2363        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2364        .redirect               =       ipv4_rt_blackhole_redirect,
2365        .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2366        .neigh_lookup           =       ipv4_neigh_lookup,
2367};
2368
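/* Clone an existing route into a "blackhole" dst: it keeps the original
 * route's attributes but discards every packet (dst_discard on both
 * input and output) and ignores PMTU and redirect events.  The xfrm
 * code uses this to hand a socket a harmless dst when packets must be
 * dropped, e.g. while IPsec state resolution is still pending.
 */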
2369struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2370{
2371        struct rtable *ort = (struct rtable *) dst_orig;
2372        struct rtable *rt;
2373
2374        rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2375        if (rt) {
2376                struct dst_entry *new = &rt->dst;
2377
2378                new->__use = 1;
2379                new->input = dst_discard;
2380                new->output = dst_discard_out;
2381
2382                new->dev = ort->dst.dev;
2383                if (new->dev)
2384                        dev_hold(new->dev);
2385
2386                rt->rt_is_input = ort->rt_is_input;
2387                rt->rt_iif = ort->rt_iif;
2388                rt->rt_pmtu = ort->rt_pmtu;
2389
2390                rt->rt_genid = rt_genid_ipv4(net);
2391                rt->rt_flags = ort->rt_flags;
2392                rt->rt_type = ort->rt_type;
2393                rt->rt_gateway = ort->rt_gateway;
2394                rt->rt_uses_gateway = ort->rt_uses_gateway;
2395
2396                INIT_LIST_HEAD(&rt->rt_uncached);
2397                dst_free(new);
2398        }
2399
2400        dst_release(dst_orig);
2401
2402        return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2403}
2404
2405struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2406                                    const struct sock *sk)
2407{
2408        struct rtable *rt = __ip_route_output_key(net, flp4);
2409
2410        if (IS_ERR(rt))
2411                return rt;
2412
2413        if (flp4->flowi4_proto)
2414                rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2415                                                        flowi4_to_flowi(flp4),
2416                                                        sk, 0);
2417
2418        return rt;
2419}
2420EXPORT_SYMBOL_GPL(ip_route_output_flow);
2421
2422static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2423                        struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2424                        u32 seq, int event, int nowait, unsigned int flags)
2425{
2426        struct rtable *rt = skb_rtable(skb);
2427        struct rtmsg *r;
2428        struct nlmsghdr *nlh;
2429        unsigned long expires = 0;
2430        u32 error;
2431        u32 metrics[RTAX_MAX];
2432
2433        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2434        if (!nlh)
2435                return -EMSGSIZE;
2436
2437        r = nlmsg_data(nlh);
2438        r->rtm_family    = AF_INET;
2439        r->rtm_dst_len  = 32;
2440        r->rtm_src_len  = 0;
2441        r->rtm_tos      = fl4->flowi4_tos;
2442        r->rtm_table    = table_id;
2443        if (nla_put_u32(skb, RTA_TABLE, table_id))
2444                goto nla_put_failure;
2445        r->rtm_type     = rt->rt_type;
2446        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2447        r->rtm_protocol = RTPROT_UNSPEC;
2448        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2449        if (rt->rt_flags & RTCF_NOTIFY)
2450                r->rtm_flags |= RTM_F_NOTIFY;
2451        if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2452                r->rtm_flags |= RTCF_DOREDIRECT;
2453
2454        if (nla_put_in_addr(skb, RTA_DST, dst))
2455                goto nla_put_failure;
2456        if (src) {
2457                r->rtm_src_len = 32;
2458                if (nla_put_in_addr(skb, RTA_SRC, src))
2459                        goto nla_put_failure;
2460        }
2461        if (rt->dst.dev &&
2462            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2463                goto nla_put_failure;
2464#ifdef CONFIG_IP_ROUTE_CLASSID
2465        if (rt->dst.tclassid &&
2466            nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2467                goto nla_put_failure;
2468#endif
2469        if (!rt_is_input_route(rt) &&
2470            fl4->saddr != src) {
2471                if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2472                        goto nla_put_failure;
2473        }
2474        if (rt->rt_uses_gateway &&
2475            nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2476                goto nla_put_failure;
2477
2478        expires = rt->dst.expires;
2479        if (expires) {
2480                unsigned long now = jiffies;
2481
2482                if (time_before(now, expires))
2483                        expires -= now;
2484                else
2485                        expires = 0;
2486        }
2487
2488        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2489        if (rt->rt_pmtu && expires)
2490                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2491        if (rtnetlink_put_metrics(skb, metrics) < 0)
2492                goto nla_put_failure;
2493
2494        if (fl4->flowi4_mark &&
2495            nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2496                goto nla_put_failure;
2497
2498        error = rt->dst.error;
2499
2500        if (rt_is_input_route(rt)) {
2501#ifdef CONFIG_IP_MROUTE
2502                if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2503                    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2504                        int err = ipmr_get_route(net, skb,
2505                                                 fl4->saddr, fl4->daddr,
2506                                                 r, nowait, portid);
2507
2508                        if (err <= 0) {
2509                                if (!nowait) {
2510                                        if (err == 0)
2511                                                return 0;
2512                                        goto nla_put_failure;
2513                                } else {
2514                                        if (err == -EMSGSIZE)
2515                                                goto nla_put_failure;
2516                                        error = err;
2517                                }
2518                        }
2519                } else
2520#endif
2521                        if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2522                                goto nla_put_failure;
2523        }
2524
2525        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2526                goto nla_put_failure;
2527
2528        nlmsg_end(skb, nlh);
2529        return 0;
2530
2531nla_put_failure:
2532        nlmsg_cancel(skb, nlh);
2533        return -EMSGSIZE;
2534}
2535
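/* Handler for RTM_GETROUTE requests (what "ip route get" sends): it
 * performs a one-off input or output route lookup on a dummy skb,
 * depending on whether RTA_IIF is present, and returns the result to
 * the caller as an RTM_NEWROUTE message built by rt_fill_info().
 */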
2536static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2537{
2538        struct net *net = sock_net(in_skb->sk);
2539        struct rtmsg *rtm;
2540        struct nlattr *tb[RTA_MAX+1];
2541        struct rtable *rt = NULL;
2542        struct flowi4 fl4;
2543        __be32 dst = 0;
2544        __be32 src = 0;
2545        u32 iif;
2546        int err;
2547        int mark;
2548        struct sk_buff *skb;
2549        u32 table_id = RT_TABLE_MAIN;
2550
2551        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2552        if (err < 0)
2553                goto errout;
2554
2555        rtm = nlmsg_data(nlh);
2556
2557        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2558        if (!skb) {
2559                err = -ENOBUFS;
2560                goto errout;
2561        }
2562
2563        /* Reserve room for dummy headers; this skb can pass
2564           through a good chunk of the routing engine.
2565         */
2566        skb_reset_mac_header(skb);
2567        skb_reset_network_header(skb);
2568
2569        /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2570        ip_hdr(skb)->protocol = IPPROTO_ICMP;
2571        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2572
2573        src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2574        dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2575        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2576        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2577
2578        memset(&fl4, 0, sizeof(fl4));
2579        fl4.daddr = dst;
2580        fl4.saddr = src;
2581        fl4.flowi4_tos = rtm->rtm_tos;
2582        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2583        fl4.flowi4_mark = mark;
2584
2585        if (iif) {
2586                struct net_device *dev;
2587
2588                dev = __dev_get_by_index(net, iif);
2589                if (!dev) {
2590                        err = -ENODEV;
2591                        goto errout_free;
2592                }
2593
2594                skb->protocol   = htons(ETH_P_IP);
2595                skb->dev        = dev;
2596                skb->mark       = mark;
2597                local_bh_disable();
2598                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2599                local_bh_enable();
2600
2601                rt = skb_rtable(skb);
2602                if (err == 0 && rt->dst.error)
2603                        err = -rt->dst.error;
2604        } else {
2605                rt = ip_route_output_key(net, &fl4);
2606
2607                err = 0;
2608                if (IS_ERR(rt))
2609                        err = PTR_ERR(rt);
2610        }
2611
2612        if (err)
2613                goto errout_free;
2614
2615        skb_dst_set(skb, &rt->dst);
2616        if (rtm->rtm_flags & RTM_F_NOTIFY)
2617                rt->rt_flags |= RTCF_NOTIFY;
2618
2619        if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2620                table_id = rt->rt_table_id;
2621
2622        err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2623                           NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2624                           RTM_NEWROUTE, 0, 0);
2625        if (err < 0)
2626                goto errout_free;
2627
2628        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2629errout:
2630        return err;
2631
2632errout_free:
2633        kfree_skb(skb);
2634        goto errout;
2635}
2636
2637void ip_rt_multicast_event(struct in_device *in_dev)
2638{
2639        rt_cache_flush(dev_net(in_dev->dev));
2640}
2641
2642#ifdef CONFIG_SYSCTL
2643static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2644static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2645static int ip_rt_gc_elasticity __read_mostly    = 8;
2646
2647static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2648                                        void __user *buffer,
2649                                        size_t *lenp, loff_t *ppos)
2650{
2651        struct net *net = (struct net *)__ctl->extra1;
2652
2653        if (write) {
2654                rt_cache_flush(net);
2655                fnhe_genid_bump(net);
2656                return 0;
2657        }
2658
2659        return -EINVAL;
2660}
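/* The handler above backs /proc/sys/net/ipv4/route/flush; e.g.
 *
 *      echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * invalidates all cached routes by bumping the rt and fnhe generation
 * counters.  Reads fail with -EINVAL; the matching ctl_table entry in
 * ipv4_route_flush_table is mode 0200 (write-only) accordingly.
 */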
2661
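/* All of the knobs below appear under /proc/sys/net/ipv4/route/.  As an
 * illustration, the PMTU floor could be raised with
 *
 *      echo 600 > /proc/sys/net/ipv4/route/min_pmtu
 *
 * Plain entries use proc_dointvec; the *_jiffies handlers accept values
 * in seconds (or milliseconds for the _ms variant) and convert them.
 */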
2662static struct ctl_table ipv4_route_table[] = {
2663        {
2664                .procname       = "gc_thresh",
2665                .data           = &ipv4_dst_ops.gc_thresh,
2666                .maxlen         = sizeof(int),
2667                .mode           = 0644,
2668                .proc_handler   = proc_dointvec,
2669        },
2670        {
2671                .procname       = "max_size",
2672                .data           = &ip_rt_max_size,
2673                .maxlen         = sizeof(int),
2674                .mode           = 0644,
2675                .proc_handler   = proc_dointvec,
2676        },
2677        {
2678                /*  Deprecated. Use gc_min_interval_ms */
2679
2680                .procname       = "gc_min_interval",
2681                .data           = &ip_rt_gc_min_interval,
2682                .maxlen         = sizeof(int),
2683                .mode           = 0644,
2684                .proc_handler   = proc_dointvec_jiffies,
2685        },
2686        {
2687                .procname       = "gc_min_interval_ms",
2688                .data           = &ip_rt_gc_min_interval,
2689                .maxlen         = sizeof(int),
2690                .mode           = 0644,
2691                .proc_handler   = proc_dointvec_ms_jiffies,
2692        },
2693        {
2694                .procname       = "gc_timeout",
2695                .data           = &ip_rt_gc_timeout,
2696                .maxlen         = sizeof(int),
2697                .mode           = 0644,
2698                .proc_handler   = proc_dointvec_jiffies,
2699        },
2700        {
2701                .procname       = "gc_interval",
2702                .data           = &ip_rt_gc_interval,
2703                .maxlen         = sizeof(int),
2704                .mode           = 0644,
2705                .proc_handler   = proc_dointvec_jiffies,
2706        },
2707        {
2708                .procname       = "redirect_load",
2709                .data           = &ip_rt_redirect_load,
2710                .maxlen         = sizeof(int),
2711                .mode           = 0644,
2712                .proc_handler   = proc_dointvec,
2713        },
2714        {
2715                .procname       = "redirect_number",
2716                .data           = &ip_rt_redirect_number,
2717                .maxlen         = sizeof(int),
2718                .mode           = 0644,
2719                .proc_handler   = proc_dointvec,
2720        },
2721        {
2722                .procname       = "redirect_silence",
2723                .data           = &ip_rt_redirect_silence,
2724                .maxlen         = sizeof(int),
2725                .mode           = 0644,
2726                .proc_handler   = proc_dointvec,
2727        },
2728        {
2729                .procname       = "error_cost",
2730                .data           = &ip_rt_error_cost,
2731                .maxlen         = sizeof(int),
2732                .mode           = 0644,
2733                .proc_handler   = proc_dointvec,
2734        },
2735        {
2736                .procname       = "error_burst",
2737                .data           = &ip_rt_error_burst,
2738                .maxlen         = sizeof(int),
2739                .mode           = 0644,
2740                .proc_handler   = proc_dointvec,
2741        },
2742        {
2743                .procname       = "gc_elasticity",
2744                .data           = &ip_rt_gc_elasticity,
2745                .maxlen         = sizeof(int),
2746                .mode           = 0644,
2747                .proc_handler   = proc_dointvec,
2748        },
2749        {
2750                .procname       = "mtu_expires",
2751                .data           = &ip_rt_mtu_expires,
2752                .maxlen         = sizeof(int),
2753                .mode           = 0644,
2754                .proc_handler   = proc_dointvec_jiffies,
2755        },
2756        {
2757                .procname       = "min_pmtu",
2758                .data           = &ip_rt_min_pmtu,
2759                .maxlen         = sizeof(int),
2760                .mode           = 0644,
2761                .proc_handler   = proc_dointvec,
2762        },
2763        {
2764                .procname       = "min_adv_mss",
2765                .data           = &ip_rt_min_advmss,
2766                .maxlen         = sizeof(int),
2767                .mode           = 0644,
2768                .proc_handler   = proc_dointvec,
2769        },
2770        { }
2771};
2772
2773static struct ctl_table ipv4_route_flush_table[] = {
2774        {
2775                .procname       = "flush",
2776                .maxlen         = sizeof(int),
2777                .mode           = 0200,
2778                .proc_handler   = ipv4_sysctl_rtcache_flush,
2779        },
2780        { },
2781};
2782
2783static __net_init int sysctl_route_net_init(struct net *net)
2784{
2785        struct ctl_table *tbl;
2786
2787        tbl = ipv4_route_flush_table;
2788        if (!net_eq(net, &init_net)) {
2789                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2790                if (!tbl)
2791                        goto err_dup;
2792
2793                /* Don't export sysctls to unprivileged users */
2794                if (net->user_ns != &init_user_ns)
2795                        tbl[0].procname = NULL;
2796        }
2797        tbl[0].extra1 = net;
2798
2799        net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2800        if (!net->ipv4.route_hdr)
2801                goto err_reg;
2802        return 0;
2803
2804err_reg:
2805        if (tbl != ipv4_route_flush_table)
2806                kfree(tbl);
2807err_dup:
2808        return -ENOMEM;
2809}
2810
2811static __net_exit void sysctl_route_net_exit(struct net *net)
2812{
2813        struct ctl_table *tbl;
2814
2815        tbl = net->ipv4.route_hdr->ctl_table_arg;
2816        unregister_net_sysctl_table(net->ipv4.route_hdr);
2817        BUG_ON(tbl == ipv4_route_flush_table);
2818        kfree(tbl);
2819}
2820
2821static __net_initdata struct pernet_operations sysctl_route_ops = {
2822        .init = sysctl_route_net_init,
2823        .exit = sysctl_route_net_exit,
2824};
2825#endif
2826
2827static __net_init int rt_genid_init(struct net *net)
2828{
2829        atomic_set(&net->ipv4.rt_genid, 0);
2830        atomic_set(&net->fnhe_genid, 0);
2831        get_random_bytes(&net->ipv4.dev_addr_genid,
2832                         sizeof(net->ipv4.dev_addr_genid));
2833        return 0;
2834}
2835
2836static __net_initdata struct pernet_operations rt_genid_ops = {
2837        .init = rt_genid_init,
2838};
2839
2840static int __net_init ipv4_inetpeer_init(struct net *net)
2841{
2842        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2843
2844        if (!bp)
2845                return -ENOMEM;
2846        inet_peer_base_init(bp);
2847        net->ipv4.peers = bp;
2848        return 0;
2849}
2850
2851static void __net_exit ipv4_inetpeer_exit(struct net *net)
2852{
2853        struct inet_peer_base *bp = net->ipv4.peers;
2854
2855        net->ipv4.peers = NULL;
2856        inetpeer_invalidate_tree(bp);
2857        kfree(bp);
2858}
2859
2860static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2861        .init   =       ipv4_inetpeer_init,
2862        .exit   =       ipv4_inetpeer_exit,
2863};
2864
2865#ifdef CONFIG_IP_ROUTE_CLASSID
2866struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2867#endif /* CONFIG_IP_ROUTE_CLASSID */
2868
2869int __init ip_rt_init(void)
2870{
2871        int rc = 0;
2872        int cpu;
2873
2874        ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2875        if (!ip_idents)
2876                panic("IP: failed to allocate ip_idents\n");
2877
2878        prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2879
2880        ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
2881        if (!ip_tstamps)
2882                panic("IP: failed to allocate ip_tstamps\n");
2883
2884        for_each_possible_cpu(cpu) {
2885                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2886
2887                INIT_LIST_HEAD(&ul->head);
2888                spin_lock_init(&ul->lock);
2889        }
2890#ifdef CONFIG_IP_ROUTE_CLASSID
2891        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2892        if (!ip_rt_acct)
2893                panic("IP: failed to allocate ip_rt_acct\n");
2894#endif
2895
2896        ipv4_dst_ops.kmem_cachep =
2897                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2898                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2899
2900        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2901
2902        if (dst_entries_init(&ipv4_dst_ops) < 0)
2903                panic("IP: failed to allocate ipv4_dst_ops counter\n");
2904
2905        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2906                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2907
2908        ipv4_dst_ops.gc_thresh = ~0;
2909        ip_rt_max_size = INT_MAX;
2910
2911        devinet_init();
2912        ip_fib_init();
2913
2914        if (ip_rt_proc_init())
2915                pr_err("Unable to create route proc files\n");
2916#ifdef CONFIG_XFRM
2917        xfrm_init();
2918        xfrm4_init();
2919#endif
2920        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2921
2922#ifdef CONFIG_SYSCTL
2923        register_pernet_subsys(&sysctl_route_ops);
2924#endif
2925        register_pernet_subsys(&rt_genid_ops);
2926        register_pernet_subsys(&ipv4_inetpeer_ops);
2927        return rc;
2928}
2929
2930#ifdef CONFIG_SYSCTL
2931/*
2932 * We really need to sanitize the damn ipv4 init order, then all
2933 * this nonsense will go away.
2934 */
2935void __init ip_static_sysctl_init(void)
2936{
2937        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2938}
2939#endif
2940