linux/net/ipv4/route.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
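
/* Most of these limits are expressed in jiffies, so their wall-clock
 * value depends on HZ: with HZ=1000, ip_rt_redirect_load is 20 ms worth
 * of jiffies and ip_rt_error_cost is one second's worth.  Note that
 * ip_rt_redirect_silence equals ip_rt_redirect_load shifted left by
 * (ip_rt_redirect_number + 1), just past where the redirect backoff
 * saturates, and ip_rt_min_pmtu is 552 bytes: 512 of payload plus
 * 20-byte IP and TCP headers.  Most of these knobs are exposed as
 * net.ipv4.route.* sysctls.
 */
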
/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
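
/* ip_tos2prio maps the four RFC 1349 TOS bits to a packet scheduler
 * priority band; it is indexed with IPTOS_TOS(tos) >> 1 (see
 * rt_tos2priority() in include/net/route.h), so each adjacent pair of
 * entries covers one class with the old "minimise monetary cost" bit
 * (reused by ECN) either clear or set, which is why the ECN_OR_COST()
 * entries simply alias the same band.
 */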

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = ACCESS_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 new, delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* Do not use atomic_add_return() as it makes UBSAN unhappy */
        do {
                old = (u32)atomic_read(p_id);
                new = old + delta + segs;
        } while (atomic_cmpxchg(p_id, old, new) != old);

        return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
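
/* Illustrative walk-through: suppose this bucket was last used 1000
 * jiffies ago and we now reserve IDs for a 3-segment GSO packet.  The
 * CPU that wins the cmpxchg() on the timestamp adds a random delta in
 * [0, 1000) on top of its segs increment; the atomic loop then advances
 * the counter and returns the pre-advance value as the first ID of the
 * reserved range.  That random bump on idle buckets is what prevents an
 * observer from deriving a packet count from the difference between two
 * sampled IP IDs.
 */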

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        static u32 ip_idents_hashrnd __read_mostly;
        u32 hash, id;

        net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

        hash = jhash_3words((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol ^ net_hash_mix(net),
                            ip_idents_hashrnd);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                rt_free(rt);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                rt_free(rt);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}
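
/* jhash_1word() spreads the destination address over 32 bits and
 * hash_32() folds that down to FNHE_HASH_SHIFT bits, selecting one of
 * the FNHE_HASH_SIZE buckets of the per-nexthop exception table (both
 * constants live in the ip_fib.h header).
 */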

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        unsigned int i;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nh->nh_exceptions);
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nh->nh_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = max(1UL, expires);
                }
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}
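
/* In short: a nexthop exception records per-destination state learned
 * from ICMP, i.e. a redirect gateway and/or a path MTU.  An existing
 * entry for daddr is updated in place; otherwise a new one is created,
 * reusing the oldest entry in the chain once its depth exceeds
 * FNHE_RECLAIM_DEPTH.  On creation, the nexthop's cached routes are
 * marked DST_OBSOLETE_KILL so the next dst_check() forces their users
 * to re-look them up and notice the new exception.
 */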

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                0, jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
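
/* Worked example, assuming HZ=1000: ip_rt_redirect_load is 20 jiffies,
 * so after the k-th redirect to a peer the next one is held back for
 * 20 << k jiffies (40 ms, 80 ms, ... up to ~5.1 s before the ninth).
 * Once ip_rt_redirect_number (9) redirects have gone unanswered we stop
 * entirely, resuming only after ip_rt_redirect_silence (20 << 10
 * jiffies, ~20.5 s) passes without another packet needing a redirect.
 */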

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the time of the last seen
         * redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}
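
/* The limiter above is a token bucket: tokens accrue one per jiffy
 * since rate_last, capped at ip_rt_error_burst (5 * HZ), and each ICMP
 * error sent costs ip_rt_error_cost (HZ) tokens.  Net effect is a
 * sustained rate of about one ICMP error per second per peer, with
 * bursts of up to five.
 */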

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (ipv4_mtu(dst) < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}
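
/* Note the refresh check above: when the reported MTU matches the
 * exception we already hold, the entry is only re-armed after it has
 * burned through at least half of its ip_rt_mtu_expires (10 minute)
 * lifetime, so a steady stream of ICMP_FRAG_NEEDED does not rewrite
 * the exception on every packet.
 */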

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        if (!mark)
                mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPv4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}
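
/* The 40 bytes subtracted above are the 20-byte IPv4 header plus the
 * 20-byte TCP header, and the 65535 - 40 ceiling keeps the advertised
 * MSS within what the 16-bit IP total-length field can carry.
 */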

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        return min_t(unsigned int, mtu, IP_MAX_MTU);
}
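
/* MTU precedence: a live (unexpired) per-route PMTU exception wins,
 * then an explicit RTAX_MTU metric, then the output device's MTU.  In
 * the device-MTU case, a locked MTU metric on a route via a gateway is
 * clamped to the classic 576 bytes every host must accept (RFC 791),
 * and the result is always capped at IP_MAX_MTU.
 */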

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (!(rt->dst.flags & DST_NOCACHE)) {
                        rcu_assign_pointer(*porig, rt);
                        if (orig)
                                rt_free(orig);
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}
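
/* rt_cache_route() is deliberately lock-free: cmpxchg() installs the
 * new route only if the cache slot still holds the value sampled just
 * above.  If another CPU raced in first, we return false and the caller
 * falls back to treating the route as uncached (DST_NOCACHE) instead of
 * retrying.
 */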

struct uncached_list {
        spinlock_t              lock;
        struct list_head        head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

        rt->rt_uncached_list = ul;

        spin_lock_bh(&ul->lock);
        list_add_tail(&rt->rt_uncached, &ul->head);
        spin_unlock_bh(&ul->lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (!list_empty(&rt->rt_uncached)) {
                struct uncached_list *ul = rt->rt_uncached_list;

                spin_lock_bh(&ul->lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&ul->lock);
        }
}

void rt_flush_dev(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct rtable *rt;
        int cpu;

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                spin_lock_bh(&ul->lock);
                list_for_each_entry(rt, &ul->head, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&ul->lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in nexthop exception or
                         * FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache we really need to set it.
                         */
                        rt->dst.flags |= DST_NOCACHE;
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

struct rtable *rt_dst_alloc(struct net_device *dev,
                            unsigned int flags, u16 type,
                            bool nopolicy, bool noxfrm, bool will_cache)
{
        struct rtable *rt;

        rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                       (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
                       (nopolicy ? DST_NOPOLICY : 0) |
                       (noxfrm ? DST_NOXFRM : 0));

        if (rt) {
                rt->rt_genid = rt_genid_ipv4(dev_net(dev));
                rt->rt_flags = flags;
                rt->rt_type = type;
                rt->rt_is_input = 0;
                rt->rt_iif = 0;
                rt->rt_pmtu = 0;
                rt->rt_gateway = 0;
                rt->rt_uses_gateway = 0;
                rt->rt_table_id = 0;
                INIT_LIST_HEAD(&rt->rt_uncached);

                rt->dst.output = ip_output;
                if (flags & RTCF_LOCAL)
                        rt->dst.input = ip_local_deliver;
        }

        return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        struct rtable *rth;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        unsigned int flags = RTCF_MULTICAST;
        u32 itag = 0;
        int err;

        /* Primary sanity checks. */

        if (!in_dev)
                return -EINVAL;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
                goto e_inval;

        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr))
                        goto e_inval;
        } else {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto e_err;
        }
        if (our)
                flags |= RTCF_LOCAL;

        rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
        if (!rth)
                goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->dst.output = ip_rt_bug;
        rth->rt_is_input = 1;

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        skb_dst_set(skb, &rth->dst);
        return 0;

e_nobufs:
        return -ENOBUFS;
e_inval:
        return -EINVAL;
e_err:
        return err;
}


static void ip_handle_martian_source(struct net_device *dev,
                                     struct in_device *in_dev,
                                     struct sk_buff *skb,
                                     __be32 daddr,
                                     __be32 saddr)
{
        RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 *      Per RFC 1812 recommendation: if the source is
                 *      martian, the only hint we have is the MAC header.
                 */
1555                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1556                        &daddr, &saddr, dev->name);
1557                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1558                        print_hex_dump(KERN_WARNING, "ll header: ",
1559                                       DUMP_PREFIX_OFFSET, 16, 1,
1560                                       skb_mac_header(skb),
1561                                       dev->hard_header_len, true);
1562                }
1563        }
1564#endif
1565}
1566
1567static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1568{
1569        struct fnhe_hash_bucket *hash;
1570        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1571        u32 hval = fnhe_hashfun(daddr);
1572
1573        spin_lock_bh(&fnhe_lock);
1574
1575        hash = rcu_dereference_protected(nh->nh_exceptions,
1576                                         lockdep_is_held(&fnhe_lock));
1577        hash += hval;
1578
1579        fnhe_p = &hash->chain;
1580        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1581        while (fnhe) {
1582                if (fnhe->fnhe_daddr == daddr) {
1583                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1584                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1585                        fnhe_flush_routes(fnhe);
1586                        kfree_rcu(fnhe, rcu);
1587                        break;
1588                }
1589                fnhe_p = &fnhe->fnhe_next;
1590                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1591                                                 lockdep_is_held(&fnhe_lock));
1592        }
1593
1594        spin_unlock_bh(&fnhe_lock);
1595}
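/* Editor's note: ip_del_fnhe() above is an instance of the usual pattern for
 * unlinking from an RCU-protected singly linked list: writers serialize on a
 * spinlock, walk the chain through a pointer-to-pointer so head and interior
 * nodes need no special-casing, splice the victim out with
 * rcu_assign_pointer(), and defer the free past the grace period with
 * kfree_rcu().  Distilled to its shape (struct node and match() are
 * hypothetical):
 *
 *	spin_lock_bh(&lock);
 *	np = &head;
 *	while ((n = rcu_dereference_protected(*np,
 *					      lockdep_is_held(&lock)))) {
 *		if (match(n)) {
 *			rcu_assign_pointer(*np, n->next);
 *			kfree_rcu(n, rcu);
 *			break;
 *		}
 *		np = &n->next;
 *	}
 *	spin_unlock_bh(&lock);
 */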
1596
1597/* called in rcu_read_lock() section */
1598static int __mkroute_input(struct sk_buff *skb,
1599                           const struct fib_result *res,
1600                           struct in_device *in_dev,
1601                           __be32 daddr, __be32 saddr, u32 tos)
1602{
1603        struct fib_nh_exception *fnhe;
1604        struct rtable *rth;
1605        int err;
1606        struct in_device *out_dev;
1607        bool do_cache;
1608        u32 itag = 0;
1609
1610        /* get a working reference to the output device */
1611        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1612        if (!out_dev) {
1613                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1614                return -EINVAL;
1615        }
1616
1617        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1618                                  in_dev->dev, in_dev, &itag);
1619        if (err < 0) {
1620                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1621                                         saddr);
1622
1623                goto cleanup;
1624        }
1625
1626        do_cache = res->fi && !itag;
1627        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1628            skb->protocol == htons(ETH_P_IP) &&
1629            (IN_DEV_SHARED_MEDIA(out_dev) ||
1630             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1631                IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1632
1633        if (skb->protocol != htons(ETH_P_IP)) {
1634                /* Not IP (i.e. ARP). Do not create a route if it is
1635                 * invalid for proxy arp. DNAT routes are always valid.
1636                 *
1637                 * The proxy arp feature has been extended to allow ARP
1638                 * replies back out the same interface, to support
1639                 * Private VLAN switch technologies. See arp.c.
1640                 */
1641                if (out_dev == in_dev &&
1642                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1643                        err = -EINVAL;
1644                        goto cleanup;
1645                }
1646        }
1647
1648        fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1649        if (do_cache) {
1650                if (fnhe) {
1651                        rth = rcu_dereference(fnhe->fnhe_rth_input);
1652                        if (rth && rth->dst.expires &&
1653                            time_after(jiffies, rth->dst.expires)) {
1654                                ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1655                                fnhe = NULL;
1656                        } else {
1657                                goto rt_cache;
1658                        }
1659                }
1660
1661                rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1662
1663rt_cache:
1664                if (rt_cache_valid(rth)) {
1665                        skb_dst_set_noref(skb, &rth->dst);
1666                        goto out;
1667                }
1668        }
1669
1670        rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1671                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
1672                           IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1673        if (!rth) {
1674                err = -ENOBUFS;
1675                goto cleanup;
1676        }
1677
1678        rth->rt_is_input = 1;
1679        if (res->table)
1680                rth->rt_table_id = res->table->tb_id;
1681        RT_CACHE_STAT_INC(in_slow_tot);
1682
1683        rth->dst.input = ip_forward;
1684
1685        rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1686        if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1687                rth->dst.lwtstate->orig_output = rth->dst.output;
1688                rth->dst.output = lwtunnel_output;
1689        }
1690        if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1691                rth->dst.lwtstate->orig_input = rth->dst.input;
1692                rth->dst.input = lwtunnel_input;
1693        }
1694        skb_dst_set(skb, &rth->dst);
1695out:
1696        err = 0;
1697 cleanup:
1698        return err;
1699}
1700
1701#ifdef CONFIG_IP_ROUTE_MULTIPATH
1702
1703/* To make ICMP error packets follow the flow that triggered them, the
1704 * multipath hash is calculated from the inner IP addresses in reverse order.
1705 */
1706static int ip_multipath_icmp_hash(struct sk_buff *skb)
1707{
1708        const struct iphdr *outer_iph = ip_hdr(skb);
1709        struct icmphdr _icmph;
1710        const struct icmphdr *icmph;
1711        struct iphdr _inner_iph;
1712        const struct iphdr *inner_iph;
1713
1714        if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1715                goto standard_hash;
1716
1717        icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1718                                   &_icmph);
1719        if (!icmph)
1720                goto standard_hash;
1721
1722        if (icmph->type != ICMP_DEST_UNREACH &&
1723            icmph->type != ICMP_REDIRECT &&
1724            icmph->type != ICMP_TIME_EXCEEDED &&
1725            icmph->type != ICMP_PARAMETERPROB) {
1726                goto standard_hash;
1727        }
1728
1729        inner_iph = skb_header_pointer(skb,
1730                                       outer_iph->ihl * 4 + sizeof(_icmph),
1731                                       sizeof(_inner_iph), &_inner_iph);
1732        if (!inner_iph)
1733                goto standard_hash;
1734
1735        return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
1736
1737standard_hash:
1738        return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
1739}
1740
1741#endif /* CONFIG_IP_ROUTE_MULTIPATH */
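/* Editor's note: a worked example of the reversed hash (assumed topology, not
 * from the source).  Host A reaches host B through this multipath router:
 * forward packets hash as fib_multipath_hash(A, B), and B's return traffic as
 * fib_multipath_hash(B, A).  When a downstream router R emits an ICMP error
 * back toward A, the outer header is R -> A but the embedded header is the
 * original A -> B (inner saddr = A, inner daddr = B).  Hashing the inner
 * addresses in reverse order gives fib_multipath_hash(B, A) -- the same
 * nexthop as the B -> A flow -- so the error follows the flow's path instead
 * of being scattered by the unrelated (R, A) pair.
 */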
1742
1743static int ip_mkroute_input(struct sk_buff *skb,
1744                            struct fib_result *res,
1745                            const struct flowi4 *fl4,
1746                            struct in_device *in_dev,
1747                            __be32 daddr, __be32 saddr, u32 tos)
1748{
1749#ifdef CONFIG_IP_ROUTE_MULTIPATH
1750        if (res->fi && res->fi->fib_nhs > 1) {
1751                int h;
1752
1753                if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
1754                        h = ip_multipath_icmp_hash(skb);
1755                else
1756                        h = fib_multipath_hash(saddr, daddr);
1757                fib_select_multipath(res, h);
1758        }
1759#endif
1760
1761        /* create a routing cache entry */
1762        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1763}
1764
1765/*
1766 *      NOTE. We drop all packets that have a local source
1767 *      address, because every properly looped-back packet must
1768 *      already have the correct destination attached by the output routine.
1769 *
1770 *      This approach solves two big problems:
1771 *      1. Non-simplex devices are handled properly.
1772 *      2. IP spoofing attempts are filtered with a 100% guarantee.
1773 *      Called with rcu_read_lock().
1774 */
1775
1776static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1777                               u8 tos, struct net_device *dev)
1778{
1779        struct fib_result res;
1780        struct in_device *in_dev = __in_dev_get_rcu(dev);
1781        struct ip_tunnel_info *tun_info;
1782        struct flowi4   fl4;
1783        unsigned int    flags = 0;
1784        u32             itag = 0;
1785        struct rtable   *rth;
1786        int             err = -EINVAL;
1787        struct net    *net = dev_net(dev);
1788        bool do_cache;
1789
1790        /* IP on this device is disabled. */
1791
1792        if (!in_dev)
1793                goto out;
1794
1795        /* Check for the weirdest martians, which cannot be detected
1796           by fib_lookup.
1797         */
1798
1799        tun_info = skb_tunnel_info(skb);
1800        if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1801                fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1802        else
1803                fl4.flowi4_tun_key.tun_id = 0;
1804        skb_dst_drop(skb);
1805
1806        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1807                goto martian_source;
1808
1809        res.fi = NULL;
1810        res.table = NULL;
1811        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1812                goto brd_input;
1813
1814        /* Accept zero addresses only for limited broadcast;
1815         * I do not even know whether to fix this or not. Waiting for complaints :-)
1816         */
1817        if (ipv4_is_zeronet(saddr))
1818                goto martian_source;
1819
1820        if (ipv4_is_zeronet(daddr))
1821                goto martian_destination;
1822
1823        /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1824         * invoking it at most once, when daddr and/or saddr is a loopback address.
1825         */
1826        if (ipv4_is_loopback(daddr)) {
1827                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1828                        goto martian_destination;
1829        } else if (ipv4_is_loopback(saddr)) {
1830                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1831                        goto martian_source;
1832        }
1833
1834        /*
1835         *      Now we are ready to route packet.
1836         */
1837        fl4.flowi4_oif = 0;
1838        fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev);
1839        fl4.flowi4_mark = skb->mark;
1840        fl4.flowi4_tos = tos;
1841        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1842        fl4.flowi4_flags = 0;
1843        fl4.daddr = daddr;
1844        fl4.saddr = saddr;
1845        err = fib_lookup(net, &fl4, &res, 0);
1846        if (err != 0) {
1847                if (!IN_DEV_FORWARD(in_dev))
1848                        err = -EHOSTUNREACH;
1849                goto no_route;
1850        }
1851
1852        if (res.type == RTN_BROADCAST)
1853                goto brd_input;
1854
1855        if (res.type == RTN_LOCAL) {
1856                err = fib_validate_source(skb, saddr, daddr, tos,
1857                                          0, dev, in_dev, &itag);
1858                if (err < 0)
1859                        goto martian_source;
1860                goto local_input;
1861        }
1862
1863        if (!IN_DEV_FORWARD(in_dev)) {
1864                err = -EHOSTUNREACH;
1865                goto no_route;
1866        }
1867        if (res.type != RTN_UNICAST)
1868                goto martian_destination;
1869
1870        err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1871out:    return err;
1872
1873brd_input:
1874        if (skb->protocol != htons(ETH_P_IP))
1875                goto e_inval;
1876
1877        if (!ipv4_is_zeronet(saddr)) {
1878                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1879                                          in_dev, &itag);
1880                if (err < 0)
1881                        goto martian_source;
1882        }
1883        flags |= RTCF_BROADCAST;
1884        res.type = RTN_BROADCAST;
1885        RT_CACHE_STAT_INC(in_brd);
1886
1887local_input:
1888        do_cache = false;
1889        if (res.fi) {
1890                if (!itag) {
1891                        rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1892                        if (rt_cache_valid(rth)) {
1893                                skb_dst_set_noref(skb, &rth->dst);
1894                                err = 0;
1895                                goto out;
1896                        }
1897                        do_cache = true;
1898                }
1899        }
1900
1901        rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
1902                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1903        if (!rth)
1904                goto e_nobufs;
1905
1906        rth->dst.output = ip_rt_bug;
1907#ifdef CONFIG_IP_ROUTE_CLASSID
1908        rth->dst.tclassid = itag;
1909#endif
1910        rth->rt_is_input = 1;
1911        if (res.table)
1912                rth->rt_table_id = res.table->tb_id;
1913
1914        RT_CACHE_STAT_INC(in_slow_tot);
1915        if (res.type == RTN_UNREACHABLE) {
1916                rth->dst.input = ip_error;
1917                rth->dst.error = -err;
1918                rth->rt_flags   &= ~RTCF_LOCAL;
1919        }
1920        if (do_cache) {
1921                if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1922                        rth->dst.flags |= DST_NOCACHE;
1923                        rt_add_uncached_list(rth);
1924                }
1925        }
1926        skb_dst_set(skb, &rth->dst);
1927        err = 0;
1928        goto out;
1929
1930no_route:
1931        RT_CACHE_STAT_INC(in_no_route);
1932        res.type = RTN_UNREACHABLE;
1933        res.fi = NULL;
1934        res.table = NULL;
1935        goto local_input;
1936
1937        /*
1938         *      Do not cache martian addresses: they should be logged (RFC1812)
1939         */
1940martian_destination:
1941        RT_CACHE_STAT_INC(in_martian_dst);
1942#ifdef CONFIG_IP_ROUTE_VERBOSE
1943        if (IN_DEV_LOG_MARTIANS(in_dev))
1944                net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1945                                     &daddr, &saddr, dev->name);
1946#endif
1947
1948e_inval:
1949        err = -EINVAL;
1950        goto out;
1951
1952e_nobufs:
1953        err = -ENOBUFS;
1954        goto out;
1955
1956martian_source:
1957        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1958        goto out;
1959}
1960
1961int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1962                         u8 tos, struct net_device *dev)
1963{
1964        int res;
1965
1966        rcu_read_lock();
1967
1968        /* Multicast recognition logic was moved here from the route cache.
1969           The problem was that too many Ethernet cards have broken/missing
1970           hardware multicast filters :-( As a result, a host on a multicast
1971           network acquired a lot of useless route cache entries, one per
1972           SDR message from all over the world. Now we try to get rid of them.
1973           Really, provided the software IP multicast filter is organized
1974           reasonably (at least, hashed), it does not cause a slowdown
1975           compared with route cache reject entries.
1976           Note that multicast routers are not affected, because a
1977           route cache entry is created for them eventually.
1978         */
1979        if (ipv4_is_multicast(daddr)) {
1980                struct in_device *in_dev = __in_dev_get_rcu(dev);
1981
1982                if (in_dev) {
1983                        int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1984                                                  ip_hdr(skb)->protocol);
1985                        if (our
1986#ifdef CONFIG_IP_MROUTE
1987                                ||
1988                            (!ipv4_is_local_multicast(daddr) &&
1989                             IN_DEV_MFORWARD(in_dev))
1990#endif
1991                           ) {
1992                                int res = ip_route_input_mc(skb, daddr, saddr,
1993                                                            tos, dev, our);
1994                                rcu_read_unlock();
1995                                return res;
1996                        }
1997                }
1998                rcu_read_unlock();
1999                return -EINVAL;
2000        }
2001        res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2002        rcu_read_unlock();
2003        return res;
2004}
2005EXPORT_SYMBOL(ip_route_input_noref);
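/* Editor's note: a sketch of the usual call site for the input path, in the
 * style of ip_rcv_finish() in net/ipv4/ip_input.c (error handling trimmed):
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err;
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 *	return dst_input(skb);
 *
 * dst_input() then invokes whichever handler the lookup installed in
 * rt->dst.input (ip_local_deliver, ip_forward, ip_mr_input, ...).
 */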
2006
2007/* called with rcu_read_lock() */
2008static struct rtable *__mkroute_output(const struct fib_result *res,
2009                                       const struct flowi4 *fl4, int orig_oif,
2010                                       struct net_device *dev_out,
2011                                       unsigned int flags)
2012{
2013        struct fib_info *fi = res->fi;
2014        struct fib_nh_exception *fnhe;
2015        struct in_device *in_dev;
2016        u16 type = res->type;
2017        struct rtable *rth;
2018        bool do_cache;
2019
2020        in_dev = __in_dev_get_rcu(dev_out);
2021        if (!in_dev)
2022                return ERR_PTR(-EINVAL);
2023
2024        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2025                if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2026                        return ERR_PTR(-EINVAL);
2027
2028        if (ipv4_is_lbcast(fl4->daddr))
2029                type = RTN_BROADCAST;
2030        else if (ipv4_is_multicast(fl4->daddr))
2031                type = RTN_MULTICAST;
2032        else if (ipv4_is_zeronet(fl4->daddr))
2033                return ERR_PTR(-EINVAL);
2034
2035        if (dev_out->flags & IFF_LOOPBACK)
2036                flags |= RTCF_LOCAL;
2037
2038        do_cache = true;
2039        if (type == RTN_BROADCAST) {
2040                flags |= RTCF_BROADCAST | RTCF_LOCAL;
2041                fi = NULL;
2042        } else if (type == RTN_MULTICAST) {
2043                flags |= RTCF_MULTICAST | RTCF_LOCAL;
2044                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2045                                     fl4->flowi4_proto))
2046                        flags &= ~RTCF_LOCAL;
2047                else
2048                        do_cache = false;
2049                /* If a multicast route does not exist, use
2050                 * the default one, but do not use a gateway in this case.
2051                 * Yes, it is a hack.
2052                 */
2053                if (fi && res->prefixlen < 4)
2054                        fi = NULL;
2055        } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2056                   (orig_oif != dev_out->ifindex)) {
2057                /* For local routes that require a particular output interface
2058                 * we do not want to cache the result.  Caching the result
2059                 * causes incorrect behaviour when there are multiple source
2060                 * addresses on the interface: if the intended recipient is
2061                 * waiting on that interface for the packet, it will never
2062                 * arrive, because it will be delivered on the loopback
2063                 * interface instead, and the IP_PKTINFO ipi_ifindex will
2064                 * be set to the loopback interface as well.
2065                 */
2066                fi = NULL;
2067        }
2068
2069        fnhe = NULL;
2070        do_cache &= fi != NULL;
2071        if (do_cache) {
2072                struct rtable __rcu **prth;
2073                struct fib_nh *nh = &FIB_RES_NH(*res);
2074
2075                fnhe = find_exception(nh, fl4->daddr);
2076                if (fnhe) {
2077                        prth = &fnhe->fnhe_rth_output;
2078                        rth = rcu_dereference(*prth);
2079                        if (rth && rth->dst.expires &&
2080                            time_after(jiffies, rth->dst.expires)) {
2081                                ip_del_fnhe(nh, fl4->daddr);
2082                                fnhe = NULL;
2083                        } else {
2084                                goto rt_cache;
2085                        }
2086                }
2087
2088                if (unlikely(fl4->flowi4_flags &
2089                             FLOWI_FLAG_KNOWN_NH &&
2090                             !(nh->nh_gw &&
2091                               nh->nh_scope == RT_SCOPE_LINK))) {
2092                        do_cache = false;
2093                        goto add;
2094                }
2095                prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2096                rth = rcu_dereference(*prth);
2097
2098rt_cache:
2099                if (rt_cache_valid(rth)) {
2100                        dst_hold(&rth->dst);
2101                        return rth;
2102                }
2103        }
2104
2105add:
2106        rth = rt_dst_alloc(dev_out, flags, type,
2107                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
2108                           IN_DEV_CONF_GET(in_dev, NOXFRM),
2109                           do_cache);
2110        if (!rth)
2111                return ERR_PTR(-ENOBUFS);
2112
2113        rth->rt_iif     = orig_oif ? : 0;
2114        if (res->table)
2115                rth->rt_table_id = res->table->tb_id;
2116
2117        RT_CACHE_STAT_INC(out_slow_tot);
2118
2119        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2120                if (flags & RTCF_LOCAL &&
2121                    !(dev_out->flags & IFF_LOOPBACK)) {
2122                        rth->dst.output = ip_mc_output;
2123                        RT_CACHE_STAT_INC(out_slow_mc);
2124                }
2125#ifdef CONFIG_IP_MROUTE
2126                if (type == RTN_MULTICAST) {
2127                        if (IN_DEV_MFORWARD(in_dev) &&
2128                            !ipv4_is_local_multicast(fl4->daddr)) {
2129                                rth->dst.input = ip_mr_input;
2130                                rth->dst.output = ip_mc_output;
2131                        }
2132                }
2133#endif
2134        }
2135
2136        rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2137        if (lwtunnel_output_redirect(rth->dst.lwtstate))
2138                rth->dst.output = lwtunnel_output;
2139
2140        return rth;
2141}
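/* Editor's note: __mkroute_output() above consults up to three places before
 * allocating: a matching fib_nh_exception's fnhe_rth_output (per-destination
 * state such as a learned PMTU or a redirect), then the nexthop's per-cpu
 * nh_pcpu_rth_output, and only if neither holds a valid entry does it fall
 * through to rt_dst_alloc().  Routes that cannot be shared (multicast local
 * delivery, oif-pinned local routes, FLOWI_FLAG_KNOWN_NH without an on-link
 * gateway) are built with do_cache == false and later land on the uncached
 * list instead.
 */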
2142
2143/*
2144 * Major route resolver routine.
2145 */
2146
2147struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2148                                          int mp_hash)
2149{
2150        struct net_device *dev_out = NULL;
2151        __u8 tos = RT_FL_TOS(fl4);
2152        unsigned int flags = 0;
2153        struct fib_result res;
2154        struct rtable *rth;
2155        int master_idx;
2156        int orig_oif;
2157        int err = -ENETUNREACH;
2158
2159        res.tclassid    = 0;
2160        res.fi          = NULL;
2161        res.table       = NULL;
2162
2163        orig_oif = fl4->flowi4_oif;
2164
2165        master_idx = l3mdev_master_ifindex_by_index(net, fl4->flowi4_oif);
2166        if (master_idx)
2167                fl4->flowi4_oif = master_idx;
2168        fl4->flowi4_iif = LOOPBACK_IFINDEX;
2169        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2170        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2171                         RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2172
2173        rcu_read_lock();
2174        if (fl4->saddr) {
2175                rth = ERR_PTR(-EINVAL);
2176                if (ipv4_is_multicast(fl4->saddr) ||
2177                    ipv4_is_lbcast(fl4->saddr) ||
2178                    ipv4_is_zeronet(fl4->saddr))
2179                        goto out;
2180
2181                /* I removed the check for oif == dev_out->oif here.
2182                   It was wrong for two reasons:
2183                   1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
2184                      is assigned to multiple interfaces.
2185                   2. Moreover, we are allowed to send packets with the saddr
2186                      of another iface. --ANK
2187                 */
2188
2189                if (fl4->flowi4_oif == 0 &&
2190                    (ipv4_is_multicast(fl4->daddr) ||
2191                     ipv4_is_lbcast(fl4->daddr))) {
2192                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2193                        dev_out = __ip_dev_find(net, fl4->saddr, false);
2194                        if (!dev_out)
2195                                goto out;
2196
2197                        /* Special hack: the user can direct multicasts
2198                           and limited broadcasts via the necessary interface
2199                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2200                           This hack is not just for fun; it allows
2201                           vic, vat and friends to work.
2202                           They bind a socket to loopback, set the ttl to zero
2203                           and expect that it will work.
2204                           From the viewpoint of the routing cache they are broken,
2205                           because we are not allowed to build a multicast path
2206                           with a loopback source addr (look, the routing cache
2207                           cannot know that the ttl is zero, so the packet
2208                           will never leave this host and the route is valid).
2209                           Luckily, this hack is a good workaround.
2210                         */
2211
2212                        fl4->flowi4_oif = dev_out->ifindex;
2213                        goto make_route;
2214                }
2215
2216                if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2217                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2218                        if (!__ip_dev_find(net, fl4->saddr, false))
2219                                goto out;
2220                }
2221        }
2222
2223
2224        if (fl4->flowi4_oif) {
2225                dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2226                rth = ERR_PTR(-ENODEV);
2227                if (!dev_out)
2228                        goto out;
2229
2230                /* RACE: Check return value of inet_select_addr instead. */
2231                if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2232                        rth = ERR_PTR(-ENETUNREACH);
2233                        goto out;
2234                }
2235                if (ipv4_is_local_multicast(fl4->daddr) ||
2236                    ipv4_is_lbcast(fl4->daddr) ||
2237                    fl4->flowi4_proto == IPPROTO_IGMP) {
2238                        if (!fl4->saddr)
2239                                fl4->saddr = inet_select_addr(dev_out, 0,
2240                                                              RT_SCOPE_LINK);
2241                        goto make_route;
2242                }
2243                if (!fl4->saddr) {
2244                        if (ipv4_is_multicast(fl4->daddr))
2245                                fl4->saddr = inet_select_addr(dev_out, 0,
2246                                                              fl4->flowi4_scope);
2247                        else if (!fl4->daddr)
2248                                fl4->saddr = inet_select_addr(dev_out, 0,
2249                                                              RT_SCOPE_HOST);
2250                }
2251
2252                rth = l3mdev_get_rtable(dev_out, fl4);
2253                if (rth)
2254                        goto out;
2255        }
2256
2257        if (!fl4->daddr) {
2258                fl4->daddr = fl4->saddr;
2259                if (!fl4->daddr)
2260                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2261                dev_out = net->loopback_dev;
2262                fl4->flowi4_oif = LOOPBACK_IFINDEX;
2263                res.type = RTN_LOCAL;
2264                flags |= RTCF_LOCAL;
2265                goto make_route;
2266        }
2267
2268        err = fib_lookup(net, fl4, &res, 0);
2269        if (err) {
2270                res.fi = NULL;
2271                res.table = NULL;
2272                if (fl4->flowi4_oif &&
2273                    !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
2274                        /* Apparently, the routing tables are wrong. Assume
2275                           that the destination is on-link.
2276
2277                           WHY? DW.
2278                           Because we are allowed to send to an iface
2279                           even if it has NO routes and NO assigned
2280                           addresses. When oif is specified, the routing
2281                           tables are looked up with only one purpose:
2282                           to catch whether the destination is gatewayed rather
2283                           than direct. Moreover, if MSG_DONTROUTE is set,
2284                           we send the packet, ignoring both the routing tables
2285                           and the ifaddr state. --ANK
2286
2287
2288                           We could do this even if oif is unknown
2289                           (as IPv6 likely does), but we do not.
2290                         */
2291
2292                        if (fl4->saddr == 0)
2293                                fl4->saddr = inet_select_addr(dev_out, 0,
2294                                                              RT_SCOPE_LINK);
2295                        res.type = RTN_UNICAST;
2296                        goto make_route;
2297                }
2298                rth = ERR_PTR(err);
2299                goto out;
2300        }
2301
2302        if (res.type == RTN_LOCAL) {
2303                if (!fl4->saddr) {
2304                        if (res.fi->fib_prefsrc)
2305                                fl4->saddr = res.fi->fib_prefsrc;
2306                        else
2307                                fl4->saddr = fl4->daddr;
2308                }
2309                dev_out = net->loopback_dev;
2310                fl4->flowi4_oif = dev_out->ifindex;
2311                flags |= RTCF_LOCAL;
2312                goto make_route;
2313        }
2314
2315        fib_select_path(net, &res, fl4, mp_hash);
2316
2317        dev_out = FIB_RES_DEV(res);
2318        fl4->flowi4_oif = dev_out->ifindex;
2319
2320
2321make_route:
2322        rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2323
2324out:
2325        rcu_read_unlock();
2326        return rth;
2327}
2328EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
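/* Editor's note: a minimal output-lookup sketch via the ip_route_output_key()
 * wrapper from include/net/route.h, which resolves to the function above;
 * daddr, saddr, oif and tos are placeholders:
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= daddr,
 *		.saddr		= saddr,
 *		.flowi4_oif	= oif,
 *		.flowi4_tos	= RT_TOS(tos),
 *	};
 *	struct rtable *rt;
 *
 *	rt = ip_route_output_key(net, &fl4);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...use rt, then drop the reference...
 *	ip_rt_put(rt);
 */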
2329
2330static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2331{
2332        return NULL;
2333}
2334
2335static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2336{
2337        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2338
2339        return mtu ? : dst->dev->mtu;
2340}
2341
2342static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2343                                          struct sk_buff *skb, u32 mtu)
2344{
2345}
2346
2347static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2348                                       struct sk_buff *skb)
2349{
2350}
2351
2352static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2353                                          unsigned long old)
2354{
2355        return NULL;
2356}
2357
2358static struct dst_ops ipv4_dst_blackhole_ops = {
2359        .family                 =       AF_INET,
2360        .check                  =       ipv4_blackhole_dst_check,
2361        .mtu                    =       ipv4_blackhole_mtu,
2362        .default_advmss         =       ipv4_default_advmss,
2363        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2364        .redirect               =       ipv4_rt_blackhole_redirect,
2365        .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2366        .neigh_lookup           =       ipv4_neigh_lookup,
2367};
2368
2369struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2370{
2371        struct rtable *ort = (struct rtable *) dst_orig;
2372        struct rtable *rt;
2373
2374        rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2375        if (rt) {
2376                struct dst_entry *new = &rt->dst;
2377
2378                new->__use = 1;
2379                new->input = dst_discard;
2380                new->output = dst_discard_out;
2381
2382                new->dev = ort->dst.dev;
2383                if (new->dev)
2384                        dev_hold(new->dev);
2385
2386                rt->rt_is_input = ort->rt_is_input;
2387                rt->rt_iif = ort->rt_iif;
2388                rt->rt_pmtu = ort->rt_pmtu;
2389
2390                rt->rt_genid = rt_genid_ipv4(net);
2391                rt->rt_flags = ort->rt_flags;
2392                rt->rt_type = ort->rt_type;
2393                rt->rt_gateway = ort->rt_gateway;
2394                rt->rt_uses_gateway = ort->rt_uses_gateway;
2395
2396                INIT_LIST_HEAD(&rt->rt_uncached);
2397                dst_free(new);
2398        }
2399
2400        dst_release(dst_orig);
2401
2402        return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2403}
2404
2405struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2406                                    const struct sock *sk)
2407{
2408        struct rtable *rt = __ip_route_output_key(net, flp4);
2409
2410        if (IS_ERR(rt))
2411                return rt;
2412
2413        if (flp4->flowi4_proto)
2414                rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2415                                                        flowi4_to_flowi(flp4),
2416                                                        sk, 0);
2417
2418        return rt;
2419}
2420EXPORT_SYMBOL_GPL(ip_route_output_flow);
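/* Editor's note: ip_route_output_flow() is the socket-aware variant; when
 * flowi4_proto is set, the result is passed through xfrm_lookup_route() so
 * IPsec policy may transform or blackhole it.  A hedged sketch in the style
 * of the datagram connect paths (the locals are placeholders):
 *
 *	flowi4_init_output(&fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
 *			   IPPROTO_UDP, flow_flags, daddr, saddr, dport, sport);
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */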
2421
2422static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
2423                        struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2424                        u32 seq, int event, int nowait, unsigned int flags)
2425{
2426        struct rtable *rt = skb_rtable(skb);
2427        struct rtmsg *r;
2428        struct nlmsghdr *nlh;
2429        unsigned long expires = 0;
2430        u32 error;
2431        u32 metrics[RTAX_MAX];
2432
2433        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2434        if (!nlh)
2435                return -EMSGSIZE;
2436
2437        r = nlmsg_data(nlh);
2438        r->rtm_family    = AF_INET;
2439        r->rtm_dst_len  = 32;
2440        r->rtm_src_len  = 0;
2441        r->rtm_tos      = fl4->flowi4_tos;
2442        r->rtm_table    = table_id;
2443        if (nla_put_u32(skb, RTA_TABLE, table_id))
2444                goto nla_put_failure;
2445        r->rtm_type     = rt->rt_type;
2446        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2447        r->rtm_protocol = RTPROT_UNSPEC;
2448        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2449        if (rt->rt_flags & RTCF_NOTIFY)
2450                r->rtm_flags |= RTM_F_NOTIFY;
2451        if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2452                r->rtm_flags |= RTCF_DOREDIRECT;
2453
2454        if (nla_put_in_addr(skb, RTA_DST, dst))
2455                goto nla_put_failure;
2456        if (src) {
2457                r->rtm_src_len = 32;
2458                if (nla_put_in_addr(skb, RTA_SRC, src))
2459                        goto nla_put_failure;
2460        }
2461        if (rt->dst.dev &&
2462            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2463                goto nla_put_failure;
2464#ifdef CONFIG_IP_ROUTE_CLASSID
2465        if (rt->dst.tclassid &&
2466            nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2467                goto nla_put_failure;
2468#endif
2469        if (!rt_is_input_route(rt) &&
2470            fl4->saddr != src) {
2471                if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2472                        goto nla_put_failure;
2473        }
2474        if (rt->rt_uses_gateway &&
2475            nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2476                goto nla_put_failure;
2477
2478        expires = rt->dst.expires;
2479        if (expires) {
2480                unsigned long now = jiffies;
2481
2482                if (time_before(now, expires))
2483                        expires -= now;
2484                else
2485                        expires = 0;
2486        }
2487
2488        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2489        if (rt->rt_pmtu && expires)
2490                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2491        if (rtnetlink_put_metrics(skb, metrics) < 0)
2492                goto nla_put_failure;
2493
2494        if (fl4->flowi4_mark &&
2495            nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2496                goto nla_put_failure;
2497
2498        error = rt->dst.error;
2499
2500        if (rt_is_input_route(rt)) {
2501#ifdef CONFIG_IP_MROUTE
2502                if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2503                    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2504                        int err = ipmr_get_route(net, skb,
2505                                                 fl4->saddr, fl4->daddr,
2506                                                 r, nowait, portid);
2507
2508                        if (err <= 0) {
2509                                if (!nowait) {
2510                                        if (err == 0)
2511                                                return 0;
2512                                        goto nla_put_failure;
2513                                } else {
2514                                        if (err == -EMSGSIZE)
2515                                                goto nla_put_failure;
2516                                        error = err;
2517                                }
2518                        }
2519                } else
2520#endif
2521                        if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2522                                goto nla_put_failure;
2523        }
2524
2525        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2526                goto nla_put_failure;
2527
2528        nlmsg_end(skb, nlh);
2529        return 0;
2530
2531nla_put_failure:
2532        nlmsg_cancel(skb, nlh);
2533        return -EMSGSIZE;
2534}
2535
2536static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2537{
2538        struct net *net = sock_net(in_skb->sk);
2539        struct rtmsg *rtm;
2540        struct nlattr *tb[RTA_MAX+1];
2541        struct rtable *rt = NULL;
2542        struct flowi4 fl4;
2543        __be32 dst = 0;
2544        __be32 src = 0;
2545        u32 iif;
2546        int err;
2547        int mark;
2548        struct sk_buff *skb;
2549        u32 table_id = RT_TABLE_MAIN;
2550
2551        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2552        if (err < 0)
2553                goto errout;
2554
2555        rtm = nlmsg_data(nlh);
2556
2557        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2558        if (!skb) {
2559                err = -ENOBUFS;
2560                goto errout;
2561        }
2562
2563        /* Reserve room for dummy headers; this skb can pass
2564           through a good chunk of the routing engine.
2565         */
2566        skb_reset_mac_header(skb);
2567        skb_reset_network_header(skb);
2568
2569        /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2570        ip_hdr(skb)->protocol = IPPROTO_ICMP;
2571        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2572
2573        src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2574        dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2575        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2576        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2577
2578        memset(&fl4, 0, sizeof(fl4));
2579        fl4.daddr = dst;
2580        fl4.saddr = src;
2581        fl4.flowi4_tos = rtm->rtm_tos;
2582        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2583        fl4.flowi4_mark = mark;
2584
2585        if (netif_index_is_l3_master(net, fl4.flowi4_oif))
2586                fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF;
2587
2588        if (iif) {
2589                struct net_device *dev;
2590
2591                dev = __dev_get_by_index(net, iif);
2592                if (!dev) {
2593                        err = -ENODEV;
2594                        goto errout_free;
2595                }
2596
2597                skb->protocol   = htons(ETH_P_IP);
2598                skb->dev        = dev;
2599                skb->mark       = mark;
2600                local_bh_disable();
2601                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2602                local_bh_enable();
2603
2604                rt = skb_rtable(skb);
2605                if (err == 0 && rt->dst.error)
2606                        err = -rt->dst.error;
2607        } else {
2608                rt = ip_route_output_key(net, &fl4);
2609
2610                err = 0;
2611                if (IS_ERR(rt))
2612                        err = PTR_ERR(rt);
2613        }
2614
2615        if (err)
2616                goto errout_free;
2617
2618        skb_dst_set(skb, &rt->dst);
2619        if (rtm->rtm_flags & RTM_F_NOTIFY)
2620                rt->rt_flags |= RTCF_NOTIFY;
2621
2622        if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2623                table_id = rt->rt_table_id;
2624
2625        err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2626                           NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2627                           RTM_NEWROUTE, 0, 0);
2628        if (err < 0)
2629                goto errout_free;
2630
2631        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2632errout:
2633        return err;
2634
2635errout_free:
2636        kfree_skb(skb);
2637        goto errout;
2638}
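/* Editor's note: inet_rtm_getroute() above is the kernel side of
 * "ip route get".  A request is a netlink RTM_GETROUTE message carrying an
 * rtmsg header plus RTA_DST (and optionally RTA_SRC, RTA_IIF, RTA_OIF,
 * RTA_MARK) attributes -- exactly the ones parsed above.  Supplying RTA_IIF
 * exercises the input path via ip_route_input(); omitting it exercises the
 * output path; either way the reply is a single RTM_NEWROUTE message built
 * by rt_fill_info().  For example (iproute2 syntax, illustrative):
 *
 *	ip route get 192.0.2.1 from 198.51.100.1 iif eth0
 */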
2639
2640void ip_rt_multicast_event(struct in_device *in_dev)
2641{
2642        rt_cache_flush(dev_net(in_dev->dev));
2643}
2644
2645#ifdef CONFIG_SYSCTL
2646static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2647static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2648static int ip_rt_gc_elasticity __read_mostly    = 8;
2649
2650static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2651                                        void __user *buffer,
2652                                        size_t *lenp, loff_t *ppos)
2653{
2654        struct net *net = (struct net *)__ctl->extra1;
2655
2656        if (write) {
2657                rt_cache_flush(net);
2658                fnhe_genid_bump(net);
2659                return 0;
2660        }
2661
2662        return -EINVAL;
2663}
2664
2665static struct ctl_table ipv4_route_table[] = {
2666        {
2667                .procname       = "gc_thresh",
2668                .data           = &ipv4_dst_ops.gc_thresh,
2669                .maxlen         = sizeof(int),
2670                .mode           = 0644,
2671                .proc_handler   = proc_dointvec,
2672        },
2673        {
2674                .procname       = "max_size",
2675                .data           = &ip_rt_max_size,
2676                .maxlen         = sizeof(int),
2677                .mode           = 0644,
2678                .proc_handler   = proc_dointvec,
2679        },
2680        {
2681                /*  Deprecated. Use gc_min_interval_ms */
2682
2683                .procname       = "gc_min_interval",
2684                .data           = &ip_rt_gc_min_interval,
2685                .maxlen         = sizeof(int),
2686                .mode           = 0644,
2687                .proc_handler   = proc_dointvec_jiffies,
2688        },
2689        {
2690                .procname       = "gc_min_interval_ms",
2691                .data           = &ip_rt_gc_min_interval,
2692                .maxlen         = sizeof(int),
2693                .mode           = 0644,
2694                .proc_handler   = proc_dointvec_ms_jiffies,
2695        },
2696        {
2697                .procname       = "gc_timeout",
2698                .data           = &ip_rt_gc_timeout,
2699                .maxlen         = sizeof(int),
2700                .mode           = 0644,
2701                .proc_handler   = proc_dointvec_jiffies,
2702        },
2703        {
2704                .procname       = "gc_interval",
2705                .data           = &ip_rt_gc_interval,
2706                .maxlen         = sizeof(int),
2707                .mode           = 0644,
2708                .proc_handler   = proc_dointvec_jiffies,
2709        },
2710        {
2711                .procname       = "redirect_load",
2712                .data           = &ip_rt_redirect_load,
2713                .maxlen         = sizeof(int),
2714                .mode           = 0644,
2715                .proc_handler   = proc_dointvec,
2716        },
2717        {
2718                .procname       = "redirect_number",
2719                .data           = &ip_rt_redirect_number,
2720                .maxlen         = sizeof(int),
2721                .mode           = 0644,
2722                .proc_handler   = proc_dointvec,
2723        },
2724        {
2725                .procname       = "redirect_silence",
2726                .data           = &ip_rt_redirect_silence,
2727                .maxlen         = sizeof(int),
2728                .mode           = 0644,
2729                .proc_handler   = proc_dointvec,
2730        },
2731        {
2732                .procname       = "error_cost",
2733                .data           = &ip_rt_error_cost,
2734                .maxlen         = sizeof(int),
2735                .mode           = 0644,
2736                .proc_handler   = proc_dointvec,
2737        },
2738        {
2739                .procname       = "error_burst",
2740                .data           = &ip_rt_error_burst,
2741                .maxlen         = sizeof(int),
2742                .mode           = 0644,
2743                .proc_handler   = proc_dointvec,
2744        },
2745        {
2746                .procname       = "gc_elasticity",
2747                .data           = &ip_rt_gc_elasticity,
2748                .maxlen         = sizeof(int),
2749                .mode           = 0644,
2750                .proc_handler   = proc_dointvec,
2751        },
2752        {
2753                .procname       = "mtu_expires",
2754                .data           = &ip_rt_mtu_expires,
2755                .maxlen         = sizeof(int),
2756                .mode           = 0644,
2757                .proc_handler   = proc_dointvec_jiffies,
2758        },
2759        {
2760                .procname       = "min_pmtu",
2761                .data           = &ip_rt_min_pmtu,
2762                .maxlen         = sizeof(int),
2763                .mode           = 0644,
2764                .proc_handler   = proc_dointvec,
2765        },
2766        {
2767                .procname       = "min_adv_mss",
2768                .data           = &ip_rt_min_advmss,
2769                .maxlen         = sizeof(int),
2770                .mode           = 0644,
2771                .proc_handler   = proc_dointvec,
2772        },
2773        { }
2774};
2775
2776static struct ctl_table ipv4_route_flush_table[] = {
2777        {
2778                .procname       = "flush",
2779                .maxlen         = sizeof(int),
2780                .mode           = 0200,
2781                .proc_handler   = ipv4_sysctl_rtcache_flush,
2782        },
2783        { },
2784};
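/* Editor's note: both tables above are registered under "net/ipv4/route"
 * (see sysctl_route_net_init() below and ip_static_sysctl_init() at the end
 * of this file), so the knobs appear as /proc/sys/net/ipv4/route/<procname>.
 * Writing to the per-netns "flush" entry invokes ipv4_sysctl_rtcache_flush(),
 * e.g.:
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 */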
2785
2786static __net_init int sysctl_route_net_init(struct net *net)
2787{
2788        struct ctl_table *tbl;
2789
2790        tbl = ipv4_route_flush_table;
2791        if (!net_eq(net, &init_net)) {
2792                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2793                if (!tbl)
2794                        goto err_dup;
2795
2796                /* Don't export sysctls to unprivileged users */
2797                if (net->user_ns != &init_user_ns)
2798                        tbl[0].procname = NULL;
2799        }
2800        tbl[0].extra1 = net;
2801
2802        net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2803        if (!net->ipv4.route_hdr)
2804                goto err_reg;
2805        return 0;
2806
2807err_reg:
2808        if (tbl != ipv4_route_flush_table)
2809                kfree(tbl);
2810err_dup:
2811        return -ENOMEM;
2812}
2813
2814static __net_exit void sysctl_route_net_exit(struct net *net)
2815{
2816        struct ctl_table *tbl;
2817
2818        tbl = net->ipv4.route_hdr->ctl_table_arg;
2819        unregister_net_sysctl_table(net->ipv4.route_hdr);
2820        BUG_ON(tbl == ipv4_route_flush_table);
2821        kfree(tbl);
2822}
2823
2824static __net_initdata struct pernet_operations sysctl_route_ops = {
2825        .init = sysctl_route_net_init,
2826        .exit = sysctl_route_net_exit,
2827};
2828#endif
2829
2830static __net_init int rt_genid_init(struct net *net)
2831{
2832        atomic_set(&net->ipv4.rt_genid, 0);
2833        atomic_set(&net->fnhe_genid, 0);
2834        get_random_bytes(&net->ipv4.dev_addr_genid,
2835                         sizeof(net->ipv4.dev_addr_genid));
2836        return 0;
2837}
2838
2839static __net_initdata struct pernet_operations rt_genid_ops = {
2840        .init = rt_genid_init,
2841};
2842
2843static int __net_init ipv4_inetpeer_init(struct net *net)
2844{
2845        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2846
2847        if (!bp)
2848                return -ENOMEM;
2849        inet_peer_base_init(bp);
2850        net->ipv4.peers = bp;
2851        return 0;
2852}
2853
2854static void __net_exit ipv4_inetpeer_exit(struct net *net)
2855{
2856        struct inet_peer_base *bp = net->ipv4.peers;
2857
2858        net->ipv4.peers = NULL;
2859        inetpeer_invalidate_tree(bp);
2860        kfree(bp);
2861}
2862
2863static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2864        .init   =       ipv4_inetpeer_init,
2865        .exit   =       ipv4_inetpeer_exit,
2866};
2867
2868#ifdef CONFIG_IP_ROUTE_CLASSID
2869struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2870#endif /* CONFIG_IP_ROUTE_CLASSID */
2871
2872int __init ip_rt_init(void)
2873{
2874        int rc = 0;
2875        int cpu;
2876
2877        ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2878        if (!ip_idents)
2879                panic("IP: failed to allocate ip_idents\n");
2880
2881        prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2882
2883        ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
2884        if (!ip_tstamps)
2885                panic("IP: failed to allocate ip_tstamps\n");
2886
2887        for_each_possible_cpu(cpu) {
2888                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2889
2890                INIT_LIST_HEAD(&ul->head);
2891                spin_lock_init(&ul->lock);
2892        }
2893#ifdef CONFIG_IP_ROUTE_CLASSID
2894        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2895        if (!ip_rt_acct)
2896                panic("IP: failed to allocate ip_rt_acct\n");
2897#endif
2898
2899        ipv4_dst_ops.kmem_cachep =
2900                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2901                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2902
2903        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2904
2905        if (dst_entries_init(&ipv4_dst_ops) < 0)
2906                panic("IP: failed to allocate ipv4_dst_ops counter\n");
2907
2908        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2909                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2910
2911        ipv4_dst_ops.gc_thresh = ~0;
2912        ip_rt_max_size = INT_MAX;
2913
2914        devinet_init();
2915        ip_fib_init();
2916
2917        if (ip_rt_proc_init())
2918                pr_err("Unable to create route proc files\n");
2919#ifdef CONFIG_XFRM
2920        xfrm_init();
2921        xfrm4_init();
2922#endif
2923        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2924
2925#ifdef CONFIG_SYSCTL
2926        register_pernet_subsys(&sysctl_route_ops);
2927#endif
2928        register_pernet_subsys(&rt_genid_ops);
2929        register_pernet_subsys(&ipv4_inetpeer_ops);
2930        return rc;
2931}
2932
2933#ifdef CONFIG_SYSCTL
2934/*
2935 * We really need to sanitize the damn ipv4 init order, then all
2936 * this nonsense will go away.
2937 */
2938void __init ip_static_sysctl_init(void)
2939{
2940        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2941}
2942#endif
2943