linux/net/ipv4/route.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              ROUTE - implementation of the IP router.
   7 *
   8 * Authors:     Ross Biro
   9 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13 *
  14 * Fixes:
  15 *              Alan Cox        :       Verify area fixes.
  16 *              Alan Cox        :       cli() protects routing changes
  17 *              Rui Oliveira    :       ICMP routing table updates
  18 *              (rco@di.uminho.pt)      Routing table insertion and update
  19 *              Linus Torvalds  :       Rewrote bits to be sensible
  20 *              Alan Cox        :       Added BSD route gw semantics
  21 *              Alan Cox        :       Super /proc >4K
  22 *              Alan Cox        :       MTU in route table
  23 *              Alan Cox        :       MSS actually. Also added the window
  24 *                                      clamper.
  25 *              Sam Lantinga    :       Fixed route matching in rt_del()
  26 *              Alan Cox        :       Routing cache support.
  27 *              Alan Cox        :       Removed compatibility cruft.
  28 *              Alan Cox        :       RTF_REJECT support.
  29 *              Alan Cox        :       TCP irtt support.
  30 *              Jonathan Naylor :       Added Metric support.
  31 *      Miquel van Smoorenburg  :       BSD API fixes.
  32 *      Miquel van Smoorenburg  :       Metrics.
  33 *              Alan Cox        :       Use __u32 properly
  34 *              Alan Cox        :       Aligned routing errors more closely with BSD
  35 *                                      our system is still very different.
  36 *              Alan Cox        :       Faster /proc handling
  37 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38 *                                      routing caches and better behaviour.
  39 *
  40 *              Olaf Erb        :       irtt wasn't being copied right.
  41 *              Bjorn Ekwall    :       Kerneld route support.
  42 *              Alan Cox        :       Multicast fixed (I hope)
  43 *              Pavel Krauz     :       Limited broadcast fixed
  44 *              Mike McLagan    :       Routing by source
  45 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46 *                                      route.c and rewritten from scratch.
  47 *              Andi Kleen      :       Load-limit warning messages.
  48 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52 *              Marc Boucher    :       routing by fwmark
  53 *      Robert Olsson           :       Added rt_cache statistics
  54 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57 *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58 *
  59 *              This program is free software; you can redistribute it and/or
  60 *              modify it under the terms of the GNU General Public License
  61 *              as published by the Free Software Foundation; either version
  62 *              2 of the License, or (at your option) any later version.
  63 */
  64
  65#define pr_fmt(fmt) "IPv4: " fmt
  66
  67#include <linux/module.h>
  68#include <asm/uaccess.h>
  69#include <linux/bitops.h>
  70#include <linux/types.h>
  71#include <linux/kernel.h>
  72#include <linux/mm.h>
  73#include <linux/string.h>
  74#include <linux/socket.h>
  75#include <linux/sockios.h>
  76#include <linux/errno.h>
  77#include <linux/in.h>
  78#include <linux/inet.h>
  79#include <linux/netdevice.h>
  80#include <linux/proc_fs.h>
  81#include <linux/init.h>
  82#include <linux/skbuff.h>
  83#include <linux/inetdevice.h>
  84#include <linux/igmp.h>
  85#include <linux/pkt_sched.h>
  86#include <linux/mroute.h>
  87#include <linux/netfilter_ipv4.h>
  88#include <linux/random.h>
  89#include <linux/rcupdate.h>
  90#include <linux/times.h>
  91#include <linux/slab.h>
  92#include <linux/jhash.h>
  93#include <net/dst.h>
  94#include <net/net_namespace.h>
  95#include <net/protocol.h>
  96#include <net/ip.h>
  97#include <net/route.h>
  98#include <net/inetpeer.h>
  99#include <net/sock.h>
 100#include <net/ip_fib.h>
 101#include <net/arp.h>
 102#include <net/tcp.h>
 103#include <net/icmp.h>
 104#include <net/xfrm.h>
 105#include <net/netevent.h>
 106#include <net/rtnetlink.h>
 107#ifdef CONFIG_SYSCTL
 108#include <linux/sysctl.h>
 109#include <linux/kmemleak.h>
 110#endif
 111#include <net/secure_seq.h>
 112
 /* Keep only the IPTOS_RT_MASK and RTO_ONLINK bits of a flow's flowi4_tos. */
#define RT_FL_TOS(fl4p) \
	((fl4p)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* 300 * HZ jiffies, i.e. 300 seconds. */
#define RT_GC_TIMEOUT (300 * HZ)
 117
 118static int ip_rt_max_size;
 119static int ip_rt_redirect_number __read_mostly  = 9;
 120static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 121static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 122static int ip_rt_error_cost __read_mostly       = HZ;
 123static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 124static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 125static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 126static int ip_rt_min_advmss __read_mostly       = 256;
 127
 128/*
 129 *      Interface to generic destination cache.
 130 */
 131
 132static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 133static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 134static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 135static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 136static void              ipv4_link_failure(struct sk_buff *skb);
 137static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 138                                           struct sk_buff *skb, u32 mtu);
 139static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 140                                        struct sk_buff *skb);
 141static void             ipv4_dst_destroy(struct dst_entry *dst);
 142
 143static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 144{
 145        WARN_ON(1);
 146        return NULL;
 147}
 148
 149static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 150                                           struct sk_buff *skb,
 151                                           const void *daddr);
 152
 153static struct dst_ops ipv4_dst_ops = {
 154        .family =               AF_INET,
 155        .check =                ipv4_dst_check,
 156        .default_advmss =       ipv4_default_advmss,
 157        .mtu =                  ipv4_mtu,
 158        .cow_metrics =          ipv4_cow_metrics,
 159        .destroy =              ipv4_dst_destroy,
 160        .negative_advice =      ipv4_negative_advice,
 161        .link_failure =         ipv4_link_failure,
 162        .update_pmtu =          ip_rt_update_pmtu,
 163        .redirect =             ip_do_redirect,
 164        .local_out =            __ip_local_out,
 165        .neigh_lookup =         ipv4_neigh_lookup,
 166};
 167
 168#define ECN_OR_COST(class)      TC_PRIO_##class
 169
 170const __u8 ip_tos2prio[16] = {
 171        TC_PRIO_BESTEFFORT,
 172        ECN_OR_COST(BESTEFFORT),
 173        TC_PRIO_BESTEFFORT,
 174        ECN_OR_COST(BESTEFFORT),
 175        TC_PRIO_BULK,
 176        ECN_OR_COST(BULK),
 177        TC_PRIO_BULK,
 178        ECN_OR_COST(BULK),
 179        TC_PRIO_INTERACTIVE,
 180        ECN_OR_COST(INTERACTIVE),
 181        TC_PRIO_INTERACTIVE,
 182        ECN_OR_COST(INTERACTIVE),
 183        TC_PRIO_INTERACTIVE_BULK,
 184        ECN_OR_COST(INTERACTIVE_BULK),
 185        TC_PRIO_INTERACTIVE_BULK,
 186        ECN_OR_COST(INTERACTIVE_BULK)
 187};
 188EXPORT_SYMBOL(ip_tos2prio);
 189
 190static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 191#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 192
 193#ifdef CONFIG_PROC_FS
 194static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 195{
 196        if (*pos)
 197                return NULL;
 198        return SEQ_START_TOKEN;
 199}
 200
 201static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 202{
 203        ++*pos;
 204        return NULL;
 205}
 206
 /* seq_file ->stop for the rt_cache file: ->start takes nothing to release. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
 210
 211static int rt_cache_seq_show(struct seq_file *seq, void *v)
 212{
 213        if (v == SEQ_START_TOKEN)
 214                seq_printf(seq, "%-127s\n",
 215                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 216                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 217                           "HHUptod\tSpecDst");
 218        return 0;
 219}
 220
 221static const struct seq_operations rt_cache_seq_ops = {
 222        .start  = rt_cache_seq_start,
 223        .next   = rt_cache_seq_next,
 224        .stop   = rt_cache_seq_stop,
 225        .show   = rt_cache_seq_show,
 226};
 227
 228static int rt_cache_seq_open(struct inode *inode, struct file *file)
 229{
 230        return seq_open(file, &rt_cache_seq_ops);
 231}
 232
 233static const struct file_operations rt_cache_seq_fops = {
 234        .owner   = THIS_MODULE,
 235        .open    = rt_cache_seq_open,
 236        .read    = seq_read,
 237        .llseek  = seq_lseek,
 238        .release = seq_release,
 239};
 240
 241
 242static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 243{
 244        int cpu;
 245
 246        if (*pos == 0)
 247                return SEQ_START_TOKEN;
 248
 249        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 250                if (!cpu_possible(cpu))
 251                        continue;
 252                *pos = cpu+1;
 253                return &per_cpu(rt_cache_stat, cpu);
 254        }
 255        return NULL;
 256}
 257
 258static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 259{
 260        int cpu;
 261
 262        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 263                if (!cpu_possible(cpu))
 264                        continue;
 265                *pos = cpu+1;
 266                return &per_cpu(rt_cache_stat, cpu);
 267        }
 268        return NULL;
 269
 270}
 271
 /* seq_file ->stop for the per-CPU statistics file: nothing to undo. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
 276
 277static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 278{
 279        struct rt_cache_stat *st = v;
 280
 281        if (v == SEQ_START_TOKEN) {
 282                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 283                return 0;
 284        }
 285
 286        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 287                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 288                   dst_entries_get_slow(&ipv4_dst_ops),
 289                   0, /* st->in_hit */
 290                   st->in_slow_tot,
 291                   st->in_slow_mc,
 292                   st->in_no_route,
 293                   st->in_brd,
 294                   st->in_martian_dst,
 295                   st->in_martian_src,
 296
 297                   0, /* st->out_hit */
 298                   st->out_slow_tot,
 299                   st->out_slow_mc,
 300
 301                   0, /* st->gc_total */
 302                   0, /* st->gc_ignored */
 303                   0, /* st->gc_goal_miss */
 304                   0, /* st->gc_dst_overflow */
 305                   0, /* st->in_hlist_search */
 306                   0  /* st->out_hlist_search */
 307                );
 308        return 0;
 309}
 310
 311static const struct seq_operations rt_cpu_seq_ops = {
 312        .start  = rt_cpu_seq_start,
 313        .next   = rt_cpu_seq_next,
 314        .stop   = rt_cpu_seq_stop,
 315        .show   = rt_cpu_seq_show,
 316};
 317
 318
 319static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 320{
 321        return seq_open(file, &rt_cpu_seq_ops);
 322}
 323
 324static const struct file_operations rt_cpu_seq_fops = {
 325        .owner   = THIS_MODULE,
 326        .open    = rt_cpu_seq_open,
 327        .read    = seq_read,
 328        .llseek  = seq_lseek,
 329        .release = seq_release,
 330};
 331
 332#ifdef CONFIG_IP_ROUTE_CLASSID
 333static int rt_acct_proc_show(struct seq_file *m, void *v)
 334{
 335        struct ip_rt_acct *dst, *src;
 336        unsigned int i, j;
 337
 338        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 339        if (!dst)
 340                return -ENOMEM;
 341
 342        for_each_possible_cpu(i) {
 343                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 344                for (j = 0; j < 256; j++) {
 345                        dst[j].o_bytes   += src[j].o_bytes;
 346                        dst[j].o_packets += src[j].o_packets;
 347                        dst[j].i_bytes   += src[j].i_bytes;
 348                        dst[j].i_packets += src[j].i_packets;
 349                }
 350        }
 351
 352        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 353        kfree(dst);
 354        return 0;
 355}
 356
 357static int rt_acct_proc_open(struct inode *inode, struct file *file)
 358{
 359        return single_open(file, rt_acct_proc_show, NULL);
 360}
 361
 362static const struct file_operations rt_acct_proc_fops = {
 363        .owner          = THIS_MODULE,
 364        .open           = rt_acct_proc_open,
 365        .read           = seq_read,
 366        .llseek         = seq_lseek,
 367        .release        = single_release,
 368};
 369#endif
 370
 371static int __net_init ip_rt_do_proc_init(struct net *net)
 372{
 373        struct proc_dir_entry *pde;
 374
 375        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
 376                          &rt_cache_seq_fops);
 377        if (!pde)
 378                goto err1;
 379
 380        pde = proc_create("rt_cache", S_IRUGO,
 381                          net->proc_net_stat, &rt_cpu_seq_fops);
 382        if (!pde)
 383                goto err2;
 384
 385#ifdef CONFIG_IP_ROUTE_CLASSID
 386        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 387        if (!pde)
 388                goto err3;
 389#endif
 390        return 0;
 391
 392#ifdef CONFIG_IP_ROUTE_CLASSID
 393err3:
 394        remove_proc_entry("rt_cache", net->proc_net_stat);
 395#endif
 396err2:
 397        remove_proc_entry("rt_cache", net->proc_net);
 398err1:
 399        return -ENOMEM;
 400}
 401
 402static void __net_exit ip_rt_do_proc_exit(struct net *net)
 403{
 404        remove_proc_entry("rt_cache", net->proc_net_stat);
 405        remove_proc_entry("rt_cache", net->proc_net);
 406#ifdef CONFIG_IP_ROUTE_CLASSID
 407        remove_proc_entry("rt_acct", net->proc_net);
 408#endif
 409}
 410
 411static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 412        .init = ip_rt_do_proc_init,
 413        .exit = ip_rt_do_proc_exit,
 414};
 415
 416static int __init ip_rt_proc_init(void)
 417{
 418        return register_pernet_subsys(&ip_rt_proc_ops);
 419}
 420
 421#else
 /* Without CONFIG_PROC_FS there is nothing to register: report success. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
 426#endif /* CONFIG_PROC_FS */
 427
 428static inline bool rt_is_expired(const struct rtable *rth)
 429{
 430        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 431}
 432
 /*
  * Flush the cached routes of @net by bumping its IPv4 generation id;
  * rt_is_expired() then reports routes carrying the old id as expired.
  */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
 437
 438static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 439                                           struct sk_buff *skb,
 440                                           const void *daddr)
 441{
 442        struct net_device *dev = dst->dev;
 443        const __be32 *pkey = daddr;
 444        const struct rtable *rt;
 445        struct neighbour *n;
 446
 447        rt = (const struct rtable *) dst;
 448        if (rt->rt_gateway)
 449                pkey = (const __be32 *) &rt->rt_gateway;
 450        else if (skb)
 451                pkey = &ip_hdr(skb)->daddr;
 452
 453        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 454        if (n)
 455                return n;
 456        return neigh_create(&arp_tbl, pkey, dev);
 457}
 458
 459#define IP_IDENTS_SZ 2048u
 460struct ip_ident_bucket {
 461        atomic_t        id;
 462        u32             stamp32;
 463};
 464
 465static struct ip_ident_bucket *ip_idents __read_mostly;
 466
 467/* In order to protect privacy, we add a perturbation to identifiers
 468 * if one generator is seldom used. This makes hard for an attacker
 469 * to infer how many packets were sent between two points in time.
 470 */
 471u32 ip_idents_reserve(u32 hash, int segs)
 472{
 473        struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
 474        u32 old = ACCESS_ONCE(bucket->stamp32);
 475        u32 now = (u32)jiffies;
 476        u32 delta = 0;
 477
 478        if (old != now && cmpxchg(&bucket->stamp32, old, now) == old)
 479                delta = prandom_u32_max(now - old);
 480
 481        return atomic_add_return(segs + delta, &bucket->id) - segs;
 482}
 483EXPORT_SYMBOL(ip_idents_reserve);
 484
 485void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 486{
 487        static u32 ip_idents_hashrnd __read_mostly;
 488        u32 hash, id;
 489
 490        net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
 491
 492        hash = jhash_3words((__force u32)iph->daddr,
 493                            (__force u32)iph->saddr,
 494                            iph->protocol ^ net_hash_mix(net),
 495                            ip_idents_hashrnd);
 496        id = ip_idents_reserve(hash, segs);
 497        iph->id = htons(id);
 498}
 499EXPORT_SYMBOL(__ip_select_ident);
 500
 501static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 502                             const struct iphdr *iph,
 503                             int oif, u8 tos,
 504                             u8 prot, u32 mark, int flow_flags)
 505{
 506        if (sk) {
 507                const struct inet_sock *inet = inet_sk(sk);
 508
 509                oif = sk->sk_bound_dev_if;
 510                mark = sk->sk_mark;
 511                tos = RT_CONN_FLAGS(sk);
 512                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 513        }
 514        flowi4_init_output(fl4, oif, mark, tos,
 515                           RT_SCOPE_UNIVERSE, prot,
 516                           flow_flags,
 517                           iph->daddr, iph->saddr, 0, 0);
 518}
 519
 520static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 521                               const struct sock *sk)
 522{
 523        const struct iphdr *iph = ip_hdr(skb);
 524        int oif = skb->dev->ifindex;
 525        u8 tos = RT_TOS(iph->tos);
 526        u8 prot = iph->protocol;
 527        u32 mark = skb->mark;
 528
 529        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
 530}
 531
 532static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 533{
 534        const struct inet_sock *inet = inet_sk(sk);
 535        const struct ip_options_rcu *inet_opt;
 536        __be32 daddr = inet->inet_daddr;
 537
 538        rcu_read_lock();
 539        inet_opt = rcu_dereference(inet->inet_opt);
 540        if (inet_opt && inet_opt->opt.srr)
 541                daddr = inet_opt->opt.faddr;
 542        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 543                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 544                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 545                           inet_sk_flowi_flags(sk),
 546                           daddr, inet->inet_saddr, 0, 0);
 547        rcu_read_unlock();
 548}
 549
 /*
  * Build a flow key from the packet when one is available, otherwise
  * from the socket alone.
  */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}
	build_skb_flow_key(fl4, skb, sk);
}
 558
 559static inline void rt_free(struct rtable *rt)
 560{
 561        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
 562}
 563
 /* Held, with BHs disabled, for the whole of update_or_create_fnhe(). */
static DEFINE_SPINLOCK(fnhe_lock);
 565
 566static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 567{
 568        struct rtable *rt;
 569
 570        rt = rcu_dereference(fnhe->fnhe_rth_input);
 571        if (rt) {
 572                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 573                rt_free(rt);
 574        }
 575        rt = rcu_dereference(fnhe->fnhe_rth_output);
 576        if (rt) {
 577                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 578                rt_free(rt);
 579        }
 580}
 581
 582static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 583{
 584        struct fib_nh_exception *fnhe, *oldest;
 585
 586        oldest = rcu_dereference(hash->chain);
 587        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 588             fnhe = rcu_dereference(fnhe->fnhe_next)) {
 589                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 590                        oldest = fnhe;
 591        }
 592        fnhe_flush_routes(oldest);
 593        return oldest;
 594}
 595
 596static inline u32 fnhe_hashfun(__be32 daddr)
 597{
 598        static u32 fnhe_hashrnd __read_mostly;
 599        u32 hval;
 600
 601        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 602        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 603        return hash_32(hval, FNHE_HASH_SHIFT);
 604}
 605
 606static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 607{
 608        rt->rt_pmtu = fnhe->fnhe_pmtu;
 609        rt->dst.expires = fnhe->fnhe_expires;
 610
 611        if (fnhe->fnhe_gw) {
 612                rt->rt_flags |= RTCF_REDIRECTED;
 613                rt->rt_gateway = fnhe->fnhe_gw;
 614                rt->rt_uses_gateway = 1;
 615        }
 616}
 617
 618static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 619                                  u32 pmtu, unsigned long expires)
 620{
 621        struct fnhe_hash_bucket *hash;
 622        struct fib_nh_exception *fnhe;
 623        struct rtable *rt;
 624        unsigned int i;
 625        int depth;
 626        u32 hval = fnhe_hashfun(daddr);
 627
 628        spin_lock_bh(&fnhe_lock);
 629
 630        hash = rcu_dereference(nh->nh_exceptions);
 631        if (!hash) {
 632                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 633                if (!hash)
 634                        goto out_unlock;
 635                rcu_assign_pointer(nh->nh_exceptions, hash);
 636        }
 637
 638        hash += hval;
 639
 640        depth = 0;
 641        for (fnhe = rcu_dereference(hash->chain); fnhe;
 642             fnhe = rcu_dereference(fnhe->fnhe_next)) {
 643                if (fnhe->fnhe_daddr == daddr)
 644                        break;
 645                depth++;
 646        }
 647
 648        if (fnhe) {
 649                if (gw)
 650                        fnhe->fnhe_gw = gw;
 651                if (pmtu) {
 652                        fnhe->fnhe_pmtu = pmtu;
 653                        fnhe->fnhe_expires = max(1UL, expires);
 654                }
 655                /* Update all cached dsts too */
 656                rt = rcu_dereference(fnhe->fnhe_rth_input);
 657                if (rt)
 658                        fill_route_from_fnhe(rt, fnhe);
 659                rt = rcu_dereference(fnhe->fnhe_rth_output);
 660                if (rt)
 661                        fill_route_from_fnhe(rt, fnhe);
 662        } else {
 663                if (depth > FNHE_RECLAIM_DEPTH)
 664                        fnhe = fnhe_oldest(hash);
 665                else {
 666                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 667                        if (!fnhe)
 668                                goto out_unlock;
 669
 670                        fnhe->fnhe_next = hash->chain;
 671                        rcu_assign_pointer(hash->chain, fnhe);
 672                }
 673                fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
 674                fnhe->fnhe_daddr = daddr;
 675                fnhe->fnhe_gw = gw;
 676                fnhe->fnhe_pmtu = pmtu;
 677                fnhe->fnhe_expires = expires;
 678
 679                /* Exception created; mark the cached routes for the nexthop
 680                 * stale, so anyone caching it rechecks if this exception
 681                 * applies to them.
 682                 */
 683                rt = rcu_dereference(nh->nh_rth_input);
 684                if (rt)
 685                        rt->dst.obsolete = DST_OBSOLETE_KILL;
 686
 687                for_each_possible_cpu(i) {
 688                        struct rtable __rcu **prt;
 689                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 690                        rt = rcu_dereference(*prt);
 691                        if (rt)
 692                                rt->dst.obsolete = DST_OBSOLETE_KILL;
 693                }
 694        }
 695
 696        fnhe->fnhe_stamp = jiffies;
 697
 698out_unlock:
 699        spin_unlock_bh(&fnhe_lock);
 700}
 701
 702static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 703                             bool kill_route)
 704{
 705        __be32 new_gw = icmp_hdr(skb)->un.gateway;
 706        __be32 old_gw = ip_hdr(skb)->saddr;
 707        struct net_device *dev = skb->dev;
 708        struct in_device *in_dev;
 709        struct fib_result res;
 710        struct neighbour *n;
 711        struct net *net;
 712
 713        switch (icmp_hdr(skb)->code & 7) {
 714        case ICMP_REDIR_NET:
 715        case ICMP_REDIR_NETTOS:
 716        case ICMP_REDIR_HOST:
 717        case ICMP_REDIR_HOSTTOS:
 718                break;
 719
 720        default:
 721                return;
 722        }
 723
 724        if (rt->rt_gateway != old_gw)
 725                return;
 726
 727        in_dev = __in_dev_get_rcu(dev);
 728        if (!in_dev)
 729                return;
 730
 731        net = dev_net(dev);
 732        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 733            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 734            ipv4_is_zeronet(new_gw))
 735                goto reject_redirect;
 736
 737        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 738                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 739                        goto reject_redirect;
 740                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 741                        goto reject_redirect;
 742        } else {
 743                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 744                        goto reject_redirect;
 745        }
 746
 747        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
 748        if (!IS_ERR(n)) {
 749                if (!(n->nud_state & NUD_VALID)) {
 750                        neigh_event_send(n, NULL);
 751                } else {
 752                        if (fib_lookup(net, fl4, &res) == 0) {
 753                                struct fib_nh *nh = &FIB_RES_NH(res);
 754
 755                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
 756                                                      0, 0);
 757                        }
 758                        if (kill_route)
 759                                rt->dst.obsolete = DST_OBSOLETE_KILL;
 760                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 761                }
 762                neigh_release(n);
 763        }
 764        return;
 765
 766reject_redirect:
 767#ifdef CONFIG_IP_ROUTE_VERBOSE
 768        if (IN_DEV_LOG_MARTIANS(in_dev)) {
 769                const struct iphdr *iph = (const struct iphdr *) skb->data;
 770                __be32 daddr = iph->daddr;
 771                __be32 saddr = iph->saddr;
 772
 773                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 774                                     "  Advised path = %pI4 -> %pI4\n",
 775                                     &old_gw, dev->name, &new_gw,
 776                                     &saddr, &daddr);
 777        }
 778#endif
 779        ;
 780}
 781
 782static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 783{
 784        struct rtable *rt;
 785        struct flowi4 fl4;
 786        const struct iphdr *iph = (const struct iphdr *) skb->data;
 787        int oif = skb->dev->ifindex;
 788        u8 tos = RT_TOS(iph->tos);
 789        u8 prot = iph->protocol;
 790        u32 mark = skb->mark;
 791
 792        rt = (struct rtable *) dst;
 793
 794        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
 795        __ip_do_redirect(rt, skb, &fl4, true);
 796}
 797
 798static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 799{
 800        struct rtable *rt = (struct rtable *)dst;
 801        struct dst_entry *ret = dst;
 802
 803        if (rt) {
 804                if (dst->obsolete > 0) {
 805                        ip_rt_put(rt);
 806                        ret = NULL;
 807                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 808                           rt->dst.expires) {
 809                        ip_rt_put(rt);
 810                        ret = NULL;
 811                }
 812        }
 813        return ret;
 814}
 815
 816/*
 817 * Algorithm:
 818 *      1. The first ip_rt_redirect_number redirects are sent
 819 *         with exponential backoff, then we stop sending them at all,
 820 *         assuming that the host ignores our redirects.
 821 *      2. If we did not see packets requiring redirects
 822 *         during ip_rt_redirect_silence, we assume that the host
 823 *         forgot redirected route and start to send redirects again.
 824 *
 825 * This algorithm is much cheaper and more intelligent than dumb load limiting
 826 * in icmp.c.
 827 *
 828 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 829 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 830 */
 831
 832void ip_rt_send_redirect(struct sk_buff *skb)
 833{
 834        struct rtable *rt = skb_rtable(skb);
 835        struct in_device *in_dev;
 836        struct inet_peer *peer;
 837        struct net *net;
 838        int log_martians;
 839
 840        rcu_read_lock();
 841        in_dev = __in_dev_get_rcu(rt->dst.dev);
 842        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 843                rcu_read_unlock();
 844                return;
 845        }
 846        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 847        rcu_read_unlock();
 848
 849        net = dev_net(rt->dst.dev);
 850        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 851        if (!peer) {
 852                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 853                          rt_nexthop(rt, ip_hdr(skb)->daddr));
 854                return;
 855        }
 856
 857        /* No redirected packets during ip_rt_redirect_silence;
 858         * reset the algorithm.
 859         */
 860        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 861                peer->rate_tokens = 0;
 862
 863        /* Too many ignored redirects; do not send anything
 864         * set dst.rate_last to the last seen redirected packet.
 865         */
 866        if (peer->rate_tokens >= ip_rt_redirect_number) {
 867                peer->rate_last = jiffies;
 868                goto out_put_peer;
 869        }
 870
 871        /* Check for load limit; set rate_last to the latest sent
 872         * redirect.
 873         */
 874        if (peer->rate_tokens == 0 ||
 875            time_after(jiffies,
 876                       (peer->rate_last +
 877                        (ip_rt_redirect_load << peer->rate_tokens)))) {
 878                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 879
 880                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 881                peer->rate_last = jiffies;
 882                ++peer->rate_tokens;
 883#ifdef CONFIG_IP_ROUTE_VERBOSE
 884                if (log_martians &&
 885                    peer->rate_tokens == ip_rt_redirect_number)
 886                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 887                                             &ip_hdr(skb)->saddr, inet_iif(skb),
 888                                             &ip_hdr(skb)->daddr, &gw);
 889#endif
 890        }
 891out_put_peer:
 892        inet_putpeer(peer);
 893}
 894
 895static int ip_error(struct sk_buff *skb)
 896{
 897        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 898        struct rtable *rt = skb_rtable(skb);
 899        struct inet_peer *peer;
 900        unsigned long now;
 901        struct net *net;
 902        bool send;
 903        int code;
 904
 905        /* IP on this device is disabled. */
 906        if (!in_dev)
 907                goto out;
 908
 909        net = dev_net(rt->dst.dev);
 910        if (!IN_DEV_FORWARD(in_dev)) {
 911                switch (rt->dst.error) {
 912                case EHOSTUNREACH:
 913                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
 914                        break;
 915
 916                case ENETUNREACH:
 917                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 918                        break;
 919                }
 920                goto out;
 921        }
 922
 923        switch (rt->dst.error) {
 924        case EINVAL:
 925        default:
 926                goto out;
 927        case EHOSTUNREACH:
 928                code = ICMP_HOST_UNREACH;
 929                break;
 930        case ENETUNREACH:
 931                code = ICMP_NET_UNREACH;
 932                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 933                break;
 934        case EACCES:
 935                code = ICMP_PKT_FILTERED;
 936                break;
 937        }
 938
 939        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 940
 941        send = true;
 942        if (peer) {
 943                now = jiffies;
 944                peer->rate_tokens += now - peer->rate_last;
 945                if (peer->rate_tokens > ip_rt_error_burst)
 946                        peer->rate_tokens = ip_rt_error_burst;
 947                peer->rate_last = now;
 948                if (peer->rate_tokens >= ip_rt_error_cost)
 949                        peer->rate_tokens -= ip_rt_error_cost;
 950                else
 951                        send = false;
 952                inet_putpeer(peer);
 953        }
 954        if (send)
 955                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 956
 957out:    kfree_skb(skb);
 958        return 0;
 959}
 960
 961static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 962{
 963        struct dst_entry *dst = &rt->dst;
 964        struct fib_result res;
 965
 966        if (dst_metric_locked(dst, RTAX_MTU))
 967                return;
 968
 969        if (ipv4_mtu(dst) < mtu)
 970                return;
 971
 972        if (mtu < ip_rt_min_pmtu)
 973                mtu = ip_rt_min_pmtu;
 974
 975        if (rt->rt_pmtu == mtu &&
 976            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
 977                return;
 978
 979        rcu_read_lock();
 980        if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
 981                struct fib_nh *nh = &FIB_RES_NH(res);
 982
 983                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
 984                                      jiffies + ip_rt_mtu_expires);
 985        }
 986        rcu_read_unlock();
 987}
 988
 989static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 990                              struct sk_buff *skb, u32 mtu)
 991{
 992        struct rtable *rt = (struct rtable *) dst;
 993        struct flowi4 fl4;
 994
 995        ip_rt_build_flow_key(&fl4, sk, skb);
 996        __ip_rt_update_pmtu(rt, &fl4, mtu);
 997}
 998
 999void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1000                      int oif, u32 mark, u8 protocol, int flow_flags)
1001{
1002        const struct iphdr *iph = (const struct iphdr *) skb->data;
1003        struct flowi4 fl4;
1004        struct rtable *rt;
1005
1006        if (!mark)
1007                mark = IP4_REPLY_MARK(net, skb->mark);
1008
1009        __build_flow_key(&fl4, NULL, iph, oif,
1010                         RT_TOS(iph->tos), protocol, mark, flow_flags);
1011        rt = __ip_route_output_key(net, &fl4);
1012        if (!IS_ERR(rt)) {
1013                __ip_rt_update_pmtu(rt, &fl4, mtu);
1014                ip_rt_put(rt);
1015        }
1016}
1017EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1018
1019static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1020{
1021        const struct iphdr *iph = (const struct iphdr *) skb->data;
1022        struct flowi4 fl4;
1023        struct rtable *rt;
1024
1025        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1026
1027        if (!fl4.flowi4_mark)
1028                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1029
1030        rt = __ip_route_output_key(sock_net(sk), &fl4);
1031        if (!IS_ERR(rt)) {
1032                __ip_rt_update_pmtu(rt, &fl4, mtu);
1033                ip_rt_put(rt);
1034        }
1035}
1036
1037void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1038{
1039        const struct iphdr *iph = (const struct iphdr *) skb->data;
1040        struct flowi4 fl4;
1041        struct rtable *rt;
1042        struct dst_entry *odst = NULL;
1043        bool new = false;
1044
1045        bh_lock_sock(sk);
1046
1047        if (!ip_sk_accept_pmtu(sk))
1048                goto out;
1049
1050        odst = sk_dst_get(sk);
1051
1052        if (sock_owned_by_user(sk) || !odst) {
1053                __ipv4_sk_update_pmtu(skb, sk, mtu);
1054                goto out;
1055        }
1056
1057        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1058
1059        rt = (struct rtable *)odst;
1060        if (odst->obsolete && !odst->ops->check(odst, 0)) {
1061                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1062                if (IS_ERR(rt))
1063                        goto out;
1064
1065                new = true;
1066        }
1067
1068        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1069
1070        if (!dst_check(&rt->dst, 0)) {
1071                if (new)
1072                        dst_release(&rt->dst);
1073
1074                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1075                if (IS_ERR(rt))
1076                        goto out;
1077
1078                new = true;
1079        }
1080
1081        if (new)
1082                sk_dst_set(sk, &rt->dst);
1083
1084out:
1085        bh_unlock_sock(sk);
1086        dst_release(odst);
1087}
1088EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1089
1090void ipv4_redirect(struct sk_buff *skb, struct net *net,
1091                   int oif, u32 mark, u8 protocol, int flow_flags)
1092{
1093        const struct iphdr *iph = (const struct iphdr *) skb->data;
1094        struct flowi4 fl4;
1095        struct rtable *rt;
1096
1097        __build_flow_key(&fl4, NULL, iph, oif,
1098                         RT_TOS(iph->tos), protocol, mark, flow_flags);
1099        rt = __ip_route_output_key(net, &fl4);
1100        if (!IS_ERR(rt)) {
1101                __ip_do_redirect(rt, skb, &fl4, false);
1102                ip_rt_put(rt);
1103        }
1104}
1105EXPORT_SYMBOL_GPL(ipv4_redirect);
1106
1107void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1108{
1109        const struct iphdr *iph = (const struct iphdr *) skb->data;
1110        struct flowi4 fl4;
1111        struct rtable *rt;
1112
1113        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1114        rt = __ip_route_output_key(sock_net(sk), &fl4);
1115        if (!IS_ERR(rt)) {
1116                __ip_do_redirect(rt, skb, &fl4, false);
1117                ip_rt_put(rt);
1118        }
1119}
1120EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1121
1122static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1123{
1124        struct rtable *rt = (struct rtable *) dst;
1125
1126        /* All IPV4 dsts are created with ->obsolete set to the value
1127         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1128         * into this function always.
1129         *
1130         * When a PMTU/redirect information update invalidates a route,
1131         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1132         * DST_OBSOLETE_DEAD by dst_free().
1133         */
1134        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1135                return NULL;
1136        return dst;
1137}
1138
1139static void ipv4_link_failure(struct sk_buff *skb)
1140{
1141        struct rtable *rt;
1142
1143        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1144
1145        rt = skb_rtable(skb);
1146        if (rt)
1147                dst_set_expires(&rt->dst, 0);
1148}
1149
1150static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
1151{
1152        pr_debug("%s: %pI4 -> %pI4, %s\n",
1153                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1154                 skb->dev ? skb->dev->name : "?");
1155        kfree_skb(skb);
1156        WARN_ON(1);
1157        return 0;
1158}
1159
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1168
1169void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1170{
1171        __be32 src;
1172
1173        if (rt_is_output_route(rt))
1174                src = ip_hdr(skb)->saddr;
1175        else {
1176                struct fib_result res;
1177                struct flowi4 fl4;
1178                struct iphdr *iph;
1179
1180                iph = ip_hdr(skb);
1181
1182                memset(&fl4, 0, sizeof(fl4));
1183                fl4.daddr = iph->daddr;
1184                fl4.saddr = iph->saddr;
1185                fl4.flowi4_tos = RT_TOS(iph->tos);
1186                fl4.flowi4_oif = rt->dst.dev->ifindex;
1187                fl4.flowi4_iif = skb->dev->ifindex;
1188                fl4.flowi4_mark = skb->mark;
1189
1190                rcu_read_lock();
1191                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1192                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1193                else
1194                        src = inet_select_addr(rt->dst.dev,
1195                                               rt_nexthop(rt, iph->daddr),
1196                                               RT_SCOPE_UNIVERSE);
1197                rcu_read_unlock();
1198        }
1199        memcpy(addr, &src, 4);
1200}
1201
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid.  The low and high 16-bit halves are
 * handled independently: a half is copied from @tag only when the route's
 * corresponding half is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        static const u32 halves[] = { 0xFFFF, 0xFFFF0000 };
        unsigned int i;

        for (i = 0; i < ARRAY_SIZE(halves); i++) {
                if (!(rt->dst.tclassid & halves[i]))
                        rt->dst.tclassid |= tag & halves[i];
        }
}
#endif
1211
1212static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1213{
1214        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1215
1216        if (advmss == 0) {
1217                advmss = max_t(unsigned int, dst->dev->mtu - 40,
1218                               ip_rt_min_advmss);
1219                if (advmss > 65535 - 40)
1220                        advmss = 65535 - 40;
1221        }
1222        return advmss;
1223}
1224
1225static unsigned int ipv4_mtu(const struct dst_entry *dst)
1226{
1227        const struct rtable *rt = (const struct rtable *) dst;
1228        unsigned int mtu = rt->rt_pmtu;
1229
1230        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1231                mtu = dst_metric_raw(dst, RTAX_MTU);
1232
1233        if (mtu)
1234                return mtu;
1235
1236        mtu = dst->dev->mtu;
1237
1238        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1239                if (rt->rt_uses_gateway && mtu > 576)
1240                        mtu = 576;
1241        }
1242
1243        return min_t(unsigned int, mtu, IP_MAX_MTU);
1244}
1245
1246static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1247{
1248        struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1249        struct fib_nh_exception *fnhe;
1250        u32 hval;
1251
1252        if (!hash)
1253                return NULL;
1254
1255        hval = fnhe_hashfun(daddr);
1256
1257        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1258             fnhe = rcu_dereference(fnhe->fnhe_next)) {
1259                if (fnhe->fnhe_daddr == daddr)
1260                        return fnhe;
1261        }
1262        return NULL;
1263}
1264
1265static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1266                              __be32 daddr)
1267{
1268        bool ret = false;
1269
1270        spin_lock_bh(&fnhe_lock);
1271
1272        if (daddr == fnhe->fnhe_daddr) {
1273                struct rtable __rcu **porig;
1274                struct rtable *orig;
1275                int genid = fnhe_genid(dev_net(rt->dst.dev));
1276
1277                if (rt_is_input_route(rt))
1278                        porig = &fnhe->fnhe_rth_input;
1279                else
1280                        porig = &fnhe->fnhe_rth_output;
1281                orig = rcu_dereference(*porig);
1282
1283                if (fnhe->fnhe_genid != genid) {
1284                        fnhe->fnhe_genid = genid;
1285                        fnhe->fnhe_gw = 0;
1286                        fnhe->fnhe_pmtu = 0;
1287                        fnhe->fnhe_expires = 0;
1288                        fnhe_flush_routes(fnhe);
1289                        orig = NULL;
1290                }
1291                fill_route_from_fnhe(rt, fnhe);
1292                if (!rt->rt_gateway)
1293                        rt->rt_gateway = daddr;
1294
1295                if (!(rt->dst.flags & DST_NOCACHE)) {
1296                        rcu_assign_pointer(*porig, rt);
1297                        if (orig)
1298                                rt_free(orig);
1299                        ret = true;
1300                }
1301
1302                fnhe->fnhe_stamp = jiffies;
1303        }
1304        spin_unlock_bh(&fnhe_lock);
1305
1306        return ret;
1307}
1308
1309static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1310{
1311        struct rtable *orig, *prev, **p;
1312        bool ret = true;
1313
1314        if (rt_is_input_route(rt)) {
1315                p = (struct rtable **)&nh->nh_rth_input;
1316        } else {
1317                p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1318        }
1319        orig = *p;
1320
1321        prev = cmpxchg(p, orig, rt);
1322        if (prev == orig) {
1323                if (orig)
1324                        rt_free(orig);
1325        } else
1326                ret = false;
1327
1328        return ret;
1329}
1330
/*
 * List of routes that are not stored in a nexthop or exception cache
 * (see rt_add_uncached_list()).
 */
struct uncached_list {
        spinlock_t              lock;   /* taken around every access to head */
        struct list_head        head;   /* routes linked via rtable.rt_uncached */
};

/* One uncached-route list per CPU. */
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1337
1338static void rt_add_uncached_list(struct rtable *rt)
1339{
1340        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1341
1342        rt->rt_uncached_list = ul;
1343
1344        spin_lock_bh(&ul->lock);
1345        list_add_tail(&rt->rt_uncached, &ul->head);
1346        spin_unlock_bh(&ul->lock);
1347}
1348
1349static void ipv4_dst_destroy(struct dst_entry *dst)
1350{
1351        struct rtable *rt = (struct rtable *) dst;
1352
1353        if (!list_empty(&rt->rt_uncached)) {
1354                struct uncached_list *ul = rt->rt_uncached_list;
1355
1356                spin_lock_bh(&ul->lock);
1357                list_del(&rt->rt_uncached);
1358                spin_unlock_bh(&ul->lock);
1359        }
1360}
1361
1362void rt_flush_dev(struct net_device *dev)
1363{
1364        struct net *net = dev_net(dev);
1365        struct rtable *rt;
1366        int cpu;
1367
1368        for_each_possible_cpu(cpu) {
1369                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1370
1371                spin_lock_bh(&ul->lock);
1372                list_for_each_entry(rt, &ul->head, rt_uncached) {
1373                        if (rt->dst.dev != dev)
1374                                continue;
1375                        rt->dst.dev = net->loopback_dev;
1376                        dev_hold(rt->dst.dev);
1377                        dev_put(dev);
1378                }
1379                spin_unlock_bh(&ul->lock);
1380        }
1381}
1382
1383static bool rt_cache_valid(const struct rtable *rt)
1384{
1385        return  rt &&
1386                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1387                !rt_is_expired(rt);
1388}
1389
1390static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1391                           const struct fib_result *res,
1392                           struct fib_nh_exception *fnhe,
1393                           struct fib_info *fi, u16 type, u32 itag)
1394{
1395        bool cached = false;
1396
1397        if (fi) {
1398                struct fib_nh *nh = &FIB_RES_NH(*res);
1399
1400                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1401                        rt->rt_gateway = nh->nh_gw;
1402                        rt->rt_uses_gateway = 1;
1403                }
1404                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1405#ifdef CONFIG_IP_ROUTE_CLASSID
1406                rt->dst.tclassid = nh->nh_tclassid;
1407#endif
1408                if (unlikely(fnhe))
1409                        cached = rt_bind_exception(rt, fnhe, daddr);
1410                else if (!(rt->dst.flags & DST_NOCACHE))
1411                        cached = rt_cache_route(nh, rt);
1412                if (unlikely(!cached)) {
1413                        /* Routes we intend to cache in nexthop exception or
1414                         * FIB nexthop have the DST_NOCACHE bit clear.
1415                         * However, if we are unsuccessful at storing this
1416                         * route into the cache we really need to set it.
1417                         */
1418                        rt->dst.flags |= DST_NOCACHE;
1419                        if (!rt->rt_gateway)
1420                                rt->rt_gateway = daddr;
1421                        rt_add_uncached_list(rt);
1422                }
1423        } else
1424                rt_add_uncached_list(rt);
1425
1426#ifdef CONFIG_IP_ROUTE_CLASSID
1427#ifdef CONFIG_IP_MULTIPLE_TABLES
1428        set_class_tag(rt, res->tclassid);
1429#endif
1430        set_class_tag(rt, itag);
1431#endif
1432}
1433
1434static struct rtable *rt_dst_alloc(struct net_device *dev,
1435                                   bool nopolicy, bool noxfrm, bool will_cache)
1436{
1437        return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1438                         (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1439                         (nopolicy ? DST_NOPOLICY : 0) |
1440                         (noxfrm ? DST_NOXFRM : 0));
1441}
1442
1443/* called in rcu_read_lock() section */
1444static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1445                                u8 tos, struct net_device *dev, int our)
1446{
1447        struct rtable *rth;
1448        struct in_device *in_dev = __in_dev_get_rcu(dev);
1449        u32 itag = 0;
1450        int err;
1451
1452        /* Primary sanity checks. */
1453
1454        if (!in_dev)
1455                return -EINVAL;
1456
1457        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1458            skb->protocol != htons(ETH_P_IP))
1459                goto e_inval;
1460
1461        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1462                if (ipv4_is_loopback(saddr))
1463                        goto e_inval;
1464
1465        if (ipv4_is_zeronet(saddr)) {
1466                if (!ipv4_is_local_multicast(daddr))
1467                        goto e_inval;
1468        } else {
1469                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1470                                          in_dev, &itag);
1471                if (err < 0)
1472                        goto e_err;
1473        }
1474        rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1475                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1476        if (!rth)
1477                goto e_nobufs;
1478
1479#ifdef CONFIG_IP_ROUTE_CLASSID
1480        rth->dst.tclassid = itag;
1481#endif
1482        rth->dst.output = ip_rt_bug;
1483
1484        rth->rt_genid   = rt_genid_ipv4(dev_net(dev));
1485        rth->rt_flags   = RTCF_MULTICAST;
1486        rth->rt_type    = RTN_MULTICAST;
1487        rth->rt_is_input= 1;
1488        rth->rt_iif     = 0;
1489        rth->rt_pmtu    = 0;
1490        rth->rt_gateway = 0;
1491        rth->rt_uses_gateway = 0;
1492        INIT_LIST_HEAD(&rth->rt_uncached);
1493        if (our) {
1494                rth->dst.input= ip_local_deliver;
1495                rth->rt_flags |= RTCF_LOCAL;
1496        }
1497
1498#ifdef CONFIG_IP_MROUTE
1499        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1500                rth->dst.input = ip_mr_input;
1501#endif
1502        RT_CACHE_STAT_INC(in_slow_mc);
1503
1504        skb_dst_set(skb, &rth->dst);
1505        return 0;
1506
1507e_nobufs:
1508        return -ENOBUFS;
1509e_inval:
1510        return -EINVAL;
1511e_err:
1512        return err;
1513}
1514
1515
1516static void ip_handle_martian_source(struct net_device *dev,
1517                                     struct in_device *in_dev,
1518                                     struct sk_buff *skb,
1519                                     __be32 daddr,
1520                                     __be32 saddr)
1521{
1522        RT_CACHE_STAT_INC(in_martian_src);
1523#ifdef CONFIG_IP_ROUTE_VERBOSE
1524        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1525                /*
1526                 *      RFC1812 recommendation, if source is martian,
1527                 *      the only hint is MAC header.
1528                 */
1529                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1530                        &daddr, &saddr, dev->name);
1531                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1532                        print_hex_dump(KERN_WARNING, "ll header: ",
1533                                       DUMP_PREFIX_OFFSET, 16, 1,
1534                                       skb_mac_header(skb),
1535                                       dev->hard_header_len, true);
1536                }
1537        }
1538#endif
1539}
1540
1541/* called in rcu_read_lock() section */
1542static int __mkroute_input(struct sk_buff *skb,
1543                           const struct fib_result *res,
1544                           struct in_device *in_dev,
1545                           __be32 daddr, __be32 saddr, u32 tos)
1546{
1547        struct fib_nh_exception *fnhe;
1548        struct rtable *rth;
1549        int err;
1550        struct in_device *out_dev;
1551        unsigned int flags = 0;
1552        bool do_cache;
1553        u32 itag = 0;
1554
1555        /* get a working reference to the output device */
1556        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1557        if (!out_dev) {
1558                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1559                return -EINVAL;
1560        }
1561
1562        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1563                                  in_dev->dev, in_dev, &itag);
1564        if (err < 0) {
1565                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1566                                         saddr);
1567
1568                goto cleanup;
1569        }
1570
1571        do_cache = res->fi && !itag;
1572        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1573            skb->protocol == htons(ETH_P_IP) &&
1574            (IN_DEV_SHARED_MEDIA(out_dev) ||
1575             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1576                IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1577
1578        if (skb->protocol != htons(ETH_P_IP)) {
1579                /* Not IP (i.e. ARP). Do not create route, if it is
1580                 * invalid for proxy arp. DNAT routes are always valid.
1581                 *
1582                 * Proxy arp feature have been extended to allow, ARP
1583                 * replies back to the same interface, to support
1584                 * Private VLAN switch technologies. See arp.c.
1585                 */
1586                if (out_dev == in_dev &&
1587                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1588                        err = -EINVAL;
1589                        goto cleanup;
1590                }
1591        }
1592
1593        fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1594        if (do_cache) {
1595                if (fnhe)
1596                        rth = rcu_dereference(fnhe->fnhe_rth_input);
1597                else
1598                        rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1599
1600                if (rt_cache_valid(rth)) {
1601                        skb_dst_set_noref(skb, &rth->dst);
1602                        goto out;
1603                }
1604        }
1605
1606        rth = rt_dst_alloc(out_dev->dev,
1607                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
1608                           IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1609        if (!rth) {
1610                err = -ENOBUFS;
1611                goto cleanup;
1612        }
1613
1614        rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
1615        rth->rt_flags = flags;
1616        rth->rt_type = res->type;
1617        rth->rt_is_input = 1;
1618        rth->rt_iif     = 0;
1619        rth->rt_pmtu    = 0;
1620        rth->rt_gateway = 0;
1621        rth->rt_uses_gateway = 0;
1622        INIT_LIST_HEAD(&rth->rt_uncached);
1623        RT_CACHE_STAT_INC(in_slow_tot);
1624
1625        rth->dst.input = ip_forward;
1626        rth->dst.output = ip_output;
1627
1628        rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1629        skb_dst_set(skb, &rth->dst);
1630out:
1631        err = 0;
1632 cleanup:
1633        return err;
1634}
1635
1636static int ip_mkroute_input(struct sk_buff *skb,
1637                            struct fib_result *res,
1638                            const struct flowi4 *fl4,
1639                            struct in_device *in_dev,
1640                            __be32 daddr, __be32 saddr, u32 tos)
1641{
1642#ifdef CONFIG_IP_ROUTE_MULTIPATH
1643        if (res->fi && res->fi->fib_nhs > 1)
1644                fib_select_multipath(res);
1645#endif
1646
1647        /* create a routing cache entry */
1648        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1649}
1650
1651/*
1652 *      NOTE. We drop all the packets that has local source
1653 *      addresses, because every properly looped back packet
1654 *      must have correct destination already attached by output routine.
1655 *
1656 *      Such approach solves two big problems:
1657 *      1. Not simplex devices are handled properly.
1658 *      2. IP spoofing attempts are filtered with 100% of guarantee.
1659 *      called with rcu_read_lock()
1660 */
1661
1662static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1663                               u8 tos, struct net_device *dev)
1664{
1665        struct fib_result res;
1666        struct in_device *in_dev = __in_dev_get_rcu(dev);
1667        struct flowi4   fl4;
1668        unsigned int    flags = 0;
1669        u32             itag = 0;
1670        struct rtable   *rth;
1671        int             err = -EINVAL;
1672        struct net    *net = dev_net(dev);
1673        bool do_cache;
1674
1675        /* IP on this device is disabled. */
1676
1677        if (!in_dev)
1678                goto out;
1679
1680        /* Check for the most weird martians, which can be not detected
1681           by fib_lookup.
1682         */
1683
1684        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1685                goto martian_source;
1686
1687        res.fi = NULL;
1688        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1689                goto brd_input;
1690
1691        /* Accept zero addresses only to limited broadcast;
1692         * I even do not know to fix it or not. Waiting for complains :-)
1693         */
1694        if (ipv4_is_zeronet(saddr))
1695                goto martian_source;
1696
1697        if (ipv4_is_zeronet(daddr))
1698                goto martian_destination;
1699
1700        /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1701         * and call it once if daddr or/and saddr are loopback addresses
1702         */
1703        if (ipv4_is_loopback(daddr)) {
1704                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1705                        goto martian_destination;
1706        } else if (ipv4_is_loopback(saddr)) {
1707                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1708                        goto martian_source;
1709        }
1710
1711        /*
1712         *      Now we are ready to route packet.
1713         */
1714        fl4.flowi4_oif = 0;
1715        fl4.flowi4_iif = dev->ifindex;
1716        fl4.flowi4_mark = skb->mark;
1717        fl4.flowi4_tos = tos;
1718        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1719        fl4.daddr = daddr;
1720        fl4.saddr = saddr;
1721        err = fib_lookup(net, &fl4, &res);
1722        if (err != 0) {
1723                if (!IN_DEV_FORWARD(in_dev))
1724                        err = -EHOSTUNREACH;
1725                goto no_route;
1726        }
1727
1728        if (res.type == RTN_BROADCAST)
1729                goto brd_input;
1730
1731        if (res.type == RTN_LOCAL) {
1732                err = fib_validate_source(skb, saddr, daddr, tos,
1733                                          0, dev, in_dev, &itag);
1734                if (err < 0)
1735                        goto martian_source_keep_err;
1736                goto local_input;
1737        }
1738
1739        if (!IN_DEV_FORWARD(in_dev)) {
1740                err = -EHOSTUNREACH;
1741                goto no_route;
1742        }
1743        if (res.type != RTN_UNICAST)
1744                goto martian_destination;
1745
1746        err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1747out:    return err;
1748
1749brd_input:
1750        if (skb->protocol != htons(ETH_P_IP))
1751                goto e_inval;
1752
1753        if (!ipv4_is_zeronet(saddr)) {
1754                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1755                                          in_dev, &itag);
1756                if (err < 0)
1757                        goto martian_source_keep_err;
1758        }
1759        flags |= RTCF_BROADCAST;
1760        res.type = RTN_BROADCAST;
1761        RT_CACHE_STAT_INC(in_brd);
1762
1763local_input:
1764        do_cache = false;
1765        if (res.fi) {
1766                if (!itag) {
1767                        rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1768                        if (rt_cache_valid(rth)) {
1769                                skb_dst_set_noref(skb, &rth->dst);
1770                                err = 0;
1771                                goto out;
1772                        }
1773                        do_cache = true;
1774                }
1775        }
1776
1777        rth = rt_dst_alloc(net->loopback_dev,
1778                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1779        if (!rth)
1780                goto e_nobufs;
1781
1782        rth->dst.input= ip_local_deliver;
1783        rth->dst.output= ip_rt_bug;
1784#ifdef CONFIG_IP_ROUTE_CLASSID
1785        rth->dst.tclassid = itag;
1786#endif
1787
1788        rth->rt_genid = rt_genid_ipv4(net);
1789        rth->rt_flags   = flags|RTCF_LOCAL;
1790        rth->rt_type    = res.type;
1791        rth->rt_is_input = 1;
1792        rth->rt_iif     = 0;
1793        rth->rt_pmtu    = 0;
1794        rth->rt_gateway = 0;
1795        rth->rt_uses_gateway = 0;
1796        INIT_LIST_HEAD(&rth->rt_uncached);
1797        RT_CACHE_STAT_INC(in_slow_tot);
1798        if (res.type == RTN_UNREACHABLE) {
1799                rth->dst.input= ip_error;
1800                rth->dst.error= -err;
1801                rth->rt_flags   &= ~RTCF_LOCAL;
1802        }
1803        if (do_cache) {
1804                if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1805                        rth->dst.flags |= DST_NOCACHE;
1806                        rt_add_uncached_list(rth);
1807                }
1808        }
1809        skb_dst_set(skb, &rth->dst);
1810        err = 0;
1811        goto out;
1812
1813no_route:
1814        RT_CACHE_STAT_INC(in_no_route);
1815        res.type = RTN_UNREACHABLE;
1816        res.fi = NULL;
1817        goto local_input;
1818
1819        /*
1820         *      Do not cache martian addresses: they should be logged (RFC1812)
1821         */
1822martian_destination:
1823        RT_CACHE_STAT_INC(in_martian_dst);
1824#ifdef CONFIG_IP_ROUTE_VERBOSE
1825        if (IN_DEV_LOG_MARTIANS(in_dev))
1826                net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1827                                     &daddr, &saddr, dev->name);
1828#endif
1829
1830e_inval:
1831        err = -EINVAL;
1832        goto out;
1833
1834e_nobufs:
1835        err = -ENOBUFS;
1836        goto out;
1837
1838martian_source:
1839        err = -EINVAL;
1840martian_source_keep_err:
1841        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1842        goto out;
1843}
1844
/*
 * ip_route_input_noref - resolve the input route for a received packet.
 *
 * The whole lookup runs under rcu_read_lock().  A multicast daddr is
 * handled here: ip_route_input_mc() is called only when ip_check_mc_rcu()
 * reports the group as ours, or (CONFIG_IP_MROUTE) when
 * ipv4_is_local_multicast(daddr) is false and IN_DEV_MFORWARD(in_dev) is
 * set.  Any other multicast packet, including one arriving on a device
 * without an in_device, yields -EINVAL.  Non-multicast destinations are
 * passed to ip_route_input_slow().
 *
 * Returns the value produced by the helper that was called, or -EINVAL.
 */
1845int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1846                         u8 tos, struct net_device *dev)
1847{
1848        int res;
1849
1850        rcu_read_lock();
1851
1852        /* Multicast recognition logic is moved from route cache to here.
1853           The problem was that too many Ethernet cards have broken/missing
1854           hardware multicast filters :-( As result the host on multicasting
1855           network acquires a lot of useless route cache entries, sort of
1856           SDR messages from all the world. Now we try to get rid of them.
1857           Really, provided software IP multicast filter is organized
1858           reasonably (at least, hashed), it does not result in a slowdown
1859           comparing with route cache reject entries.
1860           Note, that multicast routers are not affected, because
1861           route cache entry is created eventually.
1862         */
1863        if (ipv4_is_multicast(daddr)) {
1864                struct in_device *in_dev = __in_dev_get_rcu(dev);
1865
1866                if (in_dev) {
1867                        int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1868                                                  ip_hdr(skb)->protocol);
1869                        if (our
1870#ifdef CONFIG_IP_MROUTE
1871                                ||
1872                            (!ipv4_is_local_multicast(daddr) &&
1873                             IN_DEV_MFORWARD(in_dev))
1874#endif
1875                           ) {
                                    /* NOTE: this 'res' shadows the
                                     * function-scope 'res' declared above.
                                     */
1876                                int res = ip_route_input_mc(skb, daddr, saddr,
1877                                                            tos, dev, our);
1878                                rcu_read_unlock();
1879                                return res;
1880                        }
1881                }
                    /* Multicast packet that is neither ours nor forwardable. */
1882                rcu_read_unlock();
1883                return -EINVAL;
1884        }
1885        res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1886        rcu_read_unlock();
1887        return res;
1888}
1889EXPORT_SYMBOL(ip_route_input_noref);
1890
1891/* called with rcu_read_lock() */
/*
 * __mkroute_output - return an output rtable for flow @fl4 via @dev_out.
 *
 * A cached entry (nexthop exception or per-cpu nexthop cache) is reused,
 * with a reference taken through dst_hold(), when rt_cache_valid() accepts
 * it.  Otherwise a new rtable is allocated, initialised and handed to
 * rt_set_nexthop().
 *
 * Returns the rtable, or
 *   ERR_PTR(-EINVAL)  - @dev_out has no in_device, the source is a loopback
 *                       address on a non-loopback device while
 *                       IN_DEV_ROUTE_LOCALNET() is off, or the destination
 *                       is in the zero network;
 *   ERR_PTR(-ENOBUFS) - rt_dst_alloc() failed.
 */
1892static struct rtable *__mkroute_output(const struct fib_result *res,
1893                                       const struct flowi4 *fl4, int orig_oif,
1894                                       struct net_device *dev_out,
1895                                       unsigned int flags)
1896{
1897        struct fib_info *fi = res->fi;
1898        struct fib_nh_exception *fnhe;
1899        struct in_device *in_dev;
1900        u16 type = res->type;
1901        struct rtable *rth;
1902        bool do_cache;
1903
1904        in_dev = __in_dev_get_rcu(dev_out);
1905        if (!in_dev)
1906                return ERR_PTR(-EINVAL);
1907
1908        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1909                if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1910                        return ERR_PTR(-EINVAL);
1911
            /* The destination address class overrides the type from the lookup. */
1912        if (ipv4_is_lbcast(fl4->daddr))
1913                type = RTN_BROADCAST;
1914        else if (ipv4_is_multicast(fl4->daddr))
1915                type = RTN_MULTICAST;
1916        else if (ipv4_is_zeronet(fl4->daddr))
1917                return ERR_PTR(-EINVAL);
1918
1919        if (dev_out->flags & IFF_LOOPBACK)
1920                flags |= RTCF_LOCAL;
1921
1922        do_cache = true;
1923        if (type == RTN_BROADCAST) {
1924                flags |= RTCF_BROADCAST | RTCF_LOCAL;
1925                fi = NULL;
1926        } else if (type == RTN_MULTICAST) {
1927                flags |= RTCF_MULTICAST | RTCF_LOCAL;
                    /* RTCF_LOCAL is kept only if ip_check_mc_rcu() accepts the
                     * group; in that case the route is not cached.
                     */
1928                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1929                                     fl4->flowi4_proto))
1930                        flags &= ~RTCF_LOCAL;
1931                else
1932                        do_cache = false;
1933                /* If multicast route do not exist use
1934                 * default one, but do not gateway in this case.
1935                 * Yes, it is hack.
1936                 */
1937                if (fi && res->prefixlen < 4)
1938                        fi = NULL;
1939        }
1940
1941        fnhe = NULL;
            /* Caching needs a fib_info; without one do_cache becomes false. */
1942        do_cache &= fi != NULL;
1943        if (do_cache) {
1944                struct rtable __rcu **prth;
1945                struct fib_nh *nh = &FIB_RES_NH(*res);
1946
1947                fnhe = find_exception(nh, fl4->daddr);
1948                if (fnhe)
1949                        prth = &fnhe->fnhe_rth_output;
1950                else {
                            /* With FLOWI_FLAG_KNOWN_NH set, the per-cpu cache is
                             * bypassed unless the nexthop has a gateway and
                             * link scope.
                             */
1951                        if (unlikely(fl4->flowi4_flags &
1952                                     FLOWI_FLAG_KNOWN_NH &&
1953                                     !(nh->nh_gw &&
1954                                       nh->nh_scope == RT_SCOPE_LINK))) {
1955                                do_cache = false;
1956                                goto add;
1957                        }
1958                        prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
1959                }
1960                rth = rcu_dereference(*prth);
1961                if (rt_cache_valid(rth)) {
1962                        dst_hold(&rth->dst);
1963                        return rth;
1964                }
1965        }
1966
1967add:
1968        rth = rt_dst_alloc(dev_out,
1969                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
1970                           IN_DEV_CONF_GET(in_dev, NOXFRM),
1971                           do_cache);
1972        if (!rth)
1973                return ERR_PTR(-ENOBUFS);
1974
1975        rth->dst.output = ip_output;
1976
1977        rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
1978        rth->rt_flags   = flags;
1979        rth->rt_type    = type;
1980        rth->rt_is_input = 0;
            /* GNU "?:" with omitted middle operand: the result is orig_oif
             * when it is non-zero and 0 otherwise, i.e. orig_oif itself.
             */
1981        rth->rt_iif     = orig_oif ? : 0;
1982        rth->rt_pmtu    = 0;
1983        rth->rt_gateway = 0;
1984        rth->rt_uses_gateway = 0;
1985        INIT_LIST_HEAD(&rth->rt_uncached);
1986
1987        RT_CACHE_STAT_INC(out_slow_tot);
1988
1989        if (flags & RTCF_LOCAL)
1990                rth->dst.input = ip_local_deliver;
1991        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1992                if (flags & RTCF_LOCAL &&
1993                    !(dev_out->flags & IFF_LOOPBACK)) {
1994                        rth->dst.output = ip_mc_output;
1995                        RT_CACHE_STAT_INC(out_slow_mc);
1996                }
1997#ifdef CONFIG_IP_MROUTE
1998                if (type == RTN_MULTICAST) {
1999                        if (IN_DEV_MFORWARD(in_dev) &&
2000                            !ipv4_is_local_multicast(fl4->daddr)) {
2001                                rth->dst.input = ip_mr_input;
2002                                rth->dst.output = ip_mc_output;
2003                        }
2004                }
2005#endif
2006        }
2007
2008        rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2009
2010        return rth;
2011}
2012
2013/*
2014 * Major route resolver routine.
2015 */
/*
 * __ip_route_output_key - resolve the output route for flow @fl4 in @net.
 *
 * @fl4 is updated in place: flowi4_iif, flowi4_tos and flowi4_scope are
 * always rewritten, and saddr, daddr and flowi4_oif may be filled in
 * while the route is resolved.  The lookup runs under rcu_read_lock();
 * the rtable itself is produced by __mkroute_output().
 *
 * Returns the rtable or an ERR_PTR():
 *   -EINVAL      - saddr is multicast, limited broadcast or zeronet, or
 *                  __ip_dev_find() does not find a device for saddr;
 *   -ENODEV      - flowi4_oif does not name a device in @net;
 *   -ENETUNREACH - the output device is down or has no in_device, or
 *                  fib_lookup() failed and no oif was given;
 *   or whatever __mkroute_output() returns.
 */
2016
2017struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
2018{
2019        struct net_device *dev_out = NULL;
2020        __u8 tos = RT_FL_TOS(fl4);
2021        unsigned int flags = 0;
2022        struct fib_result res;
2023        struct rtable *rth;
2024        int orig_oif;
2025
2026        res.tclassid    = 0;
2027        res.fi          = NULL;
2028        res.table       = NULL;
2029
            /* Remember the caller's oif; fl4->flowi4_oif may be rewritten below. */
2030        orig_oif = fl4->flowi4_oif;
2031
2032        fl4->flowi4_iif = LOOPBACK_IFINDEX;
2033        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2034        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2035                         RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2036
2037        rcu_read_lock();
2038        if (fl4->saddr) {
                    /* Preset the error returned by every "goto out" in this branch. */
2039                rth = ERR_PTR(-EINVAL);
2040                if (ipv4_is_multicast(fl4->saddr) ||
2041                    ipv4_is_lbcast(fl4->saddr) ||
2042                    ipv4_is_zeronet(fl4->saddr))
2043                        goto out;
2044
2045                /* I removed check for oif == dev_out->oif here.
2046                   It was wrong for two reasons:
2047                   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2048                      is assigned to multiple interfaces.
2049                   2. Moreover, we are allowed to send packets with saddr
2050                      of another iface. --ANK
2051                 */
2052
2053                if (fl4->flowi4_oif == 0 &&
2054                    (ipv4_is_multicast(fl4->daddr) ||
2055                     ipv4_is_lbcast(fl4->daddr))) {
2056                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2057                        dev_out = __ip_dev_find(net, fl4->saddr, false);
2058                        if (!dev_out)
2059                                goto out;
2060
2061                        /* Special hack: user can direct multicasts
2062                           and limited broadcast via necessary interface
2063                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2064                           This hack is not just for fun, it allows
2065                           vic,vat and friends to work.
2066                           They bind socket to loopback, set ttl to zero
2067                           and expect that it will work.
2068                           From the viewpoint of routing cache they are broken,
2069                           because we are not allowed to build multicast path
2070                           with loopback source addr (look, routing cache
2071                           cannot know, that ttl is zero, so that packet
2072                           will not leave this host and route is valid).
2073                           Luckily, this hack is good workaround.
2074                         */
2075
2076                        fl4->flowi4_oif = dev_out->ifindex;
2077                        goto make_route;
2078                }
2079
2080                if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2081                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2082                        if (!__ip_dev_find(net, fl4->saddr, false))
2083                                goto out;
2084                }
2085        }
2086
2087
            /* An explicit output interface was requested. */
2088        if (fl4->flowi4_oif) {
2089                dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2090                rth = ERR_PTR(-ENODEV);
2091                if (!dev_out)
2092                        goto out;
2093
2094                /* RACE: Check return value of inet_select_addr instead. */
2095                if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2096                        rth = ERR_PTR(-ENETUNREACH);
2097                        goto out;
2098                }
2099                if (ipv4_is_local_multicast(fl4->daddr) ||
2100                    ipv4_is_lbcast(fl4->daddr)) {
2101                        if (!fl4->saddr)
2102                                fl4->saddr = inet_select_addr(dev_out, 0,
2103                                                              RT_SCOPE_LINK);
2104                        goto make_route;
2105                }
2106                if (!fl4->saddr) {
2107                        if (ipv4_is_multicast(fl4->daddr))
2108                                fl4->saddr = inet_select_addr(dev_out, 0,
2109                                                              fl4->flowi4_scope);
2110                        else if (!fl4->daddr)
2111                                fl4->saddr = inet_select_addr(dev_out, 0,
2112                                                              RT_SCOPE_HOST);
2113                }
2114        }
2115
            /* No destination: use saddr as daddr, or INADDR_LOOPBACK for both
             * when saddr is also unset, and route over the loopback device.
             */
2116        if (!fl4->daddr) {
2117                fl4->daddr = fl4->saddr;
2118                if (!fl4->daddr)
2119                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2120                dev_out = net->loopback_dev;
2121                fl4->flowi4_oif = LOOPBACK_IFINDEX;
2122                res.type = RTN_LOCAL;
2123                flags |= RTCF_LOCAL;
2124                goto make_route;
2125        }
2126
2127        if (fib_lookup(net, fl4, &res)) {
2128                res.fi = NULL;
2129                res.table = NULL;
2130                if (fl4->flowi4_oif) {
2131                        /* Apparently, routing tables are wrong. Assume,
2132                           that the destination is on link.
2133
2134                           WHY? DW.
2135                           Because we are allowed to send to iface
2136                           even if it has NO routes and NO assigned
2137                           addresses. When oif is specified, routing
2138                           tables are looked up with only one purpose:
2139                           to catch if destination is gatewayed, rather than
2140                           direct. Moreover, if MSG_DONTROUTE is set,
2141                           we send packet, ignoring both routing tables
2142                           and ifaddr state. --ANK
2143
2144
2145                           We could make it even if oif is unknown,
2146                           likely IPv6, but we do not.
2147                         */
2148
2149                        if (fl4->saddr == 0)
2150                                fl4->saddr = inet_select_addr(dev_out, 0,
2151                                                              RT_SCOPE_LINK);
2152                        res.type = RTN_UNICAST;
2153                        goto make_route;
2154                }
2155                rth = ERR_PTR(-ENETUNREACH);
2156                goto out;
2157        }
2158
            /* Destination is local: the route goes out via the loopback device. */
2159        if (res.type == RTN_LOCAL) {
2160                if (!fl4->saddr) {
2161                        if (res.fi->fib_prefsrc)
2162                                fl4->saddr = res.fi->fib_prefsrc;
2163                        else
2164                                fl4->saddr = fl4->daddr;
2165                }
2166                dev_out = net->loopback_dev;
2167                fl4->flowi4_oif = dev_out->ifindex;
2168                flags |= RTCF_LOCAL;
2169                goto make_route;
2170        }
2171
            /* Note: with CONFIG_IP_ROUTE_MULTIPATH the "else" below binds to
             * the "if (!res.prefixlen ..." that follows the #endif.
             */
2172#ifdef CONFIG_IP_ROUTE_MULTIPATH
2173        if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2174                fib_select_multipath(&res);
2175        else
2176#endif
2177        if (!res.prefixlen &&
2178            res.table->tb_num_default > 1 &&
2179            res.type == RTN_UNICAST && !fl4->flowi4_oif)
2180                fib_select_default(&res);
2181
2182        if (!fl4->saddr)
2183                fl4->saddr = FIB_RES_PREFSRC(net, res);
2184
2185        dev_out = FIB_RES_DEV(res);
2186        fl4->flowi4_oif = dev_out->ifindex;
2187
2188
2189make_route:
2190        rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2191
2192out:
2193        rcu_read_unlock();
2194        return rth;
2195}
2196EXPORT_SYMBOL_GPL(__ip_route_output_key);
2197
/* .check handler of ipv4_dst_blackhole_ops: always returns NULL. */
2198static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2199{
2200        return NULL;
2201}
2202
/*
 * .mtu handler of ipv4_dst_blackhole_ops: returns the raw RTAX_MTU metric
 * of @dst when it is non-zero, otherwise the MTU of dst->dev.
 */
2203static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2204{
2205        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2206
            /* GNU "?:" with omitted middle operand: mtu if non-zero. */
2207        return mtu ? : dst->dev->mtu;
2208}
2209
/* .update_pmtu handler of ipv4_dst_blackhole_ops: intentionally empty. */
2210static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2211                                          struct sk_buff *skb, u32 mtu)
2212{
2213}
2214
/* .redirect handler of ipv4_dst_blackhole_ops: intentionally empty. */
2215static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2216                                       struct sk_buff *skb)
2217{
2218}
2219
/* .cow_metrics handler of ipv4_dst_blackhole_ops: always returns NULL. */
2220static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2221                                          unsigned long old)
2222{
2223        return NULL;
2224}
2225
/*
 * dst_ops used for the entries created by ipv4_blackhole_route().  The
 * check, update_pmtu, redirect and cow_metrics handlers are the stubs
 * defined above; default_advmss and neigh_lookup use the handlers
 * ipv4_default_advmss and ipv4_neigh_lookup.
 */
2226static struct dst_ops ipv4_dst_blackhole_ops = {
2227        .family                 =       AF_INET,
2228        .check                  =       ipv4_blackhole_dst_check,
2229        .mtu                    =       ipv4_blackhole_mtu,
2230        .default_advmss         =       ipv4_default_advmss,
2231        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2232        .redirect               =       ipv4_rt_blackhole_redirect,
2233        .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2234        .neigh_lookup           =       ipv4_neigh_lookup,
2235};
2236
/*
 * ipv4_blackhole_route - build a discard-everything copy of @dst_orig.
 *
 * Allocates a dst with ipv4_dst_blackhole_ops whose input and output
 * handlers are dst_discard and dst_discard_sk, takes a reference on the
 * original's device (if any) and copies the rtable fields of the original;
 * rt_genid is taken from @net rather than copied.
 *
 * The reference on @dst_orig is dropped on both success and failure.
 *
 * Returns the new dst entry, or ERR_PTR(-ENOMEM) if dst_alloc() failed.
 */
2237struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2238{
2239        struct rtable *ort = (struct rtable *) dst_orig;
2240        struct rtable *rt;
2241
2242        rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2243        if (rt) {
2244                struct dst_entry *new = &rt->dst;
2245
2246                new->__use = 1;
2247                new->input = dst_discard;
2248                new->output = dst_discard_sk;
2249
2250                new->dev = ort->dst.dev;
2251                if (new->dev)
2252                        dev_hold(new->dev);
2253
2254                rt->rt_is_input = ort->rt_is_input;
2255                rt->rt_iif = ort->rt_iif;
2256                rt->rt_pmtu = ort->rt_pmtu;
2257
2258                rt->rt_genid = rt_genid_ipv4(net);
2259                rt->rt_flags = ort->rt_flags;
2260                rt->rt_type = ort->rt_type;
2261                rt->rt_gateway = ort->rt_gateway;
2262                rt->rt_uses_gateway = ort->rt_uses_gateway;
2263
2264                INIT_LIST_HEAD(&rt->rt_uncached);
2265
                    /* NOTE(review): dst_free() is called on the entry that is
                     * returned below; presumably the entry stays usable
                     * through the reference requested from dst_alloc() -
                     * confirm against dst_free().
                     */
2266                dst_free(new);
2267        }
2268
2269        dst_release(dst_orig);
2270
2271        return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2272}
2273
/*
 * ip_route_output_flow - output route lookup followed by an xfrm lookup.
 *
 * Resolves @flp4 with __ip_route_output_key() and returns its ERR_PTR()
 * unchanged on failure.  When flp4->flowi4_proto is non-zero the route is
 * passed through xfrm_lookup_route() with @sk, and that call's result is
 * returned instead.
 */
2274struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2275                                    struct sock *sk)
2276{
2277        struct rtable *rt = __ip_route_output_key(net, flp4);
2278
2279        if (IS_ERR(rt))
2280                return rt;
2281
2282        if (flp4->flowi4_proto)
2283                rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2284                                                        flowi4_to_flowi(flp4),
2285                                                        sk, 0);
2286
2287        return rt;
2288}
2289EXPORT_SYMBOL_GPL(ip_route_output_flow);
2290
/*
 * rt_fill_info - append a netlink route message describing skb's route.
 *
 * The rtable is read from @skb via skb_rtable().  A struct rtmsg header is
 * written, followed by RTA_TABLE, RTA_DST and, depending on the route and
 * on @fl4, RTA_SRC, RTA_OIF, RTA_FLOW, RTA_PREFSRC, RTA_GATEWAY, the
 * metrics, RTA_MARK, RTA_IIF and the cache info.
 *
 * Returns 0 on success.  If any attribute does not fit, the partially
 * built message is cancelled and -EMSGSIZE is returned.
 */
2291static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2292                        struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2293                        u32 seq, int event, int nowait, unsigned int flags)
2294{
2295        struct rtable *rt = skb_rtable(skb);
2296        struct rtmsg *r;
2297        struct nlmsghdr *nlh;
2298        unsigned long expires = 0;
2299        u32 error;
2300        u32 metrics[RTAX_MAX];
2301
2302        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2303        if (!nlh)
2304                return -EMSGSIZE;
2305
2306        r = nlmsg_data(nlh);
2307        r->rtm_family    = AF_INET;
2308        r->rtm_dst_len  = 32;
2309        r->rtm_src_len  = 0;
2310        r->rtm_tos      = fl4->flowi4_tos;
2311        r->rtm_table    = RT_TABLE_MAIN;
2312        if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2313                goto nla_put_failure;
2314        r->rtm_type     = rt->rt_type;
2315        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2316        r->rtm_protocol = RTPROT_UNSPEC;
            /* Only the bits of rt_flags above the low 16 are reported. */
2317        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2318        if (rt->rt_flags & RTCF_NOTIFY)
2319                r->rtm_flags |= RTM_F_NOTIFY;
2320        if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2321                r->rtm_flags |= RTCF_DOREDIRECT;
2322
2323        if (nla_put_in_addr(skb, RTA_DST, dst))
2324                goto nla_put_failure;
2325        if (src) {
2326                r->rtm_src_len = 32;
2327                if (nla_put_in_addr(skb, RTA_SRC, src))
2328                        goto nla_put_failure;
2329        }
2330        if (rt->dst.dev &&
2331            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2332                goto nla_put_failure;
2333#ifdef CONFIG_IP_ROUTE_CLASSID
2334        if (rt->dst.tclassid &&
2335            nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2336                goto nla_put_failure;
2337#endif
            /* For output routes, report the flow's source address when it
             * differs from the requested one.
             */
2338        if (!rt_is_input_route(rt) &&
2339            fl4->saddr != src) {
2340                if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2341                        goto nla_put_failure;
2342        }
2343        if (rt->rt_uses_gateway &&
2344            nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2345                goto nla_put_failure;
2346
            /* Turn the absolute expiry into time remaining; 0 once expired. */
2347        expires = rt->dst.expires;
2348        if (expires) {
2349                unsigned long now = jiffies;
2350
2351                if (time_before(now, expires))
2352                        expires -= now;
2353                else
2354                        expires = 0;
2355        }
2356
            /* Work on a local copy so the reported MTU can be overridden by
             * rt_pmtu (only while it has not expired) without touching the
             * route's own metrics.
             */
2357        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2358        if (rt->rt_pmtu && expires)
2359                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2360        if (rtnetlink_put_metrics(skb, metrics) < 0)
2361                goto nla_put_failure;
2362
2363        if (fl4->flowi4_mark &&
2364            nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2365                goto nla_put_failure;
2366
2367        error = rt->dst.error;
2368
2369        if (rt_is_input_route(rt)) {
2370#ifdef CONFIG_IP_MROUTE
2371                if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2372                    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2373                        int err = ipmr_get_route(net, skb,
2374                                                 fl4->saddr, fl4->daddr,
2375                                                 r, nowait);
2376                        if (err <= 0) {
2377                                if (!nowait) {
                                            /* err == 0 returns here without
                                             * calling nlmsg_end().
                                             */
2378                                        if (err == 0)
2379                                                return 0;
2380                                        goto nla_put_failure;
2381                                } else {
2382                                        if (err == -EMSGSIZE)
2383                                                goto nla_put_failure;
                                            /* Reported through the cache info
                                             * below instead of failing.
                                             */
2384                                        error = err;
2385                                }
2386                        }
2387                } else
2388#endif
2389                        if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2390                                goto nla_put_failure;
2391        }
2392
2393        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2394                goto nla_put_failure;
2395
2396        nlmsg_end(skb, nlh);
2397        return 0;
2398
2399nla_put_failure:
2400        nlmsg_cancel(skb, nlh);
2401        return -EMSGSIZE;
2402}
2403
/*
 * inet_rtm_getroute - answer a netlink route query.
 *
 * Parses the request attributes (RTA_SRC, RTA_DST, RTA_IIF, RTA_OIF,
 * RTA_MARK), performs an input route lookup when RTA_IIF is given and an
 * output route lookup otherwise, describes the result with rt_fill_info()
 * as an RTM_NEWROUTE message and unicasts it to the requester.
 *
 * Returns the result of rtnl_unicast() on success, or a negative errno:
 * the nlmsg_parse() error, -ENOBUFS if the reply skb cannot be allocated,
 * -ENODEV if RTA_IIF names no device, the lookup error, or the
 * rt_fill_info() error.  The reply skb is freed on every error after its
 * allocation.
 *
 * NOTE(review): presumably registered as the RTM_GETROUTE handler and
 * called with RTNL held (__dev_get_by_index() is used without taking a
 * reference) - confirm against the registration site.
 */
2404static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2405{
2406        struct net *net = sock_net(in_skb->sk);
2407        struct rtmsg *rtm;
2408        struct nlattr *tb[RTA_MAX+1];
2409        struct rtable *rt = NULL;
2410        struct flowi4 fl4;
2411        __be32 dst = 0;
2412        __be32 src = 0;
2413        u32 iif;
2414        int err;
2415        int mark;
2416        struct sk_buff *skb;
2417
2418        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2419        if (err < 0)
2420                goto errout;
2421
2422        rtm = nlmsg_data(nlh);
2423
2424        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2425        if (!skb) {
2426                err = -ENOBUFS;
2427                goto errout;
2428        }
2429
2430        /* Reserve room for dummy headers, this skb can pass
2431           through good chunk of routing engine.
2432         */
2433        skb_reset_mac_header(skb);
2434        skb_reset_network_header(skb);
2435
2436        /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
            /* The protocol byte is written through the header offset reset
             * above, before skb_reserve() is called.
             */
2437        ip_hdr(skb)->protocol = IPPROTO_ICMP;
2438        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2439
            /* Absent attributes default to 0. */
2440        src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2441        dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2442        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2443        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2444
2445        memset(&fl4, 0, sizeof(fl4));
2446        fl4.daddr = dst;
2447        fl4.saddr = src;
2448        fl4.flowi4_tos = rtm->rtm_tos;
2449        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2450        fl4.flowi4_mark = mark;
2451
2452        if (iif) {
                    /* Input lookup: the reply skb is made to look like a
                     * packet received on the named device.
                     */
2453                struct net_device *dev;
2454
2455                dev = __dev_get_by_index(net, iif);
2456                if (!dev) {
2457                        err = -ENODEV;
2458                        goto errout_free;
2459                }
2460
2461                skb->protocol   = htons(ETH_P_IP);
2462                skb->dev        = dev;
2463                skb->mark       = mark;
2464                local_bh_disable();
2465                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2466                local_bh_enable();
2467
                    /* A route carrying an error counts as a failed lookup. */
2468                rt = skb_rtable(skb);
2469                if (err == 0 && rt->dst.error)
2470                        err = -rt->dst.error;
2471        } else {
2472                rt = ip_route_output_key(net, &fl4);
2473
2474                err = 0;
2475                if (IS_ERR(rt))
2476                        err = PTR_ERR(rt);
2477        }
2478
2479        if (err)
2480                goto errout_free;
2481
2482        skb_dst_set(skb, &rt->dst);
2483        if (rtm->rtm_flags & RTM_F_NOTIFY)
2484                rt->rt_flags |= RTCF_NOTIFY;
2485
2486        err = rt_fill_info(net, dst, src, &fl4, skb,
2487                           NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2488                           RTM_NEWROUTE, 0, 0);
2489        if (err < 0)
2490                goto errout_free;
2491
2492        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2493errout:
2494        return err;
2495
2496errout_free:
2497        kfree_skb(skb);
2498        goto errout;
2499}
2500
/* Flush the route cache of the namespace that @in_dev's device belongs to. */
2501void ip_rt_multicast_event(struct in_device *in_dev)
2502{
2503        rt_cache_flush(dev_net(in_dev->dev));
2504}
2505
2506#ifdef CONFIG_SYSCTL
/*
 * Tunables exposed through ipv4_route_table below as "gc_timeout",
 * "gc_interval", "gc_min_interval" / "gc_min_interval_ms" (both backed by
 * ip_rt_gc_min_interval) and "gc_elasticity".  The three interval and
 * timeout values are initialised in units of HZ.
 */
2507static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2508static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2509static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2510static int ip_rt_gc_elasticity __read_mostly    = 8;
2511
/*
 * sysctl handler that flushes the route cache.
 *
 * The target namespace is taken from __ctl->extra1.  On a write,
 * rt_cache_flush() and fnhe_genid_bump() are called for that namespace
 * and 0 is returned; @buffer, @lenp and @ppos are not looked at.  A read
 * returns -EINVAL.
 */
2512static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2513                                        void __user *buffer,
2514                                        size_t *lenp, loff_t *ppos)
2515{
2516        struct net *net = (struct net *)__ctl->extra1;
2517
2518        if (write) {
2519                rt_cache_flush(net);
2520                fnhe_genid_bump(net);
2521                return 0;
2522        }
2523
2524        return -EINVAL;
2525}
2526
2527static struct ctl_table ipv4_route_table[] = {
2528        {
2529                .procname       = "gc_thresh",
2530                .data           = &ipv4_dst_ops.gc_thresh,
2531                .maxlen         = sizeof(int),
2532                .mode           = 0644,
2533                .proc_handler   = proc_dointvec,
2534        },
2535        {
2536                .procname       = "max_size",
2537                .data           = &ip_rt_max_size,
2538                .maxlen         = sizeof(int),
2539                .mode           = 0644,
2540                .proc_handler   = proc_dointvec,
2541        },
2542        {
2543                /*  Deprecated. Use gc_min_interval_ms */
2544
2545                .procname       = "gc_min_interval",
2546                .data           = &ip_rt_gc_min_interval,
2547                .maxlen         = sizeof(int),
2548                .mode           = 0644,
2549                .proc_handler   = proc_dointvec_jiffies,
2550        },
2551        {
2552                .procname       = "gc_min_interval_ms",
2553                .data           = &ip_rt_gc_min_interval,
2554                .maxlen         = sizeof(int),
2555                .mode           = 0644,
2556                .proc_handler   = proc_dointvec_ms_jiffies,
2557        },
2558        {
2559                .procname       = "gc_timeout",
2560                .data           = &ip_rt_gc_timeout,
2561                .maxlen         = sizeof(int),
2562                .mode           = 0644,
2563                .proc_handler   = proc_dointvec_jiffies,
2564        },
2565        {
2566                .procname       = "gc_interval",
2567                .data           = &ip_rt_gc_interval,
2568                .maxlen         = sizeof(int),
2569                .mode           = 0644,
2570                .proc_handler   = proc_dointvec_jiffies,
2571        },
2572        {
2573                .procname       = "redirect_load",
2574                .data           = &ip_rt_redirect_load,
2575                .maxlen         = sizeof(int),
2576                .mode           = 0644,
2577                .proc_handler   = proc_dointvec,
2578        },
2579        {
2580                .procname       = "redirect_number",
2581                .data           = &ip_rt_redirect_number,
2582                .maxlen         = sizeof(int),
2583                .mode           = 0644,
2584                .proc_handler   = proc_dointvec,
2585        },
2586        {
2587                .procname       = "redirect_silence",
2588                .data           = &ip_rt_redirect_silence,
2589                .maxlen         = sizeof(int),
2590                .mode           = 0644,
2591                .proc_handler   = proc_dointvec,
2592        },
2593        {
2594                .procname       = "error_cost",
2595                .data           = &ip_rt_error_cost,
2596                .maxlen         = sizeof(int),
2597                .mode           = 0644,
2598                .proc_handler   = proc_dointvec,
2599        },
2600        {
2601                .procname       = "error_burst",
2602                .data           = &ip_rt_error_burst,
2603                .maxlen         = sizeof(int),
2604                .mode           = 0644,
2605                .proc_handler   = proc_dointvec,
2606        },
2607        {
2608                .procname       = "gc_elasticity",
2609                .data           = &ip_rt_gc_elasticity,
2610                .maxlen         = sizeof(int),
2611                .mode           = 0644,
2612                .proc_handler   = proc_dointvec,
2613        },
2614        {
2615                .procname       = "mtu_expires",
2616                .data           = &ip_rt_mtu_expires,
2617                .maxlen         = sizeof(int),
2618                .mode           = 0644,
2619                .proc_handler   = proc_dointvec_jiffies,
2620        },
2621        {
2622                .procname       = "min_pmtu",
2623                .data           = &ip_rt_min_pmtu,
2624                .maxlen         = sizeof(int),
2625                .mode           = 0644,
2626                .proc_handler   = proc_dointvec,
2627        },
2628        {
2629                .procname       = "min_adv_mss",
2630                .data           = &ip_rt_min_advmss,
2631                .maxlen         = sizeof(int),
2632                .mode           = 0644,
2633                .proc_handler   = proc_dointvec,
2634        },
2635        { }
2636};
2637
2638static struct ctl_table ipv4_route_flush_table[] = {
2639        {
2640                .procname       = "flush",
2641                .maxlen         = sizeof(int),
2642                .mode           = 0200,
2643                .proc_handler   = ipv4_sysctl_rtcache_flush,
2644        },
2645        { },
2646};
2647
2648static __net_init int sysctl_route_net_init(struct net *net)
2649{
2650        struct ctl_table *tbl;
2651
2652        tbl = ipv4_route_flush_table;
2653        if (!net_eq(net, &init_net)) {
2654                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2655                if (!tbl)
2656                        goto err_dup;
2657
2658                /* Don't export sysctls to unprivileged users */
2659                if (net->user_ns != &init_user_ns)
2660                        tbl[0].procname = NULL;
2661        }
2662        tbl[0].extra1 = net;
2663
2664        net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2665        if (!net->ipv4.route_hdr)
2666                goto err_reg;
2667        return 0;
2668
2669err_reg:
2670        if (tbl != ipv4_route_flush_table)
2671                kfree(tbl);
2672err_dup:
2673        return -ENOMEM;
2674}
2675
2676static __net_exit void sysctl_route_net_exit(struct net *net)
2677{
2678        struct ctl_table *tbl;
2679
2680        tbl = net->ipv4.route_hdr->ctl_table_arg;
2681        unregister_net_sysctl_table(net->ipv4.route_hdr);
2682        BUG_ON(tbl == ipv4_route_flush_table);
2683        kfree(tbl);
2684}
2685
2686static __net_initdata struct pernet_operations sysctl_route_ops = {
2687        .init = sysctl_route_net_init,
2688        .exit = sysctl_route_net_exit,
2689};
2690#endif
2691
2692static __net_init int rt_genid_init(struct net *net)
2693{
2694        atomic_set(&net->ipv4.rt_genid, 0);
2695        atomic_set(&net->fnhe_genid, 0);
2696        get_random_bytes(&net->ipv4.dev_addr_genid,
2697                         sizeof(net->ipv4.dev_addr_genid));
2698        return 0;
2699}
2700
2701static __net_initdata struct pernet_operations rt_genid_ops = {
2702        .init = rt_genid_init,
2703};
2704
2705static int __net_init ipv4_inetpeer_init(struct net *net)
2706{
2707        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2708
2709        if (!bp)
2710                return -ENOMEM;
2711        inet_peer_base_init(bp);
2712        net->ipv4.peers = bp;
2713        return 0;
2714}
2715
2716static void __net_exit ipv4_inetpeer_exit(struct net *net)
2717{
2718        struct inet_peer_base *bp = net->ipv4.peers;
2719
2720        net->ipv4.peers = NULL;
2721        inetpeer_invalidate_tree(bp);
2722        kfree(bp);
2723}
2724
2725static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2726        .init   =       ipv4_inetpeer_init,
2727        .exit   =       ipv4_inetpeer_exit,
2728};
2729
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Per-CPU accounting area, allocated in ip_rt_init() as 256
 * struct ip_rt_acct slots per CPU.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2733
2734int __init ip_rt_init(void)
2735{
2736        int rc = 0;
2737        int cpu;
2738
2739        ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2740        if (!ip_idents)
2741                panic("IP: failed to allocate ip_idents\n");
2742
2743        prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2744
2745        for_each_possible_cpu(cpu) {
2746                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2747
2748                INIT_LIST_HEAD(&ul->head);
2749                spin_lock_init(&ul->lock);
2750        }
2751#ifdef CONFIG_IP_ROUTE_CLASSID
2752        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2753        if (!ip_rt_acct)
2754                panic("IP: failed to allocate ip_rt_acct\n");
2755#endif
2756
2757        ipv4_dst_ops.kmem_cachep =
2758                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2759                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2760
2761        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2762
2763        if (dst_entries_init(&ipv4_dst_ops) < 0)
2764                panic("IP: failed to allocate ipv4_dst_ops counter\n");
2765
2766        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2767                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2768
2769        ipv4_dst_ops.gc_thresh = ~0;
2770        ip_rt_max_size = INT_MAX;
2771
2772        devinet_init();
2773        ip_fib_init();
2774
2775        if (ip_rt_proc_init())
2776                pr_err("Unable to create route proc files\n");
2777#ifdef CONFIG_XFRM
2778        xfrm_init();
2779        xfrm4_init();
2780#endif
2781        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2782
2783#ifdef CONFIG_SYSCTL
2784        register_pernet_subsys(&sysctl_route_ops);
2785#endif
2786        register_pernet_subsys(&rt_genid_ops);
2787        register_pernet_subsys(&ipv4_inetpeer_ops);
2788        return rc;
2789}
2790
2791#ifdef CONFIG_SYSCTL
2792/*
2793 * We really need to sanitize the damn ipv4 init order, then all
2794 * this nonsense will go away.
2795 */
2796void __init ip_static_sysctl_init(void)
2797{
2798        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2799}
2800#endif
2801