linux/net/ipv4/route.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              ROUTE - implementation of the IP router.
   7 *
   8 * Authors:     Ross Biro
   9 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13 *
  14 * Fixes:
  15 *              Alan Cox        :       Verify area fixes.
  16 *              Alan Cox        :       cli() protects routing changes
  17 *              Rui Oliveira    :       ICMP routing table updates
  18 *              (rco@di.uminho.pt)      Routing table insertion and update
  19 *              Linus Torvalds  :       Rewrote bits to be sensible
  20 *              Alan Cox        :       Added BSD route gw semantics
  21 *              Alan Cox        :       Super /proc >4K
  22 *              Alan Cox        :       MTU in route table
  23 *              Alan Cox        :       MSS actually. Also added the window
  24 *                                      clamper.
  25 *              Sam Lantinga    :       Fixed route matching in rt_del()
  26 *              Alan Cox        :       Routing cache support.
  27 *              Alan Cox        :       Removed compatibility cruft.
  28 *              Alan Cox        :       RTF_REJECT support.
  29 *              Alan Cox        :       TCP irtt support.
  30 *              Jonathan Naylor :       Added Metric support.
  31 *      Miquel van Smoorenburg  :       BSD API fixes.
  32 *      Miquel van Smoorenburg  :       Metrics.
  33 *              Alan Cox        :       Use __u32 properly
  34 *              Alan Cox        :       Aligned routing errors more closely with BSD
  35 *                                      our system is still very different.
  36 *              Alan Cox        :       Faster /proc handling
  37 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38 *                                      routing caches and better behaviour.
  39 *
  40 *              Olaf Erb        :       irtt wasn't being copied right.
  41 *              Bjorn Ekwall    :       Kerneld route support.
  42 *              Alan Cox        :       Multicast fixed (I hope)
  43 *              Pavel Krauz     :       Limited broadcast fixed
  44 *              Mike McLagan    :       Routing by source
  45 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46 *                                      route.c and rewritten from scratch.
  47 *              Andi Kleen      :       Load-limit warning messages.
  48 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52 *              Marc Boucher    :       routing by fwmark
  53 *      Robert Olsson           :       Added rt_cache statistics
  54 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57 *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58 *
  59 *              This program is free software; you can redistribute it and/or
  60 *              modify it under the terms of the GNU General Public License
  61 *              as published by the Free Software Foundation; either version
  62 *              2 of the License, or (at your option) any later version.
  63 */
  64
  65#define pr_fmt(fmt) "IPv4: " fmt
  66
  67#include <linux/module.h>
  68#include <asm/uaccess.h>
  69#include <linux/bitops.h>
  70#include <linux/types.h>
  71#include <linux/kernel.h>
  72#include <linux/mm.h>
  73#include <linux/string.h>
  74#include <linux/socket.h>
  75#include <linux/sockios.h>
  76#include <linux/errno.h>
  77#include <linux/in.h>
  78#include <linux/inet.h>
  79#include <linux/netdevice.h>
  80#include <linux/proc_fs.h>
  81#include <linux/init.h>
  82#include <linux/skbuff.h>
  83#include <linux/inetdevice.h>
  84#include <linux/igmp.h>
  85#include <linux/pkt_sched.h>
  86#include <linux/mroute.h>
  87#include <linux/netfilter_ipv4.h>
  88#include <linux/random.h>
  89#include <linux/rcupdate.h>
  90#include <linux/times.h>
  91#include <linux/slab.h>
  92#include <linux/jhash.h>
  93#include <net/dst.h>
  94#include <net/dst_metadata.h>
  95#include <net/net_namespace.h>
  96#include <net/protocol.h>
  97#include <net/ip.h>
  98#include <net/route.h>
  99#include <net/inetpeer.h>
 100#include <net/sock.h>
 101#include <net/ip_fib.h>
 102#include <net/arp.h>
 103#include <net/tcp.h>
 104#include <net/icmp.h>
 105#include <net/xfrm.h>
 106#include <net/lwtunnel.h>
 107#include <net/netevent.h>
 108#include <net/rtnetlink.h>
 109#ifdef CONFIG_SYSCTL
 110#include <linux/sysctl.h>
 111#include <linux/kmemleak.h>
 112#endif
 113#include <net/secure_seq.h>
 114#include <net/ip_tunnels.h>
 115#include <net/l3mdev.h>
 116
 117#define RT_FL_TOS(oldflp4) \
 118        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 119
 120#define RT_GC_TIMEOUT (300*HZ)
 121
 122static int ip_rt_max_size;
 123static int ip_rt_redirect_number __read_mostly  = 9;
 124static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 125static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 126static int ip_rt_error_cost __read_mostly       = HZ;
 127static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 128static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 129static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 130static int ip_rt_min_advmss __read_mostly       = 256;
 131
 132static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 133/*
 134 *      Interface to generic destination cache.
 135 */
 136
 137static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 138static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 139static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 140static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 141static void              ipv4_link_failure(struct sk_buff *skb);
 142static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 143                                           struct sk_buff *skb, u32 mtu);
 144static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 145                                        struct sk_buff *skb);
 146static void             ipv4_dst_destroy(struct dst_entry *dst);
 147
 148static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 149{
 150        WARN_ON(1);
 151        return NULL;
 152}
 153
 154static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 155                                           struct sk_buff *skb,
 156                                           const void *daddr);
 157
 158static struct dst_ops ipv4_dst_ops = {
 159        .family =               AF_INET,
 160        .check =                ipv4_dst_check,
 161        .default_advmss =       ipv4_default_advmss,
 162        .mtu =                  ipv4_mtu,
 163        .cow_metrics =          ipv4_cow_metrics,
 164        .destroy =              ipv4_dst_destroy,
 165        .negative_advice =      ipv4_negative_advice,
 166        .link_failure =         ipv4_link_failure,
 167        .update_pmtu =          ip_rt_update_pmtu,
 168        .redirect =             ip_do_redirect,
 169        .local_out =            __ip_local_out,
 170        .neigh_lookup =         ipv4_neigh_lookup,
 171};
 172
 173#define ECN_OR_COST(class)      TC_PRIO_##class
 174
 175const __u8 ip_tos2prio[16] = {
 176        TC_PRIO_BESTEFFORT,
 177        ECN_OR_COST(BESTEFFORT),
 178        TC_PRIO_BESTEFFORT,
 179        ECN_OR_COST(BESTEFFORT),
 180        TC_PRIO_BULK,
 181        ECN_OR_COST(BULK),
 182        TC_PRIO_BULK,
 183        ECN_OR_COST(BULK),
 184        TC_PRIO_INTERACTIVE,
 185        ECN_OR_COST(INTERACTIVE),
 186        TC_PRIO_INTERACTIVE,
 187        ECN_OR_COST(INTERACTIVE),
 188        TC_PRIO_INTERACTIVE_BULK,
 189        ECN_OR_COST(INTERACTIVE_BULK),
 190        TC_PRIO_INTERACTIVE_BULK,
 191        ECN_OR_COST(INTERACTIVE_BULK)
 192};
 193EXPORT_SYMBOL(ip_tos2prio);
 194
 195static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 196#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 197
 198#ifdef CONFIG_PROC_FS
 199static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 200{
 201        if (*pos)
 202                return NULL;
 203        return SEQ_START_TOKEN;
 204}
 205
 206static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 207{
 208        ++*pos;
 209        return NULL;
 210}
 211
 212static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 213{
 214}
 215
 216static int rt_cache_seq_show(struct seq_file *seq, void *v)
 217{
 218        if (v == SEQ_START_TOKEN)
 219                seq_printf(seq, "%-127s\n",
 220                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 221                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 222                           "HHUptod\tSpecDst");
 223        return 0;
 224}
 225
 226static const struct seq_operations rt_cache_seq_ops = {
 227        .start  = rt_cache_seq_start,
 228        .next   = rt_cache_seq_next,
 229        .stop   = rt_cache_seq_stop,
 230        .show   = rt_cache_seq_show,
 231};
 232
 233static int rt_cache_seq_open(struct inode *inode, struct file *file)
 234{
 235        return seq_open(file, &rt_cache_seq_ops);
 236}
 237
 238static const struct file_operations rt_cache_seq_fops = {
 239        .owner   = THIS_MODULE,
 240        .open    = rt_cache_seq_open,
 241        .read    = seq_read,
 242        .llseek  = seq_lseek,
 243        .release = seq_release,
 244};
 245
 246
 247static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 248{
 249        int cpu;
 250
 251        if (*pos == 0)
 252                return SEQ_START_TOKEN;
 253
 254        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 255                if (!cpu_possible(cpu))
 256                        continue;
 257                *pos = cpu+1;
 258                return &per_cpu(rt_cache_stat, cpu);
 259        }
 260        return NULL;
 261}
 262
 263static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 264{
 265        int cpu;
 266
 267        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 268                if (!cpu_possible(cpu))
 269                        continue;
 270                *pos = cpu+1;
 271                return &per_cpu(rt_cache_stat, cpu);
 272        }
 273        return NULL;
 274
 275}
 276
 277static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 278{
 279
 280}
 281
 282static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 283{
 284        struct rt_cache_stat *st = v;
 285
 286        if (v == SEQ_START_TOKEN) {
 287                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 288                return 0;
 289        }
 290
 291        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 292                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 293                   dst_entries_get_slow(&ipv4_dst_ops),
 294                   0, /* st->in_hit */
 295                   st->in_slow_tot,
 296                   st->in_slow_mc,
 297                   st->in_no_route,
 298                   st->in_brd,
 299                   st->in_martian_dst,
 300                   st->in_martian_src,
 301
 302                   0, /* st->out_hit */
 303                   st->out_slow_tot,
 304                   st->out_slow_mc,
 305
 306                   0, /* st->gc_total */
 307                   0, /* st->gc_ignored */
 308                   0, /* st->gc_goal_miss */
 309                   0, /* st->gc_dst_overflow */
 310                   0, /* st->in_hlist_search */
 311                   0  /* st->out_hlist_search */
 312                );
 313        return 0;
 314}
 315
 316static const struct seq_operations rt_cpu_seq_ops = {
 317        .start  = rt_cpu_seq_start,
 318        .next   = rt_cpu_seq_next,
 319        .stop   = rt_cpu_seq_stop,
 320        .show   = rt_cpu_seq_show,
 321};
 322
 323
 324static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 325{
 326        return seq_open(file, &rt_cpu_seq_ops);
 327}
 328
 329static const struct file_operations rt_cpu_seq_fops = {
 330        .owner   = THIS_MODULE,
 331        .open    = rt_cpu_seq_open,
 332        .read    = seq_read,
 333        .llseek  = seq_lseek,
 334        .release = seq_release,
 335};
 336
 337#ifdef CONFIG_IP_ROUTE_CLASSID
 338static int rt_acct_proc_show(struct seq_file *m, void *v)
 339{
 340        struct ip_rt_acct *dst, *src;
 341        unsigned int i, j;
 342
 343        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 344        if (!dst)
 345                return -ENOMEM;
 346
 347        for_each_possible_cpu(i) {
 348                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 349                for (j = 0; j < 256; j++) {
 350                        dst[j].o_bytes   += src[j].o_bytes;
 351                        dst[j].o_packets += src[j].o_packets;
 352                        dst[j].i_bytes   += src[j].i_bytes;
 353                        dst[j].i_packets += src[j].i_packets;
 354                }
 355        }
 356
 357        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 358        kfree(dst);
 359        return 0;
 360}
 361
 362static int rt_acct_proc_open(struct inode *inode, struct file *file)
 363{
 364        return single_open(file, rt_acct_proc_show, NULL);
 365}
 366
 367static const struct file_operations rt_acct_proc_fops = {
 368        .owner          = THIS_MODULE,
 369        .open           = rt_acct_proc_open,
 370        .read           = seq_read,
 371        .llseek         = seq_lseek,
 372        .release        = single_release,
 373};
 374#endif
 375
 376static int __net_init ip_rt_do_proc_init(struct net *net)
 377{
 378        struct proc_dir_entry *pde;
 379
 380        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
 381                          &rt_cache_seq_fops);
 382        if (!pde)
 383                goto err1;
 384
 385        pde = proc_create("rt_cache", S_IRUGO,
 386                          net->proc_net_stat, &rt_cpu_seq_fops);
 387        if (!pde)
 388                goto err2;
 389
 390#ifdef CONFIG_IP_ROUTE_CLASSID
 391        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 392        if (!pde)
 393                goto err3;
 394#endif
 395        return 0;
 396
 397#ifdef CONFIG_IP_ROUTE_CLASSID
 398err3:
 399        remove_proc_entry("rt_cache", net->proc_net_stat);
 400#endif
 401err2:
 402        remove_proc_entry("rt_cache", net->proc_net);
 403err1:
 404        return -ENOMEM;
 405}
 406
 407static void __net_exit ip_rt_do_proc_exit(struct net *net)
 408{
 409        remove_proc_entry("rt_cache", net->proc_net_stat);
 410        remove_proc_entry("rt_cache", net->proc_net);
 411#ifdef CONFIG_IP_ROUTE_CLASSID
 412        remove_proc_entry("rt_acct", net->proc_net);
 413#endif
 414}
 415
 416static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 417        .init = ip_rt_do_proc_init,
 418        .exit = ip_rt_do_proc_exit,
 419};
 420
 421static int __init ip_rt_proc_init(void)
 422{
 423        return register_pernet_subsys(&ip_rt_proc_ops);
 424}
 425
 426#else
 427static inline int ip_rt_proc_init(void)
 428{
 429        return 0;
 430}
 431#endif /* CONFIG_PROC_FS */
 432
 433static inline bool rt_is_expired(const struct rtable *rth)
 434{
 435        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 436}
 437
 438void rt_cache_flush(struct net *net)
 439{
 440        rt_genid_bump_ipv4(net);
 441}
 442
 443static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 444                                           struct sk_buff *skb,
 445                                           const void *daddr)
 446{
 447        struct net_device *dev = dst->dev;
 448        const __be32 *pkey = daddr;
 449        const struct rtable *rt;
 450        struct neighbour *n;
 451
 452        rt = (const struct rtable *) dst;
 453        if (rt->rt_gateway)
 454                pkey = (const __be32 *) &rt->rt_gateway;
 455        else if (skb)
 456                pkey = &ip_hdr(skb)->daddr;
 457
 458        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 459        if (n)
 460                return n;
 461        return neigh_create(&arp_tbl, pkey, dev);
 462}
 463
 464#define IP_IDENTS_SZ 2048u
 465
 466static atomic_t *ip_idents __read_mostly;
 467static u32 *ip_tstamps __read_mostly;
 468
  469/* In order to protect privacy, we add a perturbation to identifiers
  470 * if one generator is seldom used. This makes it hard for an attacker
  471 * to infer how many packets were sent between two points in time.
  472 */
 473u32 ip_idents_reserve(u32 hash, int segs)
 474{
 475        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 476        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 477        u32 old = ACCESS_ONCE(*p_tstamp);
 478        u32 now = (u32)jiffies;
 479        u32 delta = 0;
 480
 481        if (old != now && cmpxchg(p_tstamp, old, now) == old)
 482                delta = prandom_u32_max(now - old);
 483
 484        return atomic_add_return(segs + delta, p_id) - segs;
 485}
 486EXPORT_SYMBOL(ip_idents_reserve);
 487
 488void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 489{
 490        static u32 ip_idents_hashrnd __read_mostly;
 491        u32 hash, id;
 492
 493        net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
 494
 495        hash = jhash_3words((__force u32)iph->daddr,
 496                            (__force u32)iph->saddr,
 497                            iph->protocol ^ net_hash_mix(net),
 498                            ip_idents_hashrnd);
 499        id = ip_idents_reserve(hash, segs);
 500        iph->id = htons(id);
 501}
 502EXPORT_SYMBOL(__ip_select_ident);
 503
 504static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 505                             const struct iphdr *iph,
 506                             int oif, u8 tos,
 507                             u8 prot, u32 mark, int flow_flags)
 508{
 509        if (sk) {
 510                const struct inet_sock *inet = inet_sk(sk);
 511
 512                oif = sk->sk_bound_dev_if;
 513                mark = sk->sk_mark;
 514                tos = RT_CONN_FLAGS(sk);
 515                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 516        }
 517        flowi4_init_output(fl4, oif, mark, tos,
 518                           RT_SCOPE_UNIVERSE, prot,
 519                           flow_flags,
 520                           iph->daddr, iph->saddr, 0, 0);
 521}
 522
 523static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 524                               const struct sock *sk)
 525{
 526        const struct iphdr *iph = ip_hdr(skb);
 527        int oif = skb->dev->ifindex;
 528        u8 tos = RT_TOS(iph->tos);
 529        u8 prot = iph->protocol;
 530        u32 mark = skb->mark;
 531
 532        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
 533}
 534
 535static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 536{
 537        const struct inet_sock *inet = inet_sk(sk);
 538        const struct ip_options_rcu *inet_opt;
 539        __be32 daddr = inet->inet_daddr;
 540
 541        rcu_read_lock();
 542        inet_opt = rcu_dereference(inet->inet_opt);
 543        if (inet_opt && inet_opt->opt.srr)
 544                daddr = inet_opt->opt.faddr;
 545        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 546                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 547                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 548                           inet_sk_flowi_flags(sk),
 549                           daddr, inet->inet_saddr, 0, 0);
 550        rcu_read_unlock();
 551}
 552
 553static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 554                                 const struct sk_buff *skb)
 555{
 556        if (skb)
 557                build_skb_flow_key(fl4, skb, sk);
 558        else
 559                build_sk_flow_key(fl4, sk);
 560}
 561
 562static inline void rt_free(struct rtable *rt)
 563{
 564        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
 565}
 566
 567static DEFINE_SPINLOCK(fnhe_lock);
 568
 569static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 570{
 571        struct rtable *rt;
 572
 573        rt = rcu_dereference(fnhe->fnhe_rth_input);
 574        if (rt) {
 575                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 576                rt_free(rt);
 577        }
 578        rt = rcu_dereference(fnhe->fnhe_rth_output);
 579        if (rt) {
 580                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 581                rt_free(rt);
 582        }
 583}
 584
 585static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 586{
 587        struct fib_nh_exception *fnhe, *oldest;
 588
 589        oldest = rcu_dereference(hash->chain);
 590        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 591             fnhe = rcu_dereference(fnhe->fnhe_next)) {
 592                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 593                        oldest = fnhe;
 594        }
 595        fnhe_flush_routes(oldest);
 596        return oldest;
 597}
 598
 599static inline u32 fnhe_hashfun(__be32 daddr)
 600{
 601        static u32 fnhe_hashrnd __read_mostly;
 602        u32 hval;
 603
 604        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 605        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 606        return hash_32(hval, FNHE_HASH_SHIFT);
 607}
 608
 609static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 610{
 611        rt->rt_pmtu = fnhe->fnhe_pmtu;
 612        rt->dst.expires = fnhe->fnhe_expires;
 613
 614        if (fnhe->fnhe_gw) {
 615                rt->rt_flags |= RTCF_REDIRECTED;
 616                rt->rt_gateway = fnhe->fnhe_gw;
 617                rt->rt_uses_gateway = 1;
 618        }
 619}
 620
 621static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 622                                  u32 pmtu, unsigned long expires)
 623{
 624        struct fnhe_hash_bucket *hash;
 625        struct fib_nh_exception *fnhe;
 626        struct rtable *rt;
 627        unsigned int i;
 628        int depth;
 629        u32 hval = fnhe_hashfun(daddr);
 630
 631        spin_lock_bh(&fnhe_lock);
 632
 633        hash = rcu_dereference(nh->nh_exceptions);
 634        if (!hash) {
 635                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 636                if (!hash)
 637                        goto out_unlock;
 638                rcu_assign_pointer(nh->nh_exceptions, hash);
 639        }
 640
 641        hash += hval;
 642
 643        depth = 0;
 644        for (fnhe = rcu_dereference(hash->chain); fnhe;
 645             fnhe = rcu_dereference(fnhe->fnhe_next)) {
 646                if (fnhe->fnhe_daddr == daddr)
 647                        break;
 648                depth++;
 649        }
 650
 651        if (fnhe) {
 652                if (gw)
 653                        fnhe->fnhe_gw = gw;
 654                if (pmtu) {
 655                        fnhe->fnhe_pmtu = pmtu;
 656                        fnhe->fnhe_expires = max(1UL, expires);
 657                }
 658                /* Update all cached dsts too */
 659                rt = rcu_dereference(fnhe->fnhe_rth_input);
 660                if (rt)
 661                        fill_route_from_fnhe(rt, fnhe);
 662                rt = rcu_dereference(fnhe->fnhe_rth_output);
 663                if (rt)
 664                        fill_route_from_fnhe(rt, fnhe);
 665        } else {
 666                if (depth > FNHE_RECLAIM_DEPTH)
 667                        fnhe = fnhe_oldest(hash);
 668                else {
 669                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 670                        if (!fnhe)
 671                                goto out_unlock;
 672
 673                        fnhe->fnhe_next = hash->chain;
 674                        rcu_assign_pointer(hash->chain, fnhe);
 675                }
 676                fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
 677                fnhe->fnhe_daddr = daddr;
 678                fnhe->fnhe_gw = gw;
 679                fnhe->fnhe_pmtu = pmtu;
 680                fnhe->fnhe_expires = expires;
 681
 682                /* Exception created; mark the cached routes for the nexthop
 683                 * stale, so anyone caching it rechecks if this exception
 684                 * applies to them.
 685                 */
 686                rt = rcu_dereference(nh->nh_rth_input);
 687                if (rt)
 688                        rt->dst.obsolete = DST_OBSOLETE_KILL;
 689
 690                for_each_possible_cpu(i) {
 691                        struct rtable __rcu **prt;
 692                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 693                        rt = rcu_dereference(*prt);
 694                        if (rt)
 695                                rt->dst.obsolete = DST_OBSOLETE_KILL;
 696                }
 697        }
 698
 699        fnhe->fnhe_stamp = jiffies;
 700
 701out_unlock:
 702        spin_unlock_bh(&fnhe_lock);
 703}
 704
 705static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 706                             bool kill_route)
 707{
 708        __be32 new_gw = icmp_hdr(skb)->un.gateway;
 709        __be32 old_gw = ip_hdr(skb)->saddr;
 710        struct net_device *dev = skb->dev;
 711        struct in_device *in_dev;
 712        struct fib_result res;
 713        struct neighbour *n;
 714        struct net *net;
 715
 716        switch (icmp_hdr(skb)->code & 7) {
 717        case ICMP_REDIR_NET:
 718        case ICMP_REDIR_NETTOS:
 719        case ICMP_REDIR_HOST:
 720        case ICMP_REDIR_HOSTTOS:
 721                break;
 722
 723        default:
 724                return;
 725        }
 726
 727        if (rt->rt_gateway != old_gw)
 728                return;
 729
 730        in_dev = __in_dev_get_rcu(dev);
 731        if (!in_dev)
 732                return;
 733
 734        net = dev_net(dev);
 735        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 736            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 737            ipv4_is_zeronet(new_gw))
 738                goto reject_redirect;
 739
 740        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 741                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 742                        goto reject_redirect;
 743                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 744                        goto reject_redirect;
 745        } else {
 746                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 747                        goto reject_redirect;
 748        }
 749
 750        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
 751        if (!IS_ERR(n)) {
 752                if (!(n->nud_state & NUD_VALID)) {
 753                        neigh_event_send(n, NULL);
 754                } else {
 755                        if (fib_lookup(net, fl4, &res, 0) == 0) {
 756                                struct fib_nh *nh = &FIB_RES_NH(res);
 757
 758                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
 759                                                0, jiffies + ip_rt_gc_timeout);
 760                        }
 761                        if (kill_route)
 762                                rt->dst.obsolete = DST_OBSOLETE_KILL;
 763                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 764                }
 765                neigh_release(n);
 766        }
 767        return;
 768
 769reject_redirect:
 770#ifdef CONFIG_IP_ROUTE_VERBOSE
 771        if (IN_DEV_LOG_MARTIANS(in_dev)) {
 772                const struct iphdr *iph = (const struct iphdr *) skb->data;
 773                __be32 daddr = iph->daddr;
 774                __be32 saddr = iph->saddr;
 775
 776                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 777                                     "  Advised path = %pI4 -> %pI4\n",
 778                                     &old_gw, dev->name, &new_gw,
 779                                     &saddr, &daddr);
 780        }
 781#endif
 782        ;
 783}
 784
 785static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 786{
 787        struct rtable *rt;
 788        struct flowi4 fl4;
 789        const struct iphdr *iph = (const struct iphdr *) skb->data;
 790        int oif = skb->dev->ifindex;
 791        u8 tos = RT_TOS(iph->tos);
 792        u8 prot = iph->protocol;
 793        u32 mark = skb->mark;
 794
 795        rt = (struct rtable *) dst;
 796
 797        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
 798        __ip_do_redirect(rt, skb, &fl4, true);
 799}
 800
 801static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 802{
 803        struct rtable *rt = (struct rtable *)dst;
 804        struct dst_entry *ret = dst;
 805
 806        if (rt) {
 807                if (dst->obsolete > 0) {
 808                        ip_rt_put(rt);
 809                        ret = NULL;
 810                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 811                           rt->dst.expires) {
 812                        ip_rt_put(rt);
 813                        ret = NULL;
 814                }
 815        }
 816        return ret;
 817}
 818
 819/*
 820 * Algorithm:
 821 *      1. The first ip_rt_redirect_number redirects are sent
 822 *         with exponential backoff, then we stop sending them at all,
 823 *         assuming that the host ignores our redirects.
  824 *      2. If we did not see packets requiring redirects
  825 *         during ip_rt_redirect_silence, we assume that the host
  826 *         has forgotten the redirected route and start sending redirects again.
 827 *
 828 * This algorithm is much cheaper and more intelligent than dumb load limiting
 829 * in icmp.c.
 830 *
 831 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 832 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 833 */
 834
 835void ip_rt_send_redirect(struct sk_buff *skb)
 836{
 837        struct rtable *rt = skb_rtable(skb);
 838        struct in_device *in_dev;
 839        struct inet_peer *peer;
 840        struct net *net;
 841        int log_martians;
 842        int vif;
 843
 844        rcu_read_lock();
 845        in_dev = __in_dev_get_rcu(rt->dst.dev);
 846        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 847                rcu_read_unlock();
 848                return;
 849        }
 850        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 851        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 852        rcu_read_unlock();
 853
 854        net = dev_net(rt->dst.dev);
 855        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 856        if (!peer) {
 857                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 858                          rt_nexthop(rt, ip_hdr(skb)->daddr));
 859                return;
 860        }
 861
 862        /* No redirected packets during ip_rt_redirect_silence;
 863         * reset the algorithm.
 864         */
 865        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 866                peer->rate_tokens = 0;
 867
 868        /* Too many ignored redirects; do not send anything
 869         * set dst.rate_last to the last seen redirected packet.
 870         */
 871        if (peer->rate_tokens >= ip_rt_redirect_number) {
 872                peer->rate_last = jiffies;
 873                goto out_put_peer;
 874        }
 875
 876        /* Check for load limit; set rate_last to the latest sent
 877         * redirect.
 878         */
 879        if (peer->rate_tokens == 0 ||
 880            time_after(jiffies,
 881                       (peer->rate_last +
 882                        (ip_rt_redirect_load << peer->rate_tokens)))) {
 883                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 884
 885                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 886                peer->rate_last = jiffies;
 887                ++peer->rate_tokens;
 888#ifdef CONFIG_IP_ROUTE_VERBOSE
 889                if (log_martians &&
 890                    peer->rate_tokens == ip_rt_redirect_number)
 891                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 892                                             &ip_hdr(skb)->saddr, inet_iif(skb),
 893                                             &ip_hdr(skb)->daddr, &gw);
 894#endif
 895        }
 896out_put_peer:
 897        inet_putpeer(peer);
 898}
 899
 900static int ip_error(struct sk_buff *skb)
 901{
 902        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 903        struct rtable *rt = skb_rtable(skb);
 904        struct inet_peer *peer;
 905        unsigned long now;
 906        struct net *net;
 907        bool send;
 908        int code;
 909
 910        /* IP on this device is disabled. */
 911        if (!in_dev)
 912                goto out;
 913
 914        net = dev_net(rt->dst.dev);
 915        if (!IN_DEV_FORWARD(in_dev)) {
 916                switch (rt->dst.error) {
 917                case EHOSTUNREACH:
 918                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 919                        break;
 920
 921                case ENETUNREACH:
 922                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 923                        break;
 924                }
 925                goto out;
 926        }
 927
 928        switch (rt->dst.error) {
 929        case EINVAL:
 930        default:
 931                goto out;
 932        case EHOSTUNREACH:
 933                code = ICMP_HOST_UNREACH;
 934                break;
 935        case ENETUNREACH:
 936                code = ICMP_NET_UNREACH;
 937                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 938                break;
 939        case EACCES:
 940                code = ICMP_PKT_FILTERED;
 941                break;
 942        }
 943
 944        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 945                               l3mdev_master_ifindex(skb->dev), 1);
 946
 947        send = true;
 948        if (peer) {
 949                now = jiffies;
 950                peer->rate_tokens += now - peer->rate_last;
 951                if (peer->rate_tokens > ip_rt_error_burst)
 952                        peer->rate_tokens = ip_rt_error_burst;
 953                peer->rate_last = now;
 954                if (peer->rate_tokens >= ip_rt_error_cost)
 955                        peer->rate_tokens -= ip_rt_error_cost;
 956                else
 957                        send = false;
 958                inet_putpeer(peer);
 959        }
 960        if (send)
 961                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 962
 963out:    kfree_skb(skb);
 964        return 0;
 965}
 966
 967static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 968{
 969        struct dst_entry *dst = &rt->dst;
 970        struct fib_result res;
 971
 972        if (dst_metric_locked(dst, RTAX_MTU))
 973                return;
 974
 975        if (ipv4_mtu(dst) < mtu)
 976                return;
 977
 978        if (mtu < ip_rt_min_pmtu)
 979                mtu = ip_rt_min_pmtu;
 980
 981        if (rt->rt_pmtu == mtu &&
 982            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
 983                return;
 984
 985        rcu_read_lock();
 986        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
 987                struct fib_nh *nh = &FIB_RES_NH(res);
 988
 989                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
 990                                      jiffies + ip_rt_mtu_expires);
 991        }
 992        rcu_read_unlock();
 993}
 994
 995static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 996                              struct sk_buff *skb, u32 mtu)
 997{
 998        struct rtable *rt = (struct rtable *) dst;
 999        struct flowi4 fl4;
1000
1001        ip_rt_build_flow_key(&fl4, sk, skb);
1002        __ip_rt_update_pmtu(rt, &fl4, mtu);
1003}
1004
1005void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1006                      int oif, u32 mark, u8 protocol, int flow_flags)
1007{
1008        const struct iphdr *iph = (const struct iphdr *) skb->data;
1009        struct flowi4 fl4;
1010        struct rtable *rt;
1011
1012        if (!mark)
1013                mark = IP4_REPLY_MARK(net, skb->mark);
1014
1015        __build_flow_key(&fl4, NULL, iph, oif,
1016                         RT_TOS(iph->tos), protocol, mark, flow_flags);
1017        rt = __ip_route_output_key(net, &fl4);
1018        if (!IS_ERR(rt)) {
1019                __ip_rt_update_pmtu(rt, &fl4, mtu);
1020                ip_rt_put(rt);
1021        }
1022}
1023EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1024
1025static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1026{
1027        const struct iphdr *iph = (const struct iphdr *) skb->data;
1028        struct flowi4 fl4;
1029        struct rtable *rt;
1030
1031        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1032
1033        if (!fl4.flowi4_mark)
1034                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1035
1036        rt = __ip_route_output_key(sock_net(sk), &fl4);
1037        if (!IS_ERR(rt)) {
1038                __ip_rt_update_pmtu(rt, &fl4, mtu);
1039                ip_rt_put(rt);
1040        }
1041}
1042
 /*
  * Apply a path MTU update on behalf of socket @sk.
  *
  * @skb: buffer whose data pointer is at the IP header of the affected flow
  * @sk:  socket the flow belongs to
  * @mtu: new path MTU
  *
  * Runs with the socket bh-locked.  Nothing is done when
  * ip_sk_accept_pmtu() rejects the update.  If the socket is owned by user
  * context or has no cached dst, the update goes through
  * __ipv4_sk_update_pmtu(), which leaves the socket's cached dst alone.
  * Otherwise the cached dst is updated directly and replaced with a freshly
  * looked up route whenever it fails validation before or after the update.
  */
 1043void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 1044{
 1045        const struct iphdr *iph = (const struct iphdr *) skb->data;
 1046        struct flowi4 fl4;
 1047        struct rtable *rt;
 1048        struct dst_entry *odst = NULL;
             /* true once rt is a route we looked up, not the socket's odst */
 1049        bool new = false;
 1050
 1051        bh_lock_sock(sk);
 1052
 1053        if (!ip_sk_accept_pmtu(sk))
 1054                goto out;
 1055
             /* The value obtained here is handed to dst_release() at "out". */
 1056        odst = sk_dst_get(sk);
 1057
 1058        if (sock_owned_by_user(sk) || !odst) {
 1059                __ipv4_sk_update_pmtu(skb, sk, mtu);
 1060                goto out;
 1061        }
 1062
 1063        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
 1064
 1065        rt = (struct rtable *)odst;
             /* Cached dst no longer passes its check: look up a replacement. */
 1066        if (odst->obsolete && !odst->ops->check(odst, 0)) {
 1067                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
 1068                if (IS_ERR(rt))
 1069                        goto out;
 1070
 1071                new = true;
 1072        }
 1073
             /* The update is applied to the dst's path entry. */
 1074        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
 1075
             /*
              * If rt fails dst_check() after the update, look the route up
              * once more.  A route obtained above is released first; the
              * socket's own odst is released only at "out".
              */
 1076        if (!dst_check(&rt->dst, 0)) {
 1077                if (new)
 1078                        dst_release(&rt->dst);
 1079
 1080                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
 1081                if (IS_ERR(rt))
 1082                        goto out;
 1083
 1084                new = true;
 1085        }
 1086
             /* Install the freshly looked up route as the socket's dst. */
 1087        if (new)
 1088                sk_dst_set(sk, &rt->dst);
 1089
 1090out:
 1091        bh_unlock_sock(sk);
 1092        dst_release(odst);
 1093}
 1094EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1095
1096void ipv4_redirect(struct sk_buff *skb, struct net *net,
1097                   int oif, u32 mark, u8 protocol, int flow_flags)
1098{
1099        const struct iphdr *iph = (const struct iphdr *) skb->data;
1100        struct flowi4 fl4;
1101        struct rtable *rt;
1102
1103        __build_flow_key(&fl4, NULL, iph, oif,
1104                         RT_TOS(iph->tos), protocol, mark, flow_flags);
1105        rt = __ip_route_output_key(net, &fl4);
1106        if (!IS_ERR(rt)) {
1107                __ip_do_redirect(rt, skb, &fl4, false);
1108                ip_rt_put(rt);
1109        }
1110}
1111EXPORT_SYMBOL_GPL(ipv4_redirect);
1112
1113void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1114{
1115        const struct iphdr *iph = (const struct iphdr *) skb->data;
1116        struct flowi4 fl4;
1117        struct rtable *rt;
1118
1119        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1120        rt = __ip_route_output_key(sock_net(sk), &fl4);
1121        if (!IS_ERR(rt)) {
1122                __ip_do_redirect(rt, skb, &fl4, false);
1123                ip_rt_put(rt);
1124        }
1125}
1126EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1127
1128static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1129{
1130        struct rtable *rt = (struct rtable *) dst;
1131
1132        /* All IPV4 dsts are created with ->obsolete set to the value
1133         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1134         * into this function always.
1135         *
1136         * When a PMTU/redirect information update invalidates a route,
1137         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1138         * DST_OBSOLETE_DEAD by dst_free().
1139         */
1140        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1141                return NULL;
1142        return dst;
1143}
1144
1145static void ipv4_link_failure(struct sk_buff *skb)
1146{
1147        struct rtable *rt;
1148
1149        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1150
1151        rt = skb_rtable(skb);
1152        if (rt)
1153                dst_set_expires(&rt->dst, 0);
1154}
1155
1156static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1157{
1158        pr_debug("%s: %pI4 -> %pI4, %s\n",
1159                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1160                 skb->dev ? skb->dev->name : "?");
1161        kfree_skb(skb);
1162        WARN_ON(1);
1163        return 0;
1164}
1165
1166/*
1167   We do not cache source address of outgoing interface,
1168   because it is used only by IP RR, TS and SRR options,
1169   so that it out of fast path.
1170
1171   BTW remember: "addr" is allowed to be not aligned
1172   in IP options!
1173 */
1174
1175void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1176{
1177        __be32 src;
1178
1179        if (rt_is_output_route(rt))
1180                src = ip_hdr(skb)->saddr;
1181        else {
1182                struct fib_result res;
1183                struct flowi4 fl4;
1184                struct iphdr *iph;
1185
1186                iph = ip_hdr(skb);
1187
1188                memset(&fl4, 0, sizeof(fl4));
1189                fl4.daddr = iph->daddr;
1190                fl4.saddr = iph->saddr;
1191                fl4.flowi4_tos = RT_TOS(iph->tos);
1192                fl4.flowi4_oif = rt->dst.dev->ifindex;
1193                fl4.flowi4_iif = skb->dev->ifindex;
1194                fl4.flowi4_mark = skb->mark;
1195
1196                rcu_read_lock();
1197                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1198                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1199                else
1200                        src = inet_select_addr(rt->dst.dev,
1201                                               rt_nexthop(rt, iph->daddr),
1202                                               RT_SCOPE_UNIVERSE);
1203                rcu_read_unlock();
1204        }
1205        memcpy(addr, &src, 4);
1206}
1207
1208#ifdef CONFIG_IP_ROUTE_CLASSID
1209static void set_class_tag(struct rtable *rt, u32 tag)
1210{
1211        if (!(rt->dst.tclassid & 0xFFFF))
1212                rt->dst.tclassid |= tag & 0xFFFF;
1213        if (!(rt->dst.tclassid & 0xFFFF0000))
1214                rt->dst.tclassid |= tag & 0xFFFF0000;
1215}
1216#endif
1217
1218static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1219{
1220        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1221
1222        if (advmss == 0) {
1223                advmss = max_t(unsigned int, dst->dev->mtu - 40,
1224                               ip_rt_min_advmss);
1225                if (advmss > 65535 - 40)
1226                        advmss = 65535 - 40;
1227        }
1228        return advmss;
1229}
1230
1231static unsigned int ipv4_mtu(const struct dst_entry *dst)
1232{
1233        const struct rtable *rt = (const struct rtable *) dst;
1234        unsigned int mtu = rt->rt_pmtu;
1235
1236        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1237                mtu = dst_metric_raw(dst, RTAX_MTU);
1238
1239        if (mtu)
1240                return mtu;
1241
1242        mtu = dst->dev->mtu;
1243
1244        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1245                if (rt->rt_uses_gateway && mtu > 576)
1246                        mtu = 576;
1247        }
1248
1249        return min_t(unsigned int, mtu, IP_MAX_MTU);
1250}
1251
1252static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1253{
1254        struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1255        struct fib_nh_exception *fnhe;
1256        u32 hval;
1257
1258        if (!hash)
1259                return NULL;
1260
1261        hval = fnhe_hashfun(daddr);
1262
1263        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1264             fnhe = rcu_dereference(fnhe->fnhe_next)) {
1265                if (fnhe->fnhe_daddr == daddr)
1266                        return fnhe;
1267        }
1268        return NULL;
1269}
1270
 /*
  * Bind route @rt to the nexthop exception @fnhe for destination @daddr.
  *
  * Done under fnhe_lock, and only if @fnhe still describes @daddr.  The
  * route is filled in from the exception and, unless it is flagged
  * DST_NOCACHE, published in the exception's input or output slot in place
  * of the previous route, which is freed.
  *
  * Returns true when @rt was stored in the exception, false otherwise.
  */
 1271static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
 1272                              __be32 daddr)
 1273{
 1274        bool ret = false;
 1275
 1276        spin_lock_bh(&fnhe_lock);
 1277
 1278        if (daddr == fnhe->fnhe_daddr) {
 1279                struct rtable __rcu **porig;
 1280                struct rtable *orig;
 1281                int genid = fnhe_genid(dev_net(rt->dst.dev));
 1282
                     /* Input and output routes are cached in separate slots. */
 1283                if (rt_is_input_route(rt))
 1284                        porig = &fnhe->fnhe_rth_input;
 1285                else
 1286                        porig = &fnhe->fnhe_rth_output;
 1287                orig = rcu_dereference(*porig);
 1288
                     /*
                      * The exception belongs to an older generation: clear
                      * its gateway, PMTU and expiry, and flush its routes.
                      * orig is cleared so it is not freed again below.
                      */
 1289                if (fnhe->fnhe_genid != genid) {
 1290                        fnhe->fnhe_genid = genid;
 1291                        fnhe->fnhe_gw = 0;
 1292                        fnhe->fnhe_pmtu = 0;
 1293                        fnhe->fnhe_expires = 0;
 1294                        fnhe_flush_routes(fnhe);
 1295                        orig = NULL;
 1296                }
 1297                fill_route_from_fnhe(rt, fnhe);
                     /* No gateway after the fill: fall back to daddr. */
 1298                if (!rt->rt_gateway)
 1299                        rt->rt_gateway = daddr;
 1300
                     /* Publish the new route, then free the one it replaced. */
 1301                if (!(rt->dst.flags & DST_NOCACHE)) {
 1302                        rcu_assign_pointer(*porig, rt);
 1303                        if (orig)
 1304                                rt_free(orig);
 1305                        ret = true;
 1306                }
 1307
 1308                fnhe->fnhe_stamp = jiffies;
 1309        }
 1310        spin_unlock_bh(&fnhe_lock);
 1311
 1312        return ret;
 1313}
1314
1315static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1316{
1317        struct rtable *orig, *prev, **p;
1318        bool ret = true;
1319
1320        if (rt_is_input_route(rt)) {
1321                p = (struct rtable **)&nh->nh_rth_input;
1322        } else {
1323                p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1324        }
1325        orig = *p;
1326
1327        prev = cmpxchg(p, orig, rt);
1328        if (prev == orig) {
1329                if (orig)
1330                        rt_free(orig);
1331        } else
1332                ret = false;
1333
1334        return ret;
1335}
1336
 /*
  * List of routes that are not stored in a nexthop or exception cache.
  * Routes are linked in through their rt_uncached member and remember the
  * list they sit on in rt_uncached_list.
  */
 1337struct uncached_list {
 1338        spinlock_t              lock;   /* taken around every access to head */
 1339        struct list_head        head;   /* the uncached rtables */
 1340};
 1341
 /* One list per CPU; rt_flush_dev() walks the lists of all possible CPUs. */
 1342static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1343
1344static void rt_add_uncached_list(struct rtable *rt)
1345{
1346        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1347
1348        rt->rt_uncached_list = ul;
1349
1350        spin_lock_bh(&ul->lock);
1351        list_add_tail(&rt->rt_uncached, &ul->head);
1352        spin_unlock_bh(&ul->lock);
1353}
1354
1355static void ipv4_dst_destroy(struct dst_entry *dst)
1356{
1357        struct rtable *rt = (struct rtable *) dst;
1358
1359        if (!list_empty(&rt->rt_uncached)) {
1360                struct uncached_list *ul = rt->rt_uncached_list;
1361
1362                spin_lock_bh(&ul->lock);
1363                list_del(&rt->rt_uncached);
1364                spin_unlock_bh(&ul->lock);
1365        }
1366}
1367
1368void rt_flush_dev(struct net_device *dev)
1369{
1370        struct net *net = dev_net(dev);
1371        struct rtable *rt;
1372        int cpu;
1373
1374        for_each_possible_cpu(cpu) {
1375                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1376
1377                spin_lock_bh(&ul->lock);
1378                list_for_each_entry(rt, &ul->head, rt_uncached) {
1379                        if (rt->dst.dev != dev)
1380                                continue;
1381                        rt->dst.dev = net->loopback_dev;
1382                        dev_hold(rt->dst.dev);
1383                        dev_put(dev);
1384                }
1385                spin_unlock_bh(&ul->lock);
1386        }
1387}
1388
1389static bool rt_cache_valid(const struct rtable *rt)
1390{
1391        return  rt &&
1392                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1393                !rt_is_expired(rt);
1394}
1395
 /*
  * Fill in the nexthop-dependent parts of @rt and try to cache the route.
  *
  * @rt:    route being set up
  * @daddr: destination address the route is for
  * @res:   FIB lookup result; its nexthop is used when @fi is set
  * @fnhe:  nexthop exception to bind the route to, or NULL
  * @fi:    fib_info of the matched route, or NULL
  * @type:  not used in this function
  * @itag:  class tag merged into the route under CONFIG_IP_ROUTE_CLASSID
  *
  * With @fi set, gateway, metrics, class id and lwtunnel state are taken
  * from the nexthop, and the route is stored either in @fnhe or in the
  * nexthop's route cache.  A route that could not be stored, or that has
  * no @fi at all, is put on the uncached list instead.
  */
 1396static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 1397                           const struct fib_result *res,
 1398                           struct fib_nh_exception *fnhe,
 1399                           struct fib_info *fi, u16 type, u32 itag)
 1400{
 1401        bool cached = false;
 1402
 1403        if (fi) {
 1404                struct fib_nh *nh = &FIB_RES_NH(*res);
 1405
                     /* Only a link-scope nexthop gateway is recorded. */
 1406                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
 1407                        rt->rt_gateway = nh->nh_gw;
 1408                        rt->rt_uses_gateway = 1;
 1409                }
 1410                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
 1411#ifdef CONFIG_IP_ROUTE_CLASSID
 1412                rt->dst.tclassid = nh->nh_tclassid;
 1413#endif
 1414                rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
                     /*
                      * An exception, when given, is the only place tried;
                      * otherwise the nexthop cache is used unless the route
                      * is flagged DST_NOCACHE.
                      */
 1415                if (unlikely(fnhe))
 1416                        cached = rt_bind_exception(rt, fnhe, daddr);
 1417                else if (!(rt->dst.flags & DST_NOCACHE))
 1418                        cached = rt_cache_route(nh, rt);
 1419                if (unlikely(!cached)) {
 1420                        /* Routes we intend to cache in nexthop exception or
 1421                         * FIB nexthop have the DST_NOCACHE bit clear.
 1422                         * However, if we are unsuccessful at storing this
 1423                         * route into the cache we really need to set it.
 1424                         */
 1425                        rt->dst.flags |= DST_NOCACHE;
 1426                        if (!rt->rt_gateway)
 1427                                rt->rt_gateway = daddr;
 1428                        rt_add_uncached_list(rt);
 1429                }
 1430        } else
 1431                rt_add_uncached_list(rt);
 1432
             /* set_class_tag() only fills halves of tclassid that are empty. */
 1433#ifdef CONFIG_IP_ROUTE_CLASSID
 1434#ifdef CONFIG_IP_MULTIPLE_TABLES
 1435        set_class_tag(rt, res->tclassid);
 1436#endif
 1437        set_class_tag(rt, itag);
 1438#endif
 1439}
1440
1441struct rtable *rt_dst_alloc(struct net_device *dev,
1442                            unsigned int flags, u16 type,
1443                            bool nopolicy, bool noxfrm, bool will_cache)
1444{
1445        struct rtable *rt;
1446
1447        rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1448                       (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1449                       (nopolicy ? DST_NOPOLICY : 0) |
1450                       (noxfrm ? DST_NOXFRM : 0));
1451
1452        if (rt) {
1453                rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1454                rt->rt_flags = flags;
1455                rt->rt_type = type;
1456                rt->rt_is_input = 0;
1457                rt->rt_iif = 0;
1458                rt->rt_pmtu = 0;
1459                rt->rt_gateway = 0;
1460                rt->rt_uses_gateway = 0;
1461                rt->rt_table_id = 0;
1462                INIT_LIST_HEAD(&rt->rt_uncached);
1463
1464                rt->dst.output = ip_output;
1465                if (flags & RTCF_LOCAL)
1466                        rt->dst.input = ip_local_deliver;
1467        }
1468
1469        return rt;
1470}
1471EXPORT_SYMBOL(rt_dst_alloc);
1472
1473/* called in rcu_read_lock() section */
1474static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1475                                u8 tos, struct net_device *dev, int our)
1476{
1477        struct rtable *rth;
1478        struct in_device *in_dev = __in_dev_get_rcu(dev);
1479        unsigned int flags = RTCF_MULTICAST;
1480        u32 itag = 0;
1481        int err;
1482
1483        /* Primary sanity checks. */
1484
1485        if (!in_dev)
1486                return -EINVAL;
1487
1488        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1489            skb->protocol != htons(ETH_P_IP))
1490                goto e_inval;
1491
1492        if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1493                goto e_inval;
1494
1495        if (ipv4_is_zeronet(saddr)) {
1496                if (!ipv4_is_local_multicast(daddr))
1497                        goto e_inval;
1498        } else {
1499                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1500                                          in_dev, &itag);
1501                if (err < 0)
1502                        goto e_err;
1503        }
1504        if (our)
1505                flags |= RTCF_LOCAL;
1506
1507        rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1508                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1509        if (!rth)
1510                goto e_nobufs;
1511
1512#ifdef CONFIG_IP_ROUTE_CLASSID
1513        rth->dst.tclassid = itag;
1514#endif
1515        rth->dst.output = ip_rt_bug;
1516        rth->rt_is_input= 1;
1517
1518#ifdef CONFIG_IP_MROUTE
1519        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1520                rth->dst.input = ip_mr_input;
1521#endif
1522        RT_CACHE_STAT_INC(in_slow_mc);
1523
1524        skb_dst_set(skb, &rth->dst);
1525        return 0;
1526
1527e_nobufs:
1528        return -ENOBUFS;
1529e_inval:
1530        return -EINVAL;
1531e_err:
1532        return err;
1533}
1534
1535
1536static void ip_handle_martian_source(struct net_device *dev,
1537                                     struct in_device *in_dev,
1538                                     struct sk_buff *skb,
1539                                     __be32 daddr,
1540                                     __be32 saddr)
1541{
1542        RT_CACHE_STAT_INC(in_martian_src);
1543#ifdef CONFIG_IP_ROUTE_VERBOSE
1544        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1545                /*
1546                 *      RFC1812 recommendation, if source is martian,
1547                 *      the only hint is MAC header.
1548                 */
1549                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1550                        &daddr, &saddr, dev->name);
1551                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1552                        print_hex_dump(KERN_WARNING, "ll header: ",
1553                                       DUMP_PREFIX_OFFSET, 16, 1,
1554                                       skb_mac_header(skb),
1555                                       dev->hard_header_len, true);
1556                }
1557        }
1558#endif
1559}
1560
 /*
  * Remove the nexthop exception for @daddr from @nh, if there is one.
  *
  * The chain selected by fnhe_hashfun(daddr) is walked under fnhe_lock.
  * A matching entry is unlinked, its routes are flushed and the entry is
  * freed with kfree_rcu().
  *
  * NOTE(review): nh->nh_exceptions is used without a NULL check, so the
  * nexthop presumably must already have an exception table - confirm
  * against the callers.
  */
 1561static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
 1562{
 1563        struct fnhe_hash_bucket *hash;
 1564        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
 1565        u32 hval = fnhe_hashfun(daddr);
 1566
 1567        spin_lock_bh(&fnhe_lock);
 1568
 1569        hash = rcu_dereference_protected(nh->nh_exceptions,
 1570                                         lockdep_is_held(&fnhe_lock));
             /* Step to the bucket for this destination. */
 1571        hash += hval;
 1572
             /* fnhe_p always points at the link that leads to fnhe. */
 1573        fnhe_p = &hash->chain;
 1574        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
 1575        while (fnhe) {
 1576                if (fnhe->fnhe_daddr == daddr) {
                             /* Unlink by making the previous link skip fnhe. */
 1577                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
 1578                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
 1579                        fnhe_flush_routes(fnhe);
 1580                        kfree_rcu(fnhe, rcu);
 1581                        break;
 1582                }
 1583                fnhe_p = &fnhe->fnhe_next;
 1584                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
 1585                                                 lockdep_is_held(&fnhe_lock));
 1586        }
 1587
 1588        spin_unlock_bh(&fnhe_lock);
 1589}
1590
1591/* called in rcu_read_lock() section */
 /*
  * Create, or reuse from cache, the forwarding route for an input packet
  * and attach it to @skb.
  *
  * @skb:    packet being routed
  * @res:    FIB lookup result for the destination
  * @in_dev: in_device of the receiving interface
  * @daddr:  destination address
  * @saddr:  source address
  * @tos:    type of service, passed to source validation
  *
  * Returns 0 on success, -EINVAL when the output device has no in_device
  * or a non-IP (ARP) request would be routed back out of the receiving
  * interface without proxy-ARP PVLAN enabled, -ENOBUFS when no route can
  * be allocated, or the negative result of fib_validate_source().
  */
 1592static int __mkroute_input(struct sk_buff *skb,
 1593                           const struct fib_result *res,
 1594                           struct in_device *in_dev,
 1595                           __be32 daddr, __be32 saddr, u32 tos)
 1596{
 1597        struct fib_nh_exception *fnhe;
 1598        struct rtable *rth;
 1599        int err;
 1600        struct in_device *out_dev;
 1601        bool do_cache;
 1602        u32 itag = 0;
 1603
 1604        /* get a working reference to the output device */
 1605        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
 1606        if (!out_dev) {
 1607                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
 1608                return -EINVAL;
 1609        }
 1610
 1611        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
 1612                                  in_dev->dev, in_dev, &itag);
 1613        if (err < 0) {
 1614                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
 1615                                         saddr);
 1616
 1617                goto cleanup;
 1618        }
 1619
             /* Caching needs a fib_info and no class tag from validation. */
 1620        do_cache = res->fi && !itag;
             /*
              * The packet would leave through the interface it arrived on:
              * flag it so a redirect is sent, provided redirects are enabled
              * and the medium is shared or the gateway is on-link for saddr.
              * err is the non-negative result of source validation here.
              */
 1621        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
 1622            skb->protocol == htons(ETH_P_IP) &&
 1623            (IN_DEV_SHARED_MEDIA(out_dev) ||
 1624             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
 1625                IPCB(skb)->flags |= IPSKB_DOREDIRECT;
 1626
 1627        if (skb->protocol != htons(ETH_P_IP)) {
 1628                /* Not IP (i.e. ARP). Do not create route, if it is
 1629                 * invalid for proxy arp. DNAT routes are always valid.
 1630                 *
 1631                 * Proxy arp feature have been extended to allow, ARP
 1632                 * replies back to the same interface, to support
 1633                 * Private VLAN switch technologies. See arp.c.
 1634                 */
 1635                if (out_dev == in_dev &&
 1636                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
 1637                        err = -EINVAL;
 1638                        goto cleanup;
 1639                }
 1640        }
 1641
 1642        fnhe = find_exception(&FIB_RES_NH(*res), daddr);
 1643        if (do_cache) {
 1644                if (fnhe) {
                             /*
                              * Prefer the route cached in the exception.  If
                              * that route has expired, drop the exception
                              * and fall back to the nexthop's cached route.
                              */
 1645                        rth = rcu_dereference(fnhe->fnhe_rth_input);
 1646                        if (rth && rth->dst.expires &&
 1647                            time_after(jiffies, rth->dst.expires)) {
 1648                                ip_del_fnhe(&FIB_RES_NH(*res), daddr);
 1649                                fnhe = NULL;
 1650                        } else {
 1651                                goto rt_cache;
 1652                        }
 1653                }
 1654
 1655                rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
 1656
 1657rt_cache:
                     /* A valid cached route is attached without a reference. */
 1658                if (rt_cache_valid(rth)) {
 1659                        skb_dst_set_noref(skb, &rth->dst);
 1660                        goto out;
 1661                }
 1662        }
 1663
             /* No usable cached route: build a new one. */
 1664        rth = rt_dst_alloc(out_dev->dev, 0, res->type,
 1665                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
 1666                           IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
 1667        if (!rth) {
 1668                err = -ENOBUFS;
 1669                goto cleanup;
 1670        }
 1671
 1672        rth->rt_is_input = 1;
 1673        if (res->table)
 1674                rth->rt_table_id = res->table->tb_id;
 1675        RT_CACHE_STAT_INC(in_slow_tot);
 1676
 1677        rth->dst.input = ip_forward;
 1678
 1679        rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
             /*
              * When the lwtunnel state asks for it, save the original
              * output/input handlers in the state and route through the
              * lwtunnel handlers instead.
              */
 1680        if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
 1681                rth->dst.lwtstate->orig_output = rth->dst.output;
 1682                rth->dst.output = lwtunnel_output;
 1683        }
 1684        if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
 1685                rth->dst.lwtstate->orig_input = rth->dst.input;
 1686                rth->dst.input = lwtunnel_input;
 1687        }
 1688        skb_dst_set(skb, &rth->dst);
 1689out:
 1690        err = 0;
 1691 cleanup:
 1692        return err;
 1693}
1694
1695#ifdef CONFIG_IP_ROUTE_MULTIPATH
1696
1697/* To make ICMP packets follow the right flow, the multipath hash is
1698 * calculated from the inner IP addresses in reverse order.
1699 */
1700static int ip_multipath_icmp_hash(struct sk_buff *skb)
1701{
1702        const struct iphdr *outer_iph = ip_hdr(skb);
1703        struct icmphdr _icmph;
1704        const struct icmphdr *icmph;
1705        struct iphdr _inner_iph;
1706        const struct iphdr *inner_iph;
1707
1708        if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1709                goto standard_hash;
1710
1711        icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1712                                   &_icmph);
1713        if (!icmph)
1714                goto standard_hash;
1715
1716        if (icmph->type != ICMP_DEST_UNREACH &&
1717            icmph->type != ICMP_REDIRECT &&
1718            icmph->type != ICMP_TIME_EXCEEDED &&
1719            icmph->type != ICMP_PARAMETERPROB) {
1720                goto standard_hash;
1721        }
1722
1723        inner_iph = skb_header_pointer(skb,
1724                                       outer_iph->ihl * 4 + sizeof(_icmph),
1725                                       sizeof(_inner_iph), &_inner_iph);
1726        if (!inner_iph)
1727                goto standard_hash;
1728
1729        return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
1730
1731standard_hash:
1732        return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
1733}
1734
1735#endif /* CONFIG_IP_ROUTE_MULTIPATH */
1736
1737static int ip_mkroute_input(struct sk_buff *skb,
1738                            struct fib_result *res,
1739                            const struct flowi4 *fl4,
1740                            struct in_device *in_dev,
1741                            __be32 daddr, __be32 saddr, u32 tos)
1742{
1743#ifdef CONFIG_IP_ROUTE_MULTIPATH
1744        if (res->fi && res->fi->fib_nhs > 1) {
1745                int h;
1746
1747                if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
1748                        h = ip_multipath_icmp_hash(skb);
1749                else
1750                        h = fib_multipath_hash(saddr, daddr);
1751                fib_select_multipath(res, h);
1752        }
1753#endif
1754
1755        /* create a routing cache entry */
1756        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1757}
1758
1759/*
1760 *      NOTE. We drop all the packets that has local source
1761 *      addresses, because every properly looped back packet
1762 *      must have correct destination already attached by output routine.
1763 *
1764 *      Such approach solves two big problems:
1765 *      1. Not simplex devices are handled properly.
1766 *      2. IP spoofing attempts are filtered with 100% of guarantee.
1767 *      called with rcu_read_lock()
1768 */
1769
1770static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1771                               u8 tos, struct net_device *dev)
1772{
1773        struct fib_result res;
1774        struct in_device *in_dev = __in_dev_get_rcu(dev);
1775        struct ip_tunnel_info *tun_info;
1776        struct flowi4   fl4;
1777        unsigned int    flags = 0;
1778        u32             itag = 0;
1779        struct rtable   *rth;
1780        int             err = -EINVAL;
1781        struct net    *net = dev_net(dev);
1782        bool do_cache;
1783
1784        /* IP on this device is disabled. */
1785
1786        if (!in_dev)
1787                goto out;
1788
1789        /* Check for the most weird martians, which can be not detected
1790           by fib_lookup.
1791         */
1792
1793        tun_info = skb_tunnel_info(skb);
1794        if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1795                fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1796        else
1797                fl4.flowi4_tun_key.tun_id = 0;
1798        skb_dst_drop(skb);
1799
1800        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1801                goto martian_source;
1802
1803        res.fi = NULL;
1804        res.table = NULL;
1805        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1806                goto brd_input;
1807
1808        /* Accept zero addresses only to limited broadcast;
1809         * I even do not know to fix it or not. Waiting for complains :-)
1810         */
1811        if (ipv4_is_zeronet(saddr))
1812                goto martian_source;
1813
1814        if (ipv4_is_zeronet(daddr))
1815                goto martian_destination;
1816
1817        /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1818         * and call it once if daddr or/and saddr are loopback addresses
1819         */
1820        if (ipv4_is_loopback(daddr)) {
1821                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1822                        goto martian_destination;
1823        } else if (ipv4_is_loopback(saddr)) {
1824                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1825                        goto martian_source;
1826        }
1827
1828        /*
1829         *      Now we are ready to route packet.
1830         */
1831        fl4.flowi4_oif = 0;
1832        fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev);
1833        fl4.flowi4_mark = skb->mark;
1834        fl4.flowi4_tos = tos;
1835        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1836        fl4.flowi4_flags = 0;
1837        fl4.daddr = daddr;
1838        fl4.saddr = saddr;
1839        err = fib_lookup(net, &fl4, &res, 0);
1840        if (err != 0) {
1841                if (!IN_DEV_FORWARD(in_dev))
1842                        err = -EHOSTUNREACH;
1843                goto no_route;
1844        }
1845
1846        if (res.type == RTN_BROADCAST)
1847                goto brd_input;
1848
1849        if (res.type == RTN_LOCAL) {
1850                err = fib_validate_source(skb, saddr, daddr, tos,
1851                                          0, dev, in_dev, &itag);
1852                if (err < 0)
1853                        goto martian_source;
1854                goto local_input;
1855        }
1856
1857        if (!IN_DEV_FORWARD(in_dev)) {
1858                err = -EHOSTUNREACH;
1859                goto no_route;
1860        }
1861        if (res.type != RTN_UNICAST)
1862                goto martian_destination;
1863
1864        err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1865out:    return err;
1866
1867brd_input:
1868        if (skb->protocol != htons(ETH_P_IP))
1869                goto e_inval;
1870
1871        if (!ipv4_is_zeronet(saddr)) {
1872                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1873                                          in_dev, &itag);
1874                if (err < 0)
1875                        goto martian_source;
1876        }
1877        flags |= RTCF_BROADCAST;
1878        res.type = RTN_BROADCAST;
1879        RT_CACHE_STAT_INC(in_brd);
1880
1881local_input:
1882        do_cache = false;
1883        if (res.fi) {
1884                if (!itag) {
1885                        rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1886                        if (rt_cache_valid(rth)) {
1887                                skb_dst_set_noref(skb, &rth->dst);
1888                                err = 0;
1889                                goto out;
1890                        }
1891                        do_cache = true;
1892                }
1893        }
1894
1895        rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
1896                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1897        if (!rth)
1898                goto e_nobufs;
1899
1900        rth->dst.output= ip_rt_bug;
1901#ifdef CONFIG_IP_ROUTE_CLASSID
1902        rth->dst.tclassid = itag;
1903#endif
1904        rth->rt_is_input = 1;
1905        if (res.table)
1906                rth->rt_table_id = res.table->tb_id;
1907
1908        RT_CACHE_STAT_INC(in_slow_tot);
1909        if (res.type == RTN_UNREACHABLE) {
1910                rth->dst.input= ip_error;
1911                rth->dst.error= -err;
1912                rth->rt_flags   &= ~RTCF_LOCAL;
1913        }
1914        if (do_cache) {
1915                if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1916                        rth->dst.flags |= DST_NOCACHE;
1917                        rt_add_uncached_list(rth);
1918                }
1919        }
1920        skb_dst_set(skb, &rth->dst);
1921        err = 0;
1922        goto out;
1923
1924no_route:
1925        RT_CACHE_STAT_INC(in_no_route);
1926        res.type = RTN_UNREACHABLE;
1927        res.fi = NULL;
1928        res.table = NULL;
1929        goto local_input;
1930
1931        /*
1932         *      Do not cache martian addresses: they should be logged (RFC1812)
1933         */
1934martian_destination:
1935        RT_CACHE_STAT_INC(in_martian_dst);
1936#ifdef CONFIG_IP_ROUTE_VERBOSE
1937        if (IN_DEV_LOG_MARTIANS(in_dev))
1938                net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1939                                     &daddr, &saddr, dev->name);
1940#endif
1941
1942e_inval:
1943        err = -EINVAL;
1944        goto out;
1945
1946e_nobufs:
1947        err = -ENOBUFS;
1948        goto out;
1949
1950martian_source:
1951        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1952        goto out;
1953}
1954
1955int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1956                         u8 tos, struct net_device *dev)
1957{
1958        int res;
1959
1960        rcu_read_lock();
1961
1962        /* Multicast recognition logic is moved from route cache to here.
1963           The problem was that too many Ethernet cards have broken/missing
1964           hardware multicast filters :-( As result the host on multicasting
1965           network acquires a lot of useless route cache entries, sort of
1966           SDR messages from all the world. Now we try to get rid of them.
1967           Really, provided software IP multicast filter is organized
1968           reasonably (at least, hashed), it does not result in a slowdown
1969           comparing with route cache reject entries.
1970           Note, that multicast routers are not affected, because
1971           route cache entry is created eventually.
1972         */
1973        if (ipv4_is_multicast(daddr)) {
1974                struct in_device *in_dev = __in_dev_get_rcu(dev);
1975
1976                if (in_dev) {
1977                        int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1978                                                  ip_hdr(skb)->protocol);
1979                        if (our
1980#ifdef CONFIG_IP_MROUTE
1981                                ||
1982                            (!ipv4_is_local_multicast(daddr) &&
1983                             IN_DEV_MFORWARD(in_dev))
1984#endif
1985                           ) {
1986                                int res = ip_route_input_mc(skb, daddr, saddr,
1987                                                            tos, dev, our);
1988                                rcu_read_unlock();
1989                                return res;
1990                        }
1991                }
1992                rcu_read_unlock();
1993                return -EINVAL;
1994        }
1995        res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1996        rcu_read_unlock();
1997        return res;
1998}
1999EXPORT_SYMBOL(ip_route_input_noref);
2000
2001/* called with rcu_read_lock() */
2002static struct rtable *__mkroute_output(const struct fib_result *res,
2003                                       const struct flowi4 *fl4, int orig_oif,
2004                                       struct net_device *dev_out,
2005                                       unsigned int flags)
2006{
2007        struct fib_info *fi = res->fi;
2008        struct fib_nh_exception *fnhe;
2009        struct in_device *in_dev;
2010        u16 type = res->type;
2011        struct rtable *rth;
2012        bool do_cache;
2013
2014        in_dev = __in_dev_get_rcu(dev_out);
2015        if (!in_dev)
2016                return ERR_PTR(-EINVAL);
2017
2018        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2019                if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2020                        return ERR_PTR(-EINVAL);
2021
2022        if (ipv4_is_lbcast(fl4->daddr))
2023                type = RTN_BROADCAST;
2024        else if (ipv4_is_multicast(fl4->daddr))
2025                type = RTN_MULTICAST;
2026        else if (ipv4_is_zeronet(fl4->daddr))
2027                return ERR_PTR(-EINVAL);
2028
2029        if (dev_out->flags & IFF_LOOPBACK)
2030                flags |= RTCF_LOCAL;
2031
2032        do_cache = true;
2033        if (type == RTN_BROADCAST) {
2034                flags |= RTCF_BROADCAST | RTCF_LOCAL;
2035                fi = NULL;
2036        } else if (type == RTN_MULTICAST) {
2037                flags |= RTCF_MULTICAST | RTCF_LOCAL;
2038                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2039                                     fl4->flowi4_proto))
2040                        flags &= ~RTCF_LOCAL;
2041                else
2042                        do_cache = false;
2043                /* If multicast route do not exist use
2044                 * default one, but do not gateway in this case.
2045                 * Yes, it is hack.
2046                 */
2047                if (fi && res->prefixlen < 4)
2048                        fi = NULL;
2049        } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2050                   (orig_oif != dev_out->ifindex)) {
2051                /* For local routes that require a particular output interface
2052                 * we do not want to cache the result.  Caching the result
2053                 * causes incorrect behaviour when there are multiple source
2054                 * addresses on the interface, the end result being that if the
2055                 * intended recipient is waiting on that interface for the
2056                 * packet he won't receive it because it will be delivered on
2057                 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2058                 * be set to the loopback interface as well.
2059                 */
2060                fi = NULL;
2061        }
2062
2063        fnhe = NULL;
2064        do_cache &= fi != NULL;
2065        if (do_cache) {
2066                struct rtable __rcu **prth;
2067                struct fib_nh *nh = &FIB_RES_NH(*res);
2068
2069                fnhe = find_exception(nh, fl4->daddr);
2070                if (fnhe) {
2071                        prth = &fnhe->fnhe_rth_output;
2072                        rth = rcu_dereference(*prth);
2073                        if (rth && rth->dst.expires &&
2074                            time_after(jiffies, rth->dst.expires)) {
2075                                ip_del_fnhe(nh, fl4->daddr);
2076                                fnhe = NULL;
2077                        } else {
2078                                goto rt_cache;
2079                        }
2080                }
2081
2082                if (unlikely(fl4->flowi4_flags &
2083                             FLOWI_FLAG_KNOWN_NH &&
2084                             !(nh->nh_gw &&
2085                               nh->nh_scope == RT_SCOPE_LINK))) {
2086                        do_cache = false;
2087                        goto add;
2088                }
2089                prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2090                rth = rcu_dereference(*prth);
2091
2092rt_cache:
2093                if (rt_cache_valid(rth)) {
2094                        dst_hold(&rth->dst);
2095                        return rth;
2096                }
2097        }
2098
2099add:
2100        rth = rt_dst_alloc(dev_out, flags, type,
2101                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
2102                           IN_DEV_CONF_GET(in_dev, NOXFRM),
2103                           do_cache);
2104        if (!rth)
2105                return ERR_PTR(-ENOBUFS);
2106
2107        rth->rt_iif     = orig_oif ? : 0;
2108        if (res->table)
2109                rth->rt_table_id = res->table->tb_id;
2110
2111        RT_CACHE_STAT_INC(out_slow_tot);
2112
2113        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2114                if (flags & RTCF_LOCAL &&
2115                    !(dev_out->flags & IFF_LOOPBACK)) {
2116                        rth->dst.output = ip_mc_output;
2117                        RT_CACHE_STAT_INC(out_slow_mc);
2118                }
2119#ifdef CONFIG_IP_MROUTE
2120                if (type == RTN_MULTICAST) {
2121                        if (IN_DEV_MFORWARD(in_dev) &&
2122                            !ipv4_is_local_multicast(fl4->daddr)) {
2123                                rth->dst.input = ip_mr_input;
2124                                rth->dst.output = ip_mc_output;
2125                        }
2126                }
2127#endif
2128        }
2129
2130        rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2131        if (lwtunnel_output_redirect(rth->dst.lwtstate))
2132                rth->dst.output = lwtunnel_output;
2133
2134        return rth;
2135}
2136
2137/*
2138 * Major route resolver routine.
2139 */
2140
2141struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2142                                          int mp_hash)
2143{
2144        struct net_device *dev_out = NULL;
2145        __u8 tos = RT_FL_TOS(fl4);
2146        unsigned int flags = 0;
2147        struct fib_result res;
2148        struct rtable *rth;
2149        int master_idx;
2150        int orig_oif;
2151        int err = -ENETUNREACH;
2152
2153        res.tclassid    = 0;
2154        res.fi          = NULL;
2155        res.table       = NULL;
2156
2157        orig_oif = fl4->flowi4_oif;
2158
2159        master_idx = l3mdev_master_ifindex_by_index(net, fl4->flowi4_oif);
2160        if (master_idx)
2161                fl4->flowi4_oif = master_idx;
2162        fl4->flowi4_iif = LOOPBACK_IFINDEX;
2163        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2164        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2165                         RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2166
2167        rcu_read_lock();
2168        if (fl4->saddr) {
2169                rth = ERR_PTR(-EINVAL);
2170                if (ipv4_is_multicast(fl4->saddr) ||
2171                    ipv4_is_lbcast(fl4->saddr) ||
2172                    ipv4_is_zeronet(fl4->saddr))
2173                        goto out;
2174
2175                /* I removed check for oif == dev_out->oif here.
2176                   It was wrong for two reasons:
2177                   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2178                      is assigned to multiple interfaces.
2179                   2. Moreover, we are allowed to send packets with saddr
2180                      of another iface. --ANK
2181                 */
2182
2183                if (fl4->flowi4_oif == 0 &&
2184                    (ipv4_is_multicast(fl4->daddr) ||
2185                     ipv4_is_lbcast(fl4->daddr))) {
2186                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2187                        dev_out = __ip_dev_find(net, fl4->saddr, false);
2188                        if (!dev_out)
2189                                goto out;
2190
2191                        /* Special hack: user can direct multicasts
2192                           and limited broadcast via necessary interface
2193                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2194                           This hack is not just for fun, it allows
2195                           vic,vat and friends to work.
2196                           They bind socket to loopback, set ttl to zero
2197                           and expect that it will work.
2198                           From the viewpoint of routing cache they are broken,
2199                           because we are not allowed to build multicast path
2200                           with loopback source addr (look, routing cache
2201                           cannot know, that ttl is zero, so that packet
2202                           will not leave this host and route is valid).
2203                           Luckily, this hack is good workaround.
2204                         */
2205
2206                        fl4->flowi4_oif = dev_out->ifindex;
2207                        goto make_route;
2208                }
2209
2210                if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2211                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2212                        if (!__ip_dev_find(net, fl4->saddr, false))
2213                                goto out;
2214                }
2215        }
2216
2217
2218        if (fl4->flowi4_oif) {
2219                dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2220                rth = ERR_PTR(-ENODEV);
2221                if (!dev_out)
2222                        goto out;
2223
2224                /* RACE: Check return value of inet_select_addr instead. */
2225                if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2226                        rth = ERR_PTR(-ENETUNREACH);
2227                        goto out;
2228                }
2229                if (ipv4_is_local_multicast(fl4->daddr) ||
2230                    ipv4_is_lbcast(fl4->daddr) ||
2231                    fl4->flowi4_proto == IPPROTO_IGMP) {
2232                        if (!fl4->saddr)
2233                                fl4->saddr = inet_select_addr(dev_out, 0,
2234                                                              RT_SCOPE_LINK);
2235                        goto make_route;
2236                }
2237                if (!fl4->saddr) {
2238                        if (ipv4_is_multicast(fl4->daddr))
2239                                fl4->saddr = inet_select_addr(dev_out, 0,
2240                                                              fl4->flowi4_scope);
2241                        else if (!fl4->daddr)
2242                                fl4->saddr = inet_select_addr(dev_out, 0,
2243                                                              RT_SCOPE_HOST);
2244                }
2245
2246                rth = l3mdev_get_rtable(dev_out, fl4);
2247                if (rth)
2248                        goto out;
2249        }
2250
2251        if (!fl4->daddr) {
2252                fl4->daddr = fl4->saddr;
2253                if (!fl4->daddr)
2254                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2255                dev_out = net->loopback_dev;
2256                fl4->flowi4_oif = LOOPBACK_IFINDEX;
2257                res.type = RTN_LOCAL;
2258                flags |= RTCF_LOCAL;
2259                goto make_route;
2260        }
2261
2262        err = fib_lookup(net, fl4, &res, 0);
2263        if (err) {
2264                res.fi = NULL;
2265                res.table = NULL;
2266                if (fl4->flowi4_oif &&
2267                    !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
2268                        /* Apparently, routing tables are wrong. Assume,
2269                           that the destination is on link.
2270
2271                           WHY? DW.
2272                           Because we are allowed to send to iface
2273                           even if it has NO routes and NO assigned
2274                           addresses. When oif is specified, routing
2275                           tables are looked up with only one purpose:
2276                           to catch if destination is gatewayed, rather than
2277                           direct. Moreover, if MSG_DONTROUTE is set,
2278                           we send packet, ignoring both routing tables
2279                           and ifaddr state. --ANK
2280
2281
2282                           We could make it even if oif is unknown,
2283                           likely IPv6, but we do not.
2284                         */
2285
2286                        if (fl4->saddr == 0)
2287                                fl4->saddr = inet_select_addr(dev_out, 0,
2288                                                              RT_SCOPE_LINK);
2289                        res.type = RTN_UNICAST;
2290                        goto make_route;
2291                }
2292                rth = ERR_PTR(err);
2293                goto out;
2294        }
2295
2296        if (res.type == RTN_LOCAL) {
2297                if (!fl4->saddr) {
2298                        if (res.fi->fib_prefsrc)
2299                                fl4->saddr = res.fi->fib_prefsrc;
2300                        else
2301                                fl4->saddr = fl4->daddr;
2302                }
2303                dev_out = net->loopback_dev;
2304                fl4->flowi4_oif = dev_out->ifindex;
2305                flags |= RTCF_LOCAL;
2306                goto make_route;
2307        }
2308
2309        fib_select_path(net, &res, fl4, mp_hash);
2310
2311        dev_out = FIB_RES_DEV(res);
2312        fl4->flowi4_oif = dev_out->ifindex;
2313
2314
2315make_route:
2316        rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2317
2318out:
2319        rcu_read_unlock();
2320        return rth;
2321}
2322EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
2323
2324static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2325{
2326        return NULL;
2327}
2328
2329static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2330{
2331        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2332
2333        return mtu ? : dst->dev->mtu;
2334}
2335
2336static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2337                                          struct sk_buff *skb, u32 mtu)
2338{
2339}
2340
/* dst_ops->redirect callback for blackhole routes: intentionally
 * does nothing.
 */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst,
				       struct sock *sk,
				       struct sk_buff *skb)
{
}
2345
2346static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2347                                          unsigned long old)
2348{
2349        return NULL;
2350}
2351
2352static struct dst_ops ipv4_dst_blackhole_ops = {
2353        .family                 =       AF_INET,
2354        .check                  =       ipv4_blackhole_dst_check,
2355        .mtu                    =       ipv4_blackhole_mtu,
2356        .default_advmss         =       ipv4_default_advmss,
2357        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2358        .redirect               =       ipv4_rt_blackhole_redirect,
2359        .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2360        .neigh_lookup           =       ipv4_neigh_lookup,
2361};
2362
2363struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2364{
2365        struct rtable *ort = (struct rtable *) dst_orig;
2366        struct rtable *rt;
2367
2368        rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2369        if (rt) {
2370                struct dst_entry *new = &rt->dst;
2371
2372                new->__use = 1;
2373                new->input = dst_discard;
2374                new->output = dst_discard_out;
2375
2376                new->dev = ort->dst.dev;
2377                if (new->dev)
2378                        dev_hold(new->dev);
2379
2380                rt->rt_is_input = ort->rt_is_input;
2381                rt->rt_iif = ort->rt_iif;
2382                rt->rt_pmtu = ort->rt_pmtu;
2383
2384                rt->rt_genid = rt_genid_ipv4(net);
2385                rt->rt_flags = ort->rt_flags;
2386                rt->rt_type = ort->rt_type;
2387                rt->rt_gateway = ort->rt_gateway;
2388                rt->rt_uses_gateway = ort->rt_uses_gateway;
2389
2390                INIT_LIST_HEAD(&rt->rt_uncached);
2391                dst_free(new);
2392        }
2393
2394        dst_release(dst_orig);
2395
2396        return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2397}
2398
2399struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2400                                    const struct sock *sk)
2401{
2402        struct rtable *rt = __ip_route_output_key(net, flp4);
2403
2404        if (IS_ERR(rt))
2405                return rt;
2406
2407        if (flp4->flowi4_proto)
2408                rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2409                                                        flowi4_to_flowi(flp4),
2410                                                        sk, 0);
2411
2412        return rt;
2413}
2414EXPORT_SYMBOL_GPL(ip_route_output_flow);
2415
2416static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2417                        struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2418                        u32 seq, int event, int nowait, unsigned int flags)
2419{
2420        struct rtable *rt = skb_rtable(skb);
2421        struct rtmsg *r;
2422        struct nlmsghdr *nlh;
2423        unsigned long expires = 0;
2424        u32 error;
2425        u32 metrics[RTAX_MAX];
2426
2427        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2428        if (!nlh)
2429                return -EMSGSIZE;
2430
2431        r = nlmsg_data(nlh);
2432        r->rtm_family    = AF_INET;
2433        r->rtm_dst_len  = 32;
2434        r->rtm_src_len  = 0;
2435        r->rtm_tos      = fl4->flowi4_tos;
2436        r->rtm_table    = table_id;
2437        if (nla_put_u32(skb, RTA_TABLE, table_id))
2438                goto nla_put_failure;
2439        r->rtm_type     = rt->rt_type;
2440        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2441        r->rtm_protocol = RTPROT_UNSPEC;
2442        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2443        if (rt->rt_flags & RTCF_NOTIFY)
2444                r->rtm_flags |= RTM_F_NOTIFY;
2445        if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2446                r->rtm_flags |= RTCF_DOREDIRECT;
2447
2448        if (nla_put_in_addr(skb, RTA_DST, dst))
2449                goto nla_put_failure;
2450        if (src) {
2451                r->rtm_src_len = 32;
2452                if (nla_put_in_addr(skb, RTA_SRC, src))
2453                        goto nla_put_failure;
2454        }
2455        if (rt->dst.dev &&
2456            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2457                goto nla_put_failure;
2458#ifdef CONFIG_IP_ROUTE_CLASSID
2459        if (rt->dst.tclassid &&
2460            nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2461                goto nla_put_failure;
2462#endif
2463        if (!rt_is_input_route(rt) &&
2464            fl4->saddr != src) {
2465                if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2466                        goto nla_put_failure;
2467        }
2468        if (rt->rt_uses_gateway &&
2469            nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2470                goto nla_put_failure;
2471
2472        expires = rt->dst.expires;
2473        if (expires) {
2474                unsigned long now = jiffies;
2475
2476                if (time_before(now, expires))
2477                        expires -= now;
2478                else
2479                        expires = 0;
2480        }
2481
2482        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2483        if (rt->rt_pmtu && expires)
2484                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2485        if (rtnetlink_put_metrics(skb, metrics) < 0)
2486                goto nla_put_failure;
2487
2488        if (fl4->flowi4_mark &&
2489            nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2490                goto nla_put_failure;
2491
2492        error = rt->dst.error;
2493
2494        if (rt_is_input_route(rt)) {
2495#ifdef CONFIG_IP_MROUTE
2496                if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2497                    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2498                        int err = ipmr_get_route(net, skb,
2499                                                 fl4->saddr, fl4->daddr,
2500                                                 r, nowait);
2501                        if (err <= 0) {
2502                                if (!nowait) {
2503                                        if (err == 0)
2504                                                return 0;
2505                                        goto nla_put_failure;
2506                                } else {
2507                                        if (err == -EMSGSIZE)
2508                                                goto nla_put_failure;
2509                                        error = err;
2510                                }
2511                        }
2512                } else
2513#endif
2514                        if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2515                                goto nla_put_failure;
2516        }
2517
2518        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2519                goto nla_put_failure;
2520
2521        nlmsg_end(skb, nlh);
2522        return 0;
2523
2524nla_put_failure:
2525        nlmsg_cancel(skb, nlh);
2526        return -EMSGSIZE;
2527}
2528
2529static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2530{
2531        struct net *net = sock_net(in_skb->sk);
2532        struct rtmsg *rtm;
2533        struct nlattr *tb[RTA_MAX+1];
2534        struct rtable *rt = NULL;
2535        struct flowi4 fl4;
2536        __be32 dst = 0;
2537        __be32 src = 0;
2538        u32 iif;
2539        int err;
2540        int mark;
2541        struct sk_buff *skb;
2542        u32 table_id = RT_TABLE_MAIN;
2543
2544        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2545        if (err < 0)
2546                goto errout;
2547
2548        rtm = nlmsg_data(nlh);
2549
2550        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2551        if (!skb) {
2552                err = -ENOBUFS;
2553                goto errout;
2554        }
2555
2556        /* Reserve room for dummy headers, this skb can pass
2557           through good chunk of routing engine.
2558         */
2559        skb_reset_mac_header(skb);
2560        skb_reset_network_header(skb);
2561
2562        /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2563        ip_hdr(skb)->protocol = IPPROTO_ICMP;
2564        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2565
2566        src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2567        dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2568        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2569        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2570
2571        memset(&fl4, 0, sizeof(fl4));
2572        fl4.daddr = dst;
2573        fl4.saddr = src;
2574        fl4.flowi4_tos = rtm->rtm_tos;
2575        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2576        fl4.flowi4_mark = mark;
2577
2578        if (netif_index_is_l3_master(net, fl4.flowi4_oif))
2579                fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF;
2580
2581        if (iif) {
2582                struct net_device *dev;
2583
2584                dev = __dev_get_by_index(net, iif);
2585                if (!dev) {
2586                        err = -ENODEV;
2587                        goto errout_free;
2588                }
2589
2590                skb->protocol   = htons(ETH_P_IP);
2591                skb->dev        = dev;
2592                skb->mark       = mark;
2593                local_bh_disable();
2594                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2595                local_bh_enable();
2596
2597                rt = skb_rtable(skb);
2598                if (err == 0 && rt->dst.error)
2599                        err = -rt->dst.error;
2600        } else {
2601                rt = ip_route_output_key(net, &fl4);
2602
2603                err = 0;
2604                if (IS_ERR(rt))
2605                        err = PTR_ERR(rt);
2606        }
2607
2608        if (err)
2609                goto errout_free;
2610
2611        skb_dst_set(skb, &rt->dst);
2612        if (rtm->rtm_flags & RTM_F_NOTIFY)
2613                rt->rt_flags |= RTCF_NOTIFY;
2614
2615        if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2616                table_id = rt->rt_table_id;
2617
2618        err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2619                           NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2620                           RTM_NEWROUTE, 0, 0);
2621        if (err < 0)
2622                goto errout_free;
2623
2624        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2625errout:
2626        return err;
2627
2628errout_free:
2629        kfree_skb(skb);
2630        goto errout;
2631}
2632
2633void ip_rt_multicast_event(struct in_device *in_dev)
2634{
2635        rt_cache_flush(dev_net(in_dev->dev));
2636}
2637
2638#ifdef CONFIG_SYSCTL
2639static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2640static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2641static int ip_rt_gc_elasticity __read_mostly    = 8;
2642
2643static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2644                                        void __user *buffer,
2645                                        size_t *lenp, loff_t *ppos)
2646{
2647        struct net *net = (struct net *)__ctl->extra1;
2648
2649        if (write) {
2650                rt_cache_flush(net);
2651                fnhe_genid_bump(net);
2652                return 0;
2653        }
2654
2655        return -EINVAL;
2656}
2657
2658static struct ctl_table ipv4_route_table[] = {
2659        {
2660                .procname       = "gc_thresh",
2661                .data           = &ipv4_dst_ops.gc_thresh,
2662                .maxlen         = sizeof(int),
2663                .mode           = 0644,
2664                .proc_handler   = proc_dointvec,
2665        },
2666        {
2667                .procname       = "max_size",
2668                .data           = &ip_rt_max_size,
2669                .maxlen         = sizeof(int),
2670                .mode           = 0644,
2671                .proc_handler   = proc_dointvec,
2672        },
2673        {
2674                /*  Deprecated. Use gc_min_interval_ms */
2675
2676                .procname       = "gc_min_interval",
2677                .data           = &ip_rt_gc_min_interval,
2678                .maxlen         = sizeof(int),
2679                .mode           = 0644,
2680                .proc_handler   = proc_dointvec_jiffies,
2681        },
2682        {
2683                .procname       = "gc_min_interval_ms",
2684                .data           = &ip_rt_gc_min_interval,
2685                .maxlen         = sizeof(int),
2686                .mode           = 0644,
2687                .proc_handler   = proc_dointvec_ms_jiffies,
2688        },
2689        {
2690                .procname       = "gc_timeout",
2691                .data           = &ip_rt_gc_timeout,
2692                .maxlen         = sizeof(int),
2693                .mode           = 0644,
2694                .proc_handler   = proc_dointvec_jiffies,
2695        },
2696        {
2697                .procname       = "gc_interval",
2698                .data           = &ip_rt_gc_interval,
2699                .maxlen         = sizeof(int),
2700                .mode           = 0644,
2701                .proc_handler   = proc_dointvec_jiffies,
2702        },
2703        {
2704                .procname       = "redirect_load",
2705                .data           = &ip_rt_redirect_load,
2706                .maxlen         = sizeof(int),
2707                .mode           = 0644,
2708                .proc_handler   = proc_dointvec,
2709        },
2710        {
2711                .procname       = "redirect_number",
2712                .data           = &ip_rt_redirect_number,
2713                .maxlen         = sizeof(int),
2714                .mode           = 0644,
2715                .proc_handler   = proc_dointvec,
2716        },
2717        {
2718                .procname       = "redirect_silence",
2719                .data           = &ip_rt_redirect_silence,
2720                .maxlen         = sizeof(int),
2721                .mode           = 0644,
2722                .proc_handler   = proc_dointvec,
2723        },
2724        {
2725                .procname       = "error_cost",
2726                .data           = &ip_rt_error_cost,
2727                .maxlen         = sizeof(int),
2728                .mode           = 0644,
2729                .proc_handler   = proc_dointvec,
2730        },
2731        {
2732                .procname       = "error_burst",
2733                .data           = &ip_rt_error_burst,
2734                .maxlen         = sizeof(int),
2735                .mode           = 0644,
2736                .proc_handler   = proc_dointvec,
2737        },
2738        {
2739                .procname       = "gc_elasticity",
2740                .data           = &ip_rt_gc_elasticity,
2741                .maxlen         = sizeof(int),
2742                .mode           = 0644,
2743                .proc_handler   = proc_dointvec,
2744        },
2745        {
2746                .procname       = "mtu_expires",
2747                .data           = &ip_rt_mtu_expires,
2748                .maxlen         = sizeof(int),
2749                .mode           = 0644,
2750                .proc_handler   = proc_dointvec_jiffies,
2751        },
2752        {
2753                .procname       = "min_pmtu",
2754                .data           = &ip_rt_min_pmtu,
2755                .maxlen         = sizeof(int),
2756                .mode           = 0644,
2757                .proc_handler   = proc_dointvec,
2758        },
2759        {
2760                .procname       = "min_adv_mss",
2761                .data           = &ip_rt_min_advmss,
2762                .maxlen         = sizeof(int),
2763                .mode           = 0644,
2764                .proc_handler   = proc_dointvec,
2765        },
2766        { }
2767};
2768
2769static struct ctl_table ipv4_route_flush_table[] = {
2770        {
2771                .procname       = "flush",
2772                .maxlen         = sizeof(int),
2773                .mode           = 0200,
2774                .proc_handler   = ipv4_sysctl_rtcache_flush,
2775        },
2776        { },
2777};
2778
2779static __net_init int sysctl_route_net_init(struct net *net)
2780{
2781        struct ctl_table *tbl;
2782
2783        tbl = ipv4_route_flush_table;
2784        if (!net_eq(net, &init_net)) {
2785                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2786                if (!tbl)
2787                        goto err_dup;
2788
2789                /* Don't export sysctls to unprivileged users */
2790                if (net->user_ns != &init_user_ns)
2791                        tbl[0].procname = NULL;
2792        }
2793        tbl[0].extra1 = net;
2794
2795        net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2796        if (!net->ipv4.route_hdr)
2797                goto err_reg;
2798        return 0;
2799
2800err_reg:
2801        if (tbl != ipv4_route_flush_table)
2802                kfree(tbl);
2803err_dup:
2804        return -ENOMEM;
2805}
2806
2807static __net_exit void sysctl_route_net_exit(struct net *net)
2808{
2809        struct ctl_table *tbl;
2810
2811        tbl = net->ipv4.route_hdr->ctl_table_arg;
2812        unregister_net_sysctl_table(net->ipv4.route_hdr);
2813        BUG_ON(tbl == ipv4_route_flush_table);
2814        kfree(tbl);
2815}
2816
2817static __net_initdata struct pernet_operations sysctl_route_ops = {
2818        .init = sysctl_route_net_init,
2819        .exit = sysctl_route_net_exit,
2820};
2821#endif
2822
2823static __net_init int rt_genid_init(struct net *net)
2824{
2825        atomic_set(&net->ipv4.rt_genid, 0);
2826        atomic_set(&net->fnhe_genid, 0);
2827        get_random_bytes(&net->ipv4.dev_addr_genid,
2828                         sizeof(net->ipv4.dev_addr_genid));
2829        return 0;
2830}
2831
2832static __net_initdata struct pernet_operations rt_genid_ops = {
2833        .init = rt_genid_init,
2834};
2835
2836static int __net_init ipv4_inetpeer_init(struct net *net)
2837{
2838        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2839
2840        if (!bp)
2841                return -ENOMEM;
2842        inet_peer_base_init(bp);
2843        net->ipv4.peers = bp;
2844        return 0;
2845}
2846
2847static void __net_exit ipv4_inetpeer_exit(struct net *net)
2848{
2849        struct inet_peer_base *bp = net->ipv4.peers;
2850
2851        net->ipv4.peers = NULL;
2852        inetpeer_invalidate_tree(bp);
2853        kfree(bp);
2854}
2855
2856static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2857        .init   =       ipv4_inetpeer_init,
2858        .exit   =       ipv4_inetpeer_exit,
2859};
2860
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Per-CPU accounting area; ip_rt_init() allocates room for 256
 * struct ip_rt_acct entries per CPU.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2864
2865int __init ip_rt_init(void)
2866{
2867        int rc = 0;
2868        int cpu;
2869
2870        ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2871        if (!ip_idents)
2872                panic("IP: failed to allocate ip_idents\n");
2873
2874        prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2875
2876        ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
2877        if (!ip_tstamps)
2878                panic("IP: failed to allocate ip_tstamps\n");
2879
2880        for_each_possible_cpu(cpu) {
2881                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2882
2883                INIT_LIST_HEAD(&ul->head);
2884                spin_lock_init(&ul->lock);
2885        }
2886#ifdef CONFIG_IP_ROUTE_CLASSID
2887        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2888        if (!ip_rt_acct)
2889                panic("IP: failed to allocate ip_rt_acct\n");
2890#endif
2891
2892        ipv4_dst_ops.kmem_cachep =
2893                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2894                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2895
2896        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2897
2898        if (dst_entries_init(&ipv4_dst_ops) < 0)
2899                panic("IP: failed to allocate ipv4_dst_ops counter\n");
2900
2901        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2902                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2903
2904        ipv4_dst_ops.gc_thresh = ~0;
2905        ip_rt_max_size = INT_MAX;
2906
2907        devinet_init();
2908        ip_fib_init();
2909
2910        if (ip_rt_proc_init())
2911                pr_err("Unable to create route proc files\n");
2912#ifdef CONFIG_XFRM
2913        xfrm_init();
2914        xfrm4_init();
2915#endif
2916        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2917
2918#ifdef CONFIG_SYSCTL
2919        register_pernet_subsys(&sysctl_route_ops);
2920#endif
2921        register_pernet_subsys(&rt_genid_ops);
2922        register_pernet_subsys(&ipv4_inetpeer_ops);
2923        return rc;
2924}
2925
#ifdef CONFIG_SYSCTL
/*
 * ip_static_sysctl_init - register the global route tunables
 *
 * Registers ipv4_route_table under net/ipv4/route for init_net.
 * This separate entry point exists only because of the current IPv4
 * initialisation order; once that order is sanitized it can go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif
2936