linux/net/ipv4/route.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4 *              operating system.  INET is implemented using the  BSD Socket
   5 *              interface as the means of communication with the user level.
   6 *
   7 *              ROUTE - implementation of the IP router.
   8 *
   9 * Authors:     Ross Biro
  10 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  11 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  12 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  13 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       Verify area fixes.
  17 *              Alan Cox        :       cli() protects routing changes
  18 *              Rui Oliveira    :       ICMP routing table updates
  19 *              (rco@di.uminho.pt)      Routing table insertion and update
  20 *              Linus Torvalds  :       Rewrote bits to be sensible
  21 *              Alan Cox        :       Added BSD route gw semantics
  22 *              Alan Cox        :       Super /proc >4K
  23 *              Alan Cox        :       MTU in route table
  24 *              Alan Cox        :       MSS actually. Also added the window
  25 *                                      clamper.
  26 *              Sam Lantinga    :       Fixed route matching in rt_del()
  27 *              Alan Cox        :       Routing cache support.
  28 *              Alan Cox        :       Removed compatibility cruft.
  29 *              Alan Cox        :       RTF_REJECT support.
  30 *              Alan Cox        :       TCP irtt support.
  31 *              Jonathan Naylor :       Added Metric support.
  32 *      Miquel van Smoorenburg  :       BSD API fixes.
  33 *      Miquel van Smoorenburg  :       Metrics.
  34 *              Alan Cox        :       Use __u32 properly
  35 *              Alan Cox        :       Aligned routing errors more closely with BSD;
  36 *                                      our system is still very different.
  37 *              Alan Cox        :       Faster /proc handling
  38 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  39 *                                      routing caches and better behaviour.
  40 *
  41 *              Olaf Erb        :       irtt wasn't being copied right.
  42 *              Bjorn Ekwall    :       Kerneld route support.
  43 *              Alan Cox        :       Multicast fixed (I hope)
  44 *              Pavel Krauz     :       Limited broadcast fixed
  45 *              Mike McLagan    :       Routing by source
  46 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  47 *                                      route.c and rewritten from scratch.
  48 *              Andi Kleen      :       Load-limit warning messages.
  49 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  50 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  51 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  52 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  53 *              Marc Boucher    :       routing by fwmark
  54 *      Robert Olsson           :       Added rt_cache statistics
  55 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  56 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  57 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  58 *      Ilia Sotnikov           :       Removed TOS from hash calculations
  59 */
  60
  61#define pr_fmt(fmt) "IPv4: " fmt
  62
  63#include <linux/module.h>
  64#include <linux/uaccess.h>
  65#include <linux/bitops.h>
  66#include <linux/types.h>
  67#include <linux/kernel.h>
  68#include <linux/mm.h>
  69#include <linux/string.h>
  70#include <linux/socket.h>
  71#include <linux/sockios.h>
  72#include <linux/errno.h>
  73#include <linux/in.h>
  74#include <linux/inet.h>
  75#include <linux/netdevice.h>
  76#include <linux/proc_fs.h>
  77#include <linux/init.h>
  78#include <linux/skbuff.h>
  79#include <linux/inetdevice.h>
  80#include <linux/igmp.h>
  81#include <linux/pkt_sched.h>
  82#include <linux/mroute.h>
  83#include <linux/netfilter_ipv4.h>
  84#include <linux/random.h>
  85#include <linux/rcupdate.h>
  86#include <linux/times.h>
  87#include <linux/slab.h>
  88#include <linux/jhash.h>
  89#include <net/dst.h>
  90#include <net/dst_metadata.h>
  91#include <net/net_namespace.h>
  92#include <net/protocol.h>
  93#include <net/ip.h>
  94#include <net/route.h>
  95#include <net/inetpeer.h>
  96#include <net/sock.h>
  97#include <net/ip_fib.h>
  98#include <net/arp.h>
  99#include <net/tcp.h>
 100#include <net/icmp.h>
 101#include <net/xfrm.h>
 102#include <net/lwtunnel.h>
 103#include <net/netevent.h>
 104#include <net/rtnetlink.h>
 105#ifdef CONFIG_SYSCTL
 106#include <linux/sysctl.h>
 107#endif
 108#include <net/secure_seq.h>
 109#include <net/ip_tunnels.h>
 110#include <net/l3mdev.h>
 111
 112#include "fib_lookup.h"
 113
 114#define RT_FL_TOS(oldflp4) \
 115        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 116
 117#define RT_GC_TIMEOUT (300*HZ)
 118
 119static int ip_rt_max_size;
 120static int ip_rt_redirect_number __read_mostly  = 9;
 121static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 122static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 123static int ip_rt_error_cost __read_mostly       = HZ;
 124static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 125static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 126static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 127static int ip_rt_min_advmss __read_mostly       = 256;
 128
 129static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
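/* The tunables above are boot-time defaults only; most of them are
 * exposed as sysctls (under /proc/sys/net/ipv4/route/, e.g. mtu_expires,
 * min_pmtu, error_cost) via a ctl_table defined later in this file,
 * outside this excerpt.
 */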
 130
 131/*
 132 *      Interface to generic destination cache.
 133 */
 134
 135static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 136static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 137static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 138static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 139static void              ipv4_link_failure(struct sk_buff *skb);
 140static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 141                                           struct sk_buff *skb, u32 mtu);
 142static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 143                                        struct sk_buff *skb);
 144static void             ipv4_dst_destroy(struct dst_entry *dst);
 145
 146static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 147{
 148        WARN_ON(1);
 149        return NULL;
 150}
 151
 152static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 153                                           struct sk_buff *skb,
 154                                           const void *daddr);
 155static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 156
 157static struct dst_ops ipv4_dst_ops = {
 158        .family =               AF_INET,
 159        .check =                ipv4_dst_check,
 160        .default_advmss =       ipv4_default_advmss,
 161        .mtu =                  ipv4_mtu,
 162        .cow_metrics =          ipv4_cow_metrics,
 163        .destroy =              ipv4_dst_destroy,
 164        .negative_advice =      ipv4_negative_advice,
 165        .link_failure =         ipv4_link_failure,
 166        .update_pmtu =          ip_rt_update_pmtu,
 167        .redirect =             ip_do_redirect,
 168        .local_out =            __ip_local_out,
 169        .neigh_lookup =         ipv4_neigh_lookup,
 170        .confirm_neigh =        ipv4_confirm_neigh,
 171};
 172
 173#define ECN_OR_COST(class)      TC_PRIO_##class
 174
 175const __u8 ip_tos2prio[16] = {
 176        TC_PRIO_BESTEFFORT,
 177        ECN_OR_COST(BESTEFFORT),
 178        TC_PRIO_BESTEFFORT,
 179        ECN_OR_COST(BESTEFFORT),
 180        TC_PRIO_BULK,
 181        ECN_OR_COST(BULK),
 182        TC_PRIO_BULK,
 183        ECN_OR_COST(BULK),
 184        TC_PRIO_INTERACTIVE,
 185        ECN_OR_COST(INTERACTIVE),
 186        TC_PRIO_INTERACTIVE,
 187        ECN_OR_COST(INTERACTIVE),
 188        TC_PRIO_INTERACTIVE_BULK,
 189        ECN_OR_COST(INTERACTIVE_BULK),
 190        TC_PRIO_INTERACTIVE_BULK,
 191        ECN_OR_COST(INTERACTIVE_BULK)
 192};
 193EXPORT_SYMBOL(ip_tos2prio);
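/* ip_tos2prio maps the four TOS bits of the IP header to a TC_PRIO_*
 * band; callers index it as ip_tos2prio[IPTOS_TOS(tos) >> 1] (see
 * rt_tos2priority() in <net/route.h>).  ECN_OR_COST() now expands to
 * the same class, so the low bit of each pair no longer changes the
 * resulting priority.
 */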
 194
 195static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 196#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 197
 198#ifdef CONFIG_PROC_FS
 199static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 200{
 201        if (*pos)
 202                return NULL;
 203        return SEQ_START_TOKEN;
 204}
 205
 206static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 207{
 208        ++*pos;
 209        return NULL;
 210}
 211
 212static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 213{
 214}
 215
 216static int rt_cache_seq_show(struct seq_file *seq, void *v)
 217{
 218        if (v == SEQ_START_TOKEN)
 219                seq_printf(seq, "%-127s\n",
 220                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 221                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 222                           "HHUptod\tSpecDst");
 223        return 0;
 224}
 225
 226static const struct seq_operations rt_cache_seq_ops = {
 227        .start  = rt_cache_seq_start,
 228        .next   = rt_cache_seq_next,
 229        .stop   = rt_cache_seq_stop,
 230        .show   = rt_cache_seq_show,
 231};
 232
 233static int rt_cache_seq_open(struct inode *inode, struct file *file)
 234{
 235        return seq_open(file, &rt_cache_seq_ops);
 236}
 237
 238static const struct file_operations rt_cache_seq_fops = {
 239        .open    = rt_cache_seq_open,
 240        .read    = seq_read,
 241        .llseek  = seq_lseek,
 242        .release = seq_release,
 243};
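/* Note that /proc/net/rt_cache is header-only these days: the IPv4
 * routing cache itself was removed in 3.6, and the file is kept so
 * that existing tools which open it do not break.
 */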
 244
 245
 246static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 247{
 248        int cpu;
 249
 250        if (*pos == 0)
 251                return SEQ_START_TOKEN;
 252
 253        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 254                if (!cpu_possible(cpu))
 255                        continue;
 256                *pos = cpu+1;
 257                return &per_cpu(rt_cache_stat, cpu);
 258        }
 259        return NULL;
 260}
 261
 262static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 263{
 264        int cpu;
 265
 266        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 267                if (!cpu_possible(cpu))
 268                        continue;
 269                *pos = cpu+1;
 270                return &per_cpu(rt_cache_stat, cpu);
 271        }
 272        return NULL;
 273
 274}
 275
 276static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 277{
 278
 279}
 280
 281static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 282{
 283        struct rt_cache_stat *st = v;
 284
 285        if (v == SEQ_START_TOKEN) {
 286                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 287                return 0;
 288        }
 289
 290        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 291                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 292                   dst_entries_get_slow(&ipv4_dst_ops),
 293                   0, /* st->in_hit */
 294                   st->in_slow_tot,
 295                   st->in_slow_mc,
 296                   st->in_no_route,
 297                   st->in_brd,
 298                   st->in_martian_dst,
 299                   st->in_martian_src,
 300
 301                   0, /* st->out_hit */
 302                   st->out_slow_tot,
 303                   st->out_slow_mc,
 304
 305                   0, /* st->gc_total */
 306                   0, /* st->gc_ignored */
 307                   0, /* st->gc_goal_miss */
 308                   0, /* st->gc_dst_overflow */
 309                   0, /* st->in_hlist_search */
 310                   0  /* st->out_hlist_search */
 311                );
 312        return 0;
 313}
 314
 315static const struct seq_operations rt_cpu_seq_ops = {
 316        .start  = rt_cpu_seq_start,
 317        .next   = rt_cpu_seq_next,
 318        .stop   = rt_cpu_seq_stop,
 319        .show   = rt_cpu_seq_show,
 320};
 321
 322
 323static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 324{
 325        return seq_open(file, &rt_cpu_seq_ops);
 326}
 327
 328static const struct file_operations rt_cpu_seq_fops = {
 329        .open    = rt_cpu_seq_open,
 330        .read    = seq_read,
 331        .llseek  = seq_lseek,
 332        .release = seq_release,
 333};
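/* /proc/net/stat/rt_cache emits one line of counters per possible CPU.
 * The hard-coded zeroes above stand in for counters that went away with
 * the route cache; the column layout is kept for consumers that parse
 * it positionally (e.g. lnstat(8)).
 */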
 334
 335#ifdef CONFIG_IP_ROUTE_CLASSID
 336static int rt_acct_proc_show(struct seq_file *m, void *v)
 337{
 338        struct ip_rt_acct *dst, *src;
 339        unsigned int i, j;
 340
 341        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 342        if (!dst)
 343                return -ENOMEM;
 344
 345        for_each_possible_cpu(i) {
 346                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 347                for (j = 0; j < 256; j++) {
 348                        dst[j].o_bytes   += src[j].o_bytes;
 349                        dst[j].o_packets += src[j].o_packets;
 350                        dst[j].i_bytes   += src[j].i_bytes;
 351                        dst[j].i_packets += src[j].i_packets;
 352                }
 353        }
 354
 355        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 356        kfree(dst);
 357        return 0;
 358}
 359#endif
 360
 361static int __net_init ip_rt_do_proc_init(struct net *net)
 362{
 363        struct proc_dir_entry *pde;
 364
 365        pde = proc_create("rt_cache", 0444, net->proc_net,
 366                          &rt_cache_seq_fops);
 367        if (!pde)
 368                goto err1;
 369
 370        pde = proc_create("rt_cache", 0444,
 371                          net->proc_net_stat, &rt_cpu_seq_fops);
 372        if (!pde)
 373                goto err2;
 374
 375#ifdef CONFIG_IP_ROUTE_CLASSID
 376        pde = proc_create_single("rt_acct", 0, net->proc_net,
 377                        rt_acct_proc_show);
 378        if (!pde)
 379                goto err3;
 380#endif
 381        return 0;
 382
 383#ifdef CONFIG_IP_ROUTE_CLASSID
 384err3:
 385        remove_proc_entry("rt_cache", net->proc_net_stat);
 386#endif
 387err2:
 388        remove_proc_entry("rt_cache", net->proc_net);
 389err1:
 390        return -ENOMEM;
 391}
 392
 393static void __net_exit ip_rt_do_proc_exit(struct net *net)
 394{
 395        remove_proc_entry("rt_cache", net->proc_net_stat);
 396        remove_proc_entry("rt_cache", net->proc_net);
 397#ifdef CONFIG_IP_ROUTE_CLASSID
 398        remove_proc_entry("rt_acct", net->proc_net);
 399#endif
 400}
 401
 402static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 403        .init = ip_rt_do_proc_init,
 404        .exit = ip_rt_do_proc_exit,
 405};
 406
 407static int __init ip_rt_proc_init(void)
 408{
 409        return register_pernet_subsys(&ip_rt_proc_ops);
 410}
 411
 412#else
 413static inline int ip_rt_proc_init(void)
 414{
 415        return 0;
 416}
 417#endif /* CONFIG_PROC_FS */
 418
 419static inline bool rt_is_expired(const struct rtable *rth)
 420{
 421        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 422}
 423
 424void rt_cache_flush(struct net *net)
 425{
 426        rt_genid_bump_ipv4(net);
 427}
 428
 429static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 430                                           struct sk_buff *skb,
 431                                           const void *daddr)
 432{
 433        const struct rtable *rt = container_of(dst, struct rtable, dst);
 434        struct net_device *dev = dst->dev;
 435        struct neighbour *n;
 436
 437        rcu_read_lock_bh();
 438
 439        if (likely(rt->rt_gw_family == AF_INET)) {
 440                n = ip_neigh_gw4(dev, rt->rt_gw4);
 441        } else if (rt->rt_gw_family == AF_INET6) {
 442                n = ip_neigh_gw6(dev, &rt->rt_gw6);
 443        } else {
 444                __be32 pkey;
 445
 446                pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
 447                n = ip_neigh_gw4(dev, pkey);
 448        }
 449
 450        if (n && !refcount_inc_not_zero(&n->refcnt))
 451                n = NULL;
 452
 453        rcu_read_unlock_bh();
 454
 455        return n;
 456}
 457
 458static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 459{
 460        const struct rtable *rt = container_of(dst, struct rtable, dst);
 461        struct net_device *dev = dst->dev;
 462        const __be32 *pkey = daddr;
 463
 464        if (rt->rt_gw_family == AF_INET) {
 465                pkey = (const __be32 *)&rt->rt_gw4;
 466        } else if (rt->rt_gw_family == AF_INET6) {
 467                return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
 468        } else if (!daddr ||
 469                 (rt->rt_flags &
 470                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
 471                return;
 472        }
 473        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 474}
 475
 476#define IP_IDENTS_SZ 2048u
 477
 478static atomic_t *ip_idents __read_mostly;
 479static u32 *ip_tstamps __read_mostly;
 480
 481/* In order to protect privacy, we add a perturbation to identifiers
 482 * if one generator is seldom used. This makes it hard for an attacker
 483 * to infer how many packets were sent between two points in time.
 484 */
 485u32 ip_idents_reserve(u32 hash, int segs)
 486{
 487        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 488        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 489        u32 old = READ_ONCE(*p_tstamp);
 490        u32 now = (u32)jiffies;
 491        u32 new, delta = 0;
 492
 493        if (old != now && cmpxchg(p_tstamp, old, now) == old)
 494                delta = prandom_u32_max(now - old);
 495
 496        /* Do not use atomic_add_return() as it makes UBSAN unhappy */
 497        do {
 498                old = (u32)atomic_read(p_id);
 499                new = old + delta + segs;
 500        } while (atomic_cmpxchg(p_id, old, new) != old);
 501
 502        return new - segs;
 503}
 504EXPORT_SYMBOL(ip_idents_reserve);
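/* ip_idents_reserve() reserves 'segs' consecutive IP IDs from one of
 * IP_IDENTS_SZ shared generators and returns the first one, so a
 * caller emitting three segments uses id, id + 1 and id + 2.  The
 * random 'delta', applied only when the generator sat idle for at
 * least one jiffy, keeps an off-path observer sampling the ID sequence
 * from counting how many packets were sent in between.
 */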
 505
 506void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 507{
 508        u32 hash, id;
 509
 510        /* Note: this lazy key initialization is racy, but that is harmless here. */
 511        if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
 512                get_random_bytes(&net->ipv4.ip_id_key,
 513                                 sizeof(net->ipv4.ip_id_key));
 514
 515        hash = siphash_3u32((__force u32)iph->daddr,
 516                            (__force u32)iph->saddr,
 517                            iph->protocol,
 518                            &net->ipv4.ip_id_key);
 519        id = ip_idents_reserve(hash, segs);
 520        iph->id = htons(id);
 521}
 522EXPORT_SYMBOL(__ip_select_ident);
 523
 524static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 525                             const struct sock *sk,
 526                             const struct iphdr *iph,
 527                             int oif, u8 tos,
 528                             u8 prot, u32 mark, int flow_flags)
 529{
 530        if (sk) {
 531                const struct inet_sock *inet = inet_sk(sk);
 532
 533                oif = sk->sk_bound_dev_if;
 534                mark = sk->sk_mark;
 535                tos = RT_CONN_FLAGS(sk);
 536                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 537        }
 538        flowi4_init_output(fl4, oif, mark, tos,
 539                           RT_SCOPE_UNIVERSE, prot,
 540                           flow_flags,
 541                           iph->daddr, iph->saddr, 0, 0,
 542                           sock_net_uid(net, sk));
 543}
 544
 545static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 546                               const struct sock *sk)
 547{
 548        const struct net *net = dev_net(skb->dev);
 549        const struct iphdr *iph = ip_hdr(skb);
 550        int oif = skb->dev->ifindex;
 551        u8 tos = RT_TOS(iph->tos);
 552        u8 prot = iph->protocol;
 553        u32 mark = skb->mark;
 554
 555        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 556}
 557
 558static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 559{
 560        const struct inet_sock *inet = inet_sk(sk);
 561        const struct ip_options_rcu *inet_opt;
 562        __be32 daddr = inet->inet_daddr;
 563
 564        rcu_read_lock();
 565        inet_opt = rcu_dereference(inet->inet_opt);
 566        if (inet_opt && inet_opt->opt.srr)
 567                daddr = inet_opt->opt.faddr;
 568        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 569                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 570                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 571                           inet_sk_flowi_flags(sk),
 572                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 573        rcu_read_unlock();
 574}
 575
 576static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 577                                 const struct sk_buff *skb)
 578{
 579        if (skb)
 580                build_skb_flow_key(fl4, skb, sk);
 581        else
 582                build_sk_flow_key(fl4, sk);
 583}
 584
 585static DEFINE_SPINLOCK(fnhe_lock);
 586
 587static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 588{
 589        struct rtable *rt;
 590
 591        rt = rcu_dereference(fnhe->fnhe_rth_input);
 592        if (rt) {
 593                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 594                dst_dev_put(&rt->dst);
 595                dst_release(&rt->dst);
 596        }
 597        rt = rcu_dereference(fnhe->fnhe_rth_output);
 598        if (rt) {
 599                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 600                dst_dev_put(&rt->dst);
 601                dst_release(&rt->dst);
 602        }
 603}
 604
 605static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 606{
 607        struct fib_nh_exception *fnhe, *oldest;
 608
 609        oldest = rcu_dereference(hash->chain);
 610        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 611             fnhe = rcu_dereference(fnhe->fnhe_next)) {
 612                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 613                        oldest = fnhe;
 614        }
 615        fnhe_flush_routes(oldest);
 616        return oldest;
 617}
 618
 619static inline u32 fnhe_hashfun(__be32 daddr)
 620{
 621        static u32 fnhe_hashrnd __read_mostly;
 622        u32 hval;
 623
 624        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 625        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 626        return hash_32(hval, FNHE_HASH_SHIFT);
 627}
 628
 629static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 630{
 631        rt->rt_pmtu = fnhe->fnhe_pmtu;
 632        rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 633        rt->dst.expires = fnhe->fnhe_expires;
 634
 635        if (fnhe->fnhe_gw) {
 636                rt->rt_flags |= RTCF_REDIRECTED;
 637                rt->rt_gw_family = AF_INET;
 638                rt->rt_gw4 = fnhe->fnhe_gw;
 639        }
 640}
 641
 642static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
 643                                  __be32 gw, u32 pmtu, bool lock,
 644                                  unsigned long expires)
 645{
 646        struct fnhe_hash_bucket *hash;
 647        struct fib_nh_exception *fnhe;
 648        struct rtable *rt;
 649        u32 genid, hval;
 650        unsigned int i;
 651        int depth;
 652
 653        genid = fnhe_genid(dev_net(nhc->nhc_dev));
 654        hval = fnhe_hashfun(daddr);
 655
 656        spin_lock_bh(&fnhe_lock);
 657
 658        hash = rcu_dereference(nhc->nhc_exceptions);
 659        if (!hash) {
 660                hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
 661                if (!hash)
 662                        goto out_unlock;
 663                rcu_assign_pointer(nhc->nhc_exceptions, hash);
 664        }
 665
 666        hash += hval;
 667
 668        depth = 0;
 669        for (fnhe = rcu_dereference(hash->chain); fnhe;
 670             fnhe = rcu_dereference(fnhe->fnhe_next)) {
 671                if (fnhe->fnhe_daddr == daddr)
 672                        break;
 673                depth++;
 674        }
 675
 676        if (fnhe) {
 677                if (fnhe->fnhe_genid != genid)
 678                        fnhe->fnhe_genid = genid;
 679                if (gw)
 680                        fnhe->fnhe_gw = gw;
 681                if (pmtu) {
 682                        fnhe->fnhe_pmtu = pmtu;
 683                        fnhe->fnhe_mtu_locked = lock;
 684                }
 685                fnhe->fnhe_expires = max(1UL, expires);
 686                /* Update all cached dsts too */
 687                rt = rcu_dereference(fnhe->fnhe_rth_input);
 688                if (rt)
 689                        fill_route_from_fnhe(rt, fnhe);
 690                rt = rcu_dereference(fnhe->fnhe_rth_output);
 691                if (rt)
 692                        fill_route_from_fnhe(rt, fnhe);
 693        } else {
 694                if (depth > FNHE_RECLAIM_DEPTH)
 695                        fnhe = fnhe_oldest(hash);
 696                else {
 697                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 698                        if (!fnhe)
 699                                goto out_unlock;
 700
 701                        fnhe->fnhe_next = hash->chain;
 702                        rcu_assign_pointer(hash->chain, fnhe);
 703                }
 704                fnhe->fnhe_genid = genid;
 705                fnhe->fnhe_daddr = daddr;
 706                fnhe->fnhe_gw = gw;
 707                fnhe->fnhe_pmtu = pmtu;
 708                fnhe->fnhe_mtu_locked = lock;
 709                fnhe->fnhe_expires = max(1UL, expires);
 710
 711                /* Exception created; mark the cached routes for the nexthop
 712                 * stale, so anyone caching them rechecks whether this
 713                 * exception applies.
 714                 */
 715                rt = rcu_dereference(nhc->nhc_rth_input);
 716                if (rt)
 717                        rt->dst.obsolete = DST_OBSOLETE_KILL;
 718
 719                for_each_possible_cpu(i) {
 720                        struct rtable __rcu **prt;
 721                        prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
 722                        rt = rcu_dereference(*prt);
 723                        if (rt)
 724                                rt->dst.obsolete = DST_OBSOLETE_KILL;
 725                }
 726        }
 727
 728        fnhe->fnhe_stamp = jiffies;
 729
 730out_unlock:
 731        spin_unlock_bh(&fnhe_lock);
 732}
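/* update_or_create_fnhe() is the writer side of the nexthop exception
 * cache: with the global route cache gone, learned per-destination
 * state (redirect gateways, path MTU) lives in these small per-nexthop
 * hash tables.  Chains longer than FNHE_RECLAIM_DEPTH recycle their
 * oldest entry instead of growing without bound.
 */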
 733
 734static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 735                             bool kill_route)
 736{
 737        __be32 new_gw = icmp_hdr(skb)->un.gateway;
 738        __be32 old_gw = ip_hdr(skb)->saddr;
 739        struct net_device *dev = skb->dev;
 740        struct in_device *in_dev;
 741        struct fib_result res;
 742        struct neighbour *n;
 743        struct net *net;
 744
 745        switch (icmp_hdr(skb)->code & 7) {
 746        case ICMP_REDIR_NET:
 747        case ICMP_REDIR_NETTOS:
 748        case ICMP_REDIR_HOST:
 749        case ICMP_REDIR_HOSTTOS:
 750                break;
 751
 752        default:
 753                return;
 754        }
 755
 756        if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
 757                return;
 758
 759        in_dev = __in_dev_get_rcu(dev);
 760        if (!in_dev)
 761                return;
 762
 763        net = dev_net(dev);
 764        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 765            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 766            ipv4_is_zeronet(new_gw))
 767                goto reject_redirect;
 768
 769        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 770                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 771                        goto reject_redirect;
 772                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 773                        goto reject_redirect;
 774        } else {
 775                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 776                        goto reject_redirect;
 777        }
 778
 779        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 780        if (!n)
 781                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 782        if (!IS_ERR(n)) {
 783                if (!(n->nud_state & NUD_VALID)) {
 784                        neigh_event_send(n, NULL);
 785                } else {
 786                        if (fib_lookup(net, fl4, &res, 0) == 0) {
 787                                struct fib_nh_common *nhc = FIB_RES_NHC(res);
 788
 789                                update_or_create_fnhe(nhc, fl4->daddr, new_gw,
 790                                                0, false,
 791                                                jiffies + ip_rt_gc_timeout);
 792                        }
 793                        if (kill_route)
 794                                rt->dst.obsolete = DST_OBSOLETE_KILL;
 795                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 796                }
 797                neigh_release(n);
 798        }
 799        return;
 800
 801reject_redirect:
 802#ifdef CONFIG_IP_ROUTE_VERBOSE
 803        if (IN_DEV_LOG_MARTIANS(in_dev)) {
 804                const struct iphdr *iph = (const struct iphdr *) skb->data;
 805                __be32 daddr = iph->daddr;
 806                __be32 saddr = iph->saddr;
 807
 808                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 809                                     "  Advised path = %pI4 -> %pI4\n",
 810                                     &old_gw, dev->name, &new_gw,
 811                                     &saddr, &daddr);
 812        }
 813#endif
 814        ;
 815}
 816
 817static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 818{
 819        struct rtable *rt;
 820        struct flowi4 fl4;
 821        const struct iphdr *iph = (const struct iphdr *) skb->data;
 822        struct net *net = dev_net(skb->dev);
 823        int oif = skb->dev->ifindex;
 824        u8 tos = RT_TOS(iph->tos);
 825        u8 prot = iph->protocol;
 826        u32 mark = skb->mark;
 827
 828        rt = (struct rtable *) dst;
 829
 830        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 831        __ip_do_redirect(rt, skb, &fl4, true);
 832}
 833
 834static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 835{
 836        struct rtable *rt = (struct rtable *)dst;
 837        struct dst_entry *ret = dst;
 838
 839        if (rt) {
 840                if (dst->obsolete > 0) {
 841                        ip_rt_put(rt);
 842                        ret = NULL;
 843                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 844                           rt->dst.expires) {
 845                        ip_rt_put(rt);
 846                        ret = NULL;
 847                }
 848        }
 849        return ret;
 850}
 851
 852/*
 853 * Algorithm:
 854 *      1. The first ip_rt_redirect_number redirects are sent
 855 *         with exponential backoff, then we stop sending them altogether,
 856 *         assuming that the host ignores our redirects.
 857 *      2. If we did not see packets requiring redirects
 858 *         during ip_rt_redirect_silence, we assume that the host
 859 *         forgot the redirected route and start sending redirects again.
 860 *
 861 * This algorithm is much cheaper and more intelligent than dumb load limiting
 862 * in icmp.c.
 863 *
 864 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 865 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 866 */
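/* With the defaults above, ip_rt_redirect_load is HZ/50 (20ms), so
 * successive redirects to one peer back off as (HZ/50) << rate_tokens,
 * i.e. 40ms, 80ms, ... about 5.1s before the ninth.  After
 * ip_rt_redirect_number (9) ignored redirects we go quiet until
 * ip_rt_redirect_silence ((HZ/50) << 10, about 20.5s) passes without a
 * triggering packet.
 */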
 867
 868void ip_rt_send_redirect(struct sk_buff *skb)
 869{
 870        struct rtable *rt = skb_rtable(skb);
 871        struct in_device *in_dev;
 872        struct inet_peer *peer;
 873        struct net *net;
 874        int log_martians;
 875        int vif;
 876
 877        rcu_read_lock();
 878        in_dev = __in_dev_get_rcu(rt->dst.dev);
 879        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 880                rcu_read_unlock();
 881                return;
 882        }
 883        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 884        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 885        rcu_read_unlock();
 886
 887        net = dev_net(rt->dst.dev);
 888        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 889        if (!peer) {
 890                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 891                          rt_nexthop(rt, ip_hdr(skb)->daddr));
 892                return;
 893        }
 894
 895        /* No redirected packets during ip_rt_redirect_silence;
 896         * reset the algorithm.
 897         */
 898        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
 899                peer->rate_tokens = 0;
 900                peer->n_redirects = 0;
 901        }
 902
 903        /* Too many ignored redirects; do not send anything.
 904         * Set peer->rate_last to the time of the last seen redirected packet.
 905         */
 906        if (peer->n_redirects >= ip_rt_redirect_number) {
 907                peer->rate_last = jiffies;
 908                goto out_put_peer;
 909        }
 910
 911        /* Check for load limit; set rate_last to the latest sent
 912         * redirect.
 913         */
 914        if (peer->rate_tokens == 0 ||
 915            time_after(jiffies,
 916                       (peer->rate_last +
 917                        (ip_rt_redirect_load << peer->rate_tokens)))) {
 918                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 919
 920                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 921                peer->rate_last = jiffies;
 922                ++peer->rate_tokens;
 923                ++peer->n_redirects;
 924#ifdef CONFIG_IP_ROUTE_VERBOSE
 925                if (log_martians &&
 926                    peer->rate_tokens == ip_rt_redirect_number)
 927                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 928                                             &ip_hdr(skb)->saddr, inet_iif(skb),
 929                                             &ip_hdr(skb)->daddr, &gw);
 930#endif
 931        }
 932out_put_peer:
 933        inet_putpeer(peer);
 934}
 935
 936static int ip_error(struct sk_buff *skb)
 937{
 938        struct rtable *rt = skb_rtable(skb);
 939        struct net_device *dev = skb->dev;
 940        struct in_device *in_dev;
 941        struct inet_peer *peer;
 942        unsigned long now;
 943        struct net *net;
 944        bool send;
 945        int code;
 946
 947        if (netif_is_l3_master(skb->dev)) {
 948                dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
 949                if (!dev)
 950                        goto out;
 951        }
 952
 953        in_dev = __in_dev_get_rcu(dev);
 954
 955        /* IP on this device is disabled. */
 956        if (!in_dev)
 957                goto out;
 958
 959        net = dev_net(rt->dst.dev);
 960        if (!IN_DEV_FORWARD(in_dev)) {
 961                switch (rt->dst.error) {
 962                case EHOSTUNREACH:
 963                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 964                        break;
 965
 966                case ENETUNREACH:
 967                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 968                        break;
 969                }
 970                goto out;
 971        }
 972
 973        switch (rt->dst.error) {
 974        case EINVAL:
 975        default:
 976                goto out;
 977        case EHOSTUNREACH:
 978                code = ICMP_HOST_UNREACH;
 979                break;
 980        case ENETUNREACH:
 981                code = ICMP_NET_UNREACH;
 982                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 983                break;
 984        case EACCES:
 985                code = ICMP_PKT_FILTERED;
 986                break;
 987        }
 988
 989        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 990                               l3mdev_master_ifindex(skb->dev), 1);
 991
 992        send = true;
 993        if (peer) {
 994                now = jiffies;
 995                peer->rate_tokens += now - peer->rate_last;
 996                if (peer->rate_tokens > ip_rt_error_burst)
 997                        peer->rate_tokens = ip_rt_error_burst;
 998                peer->rate_last = now;
 999                if (peer->rate_tokens >= ip_rt_error_cost)
1000                        peer->rate_tokens -= ip_rt_error_cost;
1001                else
1002                        send = false;
1003                inet_putpeer(peer);
1004        }
1005        if (send)
1006                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1007
1008out:    kfree_skb(skb);
1009        return 0;
1010}
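/* The inet_peer rate limiter above is a token bucket counted in
 * jiffies: tokens accrue with elapsed time up to ip_rt_error_burst
 * (5 * HZ) and each ICMP error sent costs ip_rt_error_cost (HZ), so a
 * given source sees at most about one error per second sustained, with
 * bursts of up to five.
 */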
1011
1012static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1013{
1014        struct dst_entry *dst = &rt->dst;
1015        u32 old_mtu = ipv4_mtu(dst);
1016        struct fib_result res;
1017        bool lock = false;
1018
1019        if (ip_mtu_locked(dst))
1020                return;
1021
1022        if (old_mtu < mtu)
1023                return;
1024
1025        if (mtu < ip_rt_min_pmtu) {
1026                lock = true;
1027                mtu = min(old_mtu, ip_rt_min_pmtu);
1028        }
1029
1030        if (rt->rt_pmtu == mtu && !lock &&
1031            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1032                return;
1033
1034        rcu_read_lock();
1035        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1036                struct fib_nh_common *nhc = FIB_RES_NHC(res);
1037
1038                update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1039                                      jiffies + ip_rt_mtu_expires);
1040        }
1041        rcu_read_unlock();
1042}
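/* PMTU learning above is deliberately conservative: a reported MTU
 * below ip_rt_min_pmtu (552 by default) is not taken at face value;
 * the route instead gets min(old_mtu, ip_rt_min_pmtu) with the MTU
 * locked.  Whatever is learned is stored as a nexthop exception for
 * fl4->daddr and expires after ip_rt_mtu_expires (10 minutes).
 */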
1043
1044static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1045                              struct sk_buff *skb, u32 mtu)
1046{
1047        struct rtable *rt = (struct rtable *) dst;
1048        struct flowi4 fl4;
1049
1050        ip_rt_build_flow_key(&fl4, sk, skb);
1051        __ip_rt_update_pmtu(rt, &fl4, mtu);
1052}
1053
1054void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1055                      int oif, u8 protocol)
1056{
1057        const struct iphdr *iph = (const struct iphdr *) skb->data;
1058        struct flowi4 fl4;
1059        struct rtable *rt;
1060        u32 mark = IP4_REPLY_MARK(net, skb->mark);
1061
1062        __build_flow_key(net, &fl4, NULL, iph, oif,
1063                         RT_TOS(iph->tos), protocol, mark, 0);
1064        rt = __ip_route_output_key(net, &fl4);
1065        if (!IS_ERR(rt)) {
1066                __ip_rt_update_pmtu(rt, &fl4, mtu);
1067                ip_rt_put(rt);
1068        }
1069}
1070EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1071
1072static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1073{
1074        const struct iphdr *iph = (const struct iphdr *) skb->data;
1075        struct flowi4 fl4;
1076        struct rtable *rt;
1077
1078        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1079
1080        if (!fl4.flowi4_mark)
1081                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1082
1083        rt = __ip_route_output_key(sock_net(sk), &fl4);
1084        if (!IS_ERR(rt)) {
1085                __ip_rt_update_pmtu(rt, &fl4, mtu);
1086                ip_rt_put(rt);
1087        }
1088}
1089
1090void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1091{
1092        const struct iphdr *iph = (const struct iphdr *) skb->data;
1093        struct flowi4 fl4;
1094        struct rtable *rt;
1095        struct dst_entry *odst = NULL;
1096        bool new = false;
1097        struct net *net = sock_net(sk);
1098
1099        bh_lock_sock(sk);
1100
1101        if (!ip_sk_accept_pmtu(sk))
1102                goto out;
1103
1104        odst = sk_dst_get(sk);
1105
1106        if (sock_owned_by_user(sk) || !odst) {
1107                __ipv4_sk_update_pmtu(skb, sk, mtu);
1108                goto out;
1109        }
1110
1111        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1112
1113        rt = (struct rtable *)odst;
1114        if (odst->obsolete && !odst->ops->check(odst, 0)) {
1115                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1116                if (IS_ERR(rt))
1117                        goto out;
1118
1119                new = true;
1120        }
1121
1122        __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1123
1124        if (!dst_check(&rt->dst, 0)) {
1125                if (new)
1126                        dst_release(&rt->dst);
1127
1128                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1129                if (IS_ERR(rt))
1130                        goto out;
1131
1132                new = true;
1133        }
1134
1135        if (new)
1136                sk_dst_set(sk, &rt->dst);
1137
1138out:
1139        bh_unlock_sock(sk);
1140        dst_release(odst);
1141}
1142EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1143
1144void ipv4_redirect(struct sk_buff *skb, struct net *net,
1145                   int oif, u8 protocol)
1146{
1147        const struct iphdr *iph = (const struct iphdr *) skb->data;
1148        struct flowi4 fl4;
1149        struct rtable *rt;
1150
1151        __build_flow_key(net, &fl4, NULL, iph, oif,
1152                         RT_TOS(iph->tos), protocol, 0, 0);
1153        rt = __ip_route_output_key(net, &fl4);
1154        if (!IS_ERR(rt)) {
1155                __ip_do_redirect(rt, skb, &fl4, false);
1156                ip_rt_put(rt);
1157        }
1158}
1159EXPORT_SYMBOL_GPL(ipv4_redirect);
1160
1161void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1162{
1163        const struct iphdr *iph = (const struct iphdr *) skb->data;
1164        struct flowi4 fl4;
1165        struct rtable *rt;
1166        struct net *net = sock_net(sk);
1167
1168        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1169        rt = __ip_route_output_key(net, &fl4);
1170        if (!IS_ERR(rt)) {
1171                __ip_do_redirect(rt, skb, &fl4, false);
1172                ip_rt_put(rt);
1173        }
1174}
1175EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1176
1177static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1178{
1179        struct rtable *rt = (struct rtable *) dst;
1180
1181        /* All IPv4 dsts are created with ->obsolete set to the value
1182         * DST_OBSOLETE_FORCE_CHK, which forces validation calls down
1183         * into this function on every use.
1184         *
1185         * When a PMTU/redirect information update invalidates a route,
1186         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1187         * DST_OBSOLETE_DEAD.
1188         */
1189        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1190                return NULL;
1191        return dst;
1192}
1193
1194static void ipv4_send_dest_unreach(struct sk_buff *skb)
1195{
1196        struct ip_options opt;
1197        int res;
1198
1199        /* Recompile IP options, since IPCB may not be valid anymore.
1200         * Also check that we have a reasonable IPv4 header.
1201         */
1202        if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1203            ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1204                return;
1205
1206        memset(&opt, 0, sizeof(opt));
1207        if (ip_hdr(skb)->ihl > 5) {
1208                if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1209                        return;
1210                opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1211
1212                rcu_read_lock();
1213                res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1214                rcu_read_unlock();
1215
1216                if (res)
1217                        return;
1218        }
1219        __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1220}
1221
1222static void ipv4_link_failure(struct sk_buff *skb)
1223{
1224        struct rtable *rt;
1225
1226        ipv4_send_dest_unreach(skb);
1227
1228        rt = skb_rtable(skb);
1229        if (rt)
1230                dst_set_expires(&rt->dst, 0);
1231}
1232
1233static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1234{
1235        pr_debug("%s: %pI4 -> %pI4, %s\n",
1236                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1237                 skb->dev ? skb->dev->name : "?");
1238        kfree_skb(skb);
1239        WARN_ON(1);
1240        return 0;
1241}
1242
1243/*
1244   We do not cache the source address of the outgoing interface,
1245   because it is used only by the IP RR, TS and SRR options,
1246   so it is out of the fast path.
1247
1248   BTW remember: "addr" is allowed to be unaligned
1249   in IP options!
1250 */
1251
1252void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1253{
1254        __be32 src;
1255
1256        if (rt_is_output_route(rt))
1257                src = ip_hdr(skb)->saddr;
1258        else {
1259                struct fib_result res;
1260                struct iphdr *iph = ip_hdr(skb);
1261                struct flowi4 fl4 = {
1262                        .daddr = iph->daddr,
1263                        .saddr = iph->saddr,
1264                        .flowi4_tos = RT_TOS(iph->tos),
1265                        .flowi4_oif = rt->dst.dev->ifindex,
1266                        .flowi4_iif = skb->dev->ifindex,
1267                        .flowi4_mark = skb->mark,
1268                };
1269
1270                rcu_read_lock();
1271                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1272                        src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1273                else
1274                        src = inet_select_addr(rt->dst.dev,
1275                                               rt_nexthop(rt, iph->daddr),
1276                                               RT_SCOPE_UNIVERSE);
1277                rcu_read_unlock();
1278        }
1279        memcpy(addr, &src, 4);
1280}
1281
1282#ifdef CONFIG_IP_ROUTE_CLASSID
1283static void set_class_tag(struct rtable *rt, u32 tag)
1284{
1285        if (!(rt->dst.tclassid & 0xFFFF))
1286                rt->dst.tclassid |= tag & 0xFFFF;
1287        if (!(rt->dst.tclassid & 0xFFFF0000))
1288                rt->dst.tclassid |= tag & 0xFFFF0000;
1289}
1290#endif
1291
1292static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1293{
1294        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1295        unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1296                                    ip_rt_min_advmss);
1297
1298        return min(advmss, IPV4_MAX_PMTU - header_size);
1299}
1300
1301static unsigned int ipv4_mtu(const struct dst_entry *dst)
1302{
1303        const struct rtable *rt = (const struct rtable *) dst;
1304        unsigned int mtu = rt->rt_pmtu;
1305
1306        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1307                mtu = dst_metric_raw(dst, RTAX_MTU);
1308
1309        if (mtu)
1310                return mtu;
1311
1312        mtu = READ_ONCE(dst->dev->mtu);
1313
1314        if (unlikely(ip_mtu_locked(dst))) {
1315                if (rt->rt_gw_family && mtu > 576)
1316                        mtu = 576;
1317        }
1318
1319        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1320
1321        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1322}
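/* ipv4_mtu() resolution order: a still-valid learned path MTU
 * (rt_pmtu), then an explicit RTAX_MTU route metric, then the egress
 * device MTU.  The clamp to 576 for MTU-locked routes with a gateway
 * preserves the classic conservative IPv4 default (cf. RFC 1122).
 */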
1323
1324static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1325{
1326        struct fnhe_hash_bucket *hash;
1327        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1328        u32 hval = fnhe_hashfun(daddr);
1329
1330        spin_lock_bh(&fnhe_lock);
1331
1332        hash = rcu_dereference_protected(nhc->nhc_exceptions,
1333                                         lockdep_is_held(&fnhe_lock));
1334        hash += hval;
1335
1336        fnhe_p = &hash->chain;
1337        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1338        while (fnhe) {
1339                if (fnhe->fnhe_daddr == daddr) {
1340                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1341                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1342                        /* set fnhe_daddr to 0 to ensure it won't bind with
1343                         * new dsts in rt_bind_exception().
1344                         */
1345                        fnhe->fnhe_daddr = 0;
1346                        fnhe_flush_routes(fnhe);
1347                        kfree_rcu(fnhe, rcu);
1348                        break;
1349                }
1350                fnhe_p = &fnhe->fnhe_next;
1351                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1352                                                 lockdep_is_held(&fnhe_lock));
1353        }
1354
1355        spin_unlock_bh(&fnhe_lock);
1356}
1357
1358static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1359                                               __be32 daddr)
1360{
1361        struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1362        struct fib_nh_exception *fnhe;
1363        u32 hval;
1364
1365        if (!hash)
1366                return NULL;
1367
1368        hval = fnhe_hashfun(daddr);
1369
1370        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1371             fnhe = rcu_dereference(fnhe->fnhe_next)) {
1372                if (fnhe->fnhe_daddr == daddr) {
1373                        if (fnhe->fnhe_expires &&
1374                            time_after(jiffies, fnhe->fnhe_expires)) {
1375                                ip_del_fnhe(nhc, daddr);
1376                                break;
1377                        }
1378                        return fnhe;
1379                }
1380        }
1381        return NULL;
1382}
1383
1384/* MTU selection:
1385 * 1. mtu on route is locked - use it
1386 * 2. mtu from nexthop exception
1387 * 3. mtu from egress device
1388 */
1389
1390u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1391{
1392        struct fib_nh_common *nhc = res->nhc;
1393        struct net_device *dev = nhc->nhc_dev;
1394        struct fib_info *fi = res->fi;
1395        u32 mtu = 0;
1396
1397        if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1398            fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1399                mtu = fi->fib_mtu;
1400
1401        if (likely(!mtu)) {
1402                struct fib_nh_exception *fnhe;
1403
1404                fnhe = find_exception(nhc, daddr);
1405                if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1406                        mtu = fnhe->fnhe_pmtu;
1407        }
1408
1409        if (likely(!mtu))
1410                mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1411
1412        return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1413}
1414
1415static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1416                              __be32 daddr, const bool do_cache)
1417{
1418        bool ret = false;
1419
1420        spin_lock_bh(&fnhe_lock);
1421
1422        if (daddr == fnhe->fnhe_daddr) {
1423                struct rtable __rcu **porig;
1424                struct rtable *orig;
1425                int genid = fnhe_genid(dev_net(rt->dst.dev));
1426
1427                if (rt_is_input_route(rt))
1428                        porig = &fnhe->fnhe_rth_input;
1429                else
1430                        porig = &fnhe->fnhe_rth_output;
1431                orig = rcu_dereference(*porig);
1432
1433                if (fnhe->fnhe_genid != genid) {
1434                        fnhe->fnhe_genid = genid;
1435                        fnhe->fnhe_gw = 0;
1436                        fnhe->fnhe_pmtu = 0;
1437                        fnhe->fnhe_expires = 0;
1438                        fnhe->fnhe_mtu_locked = false;
1439                        fnhe_flush_routes(fnhe);
1440                        orig = NULL;
1441                }
1442                fill_route_from_fnhe(rt, fnhe);
1443                if (!rt->rt_gw4) {
1444                        rt->rt_gw4 = daddr;
1445                        rt->rt_gw_family = AF_INET;
1446                }
1447
1448                if (do_cache) {
1449                        dst_hold(&rt->dst);
1450                        rcu_assign_pointer(*porig, rt);
1451                        if (orig) {
1452                                dst_dev_put(&orig->dst);
1453                                dst_release(&orig->dst);
1454                        }
1455                        ret = true;
1456                }
1457
1458                fnhe->fnhe_stamp = jiffies;
1459        }
1460        spin_unlock_bh(&fnhe_lock);
1461
1462        return ret;
1463}
1464
1465static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1466{
1467        struct rtable *orig, *prev, **p;
1468        bool ret = true;
1469
1470        if (rt_is_input_route(rt)) {
1471                p = (struct rtable **)&nhc->nhc_rth_input;
1472        } else {
1473                p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1474        }
1475        orig = *p;
1476
1477        /* hold dst before doing cmpxchg() to avoid race condition
1478         * on this dst
1479         */
1480        dst_hold(&rt->dst);
1481        prev = cmpxchg(p, orig, rt);
1482        if (prev == orig) {
1483                if (orig) {
1484                        dst_dev_put(&orig->dst);
1485                        dst_release(&orig->dst);
1486                }
1487        } else {
1488                dst_release(&rt->dst);
1489                ret = false;
1490        }
1491
1492        return ret;
1493}
1494
1495struct uncached_list {
1496        spinlock_t              lock;
1497        struct list_head        head;
1498};
1499
1500static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
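/* Routes that are not cached on a nexthop end up on these per-cpu
 * lists for one purpose: rt_flush_dev() below walks them when a device
 * goes away and re-points dst.dev at the loopback device so the old
 * device's refcount can drop.
 */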
1501
1502void rt_add_uncached_list(struct rtable *rt)
1503{
1504        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1505
1506        rt->rt_uncached_list = ul;
1507
1508        spin_lock_bh(&ul->lock);
1509        list_add_tail(&rt->rt_uncached, &ul->head);
1510        spin_unlock_bh(&ul->lock);
1511}
1512
1513void rt_del_uncached_list(struct rtable *rt)
1514{
1515        if (!list_empty(&rt->rt_uncached)) {
1516                struct uncached_list *ul = rt->rt_uncached_list;
1517
1518                spin_lock_bh(&ul->lock);
1519                list_del(&rt->rt_uncached);
1520                spin_unlock_bh(&ul->lock);
1521        }
1522}
1523
1524static void ipv4_dst_destroy(struct dst_entry *dst)
1525{
1526        struct rtable *rt = (struct rtable *)dst;
1527
1528        ip_dst_metrics_put(dst);
1529        rt_del_uncached_list(rt);
1530}
1531
1532void rt_flush_dev(struct net_device *dev)
1533{
1534        struct net *net = dev_net(dev);
1535        struct rtable *rt;
1536        int cpu;
1537
1538        for_each_possible_cpu(cpu) {
1539                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1540
1541                spin_lock_bh(&ul->lock);
1542                list_for_each_entry(rt, &ul->head, rt_uncached) {
1543                        if (rt->dst.dev != dev)
1544                                continue;
1545                        rt->dst.dev = net->loopback_dev;
1546                        dev_hold(rt->dst.dev);
1547                        dev_put(dev);
1548                }
1549                spin_unlock_bh(&ul->lock);
1550        }
1551}
1552
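/* A cached route is usable only while it is still marked
 * DST_OBSOLETE_FORCE_CHK and its generation id matches the namespace
 * one, i.e. no rt_cache_flush() has happened since it was created.
 */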
1553static bool rt_cache_valid(const struct rtable *rt)
1554{
1555        return  rt &&
1556                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1557                !rt_is_expired(rt);
1558}
1559
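/* Fill in the nexthop-derived fields of a freshly allocated rtable:
 * gateway, metrics, classid and lwtunnel state.  Then try to cache the
 * route, preferring the nexthop exception slot when @fnhe is set; a
 * route that could not be cached goes onto the uncached list so it can
 * still be flushed on device removal.
 */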
1560static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1561                           const struct fib_result *res,
1562                           struct fib_nh_exception *fnhe,
1563                           struct fib_info *fi, u16 type, u32 itag,
1564                           const bool do_cache)
1565{
1566        bool cached = false;
1567
1568        if (fi) {
1569                struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1570
1571                if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1572                        rt->rt_gw_family = nhc->nhc_gw_family;
1573                        /* only INET and INET6 are supported */
1574                        if (likely(nhc->nhc_gw_family == AF_INET))
1575                                rt->rt_gw4 = nhc->nhc_gw.ipv4;
1576                        else
1577                                rt->rt_gw6 = nhc->nhc_gw.ipv6;
1578                }
1579
1580                ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1581
1582#ifdef CONFIG_IP_ROUTE_CLASSID
1583                {
1584                        struct fib_nh *nh;
1585
1586                        nh = container_of(nhc, struct fib_nh, nh_common);
1587                        rt->dst.tclassid = nh->nh_tclassid;
1588                }
1589#endif
1590                rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1591                if (unlikely(fnhe))
1592                        cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1593                else if (do_cache)
1594                        cached = rt_cache_route(nhc, rt);
1595                if (unlikely(!cached)) {
1596                        /* Routes we intend to cache in nexthop exception or
1597                         * FIB nexthop have the DST_NOCACHE bit clear.
1598                         * However, if we are unsuccessful at storing this
1599                         * route into the cache we really need to set it.
1600                         */
1601                        if (!rt->rt_gw4) {
1602                                rt->rt_gw_family = AF_INET;
1603                                rt->rt_gw4 = daddr;
1604                        }
1605                        rt_add_uncached_list(rt);
1606                }
1607        } else
1608                rt_add_uncached_list(rt);
1609
1610#ifdef CONFIG_IP_ROUTE_CLASSID
1611#ifdef CONFIG_IP_MULTIPLE_TABLES
1612        set_class_tag(rt, res->tclassid);
1613#endif
1614        set_class_tag(rt, itag);
1615#endif
1616}
1617
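/* Allocate and minimally initialize an IPv4 rtable.  A usage sketch
 * (hypothetical caller, not taken from this file):
 *
 *	rt = rt_dst_alloc(dev, RTCF_LOCAL, RTN_LOCAL, false, false, false);
 *	if (!rt)
 *		return -ENOBUFS;
 *	skb_dst_set(skb, &rt->dst);
 *
 * Only dst.output (and, for RTCF_LOCAL, dst.input) is preset here;
 * input routes must still set rt_is_input and dst.input themselves.
 */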
1618struct rtable *rt_dst_alloc(struct net_device *dev,
1619                            unsigned int flags, u16 type,
1620                            bool nopolicy, bool noxfrm, bool will_cache)
1621{
1622        struct rtable *rt;
1623
1624        rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1625                       (will_cache ? 0 : DST_HOST) |
1626                       (nopolicy ? DST_NOPOLICY : 0) |
1627                       (noxfrm ? DST_NOXFRM : 0));
1628
1629        if (rt) {
1630                rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1631                rt->rt_flags = flags;
1632                rt->rt_type = type;
1633                rt->rt_is_input = 0;
1634                rt->rt_iif = 0;
1635                rt->rt_pmtu = 0;
1636                rt->rt_mtu_locked = 0;
1637                rt->rt_gw_family = 0;
1638                rt->rt_gw4 = 0;
1639                INIT_LIST_HEAD(&rt->rt_uncached);
1640
1641                rt->dst.output = ip_output;
1642                if (flags & RTCF_LOCAL)
1643                        rt->dst.input = ip_local_deliver;
1644        }
1645
1646        return rt;
1647}
1648EXPORT_SYMBOL(rt_dst_alloc);
1649
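/* Duplicate @rt, binding the copy to @dev.  The clone gets a fresh
 * genid and its own refcount but inherits the routing decision (type,
 * gateway, pmtu, input/output handlers) of the original.
 */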
1650struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1651{
1652        struct rtable *new_rt;
1653
1654        new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1655                           rt->dst.flags);
1656
1657        if (new_rt) {
1658                new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1659                new_rt->rt_flags = rt->rt_flags;
1660                new_rt->rt_type = rt->rt_type;
1661                new_rt->rt_is_input = rt->rt_is_input;
1662                new_rt->rt_iif = rt->rt_iif;
1663                new_rt->rt_pmtu = rt->rt_pmtu;
1664                new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1665                new_rt->rt_gw_family = rt->rt_gw_family;
1666                if (rt->rt_gw_family == AF_INET)
1667                        new_rt->rt_gw4 = rt->rt_gw4;
1668                else if (rt->rt_gw_family == AF_INET6)
1669                        new_rt->rt_gw6 = rt->rt_gw6;
1670                INIT_LIST_HEAD(&new_rt->rt_uncached);
1671
1672                new_rt->dst.flags |= DST_HOST;
1673                new_rt->dst.input = rt->dst.input;
1674                new_rt->dst.output = rt->dst.output;
1675                new_rt->dst.error = rt->dst.error;
1676                new_rt->dst.lastuse = jiffies;
1677                new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1678        }
1679        return new_rt;
1680}
1681EXPORT_SYMBOL(rt_dst_clone);
1682
1683/* called in rcu_read_lock() section */
1684int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1685                          u8 tos, struct net_device *dev,
1686                          struct in_device *in_dev, u32 *itag)
1687{
1688        int err;
1689
1690        /* Primary sanity checks. */
1691        if (!in_dev)
1692                return -EINVAL;
1693
1694        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1695            skb->protocol != htons(ETH_P_IP))
1696                return -EINVAL;
1697
1698        if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1699                return -EINVAL;
1700
1701        if (ipv4_is_zeronet(saddr)) {
1702                if (!ipv4_is_local_multicast(daddr) &&
1703                    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1704                        return -EINVAL;
1705        } else {
1706                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1707                                          in_dev, itag);
1708                if (err < 0)
1709                        return err;
1710        }
1711        return 0;
1712}
1713
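/* Build the input route for a multicast packet whose source has been
 * validated above: the packet is delivered locally (and, with
 * CONFIG_IP_MROUTE, possibly forwarded via ip_mr_input); any attempt
 * to transmit on this route is trapped by ip_rt_bug().
 */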
1714/* called in rcu_read_lock() section */
1715static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1716                             u8 tos, struct net_device *dev, int our)
1717{
1718        struct in_device *in_dev = __in_dev_get_rcu(dev);
1719        unsigned int flags = RTCF_MULTICAST;
1720        struct rtable *rth;
1721        u32 itag = 0;
1722        int err;
1723
1724        err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1725        if (err)
1726                return err;
1727
1728        if (our)
1729                flags |= RTCF_LOCAL;
1730
1731        rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1732                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1733        if (!rth)
1734                return -ENOBUFS;
1735
1736#ifdef CONFIG_IP_ROUTE_CLASSID
1737        rth->dst.tclassid = itag;
1738#endif
1739        rth->dst.output = ip_rt_bug;
1740        rth->rt_is_input = 1;
1741
1742#ifdef CONFIG_IP_MROUTE
1743        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1744                rth->dst.input = ip_mr_input;
1745#endif
1746        RT_CACHE_STAT_INC(in_slow_mc);
1747
1748        skb_dst_set(skb, &rth->dst);
1749        return 0;
1750}
1751
1752
1753static void ip_handle_martian_source(struct net_device *dev,
1754                                     struct in_device *in_dev,
1755                                     struct sk_buff *skb,
1756                                     __be32 daddr,
1757                                     __be32 saddr)
1758{
1759        RT_CACHE_STAT_INC(in_martian_src);
1760#ifdef CONFIG_IP_ROUTE_VERBOSE
1761        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1762                /*
1763                 *      RFC1812 recommendation: if the source is martian,
1764                 *      the only hint is the MAC header.
1765                 */
1766                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1767                        &daddr, &saddr, dev->name);
1768                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1769                        print_hex_dump(KERN_WARNING, "ll header: ",
1770                                       DUMP_PREFIX_OFFSET, 16, 1,
1771                                       skb_mac_header(skb),
1772                                       dev->hard_header_len, false);
1773                }
1774        }
1775#endif
1776}
1777
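/* Build (or reuse from the nexthop cache) the forwarding route for a
 * packet whose FIB lookup resolved to a unicast nexthop, applying the
 * redirect and proxy-ARP sanity checks on the way.
 */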
1778/* called in rcu_read_lock() section */
1779static int __mkroute_input(struct sk_buff *skb,
1780                           const struct fib_result *res,
1781                           struct in_device *in_dev,
1782                           __be32 daddr, __be32 saddr, u32 tos)
1783{
1784        struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1785        struct net_device *dev = nhc->nhc_dev;
1786        struct fib_nh_exception *fnhe;
1787        struct rtable *rth;
1788        int err;
1789        struct in_device *out_dev;
1790        bool do_cache;
1791        u32 itag = 0;
1792
1793        /* get a working reference to the output device */
1794        out_dev = __in_dev_get_rcu(dev);
1795        if (!out_dev) {
1796                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1797                return -EINVAL;
1798        }
1799
1800        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1801                                  in_dev->dev, in_dev, &itag);
1802        if (err < 0) {
1803                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1804                                         saddr);
1805
1806                goto cleanup;
1807        }
1808
1809        do_cache = res->fi && !itag;
1810        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1811            skb->protocol == htons(ETH_P_IP)) {
1812                __be32 gw;
1813
1814                gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1815                if (IN_DEV_SHARED_MEDIA(out_dev) ||
1816                    inet_addr_onlink(out_dev, saddr, gw))
1817                        IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1818        }
1819
1820        if (skb->protocol != htons(ETH_P_IP)) {
1821                /* Not IP (i.e. ARP). Do not create a route if it is
1822                 * invalid for proxy ARP. DNAT routes are always valid.
1823                 *
1824                 * The proxy ARP feature has been extended to allow ARP
1825                 * replies back out of the same interface, to support
1826                 * private VLAN switch technologies. See arp.c.
1827                 */
1828                if (out_dev == in_dev &&
1829                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1830                        err = -EINVAL;
1831                        goto cleanup;
1832                }
1833        }
1834
1835        fnhe = find_exception(nhc, daddr);
1836        if (do_cache) {
1837                if (fnhe)
1838                        rth = rcu_dereference(fnhe->fnhe_rth_input);
1839                else
1840                        rth = rcu_dereference(nhc->nhc_rth_input);
1841                if (rt_cache_valid(rth)) {
1842                        skb_dst_set_noref(skb, &rth->dst);
1843                        goto out;
1844                }
1845        }
1846
1847        rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1848                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
1849                           IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1850        if (!rth) {
1851                err = -ENOBUFS;
1852                goto cleanup;
1853        }
1854
1855        rth->rt_is_input = 1;
1856        RT_CACHE_STAT_INC(in_slow_tot);
1857
1858        rth->dst.input = ip_forward;
1859
1860        rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1861                       do_cache);
1862        lwtunnel_set_redirect(&rth->dst);
1863        skb_dst_set(skb, &rth->dst);
1864out:
1865        err = 0;
1866 cleanup:
1867        return err;
1868}
1869
1870#ifdef CONFIG_IP_ROUTE_MULTIPATH
1871/* To make ICMP error packets follow the same path as the flow they
1872 * refer to, the multipath hash is calculated from the inner IP addresses.
1873 */
1874static void ip_multipath_l3_keys(const struct sk_buff *skb,
1875                                 struct flow_keys *hash_keys)
1876{
1877        const struct iphdr *outer_iph = ip_hdr(skb);
1878        const struct iphdr *key_iph = outer_iph;
1879        const struct iphdr *inner_iph;
1880        const struct icmphdr *icmph;
1881        struct iphdr _inner_iph;
1882        struct icmphdr _icmph;
1883
1884        if (likely(outer_iph->protocol != IPPROTO_ICMP))
1885                goto out;
1886
1887        if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1888                goto out;
1889
1890        icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1891                                   &_icmph);
1892        if (!icmph)
1893                goto out;
1894
1895        if (icmph->type != ICMP_DEST_UNREACH &&
1896            icmph->type != ICMP_REDIRECT &&
1897            icmph->type != ICMP_TIME_EXCEEDED &&
1898            icmph->type != ICMP_PARAMETERPROB)
1899                goto out;
1900
1901        inner_iph = skb_header_pointer(skb,
1902                                       outer_iph->ihl * 4 + sizeof(_icmph),
1903                                       sizeof(_inner_iph), &_inner_iph);
1904        if (!inner_iph)
1905                goto out;
1906
1907        key_iph = inner_iph;
1908out:
1909        hash_keys->addrs.v4addrs.src = key_iph->saddr;
1910        hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1911}
1912
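/* Compute the multipath hash according to the namespace's
 * fib_multipath_hash_policy sysctl: 0 hashes on the L3 source and
 * destination addresses (using the inner header for ICMP errors, see
 * above), 1 hashes on the L4 five-tuple.  The result is shifted down
 * to 31 bits so it always fits in a non-negative int.
 */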
1913/* if skb is set it will be used and fl4 can be NULL */
1914int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1915                       const struct sk_buff *skb, struct flow_keys *flkeys)
1916{
1917        u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1918        struct flow_keys hash_keys;
1919        u32 mhash;
1920
1921        switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1922        case 0:
1923                memset(&hash_keys, 0, sizeof(hash_keys));
1924                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1925                if (skb) {
1926                        ip_multipath_l3_keys(skb, &hash_keys);
1927                } else {
1928                        hash_keys.addrs.v4addrs.src = fl4->saddr;
1929                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
1930                }
1931                break;
1932        case 1:
1933                /* skb is currently provided only when forwarding */
1934                if (skb) {
1935                        unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1936                        struct flow_keys keys;
1937
1938                        /* short-circuit if we already have L4 hash present */
1939                        if (skb->l4_hash)
1940                                return skb_get_hash_raw(skb) >> 1;
1941
1942                        memset(&hash_keys, 0, sizeof(hash_keys));
1943
1944                        if (!flkeys) {
1945                                skb_flow_dissect_flow_keys(skb, &keys, flag);
1946                                flkeys = &keys;
1947                        }
1948
1949                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1950                        hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1951                        hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1952                        hash_keys.ports.src = flkeys->ports.src;
1953                        hash_keys.ports.dst = flkeys->ports.dst;
1954                        hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1955                } else {
1956                        memset(&hash_keys, 0, sizeof(hash_keys));
1957                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1958                        hash_keys.addrs.v4addrs.src = fl4->saddr;
1959                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
1960                        hash_keys.ports.src = fl4->fl4_sport;
1961                        hash_keys.ports.dst = fl4->fl4_dport;
1962                        hash_keys.basic.ip_proto = fl4->flowi4_proto;
1963                }
1964                break;
1965        }
1966        mhash = flow_hash_from_keys(&hash_keys);
1967
1968        if (multipath_hash)
1969                mhash = jhash_2words(mhash, multipath_hash, 0);
1970
1971        return mhash >> 1;
1972}
1973#endif /* CONFIG_IP_ROUTE_MULTIPATH */
1974
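/* Pick one nexthop (hashing across them if the matched route is
 * multipath) and build the input route for it.
 */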
1975static int ip_mkroute_input(struct sk_buff *skb,
1976                            struct fib_result *res,
1977                            struct in_device *in_dev,
1978                            __be32 daddr, __be32 saddr, u32 tos,
1979                            struct flow_keys *hkeys)
1980{
1981#ifdef CONFIG_IP_ROUTE_MULTIPATH
1982        if (res->fi && res->fi->fib_nhs > 1) {
1983                int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1984
1985                fib_select_multipath(res, h);
1986        }
1987#endif
1988
1989        /* create a routing cache entry */
1990        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1991}
1992
1993/*
1994 *      NOTE. We drop all packets that have a local source
1995 *      address, because every properly looped-back packet
1996 *      must already have the correct destination attached by the output routine.
1997 *
1998 *      This approach solves two big problems:
1999 *      1. Non-simplex devices are handled properly.
2000 *      2. IP spoofing attempts are filtered with a 100% guarantee.
2001 *      called with rcu_read_lock()
2002 */
2003
2004static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2005                               u8 tos, struct net_device *dev,
2006                               struct fib_result *res)
2007{
2008        struct in_device *in_dev = __in_dev_get_rcu(dev);
2009        struct flow_keys *flkeys = NULL, _flkeys;
2010        struct net    *net = dev_net(dev);
2011        struct ip_tunnel_info *tun_info;
2012        int             err = -EINVAL;
2013        unsigned int    flags = 0;
2014        u32             itag = 0;
2015        struct rtable   *rth;
2016        struct flowi4   fl4;
2017        bool do_cache = true;
2018
2019        /* IP on this device is disabled. */
2020
2021        if (!in_dev)
2022                goto out;
2023
2024        /* Check for the weirdest martians, which cannot be detected
2025         * by fib_lookup.
2026         */
2027
2028        tun_info = skb_tunnel_info(skb);
2029        if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2030                fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2031        else
2032                fl4.flowi4_tun_key.tun_id = 0;
2033        skb_dst_drop(skb);
2034
2035        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2036                goto martian_source;
2037
2038        res->fi = NULL;
2039        res->table = NULL;
2040        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2041                goto brd_input;
2042
2043        /* Accept zero addresses only for limited broadcast;
2044         * I do not even know whether to fix this or not. Waiting for complaints :-)
2045         */
2046        if (ipv4_is_zeronet(saddr))
2047                goto martian_source;
2048
2049        if (ipv4_is_zeronet(daddr))
2050                goto martian_destination;
2051
2052        /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
2053         * more than once, calling it only when daddr and/or saddr is loopback.
2054         */
2055        if (ipv4_is_loopback(daddr)) {
2056                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2057                        goto martian_destination;
2058        } else if (ipv4_is_loopback(saddr)) {
2059                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2060                        goto martian_source;
2061        }
2062
2063        /*
2064         *      Now we are ready to route the packet.
2065         */
2066        fl4.flowi4_oif = 0;
2067        fl4.flowi4_iif = dev->ifindex;
2068        fl4.flowi4_mark = skb->mark;
2069        fl4.flowi4_tos = tos;
2070        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2071        fl4.flowi4_flags = 0;
2072        fl4.daddr = daddr;
2073        fl4.saddr = saddr;
2074        fl4.flowi4_uid = sock_net_uid(net, NULL);
2075
2076        if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2077                flkeys = &_flkeys;
2078        } else {
2079                fl4.flowi4_proto = 0;
2080                fl4.fl4_sport = 0;
2081                fl4.fl4_dport = 0;
2082        }
2083
2084        err = fib_lookup(net, &fl4, res, 0);
2085        if (err != 0) {
2086                if (!IN_DEV_FORWARD(in_dev))
2087                        err = -EHOSTUNREACH;
2088                goto no_route;
2089        }
2090
2091        if (res->type == RTN_BROADCAST) {
2092                if (IN_DEV_BFORWARD(in_dev))
2093                        goto make_route;
2094                /* do not cache if bc_forwarding is enabled */
2095                if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2096                        do_cache = false;
2097                goto brd_input;
2098        }
2099
2100        if (res->type == RTN_LOCAL) {
2101                err = fib_validate_source(skb, saddr, daddr, tos,
2102                                          0, dev, in_dev, &itag);
2103                if (err < 0)
2104                        goto martian_source;
2105                goto local_input;
2106        }
2107
2108        if (!IN_DEV_FORWARD(in_dev)) {
2109                err = -EHOSTUNREACH;
2110                goto no_route;
2111        }
2112        if (res->type != RTN_UNICAST)
2113                goto martian_destination;
2114
2115make_route:
2116        err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2117out:    return err;
2118
2119brd_input:
2120        if (skb->protocol != htons(ETH_P_IP))
2121                goto e_inval;
2122
2123        if (!ipv4_is_zeronet(saddr)) {
2124                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2125                                          in_dev, &itag);
2126                if (err < 0)
2127                        goto martian_source;
2128        }
2129        flags |= RTCF_BROADCAST;
2130        res->type = RTN_BROADCAST;
2131        RT_CACHE_STAT_INC(in_brd);
2132
2133local_input:
2134        do_cache &= res->fi && !itag;
2135        if (do_cache) {
2136                struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2137
2138                rth = rcu_dereference(nhc->nhc_rth_input);
2139                if (rt_cache_valid(rth)) {
2140                        skb_dst_set_noref(skb, &rth->dst);
2141                        err = 0;
2142                        goto out;
2143                }
2144        }
2145
2146        rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2147                           flags | RTCF_LOCAL, res->type,
2148                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2149        if (!rth)
2150                goto e_nobufs;
2151
2152        rth->dst.output = ip_rt_bug;
2153#ifdef CONFIG_IP_ROUTE_CLASSID
2154        rth->dst.tclassid = itag;
2155#endif
2156        rth->rt_is_input = 1;
2157
2158        RT_CACHE_STAT_INC(in_slow_tot);
2159        if (res->type == RTN_UNREACHABLE) {
2160                rth->dst.input = ip_error;
2161                rth->dst.error = -err;
2162                rth->rt_flags &= ~RTCF_LOCAL;
2163        }
2164
2165        if (do_cache) {
2166                struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2167
2168                rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2169                if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2170                        WARN_ON(rth->dst.input == lwtunnel_input);
2171                        rth->dst.lwtstate->orig_input = rth->dst.input;
2172                        rth->dst.input = lwtunnel_input;
2173                }
2174
2175                if (unlikely(!rt_cache_route(nhc, rth)))
2176                        rt_add_uncached_list(rth);
2177        }
2178        skb_dst_set(skb, &rth->dst);
2179        err = 0;
2180        goto out;
2181
2182no_route:
2183        RT_CACHE_STAT_INC(in_no_route);
2184        res->type = RTN_UNREACHABLE;
2185        res->fi = NULL;
2186        res->table = NULL;
2187        goto local_input;
2188
2189        /*
2190         *      Do not cache martian addresses: they should be logged (RFC1812)
2191         */
2192martian_destination:
2193        RT_CACHE_STAT_INC(in_martian_dst);
2194#ifdef CONFIG_IP_ROUTE_VERBOSE
2195        if (IN_DEV_LOG_MARTIANS(in_dev))
2196                net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2197                                     &daddr, &saddr, dev->name);
2198#endif
2199
2200e_inval:
2201        err = -EINVAL;
2202        goto out;
2203
2204e_nobufs:
2205        err = -ENOBUFS;
2206        goto out;
2207
2208martian_source:
2209        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2210        goto out;
2211}
2212
2213int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2214                         u8 tos, struct net_device *dev)
2215{
2216        struct fib_result res;
2217        int err;
2218
2219        tos &= IPTOS_RT_MASK;
2220        rcu_read_lock();
2221        err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2222        rcu_read_unlock();
2223
2224        return err;
2225}
2226EXPORT_SYMBOL(ip_route_input_noref);
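
/* A minimal usage sketch, mirroring what the receive path does in
 * ip_rcv_finish_core() (variable names are illustrative):
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err;
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 *
 * On success skb_dst(skb) is set and dst_input(skb) will deliver or
 * forward the packet.
 */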
2227
2228/* called with rcu_read_lock held */
2229int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2230                       u8 tos, struct net_device *dev, struct fib_result *res)
2231{
2232        /* Multicast recognition logic was moved from the route cache to here.
2233           The problem was that too many Ethernet cards have broken/missing
2234           hardware multicast filters :-( As a result, a host on a multicast
2235           network acquires a lot of useless route cache entries, e.g. for
2236           SDR messages from all over the world. Now we try to get rid of them.
2237           Really, provided the software IP multicast filter is organized
2238           reasonably (at least, hashed), it does not result in a slowdown
2239           compared with route cache reject entries.
2240           Note that multicast routers are not affected, because a
2241           route cache entry is created eventually.
2242         */
2243        if (ipv4_is_multicast(daddr)) {
2244                struct in_device *in_dev = __in_dev_get_rcu(dev);
2245                int our = 0;
2246                int err = -EINVAL;
2247
2248                if (!in_dev)
2249                        return err;
2250                our = ip_check_mc_rcu(in_dev, daddr, saddr,
2251                                      ip_hdr(skb)->protocol);
2252
2253                /* check l3 master if no match yet */
2254                if (!our && netif_is_l3_slave(dev)) {
2255                        struct in_device *l3_in_dev;
2256
2257                        l3_in_dev = __in_dev_get_rcu(skb->dev);
2258                        if (l3_in_dev)
2259                                our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2260                                                      ip_hdr(skb)->protocol);
2261                }
2262
2263                if (our
2264#ifdef CONFIG_IP_MROUTE
2265                        ||
2266                    (!ipv4_is_local_multicast(daddr) &&
2267                     IN_DEV_MFORWARD(in_dev))
2268#endif
2269                   ) {
2270                        err = ip_route_input_mc(skb, daddr, saddr,
2271                                                tos, dev, our);
2272                }
2273                return err;
2274        }
2275
2276        return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2277}
2278
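/* Build (or find in the nexthop cache) the rtable for an output route
 * already resolved by fib_lookup(), applying the broadcast/multicast
 * special cases and caching the result back into the nexthop unless
 * caching is ruled out.
 */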
2279/* called with rcu_read_lock() */
2280static struct rtable *__mkroute_output(const struct fib_result *res,
2281                                       const struct flowi4 *fl4, int orig_oif,
2282                                       struct net_device *dev_out,
2283                                       unsigned int flags)
2284{
2285        struct fib_info *fi = res->fi;
2286        struct fib_nh_exception *fnhe;
2287        struct in_device *in_dev;
2288        u16 type = res->type;
2289        struct rtable *rth;
2290        bool do_cache;
2291
2292        in_dev = __in_dev_get_rcu(dev_out);
2293        if (!in_dev)
2294                return ERR_PTR(-EINVAL);
2295
2296        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2297                if (ipv4_is_loopback(fl4->saddr) &&
2298                    !(dev_out->flags & IFF_LOOPBACK) &&
2299                    !netif_is_l3_master(dev_out))
2300                        return ERR_PTR(-EINVAL);
2301
2302        if (ipv4_is_lbcast(fl4->daddr))
2303                type = RTN_BROADCAST;
2304        else if (ipv4_is_multicast(fl4->daddr))
2305                type = RTN_MULTICAST;
2306        else if (ipv4_is_zeronet(fl4->daddr))
2307                return ERR_PTR(-EINVAL);
2308
2309        if (dev_out->flags & IFF_LOOPBACK)
2310                flags |= RTCF_LOCAL;
2311
2312        do_cache = true;
2313        if (type == RTN_BROADCAST) {
2314                flags |= RTCF_BROADCAST | RTCF_LOCAL;
2315                fi = NULL;
2316        } else if (type == RTN_MULTICAST) {
2317                flags |= RTCF_MULTICAST | RTCF_LOCAL;
2318                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2319                                     fl4->flowi4_proto))
2320                        flags &= ~RTCF_LOCAL;
2321                else
2322                        do_cache = false;
2323                /* If the multicast route does not exist, use the
2324                 * default one, but do not gateway in this case.
2325                 * Yes, it is a hack.
2326                 */
2327                if (fi && res->prefixlen < 4)
2328                        fi = NULL;
2329        } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2330                   (orig_oif != dev_out->ifindex)) {
2331                /* For local routes that require a particular output interface
2332                 * we do not want to cache the result.  Caching the result
2333                 * causes incorrect behaviour when there are multiple source
2334                 * addresses on the interface, the end result being that if the
2335                 * intended recipient is waiting on that interface for the
2336                 * packet, it won't receive it, because it will be delivered on
2337                 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2338                 * be set to the loopback interface as well.
2339                 */
2340                do_cache = false;
2341        }
2342
2343        fnhe = NULL;
2344        do_cache &= fi != NULL;
2345        if (fi) {
2346                struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2347                struct rtable __rcu **prth;
2348
2349                fnhe = find_exception(nhc, fl4->daddr);
2350                if (!do_cache)
2351                        goto add;
2352                if (fnhe) {
2353                        prth = &fnhe->fnhe_rth_output;
2354                } else {
2355                        if (unlikely(fl4->flowi4_flags &
2356                                     FLOWI_FLAG_KNOWN_NH &&
2357                                     !(nhc->nhc_gw_family &&
2358                                       nhc->nhc_scope == RT_SCOPE_LINK))) {
2359                                do_cache = false;
2360                                goto add;
2361                        }
2362                        prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2363                }
2364                rth = rcu_dereference(*prth);
2365                if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2366                        return rth;
2367        }
2368
2369add:
2370        rth = rt_dst_alloc(dev_out, flags, type,
2371                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
2372                           IN_DEV_CONF_GET(in_dev, NOXFRM),
2373                           do_cache);
2374        if (!rth)
2375                return ERR_PTR(-ENOBUFS);
2376
2377        rth->rt_iif = orig_oif;
2378
2379        RT_CACHE_STAT_INC(out_slow_tot);
2380
2381        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2382                if (flags & RTCF_LOCAL &&
2383                    !(dev_out->flags & IFF_LOOPBACK)) {
2384                        rth->dst.output = ip_mc_output;
2385                        RT_CACHE_STAT_INC(out_slow_mc);
2386                }
2387#ifdef CONFIG_IP_MROUTE
2388                if (type == RTN_MULTICAST) {
2389                        if (IN_DEV_MFORWARD(in_dev) &&
2390                            !ipv4_is_local_multicast(fl4->daddr)) {
2391                                rth->dst.input = ip_mr_input;
2392                                rth->dst.output = ip_mc_output;
2393                        }
2394                }
2395#endif
2396        }
2397
2398        rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2399        lwtunnel_set_redirect(&rth->dst);
2400
2401        return rth;
2402}
2403
2404/*
2405 * Major route resolver routine.
2406 */
2407
2408struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2409                                        const struct sk_buff *skb)
2410{
2411        __u8 tos = RT_FL_TOS(fl4);
2412        struct fib_result res = {
2413                .type           = RTN_UNSPEC,
2414                .fi             = NULL,
2415                .table          = NULL,
2416                .tclassid       = 0,
2417        };
2418        struct rtable *rth;
2419
2420        fl4->flowi4_iif = LOOPBACK_IFINDEX;
2421        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2422        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2423                         RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2424
2425        rcu_read_lock();
2426        rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2427        rcu_read_unlock();
2428
2429        return rth;
2430}
2431EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2432
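/* Same as above, but the caller already holds rcu_read_lock() and gets
 * the fib_result back; the RTM_GETROUTE handler uses this to report
 * which table and nexthop actually matched.
 */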
2433struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2434                                            struct fib_result *res,
2435                                            const struct sk_buff *skb)
2436{
2437        struct net_device *dev_out = NULL;
2438        int orig_oif = fl4->flowi4_oif;
2439        unsigned int flags = 0;
2440        struct rtable *rth;
2441        int err = -ENETUNREACH;
2442
2443        if (fl4->saddr) {
2444                rth = ERR_PTR(-EINVAL);
2445                if (ipv4_is_multicast(fl4->saddr) ||
2446                    ipv4_is_lbcast(fl4->saddr) ||
2447                    ipv4_is_zeronet(fl4->saddr))
2448                        goto out;
2449
2450                /* I removed the check for oif == dev_out->oif here.
2451                   It was wrong for two reasons:
2452                   1. ip_dev_find(net, saddr) can return the wrong iface, if
2453                      saddr is assigned to multiple interfaces.
2454                   2. Moreover, we are allowed to send packets with the saddr
2455                      of another iface. --ANK
2456                 */
2457
2458                if (fl4->flowi4_oif == 0 &&
2459                    (ipv4_is_multicast(fl4->daddr) ||
2460                     ipv4_is_lbcast(fl4->daddr))) {
2461                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2462                        dev_out = __ip_dev_find(net, fl4->saddr, false);
2463                        if (!dev_out)
2464                                goto out;
2465
2466                        /* Special hack: the user can direct multicasts
2467                           and limited broadcast via the necessary interface
2468                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2469                           This hack is not just for fun, it allows
2470                           vic, vat and friends to work.
2471                           They bind a socket to loopback, set the ttl to zero
2472                           and expect that it will work.
2473                           From the viewpoint of the routing cache they are
2474                           broken, because we are not allowed to build a
2475                           multicast path with a loopback source addr (look,
2476                           the routing cache cannot know that the ttl is zero,
2477                           so the packet will not leave this host and the route
2478                           is valid). Luckily, this hack is a good workaround.
2479                         */
2480
2481                        fl4->flowi4_oif = dev_out->ifindex;
2482                        goto make_route;
2483                }
2484
2485                if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2486                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2487                        if (!__ip_dev_find(net, fl4->saddr, false))
2488                                goto out;
2489                }
2490        }
2491
2492
2493        if (fl4->flowi4_oif) {
2494                dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2495                rth = ERR_PTR(-ENODEV);
2496                if (!dev_out)
2497                        goto out;
2498
2499                /* RACE: Check return value of inet_select_addr instead. */
2500                if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2501                        rth = ERR_PTR(-ENETUNREACH);
2502                        goto out;
2503                }
2504                if (ipv4_is_local_multicast(fl4->daddr) ||
2505                    ipv4_is_lbcast(fl4->daddr) ||
2506                    fl4->flowi4_proto == IPPROTO_IGMP) {
2507                        if (!fl4->saddr)
2508                                fl4->saddr = inet_select_addr(dev_out, 0,
2509                                                              RT_SCOPE_LINK);
2510                        goto make_route;
2511                }
2512                if (!fl4->saddr) {
2513                        if (ipv4_is_multicast(fl4->daddr))
2514                                fl4->saddr = inet_select_addr(dev_out, 0,
2515                                                              fl4->flowi4_scope);
2516                        else if (!fl4->daddr)
2517                                fl4->saddr = inet_select_addr(dev_out, 0,
2518                                                              RT_SCOPE_HOST);
2519                }
2520        }
2521
2522        if (!fl4->daddr) {
2523                fl4->daddr = fl4->saddr;
2524                if (!fl4->daddr)
2525                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2526                dev_out = net->loopback_dev;
2527                fl4->flowi4_oif = LOOPBACK_IFINDEX;
2528                res->type = RTN_LOCAL;
2529                flags |= RTCF_LOCAL;
2530                goto make_route;
2531        }
2532
2533        err = fib_lookup(net, fl4, res, 0);
2534        if (err) {
2535                res->fi = NULL;
2536                res->table = NULL;
2537                if (fl4->flowi4_oif &&
2538                    (ipv4_is_multicast(fl4->daddr) ||
2539                    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2540                        /* Apparently, the routing tables are wrong.
2541                           Assume that the destination is on-link.
2542
2543                           WHY? DW.
2544                           Because we are allowed to send to an iface
2545                           even if it has NO routes and NO assigned
2546                           addresses. When oif is specified, the routing
2547                           tables are looked up with only one purpose:
2548                           to check whether the destination is gatewayed,
2549                           rather than direct. Moreover, if MSG_DONTROUTE is
2550                           set, we send the packet, ignoring both the routing
2551                           tables and the ifaddr state. --ANK
2552
2553
2554                           We could do this even if oif is unknown
2555                           (as IPv6 likely does), but we do not.
2556                         */
2557
2558                        if (fl4->saddr == 0)
2559                                fl4->saddr = inet_select_addr(dev_out, 0,
2560                                                              RT_SCOPE_LINK);
2561                        res->type = RTN_UNICAST;
2562                        goto make_route;
2563                }
2564                rth = ERR_PTR(err);
2565                goto out;
2566        }
2567
2568        if (res->type == RTN_LOCAL) {
2569                if (!fl4->saddr) {
2570                        if (res->fi->fib_prefsrc)
2571                                fl4->saddr = res->fi->fib_prefsrc;
2572                        else
2573                                fl4->saddr = fl4->daddr;
2574                }
2575
2576                /* L3 master device is the loopback for that domain */
2577                dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2578                        net->loopback_dev;
2579
2580                /* make sure orig_oif points to fib result device even
2581                 * though packet rx/tx happens over loopback or l3mdev
2582                 */
2583                orig_oif = FIB_RES_OIF(*res);
2584
2585                fl4->flowi4_oif = dev_out->ifindex;
2586                flags |= RTCF_LOCAL;
2587                goto make_route;
2588        }
2589
2590        fib_select_path(net, res, fl4, skb);
2591
2592        dev_out = FIB_RES_DEV(*res);
2593        fl4->flowi4_oif = dev_out->ifindex;
2594
2595
2596make_route:
2597        rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2598
2599out:
2600        return rth;
2601}
2602
2603static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2604{
2605        return NULL;
2606}
2607
2608static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2609{
2610        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2611
2612        return mtu ? : dst->dev->mtu;
2613}
2614
2615static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2616                                          struct sk_buff *skb, u32 mtu)
2617{
2618}
2619
2620static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2621                                       struct sk_buff *skb)
2622{
2623}
2624
2625static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2626                                          unsigned long old)
2627{
2628        return NULL;
2629}
2630
2631static struct dst_ops ipv4_dst_blackhole_ops = {
2632        .family                 =       AF_INET,
2633        .check                  =       ipv4_blackhole_dst_check,
2634        .mtu                    =       ipv4_blackhole_mtu,
2635        .default_advmss         =       ipv4_default_advmss,
2636        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2637        .redirect               =       ipv4_rt_blackhole_redirect,
2638        .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2639        .neigh_lookup           =       ipv4_neigh_lookup,
2640};
2641
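/* Convert @dst_orig into a blackhole dst: all handlers become no-ops
 * and both input and output simply discard packets.  xfrm uses this
 * when traffic has to be held back (e.g. while IPsec states are still
 * being resolved) without handing the caller a usable route.
 */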
2642struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2643{
2644        struct rtable *ort = (struct rtable *) dst_orig;
2645        struct rtable *rt;
2646
2647        rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2648        if (rt) {
2649                struct dst_entry *new = &rt->dst;
2650
2651                new->__use = 1;
2652                new->input = dst_discard;
2653                new->output = dst_discard_out;
2654
2655                new->dev = net->loopback_dev;
2656                if (new->dev)
2657                        dev_hold(new->dev);
2658
2659                rt->rt_is_input = ort->rt_is_input;
2660                rt->rt_iif = ort->rt_iif;
2661                rt->rt_pmtu = ort->rt_pmtu;
2662                rt->rt_mtu_locked = ort->rt_mtu_locked;
2663
2664                rt->rt_genid = rt_genid_ipv4(net);
2665                rt->rt_flags = ort->rt_flags;
2666                rt->rt_type = ort->rt_type;
2667                rt->rt_gw_family = ort->rt_gw_family;
2668                if (rt->rt_gw_family == AF_INET)
2669                        rt->rt_gw4 = ort->rt_gw4;
2670                else if (rt->rt_gw_family == AF_INET6)
2671                        rt->rt_gw6 = ort->rt_gw6;
2672
2673                INIT_LIST_HEAD(&rt->rt_uncached);
2674        }
2675
2676        dst_release(dst_orig);
2677
2678        return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2679}
2680
2681struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2682                                    const struct sock *sk)
2683{
2684        struct rtable *rt = __ip_route_output_key(net, flp4);
2685
2686        if (IS_ERR(rt))
2687                return rt;
2688
2689        if (flp4->flowi4_proto)
2690                rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2691                                                        flowi4_to_flowi(flp4),
2692                                                        sk, 0);
2693
2694        return rt;
2695}
2696EXPORT_SYMBOL_GPL(ip_route_output_flow);
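
/* A usage sketch for a connect()-style caller (the flowi4 setup is
 * abbreviated and the variable names are illustrative):
 *
 *	struct flowi4 fl4 = {
 *		.daddr = remote_addr,
 *		.flowi4_proto = IPPROTO_UDP,
 *	};
 *	struct rtable *rt;
 *
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 *
 * Setting flowi4_proto is what routes the result through
 * xfrm_lookup_route() above, so IPsec policies are honoured.
 */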
2697
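/* Translate a resolved rtable into an RTM_NEWROUTE netlink message:
 * flags, gateway, metrics, expiry and, for input routes, the incoming
 * interface or multicast forwarding state.
 */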
2698/* called with rcu_read_lock held */
2699static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2700                        struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2701                        struct sk_buff *skb, u32 portid, u32 seq)
2702{
2703        struct rtmsg *r;
2704        struct nlmsghdr *nlh;
2705        unsigned long expires = 0;
2706        u32 error;
2707        u32 metrics[RTAX_MAX];
2708
2709        nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2710        if (!nlh)
2711                return -EMSGSIZE;
2712
2713        r = nlmsg_data(nlh);
2714        r->rtm_family    = AF_INET;
2715        r->rtm_dst_len  = 32;
2716        r->rtm_src_len  = 0;
2717        r->rtm_tos      = fl4->flowi4_tos;
2718        r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2719        if (nla_put_u32(skb, RTA_TABLE, table_id))
2720                goto nla_put_failure;
2721        r->rtm_type     = rt->rt_type;
2722        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2723        r->rtm_protocol = RTPROT_UNSPEC;
2724        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2725        if (rt->rt_flags & RTCF_NOTIFY)
2726                r->rtm_flags |= RTM_F_NOTIFY;
2727        if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2728                r->rtm_flags |= RTCF_DOREDIRECT;
2729
2730        if (nla_put_in_addr(skb, RTA_DST, dst))
2731                goto nla_put_failure;
2732        if (src) {
2733                r->rtm_src_len = 32;
2734                if (nla_put_in_addr(skb, RTA_SRC, src))
2735                        goto nla_put_failure;
2736        }
2737        if (rt->dst.dev &&
2738            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2739                goto nla_put_failure;
2740#ifdef CONFIG_IP_ROUTE_CLASSID
2741        if (rt->dst.tclassid &&
2742            nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2743                goto nla_put_failure;
2744#endif
2745        if (!rt_is_input_route(rt) &&
2746            fl4->saddr != src) {
2747                if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2748                        goto nla_put_failure;
2749        }
2750        if (rt->rt_gw_family == AF_INET &&
2751            nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2752                goto nla_put_failure;
2753        } else if (rt->rt_gw_family == AF_INET6) {
2754                int alen = sizeof(struct in6_addr);
2755                struct nlattr *nla;
2756                struct rtvia *via;
2757
2758                nla = nla_reserve(skb, RTA_VIA, alen + 2);
2759                if (!nla)
2760                        goto nla_put_failure;
2761
2762                via = nla_data(nla);
2763                via->rtvia_family = AF_INET6;
2764                memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2765        }
2766
2767        expires = rt->dst.expires;
2768        if (expires) {
2769                unsigned long now = jiffies;
2770
2771                if (time_before(now, expires))
2772                        expires -= now;
2773                else
2774                        expires = 0;
2775        }
2776
2777        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2778        if (rt->rt_pmtu && expires)
2779                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2780        if (rt->rt_mtu_locked && expires)
2781                metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2782        if (rtnetlink_put_metrics(skb, metrics) < 0)
2783                goto nla_put_failure;
2784
2785        if (fl4->flowi4_mark &&
2786            nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2787                goto nla_put_failure;
2788
2789        if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2790            nla_put_u32(skb, RTA_UID,
2791                        from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2792                goto nla_put_failure;
2793
2794        error = rt->dst.error;
2795
2796        if (rt_is_input_route(rt)) {
2797#ifdef CONFIG_IP_MROUTE
2798                if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2799                    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2800                        int err = ipmr_get_route(net, skb,
2801                                                 fl4->saddr, fl4->daddr,
2802                                                 r, portid);
2803
2804                        if (err <= 0) {
2805                                if (err == 0)
2806                                        return 0;
2807                                goto nla_put_failure;
2808                        }
2809                } else
2810#endif
2811                        if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2812                                goto nla_put_failure;
2813        }
2814
2815        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2816                goto nla_put_failure;
2817
2818        nlmsg_end(skb, nlh);
2819        return 0;
2820
2821nla_put_failure:
2822        nlmsg_cancel(skb, nlh);
2823        return -EMSGSIZE;
2824}
2825
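/* Build a dummy skb carrying just an IP header plus a skeletal UDP,
 * TCP or ICMP header, so that an RTM_GETROUTE request can be pushed
 * through the real input path and hit the same fib-rule and multipath
 * decisions a live packet would.
 */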
2826static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2827                                                   u8 ip_proto, __be16 sport,
2828                                                   __be16 dport)
2829{
2830        struct sk_buff *skb;
2831        struct iphdr *iph;
2832
2833        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2834        if (!skb)
2835                return NULL;
2836
2837        /* Reserve room for dummy headers; this skb can pass
2838         * through a good chunk of the routing engine.
2839         */
2840        skb_reset_mac_header(skb);
2841        skb_reset_network_header(skb);
2842        skb->protocol = htons(ETH_P_IP);
2843        iph = skb_put(skb, sizeof(struct iphdr));
2844        iph->protocol = ip_proto;
2845        iph->saddr = src;
2846        iph->daddr = dst;
2847        iph->version = 0x4;
2848        iph->frag_off = 0;
2849        iph->ihl = 0x5;
2850        skb_set_transport_header(skb, skb->len);
2851
2852        switch (iph->protocol) {
2853        case IPPROTO_UDP: {
2854                struct udphdr *udph;
2855
2856                udph = skb_put_zero(skb, sizeof(struct udphdr));
2857                udph->source = sport;
2858                udph->dest = dport;
2859                udph->len = sizeof(struct udphdr);
2860                udph->check = 0;
2861                break;
2862        }
2863        case IPPROTO_TCP: {
2864                struct tcphdr *tcph;
2865
2866                tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2867                tcph->source    = sport;
2868                tcph->dest      = dport;
2869                tcph->doff      = sizeof(struct tcphdr) / 4;
2870                tcph->rst = 1;
2871                tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2872                                            src, dst, 0);
2873                break;
2874        }
2875        case IPPROTO_ICMP: {
2876                struct icmphdr *icmph;
2877
2878                icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2879                icmph->type = ICMP_ECHO;
2880                icmph->code = 0;
2881        }
2882        }
2883
2884        return skb;
2885}
2886
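/* Strict validation of an RTM_GETROUTE request: when strict checking
 * is enabled, only /32 (or absent) src/dst lengths, a known set of
 * flags and the attributes this handler actually understands are
 * accepted.
 */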
2887static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
2888                                       const struct nlmsghdr *nlh,
2889                                       struct nlattr **tb,
2890                                       struct netlink_ext_ack *extack)
2891{
2892        struct rtmsg *rtm;
2893        int i, err;
2894
2895        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
2896                NL_SET_ERR_MSG(extack,
2897                               "ipv4: Invalid header for route get request");
2898                return -EINVAL;
2899        }
2900
2901        if (!netlink_strict_get_check(skb))
2902                return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
2903                                              rtm_ipv4_policy, extack);
2904
2905        rtm = nlmsg_data(nlh);
2906        if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
2907            (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
2908            rtm->rtm_table || rtm->rtm_protocol ||
2909            rtm->rtm_scope || rtm->rtm_type) {
2910                NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
2911                return -EINVAL;
2912        }
2913
2914        if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
2915                               RTM_F_LOOKUP_TABLE |
2916                               RTM_F_FIB_MATCH)) {
2917                NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
2918                return -EINVAL;
2919        }
2920
2921        err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
2922                                            rtm_ipv4_policy, extack);
2923        if (err)
2924                return err;
2925
2926        if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
2927            (tb[RTA_DST] && !rtm->rtm_dst_len)) {
2928                NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
2929                return -EINVAL;
2930        }
2931
2932        for (i = 0; i <= RTA_MAX; i++) {
2933                if (!tb[i])
2934                        continue;
2935
2936                switch (i) {
2937                case RTA_IIF:
2938                case RTA_OIF:
2939                case RTA_SRC:
2940                case RTA_DST:
2941                case RTA_IP_PROTO:
2942                case RTA_SPORT:
2943                case RTA_DPORT:
2944                case RTA_MARK:
2945                case RTA_UID:
2946                        break;
2947                default:
2948                        NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
2949                        return -EINVAL;
2950                }
2951        }
2952
2953        return 0;
2954}
2955
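/* RTM_GETROUTE handler: build a dummy packet from the request, resolve
 * it through the real input or output path, and report the resulting
 * route back to userspace.
 */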
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	u32 table_id = RT_TABLE_MAIN;
	__be16 sport = 0, dport = 0;
	struct fib_result res = {};
	u8 ip_proto = IPPROTO_UDP;
	struct rtable *rt = NULL;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi4 fl4 = {};
	__be32 dst = 0;
	__be32 src = 0;
	kuid_t uid;
	u32 iif;
	int err;
	int mark;

	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		return err;

	rtm = nlmsg_data(nlh);
	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &ip_proto, AF_INET, extack);
		if (err)
			return err;
	}

	if (tb[RTA_SPORT])
		sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		dport = nla_get_be16(tb[RTA_DPORT]);

	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
	if (!skb)
		return -ENOBUFS;

	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;
	if (sport)
		fl4.fl4_sport = sport;
	if (dport)
		fl4.fl4_dport = dport;
	fl4.flowi4_proto = ip_proto;

	rcu_read_lock();

	if (iif) {
		struct net_device *dev;

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_rcu;
		}

		fl4.flowi4_iif = iif; /* for rt_fill_info */
		skb->dev	= dev;
		skb->mark	= mark;
		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
					 dev, &res);

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		fl4.flowi4_iif = LOOPBACK_IFINDEX;
		skb->dev = net->loopback_dev;
		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
		else
			skb_dst_set(skb, &rt->dst);
	}

	if (err)
		goto errout_rcu;

	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = res.table ? res.table->tb_id : 0;

	/* reset skb for netlink reply msg */
	skb_trim(skb, 0);
	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_header(skb);

	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
		if (!res.fi) {
			err = fib_props[res.type].error;
			if (!err)
				err = -EHOSTUNREACH;
			goto errout_rcu;
		}
		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
				    rt->rt_type, res.prefix, res.prefixlen,
				    fl4.flowi4_tos, res.fi, 0);
	} else {
		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
	}
	if (err < 0)
		goto errout_rcu;

	rcu_read_unlock();

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
	return err;
errout_rcu:
	rcu_read_unlock();
	kfree_skb(skb);
	goto errout_free;
}

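/* A device's IPv4 multicast configuration changed; flush all cached
 * routes in its netns.
 */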
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}

#ifdef CONFIG_SYSCTL
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
static int ip_rt_gc_elasticity __read_mostly    = 8;
static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;

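/* Write-only sysctl handler: writing any value to .../route/flush
 * flushes the netns route cache and bumps the fnhe genid, invalidating
 * cached next-hop exceptions as well.  Reads are rejected.
 */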
static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)__ctl->extra1;

	if (write) {
		rt_cache_flush(net);
		fnhe_genid_bump(net);
		return 0;
	}

	return -EINVAL;
}

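/* Global tunables exposed under net.ipv4.route.*; registered for
 * init_net from ip_static_sysctl_init() below.
 */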
static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/*  Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &ip_min_valid_pmtu,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

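/* The write-only "flush" entry; duplicated per netns in
 * sysctl_route_net_init() so .extra1 can carry the netns pointer.
 */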
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

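/* Register net.ipv4.route.flush for a netns.  Non-init netns get a
 * private copy of the table, and the entry is hidden from namespaces
 * owned by unprivileged users.
 */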
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			tbl[0].procname = NULL;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

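/* Seed the per-netns generation counters; bumping rt_genid or
 * fnhe_genid later invalidates cached routes and exceptions in bulk.
 */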
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

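/* Per-netns inetpeer base; the tree caches long-lived state about
 * remote IPv4 peers.
 */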
static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

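/* Boot-time initialization of the IPv4 routing core: IP ID arrays,
 * per-cpu uncached lists, dst slab caches and counters, /proc files,
 * the RTM_GETROUTE handler and the pernet subsystems above.
 */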
int __init ip_rt_init(void)
{
	int cpu;

	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
				  GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
	if (!ip_tstamps)
		panic("IP: failed to allocate ip_tstamps\n");

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif