linux/net/ipv4/route.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/memblock.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu,
                                           bool confirm_neigh);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
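
/* For illustration: the consumer of this table is rt_tos2priority() in
 * <net/route.h>, which drops the low (ECN) bit and indexes the sixteen
 * entries above:
 *
 *      static inline char rt_tos2priority(u8 tos)
 *      {
 *              return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *      }
 *
 * so e.g. skb->priority = rt_tos2priority(iph->tos) maps IPTOS_LOWDELAY
 * traffic onto TC_PRIO_INTERACTIVE.
 */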

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
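
/* Usage sketch: the slow paths later in this file bump these counters
 * with e.g. RT_CACHE_STAT_INC(in_slow_tot). raw_cpu_inc() is presumably
 * acceptable here without preemption protection because these are pure
 * statistics, where a rarely lost increment does not matter.
 */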

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", 0444, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", 0444,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create_single("rt_acct", 0, net->proc_net,
                        rt_acct_proc_show);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}
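
/* Flushing is O(1): bumping the per-netns generation id makes
 * rt_is_expired() true for every rtable created before the bump, so the
 * generic dst layer drops each stale entry the next time it validates
 * it. Illustrative sequence (a sketch, not extra logic):
 *
 *      rt_cache_flush(net);            // rt_genid_bump_ipv4(net)
 *      dst = dst_check(dst, 0);        // -> ipv4_dst_check() -> NULL
 */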

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;

        rt = (const struct rtable *)dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *)&rt->rt_gateway;
        else if (!daddr ||
                 (rt->rt_flags &
                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
                return;

        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

/* Hash tables of size 2048..262144 depending on RAM size.
 * Each bucket uses 8 bytes.
 */
static u32 ip_idents_mask __read_mostly;
static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;
/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 bucket, old, now = (u32)jiffies;
        atomic_t *p_id;
        u32 *p_tstamp;
        u32 delta = 0;

        bucket = hash & ip_idents_mask;
        p_tstamp = ip_tstamps + bucket;
        p_id = ip_idents + bucket;
        old = READ_ONCE(*p_tstamp);

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* If UBSAN reports an error here, make sure your compiler
         * supports -fno-strict-overflow before reporting it as a bug
         * in UBSAN; that bug was fixed in GCC 8.
         */
        return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
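
/* Worked example of the perturbation above: if a bucket sat idle for
 * 250 jiffies (old = 1000, now = 1250), the reservation adds a random
 * delta in [0, 250) on top of the segment count:
 *
 *      delta = prandom_u32_max(250);   // 0..249
 *      id = atomic_add_return(segs + delta, p_id) - segs;
 *
 * so the ID gap an observer sees between two probes no longer equals
 * the number of packets sent in between.
 */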

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        u32 hash, id;

        /* Note the following code is not race-free: the key may be
         * initialized concurrently on several CPUs. That is acceptable
         * here; a transiently inconsistent key is harmless for an ID
         * generator.
         */
        if (unlikely(siphash_key_is_zero(&net->ipv4_ip_id_key)))
                get_random_bytes(&net->ipv4_ip_id_key,
                                 sizeof(net->ipv4_ip_id_key));

        hash = siphash_3u32((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            &net->ipv4_ip_id_key);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
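
/* Caller sketch (hedged; see ip_select_ident_segs() in <net/ip.h> for
 * the exact policy): sockets that can use a cheap per-socket counter
 * do so, everything else falls back to the hashed generator above:
 *
 *      __ip_select_ident(net, ip_hdr(skb), segs);      // fills iph->id
 */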

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
}

static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
        struct fib_nh_exception *fnhe, *oldest = NULL;

        for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
                fnhe = rcu_dereference_protected(*fnhe_p,
                                                 lockdep_is_held(&fnhe_lock));
                if (!fnhe)
                        break;
                if (!oldest ||
                    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
                        oldest = fnhe;
                        oldest_p = fnhe_p;
                }
        }
        fnhe_flush_routes(oldest);
        *oldest_p = oldest->fnhe_next;
        kfree_rcu(oldest, rcu);
}

static u32 fnhe_hashfun(__be32 daddr)
{
        static siphash_key_t fnhe_hash_key __read_mostly;
        u64 hval;

        net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
        hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
        return hash_64(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, bool lock, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        u32 genid, hval;
        unsigned int i;
        int depth;

        genid = fnhe_genid(dev_net(nh->nh_dev));
        hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nh->nh_exceptions);
        if (!hash) {
                hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nh->nh_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (fnhe->fnhe_genid != genid)
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_mtu_locked = lock;
                }
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                /* Randomize the max depth to avoid side-channel attacks. */
                int max_depth = FNHE_RECLAIM_DEPTH +
                                prandom_u32_max(FNHE_RECLAIM_DEPTH);

                while (depth > max_depth) {
                        fnhe_remove_oldest(hash);
                        depth--;
                }

                fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                if (!fnhe)
                        goto out_unlock;

                fnhe->fnhe_next = hash->chain;

                fnhe->fnhe_genid = genid;
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_mtu_locked = lock;
                fnhe->fnhe_expires = max(1UL, expires);

                rcu_assign_pointer(hash->chain, fnhe);

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh *nh;

                                fib_select_path(net, &res, fl4, skb);
                                nh = &FIB_RES_NH(res);
                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                0, false,
                                                jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct net *net = dev_net(skb->dev);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
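
/* Worked numbers with the defaults from the top of this file: the next
 * redirect is sent only after
 * peer->rate_last + (ip_rt_redirect_load << peer->n_redirects) has
 * passed, so the gaps grow as (HZ/50) << 1, << 2, ... (40ms, 80ms,
 * 160ms, ...) until nine redirects have been ignored; we then stay
 * silent until ip_rt_redirect_silence ((HZ/50) << 10, roughly 20s)
 * without redirect-worthy traffic resets the counters.
 */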

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
                peer->rate_tokens = 0;
                peer->n_redirects = 0;
        }

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the last seen redirected packet.
         */
        if (peer->n_redirects >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->n_redirects == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->n_redirects)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->n_redirects == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        if (netif_is_l3_master(skb->dev)) {
                dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
                if (!dev)
                        goto out;
        }

        in_dev = __in_dev_get_rcu(dev);

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}
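
/* The peer limiter above is a token bucket: tokens accrue one per
 * jiffy since rate_last, are capped at ip_rt_error_burst (5 * HZ), and
 * each ICMP error spends ip_rt_error_cost (HZ) of them -- i.e. bursts
 * of up to five errors per peer, sustained at about one per second.
 */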

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct net *net = dev_net(dst->dev);
        u32 old_mtu = ipv4_mtu(dst);
        struct fib_result res;
        bool lock = false;

        if (ip_mtu_locked(dst))
                return;

        if (old_mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu) {
                lock = true;
                mtu = min(old_mtu, ip_rt_min_pmtu);
        }

        if (rt->rt_pmtu == mtu && !lock &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(net, fl4, &res, 0) == 0) {
                struct fib_nh *nh;

                fib_select_path(net, &res, fl4, NULL);
                nh = &FIB_RES_NH(res);
                update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu,
                              bool confirm_neigh)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);

        /* Don't make lookup fail for bridged encapsulations */
        if (skb && netif_is_any_bridge_port(skb->dev))
                fl4.flowi4_oif = 0;

        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        if (!mark)
                mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
        struct ip_options opt;
        int res;

        /* Recompile ip options since IPCB may not be valid anymore.
         * Also check we have a reasonable ipv4 header.
         */
        if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
            ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
                return;

        memset(&opt, 0, sizeof(opt));
        if (ip_hdr(skb)->ihl > 5) {
                if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
                        return;
                opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

                rcu_read_lock();
                res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
                rcu_read_unlock();

                if (res)
                        return;
        }
        __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        ipv4_send_dest_unreach(skb);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" may be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
                                    ip_rt_min_advmss);

        return min(advmss, IPV4_MAX_PMTU - header_size);
}
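
/* Example with concrete numbers: for a 1500-byte MTU the advertised
 * MSS is 1500 - (20 + 20) = 1460 bytes, clamped to no less than
 * ip_rt_min_advmss (256) and no more than IPV4_MAX_PMTU minus the
 * 40-byte IP+TCP headers.
 */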

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = READ_ONCE(dst->dev->mtu);

        if (unlikely(ip_mtu_locked(dst))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference_protected(nh->nh_exceptions,
                                         lockdep_is_held(&fnhe_lock));
        hash += hval;

        fnhe_p = &hash->chain;
        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
        while (fnhe) {
                if (fnhe->fnhe_daddr == daddr) {
                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
                        /* set fnhe_daddr to 0 to ensure it won't bind with
                         * new dsts in rt_bind_exception().
                         */
                        fnhe->fnhe_daddr = 0;
                        fnhe_flush_routes(fnhe);
                        kfree_rcu(fnhe, rcu);
                        break;
                }
                fnhe_p = &fnhe->fnhe_next;
                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
                                                 lockdep_is_held(&fnhe_lock));
        }

        spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr) {
                        if (fnhe->fnhe_expires &&
                            time_after(jiffies, fnhe->fnhe_expires)) {
                                ip_del_fnhe(nh, daddr);
                                break;
                        }
                        return fnhe;
                }
        }
        return NULL;
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */
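
/* Pseudo-code sketch of ip_mtu_from_fib_result() below, mirroring the
 * selection order above (locked_or_fwd_use_pmtu and fnhe_valid() are
 * shorthand for this sketch, not real helpers):
 *
 *      u32 mtu = locked_or_fwd_use_pmtu ? fi->fib_mtu : 0;
 *      if (!mtu && fnhe_valid(daddr))
 *              mtu = fnhe->fnhe_pmtu;
 *      if (!mtu)
 *              mtu = min(dev->mtu, IP_MAX_MTU);
 *      return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu);
 */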
1417
1418u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1419{
1420        struct fib_info *fi = res->fi;
1421        struct fib_nh *nh = &fi->fib_nh[res->nh_sel];
1422        struct net_device *dev = nh->nh_dev;
1423        u32 mtu = 0;
1424
1425        if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1426            fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1427                mtu = fi->fib_mtu;
1428
1429        if (likely(!mtu)) {
1430                struct fib_nh_exception *fnhe;
1431
1432                fnhe = find_exception(nh, daddr);
1433                if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1434                        mtu = fnhe->fnhe_pmtu;
1435        }
1436
1437        if (likely(!mtu))
1438                mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1439
1440        return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu);
1441}
1442
1443static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1444                              __be32 daddr, const bool do_cache)
1445{
1446        bool ret = false;
1447
1448        spin_lock_bh(&fnhe_lock);
1449
1450        if (daddr == fnhe->fnhe_daddr) {
1451                struct rtable __rcu **porig;
1452                struct rtable *orig;
1453                int genid = fnhe_genid(dev_net(rt->dst.dev));
1454
1455                if (rt_is_input_route(rt))
1456                        porig = &fnhe->fnhe_rth_input;
1457                else
1458                        porig = &fnhe->fnhe_rth_output;
1459                orig = rcu_dereference(*porig);
1460
1461                if (fnhe->fnhe_genid != genid) {
1462                        fnhe->fnhe_genid = genid;
1463                        fnhe->fnhe_gw = 0;
1464                        fnhe->fnhe_pmtu = 0;
1465                        fnhe->fnhe_expires = 0;
1466                        fnhe->fnhe_mtu_locked = false;
1467                        fnhe_flush_routes(fnhe);
1468                        orig = NULL;
1469                }
1470                fill_route_from_fnhe(rt, fnhe);
1471                if (!rt->rt_gateway)
1472                        rt->rt_gateway = daddr;
1473
1474                if (do_cache) {
1475                        dst_hold(&rt->dst);
1476                        rcu_assign_pointer(*porig, rt);
1477                        if (orig) {
1478                                dst_dev_put(&orig->dst);
1479                                dst_release(&orig->dst);
1480                        }
1481                        ret = true;
1482                }
1483
1484                fnhe->fnhe_stamp = jiffies;
1485        }
1486        spin_unlock_bh(&fnhe_lock);
1487
1488        return ret;
1489}
1490
1491static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1492{
1493        struct rtable *orig, *prev, **p;
1494        bool ret = true;
1495
1496        if (rt_is_input_route(rt)) {
1497                p = (struct rtable **)&nh->nh_rth_input;
1498        } else {
1499                p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1500        }
1501        orig = *p;
1502
1503        /* hold dst before doing cmpxchg() to avoid race condition
1504         * on this dst
1505         */
1506        dst_hold(&rt->dst);
1507        prev = cmpxchg(p, orig, rt);
1508        if (prev == orig) {
1509                if (orig) {
1510                        rt_add_uncached_list(orig);
1511                        dst_release(&orig->dst);
1512                }
1513        } else {
1514                dst_release(&rt->dst);
1515                ret = false;
1516        }
1517
1518        return ret;
1519}
1520
1521struct uncached_list {
1522        spinlock_t              lock;
1523        struct list_head        head;
1524};
1525
1526static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1527
1528void rt_add_uncached_list(struct rtable *rt)
1529{
1530        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1531
1532        rt->rt_uncached_list = ul;
1533
1534        spin_lock_bh(&ul->lock);
1535        list_add_tail(&rt->rt_uncached, &ul->head);
1536        spin_unlock_bh(&ul->lock);
1537}
1538
1539void rt_del_uncached_list(struct rtable *rt)
1540{
1541        if (!list_empty(&rt->rt_uncached)) {
1542                struct uncached_list *ul = rt->rt_uncached_list;
1543
1544                spin_lock_bh(&ul->lock);
1545                list_del(&rt->rt_uncached);
1546                spin_unlock_bh(&ul->lock);
1547        }
1548}
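
/* Editor's note: routes that could not be (or were never) stored in a
 * nexthop or exception cache slot sit on these per-cpu uncached lists;
 * rt_flush_dev() below walks them so a disappearing device can be
 * retargeted to loopback without waiting for every dst to be released.
 */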
1549
1550static void ipv4_dst_destroy(struct dst_entry *dst)
1551{
1552        struct rtable *rt = (struct rtable *)dst;
1553
1554        ip_dst_metrics_put(dst);
1555        rt_del_uncached_list(rt);
1556}
1557
1558void rt_flush_dev(struct net_device *dev)
1559{
1560        struct net *net = dev_net(dev);
1561        struct rtable *rt;
1562        int cpu;
1563
1564        for_each_possible_cpu(cpu) {
1565                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1566
1567                spin_lock_bh(&ul->lock);
1568                list_for_each_entry(rt, &ul->head, rt_uncached) {
1569                        if (rt->dst.dev != dev)
1570                                continue;
1571                        rt->dst.dev = net->loopback_dev;
1572                        dev_hold(rt->dst.dev);
1573                        dev_put(dev);
1574                }
1575                spin_unlock_bh(&ul->lock);
1576        }
1577}
1578
1579static bool rt_cache_valid(const struct rtable *rt)
1580{
1581        return  rt &&
1582                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1583                !rt_is_expired(rt);
1584}
1585
1586static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1587                           const struct fib_result *res,
1588                           struct fib_nh_exception *fnhe,
1589                           struct fib_info *fi, u16 type, u32 itag,
1590                           const bool do_cache)
1591{
1592        bool cached = false;
1593
1594        if (fi) {
1595                struct fib_nh *nh = &FIB_RES_NH(*res);
1596
1597                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1598                        rt->rt_gateway = nh->nh_gw;
1599                        rt->rt_uses_gateway = 1;
1600                }
1601                ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1602
1603#ifdef CONFIG_IP_ROUTE_CLASSID
1604                rt->dst.tclassid = nh->nh_tclassid;
1605#endif
1606                rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1607                if (unlikely(fnhe))
1608                        cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1609                else if (do_cache)
1610                        cached = rt_cache_route(nh, rt);
1611                if (unlikely(!cached)) {
1612                        /* Routes we intend to cache in the nexthop exception or
1613                         * FIB nexthop have the DST_NOCACHE bit clear.
1614                         * However, if we are unsuccessful at storing this
1615                         * route in the cache, we really need to set it.
1616                         */
1617                        if (!rt->rt_gateway)
1618                                rt->rt_gateway = daddr;
1619                        rt_add_uncached_list(rt);
1620                }
1621        } else
1622                rt_add_uncached_list(rt);
1623
1624#ifdef CONFIG_IP_ROUTE_CLASSID
1625#ifdef CONFIG_IP_MULTIPLE_TABLES
1626        set_class_tag(rt, res->tclassid);
1627#endif
1628        set_class_tag(rt, itag);
1629#endif
1630}
1631
1632struct rtable *rt_dst_alloc(struct net_device *dev,
1633                            unsigned int flags, u16 type,
1634                            bool nopolicy, bool noxfrm, bool will_cache)
1635{
1636        struct rtable *rt;
1637
1638        rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1639                       (will_cache ? 0 : DST_HOST) |
1640                       (nopolicy ? DST_NOPOLICY : 0) |
1641                       (noxfrm ? DST_NOXFRM : 0));
1642
1643        if (rt) {
1644                rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1645                rt->rt_flags = flags;
1646                rt->rt_type = type;
1647                rt->rt_is_input = 0;
1648                rt->rt_iif = 0;
1649                rt->rt_pmtu = 0;
1650                rt->rt_mtu_locked = 0;
1651                rt->rt_gateway = 0;
1652                rt->rt_uses_gateway = 0;
1653                INIT_LIST_HEAD(&rt->rt_uncached);
1654
1655                rt->dst.output = ip_output;
1656                if (flags & RTCF_LOCAL)
1657                        rt->dst.input = ip_local_deliver;
1658        }
1659
1660        return rt;
1661}
1662EXPORT_SYMBOL(rt_dst_alloc);
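
/* Editor's note: a hedged usage sketch (not part of route.c).  The flag
 * and type choices below are placeholders; real callers in this file
 * derive them from in_device settings and the FIB result.
 */
static struct rtable *example_alloc_local_rt(struct net_device *dev)
{
	struct rtable *rt;

	rt = rt_dst_alloc(dev, RTCF_LOCAL, RTN_LOCAL,
			  false,	/* nopolicy: keep IPsec policy checks */
			  false,	/* noxfrm: allow xfrm transformation */
			  false);	/* will_cache: do not cache this route */
	if (rt)
		rt->rt_is_input = 1;	/* mark as an input (receive) route */
	return rt;
}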
1663
1664/* called in rcu_read_lock() section */
1665int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1666                          u8 tos, struct net_device *dev,
1667                          struct in_device *in_dev, u32 *itag)
1668{
1669        int err;
1670
1671        /* Primary sanity checks. */
1672        if (!in_dev)
1673                return -EINVAL;
1674
1675        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1676            skb->protocol != htons(ETH_P_IP))
1677                return -EINVAL;
1678
1679        if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1680                return -EINVAL;
1681
1682        if (ipv4_is_zeronet(saddr)) {
1683                if (!ipv4_is_local_multicast(daddr))
1684                        return -EINVAL;
1685        } else {
1686                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1687                                          in_dev, itag);
1688                if (err < 0)
1689                        return err;
1690        }
1691        return 0;
1692}
1693
1694/* called in rcu_read_lock() section */
1695static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1696                             u8 tos, struct net_device *dev, int our)
1697{
1698        struct in_device *in_dev = __in_dev_get_rcu(dev);
1699        unsigned int flags = RTCF_MULTICAST;
1700        struct rtable *rth;
1701        u32 itag = 0;
1702        int err;
1703
1704        err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1705        if (err)
1706                return err;
1707
1708        if (our)
1709                flags |= RTCF_LOCAL;
1710
1711        rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1712                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1713        if (!rth)
1714                return -ENOBUFS;
1715
1716#ifdef CONFIG_IP_ROUTE_CLASSID
1717        rth->dst.tclassid = itag;
1718#endif
1719        rth->dst.output = ip_rt_bug;
1720        rth->rt_is_input = 1;
1721
1722#ifdef CONFIG_IP_MROUTE
1723        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1724                rth->dst.input = ip_mr_input;
1725#endif
1726        RT_CACHE_STAT_INC(in_slow_mc);
1727
1728        skb_dst_set(skb, &rth->dst);
1729        return 0;
1730}
1731
1732
1733static void ip_handle_martian_source(struct net_device *dev,
1734                                     struct in_device *in_dev,
1735                                     struct sk_buff *skb,
1736                                     __be32 daddr,
1737                                     __be32 saddr)
1738{
1739        RT_CACHE_STAT_INC(in_martian_src);
1740#ifdef CONFIG_IP_ROUTE_VERBOSE
1741        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1742                /*
1743                 *      RFC1812 recommendation: if the source is martian,
1744                 *      the only hint is the MAC header.
1745                 */
1746                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1747                        &daddr, &saddr, dev->name);
1748                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1749                        print_hex_dump(KERN_WARNING, "ll header: ",
1750                                       DUMP_PREFIX_OFFSET, 16, 1,
1751                                       skb_mac_header(skb),
1752                                       dev->hard_header_len, true);
1753                }
1754        }
1755#endif
1756}
1757
1758/* called in rcu_read_lock() section */
1759static int __mkroute_input(struct sk_buff *skb,
1760                           const struct fib_result *res,
1761                           struct in_device *in_dev,
1762                           __be32 daddr, __be32 saddr, u32 tos)
1763{
1764        struct fib_nh_exception *fnhe;
1765        struct rtable *rth;
1766        int err;
1767        struct in_device *out_dev;
1768        bool do_cache;
1769        u32 itag = 0;
1770
1771        /* get a working reference to the output device */
1772        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1773        if (!out_dev) {
1774                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1775                return -EINVAL;
1776        }
1777
1778        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1779                                  in_dev->dev, in_dev, &itag);
1780        if (err < 0) {
1781                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1782                                         saddr);
1783
1784                goto cleanup;
1785        }
1786
1787        do_cache = res->fi && !itag;
1788        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1789            skb->protocol == htons(ETH_P_IP) &&
1790            (IN_DEV_SHARED_MEDIA(out_dev) ||
1791             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1792                IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1793
1794        if (skb->protocol != htons(ETH_P_IP)) {
1795                /* Not IP (i.e. ARP). Do not create a route if it is
1796                 * invalid for proxy ARP. DNAT routes are always valid.
1797                 *
1798                 * The proxy ARP feature has been extended to allow ARP
1799                 * replies back out the same interface, to support
1800                 * Private VLAN switch technologies. See arp.c.
1801                 */
1802                if (out_dev == in_dev &&
1803                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1804                        err = -EINVAL;
1805                        goto cleanup;
1806                }
1807        }
1808
1809        fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1810        if (do_cache) {
1811                if (fnhe)
1812                        rth = rcu_dereference(fnhe->fnhe_rth_input);
1813                else
1814                        rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1815                if (rt_cache_valid(rth)) {
1816                        skb_dst_set_noref(skb, &rth->dst);
1817                        goto out;
1818                }
1819        }
1820
1821        rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1822                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
1823                           IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1824        if (!rth) {
1825                err = -ENOBUFS;
1826                goto cleanup;
1827        }
1828
1829        rth->rt_is_input = 1;
1830        RT_CACHE_STAT_INC(in_slow_tot);
1831
1832        rth->dst.input = ip_forward;
1833
1834        rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1835                       do_cache);
1836        lwtunnel_set_redirect(&rth->dst);
1837        skb_dst_set(skb, &rth->dst);
1838out:
1839        err = 0;
1840 cleanup:
1841        return err;
1842}
1843
1844#ifdef CONFIG_IP_ROUTE_MULTIPATH
1845/* To make ICMP packets follow the right flow, the multipath hash is
1846 * calculated from the inner IP addresses.
1847 */
1848static void ip_multipath_l3_keys(const struct sk_buff *skb,
1849                                 struct flow_keys *hash_keys)
1850{
1851        const struct iphdr *outer_iph = ip_hdr(skb);
1852        const struct iphdr *key_iph = outer_iph;
1853        const struct iphdr *inner_iph;
1854        const struct icmphdr *icmph;
1855        struct iphdr _inner_iph;
1856        struct icmphdr _icmph;
1857
1858        if (likely(outer_iph->protocol != IPPROTO_ICMP))
1859                goto out;
1860
1861        if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1862                goto out;
1863
1864        icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1865                                   &_icmph);
1866        if (!icmph)
1867                goto out;
1868
1869        if (!icmp_is_err(icmph->type))
1870                goto out;
1871
1872        inner_iph = skb_header_pointer(skb,
1873                                       outer_iph->ihl * 4 + sizeof(_icmph),
1874                                       sizeof(_inner_iph), &_inner_iph);
1875        if (!inner_iph)
1876                goto out;
1877
1878        key_iph = inner_iph;
1879out:
1880        hash_keys->addrs.v4addrs.src = key_iph->saddr;
1881        hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1882}
1883
1884/* If skb is set it will be used, and fl4 can be NULL. */
1885int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1886                       const struct sk_buff *skb, struct flow_keys *flkeys)
1887{
1888        struct flow_keys hash_keys;
1889        u32 mhash;
1890
1891        switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1892        case 0:
1893                memset(&hash_keys, 0, sizeof(hash_keys));
1894                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1895                if (skb) {
1896                        ip_multipath_l3_keys(skb, &hash_keys);
1897                } else {
1898                        hash_keys.addrs.v4addrs.src = fl4->saddr;
1899                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
1900                }
1901                break;
1902        case 1:
1903                /* skb is currently provided only when forwarding */
1904                if (skb) {
1905                        unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1906                        struct flow_keys keys;
1907
1908                        /* short-circuit if we already have L4 hash present */
1909                        if (skb->l4_hash)
1910                                return skb_get_hash_raw(skb) >> 1;
1911
1912                        memset(&hash_keys, 0, sizeof(hash_keys));
1913
1914                        if (!flkeys) {
1915                                skb_flow_dissect_flow_keys(skb, &keys, flag);
1916                                flkeys = &keys;
1917                        }
1918
1919                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1920                        hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1921                        hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1922                        hash_keys.ports.src = flkeys->ports.src;
1923                        hash_keys.ports.dst = flkeys->ports.dst;
1924                        hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1925                } else {
1926                        memset(&hash_keys, 0, sizeof(hash_keys));
1927                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1928                        hash_keys.addrs.v4addrs.src = fl4->saddr;
1929                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
1930                        hash_keys.ports.src = fl4->fl4_sport;
1931                        hash_keys.ports.dst = fl4->fl4_dport;
1932                        hash_keys.basic.ip_proto = fl4->flowi4_proto;
1933                }
1934                break;
1935        }
1936        mhash = flow_hash_from_keys(&hash_keys);
1937
1938        return mhash >> 1;
1939}
1940#endif /* CONFIG_IP_ROUTE_MULTIPATH */
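
/* Editor's note: in fib_multipath_hash() above, policy 0 (the default)
 * hashes on the L3 addresses only, substituting the inner header for ICMP
 * errors via ip_multipath_l3_keys(); policy 1 hashes on the 5-tuple,
 * short-circuiting to a precomputed skb L4 hash when one exists.  The
 * final ">> 1" keeps the result non-negative when used as an int.
 */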
1941
1942static int ip_mkroute_input(struct sk_buff *skb,
1943                            struct fib_result *res,
1944                            struct in_device *in_dev,
1945                            __be32 daddr, __be32 saddr, u32 tos,
1946                            struct flow_keys *hkeys)
1947{
1948#ifdef CONFIG_IP_ROUTE_MULTIPATH
1949        if (res->fi && res->fi->fib_nhs > 1) {
1950                int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1951
1952                fib_select_multipath(res, h);
1953        }
1954#endif
1955
1956        /* create a routing cache entry */
1957        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1958}
1959
1960/* get device for dst_alloc with local routes */
1961static struct net_device *ip_rt_get_dev(struct net *net,
1962                                        const struct fib_result *res)
1963{
1964        struct net_device *dev_out = res->fi ? FIB_RES_DEV(*res) : NULL;
1965
1966        return l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev;
1967}
1968
1969/*
1970 *      NOTE. We drop all packets that have a local source
1971 *      address, because every properly looped-back packet must
1972 *      already have the correct destination attached by the output routine.
1973 *
1974 *      This approach solves two big problems:
1975 *      1. Non-simplex devices are handled properly.
1976 *      2. IP spoofing attempts are filtered with a 100% guarantee.
1977 *      Called with rcu_read_lock().
1978 */
1979
1980static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1981                               u8 tos, struct net_device *dev,
1982                               struct fib_result *res)
1983{
1984        struct in_device *in_dev = __in_dev_get_rcu(dev);
1985        struct flow_keys *flkeys = NULL, _flkeys;
1986        struct net    *net = dev_net(dev);
1987        struct ip_tunnel_info *tun_info;
1988        int             err = -EINVAL;
1989        unsigned int    flags = 0;
1990        u32             itag = 0;
1991        struct rtable   *rth;
1992        struct flowi4   fl4;
1993        bool do_cache = true;
1994
1995        /* IP on this device is disabled. */
1996
1997        if (!in_dev)
1998                goto out;
1999
2000        /* Check for the weirdest martians, which cannot be detected
2001           by fib_lookup.
2002         */
2003
2004        tun_info = skb_tunnel_info(skb);
2005        if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2006                fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2007        else
2008                fl4.flowi4_tun_key.tun_id = 0;
2009        skb_dst_drop(skb);
2010
2011        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2012                goto martian_source;
2013
2014        res->fi = NULL;
2015        res->table = NULL;
2016        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2017                goto brd_input;
2018
2019        /* Accept zero addresses only for limited broadcast;
2020         * I do not even know whether to fix this or not. Waiting for complaints :-)
2021         */
2022        if (ipv4_is_zeronet(saddr))
2023                goto martian_source;
2024
2025        if (ipv4_is_zeronet(daddr))
2026                goto martian_destination;
2027
2028        /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2029         * calling it at most once when daddr and/or saddr is a loopback address.
2030         */
2031        if (ipv4_is_loopback(daddr)) {
2032                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2033                        goto martian_destination;
2034        } else if (ipv4_is_loopback(saddr)) {
2035                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2036                        goto martian_source;
2037        }
2038
2039        /*
2040         *      Now we are ready to route the packet.
2041         */
2042        fl4.flowi4_oif = 0;
2043        fl4.flowi4_iif = dev->ifindex;
2044        fl4.flowi4_mark = skb->mark;
2045        fl4.flowi4_tos = tos;
2046        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2047        fl4.flowi4_flags = 0;
2048        fl4.daddr = daddr;
2049        fl4.saddr = saddr;
2050        fl4.flowi4_uid = sock_net_uid(net, NULL);
2051
2052        if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2053                flkeys = &_flkeys;
2054        } else {
2055                fl4.flowi4_proto = 0;
2056                fl4.fl4_sport = 0;
2057                fl4.fl4_dport = 0;
2058        }
2059
2060        err = fib_lookup(net, &fl4, res, 0);
2061        if (err != 0) {
2062                if (!IN_DEV_FORWARD(in_dev))
2063                        err = -EHOSTUNREACH;
2064                goto no_route;
2065        }
2066
2067        if (res->type == RTN_BROADCAST) {
2068                if (IN_DEV_BFORWARD(in_dev))
2069                        goto make_route;
2070                /* do not cache if bc_forwarding is enabled */
2071                if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2072                        do_cache = false;
2073                goto brd_input;
2074        }
2075
2076        if (res->type == RTN_LOCAL) {
2077                err = fib_validate_source(skb, saddr, daddr, tos,
2078                                          0, dev, in_dev, &itag);
2079                if (err < 0)
2080                        goto martian_source;
2081                goto local_input;
2082        }
2083
2084        if (!IN_DEV_FORWARD(in_dev)) {
2085                err = -EHOSTUNREACH;
2086                goto no_route;
2087        }
2088        if (res->type != RTN_UNICAST)
2089                goto martian_destination;
2090
2091make_route:
2092        err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2093out:    return err;
2094
2095brd_input:
2096        if (skb->protocol != htons(ETH_P_IP))
2097                goto e_inval;
2098
2099        if (!ipv4_is_zeronet(saddr)) {
2100                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2101                                          in_dev, &itag);
2102                if (err < 0)
2103                        goto martian_source;
2104        }
2105        flags |= RTCF_BROADCAST;
2106        res->type = RTN_BROADCAST;
2107        RT_CACHE_STAT_INC(in_brd);
2108
2109local_input:
2110        do_cache &= res->fi && !itag;
2111        if (do_cache) {
2112                rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2113                if (rt_cache_valid(rth)) {
2114                        skb_dst_set_noref(skb, &rth->dst);
2115                        err = 0;
2116                        goto out;
2117                }
2118        }
2119
2120        rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2121                           flags | RTCF_LOCAL, res->type,
2122                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2123        if (!rth)
2124                goto e_nobufs;
2125
2126        rth->dst.output = ip_rt_bug;
2127#ifdef CONFIG_IP_ROUTE_CLASSID
2128        rth->dst.tclassid = itag;
2129#endif
2130        rth->rt_is_input = 1;
2131
2132        RT_CACHE_STAT_INC(in_slow_tot);
2133        if (res->type == RTN_UNREACHABLE) {
2134                rth->dst.input = ip_error;
2135                rth->dst.error = -err;
2136                rth->rt_flags &= ~RTCF_LOCAL;
2137        }
2138
2139        if (do_cache) {
2140                struct fib_nh *nh = &FIB_RES_NH(*res);
2141
2142                rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2143                if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2144                        WARN_ON(rth->dst.input == lwtunnel_input);
2145                        rth->dst.lwtstate->orig_input = rth->dst.input;
2146                        rth->dst.input = lwtunnel_input;
2147                }
2148
2149                if (unlikely(!rt_cache_route(nh, rth)))
2150                        rt_add_uncached_list(rth);
2151        }
2152        skb_dst_set(skb, &rth->dst);
2153        err = 0;
2154        goto out;
2155
2156no_route:
2157        RT_CACHE_STAT_INC(in_no_route);
2158        res->type = RTN_UNREACHABLE;
2159        res->fi = NULL;
2160        res->table = NULL;
2161        goto local_input;
2162
2163        /*
2164         *      Do not cache martian addresses: they should be logged (RFC1812)
2165         */
2166martian_destination:
2167        RT_CACHE_STAT_INC(in_martian_dst);
2168#ifdef CONFIG_IP_ROUTE_VERBOSE
2169        if (IN_DEV_LOG_MARTIANS(in_dev))
2170                net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2171                                     &daddr, &saddr, dev->name);
2172#endif
2173
2174e_inval:
2175        err = -EINVAL;
2176        goto out;
2177
2178e_nobufs:
2179        err = -ENOBUFS;
2180        goto out;
2181
2182martian_source:
2183        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2184        goto out;
2185}
2186
2187int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2188                         u8 tos, struct net_device *dev)
2189{
2190        struct fib_result res;
2191        int err;
2192
2193        tos &= IPTOS_RT_MASK;
2194        rcu_read_lock();
2195        err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2196        rcu_read_unlock();
2197
2198        return err;
2199}
2200EXPORT_SYMBOL(ip_route_input_noref);
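
/* Editor's note: a hedged sketch (not part of route.c) of a typical
 * receive-path caller, loosely modelled on ip_rcv_finish().  The dst set
 * by ip_route_input_noref() is noref, so it is only valid while the
 * caller remains inside the RCU-protected receive path.
 */
static int example_route_input(struct sk_buff *skb, struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err;

	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
				   iph->tos, dev);
	if (err)
		kfree_skb(skb);		/* no route: drop the packet */
	return err;
}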
2201
2202/* called with rcu_read_lock held */
2203int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2204                       u8 tos, struct net_device *dev, struct fib_result *res)
2205{
2206        /* Multicast recognition logic was moved from the route cache to here.
2207           The problem was that too many Ethernet cards have broken/missing
2208           hardware multicast filters :-( As a result, a host on a multicast
2209           network acquires a lot of useless route cache entries, e.g. for
2210           SDR messages from all over the world. Now we try to get rid of them.
2211           Really, provided the software IP multicast filter is organized
2212           reasonably (at least, hashed), it does not result in a slowdown
2213           compared with route cache reject entries.
2214           Note that multicast routers are not affected, because a
2215           route cache entry is created eventually.
2216         */
2217        if (ipv4_is_multicast(daddr)) {
2218                struct in_device *in_dev = __in_dev_get_rcu(dev);
2219                int our = 0;
2220                int err = -EINVAL;
2221
2222                if (!in_dev)
2223                        return err;
2224                our = ip_check_mc_rcu(in_dev, daddr, saddr,
2225                                      ip_hdr(skb)->protocol);
2226
2227                /* check l3 master if no match yet */
2228                if (!our && netif_is_l3_slave(dev)) {
2229                        struct in_device *l3_in_dev;
2230
2231                        l3_in_dev = __in_dev_get_rcu(skb->dev);
2232                        if (l3_in_dev)
2233                                our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2234                                                      ip_hdr(skb)->protocol);
2235                }
2236
2237                if (our
2238#ifdef CONFIG_IP_MROUTE
2239                        ||
2240                    (!ipv4_is_local_multicast(daddr) &&
2241                     IN_DEV_MFORWARD(in_dev))
2242#endif
2243                   ) {
2244                        err = ip_route_input_mc(skb, daddr, saddr,
2245                                                tos, dev, our);
2246                }
2247                return err;
2248        }
2249
2250        return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2251}
2252
2253/* called with rcu_read_lock() */
2254static struct rtable *__mkroute_output(const struct fib_result *res,
2255                                       const struct flowi4 *fl4, int orig_oif,
2256                                       struct net_device *dev_out,
2257                                       unsigned int flags)
2258{
2259        struct fib_info *fi = res->fi;
2260        struct fib_nh_exception *fnhe;
2261        struct in_device *in_dev;
2262        u16 type = res->type;
2263        struct rtable *rth;
2264        bool do_cache;
2265
2266        in_dev = __in_dev_get_rcu(dev_out);
2267        if (!in_dev)
2268                return ERR_PTR(-EINVAL);
2269
2270        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2271                if (ipv4_is_loopback(fl4->saddr) &&
2272                    !(dev_out->flags & IFF_LOOPBACK) &&
2273                    !netif_is_l3_master(dev_out))
2274                        return ERR_PTR(-EINVAL);
2275
2276        if (ipv4_is_lbcast(fl4->daddr))
2277                type = RTN_BROADCAST;
2278        else if (ipv4_is_multicast(fl4->daddr))
2279                type = RTN_MULTICAST;
2280        else if (ipv4_is_zeronet(fl4->daddr))
2281                return ERR_PTR(-EINVAL);
2282
2283        if (dev_out->flags & IFF_LOOPBACK)
2284                flags |= RTCF_LOCAL;
2285
2286        do_cache = true;
2287        if (type == RTN_BROADCAST) {
2288                flags |= RTCF_BROADCAST | RTCF_LOCAL;
2289                fi = NULL;
2290        } else if (type == RTN_MULTICAST) {
2291                flags |= RTCF_MULTICAST | RTCF_LOCAL;
2292                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2293                                     fl4->flowi4_proto))
2294                        flags &= ~RTCF_LOCAL;
2295                else
2296                        do_cache = false;
2297                /* If a multicast route does not exist, use
2298                 * the default one, but do not use a gateway in this case.
2299                 * Yes, it is a hack.
2300                 */
2301                if (fi && res->prefixlen < 4)
2302                        fi = NULL;
2303        } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2304                   (orig_oif != dev_out->ifindex)) {
2305                /* For local routes that require a particular output interface
2306                 * we do not want to cache the result.  Caching the result
2307                 * causes incorrect behaviour when there are multiple source
2308                 * addresses on the interface, the end result being that if the
2309                 * intended recipient is waiting on that interface for the
2310                 * packet, he won't receive it because it will be delivered on
2311                 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2312                 * be set to the loopback interface as well.
2313                 */
2314                do_cache = false;
2315        }
2316
2317        fnhe = NULL;
2318        do_cache &= fi != NULL;
2319        if (fi) {
2320                struct rtable __rcu **prth;
2321                struct fib_nh *nh = &FIB_RES_NH(*res);
2322
2323                fnhe = find_exception(nh, fl4->daddr);
2324                if (!do_cache)
2325                        goto add;
2326                if (fnhe) {
2327                        prth = &fnhe->fnhe_rth_output;
2328                } else {
2329                        if (unlikely(fl4->flowi4_flags &
2330                                     FLOWI_FLAG_KNOWN_NH &&
2331                                     !(nh->nh_gw &&
2332                                       nh->nh_scope == RT_SCOPE_LINK))) {
2333                                do_cache = false;
2334                                goto add;
2335                        }
2336                        prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2337                }
2338                rth = rcu_dereference(*prth);
2339                if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2340                        return rth;
2341        }
2342
2343add:
2344        rth = rt_dst_alloc(dev_out, flags, type,
2345                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
2346                           IN_DEV_CONF_GET(in_dev, NOXFRM),
2347                           do_cache);
2348        if (!rth)
2349                return ERR_PTR(-ENOBUFS);
2350
2351        rth->rt_iif = orig_oif;
2352
2353        RT_CACHE_STAT_INC(out_slow_tot);
2354
2355        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2356                if (flags & RTCF_LOCAL &&
2357                    !(dev_out->flags & IFF_LOOPBACK)) {
2358                        rth->dst.output = ip_mc_output;
2359                        RT_CACHE_STAT_INC(out_slow_mc);
2360                }
2361#ifdef CONFIG_IP_MROUTE
2362                if (type == RTN_MULTICAST) {
2363                        if (IN_DEV_MFORWARD(in_dev) &&
2364                            !ipv4_is_local_multicast(fl4->daddr)) {
2365                                rth->dst.input = ip_mr_input;
2366                                rth->dst.output = ip_mc_output;
2367                        }
2368                }
2369#endif
2370        }
2371
2372        rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2373        lwtunnel_set_redirect(&rth->dst);
2374
2375        return rth;
2376}
2377
2378/*
2379 * Major route resolver routine.
2380 */
2381
2382struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2383                                        const struct sk_buff *skb)
2384{
2385        __u8 tos = RT_FL_TOS(fl4);
2386        struct fib_result res = {
2387                .type           = RTN_UNSPEC,
2388                .fi             = NULL,
2389                .table          = NULL,
2390                .tclassid       = 0,
2391        };
2392        struct rtable *rth;
2393
2394        fl4->flowi4_iif = LOOPBACK_IFINDEX;
2395        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2396        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2397                         RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2398
2399        rcu_read_lock();
2400        rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2401        rcu_read_unlock();
2402
2403        return rth;
2404}
2405EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2406
2407struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2408                                            struct fib_result *res,
2409                                            const struct sk_buff *skb)
2410{
2411        struct net_device *dev_out = NULL;
2412        int orig_oif = fl4->flowi4_oif;
2413        unsigned int flags = 0;
2414        struct rtable *rth;
2415        int err;
2416
2417        if (fl4->saddr) {
2418                if (ipv4_is_multicast(fl4->saddr) ||
2419                    ipv4_is_lbcast(fl4->saddr) ||
2420                    ipv4_is_zeronet(fl4->saddr)) {
2421                        rth = ERR_PTR(-EINVAL);
2422                        goto out;
2423                }
2424
2425                rth = ERR_PTR(-ENETUNREACH);
2426
2427                /* I removed the check for oif == dev_out->oif here.
2428                   It was wrong for two reasons:
2429                   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2430                      is assigned to multiple interfaces.
2431                   2. Moreover, we are allowed to send packets with the saddr
2432                      of another iface. --ANK
2433                 */
2434
2435                if (fl4->flowi4_oif == 0 &&
2436                    (ipv4_is_multicast(fl4->daddr) ||
2437                     ipv4_is_lbcast(fl4->daddr))) {
2438                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2439                        dev_out = __ip_dev_find(net, fl4->saddr, false);
2440                        if (!dev_out)
2441                                goto out;
2442
2443                        /* Special hack: the user can direct multicasts
2444                           and limited broadcasts via the necessary interface
2445                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2446                           This hack is not just for fun; it allows
2447                           vic, vat and friends to work.
2448                           They bind a socket to loopback, set the ttl to zero
2449                           and expect that it will work.
2450                           From the viewpoint of the routing cache they are broken,
2451                           because we are not allowed to build a multicast path
2452                           with a loopback source addr (look, the routing cache
2453                           cannot know that the ttl is zero, so the packet
2454                           will never leave this host and the route is valid).
2455                           Luckily, this hack is a good workaround.
2456                         */
2457
2458                        fl4->flowi4_oif = dev_out->ifindex;
2459                        goto make_route;
2460                }
2461
2462                if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2463                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2464                        if (!__ip_dev_find(net, fl4->saddr, false))
2465                                goto out;
2466                }
2467        }
2468
2469
2470        if (fl4->flowi4_oif) {
2471                dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2472                rth = ERR_PTR(-ENODEV);
2473                if (!dev_out)
2474                        goto out;
2475
2476                /* RACE: Check return value of inet_select_addr instead. */
2477                if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2478                        rth = ERR_PTR(-ENETUNREACH);
2479                        goto out;
2480                }
2481                if (ipv4_is_local_multicast(fl4->daddr) ||
2482                    ipv4_is_lbcast(fl4->daddr) ||
2483                    fl4->flowi4_proto == IPPROTO_IGMP) {
2484                        if (!fl4->saddr)
2485                                fl4->saddr = inet_select_addr(dev_out, 0,
2486                                                              RT_SCOPE_LINK);
2487                        goto make_route;
2488                }
2489                if (!fl4->saddr) {
2490                        if (ipv4_is_multicast(fl4->daddr))
2491                                fl4->saddr = inet_select_addr(dev_out, 0,
2492                                                              fl4->flowi4_scope);
2493                        else if (!fl4->daddr)
2494                                fl4->saddr = inet_select_addr(dev_out, 0,
2495                                                              RT_SCOPE_HOST);
2496                }
2497        }
2498
2499        if (!fl4->daddr) {
2500                fl4->daddr = fl4->saddr;
2501                if (!fl4->daddr)
2502                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2503                dev_out = net->loopback_dev;
2504                fl4->flowi4_oif = LOOPBACK_IFINDEX;
2505                res->type = RTN_LOCAL;
2506                flags |= RTCF_LOCAL;
2507                goto make_route;
2508        }
2509
2510        err = fib_lookup(net, fl4, res, 0);
2511        if (err) {
2512                res->fi = NULL;
2513                res->table = NULL;
2514                if (fl4->flowi4_oif &&
2515                    (ipv4_is_multicast(fl4->daddr) ||
2516                    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2517                        /* Apparently, the routing tables are wrong. Assume
2518                           that the destination is on-link.
2519
2520                           WHY? DW.
2521                           Because we are allowed to send to an iface
2522                           even if it has NO routes and NO assigned
2523                           addresses. When oif is specified, routing
2524                           tables are looked up with only one purpose:
2525                           to check whether the destination is gatewayed
2526                           rather than direct. Moreover, if MSG_DONTROUTE is set,
2527                           we send the packet, ignoring both the routing tables
2528                           and the ifaddr state. --ANK
2529
2530
2531                           We could do this even if oif is unknown,
2532                           as IPv6 likely does, but we do not.
2533                         */
2534
2535                        if (fl4->saddr == 0)
2536                                fl4->saddr = inet_select_addr(dev_out, 0,
2537                                                              RT_SCOPE_LINK);
2538                        res->type = RTN_UNICAST;
2539                        goto make_route;
2540                }
2541                rth = ERR_PTR(err);
2542                goto out;
2543        }
2544
2545        if (res->type == RTN_LOCAL) {
2546                if (!fl4->saddr) {
2547                        if (res->fi->fib_prefsrc)
2548                                fl4->saddr = res->fi->fib_prefsrc;
2549                        else
2550                                fl4->saddr = fl4->daddr;
2551                }
2552
2553                /* L3 master device is the loopback for that domain */
2554                dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2555                        net->loopback_dev;
2556
2557                /* make sure orig_oif points to fib result device even
2558                 * though packet rx/tx happens over loopback or l3mdev
2559                 */
2560                orig_oif = FIB_RES_OIF(*res);
2561
2562                fl4->flowi4_oif = dev_out->ifindex;
2563                flags |= RTCF_LOCAL;
2564                goto make_route;
2565        }
2566
2567        fib_select_path(net, res, fl4, skb);
2568
2569        dev_out = FIB_RES_DEV(*res);
2570
2571make_route:
2572        rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2573
2574out:
2575        return rth;
2576}
2577
2578static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2579{
2580        return NULL;
2581}
2582
2583static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2584{
2585        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2586
2587        return mtu ? : dst->dev->mtu;
2588}
2589
2590static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2591                                          struct sk_buff *skb, u32 mtu,
2592                                          bool confirm_neigh)
2593{
2594}
2595
2596static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2597                                       struct sk_buff *skb)
2598{
2599}
2600
2601static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2602                                          unsigned long old)
2603{
2604        return NULL;
2605}
2606
2607static struct dst_ops ipv4_dst_blackhole_ops = {
2608        .family                 =       AF_INET,
2609        .check                  =       ipv4_blackhole_dst_check,
2610        .mtu                    =       ipv4_blackhole_mtu,
2611        .default_advmss         =       ipv4_default_advmss,
2612        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2613        .redirect               =       ipv4_rt_blackhole_redirect,
2614        .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2615        .neigh_lookup           =       ipv4_neigh_lookup,
2616};
2617
2618struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2619{
2620        struct rtable *ort = (struct rtable *) dst_orig;
2621        struct rtable *rt;
2622
2623        rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2624        if (rt) {
2625                struct dst_entry *new = &rt->dst;
2626
2627                new->__use = 1;
2628                new->input = dst_discard;
2629                new->output = dst_discard_out;
2630
2631                new->dev = net->loopback_dev;
2632                dev_hold(new->dev);
2633
2634                rt->rt_is_input = ort->rt_is_input;
2635                rt->rt_iif = ort->rt_iif;
2636                rt->rt_pmtu = ort->rt_pmtu;
2637                rt->rt_mtu_locked = ort->rt_mtu_locked;
2638
2639                rt->rt_genid = rt_genid_ipv4(net);
2640                rt->rt_flags = ort->rt_flags;
2641                rt->rt_type = ort->rt_type;
2642                rt->rt_gateway = ort->rt_gateway;
2643                rt->rt_uses_gateway = ort->rt_uses_gateway;
2644
2645                INIT_LIST_HEAD(&rt->rt_uncached);
2646        }
2647
2648        dst_release(dst_orig);
2649
2650        return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2651}
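
/* Editor's note: ipv4_blackhole_route() is how the xfrm layer's
 * make_blackhole() swaps a live route for one that silently discards
 * traffic, e.g. while IPsec state resolution would otherwise block.  A
 * hedged sketch of such a swap ('example_make_blackhole' is hypothetical):
 */
static struct dst_entry *example_make_blackhole(struct net *net,
						struct rtable *rt)
{
	/* consumes the reference on rt->dst, returns a discard-only dst */
	return ipv4_blackhole_route(net, &rt->dst);
}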
2652
2653struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2654                                    const struct sock *sk)
2655{
2656        struct rtable *rt = __ip_route_output_key(net, flp4);
2657
2658        if (IS_ERR(rt))
2659                return rt;
2660
2661        if (flp4->flowi4_proto) {
2662                flp4->flowi4_oif = rt->dst.dev->ifindex;
2663                rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2664                                                        flowi4_to_flowi(flp4),
2665                                                        sk, 0);
2666        }
2667
2668        return rt;
2669}
2670EXPORT_SYMBOL_GPL(ip_route_output_flow);
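
/* Editor's note: a hedged sketch (not part of route.c) of an output-path
 * lookup, in the style of UDP/raw senders.  The destination 192.0.2.1
 * (TEST-NET-1) is a placeholder, and the socket argument is left NULL.
 */
static int example_route_output(struct net *net)
{
	struct flowi4 fl4;
	struct rtable *rt;

	flowi4_init_output(&fl4, 0 /* oif */, 0 /* mark */, 0 /* tos */,
			   RT_SCOPE_UNIVERSE, IPPROTO_UDP, 0 /* flags */,
			   htonl(0xc0000201) /* daddr */, 0 /* saddr */,
			   0 /* dport */, 0 /* sport */,
			   sock_net_uid(net, NULL));
	rt = ip_route_output_flow(net, &fl4, NULL);
	if (IS_ERR(rt))
		return PTR_ERR(rt);
	/* ... build and transmit the packet via rt->dst ... */
	ip_rt_put(rt);			/* drop the reference from the lookup */
	return 0;
}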
2671
2672struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2673                                      struct net_device *dev,
2674                                      struct net *net, __be32 *saddr,
2675                                      const struct ip_tunnel_info *info,
2676                                      u8 protocol, bool use_cache)
2677{
2678#ifdef CONFIG_DST_CACHE
2679        struct dst_cache *dst_cache;
2680#endif
2681        struct rtable *rt = NULL;
2682        struct flowi4 fl4;
2683        __u8 tos;
2684
2685#ifdef CONFIG_DST_CACHE
2686        dst_cache = (struct dst_cache *)&info->dst_cache;
2687        if (use_cache) {
2688                rt = dst_cache_get_ip4(dst_cache, saddr);
2689                if (rt)
2690                        return rt;
2691        }
2692#endif
2693        memset(&fl4, 0, sizeof(fl4));
2694        fl4.flowi4_mark = skb->mark;
2695        fl4.flowi4_proto = protocol;
2696        fl4.daddr = info->key.u.ipv4.dst;
2697        fl4.saddr = info->key.u.ipv4.src;
2698        tos = info->key.tos;
2699        fl4.flowi4_tos = RT_TOS(tos);
2700
2701        rt = ip_route_output_key(net, &fl4);
2702        if (IS_ERR(rt)) {
2703                netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2704                return ERR_PTR(-ENETUNREACH);
2705        }
2706        if (rt->dst.dev == dev) { /* is this necessary? */
2707                netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2708                ip_rt_put(rt);
2709                return ERR_PTR(-ELOOP);
2710        }
2711#ifdef CONFIG_DST_CACHE
2712        if (use_cache)
2713                dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2714#endif
2715        *saddr = fl4.saddr;
2716        return rt;
2717}
2718EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
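
/* Editor's note: a hedged sketch (not part of route.c) of a
 * collect-metadata tunnel transmit path, loosely in the style of
 * vxlan/geneve drivers.  'example_tunnel_xmit' is a hypothetical name.
 */
static int example_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	struct net *net = dev_net(dev);
	struct rtable *rt;
	__be32 saddr;

	if (!info)
		return -EINVAL;

	rt = ip_route_output_tunnel(skb, dev, net, &saddr, info,
				    IPPROTO_UDP, false /* use_cache */);
	if (IS_ERR(rt))
		return PTR_ERR(rt);
	/* ... encapsulate using 'saddr' and transmit via rt->dst ... */
	ip_rt_put(rt);
	return 0;
}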
2719
2720/* called with rcu_read_lock held */
2721static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2722                        struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2723                        struct sk_buff *skb, u32 portid, u32 seq,
2724                        unsigned int flags)
2725{
2726        struct rtmsg *r;
2727        struct nlmsghdr *nlh;
2728        unsigned long expires = 0;
2729        u32 error;
2730        u32 metrics[RTAX_MAX];
2731
2732        nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2733        if (!nlh)
2734                return -EMSGSIZE;
2735
2736        r = nlmsg_data(nlh);
2737        r->rtm_family    = AF_INET;
2738        r->rtm_dst_len  = 32;
2739        r->rtm_src_len  = 0;
2740        r->rtm_tos      = fl4 ? fl4->flowi4_tos : 0;
2741        r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2742        if (nla_put_u32(skb, RTA_TABLE, table_id))
2743                goto nla_put_failure;
2744        r->rtm_type     = rt->rt_type;
2745        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2746        r->rtm_protocol = RTPROT_UNSPEC;
2747        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2748        if (rt->rt_flags & RTCF_NOTIFY)
2749                r->rtm_flags |= RTM_F_NOTIFY;
2750        if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2751                r->rtm_flags |= RTCF_DOREDIRECT;
2752
2753        if (nla_put_in_addr(skb, RTA_DST, dst))
2754                goto nla_put_failure;
2755        if (src) {
2756                r->rtm_src_len = 32;
2757                if (nla_put_in_addr(skb, RTA_SRC, src))
2758                        goto nla_put_failure;
2759        }
2760        if (rt->dst.dev &&
2761            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2762                goto nla_put_failure;
2763#ifdef CONFIG_IP_ROUTE_CLASSID
2764        if (rt->dst.tclassid &&
2765            nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2766                goto nla_put_failure;
2767#endif
2768        if (fl4 && !rt_is_input_route(rt) &&
2769            fl4->saddr != src) {
2770                if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2771                        goto nla_put_failure;
2772        }
2773        if (rt->rt_uses_gateway &&
2774            nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2775                goto nla_put_failure;
2776
2777        expires = rt->dst.expires;
2778        if (expires) {
2779                unsigned long now = jiffies;
2780
2781                if (time_before(now, expires))
2782                        expires -= now;
2783                else
2784                        expires = 0;
2785        }
2786
2787        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2788        if (rt->rt_pmtu && expires)
2789                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2790        if (rt->rt_mtu_locked && expires)
2791                metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2792        if (rtnetlink_put_metrics(skb, metrics) < 0)
2793                goto nla_put_failure;
2794
2795        if (fl4) {
2796                if (fl4->flowi4_mark &&
2797                    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2798                        goto nla_put_failure;
2799
2800                if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2801                    nla_put_u32(skb, RTA_UID,
2802                                from_kuid_munged(current_user_ns(),
2803                                                 fl4->flowi4_uid)))
2804                        goto nla_put_failure;
2805
2806                if (rt_is_input_route(rt)) {
2807#ifdef CONFIG_IP_MROUTE
2808                        if (ipv4_is_multicast(dst) &&
2809                            !ipv4_is_local_multicast(dst) &&
2810                            IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2811                                int err = ipmr_get_route(net, skb,
2812                                                         fl4->saddr, fl4->daddr,
2813                                                         r, portid);
2814
2815                                if (err <= 0) {
2816                                        if (err == 0)
2817                                                return 0;
2818                                        goto nla_put_failure;
2819                                }
2820                        } else
2821#endif
2822                                if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2823                                        goto nla_put_failure;
2824                }
2825        }
2826
2827        error = rt->dst.error;
2828
2829        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2830                goto nla_put_failure;
2831
2832        nlmsg_end(skb, nlh);
2833        return 0;
2834
2835nla_put_failure:
2836        nlmsg_cancel(skb, nlh);
2837        return -EMSGSIZE;
2838}
2839
2840static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2841                            struct netlink_callback *cb, u32 table_id,
2842                            struct fnhe_hash_bucket *bucket, int genid,
2843                            int *fa_index, int fa_start, unsigned int flags)
2844{
2845        int i;
2846
2847        for (i = 0; i < FNHE_HASH_SIZE; i++) {
2848                struct fib_nh_exception *fnhe;
2849
2850                for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2851                     fnhe = rcu_dereference(fnhe->fnhe_next)) {
2852                        struct rtable *rt;
2853                        int err;
2854
2855                        if (*fa_index < fa_start)
2856                                goto next;
2857
2858                        if (fnhe->fnhe_genid != genid)
2859                                goto next;
2860
2861                        if (fnhe->fnhe_expires &&
2862                            time_after(jiffies, fnhe->fnhe_expires))
2863                                goto next;
2864
2865                        rt = rcu_dereference(fnhe->fnhe_rth_input);
2866                        if (!rt)
2867                                rt = rcu_dereference(fnhe->fnhe_rth_output);
2868                        if (!rt)
2869                                goto next;
2870
2871                        err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
2872                                           table_id, NULL, skb,
2873                                           NETLINK_CB(cb->skb).portid,
2874                                           cb->nlh->nlmsg_seq, flags);
2875                        if (err)
2876                                return err;
2877next:
2878                        (*fa_index)++;
2879                }
2880        }
2881
2882        return 0;
2883}
2884
2885int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
2886                       u32 table_id, struct fib_info *fi,
2887                       int *fa_index, int fa_start, unsigned int flags)
2888{
2889        struct net *net = sock_net(cb->skb->sk);
2890        int nhsel, genid = fnhe_genid(net);
2891
2892        for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
2893                struct fib_nh *nh = &fi->fib_nh[nhsel];
2894                struct fnhe_hash_bucket *bucket;
2895                int err;
2896
2897                if (nh->nh_flags & RTNH_F_DEAD)
2898                        continue;
2899
2900                rcu_read_lock();
2901                bucket = rcu_dereference(nh->nh_exceptions);
2902                err = 0;
2903                if (bucket)
2904                        err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
2905                                               genid, fa_index, fa_start,
2906                                               flags);
2907                rcu_read_unlock();
2908                if (err)
2909                        return err;
2910        }
2911
2912        return 0;
2913}
2914
2915static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2916                                                   u8 ip_proto, __be16 sport,
2917                                                   __be16 dport)
2918{
2919        struct sk_buff *skb;
2920        struct iphdr *iph;
2921
2922        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2923        if (!skb)
2924                return NULL;
2925
2926        /* Reserve room for dummy headers; this skb can pass
2927         * through a good chunk of the routing engine.
2928         */
2929        skb_reset_mac_header(skb);
2930        skb_reset_network_header(skb);
2931        skb->protocol = htons(ETH_P_IP);
2932        iph = skb_put(skb, sizeof(struct iphdr));
2933        iph->protocol = ip_proto;
2934        iph->saddr = src;
2935        iph->daddr = dst;
2936        iph->version = 0x4;
2937        iph->frag_off = 0;
2938        iph->ihl = 0x5;
2939        skb_set_transport_header(skb, skb->len);
2940
2941        switch (iph->protocol) {
2942        case IPPROTO_UDP: {
2943                struct udphdr *udph;
2944
2945                udph = skb_put_zero(skb, sizeof(struct udphdr));
2946                udph->source = sport;
2947                udph->dest = dport;
2948                udph->len = htons(sizeof(struct udphdr));
2949                udph->check = 0;
2950                break;
2951        }
2952        case IPPROTO_TCP: {
2953                struct tcphdr *tcph;
2954
2955                tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2956                tcph->source    = sport;
2957                tcph->dest      = dport;
2958                tcph->doff      = sizeof(struct tcphdr) / 4;
2959                tcph->rst = 1;
2960                tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2961                                            src, dst, 0);
2962                break;
2963        }
2964        case IPPROTO_ICMP: {
2965                struct icmphdr *icmph;
2966
2967                icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2968                icmph->type = ICMP_ECHO;
2969                icmph->code = 0;
2970        }
2971        }
2972
2973        return skb;
2974}
2975
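/* Validate an RTM_GETROUTE request.  Strict checking applies only when
 * the requesting socket opted in via netlink_strict_get_check(); legacy
 * requests fall back to the permissive deprecated parser so that old
 * userspace keeps working.
 */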
2976static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
2977                                       const struct nlmsghdr *nlh,
2978                                       struct nlattr **tb,
2979                                       struct netlink_ext_ack *extack)
2980{
2981        struct rtmsg *rtm;
2982        int i, err;
2983
2984        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
2985                NL_SET_ERR_MSG(extack,
2986                               "ipv4: Invalid header for route get request");
2987                return -EINVAL;
2988        }
2989
2990        if (!netlink_strict_get_check(skb))
2991                return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
2992                                              rtm_ipv4_policy, extack);
2993
2994        rtm = nlmsg_data(nlh);
2995        if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
2996            (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
2997            rtm->rtm_table || rtm->rtm_protocol ||
2998            rtm->rtm_scope || rtm->rtm_type) {
2999                NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3000                return -EINVAL;
3001        }
3002
3003        if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3004                               RTM_F_LOOKUP_TABLE |
3005                               RTM_F_FIB_MATCH)) {
3006                NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3007                return -EINVAL;
3008        }
3009
3010        err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3011                                            rtm_ipv4_policy, extack);
3012        if (err)
3013                return err;
3014
3015        if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3016            (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3017                NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3018                return -EINVAL;
3019        }
3020
3021        for (i = 0; i <= RTA_MAX; i++) {
3022                if (!tb[i])
3023                        continue;
3024
3025                switch (i) {
3026                case RTA_IIF:
3027                case RTA_OIF:
3028                case RTA_SRC:
3029                case RTA_DST:
3030                case RTA_IP_PROTO:
3031                case RTA_SPORT:
3032                case RTA_DPORT:
3033                case RTA_MARK:
3034                case RTA_UID:
3035                        break;
3036                default:
3037                        NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3038                        return -EINVAL;
3039                }
3040        }
3041
3042        return 0;
3043}
3044
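/* RTM_GETROUTE doit handler; this is what e.g. "ip route get" ends up
 * in.  A dummy skb is built first because the input-path lookup needs a
 * packet to route; the same skb is then trimmed and reused for the
 * netlink reply.
 */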
3045static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3046                             struct netlink_ext_ack *extack)
3047{
3048        struct net *net = sock_net(in_skb->sk);
3049        struct nlattr *tb[RTA_MAX+1];
3050        u32 table_id = RT_TABLE_MAIN;
3051        __be16 sport = 0, dport = 0;
3052        struct fib_result res = {};
3053        u8 ip_proto = IPPROTO_UDP;
3054        struct rtable *rt = NULL;
3055        struct sk_buff *skb;
3056        struct rtmsg *rtm;
3057        struct flowi4 fl4;
3058        __be32 dst = 0;
3059        __be32 src = 0;
3060        kuid_t uid;
3061        u32 iif;
3062        int err;
3063        int mark;
3064
3065        err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3066        if (err < 0)
3067                return err;
3068
3069        rtm = nlmsg_data(nlh);
3070        src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3071        dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3072        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3073        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3074        if (tb[RTA_UID])
3075                uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3076        else
3077                uid = (iif ? INVALID_UID : current_uid());
3078
3079        if (tb[RTA_IP_PROTO]) {
3080                err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3081                                                  &ip_proto, AF_INET, extack);
3082                if (err)
3083                        return err;
3084        }
3085
3086        if (tb[RTA_SPORT])
3087                sport = nla_get_be16(tb[RTA_SPORT]);
3088
3089        if (tb[RTA_DPORT])
3090                dport = nla_get_be16(tb[RTA_DPORT]);
3091
3092        skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3093        if (!skb)
3094                return -ENOBUFS;
3095
3096        memset(&fl4, 0, sizeof(fl4));
3097        fl4.daddr = dst;
3098        fl4.saddr = src;
3099        fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3100        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3101        fl4.flowi4_mark = mark;
3102        fl4.flowi4_uid = uid;
3103        if (sport)
3104                fl4.fl4_sport = sport;
3105        if (dport)
3106                fl4.fl4_dport = dport;
3107        fl4.flowi4_proto = ip_proto;
3108
3109        rcu_read_lock();
3110
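        /* With RTA_IIF we replay the input path for a packet arriving on
         * that device; otherwise we resolve an output route and attach
         * it to the dummy skb.
         */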
3111        if (iif) {
3112                struct net_device *dev;
3113
3114                dev = dev_get_by_index_rcu(net, iif);
3115                if (!dev) {
3116                        err = -ENODEV;
3117                        goto errout_rcu;
3118                }
3119
3120                fl4.flowi4_iif = iif; /* for rt_fill_info */
3121                skb->dev        = dev;
3122                skb->mark       = mark;
3123                err = ip_route_input_rcu(skb, dst, src,
3124                                         rtm->rtm_tos & IPTOS_RT_MASK, dev,
3125                                         &res);
3126
3127                rt = skb_rtable(skb);
3128                if (err == 0 && rt->dst.error)
3129                        err = -rt->dst.error;
3130        } else {
3131                fl4.flowi4_iif = LOOPBACK_IFINDEX;
3132                skb->dev = net->loopback_dev;
3133                rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3134                err = 0;
3135                if (IS_ERR(rt))
3136                        err = PTR_ERR(rt);
3137                else
3138                        skb_dst_set(skb, &rt->dst);
3139        }
3140
3141        if (err)
3142                goto errout_rcu;
3143
3144        if (rtm->rtm_flags & RTM_F_NOTIFY)
3145                rt->rt_flags |= RTCF_NOTIFY;
3146
3147        if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3148                table_id = res.table ? res.table->tb_id : 0;
3149
3150        /* reset skb for netlink reply msg */
3151        skb_trim(skb, 0);
3152        skb_reset_network_header(skb);
3153        skb_reset_transport_header(skb);
3154        skb_reset_mac_header(skb);
3155
3156        if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3157                struct fib_rt_info fri;
3158
3159                if (!res.fi) {
3160                        err = fib_props[res.type].error;
3161                        if (!err)
3162                                err = -EHOSTUNREACH;
3163                        goto errout_rcu;
3164                }
3165                fri.fi = res.fi;
3166                fri.tb_id = table_id;
3167                fri.dst = res.prefix;
3168                fri.dst_len = res.prefixlen;
3169                fri.tos = fl4.flowi4_tos;
3170                fri.type = rt->rt_type;
3171                fri.offload = 0;
3172                fri.trap = 0;
3173                if (res.fa_head) {
3174                        struct fib_alias *fa;
3175
3176                        hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3177                                u8 slen = 32 - fri.dst_len;
3178
3179                                if (fa->fa_slen == slen &&
3180                                    fa->tb_id == fri.tb_id &&
3181                                    fa->fa_tos == fri.tos &&
3182                                    fa->fa_info == res.fi &&
3183                                    fa->fa_type == fri.type) {
3184                                        fri.offload = fa->offload;
3185                                        fri.trap = fa->trap;
3186                                        break;
3187                                }
3188                        }
3189                }
3190                err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3191                                    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3192        } else {
3193                err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3194                                   NETLINK_CB(in_skb).portid,
3195                                   nlh->nlmsg_seq, 0);
3196        }
3197        if (err < 0)
3198                goto errout_rcu;
3199
3200        rcu_read_unlock();
3201
3202        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3203
3204errout_free:
3205        return err;
3206errout_rcu:
3207        rcu_read_unlock();
3208        kfree_skb(skb);
3209        goto errout_free;
3210}
3211
3212void ip_rt_multicast_event(struct in_device *in_dev)
3213{
3214        rt_cache_flush(dev_net(in_dev->dev));
3215}
3216
3217#ifdef CONFIG_SYSCTL
3218static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3219static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
3220static int ip_rt_gc_elasticity __read_mostly    = 8;
3221static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
3222
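/* Handler behind the write-only /proc/sys/net/ipv4/route/flush: any
 * write, e.g.
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * bumps this netns's route and fnhe generation counters, lazily
 * invalidating cached routes and exceptions; reads return -EINVAL.
 */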
3223static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3224                                        void __user *buffer,
3225                                        size_t *lenp, loff_t *ppos)
3226{
3227        struct net *net = (struct net *)__ctl->extra1;
3228
3229        if (write) {
3230                rt_cache_flush(net);
3231                fnhe_genid_bump(net);
3232                return 0;
3233        }
3234
3235        return -EINVAL;
3236}
3237
3238static struct ctl_table ipv4_route_table[] = {
3239        {
3240                .procname       = "gc_thresh",
3241                .data           = &ipv4_dst_ops.gc_thresh,
3242                .maxlen         = sizeof(int),
3243                .mode           = 0644,
3244                .proc_handler   = proc_dointvec,
3245        },
3246        {
3247                .procname       = "max_size",
3248                .data           = &ip_rt_max_size,
3249                .maxlen         = sizeof(int),
3250                .mode           = 0644,
3251                .proc_handler   = proc_dointvec,
3252        },
3253        {
3254                /* Deprecated. Use gc_min_interval_ms. */
3255
3256                .procname       = "gc_min_interval",
3257                .data           = &ip_rt_gc_min_interval,
3258                .maxlen         = sizeof(int),
3259                .mode           = 0644,
3260                .proc_handler   = proc_dointvec_jiffies,
3261        },
3262        {
3263                .procname       = "gc_min_interval_ms",
3264                .data           = &ip_rt_gc_min_interval,
3265                .maxlen         = sizeof(int),
3266                .mode           = 0644,
3267                .proc_handler   = proc_dointvec_ms_jiffies,
3268        },
3269        {
3270                .procname       = "gc_timeout",
3271                .data           = &ip_rt_gc_timeout,
3272                .maxlen         = sizeof(int),
3273                .mode           = 0644,
3274                .proc_handler   = proc_dointvec_jiffies,
3275        },
3276        {
3277                .procname       = "gc_interval",
3278                .data           = &ip_rt_gc_interval,
3279                .maxlen         = sizeof(int),
3280                .mode           = 0644,
3281                .proc_handler   = proc_dointvec_jiffies,
3282        },
3283        {
3284                .procname       = "redirect_load",
3285                .data           = &ip_rt_redirect_load,
3286                .maxlen         = sizeof(int),
3287                .mode           = 0644,
3288                .proc_handler   = proc_dointvec,
3289        },
3290        {
3291                .procname       = "redirect_number",
3292                .data           = &ip_rt_redirect_number,
3293                .maxlen         = sizeof(int),
3294                .mode           = 0644,
3295                .proc_handler   = proc_dointvec,
3296        },
3297        {
3298                .procname       = "redirect_silence",
3299                .data           = &ip_rt_redirect_silence,
3300                .maxlen         = sizeof(int),
3301                .mode           = 0644,
3302                .proc_handler   = proc_dointvec,
3303        },
3304        {
3305                .procname       = "error_cost",
3306                .data           = &ip_rt_error_cost,
3307                .maxlen         = sizeof(int),
3308                .mode           = 0644,
3309                .proc_handler   = proc_dointvec,
3310        },
3311        {
3312                .procname       = "error_burst",
3313                .data           = &ip_rt_error_burst,
3314                .maxlen         = sizeof(int),
3315                .mode           = 0644,
3316                .proc_handler   = proc_dointvec,
3317        },
3318        {
3319                .procname       = "gc_elasticity",
3320                .data           = &ip_rt_gc_elasticity,
3321                .maxlen         = sizeof(int),
3322                .mode           = 0644,
3323                .proc_handler   = proc_dointvec,
3324        },
3325        {
3326                .procname       = "mtu_expires",
3327                .data           = &ip_rt_mtu_expires,
3328                .maxlen         = sizeof(int),
3329                .mode           = 0644,
3330                .proc_handler   = proc_dointvec_jiffies,
3331        },
3332        {
3333                .procname       = "min_pmtu",
3334                .data           = &ip_rt_min_pmtu,
3335                .maxlen         = sizeof(int),
3336                .mode           = 0644,
3337                .proc_handler   = proc_dointvec_minmax,
3338                .extra1         = &ip_min_valid_pmtu,
3339        },
3340        {
3341                .procname       = "min_adv_mss",
3342                .data           = &ip_rt_min_advmss,
3343                .maxlen         = sizeof(int),
3344                .mode           = 0644,
3345                .proc_handler   = proc_dointvec,
3346        },
3347        { }
3348};
3349
3350static struct ctl_table ipv4_route_flush_table[] = {
3351        {
3352                .procname       = "flush",
3353                .maxlen         = sizeof(int),
3354                .mode           = 0200,
3355                .proc_handler   = ipv4_sysctl_rtcache_flush,
3356        },
3357        { },
3358};
3359
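/* init_net registers the static flush table directly; every other netns
 * gets a kmemdup()'d copy, both so that tbl[0].extra1 can point at that
 * netns and so the entry can be hidden from unprivileged user
 * namespaces.
 */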
3360static __net_init int sysctl_route_net_init(struct net *net)
3361{
3362        struct ctl_table *tbl;
3363
3364        tbl = ipv4_route_flush_table;
3365        if (!net_eq(net, &init_net)) {
3366                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3367                if (!tbl)
3368                        goto err_dup;
3369
3370                /* Don't export sysctls to unprivileged users */
3371                if (net->user_ns != &init_user_ns)
3372                        tbl[0].procname = NULL;
3373        }
3374        tbl[0].extra1 = net;
3375
3376        net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3377        if (!net->ipv4.route_hdr)
3378                goto err_reg;
3379        return 0;
3380
3381err_reg:
3382        if (tbl != ipv4_route_flush_table)
3383                kfree(tbl);
3384err_dup:
3385        return -ENOMEM;
3386}
3387
3388static __net_exit void sysctl_route_net_exit(struct net *net)
3389{
3390        struct ctl_table *tbl;
3391
3392        tbl = net->ipv4.route_hdr->ctl_table_arg;
3393        unregister_net_sysctl_table(net->ipv4.route_hdr);
3394        BUG_ON(tbl == ipv4_route_flush_table);
3395        kfree(tbl);
3396}
3397
3398static __net_initdata struct pernet_operations sysctl_route_ops = {
3399        .init = sysctl_route_net_init,
3400        .exit = sysctl_route_net_exit,
3401};
3402#endif
3403
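/* Per-netns generation counters.  Cached dsts and nexthop exceptions
 * record the genid that was current when they were created and are
 * invalidated lazily once it moves on; roughly:
 *
 *	rt->rt_genid = rt_genid_ipv4(net);           (at creation)
 *	stale = rt->rt_genid != rt_genid_ipv4(net);  (on later lookups)
 */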
3404static __net_init int rt_genid_init(struct net *net)
3405{
3406        atomic_set(&net->ipv4.rt_genid, 0);
3407        atomic_set(&net->fnhe_genid, 0);
3408        atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3409        return 0;
3410}
3411
3412static __net_initdata struct pernet_operations rt_genid_ops = {
3413        .init = rt_genid_init,
3414};
3415
3416static int __net_init ipv4_inetpeer_init(struct net *net)
3417{
3418        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3419
3420        if (!bp)
3421                return -ENOMEM;
3422        inet_peer_base_init(bp);
3423        net->ipv4.peers = bp;
3424        return 0;
3425}
3426
3427static void __net_exit ipv4_inetpeer_exit(struct net *net)
3428{
3429        struct inet_peer_base *bp = net->ipv4.peers;
3430
3431        net->ipv4.peers = NULL;
3432        inetpeer_invalidate_tree(bp);
3433        kfree(bp);
3434}
3435
3436static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3437        .init   =       ipv4_inetpeer_init,
3438        .exit   =       ipv4_inetpeer_exit,
3439};
3440
3441#ifdef CONFIG_IP_ROUTE_CLASSID
3442struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3443#endif /* CONFIG_IP_ROUTE_CLASSID */
3444
3445int __init ip_rt_init(void)
3446{
3447        void *idents_hash;
3448        int cpu;
3449
3450        /* For modern hosts, this will use 2 MB of memory */
3451        idents_hash = alloc_large_system_hash("IP idents",
3452                                              sizeof(*ip_idents) + sizeof(*ip_tstamps),
3453                                              0,
3454                                              16, /* one bucket per 64 KB */
3455                                              HASH_ZERO,
3456                                              NULL,
3457                                              &ip_idents_mask,
3458                                              2048,
3459                                              256*1024);
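        /* The bucket size above is sizeof(*ip_idents) + sizeof(*ip_tstamps)
         * (8 bytes on typical builds), so the 256*1024 entry ceiling is the
         * 2 MB mentioned above and the 2048 floor costs 16 KB.
         */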
3460
3461        ip_idents = idents_hash;
3462
3463        prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3464
3465        ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3466
3467        for_each_possible_cpu(cpu) {
3468                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3469
3470                INIT_LIST_HEAD(&ul->head);
3471                spin_lock_init(&ul->lock);
3472        }
3473#ifdef CONFIG_IP_ROUTE_CLASSID
3474        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3475        if (!ip_rt_acct)
3476                panic("IP: failed to allocate ip_rt_acct\n");
3477#endif
3478
3479        ipv4_dst_ops.kmem_cachep =
3480                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3481                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3482
3483        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3484
3485        if (dst_entries_init(&ipv4_dst_ops) < 0)
3486                panic("IP: failed to allocate ipv4_dst_ops counter\n");
3487
3488        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3489                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3490
3491        ipv4_dst_ops.gc_thresh = ~0;
3492        ip_rt_max_size = INT_MAX;
3493
3494        devinet_init();
3495        ip_fib_init();
3496
3497        if (ip_rt_proc_init())
3498                pr_err("Unable to create route proc files\n");
3499#ifdef CONFIG_XFRM
3500        xfrm_init();
3501        xfrm4_init();
3502#endif
3503        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3504                      RTNL_FLAG_DOIT_UNLOCKED);
3505
3506#ifdef CONFIG_SYSCTL
3507        register_pernet_subsys(&sysctl_route_ops);
3508#endif
3509        register_pernet_subsys(&rt_genid_ops);
3510        register_pernet_subsys(&ipv4_inetpeer_ops);
3511        return 0;
3512}
3513
3514#ifdef CONFIG_SYSCTL
3515/*
3516 * We really need to sanitize the damn ipv4 init order; once we do,
3517 * all this nonsense will go away.
3518 */
3519void __init ip_static_sysctl_init(void)
3520{
3521        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3522}
3523#endif
3524