linux/net/ipv4/route.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after a year-long coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
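
/* The tunables above are wired into the net.ipv4.route.* sysctl table
 * near the end of this file (not shown in this excerpt), e.g.
 * net.ipv4.route.min_pmtu backs ip_rt_min_pmtu. A quick sanity check
 * of that initializer: 512 + 20 (IP header) + 20 (TCP header) = 552,
 * the classic minimum-PMTU default.
 */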

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};
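
/* rt_dst_alloc() at the bottom of this excerpt passes &ipv4_dst_ops to
 * dst_alloc(), so every IPv4 rtable dispatches through this table: e.g.
 * dst_mtu() lands in ipv4_mtu() and dst_check() in ipv4_dst_check().
 */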

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
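
/* Illustrative only: the table is indexed by the legacy TOS nibble
 * shifted right by one bit; the low bit (historically "minimise
 * monetary cost", later reused by ECN) selects the ECN_OR_COST()
 * entries. rt_tos2priority() in include/net/route.h performs the
 * lookup:
 *
 *      static inline char rt_tos2priority(u8 tos)
 *      {
 *              return ip_tos2prio[IPTOS_TOS(tos)>>1];
 *      }
 *
 * so e.g. tos == IPTOS_LOWDELAY (0x10) indexes slot 8, which is
 * TC_PRIO_INTERACTIVE.
 */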

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", 0444, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", 0444,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create_single("rt_acct", 0, net->proc_net,
                        rt_acct_proc_show);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}
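
/* Flushing the cache is O(1): bumping the namespace's IPv4 route
 * generation id makes rt_is_expired() above true for every rtable
 * created before the bump, and ipv4_dst_check() further down then
 * rejects those entries on their next use; nothing is walked or
 * freed here.
 */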

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        struct neighbour *n;

        rcu_read_lock_bh();

        if (likely(rt->rt_gw_family == AF_INET)) {
                n = ip_neigh_gw4(dev, rt->rt_gw4);
        } else if (rt->rt_gw_family == AF_INET6) {
                n = ip_neigh_gw6(dev, &rt->rt_gw6);
        } else {
                __be32 pkey;

                pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
                n = ip_neigh_gw4(dev, pkey);
        }

        if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
                n = NULL;

        rcu_read_unlock_bh();

        return n;
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;

        if (rt->rt_gw_family == AF_INET) {
                pkey = (const __be32 *)&rt->rt_gw4;
        } else if (rt->rt_gw_family == AF_INET6) {
                return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
        } else if (!daddr ||
                 (rt->rt_flags &
                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
                return;
        }
        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = READ_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 new, delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* Do not use atomic_add_return() as it makes UBSAN unhappy */
        do {
                old = (u32)atomic_read(p_id);
                new = old + delta + segs;
        } while (atomic_cmpxchg(p_id, old, new) != old);

        return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
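
/* A worked example of the perturbation (illustrative numbers only):
 * if a bucket was last touched 1000 jiffies ago, the next reservation
 * adds a delta drawn uniformly from [0, 1000) on top of segs. Two
 * probes of that bucket straddling the idle period therefore see an
 * ID gap of delta + segs rather than segs, so the gap no longer
 * reveals how many packets were really sent in between. A bucket
 * reserved on every jiffy has old == now, delta == 0, and degenerates
 * to a plain atomic counter.
 */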

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        u32 hash, id;

        /* Note: reading then lazily initializing the key below is racy,
         * but this is okay: at worst several CPUs write fresh random
         * bytes concurrently and the last writer wins.
         */
        if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
                get_random_bytes(&net->ipv4.ip_id_key,
                                 sizeof(net->ipv4.ip_id_key));

        hash = siphash_3u32((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            &net->ipv4.ip_id_key);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gw_family = AF_INET;
                rt->rt_gw4 = fnhe->fnhe_gw;
        }
}

static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
                                  __be32 gw, u32 pmtu, bool lock,
                                  unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        u32 genid, hval;
        unsigned int i;
        int depth;

        genid = fnhe_genid(dev_net(nhc->nhc_dev));
        hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nhc->nhc_exceptions);
        if (!hash) {
                hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nhc->nhc_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (fnhe->fnhe_genid != genid)
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_mtu_locked = lock;
                }
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = genid;
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_mtu_locked = lock;
                fnhe->fnhe_expires = max(1UL, expires);

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nhc->nhc_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh_common *nhc = FIB_RES_NHC(res);

                                update_or_create_fnhe(nhc, fl4->daddr, new_gw,
                                                0, false,
                                                jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct net *net = dev_net(skb->dev);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
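
/* A worked example of step 1, assuming HZ == 1000 for round numbers:
 * ip_rt_redirect_load = HZ/50 = 20 jiffies and ip_rt_redirect_number
 * = 9, so after the k-th redirect the next one may be sent only once
 * jiffies passes rate_last + (20 << k): 40, 80, ... up to 10240
 * jiffies (~10 s) before the cap silences us completely. A quiet
 * period of ip_rt_redirect_silence = 20 << 10 = 20480 jiffies (~20 s)
 * then resets the counters in ip_rt_send_redirect() below.
 */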

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
                peer->rate_tokens = 0;
                peer->n_redirects = 0;
        }

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the time of the last seen redirected packet.
         */
        if (peer->n_redirects >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect. The backoff is keyed on n_redirects rather than
         * rate_tokens, since rate_tokens is also consumed by the
         * token bucket in ip_error() and mixing the two would make
         * the exponential backoff erratic.
         */
        if (peer->n_redirects == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->n_redirects)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->n_redirects == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        if (netif_is_l3_master(skb->dev)) {
                dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
                if (!dev)
                        goto out;
        }

        in_dev = __in_dev_get_rcu(dev);

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        u32 old_mtu = ipv4_mtu(dst);
        struct fib_result res;
        bool lock = false;

        if (ip_mtu_locked(dst))
                return;

        if (old_mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu) {
                lock = true;
                mtu = min(old_mtu, ip_rt_min_pmtu);
        }

        if (rt->rt_pmtu == mtu && !lock &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh_common *nhc = FIB_RES_NHC(res);

                update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        u32 mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD.
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}
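
/* For reference, a sketch of the caller side (dst_check() in
 * include/net/dst.h):
 *
 *      static inline struct dst_entry *dst_check(struct dst_entry *dst,
 *                                                u32 cookie)
 *      {
 *              if (dst->obsolete)
 *                      dst = dst->ops->check(dst, cookie);
 *              return dst;
 *      }
 *
 * Since IPv4 dsts start with the nonzero DST_OBSOLETE_FORCE_CHK, every
 * cached route funnels through ipv4_dst_check() above, which returns
 * NULL once the entry has been killed or its generation has expired.
 */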

static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
        struct ip_options opt;
        int res;

        /* Recompile ip options since IPCB may not be valid anymore.
         * Also check we have a reasonable ipv4 header.
         */
        if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
            ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
                return;

        memset(&opt, 0, sizeof(opt));
        if (ip_hdr(skb)->ihl > 5) {
                if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
                        return;
                opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

                rcu_read_lock();
                res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
                rcu_read_unlock();

                if (res)
                        return;
        }
        __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        ipv4_send_dest_unreach(skb);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it stays out of the fast path.

   BTW remember: "addr" may be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct iphdr *iph = ip_hdr(skb);
                struct flowi4 fl4 = {
                        .daddr = iph->daddr,
                        .saddr = iph->saddr,
                        .flowi4_tos = RT_TOS(iph->tos),
                        .flowi4_oif = rt->dst.dev->ifindex,
                        .flowi4_iif = skb->dev->ifindex,
                        .flowi4_mark = skb->mark,
                };

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
                                    ip_rt_min_advmss);

        return min(advmss, IPV4_MAX_PMTU - header_size);
}
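
/* Worked example: for a dst with an effective MTU of 1500,
 * header_size = 20 (IP) + 20 (TCP) = 40, so the advertised MSS is
 * max(1500 - 40, 256) = 1460, well under the IPV4_MAX_PMTU - 40 =
 * 65495 cap; the ip_rt_min_advmss floor of 256 only matters for
 * pathologically small MTUs.
 */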

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = READ_ONCE(dst->dev->mtu);

        if (unlikely(ip_mtu_locked(dst))) {
                if (rt->rt_gw_family && mtu > 576)
                        mtu = 576;
        }

        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference_protected(nhc->nhc_exceptions,
                                         lockdep_is_held(&fnhe_lock));
        hash += hval;

        fnhe_p = &hash->chain;
        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
        while (fnhe) {
                if (fnhe->fnhe_daddr == daddr) {
                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
                        /* set fnhe_daddr to 0 to ensure it won't bind with
                         * new dsts in rt_bind_exception().
                         */
                        fnhe->fnhe_daddr = 0;
                        fnhe_flush_routes(fnhe);
                        kfree_rcu(fnhe, rcu);
                        break;
                }
                fnhe_p = &fnhe->fnhe_next;
                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
                                                 lockdep_is_held(&fnhe_lock));
        }

        spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
                                               __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr) {
                        if (fnhe->fnhe_expires &&
                            time_after(jiffies, fnhe->fnhe_expires)) {
                                ip_del_fnhe(nhc, daddr);
                                break;
                        }
                        return fnhe;
                }
        }
        return NULL;
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */
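
/* Illustrative example of the precedence (made-up numbers): a route
 * whose metrics lock the MTU at 1400 reports 1400 even if a nexthop
 * exception has learned a PMTU of 1300; without the lock, the
 * unexpired exception's 1300 wins; with neither, the egress device's
 * mtu (say 1500, clamped to IP_MAX_MTU) is used, always minus any
 * lwtunnel encapsulation headroom.
 */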
1390
1391u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1392{
1393        struct fib_nh_common *nhc = res->nhc;
1394        struct net_device *dev = nhc->nhc_dev;
1395        struct fib_info *fi = res->fi;
1396        u32 mtu = 0;
1397
1398        if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1399            fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1400                mtu = fi->fib_mtu;
1401
1402        if (likely(!mtu)) {
1403                struct fib_nh_exception *fnhe;
1404
1405                fnhe = find_exception(nhc, daddr);
1406                if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1407                        mtu = fnhe->fnhe_pmtu;
1408        }
1409
1410        if (likely(!mtu))
1411                mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1412
1413        return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1414}
1415
1416static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1417                              __be32 daddr, const bool do_cache)
1418{
1419        bool ret = false;
1420
1421        spin_lock_bh(&fnhe_lock);
1422
1423        if (daddr == fnhe->fnhe_daddr) {
1424                struct rtable __rcu **porig;
1425                struct rtable *orig;
1426                int genid = fnhe_genid(dev_net(rt->dst.dev));
1427
1428                if (rt_is_input_route(rt))
1429                        porig = &fnhe->fnhe_rth_input;
1430                else
1431                        porig = &fnhe->fnhe_rth_output;
1432                orig = rcu_dereference(*porig);
1433
1434                if (fnhe->fnhe_genid != genid) {
1435                        fnhe->fnhe_genid = genid;
1436                        fnhe->fnhe_gw = 0;
1437                        fnhe->fnhe_pmtu = 0;
1438                        fnhe->fnhe_expires = 0;
1439                        fnhe->fnhe_mtu_locked = false;
1440                        fnhe_flush_routes(fnhe);
1441                        orig = NULL;
1442                }
1443                fill_route_from_fnhe(rt, fnhe);
1444                if (!rt->rt_gw4) {
1445                        rt->rt_gw4 = daddr;
1446                        rt->rt_gw_family = AF_INET;
1447                }
1448
1449                if (do_cache) {
1450                        dst_hold(&rt->dst);
1451                        rcu_assign_pointer(*porig, rt);
1452                        if (orig) {
1453                                dst_dev_put(&orig->dst);
1454                                dst_release(&orig->dst);
1455                        }
1456                        ret = true;
1457                }
1458
1459                fnhe->fnhe_stamp = jiffies;
1460        }
1461        spin_unlock_bh(&fnhe_lock);
1462
1463        return ret;
1464}
1465
1466static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1467{
1468        struct rtable *orig, *prev, **p;
1469        bool ret = true;
1470
1471        if (rt_is_input_route(rt)) {
1472                p = (struct rtable **)&nhc->nhc_rth_input;
1473        } else {
1474                p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1475        }
1476        orig = *p;
1477
1478        /* hold dst before doing cmpxchg() to avoid race condition
1479         * on this dst
1480         */
1481        dst_hold(&rt->dst);
1482        prev = cmpxchg(p, orig, rt);
1483        if (prev == orig) {
1484                if (orig) {
1485                        dst_dev_put(&orig->dst);
1486                        dst_release(&orig->dst);
1487                }
1488        } else {
1489                dst_release(&rt->dst);
1490                ret = false;
1491        }
1492
1493        return ret;
1494}
1495
1496struct uncached_list {
1497        spinlock_t              lock;
1498        struct list_head        head;
1499};
1500
1501static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1502
1503void rt_add_uncached_list(struct rtable *rt)
1504{
1505        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1506
1507        rt->rt_uncached_list = ul;
1508
1509        spin_lock_bh(&ul->lock);
1510        list_add_tail(&rt->rt_uncached, &ul->head);
1511        spin_unlock_bh(&ul->lock);
1512}
1513
1514void rt_del_uncached_list(struct rtable *rt)
1515{
1516        if (!list_empty(&rt->rt_uncached)) {
1517                struct uncached_list *ul = rt->rt_uncached_list;
1518
1519                spin_lock_bh(&ul->lock);
1520                list_del(&rt->rt_uncached);
1521                spin_unlock_bh(&ul->lock);
1522        }
1523}
1524
1525static void ipv4_dst_destroy(struct dst_entry *dst)
1526{
1527        struct rtable *rt = (struct rtable *)dst;
1528
1529        ip_dst_metrics_put(dst);
1530        rt_del_uncached_list(rt);
1531}
1532
1533void rt_flush_dev(struct net_device *dev)
1534{
1535        struct rtable *rt;
1536        int cpu;
1537
1538        for_each_possible_cpu(cpu) {
1539                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1540
1541                spin_lock_bh(&ul->lock);
1542                list_for_each_entry(rt, &ul->head, rt_uncached) {
1543                        if (rt->dst.dev != dev)
1544                                continue;
1545                        rt->dst.dev = blackhole_netdev;
1546                        dev_hold(rt->dst.dev);
1547                        dev_put(dev);
1548                }
1549                spin_unlock_bh(&ul->lock);
1550        }
1551}
1552
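    /* A cached route may only be reused while it is still marked
     * DST_OBSOLETE_FORCE_CHK and its generation id matches the current
     * one (see rt_is_expired()).
     */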
1553static bool rt_cache_valid(const struct rtable *rt)
1554{
1555        return  rt &&
1556                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1557                !rt_is_expired(rt);
1558}
1559
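    /* Copy nexthop state (gateway, metrics, classid, lwtstate) from the
     * FIB result into @rt, then try to cache the route: in the nexthop
     * exception if one was given, otherwise in the FIB nexthop itself.
     * If caching fails, the route goes on the per-CPU uncached list.
     */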
1560static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1561                           const struct fib_result *res,
1562                           struct fib_nh_exception *fnhe,
1563                           struct fib_info *fi, u16 type, u32 itag,
1564                           const bool do_cache)
1565{
1566        bool cached = false;
1567
1568        if (fi) {
1569                struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1570
1571                if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1572                        rt->rt_gw_family = nhc->nhc_gw_family;
1573                        /* only INET and INET6 are supported */
1574                        if (likely(nhc->nhc_gw_family == AF_INET))
1575                                rt->rt_gw4 = nhc->nhc_gw.ipv4;
1576                        else
1577                                rt->rt_gw6 = nhc->nhc_gw.ipv6;
1578                }
1579
1580                ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1581
1582#ifdef CONFIG_IP_ROUTE_CLASSID
1583                if (nhc->nhc_family == AF_INET) {
1584                        struct fib_nh *nh;
1585
1586                        nh = container_of(nhc, struct fib_nh, nh_common);
1587                        rt->dst.tclassid = nh->nh_tclassid;
1588                }
1589#endif
1590                rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1591                if (unlikely(fnhe))
1592                        cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1593                else if (do_cache)
1594                        cached = rt_cache_route(nhc, rt);
1595                if (unlikely(!cached)) {
1596                        /* Routes we intend to cache in nexthop exception or
1597                         * FIB nexthop have the DST_NOCACHE bit clear.
1598                         * However, if we are unsuccessful at storing this
1599                         * route into the cache we really need to set it.
1600                         */
1601                        if (!rt->rt_gw4) {
1602                                rt->rt_gw_family = AF_INET;
1603                                rt->rt_gw4 = daddr;
1604                        }
1605                        rt_add_uncached_list(rt);
1606                }
1607        } else
1608                rt_add_uncached_list(rt);
1609
1610#ifdef CONFIG_IP_ROUTE_CLASSID
1611#ifdef CONFIG_IP_MULTIPLE_TABLES
1612        set_class_tag(rt, res->tclassid);
1613#endif
1614        set_class_tag(rt, itag);
1615#endif
1616}
1617
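    /* Allocate and minimally initialize an rtable.  Note that DST_HOST is
     * only set when the route will not be cached, and that dst.input is
     * switched to local delivery when RTCF_LOCAL is requested.
     */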
1618struct rtable *rt_dst_alloc(struct net_device *dev,
1619                            unsigned int flags, u16 type,
1620                            bool nopolicy, bool noxfrm, bool will_cache)
1621{
1622        struct rtable *rt;
1623
1624        rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1625                       (will_cache ? 0 : DST_HOST) |
1626                       (nopolicy ? DST_NOPOLICY : 0) |
1627                       (noxfrm ? DST_NOXFRM : 0));
1628
1629        if (rt) {
1630                rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1631                rt->rt_flags = flags;
1632                rt->rt_type = type;
1633                rt->rt_is_input = 0;
1634                rt->rt_iif = 0;
1635                rt->rt_pmtu = 0;
1636                rt->rt_mtu_locked = 0;
1637                rt->rt_gw_family = 0;
1638                rt->rt_gw4 = 0;
1639                INIT_LIST_HEAD(&rt->rt_uncached);
1640
1641                rt->dst.output = ip_output;
1642                if (flags & RTCF_LOCAL)
1643                        rt->dst.input = ip_local_deliver;
1644        }
1645
1646        return rt;
1647}
1648EXPORT_SYMBOL(rt_dst_alloc);
1649
1650struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1651{
1652        struct rtable *new_rt;
1653
1654        new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1655                           rt->dst.flags);
1656
1657        if (new_rt) {
1658                new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1659                new_rt->rt_flags = rt->rt_flags;
1660                new_rt->rt_type = rt->rt_type;
1661                new_rt->rt_is_input = rt->rt_is_input;
1662                new_rt->rt_iif = rt->rt_iif;
1663                new_rt->rt_pmtu = rt->rt_pmtu;
1664                new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1665                new_rt->rt_gw_family = rt->rt_gw_family;
1666                if (rt->rt_gw_family == AF_INET)
1667                        new_rt->rt_gw4 = rt->rt_gw4;
1668                else if (rt->rt_gw_family == AF_INET6)
1669                        new_rt->rt_gw6 = rt->rt_gw6;
1670                INIT_LIST_HEAD(&new_rt->rt_uncached);
1671
1672                new_rt->dst.flags |= DST_HOST;
1673                new_rt->dst.input = rt->dst.input;
1674                new_rt->dst.output = rt->dst.output;
1675                new_rt->dst.error = rt->dst.error;
1676                new_rt->dst.lastuse = jiffies;
1677                new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1678        }
1679        return new_rt;
1680}
1681EXPORT_SYMBOL(rt_dst_clone);
1682
1683/* called in rcu_read_lock() section */
1684int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1685                          u8 tos, struct net_device *dev,
1686                          struct in_device *in_dev, u32 *itag)
1687{
1688        int err;
1689
1690        /* Primary sanity checks. */
1691        if (!in_dev)
1692                return -EINVAL;
1693
1694        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1695            skb->protocol != htons(ETH_P_IP))
1696                return -EINVAL;
1697
1698        if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1699                return -EINVAL;
1700
1701        if (ipv4_is_zeronet(saddr)) {
1702                if (!ipv4_is_local_multicast(daddr) &&
1703                    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1704                        return -EINVAL;
1705        } else {
1706                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1707                                          in_dev, itag);
1708                if (err < 0)
1709                        return err;
1710        }
1711        return 0;
1712}
1713
1714/* called in rcu_read_lock() section */
1715static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1716                             u8 tos, struct net_device *dev, int our)
1717{
1718        struct in_device *in_dev = __in_dev_get_rcu(dev);
1719        unsigned int flags = RTCF_MULTICAST;
1720        struct rtable *rth;
1721        u32 itag = 0;
1722        int err;
1723
1724        err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1725        if (err)
1726                return err;
1727
1728        if (our)
1729                flags |= RTCF_LOCAL;
1730
1731        rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1732                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1733        if (!rth)
1734                return -ENOBUFS;
1735
1736#ifdef CONFIG_IP_ROUTE_CLASSID
1737        rth->dst.tclassid = itag;
1738#endif
1739        rth->dst.output = ip_rt_bug;
1740        rth->rt_is_input = 1;
1741
1742#ifdef CONFIG_IP_MROUTE
1743        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1744                rth->dst.input = ip_mr_input;
1745#endif
1746        RT_CACHE_STAT_INC(in_slow_mc);
1747
1748        skb_dst_set(skb, &rth->dst);
1749        return 0;
1750}
1751
1752
1753static void ip_handle_martian_source(struct net_device *dev,
1754                                     struct in_device *in_dev,
1755                                     struct sk_buff *skb,
1756                                     __be32 daddr,
1757                                     __be32 saddr)
1758{
1759        RT_CACHE_STAT_INC(in_martian_src);
1760#ifdef CONFIG_IP_ROUTE_VERBOSE
1761        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1762                /*
1763                 *      RFC1812 recommendation: if the source is martian,
1764                 *      the only hint is the MAC header.
1765                 */
1766                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1767                        &daddr, &saddr, dev->name);
1768                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1769                        print_hex_dump(KERN_WARNING, "ll header: ",
1770                                       DUMP_PREFIX_OFFSET, 16, 1,
1771                                       skb_mac_header(skb),
1772                                       dev->hard_header_len, false);
1773                }
1774        }
1775#endif
1776}
1777
1778/* called in rcu_read_lock() section */
1779static int __mkroute_input(struct sk_buff *skb,
1780                           const struct fib_result *res,
1781                           struct in_device *in_dev,
1782                           __be32 daddr, __be32 saddr, u32 tos)
1783{
1784        struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1785        struct net_device *dev = nhc->nhc_dev;
1786        struct fib_nh_exception *fnhe;
1787        struct rtable *rth;
1788        int err;
1789        struct in_device *out_dev;
1790        bool do_cache;
1791        u32 itag = 0;
1792
1793        /* get a working reference to the output device */
1794        out_dev = __in_dev_get_rcu(dev);
1795        if (!out_dev) {
1796                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1797                return -EINVAL;
1798        }
1799
1800        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1801                                  in_dev->dev, in_dev, &itag);
1802        if (err < 0) {
1803                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1804                                         saddr);
1805
1806                goto cleanup;
1807        }
1808
1809        do_cache = res->fi && !itag;
1810        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1811            skb->protocol == htons(ETH_P_IP)) {
1812                __be32 gw;
1813
1814                gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1815                if (IN_DEV_SHARED_MEDIA(out_dev) ||
1816                    inet_addr_onlink(out_dev, saddr, gw))
1817                        IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1818        }
1819
1820        if (skb->protocol != htons(ETH_P_IP)) {
1821                /* Not IP (i.e. ARP). Do not create a route if it is
1822                 * invalid for proxy ARP. DNAT routes are always valid.
1823                 *
1824                 * The proxy ARP feature has been extended to allow ARP
1825                 * replies back out the same interface, to support
1826                 * private VLAN switch technologies. See arp.c.
1827                 */
1828                if (out_dev == in_dev &&
1829                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1830                        err = -EINVAL;
1831                        goto cleanup;
1832                }
1833        }
1834
1835        fnhe = find_exception(nhc, daddr);
1836        if (do_cache) {
1837                if (fnhe)
1838                        rth = rcu_dereference(fnhe->fnhe_rth_input);
1839                else
1840                        rth = rcu_dereference(nhc->nhc_rth_input);
1841                if (rt_cache_valid(rth)) {
1842                        skb_dst_set_noref(skb, &rth->dst);
1843                        goto out;
1844                }
1845        }
1846
1847        rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1848                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
1849                           IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1850        if (!rth) {
1851                err = -ENOBUFS;
1852                goto cleanup;
1853        }
1854
1855        rth->rt_is_input = 1;
1856        RT_CACHE_STAT_INC(in_slow_tot);
1857
1858        rth->dst.input = ip_forward;
1859
1860        rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1861                       do_cache);
1862        lwtunnel_set_redirect(&rth->dst);
1863        skb_dst_set(skb, &rth->dst);
1864out:
1865        err = 0;
1866 cleanup:
1867        return err;
1868}
1869
1870#ifdef CONFIG_IP_ROUTE_MULTIPATH
1871/* To make ICMP packets follow the right flow, the multipath hash is
1872 * calculated from the inner IP addresses.
1873 */
1874static void ip_multipath_l3_keys(const struct sk_buff *skb,
1875                                 struct flow_keys *hash_keys)
1876{
1877        const struct iphdr *outer_iph = ip_hdr(skb);
1878        const struct iphdr *key_iph = outer_iph;
1879        const struct iphdr *inner_iph;
1880        const struct icmphdr *icmph;
1881        struct iphdr _inner_iph;
1882        struct icmphdr _icmph;
1883
1884        if (likely(outer_iph->protocol != IPPROTO_ICMP))
1885                goto out;
1886
1887        if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1888                goto out;
1889
1890        icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1891                                   &_icmph);
1892        if (!icmph)
1893                goto out;
1894
1895        if (icmph->type != ICMP_DEST_UNREACH &&
1896            icmph->type != ICMP_REDIRECT &&
1897            icmph->type != ICMP_TIME_EXCEEDED &&
1898            icmph->type != ICMP_PARAMETERPROB)
1899                goto out;
1900
1901        inner_iph = skb_header_pointer(skb,
1902                                       outer_iph->ihl * 4 + sizeof(_icmph),
1903                                       sizeof(_inner_iph), &_inner_iph);
1904        if (!inner_iph)
1905                goto out;
1906
1907        key_iph = inner_iph;
1908out:
1909        hash_keys->addrs.v4addrs.src = key_iph->saddr;
1910        hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1911}
1912
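    /* The hash policy is selected by the net.ipv4.fib_multipath_hash_policy
     * sysctl:
     *   0 - layer-3 (source/destination addresses; for ICMP errors the
     *       inner packet's addresses, see ip_multipath_l3_keys()),
     *   1 - layer-4 five-tuple,
     *   2 - layer-3, or the inner headers when forwarding an
     *       encapsulated packet.
     */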
1913/* if skb is set it will be used and fl4 can be NULL */
1914int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1915                       const struct sk_buff *skb, struct flow_keys *flkeys)
1916{
1917        u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1918        struct flow_keys hash_keys;
1919        u32 mhash;
1920
1921        switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1922        case 0:
1923                memset(&hash_keys, 0, sizeof(hash_keys));
1924                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1925                if (skb) {
1926                        ip_multipath_l3_keys(skb, &hash_keys);
1927                } else {
1928                        hash_keys.addrs.v4addrs.src = fl4->saddr;
1929                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
1930                }
1931                break;
1932        case 1:
1933                /* skb is currently provided only when forwarding */
1934                if (skb) {
1935                        unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1936                        struct flow_keys keys;
1937
1938                        /* short-circuit if we already have L4 hash present */
1939                        if (skb->l4_hash)
1940                                return skb_get_hash_raw(skb) >> 1;
1941
1942                        memset(&hash_keys, 0, sizeof(hash_keys));
1943
1944                        if (!flkeys) {
1945                                skb_flow_dissect_flow_keys(skb, &keys, flag);
1946                                flkeys = &keys;
1947                        }
1948
1949                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1950                        hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1951                        hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1952                        hash_keys.ports.src = flkeys->ports.src;
1953                        hash_keys.ports.dst = flkeys->ports.dst;
1954                        hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1955                } else {
1956                        memset(&hash_keys, 0, sizeof(hash_keys));
1957                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1958                        hash_keys.addrs.v4addrs.src = fl4->saddr;
1959                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
1960                        hash_keys.ports.src = fl4->fl4_sport;
1961                        hash_keys.ports.dst = fl4->fl4_dport;
1962                        hash_keys.basic.ip_proto = fl4->flowi4_proto;
1963                }
1964                break;
1965        case 2:
1966                memset(&hash_keys, 0, sizeof(hash_keys));
1967                /* skb is currently provided only when forwarding */
1968                if (skb) {
1969                        struct flow_keys keys;
1970
1971                        skb_flow_dissect_flow_keys(skb, &keys, 0);
1972                        /* Inner can be v4 or v6 */
1973                        if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1974                                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1975                                hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1976                                hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1977                        } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1978                                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1979                                hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1980                                hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1981                                hash_keys.tags.flow_label = keys.tags.flow_label;
1982                                hash_keys.basic.ip_proto = keys.basic.ip_proto;
1983                        } else {
1984                                /* Same as case 0 */
1985                                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1986                                ip_multipath_l3_keys(skb, &hash_keys);
1987                        }
1988                } else {
1989                        /* Same as case 0 */
1990                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1991                        hash_keys.addrs.v4addrs.src = fl4->saddr;
1992                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
1993                }
1994                break;
1995        }
1996        mhash = flow_hash_from_keys(&hash_keys);
1997
1998        if (multipath_hash)
1999                mhash = jhash_2words(mhash, multipath_hash, 0);
2000
2001        return mhash >> 1;
2002}
2003#endif /* CONFIG_IP_ROUTE_MULTIPATH */
2004
2005static int ip_mkroute_input(struct sk_buff *skb,
2006                            struct fib_result *res,
2007                            struct in_device *in_dev,
2008                            __be32 daddr, __be32 saddr, u32 tos,
2009                            struct flow_keys *hkeys)
2010{
2011#ifdef CONFIG_IP_ROUTE_MULTIPATH
2012        if (res->fi && fib_info_num_path(res->fi) > 1) {
2013                int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2014
2015                fib_select_multipath(res, h);
2016        }
2017#endif
2018
2019        /* create a routing cache entry */
2020        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2021}
2022
2023/*
2024 *      NOTE. We drop all packets that have a local source
2025 *      address, because every properly looped-back packet
2026 *      must already have the correct destination attached by the output routine.
2027 *
2028 *      This approach solves two big problems:
2029 *      1. Non-simplex devices are handled properly.
2030 *      2. IP spoofing attempts are filtered with a 100% guarantee.
2031 *      Called with rcu_read_lock().
2032 */
2033
2034static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2035                               u8 tos, struct net_device *dev,
2036                               struct fib_result *res)
2037{
2038        struct in_device *in_dev = __in_dev_get_rcu(dev);
2039        struct flow_keys *flkeys = NULL, _flkeys;
2040        struct net    *net = dev_net(dev);
2041        struct ip_tunnel_info *tun_info;
2042        int             err = -EINVAL;
2043        unsigned int    flags = 0;
2044        u32             itag = 0;
2045        struct rtable   *rth;
2046        struct flowi4   fl4;
2047        bool do_cache = true;
2048
2049        /* IP on this device is disabled. */
2050
2051        if (!in_dev)
2052                goto out;
2053
2054        /* Check for the weirdest martians, which cannot be detected
2055           by fib_lookup.
2056         */
2057
2058        tun_info = skb_tunnel_info(skb);
2059        if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2060                fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2061        else
2062                fl4.flowi4_tun_key.tun_id = 0;
2063        skb_dst_drop(skb);
2064
2065        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2066                goto martian_source;
2067
2068        res->fi = NULL;
2069        res->table = NULL;
2070        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2071                goto brd_input;
2072
2073        /* Accept zero addresses only for limited broadcast;
2074         * I do not even know whether to fix this or not. Waiting for complaints :-)
2075         */
2076        if (ipv4_is_zeronet(saddr))
2077                goto martian_source;
2078
2079        if (ipv4_is_zeronet(daddr))
2080                goto martian_destination;
2081
2082        /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2083         * calling it at most once when daddr and/or saddr is a loopback address.
2084         */
2085        if (ipv4_is_loopback(daddr)) {
2086                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2087                        goto martian_destination;
2088        } else if (ipv4_is_loopback(saddr)) {
2089                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2090                        goto martian_source;
2091        }
2092
2093        /*
2094         *      Now we are ready to route packet.
2095         */
2096        fl4.flowi4_oif = 0;
2097        fl4.flowi4_iif = dev->ifindex;
2098        fl4.flowi4_mark = skb->mark;
2099        fl4.flowi4_tos = tos;
2100        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2101        fl4.flowi4_flags = 0;
2102        fl4.daddr = daddr;
2103        fl4.saddr = saddr;
2104        fl4.flowi4_uid = sock_net_uid(net, NULL);
2105
2106        if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2107                flkeys = &_flkeys;
2108        } else {
2109                fl4.flowi4_proto = 0;
2110                fl4.fl4_sport = 0;
2111                fl4.fl4_dport = 0;
2112        }
2113
2114        err = fib_lookup(net, &fl4, res, 0);
2115        if (err != 0) {
2116                if (!IN_DEV_FORWARD(in_dev))
2117                        err = -EHOSTUNREACH;
2118                goto no_route;
2119        }
2120
2121        if (res->type == RTN_BROADCAST) {
2122                if (IN_DEV_BFORWARD(in_dev))
2123                        goto make_route;
2124                /* do not cache if bc_forwarding is enabled */
2125                if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2126                        do_cache = false;
2127                goto brd_input;
2128        }
2129
2130        if (res->type == RTN_LOCAL) {
2131                err = fib_validate_source(skb, saddr, daddr, tos,
2132                                          0, dev, in_dev, &itag);
2133                if (err < 0)
2134                        goto martian_source;
2135                goto local_input;
2136        }
2137
2138        if (!IN_DEV_FORWARD(in_dev)) {
2139                err = -EHOSTUNREACH;
2140                goto no_route;
2141        }
2142        if (res->type != RTN_UNICAST)
2143                goto martian_destination;
2144
2145make_route:
2146        err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2147out:    return err;
2148
2149brd_input:
2150        if (skb->protocol != htons(ETH_P_IP))
2151                goto e_inval;
2152
2153        if (!ipv4_is_zeronet(saddr)) {
2154                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2155                                          in_dev, &itag);
2156                if (err < 0)
2157                        goto martian_source;
2158        }
2159        flags |= RTCF_BROADCAST;
2160        res->type = RTN_BROADCAST;
2161        RT_CACHE_STAT_INC(in_brd);
2162
2163local_input:
2164        do_cache &= res->fi && !itag;
2165        if (do_cache) {
2166                struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2167
2168                rth = rcu_dereference(nhc->nhc_rth_input);
2169                if (rt_cache_valid(rth)) {
2170                        skb_dst_set_noref(skb, &rth->dst);
2171                        err = 0;
2172                        goto out;
2173                }
2174        }
2175
2176        rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2177                           flags | RTCF_LOCAL, res->type,
2178                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2179        if (!rth)
2180                goto e_nobufs;
2181
2182        rth->dst.output = ip_rt_bug;
2183#ifdef CONFIG_IP_ROUTE_CLASSID
2184        rth->dst.tclassid = itag;
2185#endif
2186        rth->rt_is_input = 1;
2187
2188        RT_CACHE_STAT_INC(in_slow_tot);
2189        if (res->type == RTN_UNREACHABLE) {
2190                rth->dst.input = ip_error;
2191                rth->dst.error = -err;
2192                rth->rt_flags   &= ~RTCF_LOCAL;
2193        }
2194
2195        if (do_cache) {
2196                struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2197
2198                rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2199                if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2200                        WARN_ON(rth->dst.input == lwtunnel_input);
2201                        rth->dst.lwtstate->orig_input = rth->dst.input;
2202                        rth->dst.input = lwtunnel_input;
2203                }
2204
2205                if (unlikely(!rt_cache_route(nhc, rth)))
2206                        rt_add_uncached_list(rth);
2207        }
2208        skb_dst_set(skb, &rth->dst);
2209        err = 0;
2210        goto out;
2211
2212no_route:
2213        RT_CACHE_STAT_INC(in_no_route);
2214        res->type = RTN_UNREACHABLE;
2215        res->fi = NULL;
2216        res->table = NULL;
2217        goto local_input;
2218
2219        /*
2220         *      Do not cache martian addresses: they should be logged (RFC1812)
2221         */
2222martian_destination:
2223        RT_CACHE_STAT_INC(in_martian_dst);
2224#ifdef CONFIG_IP_ROUTE_VERBOSE
2225        if (IN_DEV_LOG_MARTIANS(in_dev))
2226                net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2227                                     &daddr, &saddr, dev->name);
2228#endif
2229
2230e_inval:
2231        err = -EINVAL;
2232        goto out;
2233
2234e_nobufs:
2235        err = -ENOBUFS;
2236        goto out;
2237
2238martian_source:
2239        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2240        goto out;
2241}
2242
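    /* A minimal usage sketch, mirroring roughly how the IPv4 receive path
     * calls this (the surrounding caller context is illustrative only):
     *
     *	const struct iphdr *iph = ip_hdr(skb);
     *	int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
     *				       iph->tos, skb->dev);
     *	if (unlikely(err))
     *		goto drop;
     *
     * On success the resolved dst has already been attached to the skb by
     * the routing code itself.
     */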
2243int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2244                         u8 tos, struct net_device *dev)
2245{
2246        struct fib_result res;
2247        int err;
2248
2249        tos &= IPTOS_RT_MASK;
2250        rcu_read_lock();
2251        err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2252        rcu_read_unlock();
2253
2254        return err;
2255}
2256EXPORT_SYMBOL(ip_route_input_noref);
2257
2258/* called with rcu_read_lock held */
2259int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2260                       u8 tos, struct net_device *dev, struct fib_result *res)
2261{
2262        /* Multicast recognition logic was moved from the route cache to here.
2263           The problem was that too many Ethernet cards have broken/missing
2264           hardware multicast filters :-( As a result, a host on a multicast
2265           network acquires a lot of useless route cache entries, e.g. for
2266           SDR messages from all over the world. Now we try to get rid of them.
2267           Really, provided the software IP multicast filter is organized
2268           reasonably (at least, hashed), this does not cause a slowdown
2269           compared with route cache reject entries.
2270           Note that multicast routers are not affected, because a
2271           route cache entry is created eventually.
2272         */
2273        if (ipv4_is_multicast(daddr)) {
2274                struct in_device *in_dev = __in_dev_get_rcu(dev);
2275                int our = 0;
2276                int err = -EINVAL;
2277
2278                if (!in_dev)
2279                        return err;
2280                our = ip_check_mc_rcu(in_dev, daddr, saddr,
2281                                      ip_hdr(skb)->protocol);
2282
2283                /* check l3 master if no match yet */
2284                if (!our && netif_is_l3_slave(dev)) {
2285                        struct in_device *l3_in_dev;
2286
2287                        l3_in_dev = __in_dev_get_rcu(skb->dev);
2288                        if (l3_in_dev)
2289                                our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2290                                                      ip_hdr(skb)->protocol);
2291                }
2292
2293                if (our
2294#ifdef CONFIG_IP_MROUTE
2295                        ||
2296                    (!ipv4_is_local_multicast(daddr) &&
2297                     IN_DEV_MFORWARD(in_dev))
2298#endif
2299                   ) {
2300                        err = ip_route_input_mc(skb, daddr, saddr,
2301                                                tos, dev, our);
2302                }
2303                return err;
2304        }
2305
2306        return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2307}
2308
2309/* called with rcu_read_lock() */
2310static struct rtable *__mkroute_output(const struct fib_result *res,
2311                                       const struct flowi4 *fl4, int orig_oif,
2312                                       struct net_device *dev_out,
2313                                       unsigned int flags)
2314{
2315        struct fib_info *fi = res->fi;
2316        struct fib_nh_exception *fnhe;
2317        struct in_device *in_dev;
2318        u16 type = res->type;
2319        struct rtable *rth;
2320        bool do_cache;
2321
2322        in_dev = __in_dev_get_rcu(dev_out);
2323        if (!in_dev)
2324                return ERR_PTR(-EINVAL);
2325
2326        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2327                if (ipv4_is_loopback(fl4->saddr) &&
2328                    !(dev_out->flags & IFF_LOOPBACK) &&
2329                    !netif_is_l3_master(dev_out))
2330                        return ERR_PTR(-EINVAL);
2331
2332        if (ipv4_is_lbcast(fl4->daddr))
2333                type = RTN_BROADCAST;
2334        else if (ipv4_is_multicast(fl4->daddr))
2335                type = RTN_MULTICAST;
2336        else if (ipv4_is_zeronet(fl4->daddr))
2337                return ERR_PTR(-EINVAL);
2338
2339        if (dev_out->flags & IFF_LOOPBACK)
2340                flags |= RTCF_LOCAL;
2341
2342        do_cache = true;
2343        if (type == RTN_BROADCAST) {
2344                flags |= RTCF_BROADCAST | RTCF_LOCAL;
2345                fi = NULL;
2346        } else if (type == RTN_MULTICAST) {
2347                flags |= RTCF_MULTICAST | RTCF_LOCAL;
2348                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2349                                     fl4->flowi4_proto))
2350                        flags &= ~RTCF_LOCAL;
2351                else
2352                        do_cache = false;
2353                /* If a multicast route does not exist, use the
2354                 * default one, but do not use a gateway in this case.
2355                 * Yes, it is a hack.
2356                 */
2357                if (fi && res->prefixlen < 4)
2358                        fi = NULL;
2359        } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2360                   (orig_oif != dev_out->ifindex)) {
2361                /* For local routes that require a particular output interface
2362                 * we do not want to cache the result.  Caching the result
2363                 * causes incorrect behaviour when there are multiple source
2364                 * addresses on the interface, the end result being that if the
2365                 * intended recipient is waiting on that interface for the
2366                 * packet he won't receive it because it will be delivered on
2367                 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2368                 * be set to the loopback interface as well.
2369                 */
2370                do_cache = false;
2371        }
2372
2373        fnhe = NULL;
2374        do_cache &= fi != NULL;
2375        if (fi) {
2376                struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2377                struct rtable __rcu **prth;
2378
2379                fnhe = find_exception(nhc, fl4->daddr);
2380                if (!do_cache)
2381                        goto add;
2382                if (fnhe) {
2383                        prth = &fnhe->fnhe_rth_output;
2384                } else {
2385                        if (unlikely(fl4->flowi4_flags &
2386                                     FLOWI_FLAG_KNOWN_NH &&
2387                                     !(nhc->nhc_gw_family &&
2388                                       nhc->nhc_scope == RT_SCOPE_LINK))) {
2389                                do_cache = false;
2390                                goto add;
2391                        }
2392                        prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2393                }
2394                rth = rcu_dereference(*prth);
2395                if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2396                        return rth;
2397        }
2398
2399add:
2400        rth = rt_dst_alloc(dev_out, flags, type,
2401                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
2402                           IN_DEV_CONF_GET(in_dev, NOXFRM),
2403                           do_cache);
2404        if (!rth)
2405                return ERR_PTR(-ENOBUFS);
2406
2407        rth->rt_iif = orig_oif;
2408
2409        RT_CACHE_STAT_INC(out_slow_tot);
2410
2411        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2412                if (flags & RTCF_LOCAL &&
2413                    !(dev_out->flags & IFF_LOOPBACK)) {
2414                        rth->dst.output = ip_mc_output;
2415                        RT_CACHE_STAT_INC(out_slow_mc);
2416                }
2417#ifdef CONFIG_IP_MROUTE
2418                if (type == RTN_MULTICAST) {
2419                        if (IN_DEV_MFORWARD(in_dev) &&
2420                            !ipv4_is_local_multicast(fl4->daddr)) {
2421                                rth->dst.input = ip_mr_input;
2422                                rth->dst.output = ip_mc_output;
2423                        }
2424                }
2425#endif
2426        }
2427
2428        rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2429        lwtunnel_set_redirect(&rth->dst);
2430
2431        return rth;
2432}
2433
2434/*
2435 * Major route resolver routine.
2436 */
2437
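    /* A minimal usage sketch (illustrative; most callers reach this
     * through the ip_route_output_key()/__ip_route_output_key() wrappers,
     * which pass a NULL skb):
     *
     *	struct flowi4 fl4 = {};
     *	struct rtable *rt;
     *
     *	fl4.daddr = daddr;
     *	fl4.flowi4_oif = oif;
     *	rt = ip_route_output_key_hash(net, &fl4, NULL);
     *	if (IS_ERR(rt))
     *		return PTR_ERR(rt);
     *	...
     *	ip_rt_put(rt);
     *
     * ip_rt_put() drops the reference taken by the lookup.
     */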
2438struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2439                                        const struct sk_buff *skb)
2440{
2441        __u8 tos = RT_FL_TOS(fl4);
2442        struct fib_result res = {
2443                .type           = RTN_UNSPEC,
2444                .fi             = NULL,
2445                .table          = NULL,
2446                .tclassid       = 0,
2447        };
2448        struct rtable *rth;
2449
2450        fl4->flowi4_iif = LOOPBACK_IFINDEX;
2451        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2452        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2453                         RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2454
2455        rcu_read_lock();
2456        rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2457        rcu_read_unlock();
2458
2459        return rth;
2460}
2461EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2462
2463struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2464                                            struct fib_result *res,
2465                                            const struct sk_buff *skb)
2466{
2467        struct net_device *dev_out = NULL;
2468        int orig_oif = fl4->flowi4_oif;
2469        unsigned int flags = 0;
2470        struct rtable *rth;
2471        int err = -ENETUNREACH;
2472
2473        if (fl4->saddr) {
2474                rth = ERR_PTR(-EINVAL);
2475                if (ipv4_is_multicast(fl4->saddr) ||
2476                    ipv4_is_lbcast(fl4->saddr) ||
2477                    ipv4_is_zeronet(fl4->saddr))
2478                        goto out;
2479
2480                /* I removed the check for oif == dev_out->oif here.
2481                   It was wrong for two reasons:
2482                   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2483                      is assigned to multiple interfaces.
2484                   2. Moreover, we are allowed to send packets with the saddr
2485                      of another iface. --ANK
2486                 */
2487
2488                if (fl4->flowi4_oif == 0 &&
2489                    (ipv4_is_multicast(fl4->daddr) ||
2490                     ipv4_is_lbcast(fl4->daddr))) {
2491                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2492                        dev_out = __ip_dev_find(net, fl4->saddr, false);
2493                        if (!dev_out)
2494                                goto out;
2495
2496                        /* Special hack: the user can direct multicasts
2497                           and limited broadcast via the necessary interface
2498                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2499                           This hack is not just for fun; it allows
2500                           vic, vat and friends to work.
2501                           They bind a socket to loopback, set the ttl to zero
2502                           and expect that it will work.
2503                           From the viewpoint of the routing cache they are broken,
2504                           because we are not allowed to build a multicast path
2505                           with a loopback source addr (look, the routing cache
2506                           cannot know that the ttl is zero, so that the packet
2507                           will not leave this host and the route is valid).
2508                           Luckily, this hack is a good workaround.
2509                         */
2510
2511                        fl4->flowi4_oif = dev_out->ifindex;
2512                        goto make_route;
2513                }
2514
2515                if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2516                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2517                        if (!__ip_dev_find(net, fl4->saddr, false))
2518                                goto out;
2519                }
2520        }
2521
2522
2523        if (fl4->flowi4_oif) {
2524                dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2525                rth = ERR_PTR(-ENODEV);
2526                if (!dev_out)
2527                        goto out;
2528
2529                /* RACE: Check return value of inet_select_addr instead. */
2530                if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2531                        rth = ERR_PTR(-ENETUNREACH);
2532                        goto out;
2533                }
2534                if (ipv4_is_local_multicast(fl4->daddr) ||
2535                    ipv4_is_lbcast(fl4->daddr) ||
2536                    fl4->flowi4_proto == IPPROTO_IGMP) {
2537                        if (!fl4->saddr)
2538                                fl4->saddr = inet_select_addr(dev_out, 0,
2539                                                              RT_SCOPE_LINK);
2540                        goto make_route;
2541                }
2542                if (!fl4->saddr) {
2543                        if (ipv4_is_multicast(fl4->daddr))
2544                                fl4->saddr = inet_select_addr(dev_out, 0,
2545                                                              fl4->flowi4_scope);
2546                        else if (!fl4->daddr)
2547                                fl4->saddr = inet_select_addr(dev_out, 0,
2548                                                              RT_SCOPE_HOST);
2549                }
2550        }
2551
2552        if (!fl4->daddr) {
2553                fl4->daddr = fl4->saddr;
2554                if (!fl4->daddr)
2555                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2556                dev_out = net->loopback_dev;
2557                fl4->flowi4_oif = LOOPBACK_IFINDEX;
2558                res->type = RTN_LOCAL;
2559                flags |= RTCF_LOCAL;
2560                goto make_route;
2561        }
2562
2563        err = fib_lookup(net, fl4, res, 0);
2564        if (err) {
2565                res->fi = NULL;
2566                res->table = NULL;
2567                if (fl4->flowi4_oif &&
2568                    (ipv4_is_multicast(fl4->daddr) ||
2569                    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2570                        /* Apparently, the routing tables are wrong. Assume
2571                           that the destination is on-link.
2572
2573                           WHY? DW.
2574                           Because we are allowed to send to an iface
2575                           even if it has NO routes and NO assigned
2576                           addresses. When oif is specified, the routing
2577                           tables are looked up with only one purpose:
2578                           to catch whether the destination is gatewayed rather
2579                           than direct. Moreover, if MSG_DONTROUTE is set,
2580                           we send the packet, ignoring both routing tables
2581                           and ifaddr state. --ANK
2582
2583
2584                           We could do this even if oif is unknown,
2585                           as IPv6 likely does, but we do not.
2586                         */
2587
2588                        if (fl4->saddr == 0)
2589                                fl4->saddr = inet_select_addr(dev_out, 0,
2590                                                              RT_SCOPE_LINK);
2591                        res->type = RTN_UNICAST;
2592                        goto make_route;
2593                }
2594                rth = ERR_PTR(err);
2595                goto out;
2596        }
2597
2598        if (res->type == RTN_LOCAL) {
2599                if (!fl4->saddr) {
2600                        if (res->fi->fib_prefsrc)
2601                                fl4->saddr = res->fi->fib_prefsrc;
2602                        else
2603                                fl4->saddr = fl4->daddr;
2604                }
2605
2606                /* L3 master device is the loopback for that domain */
2607                dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2608                        net->loopback_dev;
2609
2610                /* make sure orig_oif points to fib result device even
2611                 * though packet rx/tx happens over loopback or l3mdev
2612                 */
2613                orig_oif = FIB_RES_OIF(*res);
2614
2615                fl4->flowi4_oif = dev_out->ifindex;
2616                flags |= RTCF_LOCAL;
2617                goto make_route;
2618        }
2619
2620        fib_select_path(net, res, fl4, skb);
2621
2622        dev_out = FIB_RES_DEV(*res);
2623        fl4->flowi4_oif = dev_out->ifindex;
2624
2625
2626make_route:
2627        rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2628
2629out:
2630        return rth;
2631}
2632
2633static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2634{
2635        return NULL;
2636}
2637
2638static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2639{
2640        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2641
2642        return mtu ? : dst->dev->mtu;
2643}
2644
2645static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2646                                          struct sk_buff *skb, u32 mtu)
2647{
2648}
2649
2650static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2651                                       struct sk_buff *skb)
2652{
2653}
2654
2655static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2656                                          unsigned long old)
2657{
2658        return NULL;
2659}
2660
2661static struct dst_ops ipv4_dst_blackhole_ops = {
2662        .family                 =       AF_INET,
2663        .check                  =       ipv4_blackhole_dst_check,
2664        .mtu                    =       ipv4_blackhole_mtu,
2665        .default_advmss         =       ipv4_default_advmss,
2666        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2667        .redirect               =       ipv4_rt_blackhole_redirect,
2668        .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2669        .neigh_lookup           =       ipv4_neigh_lookup,
2670};
2671
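    /* Convert an existing route into a "blackhole" copy whose input and
     * output handlers silently discard all packets.  Used by the xfrm
     * layer (as its blackhole_route callback), e.g. for traffic that
     * must be dropped while IPsec states are still being resolved.
     */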
2672struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2673{
2674        struct rtable *ort = (struct rtable *) dst_orig;
2675        struct rtable *rt;
2676
2677        rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2678        if (rt) {
2679                struct dst_entry *new = &rt->dst;
2680
2681                new->__use = 1;
2682                new->input = dst_discard;
2683                new->output = dst_discard_out;
2684
2685                new->dev = net->loopback_dev;
2686                if (new->dev)
2687                        dev_hold(new->dev);
2688
2689                rt->rt_is_input = ort->rt_is_input;
2690                rt->rt_iif = ort->rt_iif;
2691                rt->rt_pmtu = ort->rt_pmtu;
2692                rt->rt_mtu_locked = ort->rt_mtu_locked;
2693
2694                rt->rt_genid = rt_genid_ipv4(net);
2695                rt->rt_flags = ort->rt_flags;
2696                rt->rt_type = ort->rt_type;
2697                rt->rt_gw_family = ort->rt_gw_family;
2698                if (rt->rt_gw_family == AF_INET)
2699                        rt->rt_gw4 = ort->rt_gw4;
2700                else if (rt->rt_gw_family == AF_INET6)
2701                        rt->rt_gw6 = ort->rt_gw6;
2702
2703                INIT_LIST_HEAD(&rt->rt_uncached);
2704        }
2705
2706        dst_release(dst_orig);
2707
2708        return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2709}
2710
2711struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2712                                    const struct sock *sk)
2713{
2714        struct rtable *rt = __ip_route_output_key(net, flp4);
2715
2716        if (IS_ERR(rt))
2717                return rt;
2718
2719        if (flp4->flowi4_proto)
2720                rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2721                                                        flowi4_to_flowi(flp4),
2722                                                        sk, 0);
2723
2724        return rt;
2725}
2726EXPORT_SYMBOL_GPL(ip_route_output_flow);
2727
2728/* called with rcu_read_lock held */
2729static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2730                        struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2731                        struct sk_buff *skb, u32 portid, u32 seq,
2732                        unsigned int flags)
2733{
2734        struct rtmsg *r;
2735        struct nlmsghdr *nlh;
2736        unsigned long expires = 0;
2737        u32 error;
2738        u32 metrics[RTAX_MAX];
2739
2740        nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2741        if (!nlh)
2742                return -EMSGSIZE;
2743
2744        r = nlmsg_data(nlh);
2745        r->rtm_family    = AF_INET;
2746        r->rtm_dst_len  = 32;
2747        r->rtm_src_len  = 0;
2748        r->rtm_tos      = fl4 ? fl4->flowi4_tos : 0;
2749        r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2750        if (nla_put_u32(skb, RTA_TABLE, table_id))
2751                goto nla_put_failure;
2752        r->rtm_type     = rt->rt_type;
2753        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2754        r->rtm_protocol = RTPROT_UNSPEC;
2755        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2756        if (rt->rt_flags & RTCF_NOTIFY)
2757                r->rtm_flags |= RTM_F_NOTIFY;
2758        if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2759                r->rtm_flags |= RTCF_DOREDIRECT;
2760
2761        if (nla_put_in_addr(skb, RTA_DST, dst))
2762                goto nla_put_failure;
2763        if (src) {
2764                r->rtm_src_len = 32;
2765                if (nla_put_in_addr(skb, RTA_SRC, src))
2766                        goto nla_put_failure;
2767        }
2768        if (rt->dst.dev &&
2769            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2770                goto nla_put_failure;
2771#ifdef CONFIG_IP_ROUTE_CLASSID
2772        if (rt->dst.tclassid &&
2773            nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2774                goto nla_put_failure;
2775#endif
2776        if (fl4 && !rt_is_input_route(rt) &&
2777            fl4->saddr != src) {
2778                if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2779                        goto nla_put_failure;
2780        }
2781        if (rt->rt_gw_family == AF_INET &&
2782            nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2783                goto nla_put_failure;
2784        } else if (rt->rt_gw_family == AF_INET6) {
2785                int alen = sizeof(struct in6_addr);
2786                struct nlattr *nla;
2787                struct rtvia *via;
2788
2789                nla = nla_reserve(skb, RTA_VIA, alen + 2);
2790                if (!nla)
2791                        goto nla_put_failure;
2792
2793                via = nla_data(nla);
2794                via->rtvia_family = AF_INET6;
2795                memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2796        }
2797
2798        expires = rt->dst.expires;
2799        if (expires) {
2800                unsigned long now = jiffies;
2801
2802                if (time_before(now, expires))
2803                        expires -= now;
2804                else
2805                        expires = 0;
2806        }
2807
2808        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2809        if (rt->rt_pmtu && expires)
2810                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2811        if (rt->rt_mtu_locked && expires)
2812                metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2813        if (rtnetlink_put_metrics(skb, metrics) < 0)
2814                goto nla_put_failure;
2815
2816        if (fl4) {
2817                if (fl4->flowi4_mark &&
2818                    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2819                        goto nla_put_failure;
2820
2821                if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2822                    nla_put_u32(skb, RTA_UID,
2823                                from_kuid_munged(current_user_ns(),
2824                                                 fl4->flowi4_uid)))
2825                        goto nla_put_failure;
2826
2827                if (rt_is_input_route(rt)) {
2828#ifdef CONFIG_IP_MROUTE
2829                        if (ipv4_is_multicast(dst) &&
2830                            !ipv4_is_local_multicast(dst) &&
2831                            IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2832                                int err = ipmr_get_route(net, skb,
2833                                                         fl4->saddr, fl4->daddr,
2834                                                         r, portid);
2835
2836                                if (err <= 0) {
2837                                        if (err == 0)
2838                                                return 0;
2839                                        goto nla_put_failure;
2840                                }
2841                        } else
2842#endif
2843                                if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2844                                        goto nla_put_failure;
2845                }
2846        }
2847
2848        error = rt->dst.error;
2849
2850        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2851                goto nla_put_failure;
2852
2853        nlmsg_end(skb, nlh);
2854        return 0;
2855
2856nla_put_failure:
2857        nlmsg_cancel(skb, nlh);
2858        return -EMSGSIZE;
2859}
2860
2861static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2862                            struct netlink_callback *cb, u32 table_id,
2863                            struct fnhe_hash_bucket *bucket, int genid,
2864                            int *fa_index, int fa_start, unsigned int flags)
2865{
2866        int i;
2867
2868        for (i = 0; i < FNHE_HASH_SIZE; i++) {
2869                struct fib_nh_exception *fnhe;
2870
2871                for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2872                     fnhe = rcu_dereference(fnhe->fnhe_next)) {
2873                        struct rtable *rt;
2874                        int err;
2875
2876                        if (*fa_index < fa_start)
2877                                goto next;
2878
2879                        if (fnhe->fnhe_genid != genid)
2880                                goto next;
2881
2882                        if (fnhe->fnhe_expires &&
2883                            time_after(jiffies, fnhe->fnhe_expires))
2884                                goto next;
2885
2886                        rt = rcu_dereference(fnhe->fnhe_rth_input);
2887                        if (!rt)
2888                                rt = rcu_dereference(fnhe->fnhe_rth_output);
2889                        if (!rt)
2890                                goto next;
2891
2892                        err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
2893                                           table_id, NULL, skb,
2894                                           NETLINK_CB(cb->skb).portid,
2895                                           cb->nlh->nlmsg_seq, flags);
2896                        if (err)
2897                                return err;
2898next:
2899                        (*fa_index)++;
2900                }
2901        }
2902
2903        return 0;
2904}
2905
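    /* Walk every nexthop of @fi and dump its cached exceptions (PMTU,
     * redirect state) as RTM_NEWROUTE messages; @fa_index/@fa_start let
     * an interrupted netlink dump resume where it left off.
     */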
2906int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
2907                       u32 table_id, struct fib_info *fi,
2908                       int *fa_index, int fa_start, unsigned int flags)
2909{
2910        struct net *net = sock_net(cb->skb->sk);
2911        int nhsel, genid = fnhe_genid(net);
2912
2913        for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
2914                struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
2915                struct fnhe_hash_bucket *bucket;
2916                int err;
2917
2918                if (nhc->nhc_flags & RTNH_F_DEAD)
2919                        continue;
2920
2921                rcu_read_lock();
2922                bucket = rcu_dereference(nhc->nhc_exceptions);
2923                err = 0;
2924                if (bucket)
2925                        err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
2926                                               genid, fa_index, fa_start,
2927                                               flags);
2928                rcu_read_unlock();
2929                if (err)
2930                        return err;
2931        }
2932
2933        return 0;
2934}
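
/*
 * Exceptions hang off each nexthop (fib_nh_common), so the dump above
 * iterates every path of the fib_info and skips dead nexthops.  The
 * RCU read lock is taken per nexthop and dropped before any error is
 * returned, keeping the critical section no wider than one bucket walk.
 */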
2935
2936static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2937                                                   u8 ip_proto, __be16 sport,
2938                                                   __be16 dport)
2939{
2940        struct sk_buff *skb;
2941        struct iphdr *iph;
2942
2943        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2944        if (!skb)
2945                return NULL;
2946
2947        /* Reserve room for dummy headers; this skb can pass
2948         * through a good chunk of the routing engine.
2949         */
2950        skb_reset_mac_header(skb);
2951        skb_reset_network_header(skb);
2952        skb->protocol = htons(ETH_P_IP);
2953        iph = skb_put(skb, sizeof(struct iphdr));
2954        iph->protocol = ip_proto;
2955        iph->saddr = src;
2956        iph->daddr = dst;
2957        iph->version = 0x4;
2958        iph->frag_off = 0;
2959        iph->ihl = 0x5;
2960        skb_set_transport_header(skb, skb->len);
2961
2962        switch (iph->protocol) {
2963        case IPPROTO_UDP: {
2964                struct udphdr *udph;
2965
2966                udph = skb_put_zero(skb, sizeof(struct udphdr));
2967                udph->source = sport;
2968                udph->dest = dport;
2969                udph->len = htons(sizeof(struct udphdr));
2970                udph->check = 0;
2971                break;
2972        }
2973        case IPPROTO_TCP: {
2974                struct tcphdr *tcph;
2975
2976                tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2977                tcph->source    = sport;
2978                tcph->dest      = dport;
2979                tcph->doff      = sizeof(struct tcphdr) / 4;
2980                tcph->rst = 1;
2981                tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2982                                            src, dst, 0);
2983                break;
2984        }
2985        case IPPROTO_ICMP: {
2986                struct icmphdr *icmph;
2987
2988                icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2989                icmph->type = ICMP_ECHO;
2990                icmph->code = 0;
2991        }
2992        }
2993
2994        return skb;
2995}
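
/*
 * The skb built above is a minimal, syntactically valid IPv4 packet:
 * version 4, ihl 5 (a 20-byte header, no options), plus a mostly-zero
 * UDP/TCP/ICMP transport header carrying only the ports from the
 * request.  That is just enough for it to survive the routing-engine
 * lookups in inet_rtm_getroute() below; it is never transmitted.
 * A hypothetical caller (illustration only):
 *
 *	skb = inet_rtm_getroute_build_skb(src, dst, IPPROTO_UDP,
 *					  htons(12345), htons(53));
 *	if (!skb)
 *		return -ENOBUFS;
 */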
2996
2997static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
2998                                       const struct nlmsghdr *nlh,
2999                                       struct nlattr **tb,
3000                                       struct netlink_ext_ack *extack)
3001{
3002        struct rtmsg *rtm;
3003        int i, err;
3004
3005        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3006                NL_SET_ERR_MSG(extack,
3007                               "ipv4: Invalid header for route get request");
3008                return -EINVAL;
3009        }
3010
3011        if (!netlink_strict_get_check(skb))
3012                return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3013                                              rtm_ipv4_policy, extack);
3014
3015        rtm = nlmsg_data(nlh);
3016        if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3017            (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3018            rtm->rtm_table || rtm->rtm_protocol ||
3019            rtm->rtm_scope || rtm->rtm_type) {
3020                NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3021                return -EINVAL;
3022        }
3023
3024        if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3025                               RTM_F_LOOKUP_TABLE |
3026                               RTM_F_FIB_MATCH)) {
3027                NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3028                return -EINVAL;
3029        }
3030
3031        err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3032                                            rtm_ipv4_policy, extack);
3033        if (err)
3034                return err;
3035
3036        if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3037            (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3038                NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3039                return -EINVAL;
3040        }
3041
3042        for (i = 0; i <= RTA_MAX; i++) {
3043                if (!tb[i])
3044                        continue;
3045
3046                switch (i) {
3047                case RTA_IIF:
3048                case RTA_OIF:
3049                case RTA_SRC:
3050                case RTA_DST:
3051                case RTA_IP_PROTO:
3052                case RTA_SPORT:
3053                case RTA_DPORT:
3054                case RTA_MARK:
3055                case RTA_UID:
3056                        break;
3057                default:
3058                        NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3059                        return -EINVAL;
3060                }
3061        }
3062
3063        return 0;
3064}
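
/*
 * Validation is two-tier: legacy requesters fall back to the lenient
 * nlmsg_parse_deprecated() path, while sockets that opted in to strict
 * checking get the full header, flag and attribute validation above,
 * failing with -EINVAL plus an extack message instead of having bogus
 * fields silently ignored.  Userspace opts in with (sketch):
 *
 *	int one = 1;
 *
 *	setsockopt(fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK,
 *		   &one, sizeof(one));
 */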
3065
3066static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3067                             struct netlink_ext_ack *extack)
3068{
3069        struct net *net = sock_net(in_skb->sk);
3070        struct nlattr *tb[RTA_MAX+1];
3071        u32 table_id = RT_TABLE_MAIN;
3072        __be16 sport = 0, dport = 0;
3073        struct fib_result res = {};
3074        u8 ip_proto = IPPROTO_UDP;
3075        struct rtable *rt = NULL;
3076        struct sk_buff *skb;
3077        struct rtmsg *rtm;
3078        struct flowi4 fl4 = {};
3079        __be32 dst = 0;
3080        __be32 src = 0;
3081        kuid_t uid;
3082        u32 iif;
3083        int err;
3084        int mark;
3085
3086        err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3087        if (err < 0)
3088                return err;
3089
3090        rtm = nlmsg_data(nlh);
3091        src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3092        dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3093        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3094        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3095        if (tb[RTA_UID])
3096                uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3097        else
3098                uid = (iif ? INVALID_UID : current_uid());
3099
3100        if (tb[RTA_IP_PROTO]) {
3101                err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3102                                                  &ip_proto, AF_INET, extack);
3103                if (err)
3104                        return err;
3105        }
3106
3107        if (tb[RTA_SPORT])
3108                sport = nla_get_be16(tb[RTA_SPORT]);
3109
3110        if (tb[RTA_DPORT])
3111                dport = nla_get_be16(tb[RTA_DPORT]);
3112
3113        skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3114        if (!skb)
3115                return -ENOBUFS;
3116
3117        fl4.daddr = dst;
3118        fl4.saddr = src;
3119        fl4.flowi4_tos = rtm->rtm_tos;
3120        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3121        fl4.flowi4_mark = mark;
3122        fl4.flowi4_uid = uid;
3123        if (sport)
3124                fl4.fl4_sport = sport;
3125        if (dport)
3126                fl4.fl4_dport = dport;
3127        fl4.flowi4_proto = ip_proto;
3128
3129        rcu_read_lock();
3130
3131        if (iif) {
3132                struct net_device *dev;
3133
3134                dev = dev_get_by_index_rcu(net, iif);
3135                if (!dev) {
3136                        err = -ENODEV;
3137                        goto errout_rcu;
3138                }
3139
3140                fl4.flowi4_iif = iif; /* for rt_fill_info */
3141                skb->dev        = dev;
3142                skb->mark       = mark;
3143                err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
3144                                         dev, &res);
3145
3146                rt = skb_rtable(skb);
3147                if (err == 0 && rt->dst.error)
3148                        err = -rt->dst.error;
3149        } else {
3150                fl4.flowi4_iif = LOOPBACK_IFINDEX;
3151                skb->dev = net->loopback_dev;
3152                rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3153                err = 0;
3154                if (IS_ERR(rt))
3155                        err = PTR_ERR(rt);
3156                else
3157                        skb_dst_set(skb, &rt->dst);
3158        }
3159
3160        if (err)
3161                goto errout_rcu;
3162
3163        if (rtm->rtm_flags & RTM_F_NOTIFY)
3164                rt->rt_flags |= RTCF_NOTIFY;
3165
3166        if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3167                table_id = res.table ? res.table->tb_id : 0;
3168
3169        /* reset skb for netlink reply msg */
3170        skb_trim(skb, 0);
3171        skb_reset_network_header(skb);
3172        skb_reset_transport_header(skb);
3173        skb_reset_mac_header(skb);
3174
3175        if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3176                if (!res.fi) {
3177                        err = fib_props[res.type].error;
3178                        if (!err)
3179                                err = -EHOSTUNREACH;
3180                        goto errout_rcu;
3181                }
3182                err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3183                                    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
3184                                    rt->rt_type, res.prefix, res.prefixlen,
3185                                    fl4.flowi4_tos, res.fi, 0);
3186        } else {
3187                err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3188                                   NETLINK_CB(in_skb).portid,
3189                                   nlh->nlmsg_seq, 0);
3190        }
3191        if (err < 0)
3192                goto errout_rcu;
3193
3194        rcu_read_unlock();
3195
3196        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3197
3198errout_free:
3199        return err;
3200errout_rcu:
3201        rcu_read_unlock();
3202        kfree_skb(skb);
3203        goto errout_free;
3204}
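
/*
 * This is the doit handler behind "ip route get".  A request is an
 * RTM_GETROUTE message carrying RTA_DST and, optionally, RTA_SRC,
 * RTA_IIF, RTA_OIF, RTA_MARK, RTA_UID and the proto/port attributes.
 * A minimal sketch of such a request (error handling and the RTA_DST
 * attribute append omitted; illustration only):
 *
 *	struct {
 *		struct nlmsghdr	nh;
 *		struct rtmsg	rtm;
 *		char		attrs[64];
 *	} req = {
 *		.nh.nlmsg_len	 = NLMSG_LENGTH(sizeof(struct rtmsg)),
 *		.nh.nlmsg_type	 = RTM_GETROUTE,
 *		.nh.nlmsg_flags	 = NLM_F_REQUEST,
 *		.rtm.rtm_family	 = AF_INET,
 *		.rtm.rtm_dst_len = 32,
 *	};
 *
 * After appending RTA_DST (and bumping nlmsg_len accordingly), send()
 * this on a NETLINK_ROUTE socket and recv() the RTM_NEWROUTE reply.
 */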
3205
3206void ip_rt_multicast_event(struct in_device *in_dev)
3207{
3208        rt_cache_flush(dev_net(in_dev->dev));
3209}
3210
3211#ifdef CONFIG_SYSCTL
3212static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3213static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
3214static int ip_rt_gc_elasticity __read_mostly    = 8;
3215static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
3216
3217static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3218                                        void __user *buffer,
3219                                        size_t *lenp, loff_t *ppos)
3220{
3221        struct net *net = (struct net *)__ctl->extra1;
3222
3223        if (write) {
3224                rt_cache_flush(net);
3225                fnhe_genid_bump(net);
3226                return 0;
3227        }
3228
3229        return -EINVAL;
3230}
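
/*
 * The flush file is write-only: any write flushes the route cache and
 * bumps the FNHE genid, invalidating every cached exception as a side
 * effect; reads fail with -EINVAL.  From userspace this is the classic
 * "echo 1 > /proc/sys/net/ipv4/route/flush".
 */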
3231
3232static struct ctl_table ipv4_route_table[] = {
3233        {
3234                .procname       = "gc_thresh",
3235                .data           = &ipv4_dst_ops.gc_thresh,
3236                .maxlen         = sizeof(int),
3237                .mode           = 0644,
3238                .proc_handler   = proc_dointvec,
3239        },
3240        {
3241                .procname       = "max_size",
3242                .data           = &ip_rt_max_size,
3243                .maxlen         = sizeof(int),
3244                .mode           = 0644,
3245                .proc_handler   = proc_dointvec,
3246        },
3247        {
3248                /*  Deprecated. Use gc_min_interval_ms */
3249
3250                .procname       = "gc_min_interval",
3251                .data           = &ip_rt_gc_min_interval,
3252                .maxlen         = sizeof(int),
3253                .mode           = 0644,
3254                .proc_handler   = proc_dointvec_jiffies,
3255        },
3256        {
3257                .procname       = "gc_min_interval_ms",
3258                .data           = &ip_rt_gc_min_interval,
3259                .maxlen         = sizeof(int),
3260                .mode           = 0644,
3261                .proc_handler   = proc_dointvec_ms_jiffies,
3262        },
3263        {
3264                .procname       = "gc_timeout",
3265                .data           = &ip_rt_gc_timeout,
3266                .maxlen         = sizeof(int),
3267                .mode           = 0644,
3268                .proc_handler   = proc_dointvec_jiffies,
3269        },
3270        {
3271                .procname       = "gc_interval",
3272                .data           = &ip_rt_gc_interval,
3273                .maxlen         = sizeof(int),
3274                .mode           = 0644,
3275                .proc_handler   = proc_dointvec_jiffies,
3276        },
3277        {
3278                .procname       = "redirect_load",
3279                .data           = &ip_rt_redirect_load,
3280                .maxlen         = sizeof(int),
3281                .mode           = 0644,
3282                .proc_handler   = proc_dointvec,
3283        },
3284        {
3285                .procname       = "redirect_number",
3286                .data           = &ip_rt_redirect_number,
3287                .maxlen         = sizeof(int),
3288                .mode           = 0644,
3289                .proc_handler   = proc_dointvec,
3290        },
3291        {
3292                .procname       = "redirect_silence",
3293                .data           = &ip_rt_redirect_silence,
3294                .maxlen         = sizeof(int),
3295                .mode           = 0644,
3296                .proc_handler   = proc_dointvec,
3297        },
3298        {
3299                .procname       = "error_cost",
3300                .data           = &ip_rt_error_cost,
3301                .maxlen         = sizeof(int),
3302                .mode           = 0644,
3303                .proc_handler   = proc_dointvec,
3304        },
3305        {
3306                .procname       = "error_burst",
3307                .data           = &ip_rt_error_burst,
3308                .maxlen         = sizeof(int),
3309                .mode           = 0644,
3310                .proc_handler   = proc_dointvec,
3311        },
3312        {
3313                .procname       = "gc_elasticity",
3314                .data           = &ip_rt_gc_elasticity,
3315                .maxlen         = sizeof(int),
3316                .mode           = 0644,
3317                .proc_handler   = proc_dointvec,
3318        },
3319        {
3320                .procname       = "mtu_expires",
3321                .data           = &ip_rt_mtu_expires,
3322                .maxlen         = sizeof(int),
3323                .mode           = 0644,
3324                .proc_handler   = proc_dointvec_jiffies,
3325        },
3326        {
3327                .procname       = "min_pmtu",
3328                .data           = &ip_rt_min_pmtu,
3329                .maxlen         = sizeof(int),
3330                .mode           = 0644,
3331                .proc_handler   = proc_dointvec_minmax,
3332                .extra1         = &ip_min_valid_pmtu,
3333        },
3334        {
3335                .procname       = "min_adv_mss",
3336                .data           = &ip_rt_min_advmss,
3337                .maxlen         = sizeof(int),
3338                .mode           = 0644,
3339                .proc_handler   = proc_dointvec,
3340        },
3341        { }
3342};
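
/*
 * These knobs appear under /proc/sys/net/ipv4/route/.  The proc handler
 * determines the unit userspace sees: the proc_dointvec_jiffies entries
 * (gc_min_interval, gc_timeout, gc_interval, mtu_expires) convert
 * to/from seconds, gc_min_interval_ms to/from milliseconds, and
 * min_pmtu is bounded below by IPV4_MIN_MTU (68) via extra1, so writes
 * under that limit are rejected rather than clamped.
 */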
3343
3344static const char ipv4_route_flush_procname[] = "flush";
3345
3346static struct ctl_table ipv4_route_flush_table[] = {
3347        {
3348                .procname       = ipv4_route_flush_procname,
3349                .maxlen         = sizeof(int),
3350                .mode           = 0200,
3351                .proc_handler   = ipv4_sysctl_rtcache_flush,
3352        },
3353        { },
3354};
3355
3356static __net_init int sysctl_route_net_init(struct net *net)
3357{
3358        struct ctl_table *tbl;
3359
3360        tbl = ipv4_route_flush_table;
3361        if (!net_eq(net, &init_net)) {
3362                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3363                if (!tbl)
3364                        goto err_dup;
3365
3366                /* Don't export non-whitelisted sysctls to unprivileged users */
3367                if (net->user_ns != &init_user_ns) {
3368                        if (tbl[0].procname != ipv4_route_flush_procname)
3369                                tbl[0].procname = NULL;
3370                }
3371        }
3372        tbl[0].extra1 = net;
3373
3374        net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3375        if (!net->ipv4.route_hdr)
3376                goto err_reg;
3377        return 0;
3378
3379err_reg:
3380        if (tbl != ipv4_route_flush_table)
3381                kfree(tbl);
3382err_dup:
3383        return -ENOMEM;
3384}
3385
3386static __net_exit void sysctl_route_net_exit(struct net *net)
3387{
3388        struct ctl_table *tbl;
3389
3390        tbl = net->ipv4.route_hdr->ctl_table_arg;
3391        unregister_net_sysctl_table(net->ipv4.route_hdr);
3392        BUG_ON(tbl == ipv4_route_flush_table);
3393        kfree(tbl);
3394}
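
/*
 * Per-netns setup follows the usual pattern for writable sysctls: the
 * initial namespace registers the static table directly, while every
 * other namespace gets a kmemdup()'d copy so that tbl[0].extra1 can
 * point at the owning struct net (ipv4_sysctl_rtcache_flush() reads it
 * back through __ctl->extra1).  The BUG_ON() in the exit path guards
 * against ever freeing the static init_net table.
 */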
3395
3396static __net_initdata struct pernet_operations sysctl_route_ops = {
3397        .init = sysctl_route_net_init,
3398        .exit = sysctl_route_net_exit,
3399};
3400#endif
3401
3402static __net_init int rt_genid_init(struct net *net)
3403{
3404        atomic_set(&net->ipv4.rt_genid, 0);
3405        atomic_set(&net->fnhe_genid, 0);
3406        atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3407        return 0;
3408}
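
/*
 * The genid counters make invalidation O(1): rt_cache_flush() simply
 * bumps rt_genid, and any cached dst whose recorded genid no longer
 * matches is treated as stale on its next use instead of being torn
 * down eagerly.
 */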
3409
3410static __net_initdata struct pernet_operations rt_genid_ops = {
3411        .init = rt_genid_init,
3412};
3413
3414static int __net_init ipv4_inetpeer_init(struct net *net)
3415{
3416        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3417
3418        if (!bp)
3419                return -ENOMEM;
3420        inet_peer_base_init(bp);
3421        net->ipv4.peers = bp;
3422        return 0;
3423}
3424
3425static void __net_exit ipv4_inetpeer_exit(struct net *net)
3426{
3427        struct inet_peer_base *bp = net->ipv4.peers;
3428
3429        net->ipv4.peers = NULL;
3430        inetpeer_invalidate_tree(bp);
3431        kfree(bp);
3432}
3433
3434static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3435        .init   =       ipv4_inetpeer_init,
3436        .exit   =       ipv4_inetpeer_exit,
3437};
3438
3439#ifdef CONFIG_IP_ROUTE_CLASSID
3440struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3441#endif /* CONFIG_IP_ROUTE_CLASSID */
3442
3443int __init ip_rt_init(void)
3444{
3445        int cpu;
3446
3447        ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3448                                  GFP_KERNEL);
3449        if (!ip_idents)
3450                panic("IP: failed to allocate ip_idents\n");
3451
3452        prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3453
3454        ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3455        if (!ip_tstamps)
3456                panic("IP: failed to allocate ip_tstamps\n");
3457
3458        for_each_possible_cpu(cpu) {
3459                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3460
3461                INIT_LIST_HEAD(&ul->head);
3462                spin_lock_init(&ul->lock);
3463        }
3464#ifdef CONFIG_IP_ROUTE_CLASSID
3465        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3466        if (!ip_rt_acct)
3467                panic("IP: failed to allocate ip_rt_acct\n");
3468#endif
3469
3470        ipv4_dst_ops.kmem_cachep =
3471                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3472                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3473
3474        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3475
3476        if (dst_entries_init(&ipv4_dst_ops) < 0)
3477                panic("IP: failed to allocate ipv4_dst_ops counter\n");
3478
3479        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3480                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3481
3482        ipv4_dst_ops.gc_thresh = ~0;
3483        ip_rt_max_size = INT_MAX;
3484
3485        devinet_init();
3486        ip_fib_init();
3487
3488        if (ip_rt_proc_init())
3489                pr_err("Unable to create route proc files\n");
3490#ifdef CONFIG_XFRM
3491        xfrm_init();
3492        xfrm4_init();
3493#endif
3494        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3495                      RTNL_FLAG_DOIT_UNLOCKED);
3496
3497#ifdef CONFIG_SYSCTL
3498        register_pernet_subsys(&sysctl_route_ops);
3499#endif
3500        register_pernet_subsys(&rt_genid_ops);
3501        register_pernet_subsys(&ipv4_inetpeer_ops);
3502        return 0;
3503}
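
/*
 * Note the defaults set above: gc_thresh is ~0 and ip_rt_max_size is
 * INT_MAX, which effectively disables dst garbage-collection pressure.
 * Since the removal of the per-flow route cache, routes are cached only
 * as nexthop exceptions, so the old limits no longer apply.  ip_idents
 * is seeded with random bytes at boot because it feeds IP
 * identification selection for atomic (DF-less) datagrams.
 */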
3504
3505#ifdef CONFIG_SYSCTL
3506/*
3507 * We really need to sanitize the damn ipv4 init order; once we do,
3508 * all of this nonsense will go away.
3509 */
3510void __init ip_static_sysctl_init(void)
3511{
3512        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3513}
3514#endif
3515