linux/net/ipv4/route.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu,
                                           bool confirm_neigh);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
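
/* The table is indexed by the RFC 1349 TOS bits with the low bit folded
 * away: index = IPTOS_TOS(tos) >> 1, so each even/odd pair of entries maps
 * to one priority class via ECN_OR_COST(). A minimal usage sketch,
 * mirroring rt_tos2priority() in include/net/route.h:
 *
 *        u32 prio = ip_tos2prio[IPTOS_TOS(iph->tos) >> 1];
 */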

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct proc_ops rt_cache_proc_ops = {
        .proc_open      = rt_cache_seq_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
        .proc_release   = seq_release,
};

static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        (*pos)++;
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct proc_ops rt_cpu_proc_ops = {
        .proc_open      = rt_cpu_seq_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
        .proc_release   = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", 0444, net->proc_net,
                          &rt_cache_proc_ops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", 0444,
                          net->proc_net_stat, &rt_cpu_proc_ops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create_single("rt_acct", 0, net->proc_net,
                        rt_acct_proc_show);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        struct neighbour *n;

        rcu_read_lock_bh();

        if (likely(rt->rt_gw_family == AF_INET)) {
                n = ip_neigh_gw4(dev, rt->rt_gw4);
        } else if (rt->rt_gw_family == AF_INET6) {
                n = ip_neigh_gw6(dev, &rt->rt_gw6);
        } else {
                __be32 pkey;

                pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
                n = ip_neigh_gw4(dev, pkey);
        }

        if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
                n = NULL;

        rcu_read_unlock_bh();

        return n;
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;

        if (rt->rt_gw_family == AF_INET) {
                pkey = (const __be32 *)&rt->rt_gw4;
        } else if (rt->rt_gw_family == AF_INET6) {
                return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
        } else if (!daddr ||
                 (rt->rt_flags &
                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
                return;
        }
        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = READ_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* If UBSAN reports an error here, please make sure your compiler
         * supports -fno-strict-overflow before reporting it; that was a bug
         * in UBSAN, and it has been fixed in GCC 8.
         */
        return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
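
/* A rough model of the generator above, ignoring SMP ordering: each of the
 * IP_IDENTS_SZ buckets behaves as
 *
 *        id = bucket->id;
 *        bucket->id += segs + delta;
 *
 * where delta is nonzero only when the bucket sat idle for at least one
 * jiffy. A busy flow therefore still sees mostly sequential IDs, while the
 * random jump after an idle period keeps an observer from counting how
 * many packets were emitted in between.
 */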

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        u32 hash, id;

        /* Note the following code is not safe, but this is okay. */
        if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
                get_random_bytes(&net->ipv4.ip_id_key,
                                 sizeof(net->ipv4.ip_id_key));

        hash = siphash_3u32((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            &net->ipv4.ip_id_key);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_uses_gateway = 1;
                rt->rt_gw_family = AF_INET;
                rt->rt_gw4 = fnhe->fnhe_gw;
        }
}

static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
                                  __be32 gw, u32 pmtu, bool lock,
                                  unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        u32 genid, hval;
        unsigned int i;
        int depth;

        genid = fnhe_genid(dev_net(nhc->nhc_dev));
        hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nhc->nhc_exceptions);
        if (!hash) {
                hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nhc->nhc_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (fnhe->fnhe_genid != genid)
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_mtu_locked = lock;
                }
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = genid;
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_mtu_locked = lock;
                fnhe->fnhe_expires = max(1UL, expires);

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nhc->nhc_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;

                        prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh_common *nhc = FIB_RES_NHC(res);

                                update_or_create_fnhe(nhc, fl4->daddr, new_gw,
                                                0, false,
                                                jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct net *net = dev_net(skb->dev);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         has forgotten the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
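
/* Worked example, assuming HZ == 1000: ip_rt_redirect_load is then 20 msec,
 * so the (n + 1)-th redirect is sent only once 20 msec << n have elapsed
 * since the n-th (40 ms, 80 ms, 160 ms, ...). After ip_rt_redirect_number
 * (9) redirects we go quiet until ip_rt_redirect_silence, i.e.
 * (HZ / 50) << 10 or roughly 20 seconds, passes without packets that would
 * deserve a redirect.
 */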

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
                peer->rate_tokens = 0;
                peer->n_redirects = 0;
        }

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the last seen redirected packet.
         */
        if (peer->n_redirects >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->n_redirects == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->n_redirects)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->n_redirects == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        if (netif_is_l3_master(skb->dev)) {
                dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
                if (!dev)
                        goto out;
        }

        in_dev = __in_dev_get_rcu(dev);

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        u32 old_mtu = ipv4_mtu(dst);
        struct fib_result res;
        bool lock = false;

        if (ip_mtu_locked(dst))
                return;

        if (old_mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu) {
                lock = true;
                mtu = min(old_mtu, ip_rt_min_pmtu);
        }

        if (rt->rt_pmtu == mtu && !lock &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh_common *nhc = FIB_RES_NHC(res);

                update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu,
                              bool confirm_neigh)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        u32 mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD.
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}
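
/* Callers reach this via dst_check(); the pattern used elsewhere in this
 * file (see ipv4_sk_update_pmtu() above) is: when dst_check(&rt->dst, 0)
 * returns NULL, the route has been invalidated by a PMTU/redirect update
 * or a generation bump, and the caller must perform a fresh route lookup.
 */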

static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
        struct ip_options opt;
        int res;

        /* Recompile ip options since IPCB may not be valid anymore.
         * Also check we have a reasonable ipv4 header.
         */
        if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
            ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
                return;

        memset(&opt, 0, sizeof(opt));
        if (ip_hdr(skb)->ihl > 5) {
                if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
                        return;
                opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

                rcu_read_lock();
                res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
                rcu_read_unlock();

                if (res)
                        return;
        }
        __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        ipv4_send_dest_unreach(skb);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct iphdr *iph = ip_hdr(skb);
                struct flowi4 fl4 = {
                        .daddr = iph->daddr,
                        .saddr = iph->saddr,
                        .flowi4_tos = RT_TOS(iph->tos),
                        .flowi4_oif = rt->dst.dev->ifindex,
                        .flowi4_iif = skb->dev->ifindex,
                        .flowi4_mark = skb->mark,
                };

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
                                    ip_rt_min_advmss);

        return min(advmss, IPV4_MAX_PMTU - header_size);
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = READ_ONCE(dst->dev->mtu);

        if (unlikely(ip_mtu_locked(dst))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference_protected(nhc->nhc_exceptions,
                                         lockdep_is_held(&fnhe_lock));
        hash += hval;

        fnhe_p = &hash->chain;
        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
        while (fnhe) {
                if (fnhe->fnhe_daddr == daddr) {
                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
                        /* set fnhe_daddr to 0 to ensure it won't bind with
                         * new dsts in rt_bind_exception().
                         */
                        fnhe->fnhe_daddr = 0;
                        fnhe_flush_routes(fnhe);
                        kfree_rcu(fnhe, rcu);
                        break;
                }
                fnhe_p = &fnhe->fnhe_next;
                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
                                                 lockdep_is_held(&fnhe_lock));
        }

        spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
                                               __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr) {
                        if (fnhe->fnhe_expires &&
                            time_after(jiffies, fnhe->fnhe_expires)) {
                                ip_del_fnhe(nhc, daddr);
                                break;
                        }
                        return fnhe;
                }
        }
        return NULL;
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */
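
/* For example, when the FIB metric is unset and no exception matches,
 * ip_mtu_from_fib_result() below reduces to min(READ_ONCE(dev->mtu),
 * IP_MAX_MTU) minus any lwtunnel encapsulation headroom.
 */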

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
        struct fib_nh_common *nhc = res->nhc;
        struct net_device *dev = nhc->nhc_dev;
        struct fib_info *fi = res->fi;
        u32 mtu = 0;

        if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
            fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
                mtu = fi->fib_mtu;

        if (likely(!mtu)) {
                struct fib_nh_exception *fnhe;

                fnhe = find_exception(nhc, daddr);
                if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
                        mtu = fnhe->fnhe_pmtu;
        }

        if (likely(!mtu))
                mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

        return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr, const bool do_cache)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe->fnhe_mtu_locked = false;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gw4) {
                        rt->rt_gw4 = daddr;
                        rt->rt_gw_family = AF_INET;
                }

                if (do_cache) {
                        dst_hold(&rt->dst);
                        rcu_assign_pointer(*porig, rt);
                        if (orig) {
                                dst_dev_put(&orig->dst);
                                dst_release(&orig->dst);
                        }
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nhc->nhc_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
        }
        orig = *p;

        /* hold dst before doing cmpxchg() to avoid race condition
         * on this dst
         */
        dst_hold(&rt->dst);
        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig) {
                        rt_add_uncached_list(orig);
                        dst_release(&orig->dst);
                }
        } else {
                dst_release(&rt->dst);
                ret = false;
        }

        return ret;
}

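/* Routes that could not be cached on a nexthop are kept on per-CPU
 * "uncached" lists, so that rt_flush_dev() below can still find them and
 * re-point them at blackhole_netdev when their device is unregistered.
 */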
1497struct uncached_list {
1498        spinlock_t              lock;
1499        struct list_head        head;
1500};
1501
1502static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1503
1504void rt_add_uncached_list(struct rtable *rt)
1505{
1506        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1507
1508        rt->rt_uncached_list = ul;
1509
1510        spin_lock_bh(&ul->lock);
1511        list_add_tail(&rt->rt_uncached, &ul->head);
1512        spin_unlock_bh(&ul->lock);
1513}
1514
1515void rt_del_uncached_list(struct rtable *rt)
1516{
1517        if (!list_empty(&rt->rt_uncached)) {
1518                struct uncached_list *ul = rt->rt_uncached_list;
1519
1520                spin_lock_bh(&ul->lock);
1521                list_del(&rt->rt_uncached);
1522                spin_unlock_bh(&ul->lock);
1523        }
1524}
1525
1526static void ipv4_dst_destroy(struct dst_entry *dst)
1527{
1528        struct rtable *rt = (struct rtable *)dst;
1529
1530        ip_dst_metrics_put(dst);
1531        rt_del_uncached_list(rt);
1532}
1533
1534void rt_flush_dev(struct net_device *dev)
1535{
1536        struct rtable *rt;
1537        int cpu;
1538
1539        for_each_possible_cpu(cpu) {
1540                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1541
1542                spin_lock_bh(&ul->lock);
1543                list_for_each_entry(rt, &ul->head, rt_uncached) {
1544                        if (rt->dst.dev != dev)
1545                                continue;
1546                        rt->dst.dev = blackhole_netdev;
1547                        dev_hold(rt->dst.dev);
1548                        dev_put(dev);
1549                }
1550                spin_unlock_bh(&ul->lock);
1551        }
1552}
1553
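    /* A cached route may only be reused while it still carries
     * DST_OBSOLETE_FORCE_CHK and rt_is_expired() does not flag it as
     * belonging to an older cache generation.
     */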
1554static bool rt_cache_valid(const struct rtable *rt)
1555{
1556        return rt &&
1557                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1558                !rt_is_expired(rt);
1559}
1560
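    /* Fill in the nexthop-derived fields of @rt (gateway, metrics,
     * classid, lwtunnel state) and try to cache it, either in the given
     * fnhe or in the FIB nexthop; if caching fails the route is added
     * to the uncached list instead.
     */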
1561static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1562                           const struct fib_result *res,
1563                           struct fib_nh_exception *fnhe,
1564                           struct fib_info *fi, u16 type, u32 itag,
1565                           const bool do_cache)
1566{
1567        bool cached = false;
1568
1569        if (fi) {
1570                struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1571
1572                if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1573                        rt->rt_uses_gateway = 1;
1574                        rt->rt_gw_family = nhc->nhc_gw_family;
1575                        /* only INET and INET6 are supported */
1576                        if (likely(nhc->nhc_gw_family == AF_INET))
1577                                rt->rt_gw4 = nhc->nhc_gw.ipv4;
1578                        else
1579                                rt->rt_gw6 = nhc->nhc_gw.ipv6;
1580                }
1581
1582                ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1583
1584#ifdef CONFIG_IP_ROUTE_CLASSID
1585                if (nhc->nhc_family == AF_INET) {
1586                        struct fib_nh *nh;
1587
1588                        nh = container_of(nhc, struct fib_nh, nh_common);
1589                        rt->dst.tclassid = nh->nh_tclassid;
1590                }
1591#endif
1592                rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1593                if (unlikely(fnhe))
1594                        cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1595                else if (do_cache)
1596                        cached = rt_cache_route(nhc, rt);
1597                if (unlikely(!cached)) {
1598                        /* Routes we intend to cache in nexthop exception or
1599                         * FIB nexthop have the DST_NOCACHE bit clear.
1600                         * However, if we are unsuccessful at storing this
1601                         * route into the cache we really need to set it.
1602                         */
1603                        if (!rt->rt_gw4) {
1604                                rt->rt_gw_family = AF_INET;
1605                                rt->rt_gw4 = daddr;
1606                        }
1607                        rt_add_uncached_list(rt);
1608                }
1609        } else
1610                rt_add_uncached_list(rt);
1611
1612#ifdef CONFIG_IP_ROUTE_CLASSID
1613#ifdef CONFIG_IP_MULTIPLE_TABLES
1614        set_class_tag(rt, res->tclassid);
1615#endif
1616        set_class_tag(rt, itag);
1617#endif
1618}
1619
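    /* Allocate and minimally initialise a struct rtable.  The dst is
     * created with DST_OBSOLETE_FORCE_CHK so that cached copies keep
     * being revalidated; output defaults to ip_output, and local routes
     * get ip_local_deliver as their input handler.
     */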
1620struct rtable *rt_dst_alloc(struct net_device *dev,
1621                            unsigned int flags, u16 type,
1622                            bool nopolicy, bool noxfrm)
1623{
1624        struct rtable *rt;
1625
1626        rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1627                       (nopolicy ? DST_NOPOLICY : 0) |
1628                       (noxfrm ? DST_NOXFRM : 0));
1629
1630        if (rt) {
1631                rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1632                rt->rt_flags = flags;
1633                rt->rt_type = type;
1634                rt->rt_is_input = 0;
1635                rt->rt_iif = 0;
1636                rt->rt_pmtu = 0;
1637                rt->rt_mtu_locked = 0;
1638                rt->rt_uses_gateway = 0;
1639                rt->rt_gw_family = 0;
1640                rt->rt_gw4 = 0;
1641                INIT_LIST_HEAD(&rt->rt_uncached);
1642
1643                rt->dst.output = ip_output;
1644                if (flags & RTCF_LOCAL)
1645                        rt->dst.input = ip_local_deliver;
1646        }
1647
1648        return rt;
1649}
1650EXPORT_SYMBOL(rt_dst_alloc);
1651
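    /* Duplicate @rt onto @dev, copying its routing state and taking a
     * fresh reference on any lwtunnel state.  The clone gets the
     * current genid rather than inheriting @rt's.
     */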
1652struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1653{
1654        struct rtable *new_rt;
1655
1656        new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1657                           rt->dst.flags);
1658
1659        if (new_rt) {
1660                new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1661                new_rt->rt_flags = rt->rt_flags;
1662                new_rt->rt_type = rt->rt_type;
1663                new_rt->rt_is_input = rt->rt_is_input;
1664                new_rt->rt_iif = rt->rt_iif;
1665                new_rt->rt_pmtu = rt->rt_pmtu;
1666                new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1667                new_rt->rt_gw_family = rt->rt_gw_family;
1668                if (rt->rt_gw_family == AF_INET)
1669                        new_rt->rt_gw4 = rt->rt_gw4;
1670                else if (rt->rt_gw_family == AF_INET6)
1671                        new_rt->rt_gw6 = rt->rt_gw6;
1672                INIT_LIST_HEAD(&new_rt->rt_uncached);
1673
1674                new_rt->dst.input = rt->dst.input;
1675                new_rt->dst.output = rt->dst.output;
1676                new_rt->dst.error = rt->dst.error;
1677                new_rt->dst.lastuse = jiffies;
1678                new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1679        }
1680        return new_rt;
1681}
1682EXPORT_SYMBOL(rt_dst_clone);
1683
1684/* called in rcu_read_lock() section */
1685int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1686                          u8 tos, struct net_device *dev,
1687                          struct in_device *in_dev, u32 *itag)
1688{
1689        int err;
1690
1691        /* Primary sanity checks. */
1692        if (!in_dev)
1693                return -EINVAL;
1694
1695        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1696            skb->protocol != htons(ETH_P_IP))
1697                return -EINVAL;
1698
1699        if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1700                return -EINVAL;
1701
1702        if (ipv4_is_zeronet(saddr)) {
1703                if (!ipv4_is_local_multicast(daddr) &&
1704                    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1705                        return -EINVAL;
1706        } else {
1707                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1708                                          in_dev, itag);
1709                if (err < 0)
1710                        return err;
1711        }
1712        return 0;
1713}
1714
1715/* called in rcu_read_lock() section */
1716static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1717                             u8 tos, struct net_device *dev, int our)
1718{
1719        struct in_device *in_dev = __in_dev_get_rcu(dev);
1720        unsigned int flags = RTCF_MULTICAST;
1721        struct rtable *rth;
1722        u32 itag = 0;
1723        int err;
1724
1725        err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1726        if (err)
1727                return err;
1728
1729        if (our)
1730                flags |= RTCF_LOCAL;
1731
1732        rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1733                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1734        if (!rth)
1735                return -ENOBUFS;
1736
1737#ifdef CONFIG_IP_ROUTE_CLASSID
1738        rth->dst.tclassid = itag;
1739#endif
1740        rth->dst.output = ip_rt_bug;
1741        rth->rt_is_input = 1;
1742
1743#ifdef CONFIG_IP_MROUTE
1744        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1745                rth->dst.input = ip_mr_input;
1746#endif
1747        RT_CACHE_STAT_INC(in_slow_mc);
1748
1749        skb_dst_set(skb, &rth->dst);
1750        return 0;
1751}
1752
1753
1754static void ip_handle_martian_source(struct net_device *dev,
1755                                     struct in_device *in_dev,
1756                                     struct sk_buff *skb,
1757                                     __be32 daddr,
1758                                     __be32 saddr)
1759{
1760        RT_CACHE_STAT_INC(in_martian_src);
1761#ifdef CONFIG_IP_ROUTE_VERBOSE
1762        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1763                /*
1764                 *      RFC 1812 recommendation: if the source is martian,
1765                 *      the only hint we can log is the MAC header.
1766                 */
1767                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1768                        &daddr, &saddr, dev->name);
1769                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1770                        print_hex_dump(KERN_WARNING, "ll header: ",
1771                                       DUMP_PREFIX_OFFSET, 16, 1,
1772                                       skb_mac_header(skb),
1773                                       dev->hard_header_len, false);
1774                }
1775        }
1776#endif
1777}
1778
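    /* Build the forwarding route for a packet whose FIB lookup already
     * succeeded, reusing a cached nexthop route when one is valid.
     */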
1779/* called in rcu_read_lock() section */
1780static int __mkroute_input(struct sk_buff *skb,
1781                           const struct fib_result *res,
1782                           struct in_device *in_dev,
1783                           __be32 daddr, __be32 saddr, u32 tos)
1784{
1785        struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1786        struct net_device *dev = nhc->nhc_dev;
1787        struct fib_nh_exception *fnhe;
1788        struct rtable *rth;
1789        int err;
1790        struct in_device *out_dev;
1791        bool do_cache;
1792        u32 itag = 0;
1793
1794        /* get a working reference to the output device */
1795        out_dev = __in_dev_get_rcu(dev);
1796        if (!out_dev) {
1797                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1798                return -EINVAL;
1799        }
1800
1801        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1802                                  in_dev->dev, in_dev, &itag);
1803        if (err < 0) {
1804                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1805                                         saddr);
1806
1807                goto cleanup;
1808        }
1809
1810        do_cache = res->fi && !itag;
1811        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1812            skb->protocol == htons(ETH_P_IP)) {
1813                __be32 gw;
1814
1815                gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1816                if (IN_DEV_SHARED_MEDIA(out_dev) ||
1817                    inet_addr_onlink(out_dev, saddr, gw))
1818                        IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1819        }
1820
1821        if (skb->protocol != htons(ETH_P_IP)) {
1822                /* Not IP (i.e. ARP). Do not create a route if it is
1823                 * invalid for proxy ARP. DNAT routes are always valid.
1824                 *
1825                 * The proxy ARP feature has been extended to allow ARP
1826                 * replies back to the same interface, to support
1827                 * Private VLAN switch technologies. See arp.c.
1828                 */
1829                if (out_dev == in_dev &&
1830                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1831                        err = -EINVAL;
1832                        goto cleanup;
1833                }
1834        }
1835
1836        fnhe = find_exception(nhc, daddr);
1837        if (do_cache) {
1838                if (fnhe)
1839                        rth = rcu_dereference(fnhe->fnhe_rth_input);
1840                else
1841                        rth = rcu_dereference(nhc->nhc_rth_input);
1842                if (rt_cache_valid(rth)) {
1843                        skb_dst_set_noref(skb, &rth->dst);
1844                        goto out;
1845                }
1846        }
1847
1848        rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1849                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
1850                           IN_DEV_CONF_GET(out_dev, NOXFRM));
1851        if (!rth) {
1852                err = -ENOBUFS;
1853                goto cleanup;
1854        }
1855
1856        rth->rt_is_input = 1;
1857        RT_CACHE_STAT_INC(in_slow_tot);
1858
1859        rth->dst.input = ip_forward;
1860
1861        rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1862                       do_cache);
1863        lwtunnel_set_redirect(&rth->dst);
1864        skb_dst_set(skb, &rth->dst);
1865out:
1866        err = 0;
1867 cleanup:
1868        return err;
1869}
1870
1871#ifdef CONFIG_IP_ROUTE_MULTIPATH
1872/* To make ICMP error packets follow the flow that triggered them,
1873 * the multipath hash is calculated from the inner IP addresses.
1874 */
1875static void ip_multipath_l3_keys(const struct sk_buff *skb,
1876                                 struct flow_keys *hash_keys)
1877{
1878        const struct iphdr *outer_iph = ip_hdr(skb);
1879        const struct iphdr *key_iph = outer_iph;
1880        const struct iphdr *inner_iph;
1881        const struct icmphdr *icmph;
1882        struct iphdr _inner_iph;
1883        struct icmphdr _icmph;
1884
1885        if (likely(outer_iph->protocol != IPPROTO_ICMP))
1886                goto out;
1887
1888        if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1889                goto out;
1890
1891        icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1892                                   &_icmph);
1893        if (!icmph)
1894                goto out;
1895
1896        if (!icmp_is_err(icmph->type))
1897                goto out;
1898
1899        inner_iph = skb_header_pointer(skb,
1900                                       outer_iph->ihl * 4 + sizeof(_icmph),
1901                                       sizeof(_inner_iph), &_inner_iph);
1902        if (!inner_iph)
1903                goto out;
1904
1905        key_iph = inner_iph;
1906out:
1907        hash_keys->addrs.v4addrs.src = key_iph->saddr;
1908        hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1909}
1910
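    /* Compute the multipath hash according to the
     * fib_multipath_hash_policy sysctl: 0 hashes the L3 addresses only,
     * 1 hashes the 5-tuple, 2 hashes the innermost addresses the flow
     * dissector can reach, falling back to L3 behaviour.
     */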
1911/* If skb is set it will be used; fl4 may then be NULL. */
1912int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1913                       const struct sk_buff *skb, struct flow_keys *flkeys)
1914{
1915        u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1916        struct flow_keys hash_keys;
1917        u32 mhash;
1918
1919        switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1920        case 0:
1921                memset(&hash_keys, 0, sizeof(hash_keys));
1922                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1923                if (skb) {
1924                        ip_multipath_l3_keys(skb, &hash_keys);
1925                } else {
1926                        hash_keys.addrs.v4addrs.src = fl4->saddr;
1927                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
1928                }
1929                break;
1930        case 1:
1931                /* skb is currently provided only when forwarding */
1932                if (skb) {
1933                        unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1934                        struct flow_keys keys;
1935
1936                        /* short-circuit if we already have L4 hash present */
1937                        if (skb->l4_hash)
1938                                return skb_get_hash_raw(skb) >> 1;
1939
1940                        memset(&hash_keys, 0, sizeof(hash_keys));
1941
1942                        if (!flkeys) {
1943                                skb_flow_dissect_flow_keys(skb, &keys, flag);
1944                                flkeys = &keys;
1945                        }
1946
1947                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1948                        hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1949                        hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1950                        hash_keys.ports.src = flkeys->ports.src;
1951                        hash_keys.ports.dst = flkeys->ports.dst;
1952                        hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1953                } else {
1954                        memset(&hash_keys, 0, sizeof(hash_keys));
1955                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1956                        hash_keys.addrs.v4addrs.src = fl4->saddr;
1957                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
1958                        hash_keys.ports.src = fl4->fl4_sport;
1959                        hash_keys.ports.dst = fl4->fl4_dport;
1960                        hash_keys.basic.ip_proto = fl4->flowi4_proto;
1961                }
1962                break;
1963        case 2:
1964                memset(&hash_keys, 0, sizeof(hash_keys));
1965                /* skb is currently provided only when forwarding */
1966                if (skb) {
1967                        struct flow_keys keys;
1968
1969                        skb_flow_dissect_flow_keys(skb, &keys, 0);
1970                        /* Inner can be v4 or v6 */
1971                        if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1972                                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1973                                hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1974                                hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1975                        } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1976                                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1977                                hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1978                                hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1979                                hash_keys.tags.flow_label = keys.tags.flow_label;
1980                                hash_keys.basic.ip_proto = keys.basic.ip_proto;
1981                        } else {
1982                                /* Same as case 0 */
1983                                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1984                                ip_multipath_l3_keys(skb, &hash_keys);
1985                        }
1986                } else {
1987                        /* Same as case 0 */
1988                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1989                        hash_keys.addrs.v4addrs.src = fl4->saddr;
1990                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
1991                }
1992                break;
1993        }
1994        mhash = flow_hash_from_keys(&hash_keys);
1995
1996        if (multipath_hash)
1997                mhash = jhash_2words(mhash, multipath_hash, 0);
1998
1999        return mhash >> 1;
2000}
2001#endif /* CONFIG_IP_ROUTE_MULTIPATH */
2002
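    /* When the matched route has more than one path, pick a nexthop via
     * the multipath hash before building the input route.
     */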
2003static int ip_mkroute_input(struct sk_buff *skb,
2004                            struct fib_result *res,
2005                            struct in_device *in_dev,
2006                            __be32 daddr, __be32 saddr, u32 tos,
2007                            struct flow_keys *hkeys)
2008{
2009#ifdef CONFIG_IP_ROUTE_MULTIPATH
2010        if (res->fi && fib_info_num_path(res->fi) > 1) {
2011                int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2012
2013                fib_select_multipath(res, h);
2014        }
2015#endif
2016
2017        /* create a routing cache entry */
2018        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2019}
2020
2021/* Implements the same saddr-related checks as ip_route_input_slow(),
2022 * assuming daddr is valid and the destination is not a local broadcast one.
2023 * Uses the provided hint instead of performing a route lookup.
2024 */
2025int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2026                      u8 tos, struct net_device *dev,
2027                      const struct sk_buff *hint)
2028{
2029        struct in_device *in_dev = __in_dev_get_rcu(dev);
2030        struct rtable *rt = skb_rtable(hint);
2031        struct net *net = dev_net(dev);
2032        int err = -EINVAL;
2033        u32 tag = 0;
2034
2035        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2036                goto martian_source;
2037
2038        if (ipv4_is_zeronet(saddr))
2039                goto martian_source;
2040
2041        if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2042                goto martian_source;
2043
2044        if (rt->rt_type != RTN_LOCAL)
2045                goto skip_validate_source;
2046
2047        tos &= IPTOS_RT_MASK;
2048        err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2049        if (err < 0)
2050                goto martian_source;
2051
2052skip_validate_source:
2053        skb_dst_copy(skb, hint);
2054        return 0;
2055
2056martian_source:
2057        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2058        return err;
2059}
2060
2061/*
2062 *      NOTE. We drop all packets that have a local source
2063 *      address, because every properly looped-back packet
2064 *      must already have the correct destination attached by the output routine.
2065 *      Changes in the enforced policies must also be applied to
2066 *      ip_route_use_hint().
2067 *
2068 *      This approach solves two big problems:
2069 *      1. Non-simplex devices are handled properly.
2070 *      2. IP spoofing attempts are filtered with a 100% guarantee.
2071 *      Called with rcu_read_lock().
2072 */
2073
2074static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2075                               u8 tos, struct net_device *dev,
2076                               struct fib_result *res)
2077{
2078        struct in_device *in_dev = __in_dev_get_rcu(dev);
2079        struct flow_keys *flkeys = NULL, _flkeys;
2080        struct net    *net = dev_net(dev);
2081        struct ip_tunnel_info *tun_info;
2082        int             err = -EINVAL;
2083        unsigned int    flags = 0;
2084        u32             itag = 0;
2085        struct rtable   *rth;
2086        struct flowi4   fl4;
2087        bool do_cache = true;
2088
2089        /* IP on this device is disabled. */
2090
2091        if (!in_dev)
2092                goto out;
2093
2094        /* Check for the weirdest martians, which cannot be detected
2095           by fib_lookup.
2096         */
2097
2098        tun_info = skb_tunnel_info(skb);
2099        if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2100                fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2101        else
2102                fl4.flowi4_tun_key.tun_id = 0;
2103        skb_dst_drop(skb);
2104
2105        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2106                goto martian_source;
2107
2108        res->fi = NULL;
2109        res->table = NULL;
2110        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2111                goto brd_input;
2112
2113        /* Accept zero addresses only for the limited broadcast destination;
2114         * I do not even know whether to fix it or not. Waiting for complaints :-)
2115         */
2116        if (ipv4_is_zeronet(saddr))
2117                goto martian_source;
2118
2119        if (ipv4_is_zeronet(daddr))
2120                goto martian_destination;
2121
2122        /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2123         * calling it at most once when daddr and/or saddr is a loopback address.
2124         */
2125        if (ipv4_is_loopback(daddr)) {
2126                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2127                        goto martian_destination;
2128        } else if (ipv4_is_loopback(saddr)) {
2129                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2130                        goto martian_source;
2131        }
2132
2133        /*
2134         *      Now we are ready to route packet.
2135         */
2136        fl4.flowi4_oif = 0;
2137        fl4.flowi4_iif = dev->ifindex;
2138        fl4.flowi4_mark = skb->mark;
2139        fl4.flowi4_tos = tos;
2140        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2141        fl4.flowi4_flags = 0;
2142        fl4.daddr = daddr;
2143        fl4.saddr = saddr;
2144        fl4.flowi4_uid = sock_net_uid(net, NULL);
2145
2146        if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2147                flkeys = &_flkeys;
2148        } else {
2149                fl4.flowi4_proto = 0;
2150                fl4.fl4_sport = 0;
2151                fl4.fl4_dport = 0;
2152        }
2153
2154        err = fib_lookup(net, &fl4, res, 0);
2155        if (err != 0) {
2156                if (!IN_DEV_FORWARD(in_dev))
2157                        err = -EHOSTUNREACH;
2158                goto no_route;
2159        }
2160
2161        if (res->type == RTN_BROADCAST) {
2162                if (IN_DEV_BFORWARD(in_dev))
2163                        goto make_route;
2164                /* do not cache if bc_forwarding is enabled */
2165                if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2166                        do_cache = false;
2167                goto brd_input;
2168        }
2169
2170        if (res->type == RTN_LOCAL) {
2171                err = fib_validate_source(skb, saddr, daddr, tos,
2172                                          0, dev, in_dev, &itag);
2173                if (err < 0)
2174                        goto martian_source;
2175                goto local_input;
2176        }
2177
2178        if (!IN_DEV_FORWARD(in_dev)) {
2179                err = -EHOSTUNREACH;
2180                goto no_route;
2181        }
2182        if (res->type != RTN_UNICAST)
2183                goto martian_destination;
2184
2185make_route:
2186        err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2187out:    return err;
2188
2189brd_input:
2190        if (skb->protocol != htons(ETH_P_IP))
2191                goto e_inval;
2192
2193        if (!ipv4_is_zeronet(saddr)) {
2194                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2195                                          in_dev, &itag);
2196                if (err < 0)
2197                        goto martian_source;
2198        }
2199        flags |= RTCF_BROADCAST;
2200        res->type = RTN_BROADCAST;
2201        RT_CACHE_STAT_INC(in_brd);
2202
2203local_input:
2204        do_cache &= res->fi && !itag;
2205        if (do_cache) {
2206                struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2207
2208                rth = rcu_dereference(nhc->nhc_rth_input);
2209                if (rt_cache_valid(rth)) {
2210                        skb_dst_set_noref(skb, &rth->dst);
2211                        err = 0;
2212                        goto out;
2213                }
2214        }
2215
2216        rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2217                           flags | RTCF_LOCAL, res->type,
2218                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2219        if (!rth)
2220                goto e_nobufs;
2221
2222        rth->dst.output = ip_rt_bug;
2223#ifdef CONFIG_IP_ROUTE_CLASSID
2224        rth->dst.tclassid = itag;
2225#endif
2226        rth->rt_is_input = 1;
2227
2228        RT_CACHE_STAT_INC(in_slow_tot);
2229        if (res->type == RTN_UNREACHABLE) {
2230                rth->dst.input = ip_error;
2231                rth->dst.error = -err;
2232                rth->rt_flags   &= ~RTCF_LOCAL;
2233        }
2234
2235        if (do_cache) {
2236                struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2237
2238                rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2239                if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2240                        WARN_ON(rth->dst.input == lwtunnel_input);
2241                        rth->dst.lwtstate->orig_input = rth->dst.input;
2242                        rth->dst.input = lwtunnel_input;
2243                }
2244
2245                if (unlikely(!rt_cache_route(nhc, rth)))
2246                        rt_add_uncached_list(rth);
2247        }
2248        skb_dst_set(skb, &rth->dst);
2249        err = 0;
2250        goto out;
2251
2252no_route:
2253        RT_CACHE_STAT_INC(in_no_route);
2254        res->type = RTN_UNREACHABLE;
2255        res->fi = NULL;
2256        res->table = NULL;
2257        goto local_input;
2258
2259        /*
2260         *      Do not cache martian addresses: they should be logged (RFC1812)
2261         */
2262martian_destination:
2263        RT_CACHE_STAT_INC(in_martian_dst);
2264#ifdef CONFIG_IP_ROUTE_VERBOSE
2265        if (IN_DEV_LOG_MARTIANS(in_dev))
2266                net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2267                                     &daddr, &saddr, dev->name);
2268#endif
2269
2270e_inval:
2271        err = -EINVAL;
2272        goto out;
2273
2274e_nobufs:
2275        err = -ENOBUFS;
2276        goto out;
2277
2278martian_source:
2279        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2280        goto out;
2281}
2282
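    /* Entry point for input route resolution: masks the tos bits that
     * do not take part in routing and runs the lookup under
     * rcu_read_lock().
     */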
2283int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2284                         u8 tos, struct net_device *dev)
2285{
2286        struct fib_result res;
2287        int err;
2288
2289        tos &= IPTOS_RT_MASK;
2290        rcu_read_lock();
2291        err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2292        rcu_read_unlock();
2293
2294        return err;
2295}
2296EXPORT_SYMBOL(ip_route_input_noref);
2297
2298/* called with rcu_read_lock held */
2299int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2300                       u8 tos, struct net_device *dev, struct fib_result *res)
2301{
2302        /* Multicast recognition logic was moved from the route cache to here.
2303           The problem was that too many Ethernet cards have broken/missing
2304           hardware multicast filters :-( As a result, a host on a multicast
2305           network acquires a lot of useless route cache entries, e.g. for
2306           SDR messages from all over the world. Now we try to get rid of them.
2307           Really, provided the software IP multicast filter is organized
2308           reasonably (at least, hashed), it does not result in a slowdown
2309           compared with route cache reject entries.
2310           Note that multicast routers are not affected, because a
2311           route cache entry is created eventually.
2312         */
2313        if (ipv4_is_multicast(daddr)) {
2314                struct in_device *in_dev = __in_dev_get_rcu(dev);
2315                int our = 0;
2316                int err = -EINVAL;
2317
2318                if (!in_dev)
2319                        return err;
2320                our = ip_check_mc_rcu(in_dev, daddr, saddr,
2321                                      ip_hdr(skb)->protocol);
2322
2323                /* check l3 master if no match yet */
2324                if (!our && netif_is_l3_slave(dev)) {
2325                        struct in_device *l3_in_dev;
2326
2327                        l3_in_dev = __in_dev_get_rcu(skb->dev);
2328                        if (l3_in_dev)
2329                                our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2330                                                      ip_hdr(skb)->protocol);
2331                }
2332
2333                if (our
2334#ifdef CONFIG_IP_MROUTE
2335                        ||
2336                    (!ipv4_is_local_multicast(daddr) &&
2337                     IN_DEV_MFORWARD(in_dev))
2338#endif
2339                   ) {
2340                        err = ip_route_input_mc(skb, daddr, saddr,
2341                                                tos, dev, our);
2342                }
2343                return err;
2344        }
2345
2346        return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2347}
2348
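    /* Build the output route once a FIB result has been chosen, reusing
     * a cached nexthop route (or nexthop exception) when it is still
     * valid.
     */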
2349/* called with rcu_read_lock() */
2350static struct rtable *__mkroute_output(const struct fib_result *res,
2351                                       const struct flowi4 *fl4, int orig_oif,
2352                                       struct net_device *dev_out,
2353                                       unsigned int flags)
2354{
2355        struct fib_info *fi = res->fi;
2356        struct fib_nh_exception *fnhe;
2357        struct in_device *in_dev;
2358        u16 type = res->type;
2359        struct rtable *rth;
2360        bool do_cache;
2361
2362        in_dev = __in_dev_get_rcu(dev_out);
2363        if (!in_dev)
2364                return ERR_PTR(-EINVAL);
2365
2366        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2367                if (ipv4_is_loopback(fl4->saddr) &&
2368                    !(dev_out->flags & IFF_LOOPBACK) &&
2369                    !netif_is_l3_master(dev_out))
2370                        return ERR_PTR(-EINVAL);
2371
2372        if (ipv4_is_lbcast(fl4->daddr))
2373                type = RTN_BROADCAST;
2374        else if (ipv4_is_multicast(fl4->daddr))
2375                type = RTN_MULTICAST;
2376        else if (ipv4_is_zeronet(fl4->daddr))
2377                return ERR_PTR(-EINVAL);
2378
2379        if (dev_out->flags & IFF_LOOPBACK)
2380                flags |= RTCF_LOCAL;
2381
2382        do_cache = true;
2383        if (type == RTN_BROADCAST) {
2384                flags |= RTCF_BROADCAST | RTCF_LOCAL;
2385                fi = NULL;
2386        } else if (type == RTN_MULTICAST) {
2387                flags |= RTCF_MULTICAST | RTCF_LOCAL;
2388                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2389                                     fl4->flowi4_proto))
2390                        flags &= ~RTCF_LOCAL;
2391                else
2392                        do_cache = false;
2393                /* If a multicast route does not exist, use the
2394                 * default one, but do not use a gateway in this case.
2395                 * Yes, it is a hack.
2396                 */
2397                if (fi && res->prefixlen < 4)
2398                        fi = NULL;
2399        } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2400                   (orig_oif != dev_out->ifindex)) {
2401                /* For local routes that require a particular output interface
2402                 * we do not want to cache the result.  Caching the result
2403                 * causes incorrect behaviour when there are multiple source
2404                 * addresses on the interface, the end result being that if the
2405                 * intended recipient is waiting on that interface for the
2406                 * packet he won't receive it because it will be delivered on
2407                 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2408                 * be set to the loopback interface as well.
2409                 */
2410                do_cache = false;
2411        }
2412
2413        fnhe = NULL;
2414        do_cache &= fi != NULL;
2415        if (fi) {
2416                struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2417                struct rtable __rcu **prth;
2418
2419                fnhe = find_exception(nhc, fl4->daddr);
2420                if (!do_cache)
2421                        goto add;
2422                if (fnhe) {
2423                        prth = &fnhe->fnhe_rth_output;
2424                } else {
2425                        if (unlikely(fl4->flowi4_flags &
2426                                     FLOWI_FLAG_KNOWN_NH &&
2427                                     !(nhc->nhc_gw_family &&
2428                                       nhc->nhc_scope == RT_SCOPE_LINK))) {
2429                                do_cache = false;
2430                                goto add;
2431                        }
2432                        prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2433                }
2434                rth = rcu_dereference(*prth);
2435                if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2436                        return rth;
2437        }
2438
2439add:
2440        rth = rt_dst_alloc(dev_out, flags, type,
2441                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
2442                           IN_DEV_CONF_GET(in_dev, NOXFRM));
2443        if (!rth)
2444                return ERR_PTR(-ENOBUFS);
2445
2446        rth->rt_iif = orig_oif;
2447
2448        RT_CACHE_STAT_INC(out_slow_tot);
2449
2450        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2451                if (flags & RTCF_LOCAL &&
2452                    !(dev_out->flags & IFF_LOOPBACK)) {
2453                        rth->dst.output = ip_mc_output;
2454                        RT_CACHE_STAT_INC(out_slow_mc);
2455                }
2456#ifdef CONFIG_IP_MROUTE
2457                if (type == RTN_MULTICAST) {
2458                        if (IN_DEV_MFORWARD(in_dev) &&
2459                            !ipv4_is_local_multicast(fl4->daddr)) {
2460                                rth->dst.input = ip_mr_input;
2461                                rth->dst.output = ip_mc_output;
2462                        }
2463                }
2464#endif
2465        }
2466
2467        rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2468        lwtunnel_set_redirect(&rth->dst);
2469
2470        return rth;
2471}
2472
2473/*
2474 * Major route resolver routine.
2475 */
2476
2477struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2478                                        const struct sk_buff *skb)
2479{
2480        __u8 tos = RT_FL_TOS(fl4);
2481        struct fib_result res = {
2482                .type           = RTN_UNSPEC,
2483                .fi             = NULL,
2484                .table          = NULL,
2485                .tclassid       = 0,
2486        };
2487        struct rtable *rth;
2488
2489        fl4->flowi4_iif = LOOPBACK_IFINDEX;
2490        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2491        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2492                         RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2493
2494        rcu_read_lock();
2495        rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2496        rcu_read_unlock();
2497
2498        return rth;
2499}
2500EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2501
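    /* Core of output route resolution; called with rcu_read_lock()
     * held.  May rewrite fl4->saddr, fl4->daddr and fl4->flowi4_oif as
     * the source address and output device are chosen.
     */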
2502struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2503                                            struct fib_result *res,
2504                                            const struct sk_buff *skb)
2505{
2506        struct net_device *dev_out = NULL;
2507        int orig_oif = fl4->flowi4_oif;
2508        unsigned int flags = 0;
2509        struct rtable *rth;
2510        int err;
2511
2512        if (fl4->saddr) {
2513                if (ipv4_is_multicast(fl4->saddr) ||
2514                    ipv4_is_lbcast(fl4->saddr) ||
2515                    ipv4_is_zeronet(fl4->saddr)) {
2516                        rth = ERR_PTR(-EINVAL);
2517                        goto out;
2518                }
2519
2520                rth = ERR_PTR(-ENETUNREACH);
2521
2522                /* I removed the check for oif == dev_out->oif here.
2523                   It was wrong for two reasons:
2524                   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2525                      is assigned to multiple interfaces.
2526                   2. Moreover, we are allowed to send packets with the saddr
2527                      of another iface. --ANK
2528                 */
2529
2530                if (fl4->flowi4_oif == 0 &&
2531                    (ipv4_is_multicast(fl4->daddr) ||
2532                     ipv4_is_lbcast(fl4->daddr))) {
2533                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2534                        dev_out = __ip_dev_find(net, fl4->saddr, false);
2535                        if (!dev_out)
2536                                goto out;
2537
2538                        /* Special hack: the user can direct multicasts
2539                           and limited broadcast via the necessary interface
2540                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2541                           This hack is not just for fun, it allows
2542                           vic, vat and friends to work.
2543                           They bind a socket to loopback, set the ttl to zero
2544                           and expect that it will work.
2545                           From the viewpoint of the routing cache they are broken,
2546                           because we are not allowed to build a multicast path
2547                           with a loopback source address (look, the routing cache
2548                           cannot know that the ttl is zero, so the packet
2549                           will not leave this host and the route looks valid).
2550                           Luckily, this hack is a good workaround.
2551                         */
2552
2553                        fl4->flowi4_oif = dev_out->ifindex;
2554                        goto make_route;
2555                }
2556
2557                if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2558                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2559                        if (!__ip_dev_find(net, fl4->saddr, false))
2560                                goto out;
2561                }
2562        }
2563
2564
2565        if (fl4->flowi4_oif) {
2566                dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2567                rth = ERR_PTR(-ENODEV);
2568                if (!dev_out)
2569                        goto out;
2570
2571                /* RACE: Check return value of inet_select_addr instead. */
2572                if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2573                        rth = ERR_PTR(-ENETUNREACH);
2574                        goto out;
2575                }
2576                if (ipv4_is_local_multicast(fl4->daddr) ||
2577                    ipv4_is_lbcast(fl4->daddr) ||
2578                    fl4->flowi4_proto == IPPROTO_IGMP) {
2579                        if (!fl4->saddr)
2580                                fl4->saddr = inet_select_addr(dev_out, 0,
2581                                                              RT_SCOPE_LINK);
2582                        goto make_route;
2583                }
2584                if (!fl4->saddr) {
2585                        if (ipv4_is_multicast(fl4->daddr))
2586                                fl4->saddr = inet_select_addr(dev_out, 0,
2587                                                              fl4->flowi4_scope);
2588                        else if (!fl4->daddr)
2589                                fl4->saddr = inet_select_addr(dev_out, 0,
2590                                                              RT_SCOPE_HOST);
2591                }
2592        }
2593
2594        if (!fl4->daddr) {
2595                fl4->daddr = fl4->saddr;
2596                if (!fl4->daddr)
2597                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2598                dev_out = net->loopback_dev;
2599                fl4->flowi4_oif = LOOPBACK_IFINDEX;
2600                res->type = RTN_LOCAL;
2601                flags |= RTCF_LOCAL;
2602                goto make_route;
2603        }
2604
2605        err = fib_lookup(net, fl4, res, 0);
2606        if (err) {
2607                res->fi = NULL;
2608                res->table = NULL;
2609                if (fl4->flowi4_oif &&
2610                    (ipv4_is_multicast(fl4->daddr) ||
2611                    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2612                        /* Apparently, the routing tables are wrong. Assume
2613                           that the destination is on-link.
2614
2615                           WHY? DW.
2616                           Because we are allowed to send to an iface
2617                           even if it has NO routes and NO assigned
2618                           addresses. When oif is specified, routing
2619                           tables are looked up with only one purpose:
2620                           to catch whether the destination is gatewayed, rather
2621                           than direct. Moreover, if MSG_DONTROUTE is set,
2622                           we send the packet, ignoring both routing tables
2623                           and ifaddr state. --ANK
2624
2625
2626                           We could do the same even if oif is unknown,
2627                           like IPv6 does, but we do not.
2628                         */
2629
2630                        if (fl4->saddr == 0)
2631                                fl4->saddr = inet_select_addr(dev_out, 0,
2632                                                              RT_SCOPE_LINK);
2633                        res->type = RTN_UNICAST;
2634                        goto make_route;
2635                }
2636                rth = ERR_PTR(err);
2637                goto out;
2638        }
2639
2640        if (res->type == RTN_LOCAL) {
2641                if (!fl4->saddr) {
2642                        if (res->fi->fib_prefsrc)
2643                                fl4->saddr = res->fi->fib_prefsrc;
2644                        else
2645                                fl4->saddr = fl4->daddr;
2646                }
2647
2648                /* L3 master device is the loopback for that domain */
2649                dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2650                        net->loopback_dev;
2651
2652                /* make sure orig_oif points to fib result device even
2653                 * though packet rx/tx happens over loopback or l3mdev
2654                 */
2655                orig_oif = FIB_RES_OIF(*res);
2656
2657                fl4->flowi4_oif = dev_out->ifindex;
2658                flags |= RTCF_LOCAL;
2659                goto make_route;
2660        }
2661
2662        fib_select_path(net, res, fl4, skb);
2663
2664        dev_out = FIB_RES_DEV(*res);
2665        fl4->flowi4_oif = dev_out->ifindex;
2666
2667
2668make_route:
2669        rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2670
2671out:
2672        return rth;
2673}
2674
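    /* The blackhole dst_ops below ignore PMTU updates and redirects,
     * and their dst_check() always fails, so a blackhole route is never
     * revalidated and must be looked up afresh.
     */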
2675static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2676{
2677        return NULL;
2678}
2679
2680static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2681{
2682        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2683
2684        return mtu ? : dst->dev->mtu;
2685}
2686
2687static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2688                                          struct sk_buff *skb, u32 mtu,
2689                                          bool confirm_neigh)
2690{
2691}
2692
2693static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2694                                       struct sk_buff *skb)
2695{
2696}
2697
2698static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2699                                          unsigned long old)
2700{
2701        return NULL;
2702}
2703
2704static struct dst_ops ipv4_dst_blackhole_ops = {
2705        .family                 =       AF_INET,
2706        .check                  =       ipv4_blackhole_dst_check,
2707        .mtu                    =       ipv4_blackhole_mtu,
2708        .default_advmss         =       ipv4_default_advmss,
2709        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2710        .redirect               =       ipv4_rt_blackhole_redirect,
2711        .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2712        .neigh_lookup           =       ipv4_neigh_lookup,
2713};
2714
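    /* Replace @dst_orig with a route that silently discards all traffic
     * while preserving the original routing metadata; the caller's
     * reference on @dst_orig is consumed.
     */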
2715struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2716{
2717        struct rtable *ort = (struct rtable *) dst_orig;
2718        struct rtable *rt;
2719
2720        rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2721        if (rt) {
2722                struct dst_entry *new = &rt->dst;
2723
2724                new->__use = 1;
2725                new->input = dst_discard;
2726                new->output = dst_discard_out;
2727
2728                new->dev = net->loopback_dev;
2729                if (new->dev)
2730                        dev_hold(new->dev);
2731
2732                rt->rt_is_input = ort->rt_is_input;
2733                rt->rt_iif = ort->rt_iif;
2734                rt->rt_pmtu = ort->rt_pmtu;
2735                rt->rt_mtu_locked = ort->rt_mtu_locked;
2736
2737                rt->rt_genid = rt_genid_ipv4(net);
2738                rt->rt_flags = ort->rt_flags;
2739                rt->rt_type = ort->rt_type;
2740                rt->rt_uses_gateway = ort->rt_uses_gateway;
2741                rt->rt_gw_family = ort->rt_gw_family;
2742                if (rt->rt_gw_family == AF_INET)
2743                        rt->rt_gw4 = ort->rt_gw4;
2744                else if (rt->rt_gw_family == AF_INET6)
2745                        rt->rt_gw6 = ort->rt_gw6;
2746
2747                INIT_LIST_HEAD(&rt->rt_uncached);
2748        }
2749
2750        dst_release(dst_orig);
2751
2752        return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2753}
2754
2755struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2756                                    const struct sock *sk)
2757{
2758        struct rtable *rt = __ip_route_output_key(net, flp4);
2759
2760        if (IS_ERR(rt))
2761                return rt;
2762
2763        if (flp4->flowi4_proto)
2764                rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2765                                                        flowi4_to_flowi(flp4),
2766                                                        sk, 0);
2767
2768        return rt;
2769}
2770EXPORT_SYMBOL_GPL(ip_route_output_flow);
2771
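    /* Resolve the output route for tunnel metadata @info, optionally
     * going through the tunnel's dst_cache, and reject routes that
     * would loop back into the tunnel device.  The chosen source
     * address is returned via @saddr.
     */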
2772struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2773                                      struct net_device *dev,
2774                                      struct net *net, __be32 *saddr,
2775                                      const struct ip_tunnel_info *info,
2776                                      u8 protocol, bool use_cache)
2777{
2778#ifdef CONFIG_DST_CACHE
2779        struct dst_cache *dst_cache;
2780#endif
2781        struct rtable *rt = NULL;
2782        struct flowi4 fl4;
2783        __u8 tos;
2784
2785#ifdef CONFIG_DST_CACHE
2786        dst_cache = (struct dst_cache *)&info->dst_cache;
2787        if (use_cache) {
2788                rt = dst_cache_get_ip4(dst_cache, saddr);
2789                if (rt)
2790                        return rt;
2791        }
2792#endif
2793        memset(&fl4, 0, sizeof(fl4));
2794        fl4.flowi4_mark = skb->mark;
2795        fl4.flowi4_proto = protocol;
2796        fl4.daddr = info->key.u.ipv4.dst;
2797        fl4.saddr = info->key.u.ipv4.src;
2798        tos = info->key.tos;
2799        fl4.flowi4_tos = RT_TOS(tos);
2800
2801        rt = ip_route_output_key(net, &fl4);
2802        if (IS_ERR(rt)) {
2803                netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2804                return ERR_PTR(-ENETUNREACH);
2805        }
2806        if (rt->dst.dev == dev) { /* is this necessary? */
2807                netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2808                ip_rt_put(rt);
2809                return ERR_PTR(-ELOOP);
2810        }
2811#ifdef CONFIG_DST_CACHE
2812        if (use_cache)
2813                dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2814#endif
2815        *saddr = fl4.saddr;
2816        return rt;
2817}
2818EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2819
2820/* called with rcu_read_lock held */
2821static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2822                        struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2823                        struct sk_buff *skb, u32 portid, u32 seq,
2824                        unsigned int flags)
2825{
2826        struct rtmsg *r;
2827        struct nlmsghdr *nlh;
2828        unsigned long expires = 0;
2829        u32 error;
2830        u32 metrics[RTAX_MAX];
2831
2832        nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2833        if (!nlh)
2834                return -EMSGSIZE;
2835
2836        r = nlmsg_data(nlh);
2837        r->rtm_family    = AF_INET;
2838        r->rtm_dst_len  = 32;
2839        r->rtm_src_len  = 0;
2840        r->rtm_tos      = fl4 ? fl4->flowi4_tos : 0;
2841        r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2842        if (nla_put_u32(skb, RTA_TABLE, table_id))
2843                goto nla_put_failure;
2844        r->rtm_type     = rt->rt_type;
2845        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2846        r->rtm_protocol = RTPROT_UNSPEC;
2847        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2848        if (rt->rt_flags & RTCF_NOTIFY)
2849                r->rtm_flags |= RTM_F_NOTIFY;
2850        if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2851                r->rtm_flags |= RTCF_DOREDIRECT;
2852
2853        if (nla_put_in_addr(skb, RTA_DST, dst))
2854                goto nla_put_failure;
2855        if (src) {
2856                r->rtm_src_len = 32;
2857                if (nla_put_in_addr(skb, RTA_SRC, src))
2858                        goto nla_put_failure;
2859        }
2860        if (rt->dst.dev &&
2861            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2862                goto nla_put_failure;
2863#ifdef CONFIG_IP_ROUTE_CLASSID
2864        if (rt->dst.tclassid &&
2865            nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2866                goto nla_put_failure;
2867#endif
2868        if (fl4 && !rt_is_input_route(rt) &&
2869            fl4->saddr != src) {
2870                if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2871                        goto nla_put_failure;
2872        }
2873        if (rt->rt_uses_gateway) {
2874                if (rt->rt_gw_family == AF_INET &&
2875                    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2876                        goto nla_put_failure;
2877                } else if (rt->rt_gw_family == AF_INET6) {
2878                        int alen = sizeof(struct in6_addr);
2879                        struct nlattr *nla;
2880                        struct rtvia *via;
2881
2882                        nla = nla_reserve(skb, RTA_VIA, alen + 2);
2883                        if (!nla)
2884                                goto nla_put_failure;
2885
2886                        via = nla_data(nla);
2887                        via->rtvia_family = AF_INET6;
2888                        memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2889                }
2890        }
2891
2892        expires = rt->dst.expires;
2893        if (expires) {
2894                unsigned long now = jiffies;
2895
2896                if (time_before(now, expires))
2897                        expires -= now;
2898                else
2899                        expires = 0;
2900        }
2901
2902        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2903        if (rt->rt_pmtu && expires)
2904                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2905        if (rt->rt_mtu_locked && expires)
2906                metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2907        if (rtnetlink_put_metrics(skb, metrics) < 0)
2908                goto nla_put_failure;
2909
2910        if (fl4) {
2911                if (fl4->flowi4_mark &&
2912                    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2913                        goto nla_put_failure;
2914
2915                if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2916                    nla_put_u32(skb, RTA_UID,
2917                                from_kuid_munged(current_user_ns(),
2918                                                 fl4->flowi4_uid)))
2919                        goto nla_put_failure;
2920
2921                if (rt_is_input_route(rt)) {
2922#ifdef CONFIG_IP_MROUTE
2923                        if (ipv4_is_multicast(dst) &&
2924                            !ipv4_is_local_multicast(dst) &&
2925                            IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2926                                int err = ipmr_get_route(net, skb,
2927                                                         fl4->saddr, fl4->daddr,
2928                                                         r, portid);
2929
2930                                if (err <= 0) {
2931                                        if (err == 0)
2932                                                return 0;
2933                                        goto nla_put_failure;
2934                                }
2935                        } else
2936#endif
2937                                if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2938                                        goto nla_put_failure;
2939                }
2940        }
2941
2942        error = rt->dst.error;
2943
2944        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2945                goto nla_put_failure;
2946
2947        nlmsg_end(skb, nlh);
2948        return 0;
2949
2950nla_put_failure:
2951        nlmsg_cancel(skb, nlh);
2952        return -EMSGSIZE;
2953}
2954
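/* Walk one nexthop-exception hash table and emit an RTM_NEWROUTE message
 * for every live exception of the current generation.  *fa_index counts
 * every entry visited, dumped or skipped, so a resumed netlink dump can
 * seek past the first fa_start entries; the caller is expected to hold
 * rcu_read_lock().
 */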
2955static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2956                            struct netlink_callback *cb, u32 table_id,
2957                            struct fnhe_hash_bucket *bucket, int genid,
2958                            int *fa_index, int fa_start, unsigned int flags)
2959{
2960        int i;
2961
2962        for (i = 0; i < FNHE_HASH_SIZE; i++) {
2963                struct fib_nh_exception *fnhe;
2964
2965                for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2966                     fnhe = rcu_dereference(fnhe->fnhe_next)) {
2967                        struct rtable *rt;
2968                        int err;
2969
2970                        if (*fa_index < fa_start)
2971                                goto next;
2972
2973                        if (fnhe->fnhe_genid != genid)
2974                                goto next;
2975
2976                        if (fnhe->fnhe_expires &&
2977                            time_after(jiffies, fnhe->fnhe_expires))
2978                                goto next;
2979
2980                        rt = rcu_dereference(fnhe->fnhe_rth_input);
2981                        if (!rt)
2982                                rt = rcu_dereference(fnhe->fnhe_rth_output);
2983                        if (!rt)
2984                                goto next;
2985
2986                        err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
2987                                           table_id, NULL, skb,
2988                                           NETLINK_CB(cb->skb).portid,
2989                                           cb->nlh->nlmsg_seq, flags);
2990                        if (err)
2991                                return err;
2992next:
2993                        (*fa_index)++;
2994                }
2995        }
2996
2997        return 0;
2998}
2999
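/* Dump the cached exceptions hanging off every live nexthop of @fi.
 * This is the route.c half of the FIB dump path's exception dumping;
 * each bucket walk runs under its own RCU read-side section.
 */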
3000int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3001                       u32 table_id, struct fib_info *fi,
3002                       int *fa_index, int fa_start, unsigned int flags)
3003{
3004        struct net *net = sock_net(cb->skb->sk);
3005        int nhsel, genid = fnhe_genid(net);
3006
3007        for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3008                struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3009                struct fnhe_hash_bucket *bucket;
3010                int err;
3011
3012                if (nhc->nhc_flags & RTNH_F_DEAD)
3013                        continue;
3014
3015                rcu_read_lock();
3016                bucket = rcu_dereference(nhc->nhc_exceptions);
3017                err = 0;
3018                if (bucket)
3019                        err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3020                                               genid, fa_index, fa_start,
3021                                               flags);
3022                rcu_read_unlock();
3023                if (err)
3024                        return err;
3025        }
3026
3027        return 0;
3028}
3029
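/* Build a throwaway skb carrying a dummy IPv4 header (and, depending on
 * ip_proto, a minimal UDP/TCP/ICMP transport header) so that the lookup
 * done for RTM_GETROUTE sees something shaped like real traffic.  The
 * skb is never transmitted.
 */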
3030static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3031                                                   u8 ip_proto, __be16 sport,
3032                                                   __be16 dport)
3033{
3034        struct sk_buff *skb;
3035        struct iphdr *iph;
3036
3037        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3038        if (!skb)
3039                return NULL;
3040
3041        /* Reserve room for dummy headers; this skb can pass
3042         * through a good chunk of the routing engine.
3043         */
3044        skb_reset_mac_header(skb);
3045        skb_reset_network_header(skb);
3046        skb->protocol = htons(ETH_P_IP);
3047        iph = skb_put(skb, sizeof(struct iphdr));
3048        iph->protocol = ip_proto;
3049        iph->saddr = src;
3050        iph->daddr = dst;
3051        iph->version = 0x4;
3052        iph->frag_off = 0;
3053        iph->ihl = 0x5;
3054        skb_set_transport_header(skb, skb->len);
3055
3056        switch (iph->protocol) {
3057        case IPPROTO_UDP: {
3058                struct udphdr *udph;
3059
3060                udph = skb_put_zero(skb, sizeof(struct udphdr));
3061                udph->source = sport;
3062                udph->dest = dport;
3063                udph->len = htons(sizeof(struct udphdr));
3064                udph->check = 0;
3065                break;
3066        }
3067        case IPPROTO_TCP: {
3068                struct tcphdr *tcph;
3069
3070                tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3071                tcph->source    = sport;
3072                tcph->dest      = dport;
3073                tcph->doff      = sizeof(struct tcphdr) / 4;
3074                tcph->rst = 1;
3075                tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3076                                            src, dst, 0);
3077                break;
3078        }
3079        case IPPROTO_ICMP: {
3080                struct icmphdr *icmph;
3081
3082                icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3083                icmph->type = ICMP_ECHO;
3084                icmph->code = 0;
3085        }
3086        }
3087
3088        return skb;
3089}
3090
3091static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3092                                       const struct nlmsghdr *nlh,
3093                                       struct nlattr **tb,
3094                                       struct netlink_ext_ack *extack)
3095{
3096        struct rtmsg *rtm;
3097        int i, err;
3098
3099        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3100                NL_SET_ERR_MSG(extack,
3101                               "ipv4: Invalid header for route get request");
3102                return -EINVAL;
3103        }
3104
3105        if (!netlink_strict_get_check(skb))
3106                return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3107                                              rtm_ipv4_policy, extack);
3108
3109        rtm = nlmsg_data(nlh);
3110        if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3111            (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3112            rtm->rtm_table || rtm->rtm_protocol ||
3113            rtm->rtm_scope || rtm->rtm_type) {
3114                NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3115                return -EINVAL;
3116        }
3117
3118        if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3119                               RTM_F_LOOKUP_TABLE |
3120                               RTM_F_FIB_MATCH)) {
3121                NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3122                return -EINVAL;
3123        }
3124
3125        err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3126                                            rtm_ipv4_policy, extack);
3127        if (err)
3128                return err;
3129
3130        if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3131            (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3132                NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3133                return -EINVAL;
3134        }
3135
3136        for (i = 0; i <= RTA_MAX; i++) {
3137                if (!tb[i])
3138                        continue;
3139
3140                switch (i) {
3141                case RTA_IIF:
3142                case RTA_OIF:
3143                case RTA_SRC:
3144                case RTA_DST:
3145                case RTA_IP_PROTO:
3146                case RTA_SPORT:
3147                case RTA_DPORT:
3148                case RTA_MARK:
3149                case RTA_UID:
3150                        break;
3151                default:
3152                        NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3153                        return -EINVAL;
3154                }
3155        }
3156
3157        return 0;
3158}
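/* Under strict checking a conforming IPv4 route get request is just an
 * rtmsg plus a handful of allowed attributes.  A minimal sketch of one
 * asking for a single destination (hypothetical userspace code; socket
 * setup, sendmsg() plumbing and error handling omitted):
 *
 *	struct {
 *		struct nlmsghdr nlh;
 *		struct rtmsg rtm;
 *		struct nlattr nla;
 *		__be32 dst;
 *	} req = {
 *		.nlh.nlmsg_len   = sizeof(req),
 *		.nlh.nlmsg_type  = RTM_GETROUTE,
 *		.nlh.nlmsg_flags = NLM_F_REQUEST,
 *		.rtm.rtm_family  = AF_INET,
 *		.rtm.rtm_dst_len = 32,
 *		.nla.nla_type    = RTA_DST,
 *		.nla.nla_len     = NLA_HDRLEN + sizeof(__be32),
 *	};
 *
 * with req.dst filled in network byte order.  Note rtm_dst_len must be
 * 32 whenever RTA_DST is present, per the check above.
 */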
3159
3160static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3161                             struct netlink_ext_ack *extack)
3162{
3163        struct net *net = sock_net(in_skb->sk);
3164        struct nlattr *tb[RTA_MAX+1];
3165        u32 table_id = RT_TABLE_MAIN;
3166        __be16 sport = 0, dport = 0;
3167        struct fib_result res = {};
3168        u8 ip_proto = IPPROTO_UDP;
3169        struct rtable *rt = NULL;
3170        struct sk_buff *skb;
3171        struct rtmsg *rtm;
3172        struct flowi4 fl4 = {};
3173        __be32 dst = 0;
3174        __be32 src = 0;
3175        kuid_t uid;
3176        u32 iif;
3177        int err;
3178        int mark;
3179
3180        err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3181        if (err < 0)
3182                return err;
3183
3184        rtm = nlmsg_data(nlh);
3185        src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3186        dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3187        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3188        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3189        if (tb[RTA_UID])
3190                uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3191        else
3192                uid = (iif ? INVALID_UID : current_uid());
3193
3194        if (tb[RTA_IP_PROTO]) {
3195                err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3196                                                  &ip_proto, AF_INET, extack);
3197                if (err)
3198                        return err;
3199        }
3200
3201        if (tb[RTA_SPORT])
3202                sport = nla_get_be16(tb[RTA_SPORT]);
3203
3204        if (tb[RTA_DPORT])
3205                dport = nla_get_be16(tb[RTA_DPORT]);
3206
3207        skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3208        if (!skb)
3209                return -ENOBUFS;
3210
3211        fl4.daddr = dst;
3212        fl4.saddr = src;
3213        fl4.flowi4_tos = rtm->rtm_tos;
3214        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3215        fl4.flowi4_mark = mark;
3216        fl4.flowi4_uid = uid;
3217        if (sport)
3218                fl4.fl4_sport = sport;
3219        if (dport)
3220                fl4.fl4_dport = dport;
3221        fl4.flowi4_proto = ip_proto;
3222
3223        rcu_read_lock();
3224
3225        if (iif) {
3226                struct net_device *dev;
3227
3228                dev = dev_get_by_index_rcu(net, iif);
3229                if (!dev) {
3230                        err = -ENODEV;
3231                        goto errout_rcu;
3232                }
3233
3234                fl4.flowi4_iif = iif; /* for rt_fill_info */
3235                skb->dev        = dev;
3236                skb->mark       = mark;
3237                err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
3238                                         dev, &res);
3239
3240                rt = skb_rtable(skb);
3241                if (err == 0 && rt->dst.error)
3242                        err = -rt->dst.error;
3243        } else {
3244                fl4.flowi4_iif = LOOPBACK_IFINDEX;
3245                skb->dev = net->loopback_dev;
3246                rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3247                err = 0;
3248                if (IS_ERR(rt))
3249                        err = PTR_ERR(rt);
3250                else
3251                        skb_dst_set(skb, &rt->dst);
3252        }
3253
3254        if (err)
3255                goto errout_rcu;
3256
3257        if (rtm->rtm_flags & RTM_F_NOTIFY)
3258                rt->rt_flags |= RTCF_NOTIFY;
3259
3260        if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3261                table_id = res.table ? res.table->tb_id : 0;
3262
3263        /* reset skb for netlink reply msg */
3264        skb_trim(skb, 0);
3265        skb_reset_network_header(skb);
3266        skb_reset_transport_header(skb);
3267        skb_reset_mac_header(skb);
3268
3269        if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3270                struct fib_rt_info fri;
3271
3272                if (!res.fi) {
3273                        err = fib_props[res.type].error;
3274                        if (!err)
3275                                err = -EHOSTUNREACH;
3276                        goto errout_rcu;
3277                }
3278                fri.fi = res.fi;
3279                fri.tb_id = table_id;
3280                fri.dst = res.prefix;
3281                fri.dst_len = res.prefixlen;
3282                fri.tos = fl4.flowi4_tos;
3283                fri.type = rt->rt_type;
3284                fri.offload = 0;
3285                fri.trap = 0;
3286                if (res.fa_head) {
3287                        struct fib_alias *fa;
3288
3289                        hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3290                                u8 slen = 32 - fri.dst_len;
3291
3292                                if (fa->fa_slen == slen &&
3293                                    fa->tb_id == fri.tb_id &&
3294                                    fa->fa_tos == fri.tos &&
3295                                    fa->fa_info == res.fi &&
3296                                    fa->fa_type == fri.type) {
3297                                        fri.offload = fa->offload;
3298                                        fri.trap = fa->trap;
3299                                        break;
3300                                }
3301                        }
3302                }
3303                err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3304                                    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3305        } else {
3306                err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3307                                   NETLINK_CB(in_skb).portid,
3308                                   nlh->nlmsg_seq, 0);
3309        }
3310        if (err < 0)
3311                goto errout_rcu;
3312
3313        rcu_read_unlock();
3314
3315        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3316
3317errout_free:
3318        return err;
3319errout_rcu:
3320        rcu_read_unlock();
3321        kfree_skb(skb);
3322        goto errout_free;
3323}
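/* This doit handler is what services "ip route get", e.g.:
 *
 *	$ ip route get 192.0.2.1
 *
 * (documentation-prefix example address).  With RTM_F_FIB_MATCH set the
 * reply describes the matched FIB entry rather than the resolved dst,
 * which is what "ip route get fibmatch" asks for.
 */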
3324
3325void ip_rt_multicast_event(struct in_device *in_dev)
3326{
3327        rt_cache_flush(dev_net(in_dev->dev));
3328}
3329
3330#ifdef CONFIG_SYSCTL
3331static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3332static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
3333static int ip_rt_gc_elasticity __read_mostly    = 8;
3334static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
3335
3336static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3337                void *buffer, size_t *lenp, loff_t *ppos)
3338{
3339        struct net *net = (struct net *)__ctl->extra1;
3340
3341        if (write) {
3342                rt_cache_flush(net);
3343                fnhe_genid_bump(net);
3344                return 0;
3345        }
3346
3347        return -EINVAL;
3348}
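/* The flush file is write-only: writing any value drops all cached
 * routing state for the netns by flushing the route cache and bumping
 * the fnhe generation, e.g. (from a shell, value ignored):
 *
 *	# echo 1 > /proc/sys/net/ipv4/route/flush
 */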
3349
3350static struct ctl_table ipv4_route_table[] = {
3351        {
3352                .procname       = "gc_thresh",
3353                .data           = &ipv4_dst_ops.gc_thresh,
3354                .maxlen         = sizeof(int),
3355                .mode           = 0644,
3356                .proc_handler   = proc_dointvec,
3357        },
3358        {
3359                .procname       = "max_size",
3360                .data           = &ip_rt_max_size,
3361                .maxlen         = sizeof(int),
3362                .mode           = 0644,
3363                .proc_handler   = proc_dointvec,
3364        },
3365        {
3366                /*  Deprecated. Use gc_min_interval_ms */
3367
3368                .procname       = "gc_min_interval",
3369                .data           = &ip_rt_gc_min_interval,
3370                .maxlen         = sizeof(int),
3371                .mode           = 0644,
3372                .proc_handler   = proc_dointvec_jiffies,
3373        },
3374        {
3375                .procname       = "gc_min_interval_ms",
3376                .data           = &ip_rt_gc_min_interval,
3377                .maxlen         = sizeof(int),
3378                .mode           = 0644,
3379                .proc_handler   = proc_dointvec_ms_jiffies,
3380        },
3381        {
3382                .procname       = "gc_timeout",
3383                .data           = &ip_rt_gc_timeout,
3384                .maxlen         = sizeof(int),
3385                .mode           = 0644,
3386                .proc_handler   = proc_dointvec_jiffies,
3387        },
3388        {
3389                .procname       = "gc_interval",
3390                .data           = &ip_rt_gc_interval,
3391                .maxlen         = sizeof(int),
3392                .mode           = 0644,
3393                .proc_handler   = proc_dointvec_jiffies,
3394        },
3395        {
3396                .procname       = "redirect_load",
3397                .data           = &ip_rt_redirect_load,
3398                .maxlen         = sizeof(int),
3399                .mode           = 0644,
3400                .proc_handler   = proc_dointvec,
3401        },
3402        {
3403                .procname       = "redirect_number",
3404                .data           = &ip_rt_redirect_number,
3405                .maxlen         = sizeof(int),
3406                .mode           = 0644,
3407                .proc_handler   = proc_dointvec,
3408        },
3409        {
3410                .procname       = "redirect_silence",
3411                .data           = &ip_rt_redirect_silence,
3412                .maxlen         = sizeof(int),
3413                .mode           = 0644,
3414                .proc_handler   = proc_dointvec,
3415        },
3416        {
3417                .procname       = "error_cost",
3418                .data           = &ip_rt_error_cost,
3419                .maxlen         = sizeof(int),
3420                .mode           = 0644,
3421                .proc_handler   = proc_dointvec,
3422        },
3423        {
3424                .procname       = "error_burst",
3425                .data           = &ip_rt_error_burst,
3426                .maxlen         = sizeof(int),
3427                .mode           = 0644,
3428                .proc_handler   = proc_dointvec,
3429        },
3430        {
3431                .procname       = "gc_elasticity",
3432                .data           = &ip_rt_gc_elasticity,
3433                .maxlen         = sizeof(int),
3434                .mode           = 0644,
3435                .proc_handler   = proc_dointvec,
3436        },
3437        {
3438                .procname       = "mtu_expires",
3439                .data           = &ip_rt_mtu_expires,
3440                .maxlen         = sizeof(int),
3441                .mode           = 0644,
3442                .proc_handler   = proc_dointvec_jiffies,
3443        },
3444        {
3445                .procname       = "min_pmtu",
3446                .data           = &ip_rt_min_pmtu,
3447                .maxlen         = sizeof(int),
3448                .mode           = 0644,
3449                .proc_handler   = proc_dointvec_minmax,
3450                .extra1         = &ip_min_valid_pmtu,
3451        },
3452        {
3453                .procname       = "min_adv_mss",
3454                .data           = &ip_rt_min_advmss,
3455                .maxlen         = sizeof(int),
3456                .mode           = 0644,
3457                .proc_handler   = proc_dointvec,
3458        },
3459        { }
3460};
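/* Each entry above surfaces as /proc/sys/net/ipv4/route/<procname>
 * (equivalently "sysctl net.ipv4.route.<procname>"), e.g.:
 *
 *	# sysctl -w net.ipv4.route.min_pmtu=552
 *
 * min_pmtu is clamped from below by ip_min_valid_pmtu via the
 * proc_dointvec_minmax handler.
 */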
3461
3462static const char ipv4_route_flush_procname[] = "flush";
3463
3464static struct ctl_table ipv4_route_flush_table[] = {
3465        {
3466                .procname       = ipv4_route_flush_procname,
3467                .maxlen         = sizeof(int),
3468                .mode           = 0200,
3469                .proc_handler   = ipv4_sysctl_rtcache_flush,
3470        },
3471        { },
3472};
3473
3474static __net_init int sysctl_route_net_init(struct net *net)
3475{
3476        struct ctl_table *tbl;
3477
3478        tbl = ipv4_route_flush_table;
3479        if (!net_eq(net, &init_net)) {
3480                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3481                if (!tbl)
3482                        goto err_dup;
3483
3484                /* Don't export non-whitelisted sysctls to unprivileged users */
3485                if (net->user_ns != &init_user_ns) {
3486                        if (tbl[0].procname != ipv4_route_flush_procname)
3487                                tbl[0].procname = NULL;
3488                }
3489        }
3490        tbl[0].extra1 = net;
3491
3492        net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3493        if (!net->ipv4.route_hdr)
3494                goto err_reg;
3495        return 0;
3496
3497err_reg:
3498        if (tbl != ipv4_route_flush_table)
3499                kfree(tbl);
3500err_dup:
3501        return -ENOMEM;
3502}
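/* Non-init netns instances get their own copy of the flush table so that
 * tbl[0].extra1 can carry the owning netns; note the procname check
 * above deliberately keeps "flush" visible even when the netns belongs
 * to a non-initial user namespace.
 */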
3503
3504static __net_exit void sysctl_route_net_exit(struct net *net)
3505{
3506        struct ctl_table *tbl;
3507
3508        tbl = net->ipv4.route_hdr->ctl_table_arg;
3509        unregister_net_sysctl_table(net->ipv4.route_hdr);
3510        BUG_ON(tbl == ipv4_route_flush_table);
3511        kfree(tbl);
3512}
3513
3514static __net_initdata struct pernet_operations sysctl_route_ops = {
3515        .init = sysctl_route_net_init,
3516        .exit = sysctl_route_net_exit,
3517};
3518#endif
3519
3520static __net_init int rt_genid_init(struct net *net)
3521{
3522        atomic_set(&net->ipv4.rt_genid, 0);
3523        atomic_set(&net->fnhe_genid, 0);
3524        atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3525        return 0;
3526}
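/* Per-netns generation counters: bumping rt_genid (rt_cache_flush()) or
 * fnhe_genid invalidates cached dsts and next-hop exceptions without
 * walking them; dev_addr_genid is deliberately seeded randomly rather
 * than from zero.
 */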
3527
3528static __net_initdata struct pernet_operations rt_genid_ops = {
3529        .init = rt_genid_init,
3530};
3531
3532static int __net_init ipv4_inetpeer_init(struct net *net)
3533{
3534        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3535
3536        if (!bp)
3537                return -ENOMEM;
3538        inet_peer_base_init(bp);
3539        net->ipv4.peers = bp;
3540        return 0;
3541}
3542
3543static void __net_exit ipv4_inetpeer_exit(struct net *net)
3544{
3545        struct inet_peer_base *bp = net->ipv4.peers;
3546
3547        net->ipv4.peers = NULL;
3548        inetpeer_invalidate_tree(bp);
3549        kfree(bp);
3550}
3551
3552static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3553        .init   =       ipv4_inetpeer_init,
3554        .exit   =       ipv4_inetpeer_exit,
3555};
3556
3557#ifdef CONFIG_IP_ROUTE_CLASSID
3558struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3559#endif /* CONFIG_IP_ROUTE_CLASSID */
3560
3561int __init ip_rt_init(void)
3562{
3563        int cpu;
3564
3565        ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3566                                  GFP_KERNEL);
3567        if (!ip_idents)
3568                panic("IP: failed to allocate ip_idents\n");
3569
3570        prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3571
3572        ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3573        if (!ip_tstamps)
3574                panic("IP: failed to allocate ip_tstamps\n");
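        /* ip_idents is a hashed array of IP ID counters and ip_tstamps
         * holds the matching last-use timestamps; seeding the counters
         * with random bytes keeps IPv4 ID sequences unpredictable from
         * boot.
         */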
3575
3576        for_each_possible_cpu(cpu) {
3577                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3578
3579                INIT_LIST_HEAD(&ul->head);
3580                spin_lock_init(&ul->lock);
3581        }
3582#ifdef CONFIG_IP_ROUTE_CLASSID
3583        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3584        if (!ip_rt_acct)
3585                panic("IP: failed to allocate ip_rt_acct\n");
3586#endif
3587
3588        ipv4_dst_ops.kmem_cachep =
3589                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3590                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3591
3592        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3593
3594        if (dst_entries_init(&ipv4_dst_ops) < 0)
3595                panic("IP: failed to allocate ipv4_dst_ops counter\n");
3596
3597        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3598                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3599
3600        ipv4_dst_ops.gc_thresh = ~0;
3601        ip_rt_max_size = INT_MAX;
3602
3603        devinet_init();
3604        ip_fib_init();
3605
3606        if (ip_rt_proc_init())
3607                pr_err("Unable to create route proc files\n");
3608#ifdef CONFIG_XFRM
3609        xfrm_init();
3610        xfrm4_init();
3611#endif
3612        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3613                      RTNL_FLAG_DOIT_UNLOCKED);
3614
3615#ifdef CONFIG_SYSCTL
3616        register_pernet_subsys(&sysctl_route_ops);
3617#endif
3618        register_pernet_subsys(&rt_genid_ops);
3619        register_pernet_subsys(&ipv4_inetpeer_ops);
3620        return 0;
3621}
3622
3623#ifdef CONFIG_SYSCTL
3624/*
3625 * We really need to sanitize the damn ipv4 init order, then all
3626 * this nonsense will go away.
3627 */
3628void __init ip_static_sysctl_init(void)
3629{
3630        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3631}
3632#endif
3633