linux/net/ipv4/route.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

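/* Rate-limiting defaults for ICMP redirects and errors.  Note that
 * ip_rt_redirect_silence below is just ip_rt_redirect_load shifted by
 * (ip_rt_redirect_number + 1): the silence window is where the exponential
 * backoff would land one step past the last permitted redirect.
 */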
static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

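/* Map the legacy TOS bits to a queueing priority.  The table is indexed by
 * IPTOS_TOS(tos) >> 1 (see rt_tos2priority() in <net/route.h>), so each
 * even/odd pair covers the same TOS class with and without the low 0x02
 * bit; ECN_OR_COST() deliberately expands to the same priority for both.
 */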
const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
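/* The IPv4 routing cache itself is long gone; these seq stubs survive only
 * so that /proc/net/rt_cache keeps existing for legacy readers.  The file
 * now yields nothing but its header line, and the stat columns that were
 * tied to the cache are hard-coded to 0 in rt_cpu_seq_show() below.
 */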
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;

        rt = (const struct rtable *)dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *)&rt->rt_gateway;
        else if (!daddr ||
                 (rt->rt_flags &
                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
                return;

        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
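/* Example: if this bucket was last touched 100 jiffies ago, the counter
 * jumps forward by a random delta in [0, 100) before the +segs step, so
 * the ids observed by a peer no longer encode the exact packet count.
 */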
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = ACCESS_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 new, delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* Do not use atomic_add_return() as it makes UBSAN unhappy */
        do {
                old = (u32)atomic_read(p_id);
                new = old + delta + segs;
        } while (atomic_cmpxchg(p_id, old, new) != old);

        return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        static u32 ip_idents_hashrnd __read_mostly;
        u32 hash, id;

        net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

        hash = jhash_3words((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol ^ net_hash_mix(net),
                            ip_idents_hashrnd);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

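/* Next-hop exceptions record per-destination state learned from ICMP
 * (redirect gateways and path MTUs) against a fib_nh.  They sit in a small
 * hash keyed by fnhe_hashfun(daddr); once a chain grows deeper than
 * FNHE_RECLAIM_DEPTH, the stalest entry is recycled via fnhe_oldest().
 */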
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        unsigned int i;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nh->nh_exceptions);
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nh->nh_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = max(1UL, expires);
                }
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so that anyone caching them rechecks whether this
                 * exception applies.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                0, jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct net *net = dev_net(skb->dev);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

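/* With the defaults above: successive redirects to one peer must be spaced
 * exponentially further apart (ip_rt_redirect_load << rate_tokens, i.e. the
 * required gap doubles from an initial 20ms), we give up after nine of
 * them, and ~20s of silence ((HZ / 50) << 10 jiffies) resets the counter.
 */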
void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set dst.rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

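        /* Classic token bucket: tokens accrue at one per jiffy up to a
         * burst of ip_rt_error_burst (5s worth), and each ICMP error
         * spends ip_rt_error_cost (1s worth), i.e. at most ~1 error/sec
         * per peer once the initial burst of five is used up.
         */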
        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (ipv4_mtu(dst) < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        if (!mark)
                mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by the IP RR, TS and SRR options,
 * so it stays out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned in IP options!
 */


void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int advmss = max_t(unsigned int, dst->dev->mtu - header_size,
                                    ip_rt_min_advmss);

        return min(advmss, IPV4_MAX_PMTU - header_size);
}

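/* MTU resolution order: a live (unexpired) PMTU learned via a nexthop
 * exception wins, then an explicit RTAX_MTU metric, then the device MTU
 * capped at IP_MAX_MTU; with a locked RTAX_MTU metric, gatewayed routes
 * additionally fall back to the classic 576 bytes.
 */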
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = READ_ONCE(dst->dev->mtu);

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr, const bool do_cache)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (do_cache) {
                        dst_hold(&rt->dst);
                        rcu_assign_pointer(*porig, rt);
                        if (orig) {
                                dst_dev_put(&orig->dst);
                                dst_release(&orig->dst);
                        }
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        /* hold dst before doing cmpxchg() to avoid race condition
         * on this dst
         */
        dst_hold(&rt->dst);
        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig) {
                        dst_dev_put(&orig->dst);
                        dst_release(&orig->dst);
                }
        } else {
                dst_release(&rt->dst);
                ret = false;
        }

        return ret;
}

struct uncached_list {
        spinlock_t              lock;
        struct list_head        head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

        rt->rt_uncached_list = ul;

        spin_lock_bh(&ul->lock);
        list_add_tail(&rt->rt_uncached, &ul->head);
        spin_unlock_bh(&ul->lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
        struct rtable *rt = (struct rtable *) dst;

        if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt))
                kfree(p);

        if (!list_empty(&rt->rt_uncached)) {
                struct uncached_list *ul = rt->rt_uncached_list;

                spin_lock_bh(&ul->lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&ul->lock);
        }
}

void rt_flush_dev(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct rtable *rt;
        int cpu;

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                spin_lock_bh(&ul->lock);
                list_for_each_entry(rt, &ul->head, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&ul->lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

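/* Glue nexthop state (gateway, metrics, classid, lwtunnel) from a FIB
 * lookup result onto a freshly built rtable, then try to cache the route
 * in the matching exception or in the nexthop itself; whatever cannot be
 * cached goes on the per-cpu uncached list so rt_flush_dev() still sees it.
 */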
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag,
                           const bool do_cache)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
                if (fi->fib_metrics != &dst_default_metrics) {
                        rt->dst._metrics |= DST_METRICS_REFCOUNTED;
                        atomic_inc(&fi->fib_metrics->refcnt);
                }
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
                else if (do_cache)
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in nexthop exception or
                         * FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache we really need to set it.
                         */
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

struct rtable *rt_dst_alloc(struct net_device *dev,
                            unsigned int flags, u16 type,
                            bool nopolicy, bool noxfrm, bool will_cache)
{
        struct rtable *rt;

        rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                       (will_cache ? 0 : DST_HOST) |
                       (nopolicy ? DST_NOPOLICY : 0) |
                       (noxfrm ? DST_NOXFRM : 0));

        if (rt) {
                rt->rt_genid = rt_genid_ipv4(dev_net(dev));
                rt->rt_flags = flags;
                rt->rt_type = type;
                rt->rt_is_input = 0;
                rt->rt_iif = 0;
                rt->rt_pmtu = 0;
                rt->rt_gateway = 0;
                rt->rt_uses_gateway = 0;
                rt->rt_table_id = 0;
                INIT_LIST_HEAD(&rt->rt_uncached);

                rt->dst.output = ip_output;
                if (flags & RTCF_LOCAL)
                        rt->dst.input = ip_local_deliver;
        }

        return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);
1521
1522/* called in rcu_read_lock() section */
1523static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1524                                u8 tos, struct net_device *dev, int our)
1525{
1526        struct rtable *rth;
1527        struct in_device *in_dev = __in_dev_get_rcu(dev);
1528        unsigned int flags = RTCF_MULTICAST;
1529        u32 itag = 0;
1530        int err;
1531
1532        /* Primary sanity checks. */
1533
1534        if (!in_dev)
1535                return -EINVAL;
1536
1537        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1538            skb->protocol != htons(ETH_P_IP))
1539                goto e_inval;
1540
1541        if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1542                goto e_inval;
1543
1544        if (ipv4_is_zeronet(saddr)) {
1545                if (!ipv4_is_local_multicast(daddr))
1546                        goto e_inval;
1547        } else {
1548                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1549                                          in_dev, &itag);
1550                if (err < 0)
1551                        goto e_err;
1552        }
1553        if (our)
1554                flags |= RTCF_LOCAL;
1555
1556        rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1557                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1558        if (!rth)
1559                goto e_nobufs;
1560
1561#ifdef CONFIG_IP_ROUTE_CLASSID
1562        rth->dst.tclassid = itag;
1563#endif
1564        rth->dst.output = ip_rt_bug;
1565        rth->rt_is_input = 1;
1566
1567#ifdef CONFIG_IP_MROUTE
1568        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1569                rth->dst.input = ip_mr_input;
1570#endif
1571        RT_CACHE_STAT_INC(in_slow_mc);
1572
1573        skb_dst_set(skb, &rth->dst);
1574        return 0;
1575
1576e_nobufs:
1577        return -ENOBUFS;
1578e_inval:
1579        return -EINVAL;
1580e_err:
1581        return err;
1582}
1583
1584
1585static void ip_handle_martian_source(struct net_device *dev,
1586                                     struct in_device *in_dev,
1587                                     struct sk_buff *skb,
1588                                     __be32 daddr,
1589                                     __be32 saddr)
1590{
1591        RT_CACHE_STAT_INC(in_martian_src);
1592#ifdef CONFIG_IP_ROUTE_VERBOSE
1593        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1594                /*
1595                 *      RFC1812 recommendation: if the source is martian,
1596                 *      the only hint we can give is the MAC header.
1597                 */
1598                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1599                        &daddr, &saddr, dev->name);
1600                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1601                        print_hex_dump(KERN_WARNING, "ll header: ",
1602                                       DUMP_PREFIX_OFFSET, 16, 1,
1603                                       skb_mac_header(skb),
1604                                       dev->hard_header_len, true);
1605                }
1606        }
1607#endif
1608}
1609
1610static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1611{
1612        struct fnhe_hash_bucket *hash;
1613        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1614        u32 hval = fnhe_hashfun(daddr);
1615
1616        spin_lock_bh(&fnhe_lock);
1617
1618        hash = rcu_dereference_protected(nh->nh_exceptions,
1619                                         lockdep_is_held(&fnhe_lock));
1620        hash += hval;
1621
1622        fnhe_p = &hash->chain;
1623        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1624        while (fnhe) {
1625                if (fnhe->fnhe_daddr == daddr) {
1626                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1627                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1628                        fnhe_flush_routes(fnhe);
1629                        kfree_rcu(fnhe, rcu);
1630                        break;
1631                }
1632                fnhe_p = &fnhe->fnhe_next;
1633                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1634                                                 lockdep_is_held(&fnhe_lock));
1635        }
1636
1637        spin_unlock_bh(&fnhe_lock);
1638}
1639
1640static void set_lwt_redirect(struct rtable *rth)
1641{
1642        if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1643                rth->dst.lwtstate->orig_output = rth->dst.output;
1644                rth->dst.output = lwtunnel_output;
1645        }
1646
1647        if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1648                rth->dst.lwtstate->orig_input = rth->dst.input;
1649                rth->dst.input = lwtunnel_input;
1650        }
1651}
1652
1653/* called in rcu_read_lock() section */
1654static int __mkroute_input(struct sk_buff *skb,
1655                           const struct fib_result *res,
1656                           struct in_device *in_dev,
1657                           __be32 daddr, __be32 saddr, u32 tos)
1658{
1659        struct fib_nh_exception *fnhe;
1660        struct rtable *rth;
1661        int err;
1662        struct in_device *out_dev;
1663        bool do_cache;
1664        u32 itag = 0;
1665
1666        /* get a working reference to the output device */
1667        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1668        if (!out_dev) {
1669                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1670                return -EINVAL;
1671        }
1672
1673        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1674                                  in_dev->dev, in_dev, &itag);
1675        if (err < 0) {
1676                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1677                                         saddr);
1678
1679                goto cleanup;
1680        }
1681
1682        do_cache = res->fi && !itag;
1683        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1684            skb->protocol == htons(ETH_P_IP) &&
1685            (IN_DEV_SHARED_MEDIA(out_dev) ||
1686             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1687                IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1688
1689        if (skb->protocol != htons(ETH_P_IP)) {
1690                /* Not IP (i.e. ARP). Do not create a route if it is
1691                 * invalid for proxy ARP. DNAT routes are always valid.
1692                 *
1693                 * The proxy ARP feature has been extended to allow ARP
1694                 * replies back out the same interface, to support
1695                 * Private VLAN switch technologies. See arp.c.
1696                 */
1697                if (out_dev == in_dev &&
1698                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1699                        err = -EINVAL;
1700                        goto cleanup;
1701                }
1702        }
1703
1704        fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1705        if (do_cache) {
1706                if (fnhe) {
1707                        rth = rcu_dereference(fnhe->fnhe_rth_input);
1708                        if (rth && rth->dst.expires &&
1709                            time_after(jiffies, rth->dst.expires)) {
1710                                ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1711                                fnhe = NULL;
1712                        } else {
1713                                goto rt_cache;
1714                        }
1715                }
1716
1717                rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1718
1719rt_cache:
1720                if (rt_cache_valid(rth)) {
1721                        skb_dst_set_noref(skb, &rth->dst);
1722                        goto out;
1723                }
1724        }
1725
1726        rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1727                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
1728                           IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1729        if (!rth) {
1730                err = -ENOBUFS;
1731                goto cleanup;
1732        }
1733
1734        rth->rt_is_input = 1;
1735        if (res->table)
1736                rth->rt_table_id = res->table->tb_id;
1737        RT_CACHE_STAT_INC(in_slow_tot);
1738
1739        rth->dst.input = ip_forward;
1740
1741        rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1742                       do_cache);
1743        set_lwt_redirect(rth);
1744        skb_dst_set(skb, &rth->dst);
1745out:
1746        err = 0;
1747 cleanup:
1748        return err;
1749}
1750
1751#ifdef CONFIG_IP_ROUTE_MULTIPATH
1752/* To make ICMP packets follow the right flow, the multipath hash is
1753 * calculated from the inner IP addresses.
1754 */
1755static void ip_multipath_l3_keys(const struct sk_buff *skb,
1756                                 struct flow_keys *hash_keys)
1757{
1758        const struct iphdr *outer_iph = ip_hdr(skb);
1759        const struct iphdr *inner_iph;
1760        const struct icmphdr *icmph;
1761        struct iphdr _inner_iph;
1762        struct icmphdr _icmph;
1763
1764        hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1765        hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1766        if (likely(outer_iph->protocol != IPPROTO_ICMP))
1767                return;
1768
1769        if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1770                return;
1771
1772        icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1773                                   &_icmph);
1774        if (!icmph)
1775                return;
1776
1777        if (icmph->type != ICMP_DEST_UNREACH &&
1778            icmph->type != ICMP_REDIRECT &&
1779            icmph->type != ICMP_TIME_EXCEEDED &&
1780            icmph->type != ICMP_PARAMETERPROB)
1781                return;
1782
1783        inner_iph = skb_header_pointer(skb,
1784                                       outer_iph->ihl * 4 + sizeof(_icmph),
1785                                       sizeof(_inner_iph), &_inner_iph);
1786        if (!inner_iph)
1787                return;
1788        hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1789        hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1790}
1791
1792/* If skb is set it will be used and fl4 can be NULL. */
1793int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1794                       const struct sk_buff *skb)
1795{
1796        struct net *net = fi->fib_net;
1797        struct flow_keys hash_keys;
1798        u32 mhash;
1799
1800        switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1801        case 0:
1802                memset(&hash_keys, 0, sizeof(hash_keys));
1803                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1804                if (skb) {
1805                        ip_multipath_l3_keys(skb, &hash_keys);
1806                } else {
1807                        hash_keys.addrs.v4addrs.src = fl4->saddr;
1808                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
1809                }
1810                break;
1811        case 1:
1812                /* skb is currently provided only when forwarding */
1813                if (skb) {
1814                        unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1815                        struct flow_keys keys;
1816
1817                        /* short-circuit if we already have L4 hash present */
1818                        if (skb->l4_hash)
1819                                return skb_get_hash_raw(skb) >> 1;
1820                        memset(&hash_keys, 0, sizeof(hash_keys));
1821                        skb_flow_dissect_flow_keys(skb, &keys, flag);
1822                        hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1823                        hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1824                        hash_keys.ports.src = keys.ports.src;
1825                        hash_keys.ports.dst = keys.ports.dst;
1826                        hash_keys.basic.ip_proto = keys.basic.ip_proto;
1827                } else {
1828                        memset(&hash_keys, 0, sizeof(hash_keys));
1829                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1830                        hash_keys.addrs.v4addrs.src = fl4->saddr;
1831                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
1832                        hash_keys.ports.src = fl4->fl4_sport;
1833                        hash_keys.ports.dst = fl4->fl4_dport;
1834                        hash_keys.basic.ip_proto = fl4->flowi4_proto;
1835                }
1836                break;
1837        }
1838        mhash = flow_hash_from_keys(&hash_keys);
1839
1840        return mhash >> 1;
1841}
1842EXPORT_SYMBOL_GPL(fib_multipath_hash);
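
/* Illustrative sketch, not part of the original file: how a caller that has
 * only a flow key (and no skb) might combine the hash above with
 * fib_select_multipath() to pick a nexthop; ip_mkroute_input() below is the
 * skb-based equivalent.  example_multipath_select() is a hypothetical name.
 */
#if 0
static void example_multipath_select(struct fib_result *res,
				     const struct flowi4 *fl4)
{
	if (res->fi && res->fi->fib_nhs > 1) {
		int h = fib_multipath_hash(res->fi, fl4, NULL);

		fib_select_multipath(res, h);
	}
}
#endif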
1843#endif /* CONFIG_IP_ROUTE_MULTIPATH */
1844
1845static int ip_mkroute_input(struct sk_buff *skb,
1846                            struct fib_result *res,
1847                            struct in_device *in_dev,
1848                            __be32 daddr, __be32 saddr, u32 tos)
1849{
1850#ifdef CONFIG_IP_ROUTE_MULTIPATH
1851        if (res->fi && res->fi->fib_nhs > 1) {
1852                int h = fib_multipath_hash(res->fi, NULL, skb);
1853
1854                fib_select_multipath(res, h);
1855        }
1856#endif
1857
1858        /* create a routing cache entry */
1859        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1860}
1861
1862/*
1863 *      NOTE. We drop all packets that have a local source
1864 *      address, because every properly looped-back packet
1865 *      must already have the correct destination attached by the output routine.
1866 *
1867 *      This approach solves two big problems:
1868 *      1. Non-simplex devices are handled properly.
1869 *      2. IP spoofing attempts are filtered with a 100% guarantee.
1870 *      Called with rcu_read_lock().
1871 */
1872
1873static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1874                               u8 tos, struct net_device *dev,
1875                               struct fib_result *res)
1876{
1877        struct in_device *in_dev = __in_dev_get_rcu(dev);
1878        struct ip_tunnel_info *tun_info;
1879        struct flowi4   fl4;
1880        unsigned int    flags = 0;
1881        u32             itag = 0;
1882        struct rtable   *rth;
1883        int             err = -EINVAL;
1884        struct net    *net = dev_net(dev);
1885        bool do_cache;
1886
1887        /* IP on this device is disabled. */
1888
1889        if (!in_dev)
1890                goto out;
1891
1892        /* Check for the weirdest martians, which cannot be detected
1893           by fib_lookup.
1894         */
1895
1896        tun_info = skb_tunnel_info(skb);
1897        if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1898                fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1899        else
1900                fl4.flowi4_tun_key.tun_id = 0;
1901        skb_dst_drop(skb);
1902
1903        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1904                goto martian_source;
1905
1906        res->fi = NULL;
1907        res->table = NULL;
1908        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1909                goto brd_input;
1910
1911        /* Accept zero addresses only for limited broadcast;
1912         * I do not even know whether to fix this or not. Waiting for complaints :-)
1913         */
1914        if (ipv4_is_zeronet(saddr))
1915                goto martian_source;
1916
1917        if (ipv4_is_zeronet(daddr))
1918                goto martian_destination;
1919
1920        /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1921         * and calls it at most once if daddr and/or saddr is a loopback address.
1922         */
1923        if (ipv4_is_loopback(daddr)) {
1924                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1925                        goto martian_destination;
1926        } else if (ipv4_is_loopback(saddr)) {
1927                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1928                        goto martian_source;
1929        }
1930
1931        /*
1932         *      Now we are ready to route the packet.
1933         */
1934        fl4.flowi4_oif = 0;
1935        fl4.flowi4_iif = dev->ifindex;
1936        fl4.flowi4_mark = skb->mark;
1937        fl4.flowi4_tos = tos;
1938        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1939        fl4.flowi4_flags = 0;
1940        fl4.daddr = daddr;
1941        fl4.saddr = saddr;
1942        fl4.flowi4_uid = sock_net_uid(net, NULL);
1943        err = fib_lookup(net, &fl4, res, 0);
1944        if (err != 0) {
1945                if (!IN_DEV_FORWARD(in_dev))
1946                        err = -EHOSTUNREACH;
1947                goto no_route;
1948        }
1949
1950        if (res->type == RTN_BROADCAST)
1951                goto brd_input;
1952
1953        if (res->type == RTN_LOCAL) {
1954                err = fib_validate_source(skb, saddr, daddr, tos,
1955                                          0, dev, in_dev, &itag);
1956                if (err < 0)
1957                        goto martian_source;
1958                goto local_input;
1959        }
1960
1961        if (!IN_DEV_FORWARD(in_dev)) {
1962                err = -EHOSTUNREACH;
1963                goto no_route;
1964        }
1965        if (res->type != RTN_UNICAST)
1966                goto martian_destination;
1967
1968        err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1969out:    return err;
1970
1971brd_input:
1972        if (skb->protocol != htons(ETH_P_IP))
1973                goto e_inval;
1974
1975        if (!ipv4_is_zeronet(saddr)) {
1976                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1977                                          in_dev, &itag);
1978                if (err < 0)
1979                        goto martian_source;
1980        }
1981        flags |= RTCF_BROADCAST;
1982        res->type = RTN_BROADCAST;
1983        RT_CACHE_STAT_INC(in_brd);
1984
1985local_input:
1986        do_cache = false;
1987        if (res->fi) {
1988                if (!itag) {
1989                        rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1990                        if (rt_cache_valid(rth)) {
1991                                skb_dst_set_noref(skb, &rth->dst);
1992                                err = 0;
1993                                goto out;
1994                        }
1995                        do_cache = true;
1996                }
1997        }
1998
1999        rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2000                           flags | RTCF_LOCAL, res->type,
2001                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2002        if (!rth)
2003                goto e_nobufs;
2004
2005        rth->dst.output = ip_rt_bug;
2006#ifdef CONFIG_IP_ROUTE_CLASSID
2007        rth->dst.tclassid = itag;
2008#endif
2009        rth->rt_is_input = 1;
2010        if (res->table)
2011                rth->rt_table_id = res->table->tb_id;
2012
2013        RT_CACHE_STAT_INC(in_slow_tot);
2014        if (res->type == RTN_UNREACHABLE) {
2015                rth->dst.input = ip_error;
2016                rth->dst.error = -err;
2017                rth->rt_flags &= ~RTCF_LOCAL;
2018        }
2019
2020        if (do_cache) {
2021                struct fib_nh *nh = &FIB_RES_NH(*res);
2022
2023                rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2024                if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2025                        WARN_ON(rth->dst.input == lwtunnel_input);
2026                        rth->dst.lwtstate->orig_input = rth->dst.input;
2027                        rth->dst.input = lwtunnel_input;
2028                }
2029
2030                if (unlikely(!rt_cache_route(nh, rth)))
2031                        rt_add_uncached_list(rth);
2032        }
2033        skb_dst_set(skb, &rth->dst);
2034        err = 0;
2035        goto out;
2036
2037no_route:
2038        RT_CACHE_STAT_INC(in_no_route);
2039        res->type = RTN_UNREACHABLE;
2040        res->fi = NULL;
2041        res->table = NULL;
2042        goto local_input;
2043
2044        /*
2045         *      Do not cache martian addresses: they should be logged (RFC1812)
2046         */
2047martian_destination:
2048        RT_CACHE_STAT_INC(in_martian_dst);
2049#ifdef CONFIG_IP_ROUTE_VERBOSE
2050        if (IN_DEV_LOG_MARTIANS(in_dev))
2051                net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2052                                     &daddr, &saddr, dev->name);
2053#endif
2054
2055e_inval:
2056        err = -EINVAL;
2057        goto out;
2058
2059e_nobufs:
2060        err = -ENOBUFS;
2061        goto out;
2062
2063martian_source:
2064        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2065        goto out;
2066}
2067
2068int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2069                         u8 tos, struct net_device *dev)
2070{
2071        struct fib_result res;
2072        int err;
2073
2074        tos &= IPTOS_RT_MASK;
2075        rcu_read_lock();
2076        err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2077        rcu_read_unlock();
2078
2079        return err;
2080}
2081EXPORT_SYMBOL(ip_route_input_noref);
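
/* Illustrative sketch, not part of the original file: the typical receive
 * path call into the resolver above (cf. ip_rcv_finish() in ip_input.c).
 * The skb/iph variables and the drop label are hypothetical context.
 */
#if 0
	const struct iphdr *iph = ip_hdr(skb);
	int err;

	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
				   iph->tos, skb->dev);
	if (unlikely(err))
		goto drop;
#endif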
2082
2083/* called with rcu_read_lock held */
2084int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2085                       u8 tos, struct net_device *dev, struct fib_result *res)
2086{
2087        /* Multicast recognition logic was moved from the route cache to here.
2088           The problem was that too many Ethernet cards have broken/missing
2089           hardware multicast filters :-( As a result, a host on a multicast
2090           network acquires a lot of useless route cache entries, e.g. for
2091           SDR messages from all over the world. Now we try to get rid of them.
2092           Really, provided the software IP multicast filter is organized
2093           reasonably (at least, hashed), this does not result in a slowdown
2094           compared with route cache reject entries.
2095           Note that multicast routers are not affected, because a
2096           route cache entry is created eventually.
2097         */
2098        if (ipv4_is_multicast(daddr)) {
2099                struct in_device *in_dev = __in_dev_get_rcu(dev);
2100                int our = 0;
2101                int err = -EINVAL;
2102
2103                if (in_dev)
2104                        our = ip_check_mc_rcu(in_dev, daddr, saddr,
2105                                              ip_hdr(skb)->protocol);
2106
2107                /* check l3 master if no match yet */
2108                if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2109                        struct in_device *l3_in_dev;
2110
2111                        l3_in_dev = __in_dev_get_rcu(skb->dev);
2112                        if (l3_in_dev)
2113                                our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2114                                                      ip_hdr(skb)->protocol);
2115                }
2116
2117                if (our
2118#ifdef CONFIG_IP_MROUTE
2119                        ||
2120                    (!ipv4_is_local_multicast(daddr) &&
2121                     IN_DEV_MFORWARD(in_dev))
2122#endif
2123                   ) {
2124                        err = ip_route_input_mc(skb, daddr, saddr,
2125                                                tos, dev, our);
2126                }
2127                return err;
2128        }
2129
2130        return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2131}
2132
2133/* called with rcu_read_lock() */
2134static struct rtable *__mkroute_output(const struct fib_result *res,
2135                                       const struct flowi4 *fl4, int orig_oif,
2136                                       struct net_device *dev_out,
2137                                       unsigned int flags)
2138{
2139        struct fib_info *fi = res->fi;
2140        struct fib_nh_exception *fnhe;
2141        struct in_device *in_dev;
2142        u16 type = res->type;
2143        struct rtable *rth;
2144        bool do_cache;
2145
2146        in_dev = __in_dev_get_rcu(dev_out);
2147        if (!in_dev)
2148                return ERR_PTR(-EINVAL);
2149
2150        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2151                if (ipv4_is_loopback(fl4->saddr) &&
2152                    !(dev_out->flags & IFF_LOOPBACK) &&
2153                    !netif_is_l3_master(dev_out))
2154                        return ERR_PTR(-EINVAL);
2155
2156        if (ipv4_is_lbcast(fl4->daddr))
2157                type = RTN_BROADCAST;
2158        else if (ipv4_is_multicast(fl4->daddr))
2159                type = RTN_MULTICAST;
2160        else if (ipv4_is_zeronet(fl4->daddr))
2161                return ERR_PTR(-EINVAL);
2162
2163        if (dev_out->flags & IFF_LOOPBACK)
2164                flags |= RTCF_LOCAL;
2165
2166        do_cache = true;
2167        if (type == RTN_BROADCAST) {
2168                flags |= RTCF_BROADCAST | RTCF_LOCAL;
2169                fi = NULL;
2170        } else if (type == RTN_MULTICAST) {
2171                flags |= RTCF_MULTICAST | RTCF_LOCAL;
2172                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2173                                     fl4->flowi4_proto))
2174                        flags &= ~RTCF_LOCAL;
2175                else
2176                        do_cache = false;
2177                /* If a multicast route does not exist, use the
2178                 * default one, but do not use a gateway in this case.
2179                 * Yes, it is a hack.
2180                 */
2181                if (fi && res->prefixlen < 4)
2182                        fi = NULL;
2183        } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2184                   (orig_oif != dev_out->ifindex)) {
2185                /* For local routes that require a particular output interface
2186                 * we do not want to cache the result.  Caching the result
2187                 * causes incorrect behaviour when there are multiple source
2188                 * addresses on the interface: if the intended recipient is
2189                 * waiting on that interface for the packet, it won't be
2190                 * received, because it will be delivered on the loopback
2191                 * interface and the IP_PKTINFO ipi_ifindex will be set to
2192                 * the loopback interface as well.
2193                 */
2194                fi = NULL;
2195        }
2196
2197        fnhe = NULL;
2198        do_cache &= fi != NULL;
2199        if (do_cache) {
2200                struct rtable __rcu **prth;
2201                struct fib_nh *nh = &FIB_RES_NH(*res);
2202
2203                fnhe = find_exception(nh, fl4->daddr);
2204                if (fnhe) {
2205                        prth = &fnhe->fnhe_rth_output;
2206                        rth = rcu_dereference(*prth);
2207                        if (rth && rth->dst.expires &&
2208                            time_after(jiffies, rth->dst.expires)) {
2209                                ip_del_fnhe(nh, fl4->daddr);
2210                                fnhe = NULL;
2211                        } else {
2212                                goto rt_cache;
2213                        }
2214                }
2215
2216                if (unlikely(fl4->flowi4_flags &
2217                             FLOWI_FLAG_KNOWN_NH &&
2218                             !(nh->nh_gw &&
2219                               nh->nh_scope == RT_SCOPE_LINK))) {
2220                        do_cache = false;
2221                        goto add;
2222                }
2223                prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2224                rth = rcu_dereference(*prth);
2225
2226rt_cache:
2227                if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2228                        return rth;
2229        }
2230
2231add:
2232        rth = rt_dst_alloc(dev_out, flags, type,
2233                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
2234                           IN_DEV_CONF_GET(in_dev, NOXFRM),
2235                           do_cache);
2236        if (!rth)
2237                return ERR_PTR(-ENOBUFS);
2238
2239        rth->rt_iif     = orig_oif ? : 0;
2240        if (res->table)
2241                rth->rt_table_id = res->table->tb_id;
2242
2243        RT_CACHE_STAT_INC(out_slow_tot);
2244
2245        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2246                if (flags & RTCF_LOCAL &&
2247                    !(dev_out->flags & IFF_LOOPBACK)) {
2248                        rth->dst.output = ip_mc_output;
2249                        RT_CACHE_STAT_INC(out_slow_mc);
2250                }
2251#ifdef CONFIG_IP_MROUTE
2252                if (type == RTN_MULTICAST) {
2253                        if (IN_DEV_MFORWARD(in_dev) &&
2254                            !ipv4_is_local_multicast(fl4->daddr)) {
2255                                rth->dst.input = ip_mr_input;
2256                                rth->dst.output = ip_mc_output;
2257                        }
2258                }
2259#endif
2260        }
2261
2262        rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2263        set_lwt_redirect(rth);
2264
2265        return rth;
2266}
2267
2268/*
2269 * Major route resolver routine.
2270 */
2271
2272struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2273                                        const struct sk_buff *skb)
2274{
2275        __u8 tos = RT_FL_TOS(fl4);
2276        struct fib_result res;
2277        struct rtable *rth;
2278
2279        res.tclassid    = 0;
2280        res.fi          = NULL;
2281        res.table       = NULL;
2282
2283        fl4->flowi4_iif = LOOPBACK_IFINDEX;
2284        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2285        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2286                         RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2287
2288        rcu_read_lock();
2289        rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2290        rcu_read_unlock();
2291
2292        return rth;
2293}
2294EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2295
2296struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2297                                            struct fib_result *res,
2298                                            const struct sk_buff *skb)
2299{
2300        struct net_device *dev_out = NULL;
2301        int orig_oif = fl4->flowi4_oif;
2302        unsigned int flags = 0;
2303        struct rtable *rth;
2304        int err = -ENETUNREACH;
2305
2306        if (fl4->saddr) {
2307                rth = ERR_PTR(-EINVAL);
2308                if (ipv4_is_multicast(fl4->saddr) ||
2309                    ipv4_is_lbcast(fl4->saddr) ||
2310                    ipv4_is_zeronet(fl4->saddr))
2311                        goto out;
2312
2313                /* I removed the check for oif == dev_out->oif here.
2314                   It was wrong for two reasons:
2315                   1. ip_dev_find(net, saddr) can return the wrong iface if
2316                      saddr is assigned to multiple interfaces.
2317                   2. Moreover, we are allowed to send packets with a saddr
2318                      of another iface. --ANK
2319                 */
2320
2321                if (fl4->flowi4_oif == 0 &&
2322                    (ipv4_is_multicast(fl4->daddr) ||
2323                     ipv4_is_lbcast(fl4->daddr))) {
2324                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2325                        dev_out = __ip_dev_find(net, fl4->saddr, false);
2326                        if (!dev_out)
2327                                goto out;
2328
2329                        /* Special hack: the user can direct multicasts
2330                           and limited broadcast via the necessary interface
2331                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2332                           This hack is not just for fun; it allows
2333                           vic, vat and friends to work.
2334                           They bind a socket to loopback, set ttl to zero
2335                           and expect that it will work.
2336                           From the viewpoint of the routing cache they are broken,
2337                           because we are not allowed to build a multicast path
2338                           with a loopback source addr (look, the routing cache
2339                           cannot know that ttl is zero, so the packet
2340                           will not leave this host and the route looks valid).
2341                           Luckily, this hack is a good workaround.
2342                         */
2343
2344                        fl4->flowi4_oif = dev_out->ifindex;
2345                        goto make_route;
2346                }
2347
2348                if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2349                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2350                        if (!__ip_dev_find(net, fl4->saddr, false))
2351                                goto out;
2352                }
2353        }
2354
2355
2356        if (fl4->flowi4_oif) {
2357                dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2358                rth = ERR_PTR(-ENODEV);
2359                if (!dev_out)
2360                        goto out;
2361
2362                /* RACE: Check return value of inet_select_addr instead. */
2363                if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2364                        rth = ERR_PTR(-ENETUNREACH);
2365                        goto out;
2366                }
2367                if (ipv4_is_local_multicast(fl4->daddr) ||
2368                    ipv4_is_lbcast(fl4->daddr) ||
2369                    fl4->flowi4_proto == IPPROTO_IGMP) {
2370                        if (!fl4->saddr)
2371                                fl4->saddr = inet_select_addr(dev_out, 0,
2372                                                              RT_SCOPE_LINK);
2373                        goto make_route;
2374                }
2375                if (!fl4->saddr) {
2376                        if (ipv4_is_multicast(fl4->daddr))
2377                                fl4->saddr = inet_select_addr(dev_out, 0,
2378                                                              fl4->flowi4_scope);
2379                        else if (!fl4->daddr)
2380                                fl4->saddr = inet_select_addr(dev_out, 0,
2381                                                              RT_SCOPE_HOST);
2382                }
2383        }
2384
2385        if (!fl4->daddr) {
2386                fl4->daddr = fl4->saddr;
2387                if (!fl4->daddr)
2388                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2389                dev_out = net->loopback_dev;
2390                fl4->flowi4_oif = LOOPBACK_IFINDEX;
2391                res->type = RTN_LOCAL;
2392                flags |= RTCF_LOCAL;
2393                goto make_route;
2394        }
2395
2396        err = fib_lookup(net, fl4, res, 0);
2397        if (err) {
2398                res->fi = NULL;
2399                res->table = NULL;
2400                if (fl4->flowi4_oif &&
2401                    (ipv4_is_multicast(fl4->daddr) ||
2402                    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2403                        /* Apparently, the routing tables are wrong. Assume
2404                           that the destination is on-link.
2405
2406                           WHY? DW.
2407                           Because we are allowed to send to an iface
2408                           even if it has NO routes and NO assigned
2409                           addresses. When oif is specified, routing
2410                           tables are looked up with only one purpose:
2411                           to check whether the destination is gatewayed,
2412                           rather than direct. Moreover, if MSG_DONTROUTE is
2413                           set, we send the packet, ignoring both routing
2414                           tables and ifaddr state. --ANK
2415
2416
2417                           We could do this even if oif is unknown,
2418                           as is likely with IPv6, but we do not.
2419                         */
2420
2421                        if (fl4->saddr == 0)
2422                                fl4->saddr = inet_select_addr(dev_out, 0,
2423                                                              RT_SCOPE_LINK);
2424                        res->type = RTN_UNICAST;
2425                        goto make_route;
2426                }
2427                rth = ERR_PTR(err);
2428                goto out;
2429        }
2430
2431        if (res->type == RTN_LOCAL) {
2432                if (!fl4->saddr) {
2433                        if (res->fi->fib_prefsrc)
2434                                fl4->saddr = res->fi->fib_prefsrc;
2435                        else
2436                                fl4->saddr = fl4->daddr;
2437                }
2438
2439                /* L3 master device is the loopback for that domain */
2440                dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2441                        net->loopback_dev;
2442                fl4->flowi4_oif = dev_out->ifindex;
2443                flags |= RTCF_LOCAL;
2444                goto make_route;
2445        }
2446
2447        fib_select_path(net, res, fl4, skb);
2448
2449        dev_out = FIB_RES_DEV(*res);
2450        fl4->flowi4_oif = dev_out->ifindex;
2451
2452
2453make_route:
2454        rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2455
2456out:
2457        return rth;
2458}
2459
2460static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2461{
2462        return NULL;
2463}
2464
2465static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2466{
2467        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2468
2469        return mtu ? : dst->dev->mtu;
2470}
2471
2472static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2473                                          struct sk_buff *skb, u32 mtu)
2474{
2475}
2476
2477static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2478                                       struct sk_buff *skb)
2479{
2480}
2481
2482static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2483                                          unsigned long old)
2484{
2485        return NULL;
2486}
2487
2488static struct dst_ops ipv4_dst_blackhole_ops = {
2489        .family                 =       AF_INET,
2490        .check                  =       ipv4_blackhole_dst_check,
2491        .mtu                    =       ipv4_blackhole_mtu,
2492        .default_advmss         =       ipv4_default_advmss,
2493        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2494        .redirect               =       ipv4_rt_blackhole_redirect,
2495        .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2496        .neigh_lookup           =       ipv4_neigh_lookup,
2497};
2498
2499struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2500{
2501        struct rtable *ort = (struct rtable *) dst_orig;
2502        struct rtable *rt;
2503
2504        rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2505        if (rt) {
2506                struct dst_entry *new = &rt->dst;
2507
2508                new->__use = 1;
2509                new->input = dst_discard;
2510                new->output = dst_discard_out;
2511
2512                new->dev = net->loopback_dev;
2513                if (new->dev)
2514                        dev_hold(new->dev);
2515
2516                rt->rt_is_input = ort->rt_is_input;
2517                rt->rt_iif = ort->rt_iif;
2518                rt->rt_pmtu = ort->rt_pmtu;
2519
2520                rt->rt_genid = rt_genid_ipv4(net);
2521                rt->rt_flags = ort->rt_flags;
2522                rt->rt_type = ort->rt_type;
2523                rt->rt_gateway = ort->rt_gateway;
2524                rt->rt_uses_gateway = ort->rt_uses_gateway;
2525
2526                INIT_LIST_HEAD(&rt->rt_uncached);
2527        }
2528
2529        dst_release(dst_orig);
2530
2531        return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2532}
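
/* Aside, not in the original file: ipv4_blackhole_route() is used by the
 * xfrm code to stand in for a route whose transforms cannot be resolved yet;
 * the copy keeps the original's routing metadata but silently discards all
 * traffic via dst_discard()/dst_discard_out().
 */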
2533
2534struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2535                                    const struct sock *sk)
2536{
2537        struct rtable *rt = __ip_route_output_key(net, flp4);
2538
2539        if (IS_ERR(rt))
2540                return rt;
2541
2542        if (flp4->flowi4_proto)
2543                rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2544                                                        flowi4_to_flowi(flp4),
2545                                                        sk, 0);
2546
2547        return rt;
2548}
2549EXPORT_SYMBOL_GPL(ip_route_output_flow);
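
/* Illustrative sketch, not part of the original file: minimal output route
 * resolution for a UDP flow using ip_route_output_flow() above.
 * example_output_route() is a hypothetical name.
 */
#if 0
static int example_output_route(struct net *net, __be32 daddr)
{
	struct flowi4 fl4 = {
		.daddr		= daddr,
		.flowi4_proto	= IPPROTO_UDP,
	};
	struct rtable *rt;

	rt = ip_route_output_flow(net, &fl4, NULL);
	if (IS_ERR(rt))
		return PTR_ERR(rt);
	/* ... transmit via rt->dst ... */
	ip_rt_put(rt);
	return 0;
}
#endif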
2550
2551/* called with rcu_read_lock held */
2552static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2553                        struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2554                        u32 seq)
2555{
2556        struct rtable *rt = skb_rtable(skb);
2557        struct rtmsg *r;
2558        struct nlmsghdr *nlh;
2559        unsigned long expires = 0;
2560        u32 error;
2561        u32 metrics[RTAX_MAX];
2562
2563        nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2564        if (!nlh)
2565                return -EMSGSIZE;
2566
2567        r = nlmsg_data(nlh);
2568        r->rtm_family    = AF_INET;
2569        r->rtm_dst_len  = 32;
2570        r->rtm_src_len  = 0;
2571        r->rtm_tos      = fl4->flowi4_tos;
2572        r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2573        if (nla_put_u32(skb, RTA_TABLE, table_id))
2574                goto nla_put_failure;
2575        r->rtm_type     = rt->rt_type;
2576        r->rtm_scope    = RT_SCOPE_UNIVERSE;
2577        r->rtm_protocol = RTPROT_UNSPEC;
2578        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2579        if (rt->rt_flags & RTCF_NOTIFY)
2580                r->rtm_flags |= RTM_F_NOTIFY;
2581        if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2582                r->rtm_flags |= RTCF_DOREDIRECT;
2583
2584        if (nla_put_in_addr(skb, RTA_DST, dst))
2585                goto nla_put_failure;
2586        if (src) {
2587                r->rtm_src_len = 32;
2588                if (nla_put_in_addr(skb, RTA_SRC, src))
2589                        goto nla_put_failure;
2590        }
2591        if (rt->dst.dev &&
2592            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2593                goto nla_put_failure;
2594#ifdef CONFIG_IP_ROUTE_CLASSID
2595        if (rt->dst.tclassid &&
2596            nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2597                goto nla_put_failure;
2598#endif
2599        if (!rt_is_input_route(rt) &&
2600            fl4->saddr != src) {
2601                if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2602                        goto nla_put_failure;
2603        }
2604        if (rt->rt_uses_gateway &&
2605            nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2606                goto nla_put_failure;
2607
2608        expires = rt->dst.expires;
2609        if (expires) {
2610                unsigned long now = jiffies;
2611
2612                if (time_before(now, expires))
2613                        expires -= now;
2614                else
2615                        expires = 0;
2616        }
2617
2618        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2619        if (rt->rt_pmtu && expires)
2620                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2621        if (rtnetlink_put_metrics(skb, metrics) < 0)
2622                goto nla_put_failure;
2623
2624        if (fl4->flowi4_mark &&
2625            nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2626                goto nla_put_failure;
2627
2628        if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2629            nla_put_u32(skb, RTA_UID,
2630                        from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2631                goto nla_put_failure;
2632
2633        error = rt->dst.error;
2634
2635        if (rt_is_input_route(rt)) {
2636#ifdef CONFIG_IP_MROUTE
2637                if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2638                    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2639                        int err = ipmr_get_route(net, skb,
2640                                                 fl4->saddr, fl4->daddr,
2641                                                 r, portid);
2642
2643                        if (err <= 0) {
2644                                if (err == 0)
2645                                        return 0;
2646                                goto nla_put_failure;
2647                        }
2648                } else
2649#endif
2650                        if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2651                                goto nla_put_failure;
2652        }
2653
2654        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2655                goto nla_put_failure;
2656
2657        nlmsg_end(skb, nlh);
2658        return 0;
2659
2660nla_put_failure:
2661        nlmsg_cancel(skb, nlh);
2662        return -EMSGSIZE;
2663}
2664
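/* Aside, not in the original file: inet_rtm_getroute() below services
 * RTM_GETROUTE netlink requests -- what userspace "ip route get <addr>"
 * sends -- and (absent RTM_F_FIB_MATCH) uses rt_fill_info() above to
 * format the RTM_NEWROUTE reply.
 */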
2665static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2666                             struct netlink_ext_ack *extack)
2667{
2668        struct net *net = sock_net(in_skb->sk);
2669        struct rtmsg *rtm;
2670        struct nlattr *tb[RTA_MAX+1];
2671        struct fib_result res = {};
2672        struct rtable *rt = NULL;
2673        struct flowi4 fl4;
2674        __be32 dst = 0;
2675        __be32 src = 0;
2676        u32 iif;
2677        int err;
2678        int mark;
2679        struct sk_buff *skb;
2680        u32 table_id = RT_TABLE_MAIN;
2681        kuid_t uid;
2682
2683        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2684                          extack);
2685        if (err < 0)
2686                goto errout;
2687
2688        rtm = nlmsg_data(nlh);
2689
2690        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2691        if (!skb) {
2692                err = -ENOBUFS;
2693                goto errout;
2694        }
2695
2696        /* Reserve room for dummy headers; this skb can pass
2697           through a good chunk of the routing engine.
2698         */
2699        skb_reset_mac_header(skb);
2700        skb_reset_network_header(skb);
2701
2702        src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2703        dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2704        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2705        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2706        if (tb[RTA_UID])
2707                uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2708        else
2709                uid = (iif ? INVALID_UID : current_uid());
2710
2711        /* Bugfix: need to give ip_route_input enough of an IP header to
2712         * not gag.
2713         */
2714        ip_hdr(skb)->protocol = IPPROTO_UDP;
2715        ip_hdr(skb)->saddr = src;
2716        ip_hdr(skb)->daddr = dst;
2717
2718        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2719
2720        memset(&fl4, 0, sizeof(fl4));
2721        fl4.daddr = dst;
2722        fl4.saddr = src;
2723        fl4.flowi4_tos = rtm->rtm_tos;
2724        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2725        fl4.flowi4_mark = mark;
2726        fl4.flowi4_uid = uid;
2727
2728        rcu_read_lock();
2729
2730        if (iif) {
2731                struct net_device *dev;
2732
2733                dev = dev_get_by_index_rcu(net, iif);
2734                if (!dev) {
2735                        err = -ENODEV;
2736                        goto errout_free;
2737                }
2738
2739                skb->protocol   = htons(ETH_P_IP);
2740                skb->dev        = dev;
2741                skb->mark       = mark;
2742                err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2743                                         dev, &res);
2744
2745                rt = skb_rtable(skb);
2746                if (err == 0 && rt->dst.error)
2747                        err = -rt->dst.error;
2748        } else {
2749                rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2750                err = 0;
2751                if (IS_ERR(rt))
2752                        err = PTR_ERR(rt);
2753                else
2754                        skb_dst_set(skb, &rt->dst);
2755        }
2756
2757        if (err)
2758                goto errout_free;
2759
2760        if (rtm->rtm_flags & RTM_F_NOTIFY)
2761                rt->rt_flags |= RTCF_NOTIFY;
2762
2763        if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2764                table_id = rt->rt_table_id;
2765
2766        if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2767                if (!res.fi) {
2768                        err = fib_props[res.type].error;
2769                        if (!err)
2770                                err = -EHOSTUNREACH;
2771                        goto errout_free;
2772                }
2773                err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2774                                    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2775                                    rt->rt_type, res.prefix, res.prefixlen,
2776                                    fl4.flowi4_tos, res.fi, 0);
2777        } else {
2778                err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2779                                   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2780        }
2781        if (err < 0)
2782                goto errout_free;
2783
2784        rcu_read_unlock();
2785
2786        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2787errout:
2788        return err;
2789
2790errout_free:
2791        rcu_read_unlock();
2792        kfree_skb(skb);
2793        goto errout;
2794}
2795
2796void ip_rt_multicast_event(struct in_device *in_dev)
2797{
2798        rt_cache_flush(dev_net(in_dev->dev));
2799}
2800
2801#ifdef CONFIG_SYSCTL
2802static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2803static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2804static int ip_rt_gc_elasticity __read_mostly    = 8;
2805
2806static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2807                                        void __user *buffer,
2808                                        size_t *lenp, loff_t *ppos)
2809{
2810        struct net *net = (struct net *)__ctl->extra1;
2811
2812        if (write) {
2813                rt_cache_flush(net);
2814                fnhe_genid_bump(net);
2815                return 0;
2816        }
2817
2818        return -EINVAL;
2819}
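
/* Aside, not in the original file: the handler above backs the write-only
 * "flush" entry registered below, so from userspace e.g.
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * invalidates all cached routes and bumps the fnhe generation.
 */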
2820
2821static struct ctl_table ipv4_route_table[] = {
2822        {
2823                .procname       = "gc_thresh",
2824                .data           = &ipv4_dst_ops.gc_thresh,
2825                .maxlen         = sizeof(int),
2826                .mode           = 0644,
2827                .proc_handler   = proc_dointvec,
2828        },
2829        {
2830                .procname       = "max_size",
2831                .data           = &ip_rt_max_size,
2832                .maxlen         = sizeof(int),
2833                .mode           = 0644,
2834                .proc_handler   = proc_dointvec,
2835        },
2836        {
2837                /* Deprecated. Use gc_min_interval_ms */
2838
2839                .procname       = "gc_min_interval",
2840                .data           = &ip_rt_gc_min_interval,
2841                .maxlen         = sizeof(int),
2842                .mode           = 0644,
2843                .proc_handler   = proc_dointvec_jiffies,
2844        },
2845        {
2846                .procname       = "gc_min_interval_ms",
2847                .data           = &ip_rt_gc_min_interval,
2848                .maxlen         = sizeof(int),
2849                .mode           = 0644,
2850                .proc_handler   = proc_dointvec_ms_jiffies,
2851        },
2852        {
2853                .procname       = "gc_timeout",
2854                .data           = &ip_rt_gc_timeout,
2855                .maxlen         = sizeof(int),
2856                .mode           = 0644,
2857                .proc_handler   = proc_dointvec_jiffies,
2858        },
2859        {
2860                .procname       = "gc_interval",
2861                .data           = &ip_rt_gc_interval,
2862                .maxlen         = sizeof(int),
2863                .mode           = 0644,
2864                .proc_handler   = proc_dointvec_jiffies,
2865        },
2866        {
2867                .procname       = "redirect_load",
2868                .data           = &ip_rt_redirect_load,
2869                .maxlen         = sizeof(int),
2870                .mode           = 0644,
2871                .proc_handler   = proc_dointvec,
2872        },
2873        {
2874                .procname       = "redirect_number",
2875                .data           = &ip_rt_redirect_number,
2876                .maxlen         = sizeof(int),
2877                .mode           = 0644,
2878                .proc_handler   = proc_dointvec,
2879        },
2880        {
2881                .procname       = "redirect_silence",
2882                .data           = &ip_rt_redirect_silence,
2883                .maxlen         = sizeof(int),
2884                .mode           = 0644,
2885                .proc_handler   = proc_dointvec,
2886        },
2887        {
2888                .procname       = "error_cost",
2889                .data           = &ip_rt_error_cost,
2890                .maxlen         = sizeof(int),
2891                .mode           = 0644,
2892                .proc_handler   = proc_dointvec,
2893        },
2894        {
2895                .procname       = "error_burst",
2896                .data           = &ip_rt_error_burst,
2897                .maxlen         = sizeof(int),
2898                .mode           = 0644,
2899                .proc_handler   = proc_dointvec,
2900        },
2901        {
2902                .procname       = "gc_elasticity",
2903                .data           = &ip_rt_gc_elasticity,
2904                .maxlen         = sizeof(int),
2905                .mode           = 0644,
2906                .proc_handler   = proc_dointvec,
2907        },
2908        {
2909                .procname       = "mtu_expires",
2910                .data           = &ip_rt_mtu_expires,
2911                .maxlen         = sizeof(int),
2912                .mode           = 0644,
2913                .proc_handler   = proc_dointvec_jiffies,
2914        },
2915        {
2916                .procname       = "min_pmtu",
2917                .data           = &ip_rt_min_pmtu,
2918                .maxlen         = sizeof(int),
2919                .mode           = 0644,
2920                .proc_handler   = proc_dointvec,
2921        },
2922        {
2923                .procname       = "min_adv_mss",
2924                .data           = &ip_rt_min_advmss,
2925                .maxlen         = sizeof(int),
2926                .mode           = 0644,
2927                .proc_handler   = proc_dointvec,
2928        },
2929        { }
2930};
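
/* Aside, not in the original file: entries above using proc_dointvec_jiffies
 * are exposed to userspace in seconds (converted to jiffies internally),
 * while proc_dointvec_ms_jiffies entries are exposed in milliseconds.
 */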
2931
2932static struct ctl_table ipv4_route_flush_table[] = {
2933        {
2934                .procname       = "flush",
2935                .maxlen         = sizeof(int),
2936                .mode           = 0200,
2937                .proc_handler   = ipv4_sysctl_rtcache_flush,
2938        },
2939        { },
2940};
2941
2942static __net_init int sysctl_route_net_init(struct net *net)
2943{
2944        struct ctl_table *tbl;
2945
2946        tbl = ipv4_route_flush_table;
2947        if (!net_eq(net, &init_net)) {
2948                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2949                if (!tbl)
2950                        goto err_dup;
2951
2952                /* Don't export sysctls to unprivileged users */
2953                if (net->user_ns != &init_user_ns)
2954                        tbl[0].procname = NULL;
2955        }
2956        tbl[0].extra1 = net;
2957
2958        net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2959        if (!net->ipv4.route_hdr)
2960                goto err_reg;
2961        return 0;
2962
2963err_reg:
2964        if (tbl != ipv4_route_flush_table)
2965                kfree(tbl);
2966err_dup:
2967        return -ENOMEM;
2968}
2969
2970static __net_exit void sysctl_route_net_exit(struct net *net)
2971{
2972        struct ctl_table *tbl;
2973
2974        tbl = net->ipv4.route_hdr->ctl_table_arg;
2975        unregister_net_sysctl_table(net->ipv4.route_hdr);
2976        BUG_ON(tbl == ipv4_route_flush_table);
2977        kfree(tbl);
2978}

static __net_initdata struct pernet_operations sysctl_route_ops = {
        .init = sysctl_route_net_init,
        .exit = sysctl_route_net_exit,
};
#endif
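
/*
 * Once registered (see register_pernet_subsys() in ip_rt_init() below),
 * .init runs for each network namespace as it is created and .exit as it
 * is torn down; for namespaces that already exist at registration time,
 * the pernet core runs .init immediately.
 */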

static __net_init int rt_genid_init(struct net *net)
{
        atomic_set(&net->ipv4.rt_genid, 0);
        atomic_set(&net->fnhe_genid, 0);
        atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
        return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
        .init = rt_genid_init,
};
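
/*
 * These generation counters are how "flushing" works now that the old
 * routing cache is gone: a cached dst records the rt_genid it was created
 * under, and a mismatch marks it stale.  Invalidation is therefore just an
 * increment (sketch; rt_genid_bump_ipv4() in include/net/net_namespace.h
 * wraps it):
 *
 *	atomic_inc(&net->ipv4.rt_genid);	// every cached rtable now misses
 */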

static int __net_init ipv4_inetpeer_init(struct net *net)
{
        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

        if (!bp)
                return -ENOMEM;
        inet_peer_base_init(bp);
        net->ipv4.peers = bp;
        return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
        struct inet_peer_base *bp = net->ipv4.peers;

        net->ipv4.peers = NULL;
        inetpeer_invalidate_tree(bp);
        kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
        .init   =       ipv4_inetpeer_init,
        .exit   =       ipv4_inetpeer_exit,
};
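
/*
 * net->ipv4.peers anchors the per-namespace inetpeer tree, which keeps
 * long-lived per-remote-host state (e.g. ICMP rate limiting).  Consumers
 * look peers up against this base; roughly (the exact inet_getpeer_v4()
 * argument list varies between kernel versions):
 *
 *	struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers,
 *						 daddr, vif, 1);
 *	if (peer)
 *		inet_putpeer(peer);	// drop the reference when done
 */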

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
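
/*
 * With CONFIG_IP_ROUTE_CLASSID, ip_rt_acct points at a per-cpu array of
 * 256 counters indexed by the realm bytes of a route's tclassid (see the
 * accounting hook in ip_rcv_finish()); the per-cpu sums are exported
 * through /proc/net/rt_acct.
 */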

int __init ip_rt_init(void)
{
        int rc = 0;
        int cpu;

        /* per-bucket state for IP ID generation, see __ip_select_ident() */
        ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
        if (!ip_idents)
                panic("IP: failed to allocate ip_idents\n");

        prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

        ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
        if (!ip_tstamps)
                panic("IP: failed to allocate ip_tstamps\n");

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
                spin_lock_init(&ul->lock);
        }
#ifdef CONFIG_IP_ROUTE_CLASSID
        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
#endif

        ipv4_dst_ops.kmem_cachep =
                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

        if (dst_entries_init(&ipv4_dst_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_ops counter\n");

        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

        /* no routing cache to shrink any more, so effectively disable gc */
        ipv4_dst_ops.gc_thresh = ~0;
        ip_rt_max_size = INT_MAX;

        devinet_init();
        ip_fib_init();

        if (ip_rt_proc_init())
                pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init();
#endif
        /* RTM_GETROUTE services route lookups such as "ip route get" */
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
        register_pernet_subsys(&sysctl_route_ops);
#endif
        register_pernet_subsys(&rt_genid_ops);
        register_pernet_subsys(&ipv4_inetpeer_ops);
        return rc;
}
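
/*
 * ip_rt_init() runs once at boot, from ip_init() (net/ipv4/ip_output.c)
 * during inet_init(); the per-namespace pieces registered above are then
 * replayed by the pernet machinery for every namespace created later.
 */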

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif