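/*
 * ROUTE - implementation of the IP router (IPv4 routing cache).
 */
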
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
	((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly	= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
static int rt_chain_length_max __read_mostly	= 20;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;
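
/*
 *	Interface to generic destination cache.
 */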
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);
static void rt_emergency_hash_rebuild(struct net *net);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.entries =		ATOMIC_INIT(0),
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
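
/*
 * Route cache: a central hash table of rtable chains.  Lookup is
 * lock-free under RCU; writers remove or insert entries while holding
 * a per-bucket spinlock taken from the table below.
 */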
struct rt_hash_bucket {
	struct rtable	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)

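/*
 * Instead of one spinlock per rt_hash_bucket, a small power-of-two
 * table of spinlocks is shared across the hash; its size scales with
 * NR_CPUS and is kept small under lockdep, where spinlock_t is large.
 */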
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
	(__raw_get_cpu_var(rt_cache_stat).field++)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)(__be32)(daddr),
			    (__force u32)(__be32)(saddr),
			    idx, genid)
		& rt_hash_mask;
}

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rt_hash_table[st->bucket].chain)
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference(r->u.dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = r->u.dst.rt_next;
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rt_hash_table[st->bucket].chain);
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
	}
	return rcu_dereference(r);
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;

	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			   "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			   r->u.dst.dev ? r->u.dst.dev->name : "*",
			   (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			   r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			   r->u.dst.__use, 0, (unsigned long)r->rt_src,
			   (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			    (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			   dst_metric(&r->u.dst, RTAX_WINDOW),
			   (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
				 dst_metric(&r->u.dst, RTAX_RTTVAR)),
			   r->fl.fl4_tos,
			   r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			   r->u.dst.hh ? (r->u.dst.hh->hh_output ==
					  dev_queue_xmit) : 0,
			   r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};

static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
	);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_NET_CLS_ROUTE
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
{
	unsigned int i;

	if ((offset & 3) || (length & 3))
		return -EIO;

	if (offset >= sizeof(struct ip_rt_acct) * 256) {
		*eof = 1;
		return 0;
	}

	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
		length = sizeof(struct ip_rt_acct) * 256 - offset;
		*eof = 1;
	}

	offset /= sizeof(u32);

	if (length > 0) {
		u32 *dst = (u32 *) buffer;

		*start = buffer;
		memset(dst, 0, length);

		for_each_possible_cpu(i) {
			unsigned int j;
			u32 *src;

			src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
			for (j = 0; j < length/4; j++)
				dst[j] += src[j];
		}
	}
	return length;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_NET_CLS_ROUTE
	pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
			ip_rt_acct_read, NULL);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_NET_CLS_ROUTE
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_NET_CLS_ROUTE
	/* rt_acct is only created when CONFIG_NET_CLS_ROUTE is set */
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata	= {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->u.dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))
		goto out;

	age = jiffies - rth->u.dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}
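
/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */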
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}

static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct flowi *fl1,
					const struct flowi *fl2)
{
	return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
		(fl1->iif ^ fl2->iif)) == 0);
}

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
}
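
/*
 * Perform a full scan of the hash table and free all entries.
 * Can be called by a softirq or a process; in the latter case,
 * reschedule if necessary.
 */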
static void rt_do_flush(int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;
	struct rtable * tail;

	for (i = 0; i <= rt_hash_mask; i++) {
		if (process_context && need_resched())
			cond_resched();
		rth = rt_hash_table[i].chain;
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));
#ifdef CONFIG_NET_NS
		{
		struct rtable ** prev, * p;

		rth = rt_hash_table[i].chain;

		/* defer releasing the head of the list after spin_unlock */
		for (tail = rth; tail; tail = tail->u.dst.rt_next)
			if (!rt_is_expired(tail))
				break;
		if (rth != tail)
			rt_hash_table[i].chain = tail;

		/* call rt_free on entries after the tail requiring flush */
		prev = &rt_hash_table[i].chain;
		for (p = *prev; p; p = next) {
			next = p->u.dst.rt_next;
			if (!rt_is_expired(p)) {
				prev = &p->u.dst.rt_next;
			} else {
				*prev = next;
				rt_free(p);
			}
		}
		}
#else
		rth = rt_hash_table[i].chain;
		rt_hash_table[i].chain = NULL;
		tail = NULL;
#endif
		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; rth != tail; rth = next) {
			next = rth->u.dst.rt_next;
			rt_free(rth);
		}
	}
}
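
/*
 * rt_check_expire() scans a fraction of the hash table on each run,
 * frees expired and unused entries, and derives rt_chain_length_max
 * from the observed chain lengths (avg + 4*sd, in ONE-unit fixed point).
 */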
#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, *aux, **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (*rthp == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			prefetch(rth->u.dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->u.dst.expires)) {
nofree:
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					/*
					 * We only count entries on a chain with equal
					 * hash inputs once, so that entries for
					 * different QOS levels and other non-hash
					 * input attributes don't unfairly skew the
					 * length computation.
					 */
					for (aux = rt_hash_table[i].chain;;) {
						if (aux == rth) {
							length += ONE;
							break;
						}
						if (compare_hash_inputs(&aux->fl, &rth->fl))
							break;
						aux = aux->u.dst.rt_next;
					}
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					    ip_rt_gc_elasticity,
					    (avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}
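
/*
 * rt_worker_func() runs in process context from the shared workqueue;
 * it rescans the cache and reschedules itself every ip_rt_gc_interval.
 */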
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

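/*
 * Perturbation of rt_genid by a small quantity [0x01..0xff].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without repeating a recent rt_genid.  Entries with
 * a stale genid are treated as expired and reclaimed lazily, so the
 * whole cache is invalidated without walking it.
 */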
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}
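
/*
 * delay >= 0	invalidate the cache and flush entries synchronously
 * delay <  0	invalidate only; stale entries are reclaimed lazily
 */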
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(!in_softirq());
}
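
/*
 * We change rt_genid and let gc do the cleanup.
 */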
static void rt_secret_rebuild(unsigned long __net)
{
	struct net *net = (struct net *)__net;

	rt_cache_invalidate(net);
	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
}

static void rt_secret_rebuild_oneshot(struct net *net)
{
	del_timer_sync(&net->ipv4.rt_secret_timer);
	rt_cache_invalidate(net);
	if (ip_rt_secret_interval) {
		net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
		add_timer(&net->ipv4.rt_secret_timer);
	}
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit()) {
		printk(KERN_WARNING "Route hash chain too long!\n");
		printk(KERN_WARNING "Adjust your secret_interval!\n");
	}

	rt_secret_rebuild_oneshot(net);
}
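
/*
 * Short description of GC goals.
 *
 * We want an algorithm that keeps the routing cache at some equilibrium
 * point, where the number of aged-off entries roughly matches the number
 * of newly generated ones.  Garbage collection is expensive, so it is
 * not run from timers: rt_garbage_collect() is invoked only when the
 * table grows too large or dst allocation fails.
 */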
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate number of entries, which we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
				if (!rt_is_expired(rth) &&
				    !rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire reduced to zero. Otherwise, expire is halved.
		   - table is not full.
		   - we are called from interrupt.
		   - jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}
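
/*
 * rt_intern_hash() inserts a route into a hash chain.  An equivalent
 * entry already in the chain is promoted to the head and reused;
 * otherwise the lowest-scored unused entry may be evicted once the
 * chain grows beyond ip_rt_gc_elasticity.
 */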
static int rt_intern_hash(unsigned hash, struct rtable *rt,
			  struct rtable **rp, struct sk_buff *skb)
{
	struct rtable	*rth, **rthp;
	unsigned long	now;
	struct rtable *cand, **candp;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->u.dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve
		 * routes when we're not caching.  Instead, just point *rp
		 * at rt, so the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it.
		 */
		if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
			int err = arp_bind_neighbour(&rt->u.dst);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				rt_drop(rt);
				return err;
			}
		}

		rt_free(rt);
		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->u.dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->u.dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (rp)
				*rp = rth;
			else
				skb_dst_set(skb, &rth->u.dst);
			return 0;
		}

		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max) {
			struct net *net = dev_net(rt->u.dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(dev_net(rt->u.dst.dev))) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->u.dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.dst.rt_next = rt_hash_table[hash].chain;

#if RT_CACHE_DEBUG >= 2
	if (rt->u.dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
		       hash, &rt->rt_dst);
		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
			printk(" . %pI4", &trt->rt_dst);
		printk("\n");
	}
#endif
	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (rp)
		*rp = rt;
	else
		skb_dst_set(skb, &rt->u.dst);
	return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
	static DEFINE_SPINLOCK(rt_peer_lock);
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
		rt->peer = peer;
		peer = NULL;
	}
	spin_unlock_bh(&rt_peer_lock);
	if (peer)
		inet_putpeer(peer);
}
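
/*
 * Peer allocation may fail only in serious out-of-memory conditions.
 * However we still can generate some output.  Random ID selection looks
 * a bit dangerous because we have no chance to select an ID being
 * unique in a reasonable period of time, but a broken packet identifier
 * may be better than no packet at all.
 */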
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If a peer is attached to the destination, it is never
		   detached, so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp, *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = *rthp) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->u.dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->u.dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	__be32  skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!rt_caching(net))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    rt_is_expired(rth) ||
				    !net_eq(dev_net(rth->u.dst.dev), net)) {
					rthp = &rth->u.dst.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				rt->u.dst.__use		= 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child		= NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete	= 0;
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.path		= &rt->u.dst;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.hh		= NULL;
#ifdef CONFIG_XFRM
				rt->u.dst.xfrm		= NULL;
#endif
				rt->rt_genid		= rt_genid(net);
				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->u.dst;
				netevent.new = &rt->u.dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt, NULL))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->u.dst.expires) {
			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
						rt->fl.oif,
						rt_genid(dev_net(dst->dev)));
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
				&rt->rt_dst, rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}
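
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is a Linux extension to BSD behaviour.
 */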
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->u.dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything,
	 * set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (rt->u.dst.rate_tokens == 0 ||
	    time_after(jiffies,
		       (rt->u.dst.rate_last +
			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
				&rt->rt_src, rt->rt_iif,
				&rt->rt_dst, &rt->rt_gateway);
#endif
	}
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	unsigned long now;
	int code;

	switch (rt->u.dst.error) {
		case EINVAL:
		default:
			goto out;
		case EHOSTUNREACH:
			code = ICMP_HOST_UNREACH;
			break;
		case ENETUNREACH:
			code = ICMP_NET_UNREACH;
			IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
					IPSTATS_MIB_INNOROUTES);
			break;
		case EACCES:
			code = ICMP_PKT_FILTERED;
			break;
	}

	now = jiffies;
	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
		rt->u.dst.rate_tokens = ip_rt_error_burst;
	rt->u.dst.rate_last = now;
	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
		rt->u.dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}
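
/*
 *	MTU plateau table from RFC 1191, used to guess a smaller path MTU
 *	when a Fragmentation Needed ICMP arrives without a next-hop MTU.
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */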
static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}

unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	int i, k;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	int  ikeys[2] = { dev->ifindex, 0 };
	__be32  skeys[2] = { iph->saddr, 0, };
	__be32  daddr = iph->daddr;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (k = 0; k < 2; k++) {
		for (i = 0; i < 2; i++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rcu_read_lock();
			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
			     rth = rcu_dereference(rth->u.dst.rt_next)) {
				unsigned short mtu = new_mtu;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->rt_dst != daddr ||
				    rth->rt_src != iph->saddr ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
				    !net_eq(dev_net(rth->u.dst.dev), net) ||
				    rt_is_expired(rth))
					continue;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility stuff :-( */
					if (mtu == 0 &&
					    old_mtu >= dst_mtu(&rth->u.dst) &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= dst_mtu(&rth->u.dst)) {
					if (mtu < dst_mtu(&rth->u.dst)) {
						dst_confirm(&rth->u.dst);
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
			rcu_read_unlock();
		}
	}
	return est_mtu ? : new_mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst_mtu(dst) > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			mtu = ip_rt_min_pmtu;
			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
		}
		dst->metrics[RTAX_MTU-1] = mtu;
		dst_set_expires(dst, ip_rt_mtu_expires);
		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
	}
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;
	struct in_device *idev = rt->idev;

	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}

	if (idev) {
		rt->idev = NULL;
		in_dev_put(idev);
	}
}

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	struct rtable *rt = (struct rtable *) dst;
	struct in_device *idev = rt->idev;
	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
		struct in_device *loopback_idev =
			in_dev_get(dev_net(dev)->loopback_dev);
		if (loopback_idev) {
			rt->idev = loopback_idev;
			in_dev_put(idev);
		}
	}
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}
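
/*
   We do not cache the source address of an outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so that it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in "u8 *addr" form.
 */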
void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	__be32 src;
	struct fib_result res;

	if (rt->fl.iif == 0)
		src = rt->rt_src;
	else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
		src = FIB_RES_PREFSRC(res);
		fib_res_put(&res);
	} else
		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid & 0xFFFF))
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if (!(rt->u.dst.tclassid & 0xFFFF0000))
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(rt->u.dst.metrics, fi->fib_metrics,
		       sizeof(rt->u.dst.metrics));
		if (fi->fib_mtu == 0) {
			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
			if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.dev->mtu > 576)
				rt->u.dst.metrics[RTAX_MTU-1] = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;

	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
	if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
				       ip_rt_min_advmss);
	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}

static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = in_dev_get(dev);
	u32 itag = 0;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0,
					dev, &spec_dst, &itag, 0) < 0)
		goto e_inval;

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output = ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= init_net.loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	if (our) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	in_dev_put(in_dev);
	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	return rt_intern_hash(hash, rth, NULL, skb);

e_nobufs:
	in_dev_put(in_dev);
	return -ENOBUFS;

e_inval:
	in_dev_put(in_dev);
	return -EINVAL;
}

static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if source is martian,
		 *	the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}

static int __mkroute_input(struct sk_buff *skb,
			   struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}

	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag, skb->mark);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		err = -EINVAL;
		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 */
		if (out_dev == in_dev) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark    = skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
	rth->rt_iif 	=
		rth->fl.iif	= in_dev->dev->ifindex;
	rth->u.dst.dev	= (out_dev)->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif 	= 0;
	rth->rt_spec_dst= spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;
	rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	/* release the working reference to the output device */
	in_dev_put(out_dev);
	return err;
}

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi *fl,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl->iif,
		       rt_genid(dev_net(rth->u.dst.dev)));
	return rt_intern_hash(hash, rth, NULL, skb);
}
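
/*
 *	NOTE. We drop all the packets that have local source
 *	addresses, because every properly looped back packet
 *	must have correct destination already attached by output routine.
 *
 *	Such approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with 100% of guarantee.
 */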
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
				      } },
			    .mark = skb->mark,
			    .iif = dev->ifindex };
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	int		free_res = 0;
	struct net    * net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
	    ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	if ((err = fib_lookup(net, &fl, &res)) != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}
	free_res = 1;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos,
					     net->loopback_dev->ifindex,
					     dev, &spec_dst, &itag, skb->mark);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
done:
	in_dev_put(in_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag, skb->mark);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;
	rth->rt_genid = rt_genid(net);

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark    = skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= net->loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->u.dst.input= ip_local_deliver;
	rth->rt_flags 	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->u.dst.input= ip_error;
		rth->u.dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
	err = rt_intern_hash(hash, rth, NULL, skb);
	goto done;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto done;

e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto e_inval;
}

int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		   u8 tos, struct net_device *dev)
{
	struct rtable * rth;
	unsigned	hash;
	int iif = dev->ifindex;
	struct net *net;

	net = dev_net(dev);

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		if (((rth->fl.fl4_dst ^ daddr) |
		     (rth->fl.fl4_src ^ saddr) |
		     (rth->fl.iif ^ iif) |
		     rth->fl.oif |
		     (rth->fl.fl4_tos ^ tos)) == 0 &&
		    rth->fl.mark == skb->mark &&
		    net_eq(dev_net(rth->u.dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->u.dst, jiffies);
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			skb_dst_set(skb, &rth->u.dst);
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}
	rcu_read_unlock();

skip_cache:
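	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result the host on a multicast
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because
	   route cache entry is created eventually.
	 */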
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev;

		rcu_read_lock();
		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
			int our = ip_check_mc(in_dev, daddr, saddr,
				ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
			    || (!ipv4_is_local_multicast(daddr) &&
				IN_DEV_MFORWARD(in_dev))
#endif
			    ) {
				rcu_read_unlock();
				return ip_route_input_mc(skb, daddr, saddr,
							 tos, dev, our);
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
2331
2332static int __mkroute_output(struct rtable **result,
2333 struct fib_result *res,
2334 const struct flowi *fl,
2335 const struct flowi *oldflp,
2336 struct net_device *dev_out,
2337 unsigned flags)
2338{
2339 struct rtable *rth;
2340 struct in_device *in_dev;
2341 u32 tos = RT_FL_TOS(oldflp);
2342 int err = 0;
2343
2344 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2345 return -EINVAL;
2346
2347 if (fl->fl4_dst == htonl(0xFFFFFFFF))
2348 res->type = RTN_BROADCAST;
2349 else if (ipv4_is_multicast(fl->fl4_dst))
2350 res->type = RTN_MULTICAST;
2351 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2352 return -EINVAL;
2353
2354 if (dev_out->flags & IFF_LOOPBACK)
2355 flags |= RTCF_LOCAL;
2356
2357
2358 in_dev = in_dev_get(dev_out);
2359 if (!in_dev)
2360 return -EINVAL;
2361
2362 if (res->type == RTN_BROADCAST) {
2363 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2364 if (res->fi) {
2365 fib_info_put(res->fi);
2366 res->fi = NULL;
2367 }
2368 } else if (res->type == RTN_MULTICAST) {
2369 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2370 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2371 oldflp->proto))
2372 flags &= ~RTCF_LOCAL;
2373
2374
2375
2376
2377 if (res->fi && res->prefixlen < 4) {
2378 fib_info_put(res->fi);
2379 res->fi = NULL;
2380 }
2381 }
2382
2383
2384 rth = dst_alloc(&ipv4_dst_ops);
2385 if (!rth) {
2386 err = -ENOBUFS;
2387 goto cleanup;
2388 }
2389
2390 atomic_set(&rth->u.dst.__refcnt, 1);
2391 rth->u.dst.flags= DST_HOST;
2392 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2393 rth->u.dst.flags |= DST_NOXFRM;
2394 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2395 rth->u.dst.flags |= DST_NOPOLICY;
2396
2397 rth->fl.fl4_dst = oldflp->fl4_dst;
2398 rth->fl.fl4_tos = tos;
2399 rth->fl.fl4_src = oldflp->fl4_src;
2400 rth->fl.oif = oldflp->oif;
2401 rth->fl.mark = oldflp->mark;
2402 rth->rt_dst = fl->fl4_dst;
2403 rth->rt_src = fl->fl4_src;
2404 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2405
2406
	rth->u.dst.dev = dev_out;
	dev_hold(dev_out);
	rth->idev = in_dev_get(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst = fl->fl4_src;

	rth->u.dst.output = ip_output;
	rth->rt_genid = rt_genid(dev_net(dev_out));

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->u.dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;

	*result = rth;
cleanup:
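	/* release work reference to inet device */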
	in_dev_put(in_dev);

	return err;
}

static int ip_mkroute_output(struct rtable **rp,
			     struct fib_result *res,
			     const struct flowi *fl,
			     const struct flowi *oldflp,
			     struct net_device *dev_out,
			     unsigned flags)
{
	struct rtable *rth = NULL;
	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
	unsigned hash;

	if (err == 0) {
		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
			       rt_genid(dev_net(dev_out)));
		err = rt_intern_hash(hash, rth, rp, NULL);
	}

	return err;
}
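
/*
 * Major route resolver routine.
 */
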
static int ip_route_output_slow(struct net *net, struct rtable **rp,
				const struct flowi *oldflp)
{
	u32 tos = RT_FL_TOS(oldflp);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = oldflp->fl4_dst,
					.saddr = oldflp->fl4_src,
					.tos = tos & IPTOS_RT_MASK,
					.scope = ((tos & RTO_ONLINK) ?
						  RT_SCOPE_LINK :
						  RT_SCOPE_UNIVERSE),
				      } },
			    .mark = oldflp->mark,
			    .iif = net->loopback_dev->ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned flags = 0;
	struct net_device *dev_out = NULL;
	int free_res = 0;
	int err;

	res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (ipv4_is_multicast(oldflp->fl4_src) ||
		    ipv4_is_lbcast(oldflp->fl4_src) ||
		    ipv4_is_zeronet(oldflp->fl4_src))
			goto out;
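
		/* I removed check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */
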
		if (oldflp->oif == 0 &&
		    (ipv4_is_multicast(oldflp->fl4_dst) ||
		     oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
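			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */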
			dev_out = ip_dev_find(net, oldflp->fl4_src);
			if (dev_out == NULL)
				goto out;
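
			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic,vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of routing cache they are broken,
			   because we are not allowed to build multicast path
			   with loopback source addr (look, routing cache
			   cannot know, that ttl is zero, so that packet
			   will not leave this host and route is valid).
			   Luckily, this hack is good workaround.
			 */
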
			fl.oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
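			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */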
			dev_out = ip_dev_find(net, oldflp->fl4_src);
			if (dev_out == NULL)
				goto out;
			dev_put(dev_out);
			dev_out = NULL;
		}
	}

	if (oldflp->oif) {
		dev_out = dev_get_by_index(net, oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;
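
		/* RACE: Check return value of inet_select_addr instead. */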
		if (__in_dev_get_rtnl(dev_out) == NULL) {
			dev_put(dev_out);
			goto out;	/* Wrong error code */
		}

		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (ipv4_is_multicast(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		if (dev_out)
			dev_put(dev_out);
		dev_out = net->loopback_dev;
		dev_hold(dev_out);
		fl.oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, &fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
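			/* Apparently, routing tables are wrong. Assume,
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK

			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */
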
			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		err = -ENETUNREACH;
		goto out;
	}
	free_res = 1;

	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src)
			fl.fl4_src = fl.fl4_dst;
		if (dev_out)
			dev_put(dev_out);
		dev_out = net->loopback_dev;
		dev_hold(dev_out);
		fl.oif = dev_out->ifindex;
		if (res.fi)
			fib_info_put(res.fi);
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(net, &fl, &res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	if (dev_out)
		dev_put(dev_out);
	dev_out = FIB_RES_DEV(res);
	dev_hold(dev_out);
	fl.oif = dev_out->ifindex;

make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);

	if (free_res)
		fib_res_put(&res);
	if (dev_out)
		dev_put(dev_out);
out:	return err;
}

int __ip_route_output_key(struct net *net, struct rtable **rp,
			  const struct flowi *flp)
{
	unsigned hash;
	struct rtable *rth;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rth->fl.iif == 0 &&
		    rth->fl.oif == flp->oif &&
		    rth->fl.mark == flp->mark &&
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
		      (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->u.dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->u.dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, rp, flp);
}

EXPORT_SYMBOL_GPL(__ip_route_output_key);

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family = AF_INET,
	.protocol = cpu_to_be16(ETH_P_IP),
	.destroy = ipv4_dst_destroy,
	.check = ipv4_dst_check,
	.update_pmtu = ipv4_rt_blackhole_update_pmtu,
	.entries = ATOMIC_INIT(0),
};

static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
{
	struct rtable *ort = *rp;
	struct rtable *rt = (struct rtable *)
		dst_alloc(&ipv4_dst_blackhole_ops);

	if (rt) {
		struct dst_entry *new = &rt->u.dst;

		atomic_set(&new->__refcnt, 1);
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX * sizeof(u32));

		new->dev = ort->u.dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->fl = ort->fl;

		rt->idev = ort->idev;
		if (rt->idev)
			in_dev_hold(rt->idev);
		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_iif = ort->rt_iif;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);

		dst_free(new);
	}

	dst_release(&(*rp)->u.dst);
	*rp = rt;
	return (rt ? 0 : -ENOMEM);
}

int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
			 struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
		return err;

	if (flp->proto) {
		if (!flp->fl4_src)
			flp->fl4_src = (*rp)->rt_src;
		if (!flp->fl4_dst)
			flp->fl4_dst = (*rp)->rt_dst;
		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
				    flags ? XFRM_LOOKUP_WAIT : 0);
		if (err == -EREMOTE)
			err = ipv4_dst_blackhole(net, rp, flp);

		return err;
	}

	return 0;
}

EXPORT_SYMBOL_GPL(ip_route_output_flow);

int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(net, rp, flp, NULL, 0);
}

static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family = AF_INET;
	r->rtm_dst_len = 32;
	r->rtm_src_len = 0;
	r->rtm_tos = rt->fl.fl4_tos;
	r->rtm_table = RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type = rt->rt_type;
	r->rtm_scope = RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
#endif
	if (rt->fl.iif)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto nla_put_failure;

	error = rt->u.dst.error;
	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
	if (rt->peer) {
		id = rt->peer->ip_id_count;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

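	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */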
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);
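
	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */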
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol = htons(ETH_P_IP);
		skb->dev = dev;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		struct flowi fl = {
			.nl_u = {
				.ip4_u = {
					.daddr = dst,
					.saddr = src,
					.tos = rtm->rtm_tos,
				},
			},
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
		};
		err = ip_route_output_key(net, &rt, &fl);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->u.dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}

int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set(skb, dst_clone(&rt->u.dst));
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}

#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}

static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
					      void __user *oldval,
					      size_t __user *oldlenp,
					      void __user *newval,
					      size_t newlen)
{
	int delay;
	struct net *net;

	if (newlen != sizeof(int))
		return -EINVAL;
	if (get_user(delay, (int __user *)newval))
		return -EFAULT;
	net = (struct net *)table->extra1;
	rt_cache_flush(net, delay);
	return 0;
}

static void rt_secret_reschedule(int old)
{
	struct net *net;
	int new = ip_rt_secret_interval;
	int diff = new - old;

	if (!diff)
		return;

	rtnl_lock();
	for_each_net(net) {
		int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);

		if (!new)
			continue;

		if (deleted) {
			long time = net->ipv4.rt_secret_timer.expires - jiffies;

			if (time <= 0 || (time += diff) <= 0)
				time = 0;

			net->ipv4.rt_secret_timer.expires = time;
		} else
			net->ipv4.rt_secret_timer.expires = new;

		net->ipv4.rt_secret_timer.expires += jiffies;
		add_timer(&net->ipv4.rt_secret_timer);
	}
	rtnl_unlock();
}

static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
					  void __user *buffer, size_t *lenp,
					  loff_t *ppos)
{
	int old = ip_rt_secret_interval;
	int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);

	rt_secret_reschedule(old);

	return ret;
}

static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
						   void __user *oldval,
						   size_t __user *oldlenp,
						   void __user *newval,
						   size_t newlen)
{
	int old = ip_rt_secret_interval;
	int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);

	rt_secret_reschedule(old);

	return ret;
}

static ctl_table ipv4_route_table[] = {
	{
		.ctl_name = NET_IPV4_ROUTE_GC_THRESH,
		.procname = "gc_thresh",
		.data = &ipv4_dst_ops.gc_thresh,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
		.procname = "max_size",
		.data = &ip_rt_max_size,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
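		/* Deprecated. Use gc_min_interval_ms */
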
		.ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
		.procname = "gc_min_interval",
		.data = &ip_rt_gc_min_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
		.strategy = sysctl_jiffies,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
		.procname = "gc_min_interval_ms",
		.data = &ip_rt_gc_min_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_ms_jiffies,
		.strategy = sysctl_ms_jiffies,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
		.procname = "gc_timeout",
		.data = &ip_rt_gc_timeout,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
		.strategy = sysctl_jiffies,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
		.procname = "gc_interval",
		.data = &ip_rt_gc_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
		.strategy = sysctl_jiffies,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
		.procname = "redirect_load",
		.data = &ip_rt_redirect_load,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
		.procname = "redirect_number",
		.data = &ip_rt_redirect_number,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
		.procname = "redirect_silence",
		.data = &ip_rt_redirect_silence,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_ERROR_COST,
		.procname = "error_cost",
		.data = &ip_rt_error_cost,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
		.procname = "error_burst",
		.data = &ip_rt_error_burst,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
		.procname = "gc_elasticity",
		.data = &ip_rt_gc_elasticity,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
		.procname = "mtu_expires",
		.data = &ip_rt_mtu_expires,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
		.strategy = sysctl_jiffies,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
		.procname = "min_pmtu",
		.data = &ip_rt_min_pmtu,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
		.procname = "min_adv_mss",
		.data = &ip_rt_min_advmss,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
		.procname = "secret_interval",
		.data = &ip_rt_secret_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = ipv4_sysctl_rt_secret_interval,
		.strategy = ipv4_sysctl_rt_secret_interval_strategy,
	},
	{ .ctl_name = 0 }
};

static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] = {
	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE,
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
	  .mode = 0555, .child = empty},
	{ }
};

static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", .ctl_name = CTL_NET, },
	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
	{ },
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.ctl_name = NET_IPV4_ROUTE_FLUSH,
		.procname = "flush",
		.maxlen = sizeof(int),
		.mode = 0200,
		.proc_handler = ipv4_sysctl_rtcache_flush,
		.strategy = ipv4_sysctl_rtcache_flush_strategy,
	},
	{ .ctl_name = 0 },
};

static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", .ctl_name = CTL_NET, },
	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
	{ },
};

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (net != &init_net) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

static __net_init int rt_secret_timer_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid,
		   (int) ((num_physpages ^ (num_physpages >> 8)) ^
			  (jiffies ^ (jiffies >> 7))));

	net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
	net->ipv4.rt_secret_timer.data = (unsigned long)net;
	init_timer_deferrable(&net->ipv4.rt_secret_timer);

	if (ip_rt_secret_interval) {
		net->ipv4.rt_secret_timer.expires =
			jiffies + net_random() % ip_rt_secret_interval +
			ip_rt_secret_interval;
		add_timer(&net->ipv4.rt_secret_timer);
	}
	return 0;
}

static __net_exit void rt_secret_timer_exit(struct net *net)
{
	del_timer_sync(&net->ipv4.rt_secret_timer);
}

static __net_initdata struct pernet_operations rt_secret_timer_ops = {
	.init = rt_secret_timer_init,
	.exit = rt_secret_timer_exit,
};

#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);

int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_NET_CLS_ROUTE
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct),
				    __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();
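
	/* All the timers, started at system startup tend
	   to synchronize. Perturb it a bit.
	 */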
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (register_pernet_subsys(&rt_secret_timer_ops))
		printk(KERN_ERR "Unable to setup rt_secret_timer\n");

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	return rc;
}

#ifdef CONFIG_SYSCTL
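/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */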
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif

EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);