#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
	((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT	(300*HZ)

/* Route cache tunables: garbage collection, ICMP redirect rate limiting
 * and path MTU handling.
 */
static int ip_rt_min_delay = 2 * HZ;
static int ip_rt_max_delay = 10 * HZ;
static int ip_rt_max_size;
static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
static int ip_rt_gc_interval = 60 * HZ;
static int ip_rt_gc_min_interval = HZ / 2;
static int ip_rt_redirect_number = 9;
static int ip_rt_redirect_load = HZ / 50;
static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost = HZ;
static int ip_rt_error_burst = 5 * HZ;
static int ip_rt_gc_elasticity = 8;
static int ip_rt_mtu_expires = 10 * 60 * HZ;
static int ip_rt_min_pmtu = 512 + 20 + 20;
static int ip_rt_min_advmss = 256;
static int ip_rt_secret_interval = 10 * 60 * HZ;
static unsigned long rt_deadline;

#define RTprint(a...)	printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static void rt_check_expire(struct work_struct *work);
static DECLARE_DELAYED_WORK(expires_work, rt_check_expire);
static struct timer_list rt_secret_timer;

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void ipv4_dst_destroy(struct dst_entry *dst);
static void ipv4_dst_ifdown(struct dst_entry *dst,
			    struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void ipv4_link_failure(struct sk_buff *skb);
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);

/* Interface to the generic destination cache. */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		__constant_htons(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.entry_size =		sizeof(struct rtable),
};

#define ECN_OR_COST(class)	TC_PRIO_##class

/* Map the IP TOS field to a packet scheduler priority (TC_PRIO_*). */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};

/*
 * The IPv4 route cache: a hash table of rtable chains.  Lookups walk the
 * chains under rcu_read_lock(); insertions and removals take a per-bucket
 * spinlock.
 */
struct rt_hash_bucket {
	struct rtable *chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of one spinlock per bucket, a fixed-size array of spinlocks is
 * shared by the buckets (indexed by bucket number modulo the array size)
 * to keep memory usage bounded.
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t *rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
# define rt_hash_lock_init()	{ \
		int i; \
		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
			spin_lock_init(&rt_hash_locks[i]); \
		}
#else
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()
#endif

static struct rt_hash_bucket *rt_hash_table;
static unsigned rt_hash_mask;
static unsigned int rt_hash_log;
static unsigned int rt_hash_rnd;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
	(__raw_get_cpu_var(rt_cache_stat).field++)

static int rt_intern_hash(unsigned hash, struct rtable *rth,
			  struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr)
{
	return (jhash_2words(daddr, saddr, rt_hash_rnd)
		& rt_hash_mask);
}

#define rt_hash(daddr, saddr, idx) \
	rt_hash_code((__force u32)(__be32)(daddr),\
		(__force u32)(__be32)(saddr) ^ ((idx) << 5))
269#ifdef CONFIG_PROC_FS
270struct rt_cache_iter_state {
271 int bucket;
272};
273
274static struct rtable *rt_cache_get_first(struct seq_file *seq)
275{
276 struct rtable *r = NULL;
277 struct rt_cache_iter_state *st = seq->private;
278
279 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
280 rcu_read_lock_bh();
281 r = rt_hash_table[st->bucket].chain;
282 if (r)
283 break;
284 rcu_read_unlock_bh();
285 }
286 return rcu_dereference(r);
287}
288
289static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
290{
291 struct rt_cache_iter_state *st = seq->private;
292
293 r = r->u.dst.rt_next;
294 while (!r) {
295 rcu_read_unlock_bh();
296 if (--st->bucket < 0)
297 break;
298 rcu_read_lock_bh();
299 r = rt_hash_table[st->bucket].chain;
300 }
301 return rcu_dereference(r);
302}
303
304static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
305{
306 struct rtable *r = rt_cache_get_first(seq);
307
308 if (r)
309 while (pos && (r = rt_cache_get_next(seq, r)))
310 --pos;
311 return pos ? NULL : r;
312}
313
314static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
315{
316 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
317}
318
319static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
320{
321 struct rtable *r = NULL;
322
323 if (v == SEQ_START_TOKEN)
324 r = rt_cache_get_first(seq);
325 else
326 r = rt_cache_get_next(seq, v);
327 ++*pos;
328 return r;
329}
330
331static void rt_cache_seq_stop(struct seq_file *seq, void *v)
332{
333 if (v && v != SEQ_START_TOKEN)
334 rcu_read_unlock_bh();
335}
336
337static int rt_cache_seq_show(struct seq_file *seq, void *v)
338{
339 if (v == SEQ_START_TOKEN)
340 seq_printf(seq, "%-127s\n",
341 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
342 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
343 "HHUptod\tSpecDst");
344 else {
345 struct rtable *r = v;
346 char temp[256];
347
348 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
349 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
350 r->u.dst.dev ? r->u.dst.dev->name : "*",
351 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
352 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
353 r->u.dst.__use, 0, (unsigned long)r->rt_src,
354 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
355 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
356 dst_metric(&r->u.dst, RTAX_WINDOW),
357 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
358 dst_metric(&r->u.dst, RTAX_RTTVAR)),
359 r->fl.fl4_tos,
360 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
361 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
362 dev_queue_xmit) : 0,
363 r->rt_spec_dst);
364 seq_printf(seq, "%-127s\n", temp);
365 }
366 return 0;
367}
368
369static const struct seq_operations rt_cache_seq_ops = {
370 .start = rt_cache_seq_start,
371 .next = rt_cache_seq_next,
372 .stop = rt_cache_seq_stop,
373 .show = rt_cache_seq_show,
374};
375
376static int rt_cache_seq_open(struct inode *inode, struct file *file)
377{
378 return seq_open_private(file, &rt_cache_seq_ops,
379 sizeof(struct rt_cache_iter_state));
380}
381
382static const struct file_operations rt_cache_seq_fops = {
383 .owner = THIS_MODULE,
384 .open = rt_cache_seq_open,
385 .read = seq_read,
386 .llseek = seq_lseek,
387 .release = seq_release_private,
388};
389
390
391static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
392{
393 int cpu;
394
395 if (*pos == 0)
396 return SEQ_START_TOKEN;
397
398 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
399 if (!cpu_possible(cpu))
400 continue;
401 *pos = cpu+1;
402 return &per_cpu(rt_cache_stat, cpu);
403 }
404 return NULL;
405}
406
407static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
408{
409 int cpu;
410
411 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
412 if (!cpu_possible(cpu))
413 continue;
414 *pos = cpu+1;
415 return &per_cpu(rt_cache_stat, cpu);
416 }
417 return NULL;
418
419}
420
421static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
422{
423
424}
425
426static int rt_cpu_seq_show(struct seq_file *seq, void *v)
427{
428 struct rt_cache_stat *st = v;
429
430 if (v == SEQ_START_TOKEN) {
431 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
432 return 0;
433 }
434
435 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
436 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
437 atomic_read(&ipv4_dst_ops.entries),
438 st->in_hit,
439 st->in_slow_tot,
440 st->in_slow_mc,
441 st->in_no_route,
442 st->in_brd,
443 st->in_martian_dst,
444 st->in_martian_src,
445
446 st->out_hit,
447 st->out_slow_tot,
448 st->out_slow_mc,
449
450 st->gc_total,
451 st->gc_ignored,
452 st->gc_goal_miss,
453 st->gc_dst_overflow,
454 st->in_hlist_search,
455 st->out_hlist_search
456 );
457 return 0;
458}
459
460static const struct seq_operations rt_cpu_seq_ops = {
461 .start = rt_cpu_seq_start,
462 .next = rt_cpu_seq_next,
463 .stop = rt_cpu_seq_stop,
464 .show = rt_cpu_seq_show,
465};
466
467
468static int rt_cpu_seq_open(struct inode *inode, struct file *file)
469{
470 return seq_open(file, &rt_cpu_seq_ops);
471}
472
473static const struct file_operations rt_cpu_seq_fops = {
474 .owner = THIS_MODULE,
475 .open = rt_cpu_seq_open,
476 .read = seq_read,
477 .llseek = seq_lseek,
478 .release = seq_release,
479};
480
481#endif

static __inline__ void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
	/* Broadcast/multicast input entries are cheap to recreate, so they
	 * may be reclaimed aggressively when they share a chain with other
	 * entries.
	 */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.dst.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->u.dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))
		goto out;

	age = jiffies - rth->u.dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Score an unreferenced entry for eviction: recently used entries,
 * "valuable" entries (see rt_valuable()) and output or plain forwarding
 * routes score higher; the lowest-scoring candidate is evicted first.
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}

/* Periodic (delayed work) scan of the route cache: each run walks a slice
 * of the hash table sized so that the whole table is covered roughly once
 * per ip_rt_gc_timeout, freeing entries that have expired or that
 * rt_may_expire() considers reclaimable.
 */
static void rt_check_expire(struct work_struct *work)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, **rthp;
	u64 mult;

	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		if (*rthp == NULL)
			continue;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			if (rth->u.dst.expires) {
				/* Explicit expiry time not reached yet: keep. */
				if (time_before_eq(jiffies, rth->u.dst.expires)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				tmo >>= 1;
				rthp = &rth->u.dst.rt_next;
				continue;
			}

			/* Unlink and free the stale entry. */
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
	}
	rover = i;
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
610
611
612
613
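/* Flush the entire route cache: re-randomize the hash secret, then unhook
 * each chain under its bucket lock and free the entries via RCU.
 */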
614static void rt_run_flush(unsigned long dummy)
615{
616 int i;
617 struct rtable *rth, *next;
618
619 rt_deadline = 0;
620
621 get_random_bytes(&rt_hash_rnd, 4);
622
623 for (i = rt_hash_mask; i >= 0; i--) {
624 spin_lock_bh(rt_hash_lock_addr(i));
625 rth = rt_hash_table[i].chain;
626 if (rth)
627 rt_hash_table[i].chain = NULL;
628 spin_unlock_bh(rt_hash_lock_addr(i));
629
630 for (; rth; rth = next) {
631 next = rth->u.dst.rt_next;
632 rt_free(rth);
633 }
634 }
635}
636
637static DEFINE_SPINLOCK(rt_flush_lock);
638
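/* Request a cache flush.  A negative delay means "use ip_rt_min_delay";
 * zero or less flushes immediately.  Pending delayed flushes are coalesced
 * so the cache is flushed no later than rt_deadline (at most
 * ip_rt_max_delay after the first request).
 */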
639void rt_cache_flush(int delay)
640{
641 unsigned long now = jiffies;
642 int user_mode = !in_softirq();
643
644 if (delay < 0)
645 delay = ip_rt_min_delay;
646
647 spin_lock_bh(&rt_flush_lock);
648
649 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
650 long tmo = (long)(rt_deadline - now);
651
652
653
654
655
656
657
658
659 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
660 tmo = 0;
661
662 if (delay > tmo)
663 delay = tmo;
664 }
665
666 if (delay <= 0) {
667 spin_unlock_bh(&rt_flush_lock);
668 rt_run_flush(0);
669 return;
670 }
671
672 if (rt_deadline == 0)
673 rt_deadline = now + ip_rt_max_delay;
674
675 mod_timer(&rt_flush_timer, now+delay);
676 spin_unlock_bh(&rt_flush_lock);
677}
678
679static void rt_secret_rebuild(unsigned long dummy)
680{
681 unsigned long now = jiffies;
682
683 rt_cache_flush(0);
684 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
685}
686
687
688
689
690
691
692
693
694
695
696
697
698
699
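/* Garbage collection, invoked via ipv4_dst_ops.gc when the cache is
 * considered too large.  Works out how many entries must go to get back
 * under ip_rt_gc_elasticity entries per bucket, then sweeps the hash
 * chains freeing unreferenced entries that rt_may_expire() allows,
 * relaxing the expiry threshold if the goal is not met.
 */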
700static int rt_garbage_collect(void)
701{
702 static unsigned long expire = RT_GC_TIMEOUT;
703 static unsigned long last_gc;
704 static int rover;
705 static int equilibrium;
706 struct rtable *rth, **rthp;
707 unsigned long now = jiffies;
708 int goal;
709
710
711
712
713
714
715 RT_CACHE_STAT_INC(gc_total);
716
717 if (now - last_gc < ip_rt_gc_min_interval &&
718 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
719 RT_CACHE_STAT_INC(gc_ignored);
720 goto out;
721 }
722
723
724 goal = atomic_read(&ipv4_dst_ops.entries) -
725 (ip_rt_gc_elasticity << rt_hash_log);
726 if (goal <= 0) {
727 if (equilibrium < ipv4_dst_ops.gc_thresh)
728 equilibrium = ipv4_dst_ops.gc_thresh;
729 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
730 if (goal > 0) {
731 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
732 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
733 }
734 } else {
735
736
737
738 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
739 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
740 }
741
742 if (now - last_gc >= ip_rt_gc_min_interval)
743 last_gc = now;
744
745 if (goal <= 0) {
746 equilibrium += goal;
747 goto work_done;
748 }
749
750 do {
751 int i, k;
752
753 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
754 unsigned long tmo = expire;
755
756 k = (k + 1) & rt_hash_mask;
757 rthp = &rt_hash_table[k].chain;
758 spin_lock_bh(rt_hash_lock_addr(k));
759 while ((rth = *rthp) != NULL) {
760 if (!rt_may_expire(rth, tmo, expire)) {
761 tmo >>= 1;
762 rthp = &rth->u.dst.rt_next;
763 continue;
764 }
765 *rthp = rth->u.dst.rt_next;
766 rt_free(rth);
767 goal--;
768 }
769 spin_unlock_bh(rt_hash_lock_addr(k));
770 if (goal <= 0)
771 break;
772 }
773 rover = k;
774
775 if (goal <= 0)
776 goto work_done;
777
778
779
780
781
782
783
784
785
786
787 RT_CACHE_STAT_INC(gc_goal_miss);
788
789 if (expire == 0)
790 break;
791
792 expire >>= 1;
793#if RT_CACHE_DEBUG >= 2
794 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
795 atomic_read(&ipv4_dst_ops.entries), goal, i);
796#endif
797
798 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
799 goto out;
800 } while (!in_softirq() && time_before_eq(jiffies, now));
801
802 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
803 goto out;
804 if (net_ratelimit())
805 printk(KERN_WARNING "dst cache overflow\n");
806 RT_CACHE_STAT_INC(gc_dst_overflow);
807 return 1;
808
809work_done:
810 expire += ip_rt_gc_min_interval;
811 if (expire > ip_rt_gc_timeout ||
812 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
813 expire = ip_rt_gc_timeout;
814#if RT_CACHE_DEBUG >= 2
815 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
816 atomic_read(&ipv4_dst_ops.entries), goal, rover);
817#endif
818out: return 0;
819}
820
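/* Insert a new route into the chain at 'hash'.  If an entry with the same
 * flow keys already exists it is moved to the front and reused instead.
 * Overlong chains have their least valuable unreferenced entry evicted,
 * and unicast/output routes are bound to an ARP neighbour before being
 * made visible; on neighbour table overflow one forced GC pass is tried
 * before giving up with -ENOBUFS.
 */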
821static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
822{
823 struct rtable *rth, **rthp;
824 unsigned long now;
825 struct rtable *cand, **candp;
826 u32 min_score;
827 int chain_length;
828 int attempts = !in_softirq();
829
830restart:
831 chain_length = 0;
832 min_score = ~(u32)0;
833 cand = NULL;
834 candp = NULL;
835 now = jiffies;
836
837 rthp = &rt_hash_table[hash].chain;
838
839 spin_lock_bh(rt_hash_lock_addr(hash));
840 while ((rth = *rthp) != NULL) {
841 if (compare_keys(&rth->fl, &rt->fl)) {
842
843 *rthp = rth->u.dst.rt_next;
844
845
846
847
848
849 rcu_assign_pointer(rth->u.dst.rt_next,
850 rt_hash_table[hash].chain);
851
852
853
854
855 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
856
857 dst_use(&rth->u.dst, now);
858 spin_unlock_bh(rt_hash_lock_addr(hash));
859
860 rt_drop(rt);
861 *rp = rth;
862 return 0;
863 }
864
865 if (!atomic_read(&rth->u.dst.__refcnt)) {
866 u32 score = rt_score(rth);
867
868 if (score <= min_score) {
869 cand = rth;
870 candp = rthp;
871 min_score = score;
872 }
873 }
874
875 chain_length++;
876
877 rthp = &rth->u.dst.rt_next;
878 }
879
880 if (cand) {
881
882
883
884
885
886
887 if (chain_length > ip_rt_gc_elasticity) {
888 *candp = cand->u.dst.rt_next;
889 rt_free(cand);
890 }
891 }
892
893
894
895
896 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
897 int err = arp_bind_neighbour(&rt->u.dst);
898 if (err) {
899 spin_unlock_bh(rt_hash_lock_addr(hash));
900
901 if (err != -ENOBUFS) {
902 rt_drop(rt);
903 return err;
904 }
905
906
907
908
909
910 if (attempts-- > 0) {
911 int saved_elasticity = ip_rt_gc_elasticity;
912 int saved_int = ip_rt_gc_min_interval;
913 ip_rt_gc_elasticity = 1;
914 ip_rt_gc_min_interval = 0;
915 rt_garbage_collect();
916 ip_rt_gc_min_interval = saved_int;
917 ip_rt_gc_elasticity = saved_elasticity;
918 goto restart;
919 }
920
921 if (net_ratelimit())
922 printk(KERN_WARNING "Neighbour table overflow.\n");
923 rt_drop(rt);
924 return -ENOBUFS;
925 }
926 }
927
928 rt->u.dst.rt_next = rt_hash_table[hash].chain;
929#if RT_CACHE_DEBUG >= 2
930 if (rt->u.dst.rt_next) {
931 struct rtable *trt;
932 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
933 NIPQUAD(rt->rt_dst));
934 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
935 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
936 printk("\n");
937 }
938#endif
939 rt_hash_table[hash].chain = rt;
940 spin_unlock_bh(rt_hash_lock_addr(hash));
941 *rp = rt;
942 return 0;
943}
944
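/* Attach the long-lived inet_peer entry for this destination to the route,
 * creating one if requested.  The peer supplies the IP ID counter and the
 * cached TCP timestamp data reported via rtnetlink.
 */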
945void rt_bind_peer(struct rtable *rt, int create)
946{
947 static DEFINE_SPINLOCK(rt_peer_lock);
948 struct inet_peer *peer;
949
950 peer = inet_getpeer(rt->rt_dst, create);
951
952 spin_lock_bh(&rt_peer_lock);
953 if (rt->peer == NULL) {
954 rt->peer = peer;
955 peer = NULL;
956 }
957 spin_unlock_bh(&rt_peer_lock);
958 if (peer)
959 inet_putpeer(peer);
960}
961
962
963
964
965
966
967
968
969static void ip_select_fb_ident(struct iphdr *iph)
970{
971 static DEFINE_SPINLOCK(ip_fb_id_lock);
972 static u32 ip_fallback_id;
973 u32 salt;
974
975 spin_lock_bh(&ip_fb_id_lock);
976 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
977 iph->id = htons(salt & 0xFFFF);
978 ip_fallback_id = salt;
979 spin_unlock_bh(&ip_fb_id_lock);
980}
981
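/* Choose the IP identification for an outgoing datagram: use the per-peer
 * counter when a peer entry is available, otherwise fall back to a global,
 * destination-salted counter.
 */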
982void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
983{
984 struct rtable *rt = (struct rtable *) dst;
985
986 if (rt) {
987 if (rt->peer == NULL)
988 rt_bind_peer(rt, 1);
989
990
991
992
993 if (rt->peer) {
994 iph->id = htons(inet_getid(rt->peer, more));
995 return;
996 }
997 } else
998 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
999 __builtin_return_address(0));
1000
1001 ip_select_fb_ident(iph);
1002}
1003
1004static void rt_del(unsigned hash, struct rtable *rt)
1005{
1006 struct rtable **rthp;
1007
1008 spin_lock_bh(rt_hash_lock_addr(hash));
1009 ip_rt_put(rt);
1010 for (rthp = &rt_hash_table[hash].chain; *rthp;
1011 rthp = &(*rthp)->u.dst.rt_next)
1012 if (*rthp == rt) {
1013 *rthp = rt->u.dst.rt_next;
1014 rt_free(rt);
1015 break;
1016 }
1017 spin_unlock_bh(rt_hash_lock_addr(hash));
1018}
1019
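/* Handle an ICMP redirect from old_gw asking us to reach daddr via new_gw.
 * The new gateway is sanity-checked (redirects enabled, on-link, unicast),
 * then each matching cache entry is replaced by a copy marked
 * RTCF_REDIRECTED that points at the new gateway, provided a valid
 * neighbour entry can be bound to it.
 */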
1020void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1021 __be32 saddr, struct net_device *dev)
1022{
1023 int i, k;
1024 struct in_device *in_dev = in_dev_get(dev);
1025 struct rtable *rth, **rthp;
1026 __be32 skeys[2] = { saddr, 0 };
1027 int ikeys[2] = { dev->ifindex, 0 };
1028 struct netevent_redirect netevent;
1029
1030 if (!in_dev)
1031 return;
1032
1033 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1034 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1035 goto reject_redirect;
1036
1037 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1038 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1039 goto reject_redirect;
1040 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1041 goto reject_redirect;
1042 } else {
1043 if (inet_addr_type(new_gw) != RTN_UNICAST)
1044 goto reject_redirect;
1045 }
1046
1047 for (i = 0; i < 2; i++) {
1048 for (k = 0; k < 2; k++) {
1049 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1050
1051 rthp=&rt_hash_table[hash].chain;
1052
1053 rcu_read_lock();
1054 while ((rth = rcu_dereference(*rthp)) != NULL) {
1055 struct rtable *rt;
1056
1057 if (rth->fl.fl4_dst != daddr ||
1058 rth->fl.fl4_src != skeys[i] ||
1059 rth->fl.oif != ikeys[k] ||
1060 rth->fl.iif != 0) {
1061 rthp = &rth->u.dst.rt_next;
1062 continue;
1063 }
1064
1065 if (rth->rt_dst != daddr ||
1066 rth->rt_src != saddr ||
1067 rth->u.dst.error ||
1068 rth->rt_gateway != old_gw ||
1069 rth->u.dst.dev != dev)
1070 break;
1071
1072 dst_hold(&rth->u.dst);
1073 rcu_read_unlock();
1074
1075 rt = dst_alloc(&ipv4_dst_ops);
1076 if (rt == NULL) {
1077 ip_rt_put(rth);
1078 in_dev_put(in_dev);
1079 return;
1080 }
1081
1082
1083 *rt = *rth;
1084 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1085 rt->u.dst.__use = 1;
1086 atomic_set(&rt->u.dst.__refcnt, 1);
1087 rt->u.dst.child = NULL;
1088 if (rt->u.dst.dev)
1089 dev_hold(rt->u.dst.dev);
1090 if (rt->idev)
1091 in_dev_hold(rt->idev);
1092 rt->u.dst.obsolete = 0;
1093 rt->u.dst.lastuse = jiffies;
1094 rt->u.dst.path = &rt->u.dst;
1095 rt->u.dst.neighbour = NULL;
1096 rt->u.dst.hh = NULL;
1097 rt->u.dst.xfrm = NULL;
1098
1099 rt->rt_flags |= RTCF_REDIRECTED;
1100
1101
1102 rt->rt_gateway = new_gw;
1103
1104
1105 dst_confirm(&rth->u.dst);
1106
1107 if (rt->peer)
1108 atomic_inc(&rt->peer->refcnt);
1109
1110 if (arp_bind_neighbour(&rt->u.dst) ||
1111 !(rt->u.dst.neighbour->nud_state &
1112 NUD_VALID)) {
1113 if (rt->u.dst.neighbour)
1114 neigh_event_send(rt->u.dst.neighbour, NULL);
1115 ip_rt_put(rth);
1116 rt_drop(rt);
1117 goto do_next;
1118 }
1119
1120 netevent.old = &rth->u.dst;
1121 netevent.new = &rt->u.dst;
1122 call_netevent_notifiers(NETEVENT_REDIRECT,
1123 &netevent);
1124
1125 rt_del(hash, rth);
1126 if (!rt_intern_hash(hash, rt, &rt))
1127 ip_rt_put(rt);
1128 goto do_next;
1129 }
1130 rcu_read_unlock();
1131 do_next:
1132 ;
1133 }
1134 }
1135 in_dev_put(in_dev);
1136 return;
1137
1138reject_redirect:
1139#ifdef CONFIG_IP_ROUTE_VERBOSE
1140 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1141 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1142 "%u.%u.%u.%u ignored.\n"
1143 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1144 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1145 NIPQUAD(saddr), NIPQUAD(daddr));
1146#endif
1147 in_dev_put(in_dev);
1148}
1149
1150static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1151{
1152 struct rtable *rt = (struct rtable*)dst;
1153 struct dst_entry *ret = dst;
1154
1155 if (rt) {
1156 if (dst->obsolete) {
1157 ip_rt_put(rt);
1158 ret = NULL;
1159 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1160 rt->u.dst.expires) {
1161 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1162 rt->fl.oif);
1163#if RT_CACHE_DEBUG >= 1
1164 printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1165 "%u.%u.%u.%u/%02x dropped\n",
1166 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1167#endif
1168 rt_del(hash, rt);
1169 ret = NULL;
1170 }
1171 }
1172 return ret;
1173}
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
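/* Send an ICMP redirect back to the sender of skb, rate-limited per
 * destination with exponential backoff (ip_rt_redirect_load,
 * ip_rt_redirect_number) and reset after ip_rt_redirect_silence of quiet
 * time.
 */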
1191void ip_rt_send_redirect(struct sk_buff *skb)
1192{
1193 struct rtable *rt = (struct rtable*)skb->dst;
1194 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1195
1196 if (!in_dev)
1197 return;
1198
1199 if (!IN_DEV_TX_REDIRECTS(in_dev))
1200 goto out;
1201
1202
1203
1204
1205 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1206 rt->u.dst.rate_tokens = 0;
1207
1208
1209
1210
1211 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1212 rt->u.dst.rate_last = jiffies;
1213 goto out;
1214 }
1215
1216
1217
1218
1219 if (rt->u.dst.rate_tokens == 0 ||
1220 time_after(jiffies,
1221 (rt->u.dst.rate_last +
1222 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1223 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1224 rt->u.dst.rate_last = jiffies;
1225 ++rt->u.dst.rate_tokens;
1226#ifdef CONFIG_IP_ROUTE_VERBOSE
1227 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1228 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1229 net_ratelimit())
1230 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1231 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1232 NIPQUAD(rt->rt_src), rt->rt_iif,
1233 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1234#endif
1235 }
1236out:
1237 in_dev_put(in_dev);
1238}
1239
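/* Input handler for routes carrying an error (no route, prohibited, ...):
 * send the matching ICMP unreachable, throttled by a token bucket
 * (ip_rt_error_cost / ip_rt_error_burst), then drop the packet.
 */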
1240static int ip_error(struct sk_buff *skb)
1241{
1242 struct rtable *rt = (struct rtable*)skb->dst;
1243 unsigned long now;
1244 int code;
1245
1246 switch (rt->u.dst.error) {
1247 case EINVAL:
1248 default:
1249 goto out;
1250 case EHOSTUNREACH:
1251 code = ICMP_HOST_UNREACH;
1252 break;
1253 case ENETUNREACH:
1254 code = ICMP_NET_UNREACH;
1255 IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1256 break;
1257 case EACCES:
1258 code = ICMP_PKT_FILTERED;
1259 break;
1260 }
1261
1262 now = jiffies;
1263 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1264 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1265 rt->u.dst.rate_tokens = ip_rt_error_burst;
1266 rt->u.dst.rate_last = now;
1267 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1268 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1269 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1270 }
1271
1272out: kfree_skb(skb);
1273 return 0;
1274}
1275
1276
1277
1278
1279
1280
1281static const unsigned short mtu_plateau[] =
1282{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1283
1284static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1285{
1286 int i;
1287
1288 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1289 if (old_mtu > mtu_plateau[i])
1290 return mtu_plateau[i];
1291 return 68;
1292}
1293
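/* Process an ICMP "fragmentation needed" message: lower the MTU metric of
 * all matching cached routes, guessing the next plateau value when the
 * router reported no MTU, and clamp to ip_rt_min_pmtu.  Returns the
 * estimated MTU, or new_mtu if no cache entry was updated.
 */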
1294unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1295{
1296 int i;
1297 unsigned short old_mtu = ntohs(iph->tot_len);
1298 struct rtable *rth;
1299 __be32 skeys[2] = { iph->saddr, 0, };
1300 __be32 daddr = iph->daddr;
1301 unsigned short est_mtu = 0;
1302
1303 if (ipv4_config.no_pmtu_disc)
1304 return 0;
1305
1306 for (i = 0; i < 2; i++) {
1307 unsigned hash = rt_hash(daddr, skeys[i], 0);
1308
1309 rcu_read_lock();
1310 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1311 rth = rcu_dereference(rth->u.dst.rt_next)) {
1312 if (rth->fl.fl4_dst == daddr &&
1313 rth->fl.fl4_src == skeys[i] &&
1314 rth->rt_dst == daddr &&
1315 rth->rt_src == iph->saddr &&
1316 rth->fl.iif == 0 &&
1317 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1318 unsigned short mtu = new_mtu;
1319
1320 if (new_mtu < 68 || new_mtu >= old_mtu) {
1321
1322
1323 if (mtu == 0 &&
1324 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1325 old_mtu >= 68 + (iph->ihl << 2))
1326 old_mtu -= iph->ihl << 2;
1327
1328 mtu = guess_mtu(old_mtu);
1329 }
1330 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1331 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1332 dst_confirm(&rth->u.dst);
1333 if (mtu < ip_rt_min_pmtu) {
1334 mtu = ip_rt_min_pmtu;
1335 rth->u.dst.metrics[RTAX_LOCK-1] |=
1336 (1 << RTAX_MTU);
1337 }
1338 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1339 dst_set_expires(&rth->u.dst,
1340 ip_rt_mtu_expires);
1341 }
1342 est_mtu = mtu;
1343 }
1344 }
1345 }
1346 rcu_read_unlock();
1347 }
1348 return est_mtu ? : new_mtu;
1349}
1350
1351static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1352{
1353 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1354 !(dst_metric_locked(dst, RTAX_MTU))) {
1355 if (mtu < ip_rt_min_pmtu) {
1356 mtu = ip_rt_min_pmtu;
1357 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1358 }
1359 dst->metrics[RTAX_MTU-1] = mtu;
1360 dst_set_expires(dst, ip_rt_mtu_expires);
1361 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1362 }
1363}
1364
1365static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1366{
1367 return NULL;
1368}
1369
1370static void ipv4_dst_destroy(struct dst_entry *dst)
1371{
1372 struct rtable *rt = (struct rtable *) dst;
1373 struct inet_peer *peer = rt->peer;
1374 struct in_device *idev = rt->idev;
1375
1376 if (peer) {
1377 rt->peer = NULL;
1378 inet_putpeer(peer);
1379 }
1380
1381 if (idev) {
1382 rt->idev = NULL;
1383 in_dev_put(idev);
1384 }
1385}
1386
1387static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1388 int how)
1389{
1390 struct rtable *rt = (struct rtable *) dst;
1391 struct in_device *idev = rt->idev;
1392 if (dev != init_net.loopback_dev && idev && idev->dev == dev) {
1393 struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev);
1394 if (loopback_idev) {
1395 rt->idev = loopback_idev;
1396 in_dev_put(idev);
1397 }
1398 }
1399}
1400
1401static void ipv4_link_failure(struct sk_buff *skb)
1402{
1403 struct rtable *rt;
1404
1405 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1406
1407 rt = (struct rtable *) skb->dst;
1408 if (rt)
1409 dst_set_expires(&rt->u.dst, 0);
1410}
1411
1412static int ip_rt_bug(struct sk_buff *skb)
1413{
1414 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1415 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1416 skb->dev ? skb->dev->name : "?");
1417 kfree_skb(skb);
1418 return 0;
1419}
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430void ip_rt_get_source(u8 *addr, struct rtable *rt)
1431{
1432 __be32 src;
1433 struct fib_result res;
1434
1435 if (rt->fl.iif == 0)
1436 src = rt->rt_src;
1437 else if (fib_lookup(&rt->fl, &res) == 0) {
1438 src = FIB_RES_PREFSRC(res);
1439 fib_res_put(&res);
1440 } else
1441 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1442 RT_SCOPE_UNIVERSE);
1443 memcpy(addr, &src, 4);
1444}
1445
1446#ifdef CONFIG_NET_CLS_ROUTE
1447static void set_class_tag(struct rtable *rt, u32 tag)
1448{
1449 if (!(rt->u.dst.tclassid & 0xFFFF))
1450 rt->u.dst.tclassid |= tag & 0xFFFF;
1451 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1452 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1453}
1454#endif
1455
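/* Copy next-hop information from the FIB result into the cache entry:
 * gateway, metrics (with sensible defaults for MTU, hoplimit and advmss)
 * and, when configured, the routing classid.
 */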
1456static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1457{
1458 struct fib_info *fi = res->fi;
1459
1460 if (fi) {
1461 if (FIB_RES_GW(*res) &&
1462 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1463 rt->rt_gateway = FIB_RES_GW(*res);
1464 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1465 sizeof(rt->u.dst.metrics));
1466 if (fi->fib_mtu == 0) {
1467 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1468 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1469 rt->rt_gateway != rt->rt_dst &&
1470 rt->u.dst.dev->mtu > 576)
1471 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1472 }
1473#ifdef CONFIG_NET_CLS_ROUTE
1474 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1475#endif
1476 } else
1477 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1478
1479 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1480 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1481 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1482 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1483 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1484 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1485 ip_rt_min_advmss);
1486 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1487 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1488
1489#ifdef CONFIG_NET_CLS_ROUTE
1490#ifdef CONFIG_IP_MULTIPLE_TABLES
1491 set_class_tag(rt, fib_rules_tclass(res));
1492#endif
1493 set_class_tag(rt, itag);
1494#endif
1495 rt->rt_type = res->type;
1496}
1497
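/* Build and intern a cache entry for an incoming multicast packet, either
 * delivered locally (our != 0) and/or handed to the multicast forwarding
 * code when it is compiled in.
 */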
1498static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1499 u8 tos, struct net_device *dev, int our)
1500{
1501 unsigned hash;
1502 struct rtable *rth;
1503 __be32 spec_dst;
1504 struct in_device *in_dev = in_dev_get(dev);
1505 u32 itag = 0;
1506
1507
1508
1509 if (in_dev == NULL)
1510 return -EINVAL;
1511
1512 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1513 skb->protocol != htons(ETH_P_IP))
1514 goto e_inval;
1515
1516 if (ZERONET(saddr)) {
1517 if (!LOCAL_MCAST(daddr))
1518 goto e_inval;
1519 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1520 } else if (fib_validate_source(saddr, 0, tos, 0,
1521 dev, &spec_dst, &itag) < 0)
1522 goto e_inval;
1523
1524 rth = dst_alloc(&ipv4_dst_ops);
1525 if (!rth)
1526 goto e_nobufs;
1527
1528 rth->u.dst.output= ip_rt_bug;
1529
1530 atomic_set(&rth->u.dst.__refcnt, 1);
1531 rth->u.dst.flags= DST_HOST;
1532 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1533 rth->u.dst.flags |= DST_NOPOLICY;
1534 rth->fl.fl4_dst = daddr;
1535 rth->rt_dst = daddr;
1536 rth->fl.fl4_tos = tos;
1537 rth->fl.mark = skb->mark;
1538 rth->fl.fl4_src = saddr;
1539 rth->rt_src = saddr;
1540#ifdef CONFIG_NET_CLS_ROUTE
1541 rth->u.dst.tclassid = itag;
1542#endif
1543 rth->rt_iif =
1544 rth->fl.iif = dev->ifindex;
1545 rth->u.dst.dev = init_net.loopback_dev;
1546 dev_hold(rth->u.dst.dev);
1547 rth->idev = in_dev_get(rth->u.dst.dev);
1548 rth->fl.oif = 0;
1549 rth->rt_gateway = daddr;
1550 rth->rt_spec_dst= spec_dst;
1551 rth->rt_type = RTN_MULTICAST;
1552 rth->rt_flags = RTCF_MULTICAST;
1553 if (our) {
1554 rth->u.dst.input= ip_local_deliver;
1555 rth->rt_flags |= RTCF_LOCAL;
1556 }
1557
1558#ifdef CONFIG_IP_MROUTE
1559 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1560 rth->u.dst.input = ip_mr_input;
1561#endif
1562 RT_CACHE_STAT_INC(in_slow_mc);
1563
1564 in_dev_put(in_dev);
1565 hash = rt_hash(daddr, saddr, dev->ifindex);
1566 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1567
1568e_nobufs:
1569 in_dev_put(in_dev);
1570 return -ENOBUFS;
1571
1572e_inval:
1573 in_dev_put(in_dev);
1574 return -EINVAL;
1575}
1576
1577
1578static void ip_handle_martian_source(struct net_device *dev,
1579 struct in_device *in_dev,
1580 struct sk_buff *skb,
1581 __be32 daddr,
1582 __be32 saddr)
1583{
1584 RT_CACHE_STAT_INC(in_martian_src);
1585#ifdef CONFIG_IP_ROUTE_VERBOSE
1586 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1587
1588
1589
1590
1591 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1592 "%u.%u.%u.%u, on dev %s\n",
1593 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1594 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1595 int i;
1596 const unsigned char *p = skb_mac_header(skb);
1597 printk(KERN_WARNING "ll header: ");
1598 for (i = 0; i < dev->hard_header_len; i++, p++) {
1599 printk("%02x", *p);
1600 if (i < (dev->hard_header_len - 1))
1601 printk(":");
1602 }
1603 printk("\n");
1604 }
1605 }
1606#endif
1607}
1608
1609static inline int __mkroute_input(struct sk_buff *skb,
1610 struct fib_result* res,
1611 struct in_device *in_dev,
1612 __be32 daddr, __be32 saddr, u32 tos,
1613 struct rtable **result)
1614{
1615
1616 struct rtable *rth;
1617 int err;
1618 struct in_device *out_dev;
1619 unsigned flags = 0;
1620 __be32 spec_dst;
1621 u32 itag;
1622
1623
1624 out_dev = in_dev_get(FIB_RES_DEV(*res));
1625 if (out_dev == NULL) {
1626 if (net_ratelimit())
1627 printk(KERN_CRIT "Bug in ip_route_input" \
1628 "_slow(). Please, report\n");
1629 return -EINVAL;
1630 }
1631
1632
1633 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1634 in_dev->dev, &spec_dst, &itag);
1635 if (err < 0) {
1636 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1637 saddr);
1638
1639 err = -EINVAL;
1640 goto cleanup;
1641 }
1642
1643 if (err)
1644 flags |= RTCF_DIRECTSRC;
1645
1646 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1647 (IN_DEV_SHARED_MEDIA(out_dev) ||
1648 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1649 flags |= RTCF_DOREDIRECT;
1650
1651 if (skb->protocol != htons(ETH_P_IP)) {
1652
1653
1654
1655 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1656 err = -EINVAL;
1657 goto cleanup;
1658 }
1659 }
1660
1661
1662 rth = dst_alloc(&ipv4_dst_ops);
1663 if (!rth) {
1664 err = -ENOBUFS;
1665 goto cleanup;
1666 }
1667
1668 atomic_set(&rth->u.dst.__refcnt, 1);
1669 rth->u.dst.flags= DST_HOST;
1670 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1671 rth->u.dst.flags |= DST_NOPOLICY;
1672 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1673 rth->u.dst.flags |= DST_NOXFRM;
1674 rth->fl.fl4_dst = daddr;
1675 rth->rt_dst = daddr;
1676 rth->fl.fl4_tos = tos;
1677 rth->fl.mark = skb->mark;
1678 rth->fl.fl4_src = saddr;
1679 rth->rt_src = saddr;
1680 rth->rt_gateway = daddr;
1681 rth->rt_iif =
1682 rth->fl.iif = in_dev->dev->ifindex;
1683 rth->u.dst.dev = (out_dev)->dev;
1684 dev_hold(rth->u.dst.dev);
1685 rth->idev = in_dev_get(rth->u.dst.dev);
1686 rth->fl.oif = 0;
1687 rth->rt_spec_dst= spec_dst;
1688
1689 rth->u.dst.input = ip_forward;
1690 rth->u.dst.output = ip_output;
1691
1692 rt_set_nexthop(rth, res, itag);
1693
1694 rth->rt_flags = flags;
1695
1696 *result = rth;
1697 err = 0;
1698 cleanup:
1699
1700 in_dev_put(out_dev);
1701 return err;
1702}
1703
1704static inline int ip_mkroute_input(struct sk_buff *skb,
1705 struct fib_result* res,
1706 const struct flowi *fl,
1707 struct in_device *in_dev,
1708 __be32 daddr, __be32 saddr, u32 tos)
1709{
1710 struct rtable* rth = NULL;
1711 int err;
1712 unsigned hash;
1713
1714#ifdef CONFIG_IP_ROUTE_MULTIPATH
1715 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1716 fib_select_multipath(fl, res);
1717#endif
1718
1719
1720 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1721 if (err)
1722 return err;
1723
1724
1725 hash = rt_hash(daddr, saddr, fl->iif);
1726 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1727}
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
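/* Slow path for input routing: run martian checks on source and
 * destination, look the destination up in the FIB and build a local,
 * broadcast or forwarding cache entry accordingly, then intern it in the
 * hash table.
 */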
1739static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1740 u8 tos, struct net_device *dev)
1741{
1742 struct fib_result res;
1743 struct in_device *in_dev = in_dev_get(dev);
1744 struct flowi fl = { .nl_u = { .ip4_u =
1745 { .daddr = daddr,
1746 .saddr = saddr,
1747 .tos = tos,
1748 .scope = RT_SCOPE_UNIVERSE,
1749 } },
1750 .mark = skb->mark,
1751 .iif = dev->ifindex };
1752 unsigned flags = 0;
1753 u32 itag = 0;
1754 struct rtable * rth;
1755 unsigned hash;
1756 __be32 spec_dst;
1757 int err = -EINVAL;
1758 int free_res = 0;
1759
1760
1761
1762 if (!in_dev)
1763 goto out;
1764
1765
1766
1767
1768
1769 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1770 goto martian_source;
1771
1772 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1773 goto brd_input;
1774
1775
1776
1777
1778 if (ZERONET(saddr))
1779 goto martian_source;
1780
1781 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1782 goto martian_destination;
1783
1784
1785
1786
1787 if ((err = fib_lookup(&fl, &res)) != 0) {
1788 if (!IN_DEV_FORWARD(in_dev))
1789 goto e_hostunreach;
1790 goto no_route;
1791 }
1792 free_res = 1;
1793
1794 RT_CACHE_STAT_INC(in_slow_tot);
1795
1796 if (res.type == RTN_BROADCAST)
1797 goto brd_input;
1798
1799 if (res.type == RTN_LOCAL) {
1800 int result;
1801 result = fib_validate_source(saddr, daddr, tos,
1802 init_net.loopback_dev->ifindex,
1803 dev, &spec_dst, &itag);
1804 if (result < 0)
1805 goto martian_source;
1806 if (result)
1807 flags |= RTCF_DIRECTSRC;
1808 spec_dst = daddr;
1809 goto local_input;
1810 }
1811
1812 if (!IN_DEV_FORWARD(in_dev))
1813 goto e_hostunreach;
1814 if (res.type != RTN_UNICAST)
1815 goto martian_destination;
1816
1817 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1818done:
1819 in_dev_put(in_dev);
1820 if (free_res)
1821 fib_res_put(&res);
1822out: return err;
1823
1824brd_input:
1825 if (skb->protocol != htons(ETH_P_IP))
1826 goto e_inval;
1827
1828 if (ZERONET(saddr))
1829 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1830 else {
1831 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1832 &itag);
1833 if (err < 0)
1834 goto martian_source;
1835 if (err)
1836 flags |= RTCF_DIRECTSRC;
1837 }
1838 flags |= RTCF_BROADCAST;
1839 res.type = RTN_BROADCAST;
1840 RT_CACHE_STAT_INC(in_brd);
1841
1842local_input:
1843 rth = dst_alloc(&ipv4_dst_ops);
1844 if (!rth)
1845 goto e_nobufs;
1846
1847 rth->u.dst.output= ip_rt_bug;
1848
1849 atomic_set(&rth->u.dst.__refcnt, 1);
1850 rth->u.dst.flags= DST_HOST;
1851 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1852 rth->u.dst.flags |= DST_NOPOLICY;
1853 rth->fl.fl4_dst = daddr;
1854 rth->rt_dst = daddr;
1855 rth->fl.fl4_tos = tos;
1856 rth->fl.mark = skb->mark;
1857 rth->fl.fl4_src = saddr;
1858 rth->rt_src = saddr;
1859#ifdef CONFIG_NET_CLS_ROUTE
1860 rth->u.dst.tclassid = itag;
1861#endif
1862 rth->rt_iif =
1863 rth->fl.iif = dev->ifindex;
1864 rth->u.dst.dev = init_net.loopback_dev;
1865 dev_hold(rth->u.dst.dev);
1866 rth->idev = in_dev_get(rth->u.dst.dev);
1867 rth->rt_gateway = daddr;
1868 rth->rt_spec_dst= spec_dst;
1869 rth->u.dst.input= ip_local_deliver;
1870 rth->rt_flags = flags|RTCF_LOCAL;
1871 if (res.type == RTN_UNREACHABLE) {
1872 rth->u.dst.input= ip_error;
1873 rth->u.dst.error= -err;
1874 rth->rt_flags &= ~RTCF_LOCAL;
1875 }
1876 rth->rt_type = res.type;
1877 hash = rt_hash(daddr, saddr, fl.iif);
1878 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1879 goto done;
1880
1881no_route:
1882 RT_CACHE_STAT_INC(in_no_route);
1883 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1884 res.type = RTN_UNREACHABLE;
1885 if (err == -ESRCH)
1886 err = -ENETUNREACH;
1887 goto local_input;
1888
1889
1890
1891
1892martian_destination:
1893 RT_CACHE_STAT_INC(in_martian_dst);
1894#ifdef CONFIG_IP_ROUTE_VERBOSE
1895 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1896 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1897 "%u.%u.%u.%u, dev %s\n",
1898 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1899#endif
1900
1901e_hostunreach:
1902 err = -EHOSTUNREACH;
1903 goto done;
1904
1905e_inval:
1906 err = -EINVAL;
1907 goto done;
1908
1909e_nobufs:
1910 err = -ENOBUFS;
1911 goto done;
1912
1913martian_source:
1914 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1915 goto e_inval;
1916}
1917
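/* Input routing entry point: try a route cache lookup first; on a miss,
 * multicast destinations are handled by ip_route_input_mc() and everything
 * else falls through to ip_route_input_slow().
 */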
1918int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1919 u8 tos, struct net_device *dev)
1920{
1921 struct rtable * rth;
1922 unsigned hash;
1923 int iif = dev->ifindex;
1924
1925 tos &= IPTOS_RT_MASK;
1926 hash = rt_hash(daddr, saddr, iif);
1927
1928 rcu_read_lock();
1929 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1930 rth = rcu_dereference(rth->u.dst.rt_next)) {
1931 if (rth->fl.fl4_dst == daddr &&
1932 rth->fl.fl4_src == saddr &&
1933 rth->fl.iif == iif &&
1934 rth->fl.oif == 0 &&
1935 rth->fl.mark == skb->mark &&
1936 rth->fl.fl4_tos == tos) {
1937 dst_use(&rth->u.dst, jiffies);
1938 RT_CACHE_STAT_INC(in_hit);
1939 rcu_read_unlock();
1940 skb->dst = (struct dst_entry*)rth;
1941 return 0;
1942 }
1943 RT_CACHE_STAT_INC(in_hlist_search);
1944 }
1945 rcu_read_unlock();
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958 if (MULTICAST(daddr)) {
1959 struct in_device *in_dev;
1960
1961 rcu_read_lock();
1962 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
1963 int our = ip_check_mc(in_dev, daddr, saddr,
1964 ip_hdr(skb)->protocol);
1965 if (our
1966#ifdef CONFIG_IP_MROUTE
1967 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1968#endif
1969 ) {
1970 rcu_read_unlock();
1971 return ip_route_input_mc(skb, daddr, saddr,
1972 tos, dev, our);
1973 }
1974 }
1975 rcu_read_unlock();
1976 return -EINVAL;
1977 }
1978 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1979}
1980
1981static inline int __mkroute_output(struct rtable **result,
1982 struct fib_result* res,
1983 const struct flowi *fl,
1984 const struct flowi *oldflp,
1985 struct net_device *dev_out,
1986 unsigned flags)
1987{
1988 struct rtable *rth;
1989 struct in_device *in_dev;
1990 u32 tos = RT_FL_TOS(oldflp);
1991 int err = 0;
1992
1993 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
1994 return -EINVAL;
1995
1996 if (fl->fl4_dst == htonl(0xFFFFFFFF))
1997 res->type = RTN_BROADCAST;
1998 else if (MULTICAST(fl->fl4_dst))
1999 res->type = RTN_MULTICAST;
2000 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2001 return -EINVAL;
2002
2003 if (dev_out->flags & IFF_LOOPBACK)
2004 flags |= RTCF_LOCAL;
2005
2006
2007 in_dev = in_dev_get(dev_out);
2008 if (!in_dev)
2009 return -EINVAL;
2010
2011 if (res->type == RTN_BROADCAST) {
2012 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2013 if (res->fi) {
2014 fib_info_put(res->fi);
2015 res->fi = NULL;
2016 }
2017 } else if (res->type == RTN_MULTICAST) {
2018 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2019 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2020 oldflp->proto))
2021 flags &= ~RTCF_LOCAL;
2022
2023
2024
2025
2026 if (res->fi && res->prefixlen < 4) {
2027 fib_info_put(res->fi);
2028 res->fi = NULL;
2029 }
2030 }
2031
2032
2033 rth = dst_alloc(&ipv4_dst_ops);
2034 if (!rth) {
2035 err = -ENOBUFS;
2036 goto cleanup;
2037 }
2038
2039 atomic_set(&rth->u.dst.__refcnt, 1);
2040 rth->u.dst.flags= DST_HOST;
2041 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2042 rth->u.dst.flags |= DST_NOXFRM;
2043 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2044 rth->u.dst.flags |= DST_NOPOLICY;
2045
2046 rth->fl.fl4_dst = oldflp->fl4_dst;
2047 rth->fl.fl4_tos = tos;
2048 rth->fl.fl4_src = oldflp->fl4_src;
2049 rth->fl.oif = oldflp->oif;
2050 rth->fl.mark = oldflp->mark;
2051 rth->rt_dst = fl->fl4_dst;
2052 rth->rt_src = fl->fl4_src;
2053 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2054
2055
2056 rth->u.dst.dev = dev_out;
2057 dev_hold(dev_out);
2058 rth->idev = in_dev_get(dev_out);
2059 rth->rt_gateway = fl->fl4_dst;
2060 rth->rt_spec_dst= fl->fl4_src;
2061
2062 rth->u.dst.output=ip_output;
2063
2064 RT_CACHE_STAT_INC(out_slow_tot);
2065
2066 if (flags & RTCF_LOCAL) {
2067 rth->u.dst.input = ip_local_deliver;
2068 rth->rt_spec_dst = fl->fl4_dst;
2069 }
2070 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2071 rth->rt_spec_dst = fl->fl4_src;
2072 if (flags & RTCF_LOCAL &&
2073 !(dev_out->flags & IFF_LOOPBACK)) {
2074 rth->u.dst.output = ip_mc_output;
2075 RT_CACHE_STAT_INC(out_slow_mc);
2076 }
2077#ifdef CONFIG_IP_MROUTE
2078 if (res->type == RTN_MULTICAST) {
2079 if (IN_DEV_MFORWARD(in_dev) &&
2080 !LOCAL_MCAST(oldflp->fl4_dst)) {
2081 rth->u.dst.input = ip_mr_input;
2082 rth->u.dst.output = ip_mc_output;
2083 }
2084 }
2085#endif
2086 }
2087
2088 rt_set_nexthop(rth, res, 0);
2089
2090 rth->rt_flags = flags;
2091
2092 *result = rth;
2093 cleanup:
2094
2095 in_dev_put(in_dev);
2096
2097 return err;
2098}
2099
2100static inline int ip_mkroute_output(struct rtable **rp,
2101 struct fib_result* res,
2102 const struct flowi *fl,
2103 const struct flowi *oldflp,
2104 struct net_device *dev_out,
2105 unsigned flags)
2106{
2107 struct rtable *rth = NULL;
2108 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2109 unsigned hash;
2110 if (err == 0) {
2111 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2112 err = rt_intern_hash(hash, rth, rp);
2113 }
2114
2115 return err;
2116}
2117
2118
2119
2120
2121
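/* Slow path for output routing: resolve the output device and source
 * address, consult the FIB (treating the destination as on-link when a
 * bound interface is given but no route exists), and build the cache
 * entry via ip_mkroute_output().
 */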
2122static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2123{
2124 u32 tos = RT_FL_TOS(oldflp);
2125 struct flowi fl = { .nl_u = { .ip4_u =
2126 { .daddr = oldflp->fl4_dst,
2127 .saddr = oldflp->fl4_src,
2128 .tos = tos & IPTOS_RT_MASK,
2129 .scope = ((tos & RTO_ONLINK) ?
2130 RT_SCOPE_LINK :
2131 RT_SCOPE_UNIVERSE),
2132 } },
2133 .mark = oldflp->mark,
2134 .iif = init_net.loopback_dev->ifindex,
2135 .oif = oldflp->oif };
2136 struct fib_result res;
2137 unsigned flags = 0;
2138 struct net_device *dev_out = NULL;
2139 int free_res = 0;
2140 int err;
2141
2142
2143 res.fi = NULL;
2144#ifdef CONFIG_IP_MULTIPLE_TABLES
2145 res.r = NULL;
2146#endif
2147
2148 if (oldflp->fl4_src) {
2149 err = -EINVAL;
2150 if (MULTICAST(oldflp->fl4_src) ||
2151 BADCLASS(oldflp->fl4_src) ||
2152 ZERONET(oldflp->fl4_src))
2153 goto out;
2154
2155
2156 dev_out = ip_dev_find(oldflp->fl4_src);
2157 if (dev_out == NULL)
2158 goto out;
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168 if (oldflp->oif == 0
2169 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185 fl.oif = dev_out->ifindex;
2186 goto make_route;
2187 }
2188 if (dev_out)
2189 dev_put(dev_out);
2190 dev_out = NULL;
2191 }
2192
2193
2194 if (oldflp->oif) {
2195 dev_out = dev_get_by_index(&init_net, oldflp->oif);
2196 err = -ENODEV;
2197 if (dev_out == NULL)
2198 goto out;
2199
2200
2201 if (__in_dev_get_rtnl(dev_out) == NULL) {
2202 dev_put(dev_out);
2203 goto out;
2204 }
2205
2206 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2207 if (!fl.fl4_src)
2208 fl.fl4_src = inet_select_addr(dev_out, 0,
2209 RT_SCOPE_LINK);
2210 goto make_route;
2211 }
2212 if (!fl.fl4_src) {
2213 if (MULTICAST(oldflp->fl4_dst))
2214 fl.fl4_src = inet_select_addr(dev_out, 0,
2215 fl.fl4_scope);
2216 else if (!oldflp->fl4_dst)
2217 fl.fl4_src = inet_select_addr(dev_out, 0,
2218 RT_SCOPE_HOST);
2219 }
2220 }
2221
2222 if (!fl.fl4_dst) {
2223 fl.fl4_dst = fl.fl4_src;
2224 if (!fl.fl4_dst)
2225 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2226 if (dev_out)
2227 dev_put(dev_out);
2228 dev_out = init_net.loopback_dev;
2229 dev_hold(dev_out);
2230 fl.oif = init_net.loopback_dev->ifindex;
2231 res.type = RTN_LOCAL;
2232 flags |= RTCF_LOCAL;
2233 goto make_route;
2234 }
2235
2236 if (fib_lookup(&fl, &res)) {
2237 res.fi = NULL;
2238 if (oldflp->oif) {
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257 if (fl.fl4_src == 0)
2258 fl.fl4_src = inet_select_addr(dev_out, 0,
2259 RT_SCOPE_LINK);
2260 res.type = RTN_UNICAST;
2261 goto make_route;
2262 }
2263 if (dev_out)
2264 dev_put(dev_out);
2265 err = -ENETUNREACH;
2266 goto out;
2267 }
2268 free_res = 1;
2269
2270 if (res.type == RTN_LOCAL) {
2271 if (!fl.fl4_src)
2272 fl.fl4_src = fl.fl4_dst;
2273 if (dev_out)
2274 dev_put(dev_out);
2275 dev_out = init_net.loopback_dev;
2276 dev_hold(dev_out);
2277 fl.oif = dev_out->ifindex;
2278 if (res.fi)
2279 fib_info_put(res.fi);
2280 res.fi = NULL;
2281 flags |= RTCF_LOCAL;
2282 goto make_route;
2283 }
2284
2285#ifdef CONFIG_IP_ROUTE_MULTIPATH
2286 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2287 fib_select_multipath(&fl, &res);
2288 else
2289#endif
2290 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2291 fib_select_default(&fl, &res);
2292
2293 if (!fl.fl4_src)
2294 fl.fl4_src = FIB_RES_PREFSRC(res);
2295
2296 if (dev_out)
2297 dev_put(dev_out);
2298 dev_out = FIB_RES_DEV(res);
2299 dev_hold(dev_out);
2300 fl.oif = dev_out->ifindex;
2301
2302
2303make_route:
2304 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2305
2306
2307 if (free_res)
2308 fib_res_put(&res);
2309 if (dev_out)
2310 dev_put(dev_out);
2311out: return err;
2312}
2313
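/* Output routing entry point: look the flow up in the route cache and fall
 * back to ip_route_output_slow() on a miss.
 */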
2314int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2315{
2316 unsigned hash;
2317 struct rtable *rth;
2318
2319 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2320
2321 rcu_read_lock_bh();
2322 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2323 rth = rcu_dereference(rth->u.dst.rt_next)) {
2324 if (rth->fl.fl4_dst == flp->fl4_dst &&
2325 rth->fl.fl4_src == flp->fl4_src &&
2326 rth->fl.iif == 0 &&
2327 rth->fl.oif == flp->oif &&
2328 rth->fl.mark == flp->mark &&
2329 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2330 (IPTOS_RT_MASK | RTO_ONLINK))) {
2331 dst_use(&rth->u.dst, jiffies);
2332 RT_CACHE_STAT_INC(out_hit);
2333 rcu_read_unlock_bh();
2334 *rp = rth;
2335 return 0;
2336 }
2337 RT_CACHE_STAT_INC(out_hlist_search);
2338 }
2339 rcu_read_unlock_bh();
2340
2341 return ip_route_output_slow(rp, flp);
2342}
2343
2344EXPORT_SYMBOL_GPL(__ip_route_output_key);
2345
2346static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2347{
2348}
2349
2350static struct dst_ops ipv4_dst_blackhole_ops = {
2351 .family = AF_INET,
2352 .protocol = __constant_htons(ETH_P_IP),
2353 .destroy = ipv4_dst_destroy,
2354 .check = ipv4_dst_check,
2355 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2356 .entry_size = sizeof(struct rtable),
2357};
2358
2359
2360static int ipv4_blackhole_output(struct sk_buff *skb)
2361{
2362 kfree_skb(skb);
2363 return 0;
2364}
2365
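/* Replace *rp with a blackhole copy of the route: same addressing
 * information, but input and output simply discard packets.  Used when
 * __xfrm_lookup() returns -EREMOTE.
 */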
2366static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2367{
2368 struct rtable *ort = *rp;
2369 struct rtable *rt = (struct rtable *)
2370 dst_alloc(&ipv4_dst_blackhole_ops);
2371
2372 if (rt) {
2373 struct dst_entry *new = &rt->u.dst;
2374
2375 atomic_set(&new->__refcnt, 1);
2376 new->__use = 1;
2377 new->input = ipv4_blackhole_output;
2378 new->output = ipv4_blackhole_output;
2379 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2380
2381 new->dev = ort->u.dst.dev;
2382 if (new->dev)
2383 dev_hold(new->dev);
2384
2385 rt->fl = ort->fl;
2386
2387 rt->idev = ort->idev;
2388 if (rt->idev)
2389 in_dev_hold(rt->idev);
2390 rt->rt_flags = ort->rt_flags;
2391 rt->rt_type = ort->rt_type;
2392 rt->rt_dst = ort->rt_dst;
2393 rt->rt_src = ort->rt_src;
2394 rt->rt_iif = ort->rt_iif;
2395 rt->rt_gateway = ort->rt_gateway;
2396 rt->rt_spec_dst = ort->rt_spec_dst;
2397 rt->peer = ort->peer;
2398 if (rt->peer)
2399 atomic_inc(&rt->peer->refcnt);
2400
2401 dst_free(new);
2402 }
2403
2404 dst_release(&(*rp)->u.dst);
2405 *rp = rt;
2406 return (rt ? 0 : -ENOMEM);
2407}
2408
2409int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2410{
2411 int err;
2412
2413 if ((err = __ip_route_output_key(rp, flp)) != 0)
2414 return err;
2415
2416 if (flp->proto) {
2417 if (!flp->fl4_src)
2418 flp->fl4_src = (*rp)->rt_src;
2419 if (!flp->fl4_dst)
2420 flp->fl4_dst = (*rp)->rt_dst;
2421 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2422 if (err == -EREMOTE)
2423 err = ipv4_dst_blackhole(rp, flp, sk);
2424
2425 return err;
2426 }
2427
2428 return 0;
2429}
2430
2431EXPORT_SYMBOL_GPL(ip_route_output_flow);
2432
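/* Convenience wrapper around ip_route_output_flow() with no socket and no flags. */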
2433int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2434{
2435 return ip_route_output_flow(rp, flp, NULL, 0);
2436}
2437
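/*
 * Build an RTM_NEWROUTE netlink message for the cached route attached
 * to skb->dst: route type and flags, addresses, output device, metrics
 * and cache info, plus the multicast forwarding state for multicast
 * input routes when CONFIG_IP_MROUTE is enabled.
 */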
2438static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2439 int nowait, unsigned int flags)
2440{
2441 struct rtable *rt = (struct rtable*)skb->dst;
2442 struct rtmsg *r;
2443 struct nlmsghdr *nlh;
2444 long expires;
2445 u32 id = 0, ts = 0, tsage = 0, error;
2446
2447 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2448 if (nlh == NULL)
2449 return -EMSGSIZE;
2450
2451 r = nlmsg_data(nlh);
2452 r->rtm_family = AF_INET;
2453 r->rtm_dst_len = 32;
2454 r->rtm_src_len = 0;
2455 r->rtm_tos = rt->fl.fl4_tos;
2456 r->rtm_table = RT_TABLE_MAIN;
2457 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2458 r->rtm_type = rt->rt_type;
2459 r->rtm_scope = RT_SCOPE_UNIVERSE;
2460 r->rtm_protocol = RTPROT_UNSPEC;
2461 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2462 if (rt->rt_flags & RTCF_NOTIFY)
2463 r->rtm_flags |= RTM_F_NOTIFY;
2464
2465 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2466
2467 if (rt->fl.fl4_src) {
2468 r->rtm_src_len = 32;
2469 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2470 }
2471 if (rt->u.dst.dev)
2472 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2473#ifdef CONFIG_NET_CLS_ROUTE
2474 if (rt->u.dst.tclassid)
2475 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2476#endif
2477 if (rt->fl.iif)
2478 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2479 else if (rt->rt_src != rt->fl.fl4_src)
2480 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2481
2482 if (rt->rt_dst != rt->rt_gateway)
2483 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2484
2485 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2486 goto nla_put_failure;
2487
2488 error = rt->u.dst.error;
2489 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2490 if (rt->peer) {
2491 id = rt->peer->ip_id_count;
2492 if (rt->peer->tcp_ts_stamp) {
2493 ts = rt->peer->tcp_ts;
2494 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2495 }
2496 }
2497
2498 if (rt->fl.iif) {
2499#ifdef CONFIG_IP_MROUTE
2500 __be32 dst = rt->rt_dst;
2501
2502 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2503 IPV4_DEVCONF_ALL(MC_FORWARDING)) {
2504 int err = ipmr_get_route(skb, r, nowait);
2505 if (err <= 0) {
2506 if (!nowait) {
2507 if (err == 0)
2508 return 0;
2509 goto nla_put_failure;
2510 } else {
2511 if (err == -EMSGSIZE)
2512 goto nla_put_failure;
2513 error = err;
2514 }
2515 }
2516 } else
2517#endif
2518 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2519 }
2520
2521 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2522 expires, error) < 0)
2523 goto nla_put_failure;
2524
2525 return nlmsg_end(skb, nlh);
2526
2527nla_put_failure:
2528 nlmsg_cancel(skb, nlh);
2529 return -EMSGSIZE;
2530}
2531
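/*
 * RTM_GETROUTE handler.  A dummy skb is built and either fed through
 * ip_route_input() (when RTA_IIF is supplied, simulating reception on
 * that interface) or resolved with ip_route_output_key(); the resulting
 * route is then sent back to the requester via rt_fill_info() and
 * rtnl_unicast().
 */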
2532 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2533{
2534 struct rtmsg *rtm;
2535 struct nlattr *tb[RTA_MAX+1];
2536 struct rtable *rt = NULL;
2537 __be32 dst = 0;
2538 __be32 src = 0;
2539 u32 iif;
2540 int err;
2541 struct sk_buff *skb;
2542
2543 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2544 if (err < 0)
2545 goto errout;
2546
2547 rtm = nlmsg_data(nlh);
2548
2549 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2550 if (skb == NULL) {
2551 err = -ENOBUFS;
2552 goto errout;
2553 }
2554
2555 /* Reserve room for dummy headers; this skb can pass through a
2556  * good chunk of the routing engine.
2557  */
2558 skb_reset_mac_header(skb);
2559 skb_reset_network_header(skb);
2560
2561 /* Give ip_route_input() enough of an IP header to work with. */
2562 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2563 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2564
2565 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2566 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2567 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2568
2569 if (iif) {
2570 struct net_device *dev;
2571
2572 dev = __dev_get_by_index(&init_net, iif);
2573 if (dev == NULL) {
2574 err = -ENODEV;
2575 goto errout_free;
2576 }
2577
2578 skb->protocol = htons(ETH_P_IP);
2579 skb->dev = dev;
2580 local_bh_disable();
2581 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2582 local_bh_enable();
2583
2584 rt = (struct rtable*) skb->dst;
2585 if (err == 0 && rt->u.dst.error)
2586 err = -rt->u.dst.error;
2587 } else {
2588 struct flowi fl = {
2589 .nl_u = {
2590 .ip4_u = {
2591 .daddr = dst,
2592 .saddr = src,
2593 .tos = rtm->rtm_tos,
2594 },
2595 },
2596 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2597 };
2598 err = ip_route_output_key(&rt, &fl);
2599 }
2600
2601 if (err)
2602 goto errout_free;
2603
2604 skb->dst = &rt->u.dst;
2605 if (rtm->rtm_flags & RTM_F_NOTIFY)
2606 rt->rt_flags |= RTCF_NOTIFY;
2607
2608 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2609 RTM_NEWROUTE, 0, 0);
2610 if (err <= 0)
2611 goto errout_free;
2612
2613 err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2614errout:
2615 return err;
2616
2617errout_free:
2618 kfree_skb(skb);
2619 goto errout;
2620}
2621
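/*
 * Netlink dump callback: walk every bucket of the route cache hash table
 * and emit one RTM_NEWROUTE message per entry, resuming from the bucket
 * and chain index saved in cb->args[] by the previous call.
 */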
2622int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2623{
2624 struct rtable *rt;
2625 int h, s_h;
2626 int idx, s_idx;
2627
2628 s_h = cb->args[0];
2629 if (s_h < 0)
2630 s_h = 0;
2631 s_idx = idx = cb->args[1];
2632 for (h = s_h; h <= rt_hash_mask; h++) {
2633 rcu_read_lock_bh();
2634 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2635 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2636 if (idx < s_idx)
2637 continue;
2638 skb->dst = dst_clone(&rt->u.dst);
2639 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2640 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2641 1, NLM_F_MULTI) <= 0) {
2642 dst_release(xchg(&skb->dst, NULL));
2643 rcu_read_unlock_bh();
2644 goto done;
2645 }
2646 dst_release(xchg(&skb->dst, NULL));
2647 }
2648 rcu_read_unlock_bh();
2649 s_idx = 0;
2650 }
2651
2652done:
2653 cb->args[0] = h;
2654 cb->args[1] = idx;
2655 return skb->len;
2656}
2657
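/* A device's multicast configuration changed: flush the route cache. */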
2658void ip_rt_multicast_event(struct in_device *in_dev)
2659{
2660 rt_cache_flush(0);
2661}
2662
2663#ifdef CONFIG_SYSCTL
2664static int flush_delay;
2665
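/*
 * proc handler for the write-only "flush" sysctl: proc_dointvec() stores
 * the written value in flush_delay, which is then passed to
 * rt_cache_flush() as the flush delay.  Reads are rejected.
 */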
2666static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2667 struct file *filp, void __user *buffer,
2668 size_t *lenp, loff_t *ppos)
2669{
2670 if (write) {
2671 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2672 rt_cache_flush(flush_delay);
2673 return 0;
2674 }
2675
2676 return -EINVAL;
2677}
2678
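/* Binary sysctl(2) counterpart of the "flush" handler above. */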
2679static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2680 int __user *name,
2681 int nlen,
2682 void __user *oldval,
2683 size_t __user *oldlenp,
2684 void __user *newval,
2685 size_t newlen)
2686{
2687 int delay;
2688 if (newlen != sizeof(int))
2689 return -EINVAL;
2690 if (get_user(delay, (int __user *)newval))
2691 return -EFAULT;
2692 rt_cache_flush(delay);
2693 return 0;
2694}
2695
2696ctl_table ipv4_route_table[] = {
2697 {
2698 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2699 .procname = "flush",
2700 .data = &flush_delay,
2701 .maxlen = sizeof(int),
2702 .mode = 0200,
2703 .proc_handler = &ipv4_sysctl_rtcache_flush,
2704 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2705 },
2706 {
2707 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2708 .procname = "min_delay",
2709 .data = &ip_rt_min_delay,
2710 .maxlen = sizeof(int),
2711 .mode = 0644,
2712 .proc_handler = &proc_dointvec_jiffies,
2713 .strategy = &sysctl_jiffies,
2714 },
2715 {
2716 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2717 .procname = "max_delay",
2718 .data = &ip_rt_max_delay,
2719 .maxlen = sizeof(int),
2720 .mode = 0644,
2721 .proc_handler = &proc_dointvec_jiffies,
2722 .strategy = &sysctl_jiffies,
2723 },
2724 {
2725 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2726 .procname = "gc_thresh",
2727 .data = &ipv4_dst_ops.gc_thresh,
2728 .maxlen = sizeof(int),
2729 .mode = 0644,
2730 .proc_handler = &proc_dointvec,
2731 },
2732 {
2733 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2734 .procname = "max_size",
2735 .data = &ip_rt_max_size,
2736 .maxlen = sizeof(int),
2737 .mode = 0644,
2738 .proc_handler = &proc_dointvec,
2739 },
2740 {
2741 /* Deprecated: use gc_min_interval_ms below, which acts on the
2742  * same variable with millisecond resolution. */
2743 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2744 .procname = "gc_min_interval",
2745 .data = &ip_rt_gc_min_interval,
2746 .maxlen = sizeof(int),
2747 .mode = 0644,
2748 .proc_handler = &proc_dointvec_jiffies,
2749 .strategy = &sysctl_jiffies,
2750 },
2751 {
2752 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2753 .procname = "gc_min_interval_ms",
2754 .data = &ip_rt_gc_min_interval,
2755 .maxlen = sizeof(int),
2756 .mode = 0644,
2757 .proc_handler = &proc_dointvec_ms_jiffies,
2758 .strategy = &sysctl_ms_jiffies,
2759 },
2760 {
2761 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2762 .procname = "gc_timeout",
2763 .data = &ip_rt_gc_timeout,
2764 .maxlen = sizeof(int),
2765 .mode = 0644,
2766 .proc_handler = &proc_dointvec_jiffies,
2767 .strategy = &sysctl_jiffies,
2768 },
2769 {
2770 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2771 .procname = "gc_interval",
2772 .data = &ip_rt_gc_interval,
2773 .maxlen = sizeof(int),
2774 .mode = 0644,
2775 .proc_handler = &proc_dointvec_jiffies,
2776 .strategy = &sysctl_jiffies,
2777 },
2778 {
2779 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2780 .procname = "redirect_load",
2781 .data = &ip_rt_redirect_load,
2782 .maxlen = sizeof(int),
2783 .mode = 0644,
2784 .proc_handler = &proc_dointvec,
2785 },
2786 {
2787 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2788 .procname = "redirect_number",
2789 .data = &ip_rt_redirect_number,
2790 .maxlen = sizeof(int),
2791 .mode = 0644,
2792 .proc_handler = &proc_dointvec,
2793 },
2794 {
2795 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2796 .procname = "redirect_silence",
2797 .data = &ip_rt_redirect_silence,
2798 .maxlen = sizeof(int),
2799 .mode = 0644,
2800 .proc_handler = &proc_dointvec,
2801 },
2802 {
2803 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2804 .procname = "error_cost",
2805 .data = &ip_rt_error_cost,
2806 .maxlen = sizeof(int),
2807 .mode = 0644,
2808 .proc_handler = &proc_dointvec,
2809 },
2810 {
2811 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2812 .procname = "error_burst",
2813 .data = &ip_rt_error_burst,
2814 .maxlen = sizeof(int),
2815 .mode = 0644,
2816 .proc_handler = &proc_dointvec,
2817 },
2818 {
2819 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2820 .procname = "gc_elasticity",
2821 .data = &ip_rt_gc_elasticity,
2822 .maxlen = sizeof(int),
2823 .mode = 0644,
2824 .proc_handler = &proc_dointvec,
2825 },
2826 {
2827 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2828 .procname = "mtu_expires",
2829 .data = &ip_rt_mtu_expires,
2830 .maxlen = sizeof(int),
2831 .mode = 0644,
2832 .proc_handler = &proc_dointvec_jiffies,
2833 .strategy = &sysctl_jiffies,
2834 },
2835 {
2836 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2837 .procname = "min_pmtu",
2838 .data = &ip_rt_min_pmtu,
2839 .maxlen = sizeof(int),
2840 .mode = 0644,
2841 .proc_handler = &proc_dointvec,
2842 },
2843 {
2844 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
2845 .procname = "min_adv_mss",
2846 .data = &ip_rt_min_advmss,
2847 .maxlen = sizeof(int),
2848 .mode = 0644,
2849 .proc_handler = &proc_dointvec,
2850 },
2851 {
2852 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
2853 .procname = "secret_interval",
2854 .data = &ip_rt_secret_interval,
2855 .maxlen = sizeof(int),
2856 .mode = 0644,
2857 .proc_handler = &proc_dointvec_jiffies,
2858 .strategy = &sysctl_jiffies,
2859 },
2860 { .ctl_name = 0 }
2861};
2862#endif
2863
2864#ifdef CONFIG_NET_CLS_ROUTE
2865struct ip_rt_acct *ip_rt_acct;
2866
2867 /* Per-CPU route accounting: each possible CPU owns a block of 256
2868  * struct ip_rt_acct entries within the ip_rt_acct array; the macro
2869  * below returns the start of CPU i's block. */
2870#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
2871
2872#ifdef CONFIG_PROC_FS
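/*
 * /proc/net/rt_acct read handler: sum the per-CPU accounting tables
 * word by word into the caller's buffer.  Offset and length must be
 * 32-bit aligned.
 */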
2873static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2874 int length, int *eof, void *data)
2875{
2876 unsigned int i;
2877
2878 if ((offset & 3) || (length & 3))
2879 return -EIO;
2880
2881 if (offset >= sizeof(struct ip_rt_acct) * 256) {
2882 *eof = 1;
2883 return 0;
2884 }
2885
2886 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2887 length = sizeof(struct ip_rt_acct) * 256 - offset;
2888 *eof = 1;
2889 }
2890
2891 offset /= sizeof(u32);
2892
2893 if (length > 0) {
2894 u32 *dst = (u32 *) buffer;
2895
2896 *start = buffer;
2897 memset(dst, 0, length);
2898
2899 for_each_possible_cpu(i) {
2900 unsigned int j;
2901 u32 *src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2902
2903 for (j = 0; j < length/4; j++)
2904 dst[j] += src[j];
2905 }
2906 }
2907 return length;
2908}
2909#endif
2910#endif
2911
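/* "rhash_entries=" boot parameter: override the route cache hash table size. */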
2912static __initdata unsigned long rhash_entries;
2913static int __init set_rhash_entries(char *str)
2914{
2915 if (!str)
2916 return 0;
2917 rhash_entries = simple_strtoul(str, &str, 0);
2918 return 1;
2919}
2920__setup("rhash_entries=", set_rhash_entries);
2921
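/*
 * Boot-time initialisation: seed the hash secret, allocate the dst slab
 * and the route cache hash table, derive the GC threshold and maximum
 * cache size from the table size, start the expiry work and the secret
 * rebuild timer, create the proc entries and register the RTM_GETROUTE
 * handler.
 */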
2922int __init ip_rt_init(void)
2923{
2924 int rc = 0;
2925
2926 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2927 (jiffies ^ (jiffies >> 7)));
2928
2929#ifdef CONFIG_NET_CLS_ROUTE
2930 {
2931 int order;
2932 for (order = 0;
2933 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2934 ;
2935 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2936 if (!ip_rt_acct)
2937 panic("IP: failed to allocate ip_rt_acct\n");
2938 memset(ip_rt_acct, 0, PAGE_SIZE << order);
2939 }
2940#endif
2941
2942 ipv4_dst_ops.kmem_cachep =
2943 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2944 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2945
2946 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2947
2948 rt_hash_table = (struct rt_hash_bucket *)
2949 alloc_large_system_hash("IP route cache",
2950 sizeof(struct rt_hash_bucket),
2951 rhash_entries,
2952 (num_physpages >= 128 * 1024) ?
2953 15 : 17,
2954 0,
2955 &rt_hash_log,
2956 &rt_hash_mask,
2957 0);
2958 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
2959 rt_hash_lock_init();
2960
2961 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2962 ip_rt_max_size = (rt_hash_mask + 1) * 16;
2963
2964 devinet_init();
2965 ip_fib_init();
2966
2967 init_timer(&rt_flush_timer);
2968 rt_flush_timer.function = rt_run_flush;
2969 init_timer(&rt_secret_timer);
2970 rt_secret_timer.function = rt_secret_rebuild;
2971
2972 /* Timers started at boot tend to synchronize with each other;
2973  * perturb the initial expiry of the GC work and the secret timer
2974  * with a random offset. */
2975 schedule_delayed_work(&expires_work,
2976 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
2977
2978 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
2979 ip_rt_secret_interval;
2980 add_timer(&rt_secret_timer);
2981
2982#ifdef CONFIG_PROC_FS
2983 {
2984 struct proc_dir_entry *rtstat_pde = NULL;
2985 if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
2986 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
2987 init_net.proc_net_stat))) {
2988 return -ENOMEM;
2989 }
2990 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
2991 }
2992#ifdef CONFIG_NET_CLS_ROUTE
2993 create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL);
2994#endif
2995#endif
2996#ifdef CONFIG_XFRM
2997 xfrm_init();
2998 xfrm4_init();
2999#endif
3000 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3001
3002 return rc;
3003}
3004
3005EXPORT_SYMBOL(__ip_select_ident);
3006EXPORT_SYMBOL(ip_route_input);
3007EXPORT_SYMBOL(ip_route_output_key);
3008