// SPDX-License-Identifier: GPL-2.0-only
/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
 * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/siphash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/mm.h>
#include <linux/nsproxy.h>
#include <linux/rculist_nulls.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netns/hash.h>
#include <net/ip.h>

#include "nf_internals.h"

__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
EXPORT_SYMBOL_GPL(nf_conntrack_locks);

__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);

struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_hash);

struct conntrack_gc_work {
	struct delayed_work	dwork;
	u32			next_bucket;
	bool			exiting;
	bool			early_drop;
};

static __read_mostly struct kmem_cache *nf_conntrack_cachep;
static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;

#define GC_SCAN_INTERVAL	(120u * HZ)
#define GC_SCAN_MAX_DURATION	msecs_to_jiffies(10)

static struct conntrack_gc_work conntrack_gc_work;

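/* Take one of the hashed per-bucket spinlocks.
 *
 * Fast path: take the bucket lock and check that no "lock everything"
 * operation (nf_conntrack_all_lock, used e.g. for table resize) is in
 * flight.  Slow path: drop the bucket lock and serialize against the
 * global lock before retrying, so the global locker can make progress.
 */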
void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
{
	/* 1) Acquire the lock */
	spin_lock(lock);

	/* 2) read nf_conntrack_locks_all, with ACQUIRE semantics
	 * It pairs with the smp_store_release() in nf_conntrack_all_unlock()
	 */
	if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false))
		return;

	/* fast path failed, unlock */
	spin_unlock(lock);

	/* Slow path 1) get global lock */
	spin_lock(&nf_conntrack_locks_all_lock);

	/* Slow path 2) get the lock we want */
	spin_lock(lock);

	/* Slow path 3) release the global lock */
	spin_unlock(&nf_conntrack_locks_all_lock);
}
EXPORT_SYMBOL_GPL(nf_conntrack_lock);

static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	spin_unlock(&nf_conntrack_locks[h1]);
	if (h1 != h2)
		spin_unlock(&nf_conntrack_locks[h2]);
}

/* return true if we need to recompute hashes (in case hash table was resized) */
static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
				     unsigned int h2, unsigned int sequence)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	if (h1 <= h2) {
		nf_conntrack_lock(&nf_conntrack_locks[h1]);
		if (h1 != h2)
			spin_lock_nested(&nf_conntrack_locks[h2],
					 SINGLE_DEPTH_NESTING);
	} else {
		nf_conntrack_lock(&nf_conntrack_locks[h2]);
		spin_lock_nested(&nf_conntrack_locks[h1],
				 SINGLE_DEPTH_NESTING);
	}
	if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
		nf_conntrack_double_unlock(h1, h2);
		return true;
	}
	return false;
}

static void nf_conntrack_all_lock(void)
	__acquires(&nf_conntrack_locks_all_lock)
{
	int i;

	spin_lock(&nf_conntrack_locks_all_lock);

	/* For nf_conntrack_locks_all, only the latest time when another
	 * CPU will see an update is controlled, by the "release" of the
	 * spin_lock below.
	 * The earliest time is not controlled, so KCSAN could detect
	 * a race when nf_conntrack_lock() reads the variable.
	 * WRITE_ONCE() is used to ensure the compiler will not
	 * optimize the write.
	 */
	WRITE_ONCE(nf_conntrack_locks_all, true);

	for (i = 0; i < CONNTRACK_LOCKS; i++) {
		spin_lock(&nf_conntrack_locks[i]);

		/* This spin_unlock provides the "release" needed to ensure
		 * that nf_conntrack_locks_all==true is visible to everyone
		 * that acquired spin_lock(&nf_conntrack_locks[]).
		 */
		spin_unlock(&nf_conntrack_locks[i]);
	}
}

static void nf_conntrack_all_unlock(void)
	__releases(&nf_conntrack_locks_all_lock)
{
	/* All prior stores must be complete before we clear
	 * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
	 * might observe the false value but not the entire
	 * critical section.
	 * It pairs with the smp_load_acquire() in nf_conntrack_lock()
	 */
	smp_store_release(&nf_conntrack_locks_all, false);
	spin_unlock(&nf_conntrack_locks_all_lock);
}

unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);

unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);
seqcount_spinlock_t nf_conntrack_generation __read_mostly;
static unsigned int nf_conntrack_hash_rnd __read_mostly;

static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
			      const struct net *net)
{
	unsigned int n;
	u32 seed;

	get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));

	/* The direction must be ignored, so we hash everything up to the
	 * destination ports (which is a multiple of 4) and treat the last
	 * three bytes manually.
	 */
	seed = nf_conntrack_hash_rnd ^ net_hash_mix(net);
	n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
	return jhash2((u32 *)tuple, n, seed ^
		      (((__force __u16)tuple->dst.u.all << 16) |
		      tuple->dst.protonum));
}

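/* Map a 32-bit hash value onto the current table size.
 * reciprocal_scale() computes (hash * size) >> 32, avoiding a modulo
 * while distributing values evenly for any table size.
 */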
static u32 scale_hash(u32 hash)
{
	return reciprocal_scale(hash, nf_conntrack_htable_size);
}

static u32 __hash_conntrack(const struct net *net,
			    const struct nf_conntrack_tuple *tuple,
			    unsigned int size)
{
	return reciprocal_scale(hash_conntrack_raw(tuple, net), size);
}

static u32 hash_conntrack(const struct net *net,
			  const struct nf_conntrack_tuple *tuple)
{
	return scale_hash(hash_conntrack_raw(tuple, net));
}

static bool nf_ct_get_tuple_ports(const struct sk_buff *skb,
				  unsigned int dataoff,
				  struct nf_conntrack_tuple *tuple)
{
	struct {
		__be16 sport;
		__be16 dport;
	} _inet_hdr, *inet_hdr;

	/* Actually only need first 4 bytes to get ports. */
	inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr);
	if (!inet_hdr)
		return false;

	tuple->src.u.udp.port = inet_hdr->sport;
	tuple->dst.u.udp.port = inet_hdr->dport;
	return true;
}

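/* Fill *tuple with the addresses, protocol number and (where the l4
 * protocol has them) the ports taken from the packet.  Returns false
 * only when the required headers cannot be read from the skb.
 */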
static bool
nf_ct_get_tuple(const struct sk_buff *skb,
		unsigned int nhoff,
		unsigned int dataoff,
		u_int16_t l3num,
		u_int8_t protonum,
		struct net *net,
		struct nf_conntrack_tuple *tuple)
{
	unsigned int size;
	const __be32 *ap;
	__be32 _addrs[8];

	memset(tuple, 0, sizeof(*tuple));

	tuple->src.l3num = l3num;
	switch (l3num) {
	case NFPROTO_IPV4:
		nhoff += offsetof(struct iphdr, saddr);
		size = 2 * sizeof(__be32);
		break;
	case NFPROTO_IPV6:
		nhoff += offsetof(struct ipv6hdr, saddr);
		size = sizeof(_addrs);
		break;
	default:
		return true;
	}

	ap = skb_header_pointer(skb, nhoff, size, _addrs);
	if (!ap)
		return false;

	switch (l3num) {
	case NFPROTO_IPV4:
		tuple->src.u3.ip = ap[0];
		tuple->dst.u3.ip = ap[1];
		break;
	case NFPROTO_IPV6:
		memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
		memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
		break;
	}

	tuple->dst.protonum = protonum;
	tuple->dst.dir = IP_CT_DIR_ORIGINAL;

	switch (protonum) {
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
	case IPPROTO_ICMP:
		return icmp_pkt_to_tuple(skb, dataoff, net, tuple);
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE:
		return gre_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
	case IPPROTO_TCP:
	case IPPROTO_UDP:
		return nf_ct_get_tuple_ports(skb, dataoff, tuple);
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
	case IPPROTO_UDPLITE:
		return nf_ct_get_tuple_ports(skb, dataoff, tuple);
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
	case IPPROTO_SCTP:
		return nf_ct_get_tuple_ports(skb, dataoff, tuple);
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
	case IPPROTO_DCCP:
		return nf_ct_get_tuple_ports(skb, dataoff, tuple);
#endif
	default:
		break;
	}

	return true;
}

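/* Locate the start of the l4 header in an IPv4 packet and report the
 * transport protocol number.  Returns -1 for non-first fragments and
 * malformed headers.
 */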
static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
			    u_int8_t *protonum)
{
	int dataoff = -1;
	const struct iphdr *iph;
	struct iphdr _iph;

	iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
	if (!iph)
		return -1;

	/* Conntrack defragments packets, we might still see fragments
	 * inside ICMP packets though.
	 */
	if (iph->frag_off & htons(IP_OFFSET))
		return -1;

	dataoff = nhoff + (iph->ihl << 2);
	*protonum = iph->protocol;

	/* Check bogus IP headers */
	if (dataoff > skb->len) {
		pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
			 nhoff, iph->ihl << 2, skb->len);
		return -1;
	}
	return dataoff;
}

#if IS_ENABLED(CONFIG_IPV6)
static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
			    u8 *protonum)
{
	int protoff = -1;
	unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
	__be16 frag_off;
	u8 nexthdr;

	if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
			  &nexthdr, sizeof(nexthdr)) != 0) {
		pr_debug("can't get nexthdr\n");
		return -1;
	}
	protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
	/* Reject if we cannot walk the extension header chain or if this
	 * is a non-first fragment (fragment offset != 0).
	 */
	if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
		pr_debug("can't find proto in pkt\n");
		return -1;
	}

	*protonum = nexthdr;
	return protoff;
}
#endif

static int get_l4proto(const struct sk_buff *skb,
		       unsigned int nhoff, u8 pf, u8 *l4num)
{
	switch (pf) {
	case NFPROTO_IPV4:
		return ipv4_get_l4proto(skb, nhoff, l4num);
#if IS_ENABLED(CONFIG_IPV6)
	case NFPROTO_IPV6:
		return ipv6_get_l4proto(skb, nhoff, l4num);
#endif
	default:
		*l4num = 0;
		break;
	}
	return -1;
}

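/* Public helper: extract the full conntrack tuple from a packet whose
 * network header starts at @nhoff.  Combines get_l4proto() and
 * nf_ct_get_tuple().
 */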
bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
		       u_int16_t l3num,
		       struct net *net, struct nf_conntrack_tuple *tuple)
{
	u8 protonum;
	int protoff;

	protoff = get_l4proto(skb, nhoff, l3num, &protonum);
	if (protoff <= 0)
		return false;

	return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple);
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);

bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
		   const struct nf_conntrack_tuple *orig)
{
	memset(inverse, 0, sizeof(*inverse));

	inverse->src.l3num = orig->src.l3num;

	switch (orig->src.l3num) {
	case NFPROTO_IPV4:
		inverse->src.u3.ip = orig->dst.u3.ip;
		inverse->dst.u3.ip = orig->src.u3.ip;
		break;
	case NFPROTO_IPV6:
		inverse->src.u3.in6 = orig->dst.u3.in6;
		inverse->dst.u3.in6 = orig->src.u3.in6;
		break;
	default:
		break;
	}

	inverse->dst.dir = !orig->dst.dir;

	inverse->dst.protonum = orig->dst.protonum;

	switch (orig->dst.protonum) {
	case IPPROTO_ICMP:
		return nf_conntrack_invert_icmp_tuple(inverse, orig);
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return nf_conntrack_invert_icmpv6_tuple(inverse, orig);
#endif
	}

	inverse->src.u.all = orig->dst.u.all;
	inverse->dst.u.all = orig->src.u.all;
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);

/* Generate an almost-unique pseudo-id for a given conntrack.
 *
 * intentionally doesn't re-use any of the seeds used for hash
 * table location, we assume id gets exposed to userspace.
 *
 * Following nf_conn items do not change throughout lifetime
 * of the nf_conn after it has been committed to main hash table:
 *
 * 1. nf_conn address
 * 2. nf_conn->master address, but conntrack_lock(s)
 * 3. the associated net namespace
 * 4. the original direction tuple
 */
u32 nf_ct_get_id(const struct nf_conn *ct)
{
	static __read_mostly siphash_key_t ct_id_seed;
	unsigned long a, b, c, d;

	net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));

	a = (unsigned long)ct;
	b = (unsigned long)ct->master;
	c = (unsigned long)nf_ct_net(ct);
	d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				   sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple),
				   &ct_id_seed);
#ifdef CONFIG_64BIT
	return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed);
#else
	return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed);
#endif
}
EXPORT_SYMBOL_GPL(nf_ct_get_id);

static void
clean_from_lists(struct nf_conn *ct)
{
	pr_debug("clean_from_lists(%p)\n", ct);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);

	/* Destroy all pending expectations */
	nf_ct_remove_expectations(ct);
}

/* must be called with local_bh_disable */
static void nf_ct_add_to_dying_list(struct nf_conn *ct)
{
	struct ct_pcpu *pcpu;

	/* add this conntrack to the (per cpu) dying list */
	ct->cpu = smp_processor_id();
	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);

	spin_lock(&pcpu->lock);
	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
			     &pcpu->dying);
	spin_unlock(&pcpu->lock);
}

/* must be called with local_bh_disable */
static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
{
	struct ct_pcpu *pcpu;

	/* add this conntrack to the (per cpu) unconfirmed list */
	ct->cpu = smp_processor_id();
	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);

	spin_lock(&pcpu->lock);
	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
			     &pcpu->unconfirmed);
	spin_unlock(&pcpu->lock);
}

/* must be called with local_bh_disable */
static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
{
	struct ct_pcpu *pcpu;

	/* We overload first tuple to link into unconfirmed or dying list.*/
	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);

	spin_lock(&pcpu->lock);
	BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	spin_unlock(&pcpu->lock);
}

#define NFCT_ALIGN(len)	(((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)

/* Released via destroy_conntrack() */
struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
				 const struct nf_conntrack_zone *zone,
				 gfp_t flags)
{
	struct nf_conn *tmpl, *p;

	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) {
		tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags);
		if (!tmpl)
			return NULL;

		p = tmpl;
		tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
		if (tmpl != p) {
			tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
			tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
		}
	} else {
		tmpl = kzalloc(sizeof(*tmpl), flags);
		if (!tmpl)
			return NULL;
	}

	tmpl->status = IPS_TEMPLATE;
	write_pnet(&tmpl->ct_net, net);
	nf_ct_zone_add(tmpl, zone);
	atomic_set(&tmpl->ct_general.use, 0);

	return tmpl;
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);

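/* Free a template allocated by nf_ct_tmpl_alloc(); undo the manual
 * alignment padding, if any, before handing the buffer back to kfree().
 */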
void nf_ct_tmpl_free(struct nf_conn *tmpl)
{
	nf_ct_ext_destroy(tmpl);

	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
		kfree((char *)tmpl - tmpl->proto.tmpl_padto);
	else
		kfree(tmpl);
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);

static void destroy_gre_conntrack(struct nf_conn *ct)
{
#ifdef CONFIG_NF_CT_PROTO_GRE
	struct nf_conn *master = ct->master;

	if (master)
		nf_ct_gre_keymap_destroy(master);
#endif
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
	struct nf_conn *ct = (struct nf_conn *)nfct;

	pr_debug("destroy_conntrack(%p)\n", ct);
	WARN_ON(atomic_read(&nfct->use) != 0);

	if (unlikely(nf_ct_is_template(ct))) {
		nf_ct_tmpl_free(ct);
		return;
	}

	if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
		destroy_gre_conntrack(ct);

	local_bh_disable();
	/* Expectations will have been removed in clean_from_lists,
	 * except TFTP can create an expectation on the first packet,
	 * before connection is in the list, so we need to clean here,
	 * too.
	 */
	nf_ct_remove_expectations(ct);

	nf_ct_del_from_dying_or_unconfirmed_list(ct);

	local_bh_enable();

	if (ct->master)
		nf_ct_put(ct->master);

	pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
	nf_conntrack_free(ct);
}

static void nf_ct_delete_from_lists(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	unsigned int sequence;

	nf_ct_helper_destroy(ct);

	local_bh_disable();
	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	clean_from_lists(ct);
	nf_conntrack_double_unlock(hash, reply_hash);

	nf_ct_add_to_dying_list(ct);

	local_bh_enable();
}

bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
{
	struct nf_conn_tstamp *tstamp;
	struct net *net;

	if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
		return false;

	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp) {
		s32 timeout = ct->timeout - nfct_time_stamp;

		tstamp->stop = ktime_get_real_ns();
		if (timeout < 0)
			tstamp->stop -= jiffies_to_nsecs(-timeout);
	}

	if (nf_conntrack_event_report(IPCT_DESTROY, ct,
				    portid, report) < 0) {
		/* destroy event was not delivered; the ecache worker
		 * will retry later.
		 */
		nf_ct_delete_from_lists(ct);
		nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL);
		return false;
	}

	net = nf_ct_net(ct);
	if (nf_conntrack_ecache_dwork_pending(net))
		nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT);
	nf_ct_delete_from_lists(ct);
	nf_ct_put(ct);
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_delete);

static inline bool
nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
		const struct nf_conntrack_tuple *tuple,
		const struct nf_conntrack_zone *zone,
		const struct net *net)
{
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

	/* A conntrack can be recreated with the equal tuple,
	 * so we need to check that the conntrack is confirmed
	 */
	return nf_ct_tuple_equal(tuple, &h->tuple) &&
	       nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
	       nf_ct_is_confirmed(ct) &&
	       net_eq(net, nf_ct_net(ct));
}

static inline bool
nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
{
	return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
	       nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
				 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
	       net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
}

/* caller must hold rcu readlock and none of the nf_conntrack_locks */
static void nf_ct_gc_expired(struct nf_conn *ct)
{
	if (!atomic_inc_not_zero(&ct->ct_general.use))
		return;

	if (nf_ct_should_gc(ct))
		nf_ct_kill(ct);

	nf_ct_put(ct);
}

/*
 * Warning :
 * - Caller must take a reference on returned object
 *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
 */
static struct nf_conntrack_tuple_hash *
____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple, u32 hash)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	struct hlist_nulls_node *n;
	unsigned int bucket, hsize;

begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	bucket = reciprocal_scale(hash, hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
		struct nf_conn *ct;

		ct = nf_ct_tuplehash_to_ctrack(h);
		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_key_equal(h, tuple, zone, net))
			return h;
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(n) != bucket) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	return NULL;
}

/* Find a connection corresponding to a tuple. */
static struct nf_conntrack_tuple_hash *
__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
			const struct nf_conntrack_tuple *tuple, u32 hash)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	rcu_read_lock();

	h = ____nf_conntrack_find(net, zone, tuple, hash);
	if (h) {
		/* We have a candidate that matches the tuple we're interested
		 * in, try to obtain a reference and re-check tuple
		 */
		ct = nf_ct_tuplehash_to_ctrack(h);
		if (likely(atomic_inc_not_zero(&ct->ct_general.use))) {
			if (likely(nf_ct_key_equal(h, tuple, zone, net)))
				goto found;

			/* TYPESAFE_BY_RCU recycled the candidate */
			nf_ct_put(ct);
		}

		h = NULL;
	}
found:
	rcu_read_unlock();

	return h;
}

struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple)
{
	return __nf_conntrack_find_get(net, zone, tuple,
				       hash_conntrack_raw(tuple, net));
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);

static void __nf_conntrack_hash_insert(struct nf_conn *ct,
				       unsigned int hash,
				       unsigned int reply_hash)
{
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &nf_conntrack_hash[hash]);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[reply_hash]);
}

int
nf_conntrack_hash_check_insert(struct nf_conn *ct)
{
	const struct nf_conntrack_zone *zone;
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int sequence;

	zone = nf_ct_zone(ct);

	local_bh_disable();
	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	/* See if there's one in the list already, including reverse */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;

	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;

	smp_wmb();
	/* The caller holds a reference to this object */
	atomic_set(&ct->ct_general.use, 2);
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	NF_CT_STAT_INC(net, insert);
	local_bh_enable();
	return 0;

out:
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
	return -EEXIST;
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);

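/* Credit @packets/@bytes to the per-direction accounting counters, if
 * the accounting extension is present on this conntrack.
 */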
void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets,
		    unsigned int bytes)
{
	struct nf_conn_acct *acct;

	acct = nf_conn_acct_find(ct);
	if (acct) {
		struct nf_conn_counter *counter = acct->counter;

		atomic64_add(packets, &counter[dir].packets);
		atomic64_add(bytes, &counter[dir].bytes);
	}
}
EXPORT_SYMBOL_GPL(nf_ct_acct_add);

static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
			     const struct nf_conn *loser_ct)
{
	struct nf_conn_acct *acct;

	acct = nf_conn_acct_find(loser_ct);
	if (acct) {
		struct nf_conn_counter *counter = acct->counter;
		unsigned int bytes;

		/* u32 should be fine since we must have seen one packet. */
		bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
		nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
	}
}

static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
{
	struct nf_conn_tstamp *tstamp;

	atomic_inc(&ct->ct_general.use);
	ct->status |= IPS_CONFIRMED;

	/* set conntrack timestamp, if enabled. */
	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp)
		tstamp->start = ktime_get_real_ns();
}

/* caller must hold locks to prevent concurrent changes */
static int __nf_ct_resolve_clash(struct sk_buff *skb,
				 struct nf_conntrack_tuple_hash *h)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;

	loser_ct = nf_ct_get(skb, &ctinfo);

	if (nf_ct_is_dying(ct))
		return NF_DROP;

	if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
	    nf_ct_match(ct, loser_ct)) {
		struct net *net = nf_ct_net(ct);

		nf_conntrack_get(&ct->ct_general);

		nf_ct_acct_merge(ct, ctinfo, loser_ct);
		nf_ct_add_to_dying_list(loser_ct);
		nf_conntrack_put(&loser_ct->ct_general);
		nf_ct_set(skb, ct, ctinfo);

		NF_CT_STAT_INC(net, clash_resolve);
		return NF_ACCEPT;
	}

	return NF_DROP;
}

/**
 * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry
 *
 * @skb: skb that causes the collision
 * @repl_idx: hash slot for reply direction
 *
 * Called when origin or reply direction had a clash.
 * The skb can be handled without packet drop provided the reply direction
 * is unique or the existing entry has the identical tuple in both
 * directions.
 *
 * Caller must hold conntrack table locks to prevent concurrent updates.
 *
 * Returns NF_DROP if the clash could not be handled.
 */
static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
{
	struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	struct net *net;

	zone = nf_ct_zone(loser_ct);
	net = nf_ct_net(loser_ct);

	/* Reply direction must never result in a clash, unless both origin
	 * and reply tuples are identical.
	 */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
		if (nf_ct_key_equal(h,
				    &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			return __nf_ct_resolve_clash(skb, h);
	}

	/* We want the clashing entry to go away real soon: 1 second timeout. */
	loser_ct->timeout = nfct_time_stamp + HZ;

	/* IPS_NAT_CLASH removes the entry automatically on the first
	 * reply.  Also prevents UDP tracker from moving the entry to
	 * ASSURED state, i.e. the entry can always be evicted under
	 * pressure.
	 */
	loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;

	__nf_conntrack_insert_prepare(loser_ct);

	/* fake add for ORIGINAL dir: we want lookups to only find the entry
	 * already in the table.  This also hides the clashing entry from
	 * ctnetlink iteration, i.e. conntrack -L won't show it.
	 */
	hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);

	hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[repl_idx]);

	NF_CT_STAT_INC(net, clash_resolve);
	return NF_ACCEPT;
}

/**
 * nf_ct_resolve_clash - attempt to handle clash without packet drop
 *
 * @skb: skb that causes the clash
 * @h: tuplehash of the clashing entry already in table
 * @reply_hash: hash slot for reply direction
 *
 * A conntrack entry can be inserted to the connection tracking table
 * if there is no existing entry with an identical tuple.
 *
 * If there is one, @skb (and the associated, unconfirmed conntrack) has
 * to be dropped.  In case @skb is retransmitted, next conntrack lookup
 * will find the already-existing entry.
 *
 * The major problem with such packet drop is the extra delay added by
 * the packet loss -- it will take some time for a retransmit to occur
 * (or the sender to time out when waiting for a reply).
 *
 * This function attempts to handle the situation without packet drop.
 *
 * If @skb has no NAT transformation or if the colliding entries are
 * exactly the same, only the to-be-confirmed conntrack entry is discarded
 * and @skb is associated with the conntrack entry already in the table.
 *
 * Failing that, the new, unconfirmed conntrack is still added to the table
 * provided that the collision only occurs in the ORIGINAL direction.
 * The new entry will be added only in the non-clashing REPLY direction,
 * so packets in the ORIGINAL direction will continue to match the existing
 * entry.
 *
 * Returns NF_DROP if the clash could not be resolved.
 */
static __cold noinline int
nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
		    u32 reply_hash)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	const struct nf_conntrack_l4proto *l4proto;
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;
	struct net *net;
	int ret;

	loser_ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(loser_ct);

	l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
	if (!l4proto->allow_clash)
		goto drop;

	ret = __nf_ct_resolve_clash(skb, h);
	if (ret == NF_ACCEPT)
		return ret;

	ret = nf_ct_resolve_clash_harder(skb, reply_hash);
	if (ret == NF_ACCEPT)
		return ret;

drop:
	nf_ct_add_to_dying_list(loser_ct);
	NF_CT_STAT_INC(net, drop);
	NF_CT_STAT_INC(net, insert_failed);
	return NF_DROP;
}

/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
	const struct nf_conntrack_zone *zone;
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct hlist_nulls_node *n;
	enum ip_conntrack_info ctinfo;
	struct net *net;
	unsigned int sequence;
	int ret = NF_DROP;

	ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(ct);

	/* ipt_REJECT uses nf_conntrack_attach to attach related
	   ICMP/TCP RST packets in other direction.  Actual packet
	   which created connection will be IP_CT_NEW or for an
	   expected connection, IP_CT_RELATED. */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	zone = nf_ct_zone(ct);
	local_bh_disable();

	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		/* reuse the hash saved before */
		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
		hash = scale_hash(hash);
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	/* We're not in hash table, and we refuse to set up related
	 * connections for unconfirmed conns.  But packet copies and
	 * REJECT will give spurious warnings here.
	 */

	/* Another skb with the same unconfirmed conntrack may
	 * win the race. This may happen for bridge(br_flood)
	 * or broadcast/multicast packets do skb_clone with
	 * unconfirmed conntrack.
	 */
	if (unlikely(nf_ct_is_confirmed(ct))) {
		WARN_ON_ONCE(1);
		nf_conntrack_double_unlock(hash, reply_hash);
		local_bh_enable();
		return NF_DROP;
	}

	pr_debug("Confirming conntrack %p\n", ct);

	/* We have to check the DYING flag after unlink to prevent
	 * a race against nf_ct_get_next_corpse() possibly called from
	 * user context, else we insert an already 'dead' hash, blocking
	 * further use of that particular connection -JM.
	 */
	nf_ct_del_from_dying_or_unconfirmed_list(ct);

	if (unlikely(nf_ct_is_dying(ct))) {
		nf_ct_add_to_dying_list(ct);
		NF_CT_STAT_INC(net, insert_failed);
		goto dying;
	}

	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost race. */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;

	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;

	/* Timer relative to confirmation time, not original
	   setting time, otherwise we'd get timer wrap in
	   weird delay cases. */
	ct->timeout += nfct_time_stamp;

	__nf_conntrack_insert_prepare(ct);

	/* Since the lookup is lockless, hash insertion must be done after
	 * setting ct->timeout and the IPS_CONFIRMED bit; the RCU list
	 * insertion barriers ensure no other CPU can find the conntrack
	 * before the above stores are visible.
	 */
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();

	help = nfct_help(ct);
	if (help && help->helper)
		nf_conntrack_event_cache(IPCT_HELPER, ct);

	nf_conntrack_event_cache(master_ct(ct) ?
				 IPCT_RELATED : IPCT_NEW, ct);
	return NF_ACCEPT;

out:
	ret = nf_ct_resolve_clash(skb, h, reply_hash);
dying:
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
	return ret;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);

/* Returns true if a connection corresponds to the tuple (necessary
   for NAT). */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack)
{
	struct net *net = nf_ct_net(ignored_conntrack);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	unsigned int hash, hsize;
	struct hlist_nulls_node *n;
	struct nf_conn *ct;

	zone = nf_ct_zone(ignored_conntrack);

	rcu_read_lock();
 begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	hash = __hash_conntrack(net, tuple, hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);

		if (ct == ignored_conntrack)
			continue;

		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_key_equal(h, tuple, zone, net)) {
			/* Tuple is taken already, so caller will need to find
			 * a new source port to use.
			 *
			 * Only exception: an entry in the same zone whose
			 * ORIGINAL tuple is identical to ours.  Such an entry
			 * is a NAT'ed clash entry that exists only in the
			 * reply direction and will expire soon, so it can be
			 * ignored here.
			 */
			if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
					      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
			    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL))
				continue;

			NF_CT_STAT_INC_ATOMIC(net, found);
			rcu_read_unlock();
			return 1;
		}
	}

	if (get_nulls_value(n) != hash) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	rcu_read_unlock();

	return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);

#define NF_CT_EVICTION_RANGE	8

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static unsigned int early_drop_list(struct net *net,
				    struct hlist_nulls_head *head)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int drops = 0;
	struct nf_conn *tmp;

	hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
		tmp = nf_ct_tuplehash_to_ctrack(h);

		if (test_bit(IPS_OFFLOAD_BIT, &tmp->status))
			continue;

		if (nf_ct_is_expired(tmp)) {
			nf_ct_gc_expired(tmp);
			continue;
		}

		if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
		    !net_eq(nf_ct_net(tmp), net) ||
		    nf_ct_is_dying(tmp))
			continue;

		if (!atomic_inc_not_zero(&tmp->ct_general.use))
			continue;

		/* kill only if still in same netns -- might have moved due to
		 * SLAB_TYPESAFE_BY_RCU rules.
		 *
		 * We steal the timer reference.  If that fails timer has
		 * already fired or someone else deleted it. Just drop ref
		 * and move to next entry.
		 */
		if (net_eq(nf_ct_net(tmp), net) &&
		    nf_ct_is_confirmed(tmp) &&
		    nf_ct_delete(tmp, 0, 0))
			drops++;

		nf_ct_put(tmp);
	}

	return drops;
}

static noinline int early_drop(struct net *net, unsigned int hash)
{
	unsigned int i, bucket;

	for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
		struct hlist_nulls_head *ct_hash;
		unsigned int hsize, drops;

		rcu_read_lock();
		nf_conntrack_get_ht(&ct_hash, &hsize);
		if (!i)
			bucket = reciprocal_scale(hash, hsize);
		else
			bucket = (bucket + 1) % hsize;

		drops = early_drop_list(net, &ct_hash[bucket]);
		rcu_read_unlock();

		if (drops) {
			NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
			return true;
		}
	}

	return false;
}

static bool gc_worker_skip_ct(const struct nf_conn *ct)
{
	return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct);
}

static bool gc_worker_can_early_drop(const struct nf_conn *ct)
{
	const struct nf_conntrack_l4proto *l4proto;

	if (!test_bit(IPS_ASSURED_BIT, &ct->status))
		return true;

	l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
	if (l4proto->can_early_drop && l4proto->can_early_drop(ct))
		return true;

	return false;
}

static void gc_worker(struct work_struct *work)
{
	unsigned long end_time = jiffies + GC_SCAN_MAX_DURATION;
	unsigned int i, hashsz, nf_conntrack_max95 = 0;
	unsigned long next_run = GC_SCAN_INTERVAL;
	struct conntrack_gc_work *gc_work;

	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);

	i = gc_work->next_bucket;
	if (gc_work->early_drop)
		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;

	do {
		struct nf_conntrack_tuple_hash *h;
		struct hlist_nulls_head *ct_hash;
		struct hlist_nulls_node *n;
		struct nf_conn *tmp;

		rcu_read_lock();

		nf_conntrack_get_ht(&ct_hash, &hashsz);
		if (i >= hashsz) {
			rcu_read_unlock();
			break;
		}

		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
			struct nf_conntrack_net *cnet;
			struct net *net;

			tmp = nf_ct_tuplehash_to_ctrack(h);

			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
				nf_ct_offload_timeout(tmp);
				continue;
			}

			if (nf_ct_is_expired(tmp)) {
				nf_ct_gc_expired(tmp);
				continue;
			}

			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
				continue;

			net = nf_ct_net(tmp);
			cnet = nf_ct_pernet(net);
			if (atomic_read(&cnet->count) < nf_conntrack_max95)
				continue;

			/* need to take reference to avoid possible races */
			if (!atomic_inc_not_zero(&tmp->ct_general.use))
				continue;

			if (gc_worker_skip_ct(tmp)) {
				nf_ct_put(tmp);
				continue;
			}

			if (gc_worker_can_early_drop(tmp))
				nf_ct_kill(tmp);

			nf_ct_put(tmp);
		}

		/* could check get_nulls_value() here and restart if ct
		 * was moved to another chain.  But given gc is best-effort
		 * we will just continue with next hash slot.
		 */
		rcu_read_unlock();
		cond_resched();
		i++;

		if (time_after(jiffies, end_time) && i < hashsz) {
			gc_work->next_bucket = i;
			next_run = 0;
			break;
		}
	} while (i < hashsz);

	if (gc_work->exiting)
		return;

	/*
	 * Eviction will normally happen from the packet path, and not
	 * from this gc worker.
	 *
	 * This worker is only here to reap expired entries when system went
	 * idle after a busy period.
	 */
	if (next_run) {
		gc_work->early_drop = false;
		gc_work->next_bucket = 0;
	}
	queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
}

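/* The gc work is deferrable: firing may be delayed while the CPU is
 * idle, which is acceptable because expired entries are also reaped
 * lazily from the packet path via nf_ct_gc_expired().
 */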
static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
{
	INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker);
	gc_work->exiting = false;
}

static struct nf_conn *
__nf_conntrack_alloc(struct net *net,
		     const struct nf_conntrack_zone *zone,
		     const struct nf_conntrack_tuple *orig,
		     const struct nf_conntrack_tuple *repl,
		     gfp_t gfp, u32 hash)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
	unsigned int ct_count;
	struct nf_conn *ct;

	/* We don't want any race condition at early drop stage */
	ct_count = atomic_inc_return(&cnet->count);

	if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
		if (!early_drop(net, hash)) {
			if (!conntrack_gc_work.early_drop)
				conntrack_gc_work.early_drop = true;
			atomic_dec(&cnet->count);
			net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
			return ERR_PTR(-ENOMEM);
		}
	}

	/*
	 * Do not use kmem_cache_zalloc(), as this cache uses
	 * SLAB_TYPESAFE_BY_RCU.
	 */
	ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
	if (ct == NULL)
		goto out;

	spin_lock_init(&ct->lock);
	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
	ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
	/* save hash for reusing when confirming */
	*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
	ct->status = 0;
	ct->timeout = 0;
	write_pnet(&ct->ct_net, net);
	memset(&ct->__nfct_init_offset, 0,
	       offsetof(struct nf_conn, proto) -
	       offsetof(struct nf_conn, __nfct_init_offset));

	nf_ct_zone_add(ct, zone);

	/* Because we use RCU lookups, we set ct_general.use to zero before
	 * this is inserted in any list.
	 */
	atomic_set(&ct->ct_general.use, 0);
	return ct;
out:
	atomic_dec(&cnet->count);
	return ERR_PTR(-ENOMEM);
}

struct nf_conn *nf_conntrack_alloc(struct net *net,
				   const struct nf_conntrack_zone *zone,
				   const struct nf_conntrack_tuple *orig,
				   const struct nf_conntrack_tuple *repl,
				   gfp_t gfp)
{
	return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
}
EXPORT_SYMBOL_GPL(nf_conntrack_alloc);

void nf_conntrack_free(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_net *cnet;

	/* A freed object has refcnt == 0, that's
	 * the golden rule for SLAB_TYPESAFE_BY_RCU
	 */
	WARN_ON(atomic_read(&ct->ct_general.use) != 0);

	nf_ct_ext_destroy(ct);
	kmem_cache_free(nf_conntrack_cachep, ct);
	cnet = nf_ct_pernet(net);

	smp_mb__before_atomic();
	atomic_dec(&cnet->count);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);

/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static noinline struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net, struct nf_conn *tmpl,
	       const struct nf_conntrack_tuple *tuple,
	       struct sk_buff *skb,
	       unsigned int dataoff, u32 hash)
{
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct nf_conntrack_tuple repl_tuple;
	struct nf_conntrack_ecache *ecache;
	struct nf_conntrack_expect *exp = NULL;
	const struct nf_conntrack_zone *zone;
	struct nf_conn_timeout *timeout_ext;
	struct nf_conntrack_zone tmp;
	struct nf_conntrack_net *cnet;

	if (!nf_ct_invert_tuple(&repl_tuple, tuple)) {
		pr_debug("Can't invert tuple.\n");
		return NULL;
	}

	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
	ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
				  hash);
	if (IS_ERR(ct))
		return (struct nf_conntrack_tuple_hash *)ct;

	if (!nf_ct_add_synproxy(ct, tmpl)) {
		nf_conntrack_free(ct);
		return ERR_PTR(-ENOMEM);
	}

	timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;

	if (timeout_ext)
		nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
				      GFP_ATOMIC);

	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
	nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
	nf_ct_labels_ext_add(ct);

	ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
	nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
			     ecache ? ecache->expmask : 0,
			     GFP_ATOMIC);

	local_bh_disable();
	cnet = nf_ct_pernet(net);
	if (cnet->expect_count) {
		spin_lock(&nf_conntrack_expect_lock);
		exp = nf_ct_find_expectation(net, zone, tuple);
		if (exp) {
			pr_debug("expectation arrives ct=%p exp=%p\n",
				 ct, exp);
			/* Welcome, Mr. Bond.  We've been expecting you... */
			__set_bit(IPS_EXPECTED_BIT, &ct->status);
			/* exp->master safe, refcnt bumped in nf_ct_find_expectation */
			ct->master = exp->master;
			if (exp->helper) {
				help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
				if (help)
					rcu_assign_pointer(help->helper, exp->helper);
			}

#ifdef CONFIG_NF_CONNTRACK_MARK
			ct->mark = exp->master->mark;
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
			ct->secmark = exp->master->secmark;
#endif
			NF_CT_STAT_INC(net, expect_new);
		}
		spin_unlock(&nf_conntrack_expect_lock);
	}
	if (!exp)
		__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);

	/* Now it is inserted into the unconfirmed list, bump refcount */
	nf_conntrack_get(&ct->ct_general);
	nf_ct_add_to_unconfirmed_list(ct);

	local_bh_enable();

	if (exp) {
		if (exp->expectfn)
			exp->expectfn(ct, exp);
		nf_ct_expect_put(exp);
	}

	return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns 0, sets skb->_nfct | ctinfo */
static int
resolve_normal_ct(struct nf_conn *tmpl,
		  struct sk_buff *skb,
		  unsigned int dataoff,
		  u_int8_t protonum,
		  const struct nf_hook_state *state)
{
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_tuple_hash *h;
	enum ip_conntrack_info ctinfo;
	struct nf_conntrack_zone tmp;
	struct nf_conn *ct;
	u32 hash;

	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
			     dataoff, state->pf, protonum, state->net,
			     &tuple)) {
		pr_debug("Can't get tuple\n");
		return 0;
	}

	/* look for tuple match */
	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
	hash = hash_conntrack_raw(&tuple, state->net);
	h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);
	if (!h) {
		h = init_conntrack(state->net, tmpl, &tuple,
				   skb, dataoff, hash);
		if (!h)
			return 0;
		if (IS_ERR(h))
			return PTR_ERR(h);
	}
	ct = nf_ct_tuplehash_to_ctrack(h);

	/* It exists; we have (non-exclusive) reference. */
	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
		ctinfo = IP_CT_ESTABLISHED_REPLY;
	} else {
		/* Once we've had two way comms, always ESTABLISHED. */
		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
			pr_debug("normal packet for %p\n", ct);
			ctinfo = IP_CT_ESTABLISHED;
		} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
			pr_debug("related packet for %p\n", ct);
			ctinfo = IP_CT_RELATED;
		} else {
			pr_debug("new packet for %p\n", ct);
			ctinfo = IP_CT_NEW;
		}
	}
	nf_ct_set(skb, ct, ctinfo);
	return 0;
}

/*
 * icmp packets need special treatment to handle error messages that are
 * related to a connection.
 *
 * Callers need to check if skb has a conntrack assigned when this
 * helper returns; in such case skb belongs to an already known connection.
 */
static unsigned int __cold
nf_conntrack_handle_icmp(struct nf_conn *tmpl,
			 struct sk_buff *skb,
			 unsigned int dataoff,
			 u8 protonum,
			 const struct nf_hook_state *state)
{
	int ret;

	if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP)
		ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state);
#if IS_ENABLED(CONFIG_IPV6)
	else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6)
		ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state);
#endif
	else
		return NF_ACCEPT;

	if (ret <= 0)
		NF_CT_STAT_INC_ATOMIC(state->net, error);

	return ret;
}

static int generic_packet(struct nf_conn *ct, struct sk_buff *skb,
			  enum ip_conntrack_info ctinfo)
{
	const unsigned int *timeout = nf_ct_timeout_lookup(ct);

	if (!timeout)
		timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout;

	nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
	return NF_ACCEPT;
}

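/* Dispatch the packet to the l4 tracker for this conntrack's protocol;
 * protocols without a dedicated tracker fall through to
 * generic_packet(), which only refreshes the timeout.
 */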
static int nf_conntrack_handle_packet(struct nf_conn *ct,
				      struct sk_buff *skb,
				      unsigned int dataoff,
				      enum ip_conntrack_info ctinfo,
				      const struct nf_hook_state *state)
{
	switch (nf_ct_protonum(ct)) {
	case IPPROTO_TCP:
		return nf_conntrack_tcp_packet(ct, skb, dataoff,
					       ctinfo, state);
	case IPPROTO_UDP:
		return nf_conntrack_udp_packet(ct, skb, dataoff,
					       ctinfo, state);
	case IPPROTO_ICMP:
		return nf_conntrack_icmp_packet(ct, skb, ctinfo, state);
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
	case IPPROTO_UDPLITE:
		return nf_conntrack_udplite_packet(ct, skb, dataoff,
						   ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
	case IPPROTO_SCTP:
		return nf_conntrack_sctp_packet(ct, skb, dataoff,
						ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
	case IPPROTO_DCCP:
		return nf_conntrack_dccp_packet(ct, skb, dataoff,
						ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE:
		return nf_conntrack_gre_packet(ct, skb, dataoff,
					       ctinfo, state);
#endif
	}

	return generic_packet(ct, skb, ctinfo);
}

unsigned int
nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct, *tmpl;
	u_int8_t protonum;
	int dataoff, ret;

	tmpl = nf_ct_get(skb, &ctinfo);
	if (tmpl || ctinfo == IP_CT_UNTRACKED) {
		/* Previously seen (loopback or untracked)?  Ignore. */
		if ((tmpl && !nf_ct_is_template(tmpl)) ||
		     ctinfo == IP_CT_UNTRACKED)
			return NF_ACCEPT;
		skb->_nfct = 0;
	}

	/* rcu_read_lock()ed by nf_hook_thresh */
	dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum);
	if (dataoff <= 0) {
		pr_debug("not prepared to track yet or error occurred\n");
		NF_CT_STAT_INC_ATOMIC(state->net, invalid);
		ret = NF_ACCEPT;
		goto out;
	}

	if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) {
		ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff,
					       protonum, state);
		if (ret <= 0) {
			ret = -ret;
			goto out;
		}
		/* ICMP[v6] protocol trackers may assign one conntrack. */
		if (skb->_nfct)
			goto out;
	}
repeat:
	ret = resolve_normal_ct(tmpl, skb, dataoff,
				protonum, state);
	if (ret < 0) {
		/* Too stressed to deal. */
		NF_CT_STAT_INC_ATOMIC(state->net, drop);
		ret = NF_DROP;
		goto out;
	}

	ct = nf_ct_get(skb, &ctinfo);
	if (!ct) {
		/* Not valid part of a connection */
		NF_CT_STAT_INC_ATOMIC(state->net, invalid);
		ret = NF_ACCEPT;
		goto out;
	}

	ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state);
	if (ret <= 0) {
		/* Invalid: inverse of the return code tells
		 * the netfilter core what to do
		 */
		pr_debug("nf_conntrack_in: Can't track with proto module\n");
		nf_conntrack_put(&ct->ct_general);
		skb->_nfct = 0;
		NF_CT_STAT_INC_ATOMIC(state->net, invalid);
		if (ret == -NF_DROP)
			NF_CT_STAT_INC_ATOMIC(state->net, drop);
		/* Special case: TCP tracker reports an attempt to reopen a
		 * closed/aborted connection. We have to go back and create a
		 * fresh conntrack.
		 */
		if (ret == -NF_REPEAT)
			goto repeat;
		ret = -ret;
		goto out;
	}

	if (ctinfo == IP_CT_ESTABLISHED_REPLY &&
	    !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
		nf_conntrack_event_cache(IPCT_REPLY, ct);
out:
	if (tmpl)
		nf_ct_put(tmpl);

	return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_in);

/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
 * implicitly racy: the reply tuple may be changed while lookups run.
 */
void nf_conntrack_alter_reply(struct nf_conn *ct,
			      const struct nf_conntrack_tuple *newreply)
{
	struct nf_conn_help *help = nfct_help(ct);

	/* Should be unconfirmed, so not in hash table yet */
	WARN_ON(nf_ct_is_confirmed(ct));

	pr_debug("Altering reply tuple of %p to ", ct);
	nf_ct_dump_tuple(newreply);

	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
	if (ct->master || (help && !hlist_empty(&help->expectations)))
		return;

	rcu_read_lock();
	__nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);

/* Refresh the conntrack timeout and, if do_acct is set, update the
 * accounting counters with this packet's length.
 */
void __nf_ct_refresh_acct(struct nf_conn *ct,
			  enum ip_conntrack_info ctinfo,
			  const struct sk_buff *skb,
			  u32 extra_jiffies,
			  bool do_acct)
{
	/* Only update if this is not a fixed timeout */
	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
		goto acct;

	/* If not in hash table, timer will not be active yet */
	if (nf_ct_is_confirmed(ct))
		extra_jiffies += nfct_time_stamp;

	if (READ_ONCE(ct->timeout) != extra_jiffies)
		WRITE_ONCE(ct->timeout, extra_jiffies);
acct:
	if (do_acct)
		nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);

bool nf_ct_kill_acct(struct nf_conn *ct,
		     enum ip_conntrack_info ctinfo,
		     const struct sk_buff *skb)
{
	nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);

	return nf_ct_delete(ct, 0, 0);
}
EXPORT_SYMBOL_GPL(nf_ct_kill_acct);

#if IS_ENABLED(CONFIG_NF_CT_NETLINK)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
#include <linux/mutex.h>

/* Generic function for tcp/udp/sctp/dccp and alike. */
int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
			       const struct nf_conntrack_tuple *tuple)
{
	if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) ||
	    nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port))
		goto nla_put_failure;
	return 0;

nla_put_failure:
	return -1;
}
EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);

const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
	[CTA_PROTO_SRC_PORT]  = { .type = NLA_U16 },
	[CTA_PROTO_DST_PORT]  = { .type = NLA_U16 },
};
EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);

int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
			       struct nf_conntrack_tuple *t,
			       u_int32_t flags)
{
	if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) {
		if (!tb[CTA_PROTO_SRC_PORT])
			return -EINVAL;

		t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
	}

	if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) {
		if (!tb[CTA_PROTO_DST_PORT])
			return -EINVAL;

		t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);

unsigned int nf_ct_port_nlattr_tuple_size(void)
{
	static unsigned int size __read_mostly;

	if (!size)
		size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);

	return size;
}
EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
#endif

/* Used by ipt_REJECT and ip6t_REJECT. */
static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;

	/* This ICMP is in reverse direction to the packet which caused it */
	ct = nf_ct_get(skb, &ctinfo);
	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
		ctinfo = IP_CT_RELATED_REPLY;
	else
		ctinfo = IP_CT_RELATED;

	/* Attach to new skbuff, and increment count */
	nf_ct_set(nskb, ct, ctinfo);
	nf_conntrack_get(skb_nfct(nskb));
}

static int __nf_conntrack_update(struct net *net, struct sk_buff *skb,
				 struct nf_conn *ct,
				 enum ip_conntrack_info ctinfo)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conntrack_tuple tuple;
	struct nf_nat_hook *nat_hook;
	unsigned int status;
	int dataoff;
	u16 l3num;
	u8 l4num;

	l3num = nf_ct_l3num(ct);

	dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num);
	if (dataoff <= 0)
		return -1;

	if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
			     l4num, net, &tuple))
		return -1;

	if (ct->status & IPS_SRC_NAT) {
		memcpy(tuple.src.u3.all,
		       ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all,
		       sizeof(tuple.src.u3.all));
		tuple.src.u.all =
			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all;
	}

	if (ct->status & IPS_DST_NAT) {
		memcpy(tuple.dst.u3.all,
		       ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all,
		       sizeof(tuple.dst.u3.all));
		tuple.dst.u.all =
			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all;
	}

	h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple);
	if (!h)
		return 0;

	/* Store status bits of the conntrack that is clashing to re-do NAT
	 * mangling according to what it has been done already to this packet.
	 */
	status = ct->status;

	nf_ct_put(ct);
	ct = nf_ct_tuplehash_to_ctrack(h);
	nf_ct_set(skb, ct, ctinfo);

	nat_hook = rcu_dereference(nf_nat_hook);
	if (!nat_hook)
		return 0;

	if (status & IPS_SRC_NAT &&
	    nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC,
				IP_CT_DIR_ORIGINAL) == NF_DROP)
		return -1;

	if (status & IPS_DST_NAT &&
	    nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST,
				IP_CT_DIR_ORIGINAL) == NF_DROP)
		return -1;

	return 0;
}

/* This packet is coming from userspace via nf_queue, complete the packet
 * processing after the helper invocation in nf_queue.
 */
static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct,
			       enum ip_conntrack_info ctinfo)
{
	const struct nf_conntrack_helper *helper;
	const struct nf_conn_help *help;
	int protoff;

	help = nfct_help(ct);
	if (!help)
		return 0;

	/* the helper may have been removed concurrently */
	helper = rcu_dereference(help->helper);
	if (!helper)
		return 0;

	if (!(helper->flags & NF_CT_HELPER_F_USERSPACE))
		return 0;

	switch (nf_ct_l3num(ct)) {
	case NFPROTO_IPV4:
		protoff = skb_network_offset(skb) + ip_hdrlen(skb);
		break;
#if IS_ENABLED(CONFIG_IPV6)
	case NFPROTO_IPV6: {
		__be16 frag_off;
		u8 pnum;

		pnum = ipv6_hdr(skb)->nexthdr;
		protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
					   &frag_off);
		if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
			return 0;
		break;
	}
#endif
	default:
		return 0;
	}

	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
	    !nf_is_loopback_packet(skb)) {
		if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
			NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
			return -1;
		}
	}

	/* We've seen it coming out the other side: confirm it */
	return nf_conntrack_confirm(skb) == NF_DROP ? -1 : 0;
}

static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;
	int err;

	ct = nf_ct_get(skb, &ctinfo);
	if (!ct)
		return 0;

	if (!nf_ct_is_confirmed(ct)) {
		err = __nf_conntrack_update(net, skb, ct, ctinfo);
		if (err < 0)
			return err;

		ct = nf_ct_get(skb, &ctinfo);
	}

	return nf_confirm_cthelper(skb, ct, ctinfo);
}

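/* Fill *dst_tuple from the conntrack attached to @skb or, failing that,
 * by parsing the packet and looking the tuple up in the default zone.
 */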
static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
				       const struct sk_buff *skb)
{
	const struct nf_conntrack_tuple *src_tuple;
	const struct nf_conntrack_tuple_hash *hash;
	struct nf_conntrack_tuple srctuple;
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;

	ct = nf_ct_get(skb, &ctinfo);
	if (ct) {
		src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
		memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
		return true;
	}

	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
			       NFPROTO_IPV4, dev_net(skb->dev),
			       &srctuple))
		return false;

	hash = nf_conntrack_find_get(dev_net(skb->dev),
				     &nf_ct_zone_dflt,
				     &srctuple);
	if (!hash)
		return false;

	ct = nf_ct_tuplehash_to_ctrack(hash);
	src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
	memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
	nf_ct_put(ct);

	return true;
}

/* Bring out ya dead! */
static struct nf_conn *
get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
		void *data, unsigned int *bucket)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct hlist_nulls_node *n;
	spinlock_t *lockp;

	for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
		lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
		local_bh_disable();
		nf_conntrack_lock(lockp);
		if (*bucket < nf_conntrack_htable_size) {
			hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) {
				if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
					continue;
				/* All nf_conn objects are added to hash table twice, one
				 * for original direction tuple, once for the reply tuple.
				 *
				 * Exception: In the IPS_NAT_CLASH case, only the reply
				 * tuple is added (the original tuple would clash).
				 */
				ct = nf_ct_tuplehash_to_ctrack(h);
				if (iter(ct, data))
					goto found;
			}
		}
		spin_unlock(lockp);
		local_bh_enable();
		cond_resched();
	}

	return NULL;
found:
	atomic_inc(&ct->ct_general.use);
	spin_unlock(lockp);
	local_bh_enable();
	return ct;
}

static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
				  void *data, u32 portid, int report)
{
	unsigned int bucket = 0, sequence;
	struct nf_conn *ct;

	might_sleep();

	for (;;) {
		sequence = read_seqcount_begin(&nf_conntrack_generation);

		while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
			/* Time to push up daises... */
			nf_ct_delete(ct, portid, report);
			nf_ct_put(ct);
			cond_resched();
		}

		if (!read_seqcount_retry(&nf_conntrack_generation, sequence))
			break;
		bucket = 0;
	}
}

struct iter_data {
	int (*iter)(struct nf_conn *i, void *data);
	void *data;
	struct net *net;
};

static int iter_net_only(struct nf_conn *i, void *data)
{
	struct iter_data *d = data;

	if (!net_eq(d->net, nf_ct_net(i)))
		return 0;

	return d->iter(i, d->data);
}

static void
__nf_ct_unconfirmed_destroy(struct net *net)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct nf_conntrack_tuple_hash *h;
		struct hlist_nulls_node *n;
		struct ct_pcpu *pcpu;

		pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);

		spin_lock_bh(&pcpu->lock);
		hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
			struct nf_conn *ct;

			ct = nf_ct_tuplehash_to_ctrack(h);

			/* we cannot call iter() on unconfirmed list, the
			 * owning cpu can reallocate ct->ext at any time.
			 */
			set_bit(IPS_DYING_BIT, &ct->status);
		}
		spin_unlock_bh(&pcpu->lock);
		cond_resched();
	}
}

void nf_ct_unconfirmed_destroy(struct net *net)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);

	might_sleep();

	if (atomic_read(&cnet->count) > 0) {
		__nf_ct_unconfirmed_destroy(net);
		nf_queue_nf_hook_drop(net);
		synchronize_net();
	}
}
EXPORT_SYMBOL_GPL(nf_ct_unconfirmed_destroy);

void nf_ct_iterate_cleanup_net(struct net *net,
			       int (*iter)(struct nf_conn *i, void *data),
			       void *data, u32 portid, int report)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
	struct iter_data d;

	might_sleep();

	if (atomic_read(&cnet->count) == 0)
		return;

	d.iter = iter;
	d.data = data;
	d.net = net;

	nf_ct_iterate_cleanup(iter_net_only, &d, portid, report);
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);

/**
 * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table
 * @iter: callable to destroy all conntracks
 * @data: data to pass to @iter
 *
 * Like nf_ct_iterate_cleanup, but first marks conntracks on the
 * unconfirmed lists as dying so they are skipped elsewhere.
 *
 * Can only be called in module exit path.
 */
void
nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
{
	struct net *net;

	down_read(&net_rwsem);
	for_each_net(net) {
		struct nf_conntrack_net *cnet = nf_ct_pernet(net);

		if (atomic_read(&cnet->count) == 0)
			continue;
		__nf_ct_unconfirmed_destroy(net);
		nf_queue_nf_hook_drop(net);
	}
	up_read(&net_rwsem);

	/* Need to wait for netns cleanup worker to finish, if its
	 * running -- it might have deleted a net namespace from
	 * the global list, so our __nf_ct_unconfirmed_destroy() might
	 * not have affected all namespaces.
	 */
	net_ns_barrier();

	/* a conntrack could have been unlinked from unconfirmed list
	 * before we grabbed pcpu lock in __nf_ct_unconfirmed_destroy().
	 * This makes sure its inserted into conntrack table.
	 */
	synchronize_net();

	nf_ct_iterate_cleanup(iter, data, 0, 0);
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy);

static int kill_all(struct nf_conn *i, void *data)
{
	return net_eq(nf_ct_net(i), data);
}

void nf_conntrack_cleanup_start(void)
{
	conntrack_gc_work.exiting = true;
	RCU_INIT_POINTER(ip_ct_attach, NULL);
}

void nf_conntrack_cleanup_end(void)
{
	RCU_INIT_POINTER(nf_ct_hook, NULL);
	cancel_delayed_work_sync(&conntrack_gc_work.dwork);
	kvfree(nf_conntrack_hash);

	nf_conntrack_proto_fini();
	nf_conntrack_seqadj_fini();
	nf_conntrack_labels_fini();
	nf_conntrack_helper_fini();
	nf_conntrack_timeout_fini();
	nf_conntrack_ecache_fini();
	nf_conntrack_tstamp_fini();
	nf_conntrack_acct_fini();
	nf_conntrack_expect_fini();

	kmem_cache_destroy(nf_conntrack_cachep);
}

2407
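
/*
 * Mishearing the voices in his head, our hero wonders how he's
 * supposed to kill the mall.
 */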
void nf_conntrack_cleanup_net(struct net *net)
{
	LIST_HEAD(single);

	list_add(&net->exit_list, &single);
	nf_conntrack_cleanup_net_list(&single);
}

void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
{
	int busy;
	struct net *net;
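
	/*
	 * This makes sure all current packets have passed through
	 * the netfilter framework.  Roll on, two-stage module
	 * delete...
	 */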
	synchronize_net();
i_see_dead_people:
	busy = 0;
	list_for_each_entry(net, net_exit_list, exit_list) {
		struct nf_conntrack_net *cnet = nf_ct_pernet(net);

		nf_ct_iterate_cleanup(kill_all, net, 0, 0);
		if (atomic_read(&cnet->count) != 0)
			busy = 1;
	}
	if (busy) {
		schedule();
		goto i_see_dead_people;
	}

	list_for_each_entry(net, net_exit_list, exit_list) {
		nf_conntrack_ecache_pernet_fini(net);
		nf_conntrack_expect_pernet_fini(net);
		free_percpu(net->ct.stat);
		free_percpu(net->ct.pcpu_lists);
	}
}

void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
{
	struct hlist_nulls_head *hash;
	unsigned int nr_slots, i;

	if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
		return NULL;

	BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
	nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));

	hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL);

	if (hash && nulls)
		for (i = 0; i < nr_slots; i++)
			INIT_HLIST_NULLS_HEAD(&hash[i], i);

	return hash;
}
EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);

int nf_conntrack_hash_resize(unsigned int hashsize)
{
	int i, bucket;
	unsigned int old_size;
	struct hlist_nulls_head *hash, *old_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	if (!hashsize)
		return -EINVAL;

	hash = nf_ct_alloc_hashtable(&hashsize, 1);
	if (!hash)
		return -ENOMEM;

	old_size = nf_conntrack_htable_size;
	if (old_size == hashsize) {
		kvfree(hash);
		return 0;
	}

	local_bh_disable();
	nf_conntrack_all_lock();
	write_seqcount_begin(&nf_conntrack_generation);
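
	/* Lookups in the old hash might happen in parallel, which means we
	 * might get false negatives during connection lookup. New connections
	 * created because of a false negative won't make it into the hash
	 * though since that required taking the locks.
	 */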
	for (i = 0; i < nf_conntrack_htable_size; i++) {
		while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
			h = hlist_nulls_entry(nf_conntrack_hash[i].first,
					      struct nf_conntrack_tuple_hash, hnnode);
			ct = nf_ct_tuplehash_to_ctrack(h);
			hlist_nulls_del_rcu(&h->hnnode);
			bucket = __hash_conntrack(nf_ct_net(ct),
						  &h->tuple, hashsize);
			hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
		}
	}
	old_size = nf_conntrack_htable_size;
	old_hash = nf_conntrack_hash;

	nf_conntrack_hash = hash;
	nf_conntrack_htable_size = hashsize;

	write_seqcount_end(&nf_conntrack_generation);
	nf_conntrack_all_unlock();
	local_bh_enable();

	synchronize_net();
	kvfree(old_hash);
	return 0;
}

int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
{
	unsigned int hashsize;
	int rc;

	if (current->nsproxy->net_ns != &init_net)
		return -EOPNOTSUPP;
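
	/* On boot, we can set this without any fancy locking. */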
	if (!nf_conntrack_hash)
		return param_set_uint(val, kp);

	rc = kstrtouint(val, 0, &hashsize);
	if (rc)
		return rc;

	return nf_conntrack_hash_resize(hashsize);
}

static __always_inline unsigned int total_extension_size(void)
{
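	/* remember to add new extensions below */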
	BUILD_BUG_ON(NF_CT_EXT_NUM > 9);

	return sizeof(struct nf_ct_ext) +
	       sizeof(struct nf_conn_help)
#if IS_ENABLED(CONFIG_NF_NAT)
		+ sizeof(struct nf_conn_nat)
#endif
		+ sizeof(struct nf_conn_seqadj)
		+ sizeof(struct nf_conn_acct)
#ifdef CONFIG_NF_CONNTRACK_EVENTS
		+ sizeof(struct nf_conntrack_ecache)
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
		+ sizeof(struct nf_conn_tstamp)
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
		+ sizeof(struct nf_conn_timeout)
#endif
#ifdef CONFIG_NF_CONNTRACK_LABELS
		+ sizeof(struct nf_conn_labels)
#endif
#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
		+ sizeof(struct nf_conn_synproxy)
#endif
	;
}

int nf_conntrack_init_start(void)
{
	unsigned long nr_pages = totalram_pages();
	int max_factor = 8;
	int ret = -ENOMEM;
	int i;
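
	/* struct nf_ct_ext uses u8 to store offsets/size */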
	BUILD_BUG_ON(total_extension_size() > 255u);

	seqcount_spinlock_init(&nf_conntrack_generation,
			       &nf_conntrack_locks_all_lock);

	for (i = 0; i < CONNTRACK_LOCKS; i++)
		spin_lock_init(&nf_conntrack_locks[i]);

	if (!nf_conntrack_htable_size) {
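		/* Idea from tcp.c: use 1/16384 of memory.
		 * On i386: 32MB machine has 512 buckets.
		 * >= 1GB machines have 16384 buckets.
		 * >= 4GB machines have 65536 buckets.
		 */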
		nf_conntrack_htable_size
			= (((nr_pages << PAGE_SHIFT) / 16384)
			   / sizeof(struct hlist_head));
		if (nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
			nf_conntrack_htable_size = 65536;
		else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
			nf_conntrack_htable_size = 16384;
		if (nf_conntrack_htable_size < 32)
			nf_conntrack_htable_size = 32;
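
		/* Use a max. factor of four by default to get the same max as
		 * with the old struct list_heads. When a table size is given
		 * we use the old value of 8 to avoid reducing the max.
		 * entries.
		 */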
		max_factor = 4;
	}

	nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
	if (!nf_conntrack_hash)
		return -ENOMEM;

	nf_conntrack_max = max_factor * nf_conntrack_htable_size;

	nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
						sizeof(struct nf_conn),
						NFCT_INFOMASK + 1,
						SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
	if (!nf_conntrack_cachep)
		goto err_cachep;

	ret = nf_conntrack_expect_init();
	if (ret < 0)
		goto err_expect;

	ret = nf_conntrack_acct_init();
	if (ret < 0)
		goto err_acct;

	ret = nf_conntrack_tstamp_init();
	if (ret < 0)
		goto err_tstamp;

	ret = nf_conntrack_ecache_init();
	if (ret < 0)
		goto err_ecache;

	ret = nf_conntrack_timeout_init();
	if (ret < 0)
		goto err_timeout;

	ret = nf_conntrack_helper_init();
	if (ret < 0)
		goto err_helper;

	ret = nf_conntrack_labels_init();
	if (ret < 0)
		goto err_labels;

	ret = nf_conntrack_seqadj_init();
	if (ret < 0)
		goto err_seqadj;

	ret = nf_conntrack_proto_init();
	if (ret < 0)
		goto err_proto;

	conntrack_gc_work_init(&conntrack_gc_work);
	queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);

	return 0;

err_proto:
	nf_conntrack_seqadj_fini();
err_seqadj:
	nf_conntrack_labels_fini();
err_labels:
	nf_conntrack_helper_fini();
err_helper:
	nf_conntrack_timeout_fini();
err_timeout:
	nf_conntrack_ecache_fini();
err_ecache:
	nf_conntrack_tstamp_fini();
err_tstamp:
	nf_conntrack_acct_fini();
err_acct:
	nf_conntrack_expect_fini();
err_expect:
	kmem_cache_destroy(nf_conntrack_cachep);
err_cachep:
	kvfree(nf_conntrack_hash);
	return ret;
}

static struct nf_ct_hook nf_conntrack_hook = {
	.update		= nf_conntrack_update,
	.destroy	= destroy_conntrack,
	.get_tuple_skb	= nf_conntrack_get_tuple_skb,
};

void nf_conntrack_init_end(void)
{
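	/* For use by REJECT target */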
	RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
	RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook);
}
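
/*
 * We need to use special "null" values, not used in hash table
 */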
#define UNCONFIRMED_NULLS_VAL	((1<<30)+0)
#define DYING_NULLS_VAL		((1<<30)+1)

int nf_conntrack_init_net(struct net *net)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
	int ret = -ENOMEM;
	int cpu;

	BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER);
	BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS);
	atomic_set(&cnet->count, 0);

	net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
	if (!net->ct.pcpu_lists)
		goto err_stat;

	for_each_possible_cpu(cpu) {
		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);

		spin_lock_init(&pcpu->lock);
		INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
		INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
	}

	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
	if (!net->ct.stat)
		goto err_pcpu_lists;

	ret = nf_conntrack_expect_pernet_init(net);
	if (ret < 0)
		goto err_expect;

	nf_conntrack_acct_pernet_init(net);
	nf_conntrack_tstamp_pernet_init(net);
	nf_conntrack_ecache_pernet_init(net);
	nf_conntrack_helper_pernet_init(net);
	nf_conntrack_proto_pernet_init(net);

	return 0;

err_expect:
	free_percpu(net->ct.stat);
err_pcpu_lists:
	free_percpu(net->ct.pcpu_lists);
err_stat:
	return ret;
}