// SPDX-License-Identifier: GPL-2.0-only
/* Connection state tracking for netfilter.  This is separated from,
 * but required by, the NAT layer; it can also be used by an iptables
 * extension.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/siphash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/mm.h>
#include <linux/nsproxy.h>
#include <linux/rculist_nulls.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netns/hash.h>
#include <net/ip.h>

#include "nf_internals.h"
57
__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
EXPORT_SYMBOL_GPL(nf_conntrack_locks);

__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);

struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_hash);

struct conntrack_gc_work {
	struct delayed_work	dwork;
	u32			last_bucket;
	bool			exiting;
	bool			early_drop;
	long			next_gc_run;
};

static __read_mostly struct kmem_cache *nf_conntrack_cachep;
static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;

/* Scan at most 1/128th of the hash table per gc run. */
#define GC_MAX_BUCKETS_DIV	128u
/* Upper bound of a full table scan: roughly 16 seconds. */
#define GC_MAX_SCAN_JIFFIES	(16u * HZ)
/* If at least this percentage of scanned entries were expired, rescan soon. */
#define GC_EVICT_RATIO		50u

static struct conntrack_gc_work conntrack_gc_work;

void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
{
	/* 1) Acquire the lock */
	spin_lock(lock);

	/* 2) read nf_conntrack_locks_all, with ACQUIRE semantics.
	 * It pairs with the smp_store_release() in nf_conntrack_all_unlock().
	 */
	if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false))
		return;

	/* fast path failed, unlock */
	spin_unlock(lock);

	/* Slow path 1) get global lock */
	spin_lock(&nf_conntrack_locks_all_lock);

	/* Slow path 2) get the lock we want */
	spin_lock(lock);

	/* Slow path 3) release the global lock */
	spin_unlock(&nf_conntrack_locks_all_lock);
}
EXPORT_SYMBOL_GPL(nf_conntrack_lock);

static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	spin_unlock(&nf_conntrack_locks[h1]);
	if (h1 != h2)
		spin_unlock(&nf_conntrack_locks[h2]);
}

/* return true if we need to recompute hashes (in case hash table was resized) */
static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
				     unsigned int h2, unsigned int sequence)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	if (h1 <= h2) {
		nf_conntrack_lock(&nf_conntrack_locks[h1]);
		if (h1 != h2)
			spin_lock_nested(&nf_conntrack_locks[h2],
					 SINGLE_DEPTH_NESTING);
	} else {
		nf_conntrack_lock(&nf_conntrack_locks[h2]);
		spin_lock_nested(&nf_conntrack_locks[h1],
				 SINGLE_DEPTH_NESTING);
	}
	if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
		nf_conntrack_double_unlock(h1, h2);
		return true;
	}
	return false;
}

static void nf_conntrack_all_lock(void)
	__acquires(&nf_conntrack_locks_all_lock)
{
	int i;

	spin_lock(&nf_conntrack_locks_all_lock);

	nf_conntrack_locks_all = true;

	for (i = 0; i < CONNTRACK_LOCKS; i++) {
		spin_lock(&nf_conntrack_locks[i]);

		/* This spin_unlock provides the "release" semantics required
		 * by the smp_load_acquire() in nf_conntrack_lock(), so that
		 * the locks_all load has to happen after the store above.
		 */
		spin_unlock(&nf_conntrack_locks[i]);
	}
}

static void nf_conntrack_all_unlock(void)
	__releases(&nf_conntrack_locks_all_lock)
{
	/* All prior stores must be complete before we clear
	 * 'nf_conntrack_locks_all'.  Otherwise nf_conntrack_lock() might
	 * observe the false value but not the entire critical section.
	 * It pairs with the smp_load_acquire() in nf_conntrack_lock().
	 */
	smp_store_release(&nf_conntrack_locks_all, false);
	spin_unlock(&nf_conntrack_locks_all_lock);
}

unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);

unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);
seqcount_t nf_conntrack_generation __read_mostly;
static unsigned int nf_conntrack_hash_rnd __read_mostly;

static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
			      const struct net *net)
{
	unsigned int n;
	u32 seed;

	get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));

	/* The direction must be ignored, so we hash everything up to the
	 * destination ports (which is a multiple of 4) and treat the last
	 * three bytes manually.
	 */
	seed = nf_conntrack_hash_rnd ^ net_hash_mix(net);
	n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
	return jhash2((u32 *)tuple, n, seed ^
		      (((__force __u16)tuple->dst.u.all << 16) |
		      tuple->dst.protonum));
}

static u32 scale_hash(u32 hash)
{
	return reciprocal_scale(hash, nf_conntrack_htable_size);
}

static u32 __hash_conntrack(const struct net *net,
			    const struct nf_conntrack_tuple *tuple,
			    unsigned int size)
{
	return reciprocal_scale(hash_conntrack_raw(tuple, net), size);
}

static u32 hash_conntrack(const struct net *net,
			  const struct nf_conntrack_tuple *tuple)
{
	return scale_hash(hash_conntrack_raw(tuple, net));
}

static bool nf_ct_get_tuple_ports(const struct sk_buff *skb,
				  unsigned int dataoff,
				  struct nf_conntrack_tuple *tuple)
{
	struct {
		__be16 sport;
		__be16 dport;
	} _inet_hdr, *inet_hdr;

	/* Actually only need first 4 bytes to get ports. */
	inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr);
	if (!inet_hdr)
		return false;

	tuple->src.u.udp.port = inet_hdr->sport;
	tuple->dst.u.udp.port = inet_hdr->dport;
	return true;
}
240
241static bool
242nf_ct_get_tuple(const struct sk_buff *skb,
243 unsigned int nhoff,
244 unsigned int dataoff,
245 u_int16_t l3num,
246 u_int8_t protonum,
247 struct net *net,
248 struct nf_conntrack_tuple *tuple)
249{
250 unsigned int size;
251 const __be32 *ap;
252 __be32 _addrs[8];
253
254 memset(tuple, 0, sizeof(*tuple));
255
256 tuple->src.l3num = l3num;
257 switch (l3num) {
258 case NFPROTO_IPV4:
259 nhoff += offsetof(struct iphdr, saddr);
260 size = 2 * sizeof(__be32);
261 break;
262 case NFPROTO_IPV6:
263 nhoff += offsetof(struct ipv6hdr, saddr);
264 size = sizeof(_addrs);
265 break;
266 default:
267 return true;
268 }
269
270 ap = skb_header_pointer(skb, nhoff, size, _addrs);
271 if (!ap)
272 return false;
273
274 switch (l3num) {
275 case NFPROTO_IPV4:
276 tuple->src.u3.ip = ap[0];
277 tuple->dst.u3.ip = ap[1];
278 break;
279 case NFPROTO_IPV6:
280 memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
281 memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
282 break;
283 }
284
285 tuple->dst.protonum = protonum;
286 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
287
288 switch (protonum) {
289#if IS_ENABLED(CONFIG_IPV6)
290 case IPPROTO_ICMPV6:
291 return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple);
292#endif
293 case IPPROTO_ICMP:
294 return icmp_pkt_to_tuple(skb, dataoff, net, tuple);
295#ifdef CONFIG_NF_CT_PROTO_GRE
296 case IPPROTO_GRE:
297 return gre_pkt_to_tuple(skb, dataoff, net, tuple);
298#endif
299 case IPPROTO_TCP:
300 case IPPROTO_UDP:
301 return nf_ct_get_tuple_ports(skb, dataoff, tuple);
302#ifdef CONFIG_NF_CT_PROTO_UDPLITE
303 case IPPROTO_UDPLITE:
304 return nf_ct_get_tuple_ports(skb, dataoff, tuple);
305#endif
306#ifdef CONFIG_NF_CT_PROTO_SCTP
307 case IPPROTO_SCTP:
308 return nf_ct_get_tuple_ports(skb, dataoff, tuple);
309#endif
310#ifdef CONFIG_NF_CT_PROTO_DCCP
311 case IPPROTO_DCCP:
312 return nf_ct_get_tuple_ports(skb, dataoff, tuple);
313#endif
314 default:
315 break;
316 }
317
318 return true;
319}
320
321static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
322 u_int8_t *protonum)
323{
324 int dataoff = -1;
325 const struct iphdr *iph;
326 struct iphdr _iph;
327
328 iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
329 if (!iph)
330 return -1;
331
332
333
334
335 if (iph->frag_off & htons(IP_OFFSET))
336 return -1;
337
338 dataoff = nhoff + (iph->ihl << 2);
339 *protonum = iph->protocol;
340
341
342 if (dataoff > skb->len) {
343 pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
344 nhoff, iph->ihl << 2, skb->len);
345 return -1;
346 }
347 return dataoff;
348}
349
350#if IS_ENABLED(CONFIG_IPV6)
351static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
352 u8 *protonum)
353{
354 int protoff = -1;
355 unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
356 __be16 frag_off;
357 u8 nexthdr;
358
359 if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
360 &nexthdr, sizeof(nexthdr)) != 0) {
361 pr_debug("can't get nexthdr\n");
362 return -1;
363 }
364 protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
365
366
367
368
369 if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
370 pr_debug("can't find proto in pkt\n");
371 return -1;
372 }
373
374 *protonum = nexthdr;
375 return protoff;
376}
377#endif
378
379static int get_l4proto(const struct sk_buff *skb,
380 unsigned int nhoff, u8 pf, u8 *l4num)
381{
382 switch (pf) {
383 case NFPROTO_IPV4:
384 return ipv4_get_l4proto(skb, nhoff, l4num);
385#if IS_ENABLED(CONFIG_IPV6)
386 case NFPROTO_IPV6:
387 return ipv6_get_l4proto(skb, nhoff, l4num);
388#endif
389 default:
390 *l4num = 0;
391 break;
392 }
393 return -1;
394}
395
396bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
397 u_int16_t l3num,
398 struct net *net, struct nf_conntrack_tuple *tuple)
399{
400 u8 protonum;
401 int protoff;
402
403 protoff = get_l4proto(skb, nhoff, l3num, &protonum);
404 if (protoff <= 0)
405 return false;
406
407 return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple);
408}
409EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);
410
411bool
412nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
413 const struct nf_conntrack_tuple *orig)
414{
415 memset(inverse, 0, sizeof(*inverse));
416
417 inverse->src.l3num = orig->src.l3num;
418
419 switch (orig->src.l3num) {
420 case NFPROTO_IPV4:
421 inverse->src.u3.ip = orig->dst.u3.ip;
422 inverse->dst.u3.ip = orig->src.u3.ip;
423 break;
424 case NFPROTO_IPV6:
425 inverse->src.u3.in6 = orig->dst.u3.in6;
426 inverse->dst.u3.in6 = orig->src.u3.in6;
427 break;
428 default:
429 break;
430 }
431
432 inverse->dst.dir = !orig->dst.dir;
433
434 inverse->dst.protonum = orig->dst.protonum;
435
436 switch (orig->dst.protonum) {
437 case IPPROTO_ICMP:
438 return nf_conntrack_invert_icmp_tuple(inverse, orig);
439#if IS_ENABLED(CONFIG_IPV6)
440 case IPPROTO_ICMPV6:
441 return nf_conntrack_invert_icmpv6_tuple(inverse, orig);
442#endif
443 }
444
445 inverse->src.u.all = orig->dst.u.all;
446 inverse->dst.u.all = orig->src.u.all;
447 return true;
448}
449EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);

/* Generate an almost-unique pseudo-id for a given conntrack.
 *
 * Intentionally doesn't re-use any of the seeds used for hash table
 * location; we assume a user might want to query the ids without causing
 * any hash-table lookups.
 *
 * The id is siphashed from the conntrack, master and netns pointers plus
 * the original-direction tuple, so it cannot be used to recover kernel
 * addresses.
 */
464u32 nf_ct_get_id(const struct nf_conn *ct)
465{
466 static __read_mostly siphash_key_t ct_id_seed;
467 unsigned long a, b, c, d;
468
469 net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));
470
471 a = (unsigned long)ct;
472 b = (unsigned long)ct->master;
473 c = (unsigned long)nf_ct_net(ct);
474 d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
475 sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple),
476 &ct_id_seed);
477#ifdef CONFIG_64BIT
478 return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed);
479#else
480 return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed);
481#endif
482}
483EXPORT_SYMBOL_GPL(nf_ct_get_id);
484
485static void
486clean_from_lists(struct nf_conn *ct)
487{
488 pr_debug("clean_from_lists(%p)\n", ct);
489 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
490 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
491
492
493 nf_ct_remove_expectations(ct);
494}
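/* Add this conntrack to the (per cpu) dying list.  Only the ORIGINAL
 * direction node is used as the list link, so the entry stays reachable
 * for dumpers until the final reference is dropped.
 */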
497static void nf_ct_add_to_dying_list(struct nf_conn *ct)
498{
499 struct ct_pcpu *pcpu;
500
501
502 ct->cpu = smp_processor_id();
503 pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
504
505 spin_lock(&pcpu->lock);
506 hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
507 &pcpu->dying);
508 spin_unlock(&pcpu->lock);
509}
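/* Add this conntrack to the (per cpu) unconfirmed list: entries live here
 * between allocation and confirmation by __nf_conntrack_confirm().
 */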
512static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
513{
514 struct ct_pcpu *pcpu;
515
516
517 ct->cpu = smp_processor_id();
518 pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
519
520 spin_lock(&pcpu->lock);
521 hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
522 &pcpu->unconfirmed);
523 spin_unlock(&pcpu->lock);
524}
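/* Unlink the conntrack from whichever per-cpu list (unconfirmed or dying)
 * it currently sits on; ct->cpu remembers which cpu's list that is.
 * Callers run with bottom halves disabled.
 */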
527static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
528{
529 struct ct_pcpu *pcpu;
530
531
532 pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
533
534 spin_lock(&pcpu->lock);
535 BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
536 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
537 spin_unlock(&pcpu->lock);
538}
539
#define NFCT_ALIGN(len)	(((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)

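/* Allocate a conntrack template.  When the slab minimum alignment is not
 * sufficient for the ct pointer bits stored in skb->_nfct, over-allocate
 * and align the pointer manually; the padding is remembered in
 * proto.tmpl_padto so nf_ct_tmpl_free() can find the original allocation.
 */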
543struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
544 const struct nf_conntrack_zone *zone,
545 gfp_t flags)
546{
547 struct nf_conn *tmpl, *p;
548
549 if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) {
550 tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags);
551 if (!tmpl)
552 return NULL;
553
554 p = tmpl;
555 tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
556 if (tmpl != p) {
557 tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
558 tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
559 }
560 } else {
561 tmpl = kzalloc(sizeof(*tmpl), flags);
562 if (!tmpl)
563 return NULL;
564 }
565
566 tmpl->status = IPS_TEMPLATE;
567 write_pnet(&tmpl->ct_net, net);
568 nf_ct_zone_add(tmpl, zone);
569 atomic_set(&tmpl->ct_general.use, 0);
570
571 return tmpl;
572}
573EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
574
575void nf_ct_tmpl_free(struct nf_conn *tmpl)
576{
577 nf_ct_ext_destroy(tmpl);
578
579 if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
580 kfree((char *)tmpl - tmpl->proto.tmpl_padto);
581 else
582 kfree(tmpl);
583}
584EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);
585
586static void destroy_gre_conntrack(struct nf_conn *ct)
587{
588#ifdef CONFIG_NF_CT_PROTO_GRE
589 struct nf_conn *master = ct->master;
590
591 if (master)
592 nf_ct_gre_keymap_destroy(master);
593#endif
594}
595
596static void
597destroy_conntrack(struct nf_conntrack *nfct)
598{
599 struct nf_conn *ct = (struct nf_conn *)nfct;
600
601 pr_debug("destroy_conntrack(%p)\n", ct);
602 WARN_ON(atomic_read(&nfct->use) != 0);
603
604 if (unlikely(nf_ct_is_template(ct))) {
605 nf_ct_tmpl_free(ct);
606 return;
607 }
608
609 if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
610 destroy_gre_conntrack(ct);
611
612 local_bh_disable();
613
614
615
616
617
618 nf_ct_remove_expectations(ct);
619
620 nf_ct_del_from_dying_or_unconfirmed_list(ct);
621
622 local_bh_enable();
623
624 if (ct->master)
625 nf_ct_put(ct->master);
626
627 pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
628 nf_conntrack_free(ct);
629}
630
631static void nf_ct_delete_from_lists(struct nf_conn *ct)
632{
633 struct net *net = nf_ct_net(ct);
634 unsigned int hash, reply_hash;
635 unsigned int sequence;
636
637 nf_ct_helper_destroy(ct);
638
639 local_bh_disable();
640 do {
641 sequence = read_seqcount_begin(&nf_conntrack_generation);
642 hash = hash_conntrack(net,
643 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
644 reply_hash = hash_conntrack(net,
645 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
646 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
647
648 clean_from_lists(ct);
649 nf_conntrack_double_unlock(hash, reply_hash);
650
651 nf_ct_add_to_dying_list(ct);
652
653 local_bh_enable();
654}
655
656bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
657{
658 struct nf_conn_tstamp *tstamp;
659
660 if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
661 return false;
662
663 tstamp = nf_conn_tstamp_find(ct);
664 if (tstamp && tstamp->stop == 0)
665 tstamp->stop = ktime_get_real_ns();
666
667 if (nf_conntrack_event_report(IPCT_DESTROY, ct,
668 portid, report) < 0) {
669
670
671
672 nf_ct_delete_from_lists(ct);
673 nf_conntrack_ecache_delayed_work(nf_ct_net(ct));
674 return false;
675 }
676
677 nf_conntrack_ecache_work(nf_ct_net(ct));
678 nf_ct_delete_from_lists(ct);
679 nf_ct_put(ct);
680 return true;
681}
682EXPORT_SYMBOL_GPL(nf_ct_delete);
683
684static inline bool
685nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
686 const struct nf_conntrack_tuple *tuple,
687 const struct nf_conntrack_zone *zone,
688 const struct net *net)
689{
690 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
691
692
693
694
695 return nf_ct_tuple_equal(tuple, &h->tuple) &&
696 nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
697 nf_ct_is_confirmed(ct) &&
698 net_eq(net, nf_ct_net(ct));
699}
700
701static inline bool
702nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
703{
704 return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
705 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
706 nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
707 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
708 nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
709 nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
710 net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
711}
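/* Kill an expired entry found during a lookup or table walk: take a
 * reference (the entry might otherwise be freed under us) and let
 * nf_ct_kill() do the actual removal.
 */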
714static void nf_ct_gc_expired(struct nf_conn *ct)
715{
716 if (!atomic_inc_not_zero(&ct->ct_general.use))
717 return;
718
719 if (nf_ct_should_gc(ct))
720 nf_ct_kill(ct);
721
722 nf_ct_put(ct);
723}
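/*
 * Warning:
 * - The caller must take a reference on the returned object and
 *   re-check nf_ct_key_equal() on the tuple afterwards.
 */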
730static struct nf_conntrack_tuple_hash *
731____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
732 const struct nf_conntrack_tuple *tuple, u32 hash)
733{
734 struct nf_conntrack_tuple_hash *h;
735 struct hlist_nulls_head *ct_hash;
736 struct hlist_nulls_node *n;
737 unsigned int bucket, hsize;
738
739begin:
740 nf_conntrack_get_ht(&ct_hash, &hsize);
741 bucket = reciprocal_scale(hash, hsize);
742
743 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
744 struct nf_conn *ct;
745
746 ct = nf_ct_tuplehash_to_ctrack(h);
747 if (nf_ct_is_expired(ct)) {
748 nf_ct_gc_expired(ct);
749 continue;
750 }
751
752 if (nf_ct_key_equal(h, tuple, zone, net))
753 return h;
754 }
755
756
757
758
759
760 if (get_nulls_value(n) != bucket) {
761 NF_CT_STAT_INC_ATOMIC(net, search_restart);
762 goto begin;
763 }
764
765 return NULL;
766}
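/* Find a connection matching the tuple and grab a reference.  The key is
 * re-checked after the refcount is taken because the entry may have been
 * recycled for a different tuple in the meantime.
 */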
769static struct nf_conntrack_tuple_hash *
770__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
771 const struct nf_conntrack_tuple *tuple, u32 hash)
772{
773 struct nf_conntrack_tuple_hash *h;
774 struct nf_conn *ct;
775
776 rcu_read_lock();
777
778 h = ____nf_conntrack_find(net, zone, tuple, hash);
779 if (h) {
780
781
782
783 ct = nf_ct_tuplehash_to_ctrack(h);
784 if (likely(atomic_inc_not_zero(&ct->ct_general.use))) {
785 if (likely(nf_ct_key_equal(h, tuple, zone, net)))
786 goto found;
787
788
789 nf_ct_put(ct);
790 }
791
792 h = NULL;
793 }
794found:
795 rcu_read_unlock();
796
797 return h;
798}
799
800struct nf_conntrack_tuple_hash *
801nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
802 const struct nf_conntrack_tuple *tuple)
803{
804 return __nf_conntrack_find_get(net, zone, tuple,
805 hash_conntrack_raw(tuple, net));
806}
807EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
808
809static void __nf_conntrack_hash_insert(struct nf_conn *ct,
810 unsigned int hash,
811 unsigned int reply_hash)
812{
813 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
814 &nf_conntrack_hash[hash]);
815 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
816 &nf_conntrack_hash[reply_hash]);
817}
818
819int
820nf_conntrack_hash_check_insert(struct nf_conn *ct)
821{
822 const struct nf_conntrack_zone *zone;
823 struct net *net = nf_ct_net(ct);
824 unsigned int hash, reply_hash;
825 struct nf_conntrack_tuple_hash *h;
826 struct hlist_nulls_node *n;
827 unsigned int sequence;
828
829 zone = nf_ct_zone(ct);
830
831 local_bh_disable();
832 do {
833 sequence = read_seqcount_begin(&nf_conntrack_generation);
834 hash = hash_conntrack(net,
835 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
836 reply_hash = hash_conntrack(net,
837 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
838 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
839
840
841 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
842 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
843 zone, net))
844 goto out;
845
846 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
847 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
848 zone, net))
849 goto out;
850
851 smp_wmb();
852
853 atomic_set(&ct->ct_general.use, 2);
854 __nf_conntrack_hash_insert(ct, hash, reply_hash);
855 nf_conntrack_double_unlock(hash, reply_hash);
856 NF_CT_STAT_INC(net, insert);
857 local_bh_enable();
858 return 0;
859
860out:
861 nf_conntrack_double_unlock(hash, reply_hash);
862 NF_CT_STAT_INC(net, insert_failed);
863 local_bh_enable();
864 return -EEXIST;
865}
866EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
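/* Credit @packets and @bytes to the accounting extension for direction
 * @dir, if accounting is enabled on this conntrack.
 */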
868void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets,
869 unsigned int bytes)
870{
871 struct nf_conn_acct *acct;
872
873 acct = nf_conn_acct_find(ct);
874 if (acct) {
875 struct nf_conn_counter *counter = acct->counter;
876
877 atomic64_add(packets, &counter[dir].packets);
878 atomic64_add(bytes, &counter[dir].bytes);
879 }
880}
881EXPORT_SYMBOL_GPL(nf_ct_acct_add);
882
883static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
884 const struct nf_conn *loser_ct)
885{
886 struct nf_conn_acct *acct;
887
888 acct = nf_conn_acct_find(loser_ct);
889 if (acct) {
890 struct nf_conn_counter *counter = acct->counter;
891 unsigned int bytes;
892
893
894 bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
895 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
896 }
897}
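/* Take the reference that the hash table will own, mark the entry
 * confirmed and record the flow start time if timestamping is enabled.
 */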
899static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
900{
901 struct nf_conn_tstamp *tstamp;
902
903 atomic_inc(&ct->ct_general.use);
904 ct->status |= IPS_CONFIRMED;
905
906
907 tstamp = nf_conn_tstamp_find(ct);
908 if (tstamp)
909 tstamp->start = ktime_get_real_ns();
910}
911
912static int __nf_ct_resolve_clash(struct sk_buff *skb,
913 struct nf_conntrack_tuple_hash *h)
914{
915
916 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
917 enum ip_conntrack_info ctinfo;
918 struct nf_conn *loser_ct;
919
920 loser_ct = nf_ct_get(skb, &ctinfo);
921
922 if (nf_ct_is_dying(ct))
923 return NF_DROP;
924
925 if (!atomic_inc_not_zero(&ct->ct_general.use))
926 return NF_DROP;
927
928 if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
929 nf_ct_match(ct, loser_ct)) {
930 struct net *net = nf_ct_net(ct);
931
932 nf_ct_acct_merge(ct, ctinfo, loser_ct);
933 nf_ct_add_to_dying_list(loser_ct);
934 nf_conntrack_put(&loser_ct->ct_general);
935 nf_ct_set(skb, ct, ctinfo);
936
937 NF_CT_STAT_INC(net, insert_failed);
938 return NF_ACCEPT;
939 }
940
941 nf_ct_put(ct);
942 return NF_DROP;
943}

/**
 * nf_ct_resolve_clash_harder - attempt to insert the clashing conntrack entry
 * @skb: skb that caused the collision
 * @repl_idx: hash slot for the reply direction
 *
 * Called when the origin or reply direction clashed with an existing entry.
 * The skb can still be handled without a packet drop provided the reply
 * direction is unique, or the existing entry has identical tuples in both
 * directions.  Otherwise the losing conntrack is inserted only in the
 * non-clashing reply slot, with a short fixed timeout (IPS_NAT_CLASH).
 *
 * The caller must hold the conntrack table locks to prevent concurrent
 * updates.
 *
 * Returns NF_DROP if the clash could not be handled.
 */
960static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
961{
962 struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
963 const struct nf_conntrack_zone *zone;
964 struct nf_conntrack_tuple_hash *h;
965 struct hlist_nulls_node *n;
966 struct net *net;
967
968 zone = nf_ct_zone(loser_ct);
969 net = nf_ct_net(loser_ct);
970
971
972
973
974 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
975 if (nf_ct_key_equal(h,
976 &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
977 zone, net))
978 return __nf_ct_resolve_clash(skb, h);
979 }
980
981
982 loser_ct->timeout = nfct_time_stamp + HZ;
983
984
985
986
987
988
989 loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;
990
991 __nf_conntrack_insert_prepare(loser_ct);
992
993
994
995
996
997 hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
998
999 hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
1000 &nf_conntrack_hash[repl_idx]);
1001 return NF_ACCEPT;
1002}

/**
 * nf_ct_resolve_clash - attempt to handle a clash without dropping the packet
 * @skb: skb that caused the clash
 * @h: tuplehash of the clashing entry already in the table
 * @reply_hash: hash slot for the reply direction
 *
 * A conntrack entry can only be inserted into the table if no existing
 * entry has an identical tuple.  If there is one, @skb and its unconfirmed
 * conntrack normally have to be dropped, which is expensive: the sender
 * has to retransmit or time out before the retransmit finds the (by then
 * confirmed) existing entry.
 *
 * This function tries to avoid the drop.  It is only attempted for
 * protocols whose trackers set ->allow_clash (e.g. UDP).  If the colliding
 * entries are identical, or the existing entry has not been subject to
 * NAT, the new unconfirmed conntrack is discarded and @skb is associated
 * with the entry already in the table.  Failing that,
 * nf_ct_resolve_clash_harder() may still insert the new entry in the
 * non-clashing reply direction only.
 *
 * Returns NF_DROP if the clash could not be resolved.
 */
1037static __cold noinline int
1038nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
1039 u32 reply_hash)
1040{
1041
1042 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
1043 const struct nf_conntrack_l4proto *l4proto;
1044 enum ip_conntrack_info ctinfo;
1045 struct nf_conn *loser_ct;
1046 struct net *net;
1047 int ret;
1048
1049 loser_ct = nf_ct_get(skb, &ctinfo);
1050 net = nf_ct_net(loser_ct);
1051
1052 l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
1053 if (!l4proto->allow_clash)
1054 goto drop;
1055
1056 ret = __nf_ct_resolve_clash(skb, h);
1057 if (ret == NF_ACCEPT)
1058 return ret;
1059
1060 ret = nf_ct_resolve_clash_harder(skb, reply_hash);
1061 if (ret == NF_ACCEPT)
1062 return ret;
1063
1064drop:
1065 nf_ct_add_to_dying_list(loser_ct);
1066 NF_CT_STAT_INC(net, drop);
1067 NF_CT_STAT_INC(net, insert_failed);
1068 return NF_DROP;
1069}
1070
1071
1072int
1073__nf_conntrack_confirm(struct sk_buff *skb)
1074{
1075 const struct nf_conntrack_zone *zone;
1076 unsigned int hash, reply_hash;
1077 struct nf_conntrack_tuple_hash *h;
1078 struct nf_conn *ct;
1079 struct nf_conn_help *help;
1080 struct hlist_nulls_node *n;
1081 enum ip_conntrack_info ctinfo;
1082 struct net *net;
1083 unsigned int sequence;
1084 int ret = NF_DROP;
1085
1086 ct = nf_ct_get(skb, &ctinfo);
1087 net = nf_ct_net(ct);
1088
1089
1090
1091
1092
1093 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
1094 return NF_ACCEPT;
1095
1096 zone = nf_ct_zone(ct);
1097 local_bh_disable();
1098
1099 do {
1100 sequence = read_seqcount_begin(&nf_conntrack_generation);
1101
1102 hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
1103 hash = scale_hash(hash);
1104 reply_hash = hash_conntrack(net,
1105 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
1106
1107 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119 if (unlikely(nf_ct_is_confirmed(ct))) {
1120 WARN_ON_ONCE(1);
1121 nf_conntrack_double_unlock(hash, reply_hash);
1122 local_bh_enable();
1123 return NF_DROP;
1124 }
1125
1126 pr_debug("Confirming conntrack %p\n", ct);
1127
1128
1129
1130
1131
1132 nf_ct_del_from_dying_or_unconfirmed_list(ct);
1133
1134 if (unlikely(nf_ct_is_dying(ct))) {
1135 nf_ct_add_to_dying_list(ct);
1136 NF_CT_STAT_INC(net, insert_failed);
1137 goto dying;
1138 }
1139
1140
1141
1142
1143 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
1144 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
1145 zone, net))
1146 goto out;
1147
1148 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
1149 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
1150 zone, net))
1151 goto out;

	/* Timeout is relative to the confirmation time, not the original
	 * setting time, otherwise we would get timer wrap in odd delay
	 * cases.
	 */
	ct->timeout += nfct_time_stamp;
1157
1158 __nf_conntrack_insert_prepare(ct);
1159
1160
1161
1162
1163
1164
1165 __nf_conntrack_hash_insert(ct, hash, reply_hash);
1166 nf_conntrack_double_unlock(hash, reply_hash);
1167 local_bh_enable();
1168
1169 help = nfct_help(ct);
1170 if (help && help->helper)
1171 nf_conntrack_event_cache(IPCT_HELPER, ct);
1172
1173 nf_conntrack_event_cache(master_ct(ct) ?
1174 IPCT_RELATED : IPCT_NEW, ct);
1175 return NF_ACCEPT;
1176
1177out:
1178 ret = nf_ct_resolve_clash(skb, h, reply_hash);
1179dying:
1180 nf_conntrack_double_unlock(hash, reply_hash);
1181 local_bh_enable();
1182 return ret;
1183}
1184EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);

/* Returns true if a connection corresponding to the tuple already exists
 * (ignoring @ignored_conntrack); required for NAT port allocation.
 */
1188int
1189nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
1190 const struct nf_conn *ignored_conntrack)
1191{
1192 struct net *net = nf_ct_net(ignored_conntrack);
1193 const struct nf_conntrack_zone *zone;
1194 struct nf_conntrack_tuple_hash *h;
1195 struct hlist_nulls_head *ct_hash;
1196 unsigned int hash, hsize;
1197 struct hlist_nulls_node *n;
1198 struct nf_conn *ct;
1199
1200 zone = nf_ct_zone(ignored_conntrack);
1201
1202 rcu_read_lock();
1203 begin:
1204 nf_conntrack_get_ht(&ct_hash, &hsize);
1205 hash = __hash_conntrack(net, tuple, hsize);
1206
1207 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
1208 ct = nf_ct_tuplehash_to_ctrack(h);
1209
1210 if (ct == ignored_conntrack)
1211 continue;
1212
1213 if (nf_ct_is_expired(ct)) {
1214 nf_ct_gc_expired(ct);
1215 continue;
1216 }
1217
1218 if (nf_ct_key_equal(h, tuple, zone, net)) {
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231 if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
1232 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple))
1233 continue;
1234
1235 NF_CT_STAT_INC_ATOMIC(net, found);
1236 rcu_read_unlock();
1237 return 1;
1238 }
1239 }
1240
1241 if (get_nulls_value(n) != hash) {
1242 NF_CT_STAT_INC_ATOMIC(net, search_restart);
1243 goto begin;
1244 }
1245
1246 rcu_read_unlock();
1247
1248 return 0;
1249}
1250EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
1251
1252#define NF_CT_EVICTION_RANGE 8
1253
1254
1255
1256static unsigned int early_drop_list(struct net *net,
1257 struct hlist_nulls_head *head)
1258{
1259 struct nf_conntrack_tuple_hash *h;
1260 struct hlist_nulls_node *n;
1261 unsigned int drops = 0;
1262 struct nf_conn *tmp;
1263
1264 hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
1265 tmp = nf_ct_tuplehash_to_ctrack(h);
1266
1267 if (test_bit(IPS_OFFLOAD_BIT, &tmp->status))
1268 continue;
1269
1270 if (nf_ct_is_expired(tmp)) {
1271 nf_ct_gc_expired(tmp);
1272 continue;
1273 }
1274
1275 if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
1276 !net_eq(nf_ct_net(tmp), net) ||
1277 nf_ct_is_dying(tmp))
1278 continue;
1279
1280 if (!atomic_inc_not_zero(&tmp->ct_general.use))
1281 continue;
1282
1283
1284
1285
1286
1287
1288
1289
1290 if (net_eq(nf_ct_net(tmp), net) &&
1291 nf_ct_is_confirmed(tmp) &&
1292 nf_ct_delete(tmp, 0, 0))
1293 drops++;
1294
1295 nf_ct_put(tmp);
1296 }
1297
1298 return drops;
1299}
1300
1301static noinline int early_drop(struct net *net, unsigned int hash)
1302{
1303 unsigned int i, bucket;
1304
1305 for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
1306 struct hlist_nulls_head *ct_hash;
1307 unsigned int hsize, drops;
1308
1309 rcu_read_lock();
1310 nf_conntrack_get_ht(&ct_hash, &hsize);
1311 if (!i)
1312 bucket = reciprocal_scale(hash, hsize);
1313 else
1314 bucket = (bucket + 1) % hsize;
1315
1316 drops = early_drop_list(net, &ct_hash[bucket]);
1317 rcu_read_unlock();
1318
1319 if (drops) {
1320 NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
1321 return true;
1322 }
1323 }
1324
1325 return false;
1326}
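/* Unconfirmed or already-dying entries are skipped by the gc worker and
 * are never candidates for early drop.
 */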
1328static bool gc_worker_skip_ct(const struct nf_conn *ct)
1329{
1330 return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct);
1331}
1332
1333static bool gc_worker_can_early_drop(const struct nf_conn *ct)
1334{
1335 const struct nf_conntrack_l4proto *l4proto;
1336
1337 if (!test_bit(IPS_ASSURED_BIT, &ct->status))
1338 return true;
1339
1340 l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
1341 if (l4proto->can_early_drop && l4proto->can_early_drop(ct))
1342 return true;
1343
1344 return false;
1345}
1346
#define	DAY	(86400 * HZ)

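/* Keep offloaded flows from ever looking expired: packets bypass conntrack
 * while the flow is offloaded, so bump the timeout to roughly a day whenever
 * it falls below half of that.
 */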
1353static void nf_ct_offload_timeout(struct nf_conn *ct)
1354{
1355 if (nf_ct_expires(ct) < DAY / 2)
1356 ct->timeout = nfct_time_stamp + DAY;
1357}
1358
1359static void gc_worker(struct work_struct *work)
1360{
1361 unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
1362 unsigned int i, goal, buckets = 0, expired_count = 0;
1363 unsigned int nf_conntrack_max95 = 0;
1364 struct conntrack_gc_work *gc_work;
1365 unsigned int ratio, scanned = 0;
1366 unsigned long next_run;
1367
1368 gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
1369
1370 goal = nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV;
1371 i = gc_work->last_bucket;
1372 if (gc_work->early_drop)
1373 nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
1374
1375 do {
1376 struct nf_conntrack_tuple_hash *h;
1377 struct hlist_nulls_head *ct_hash;
1378 struct hlist_nulls_node *n;
1379 unsigned int hashsz;
1380 struct nf_conn *tmp;
1381
1382 i++;
1383 rcu_read_lock();
1384
1385 nf_conntrack_get_ht(&ct_hash, &hashsz);
1386 if (i >= hashsz)
1387 i = 0;
1388
1389 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
1390 struct net *net;
1391
1392 tmp = nf_ct_tuplehash_to_ctrack(h);
1393
1394 scanned++;
1395 if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
1396 nf_ct_offload_timeout(tmp);
1397 continue;
1398 }
1399
1400 if (nf_ct_is_expired(tmp)) {
1401 nf_ct_gc_expired(tmp);
1402 expired_count++;
1403 continue;
1404 }
1405
1406 if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
1407 continue;
1408
1409 net = nf_ct_net(tmp);
1410 if (atomic_read(&net->ct.count) < nf_conntrack_max95)
1411 continue;
1412
1413
1414 if (!atomic_inc_not_zero(&tmp->ct_general.use))
1415 continue;
1416
1417 if (gc_worker_skip_ct(tmp)) {
1418 nf_ct_put(tmp);
1419 continue;
1420 }
1421
1422 if (gc_worker_can_early_drop(tmp))
1423 nf_ct_kill(tmp);
1424
1425 nf_ct_put(tmp);
1426 }
1427
1428
1429
1430
1431
1432 rcu_read_unlock();
1433 cond_resched();
1434 } while (++buckets < goal);
1435
1436 if (gc_work->exiting)
1437 return;

	/*
	 * Eviction of expired entries normally happens from the packet path;
	 * this worker only exists to reap stale entries once the system has
	 * gone idle after a busy period.
	 *
	 * The rescheduling heuristic balances two goals:
	 *  1. notice stale entries reasonably quickly,
	 *  2. keep scan intervals long so idle systems stay idle.
	 *
	 * Normally the expired ratio is close to 0 and the interval grows up
	 * to the maximum; as soon as a sizeable fraction of the scanned
	 * entries have expired, fall back to the minimum interval.
	 */
1456 ratio = scanned ? expired_count * 100 / scanned : 0;
1457 if (ratio > GC_EVICT_RATIO) {
1458 gc_work->next_gc_run = min_interval;
1459 } else {
1460 unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV;
1461
1462 BUILD_BUG_ON((GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV) == 0);
1463
1464 gc_work->next_gc_run += min_interval;
1465 if (gc_work->next_gc_run > max)
1466 gc_work->next_gc_run = max;
1467 }
1468
1469 next_run = gc_work->next_gc_run;
1470 gc_work->last_bucket = i;
1471 gc_work->early_drop = false;
1472 queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
1473}
1474
1475static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
1476{
1477 INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker);
1478 gc_work->next_gc_run = HZ;
1479 gc_work->exiting = false;
1480}
1481
1482static struct nf_conn *
1483__nf_conntrack_alloc(struct net *net,
1484 const struct nf_conntrack_zone *zone,
1485 const struct nf_conntrack_tuple *orig,
1486 const struct nf_conntrack_tuple *repl,
1487 gfp_t gfp, u32 hash)
1488{
1489 struct nf_conn *ct;
1490
1491
1492 atomic_inc(&net->ct.count);
1493
1494 if (nf_conntrack_max &&
1495 unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
1496 if (!early_drop(net, hash)) {
1497 if (!conntrack_gc_work.early_drop)
1498 conntrack_gc_work.early_drop = true;
1499 atomic_dec(&net->ct.count);
1500 net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
1501 return ERR_PTR(-ENOMEM);
1502 }
1503 }
1504
1505
1506
1507
1508
1509 ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
1510 if (ct == NULL)
1511 goto out;
1512
1513 spin_lock_init(&ct->lock);
1514 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
1515 ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
1516 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
1517
1518 *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
1519 ct->status = 0;
1520 ct->timeout = 0;
1521 write_pnet(&ct->ct_net, net);
1522 memset(&ct->__nfct_init_offset, 0,
1523 offsetof(struct nf_conn, proto) -
1524 offsetof(struct nf_conn, __nfct_init_offset));
1525
1526 nf_ct_zone_add(ct, zone);
1527
1528
1529
1530
1531 atomic_set(&ct->ct_general.use, 0);
1532 return ct;
1533out:
1534 atomic_dec(&net->ct.count);
1535 return ERR_PTR(-ENOMEM);
1536}
1537
1538struct nf_conn *nf_conntrack_alloc(struct net *net,
1539 const struct nf_conntrack_zone *zone,
1540 const struct nf_conntrack_tuple *orig,
1541 const struct nf_conntrack_tuple *repl,
1542 gfp_t gfp)
1543{
1544 return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
1545}
1546EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
1547
1548void nf_conntrack_free(struct nf_conn *ct)
1549{
1550 struct net *net = nf_ct_net(ct);
1551
1552
1553
1554
1555 WARN_ON(atomic_read(&ct->ct_general.use) != 0);
1556
1557 nf_ct_ext_destroy(ct);
1558 kmem_cache_free(nf_conntrack_cachep, ct);
1559 smp_mb__before_atomic();
1560 atomic_dec(&net->ct.count);
1561}
1562EXPORT_SYMBOL_GPL(nf_conntrack_free);
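
/* Allocate and set up a new conntrack for this tuple.  Returns NULL when
 * the tuple cannot be inverted, an ERR_PTR() on resource exhaustion, and
 * the ORIGINAL-direction tuplehash on success.
 */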
1567static noinline struct nf_conntrack_tuple_hash *
1568init_conntrack(struct net *net, struct nf_conn *tmpl,
1569 const struct nf_conntrack_tuple *tuple,
1570 struct sk_buff *skb,
1571 unsigned int dataoff, u32 hash)
1572{
1573 struct nf_conn *ct;
1574 struct nf_conn_help *help;
1575 struct nf_conntrack_tuple repl_tuple;
1576 struct nf_conntrack_ecache *ecache;
1577 struct nf_conntrack_expect *exp = NULL;
1578 const struct nf_conntrack_zone *zone;
1579 struct nf_conn_timeout *timeout_ext;
1580 struct nf_conntrack_zone tmp;
1581
1582 if (!nf_ct_invert_tuple(&repl_tuple, tuple)) {
1583 pr_debug("Can't invert tuple.\n");
1584 return NULL;
1585 }
1586
1587 zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
1588 ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
1589 hash);
1590 if (IS_ERR(ct))
1591 return (struct nf_conntrack_tuple_hash *)ct;
1592
1593 if (!nf_ct_add_synproxy(ct, tmpl)) {
1594 nf_conntrack_free(ct);
1595 return ERR_PTR(-ENOMEM);
1596 }
1597
1598 timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
1599
1600 if (timeout_ext)
1601 nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
1602 GFP_ATOMIC);
1603
1604 nf_ct_acct_ext_add(ct, GFP_ATOMIC);
1605 nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
1606 nf_ct_labels_ext_add(ct);
1607
1608 ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
1609 nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
1610 ecache ? ecache->expmask : 0,
1611 GFP_ATOMIC);
1612
1613 local_bh_disable();
1614 if (net->ct.expect_count) {
1615 spin_lock(&nf_conntrack_expect_lock);
1616 exp = nf_ct_find_expectation(net, zone, tuple);
1617 if (exp) {
1618 pr_debug("expectation arrives ct=%p exp=%p\n",
1619 ct, exp);
1620
1621 __set_bit(IPS_EXPECTED_BIT, &ct->status);
1622
1623 ct->master = exp->master;
1624 if (exp->helper) {
1625 help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
1626 if (help)
1627 rcu_assign_pointer(help->helper, exp->helper);
1628 }
1629
1630#ifdef CONFIG_NF_CONNTRACK_MARK
1631 ct->mark = exp->master->mark;
1632#endif
1633#ifdef CONFIG_NF_CONNTRACK_SECMARK
1634 ct->secmark = exp->master->secmark;
1635#endif
1636 NF_CT_STAT_INC(net, expect_new);
1637 }
1638 spin_unlock(&nf_conntrack_expect_lock);
1639 }
1640 if (!exp)
1641 __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
1642
1643
1644 nf_conntrack_get(&ct->ct_general);
1645 nf_ct_add_to_unconfirmed_list(ct);
1646
1647 local_bh_enable();
1648
1649 if (exp) {
1650 if (exp->expectfn)
1651 exp->expectfn(ct, exp);
1652 nf_ct_expect_put(exp);
1653 }
1654
1655 return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
1656}
1657
1658
1659static int
1660resolve_normal_ct(struct nf_conn *tmpl,
1661 struct sk_buff *skb,
1662 unsigned int dataoff,
1663 u_int8_t protonum,
1664 const struct nf_hook_state *state)
1665{
1666 const struct nf_conntrack_zone *zone;
1667 struct nf_conntrack_tuple tuple;
1668 struct nf_conntrack_tuple_hash *h;
1669 enum ip_conntrack_info ctinfo;
1670 struct nf_conntrack_zone tmp;
1671 struct nf_conn *ct;
1672 u32 hash;
1673
1674 if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
1675 dataoff, state->pf, protonum, state->net,
1676 &tuple)) {
1677 pr_debug("Can't get tuple\n");
1678 return 0;
1679 }
1680
1681
1682 zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
1683 hash = hash_conntrack_raw(&tuple, state->net);
1684 h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);
1685 if (!h) {
1686 h = init_conntrack(state->net, tmpl, &tuple,
1687 skb, dataoff, hash);
1688 if (!h)
1689 return 0;
1690 if (IS_ERR(h))
1691 return PTR_ERR(h);
1692 }
1693 ct = nf_ct_tuplehash_to_ctrack(h);
1694
1695
1696 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
1697 ctinfo = IP_CT_ESTABLISHED_REPLY;
1698 } else {
1699
1700 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1701 pr_debug("normal packet for %p\n", ct);
1702 ctinfo = IP_CT_ESTABLISHED;
1703 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
1704 pr_debug("related packet for %p\n", ct);
1705 ctinfo = IP_CT_RELATED;
1706 } else {
1707 pr_debug("new packet for %p\n", ct);
1708 ctinfo = IP_CT_NEW;
1709 }
1710 }
1711 nf_ct_set(skb, ct, ctinfo);
1712 return 0;
1713}
1714
1715
1716
1717
1718
1719
1720
1721
1722static unsigned int __cold
1723nf_conntrack_handle_icmp(struct nf_conn *tmpl,
1724 struct sk_buff *skb,
1725 unsigned int dataoff,
1726 u8 protonum,
1727 const struct nf_hook_state *state)
1728{
1729 int ret;
1730
1731 if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP)
1732 ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state);
1733#if IS_ENABLED(CONFIG_IPV6)
1734 else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6)
1735 ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state);
1736#endif
1737 else
1738 return NF_ACCEPT;
1739
1740 if (ret <= 0) {
1741 NF_CT_STAT_INC_ATOMIC(state->net, error);
1742 NF_CT_STAT_INC_ATOMIC(state->net, invalid);
1743 }
1744
1745 return ret;
1746}
1747
1748static int generic_packet(struct nf_conn *ct, struct sk_buff *skb,
1749 enum ip_conntrack_info ctinfo)
1750{
1751 const unsigned int *timeout = nf_ct_timeout_lookup(ct);
1752
1753 if (!timeout)
1754 timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout;
1755
1756 nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
1757 return NF_ACCEPT;
1758}
1759
1760
1761static int nf_conntrack_handle_packet(struct nf_conn *ct,
1762 struct sk_buff *skb,
1763 unsigned int dataoff,
1764 enum ip_conntrack_info ctinfo,
1765 const struct nf_hook_state *state)
1766{
1767 switch (nf_ct_protonum(ct)) {
1768 case IPPROTO_TCP:
1769 return nf_conntrack_tcp_packet(ct, skb, dataoff,
1770 ctinfo, state);
1771 case IPPROTO_UDP:
1772 return nf_conntrack_udp_packet(ct, skb, dataoff,
1773 ctinfo, state);
1774 case IPPROTO_ICMP:
1775 return nf_conntrack_icmp_packet(ct, skb, ctinfo, state);
1776#if IS_ENABLED(CONFIG_IPV6)
1777 case IPPROTO_ICMPV6:
1778 return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state);
1779#endif
1780#ifdef CONFIG_NF_CT_PROTO_UDPLITE
1781 case IPPROTO_UDPLITE:
1782 return nf_conntrack_udplite_packet(ct, skb, dataoff,
1783 ctinfo, state);
1784#endif
1785#ifdef CONFIG_NF_CT_PROTO_SCTP
1786 case IPPROTO_SCTP:
1787 return nf_conntrack_sctp_packet(ct, skb, dataoff,
1788 ctinfo, state);
1789#endif
1790#ifdef CONFIG_NF_CT_PROTO_DCCP
1791 case IPPROTO_DCCP:
1792 return nf_conntrack_dccp_packet(ct, skb, dataoff,
1793 ctinfo, state);
1794#endif
1795#ifdef CONFIG_NF_CT_PROTO_GRE
1796 case IPPROTO_GRE:
1797 return nf_conntrack_gre_packet(ct, skb, dataoff,
1798 ctinfo, state);
1799#endif
1800 }
1801
1802 return generic_packet(ct, skb, ctinfo);
1803}
1804
1805unsigned int
1806nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
1807{
1808 enum ip_conntrack_info ctinfo;
1809 struct nf_conn *ct, *tmpl;
1810 u_int8_t protonum;
1811 int dataoff, ret;
1812
1813 tmpl = nf_ct_get(skb, &ctinfo);
1814 if (tmpl || ctinfo == IP_CT_UNTRACKED) {
1815
1816 if ((tmpl && !nf_ct_is_template(tmpl)) ||
1817 ctinfo == IP_CT_UNTRACKED) {
1818 NF_CT_STAT_INC_ATOMIC(state->net, ignore);
1819 return NF_ACCEPT;
1820 }
1821 skb->_nfct = 0;
1822 }
1823
1824
1825 dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum);
1826 if (dataoff <= 0) {
1827 pr_debug("not prepared to track yet or error occurred\n");
1828 NF_CT_STAT_INC_ATOMIC(state->net, error);
1829 NF_CT_STAT_INC_ATOMIC(state->net, invalid);
1830 ret = NF_ACCEPT;
1831 goto out;
1832 }
1833
1834 if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) {
1835 ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff,
1836 protonum, state);
1837 if (ret <= 0) {
1838 ret = -ret;
1839 goto out;
1840 }
1841
1842 if (skb->_nfct)
1843 goto out;
1844 }
1845repeat:
1846 ret = resolve_normal_ct(tmpl, skb, dataoff,
1847 protonum, state);
1848 if (ret < 0) {
1849
1850 NF_CT_STAT_INC_ATOMIC(state->net, drop);
1851 ret = NF_DROP;
1852 goto out;
1853 }
1854
1855 ct = nf_ct_get(skb, &ctinfo);
1856 if (!ct) {
1857
1858 NF_CT_STAT_INC_ATOMIC(state->net, invalid);
1859 ret = NF_ACCEPT;
1860 goto out;
1861 }
1862
1863 ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state);
1864 if (ret <= 0) {
1865
1866
1867 pr_debug("nf_conntrack_in: Can't track with proto module\n");
1868 nf_conntrack_put(&ct->ct_general);
1869 skb->_nfct = 0;
1870 NF_CT_STAT_INC_ATOMIC(state->net, invalid);
1871 if (ret == -NF_DROP)
1872 NF_CT_STAT_INC_ATOMIC(state->net, drop);
1873
1874
1875
1876
1877 if (ret == -NF_REPEAT)
1878 goto repeat;
1879 ret = -ret;
1880 goto out;
1881 }
1882
1883 if (ctinfo == IP_CT_ESTABLISHED_REPLY &&
1884 !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
1885 nf_conntrack_event_cache(IPCT_REPLY, ct);
1886out:
1887 if (tmpl)
1888 nf_ct_put(tmpl);
1889
1890 return ret;
1891}
1892EXPORT_SYMBOL_GPL(nf_conntrack_in);

/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
 * implicitly racy: see __nf_conntrack_confirm().
 */
1896void nf_conntrack_alter_reply(struct nf_conn *ct,
1897 const struct nf_conntrack_tuple *newreply)
1898{
1899 struct nf_conn_help *help = nfct_help(ct);
1900
1901
1902 WARN_ON(nf_ct_is_confirmed(ct));
1903
1904 pr_debug("Altering reply tuple of %p to ", ct);
1905 nf_ct_dump_tuple(newreply);
1906
1907 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1908 if (ct->master || (help && !hlist_empty(&help->expectations)))
1909 return;
1910
1911 rcu_read_lock();
1912 __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
1913 rcu_read_unlock();
1914}
1915EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);

/* Refresh the conntrack timeout to @extra_jiffies and, if @do_acct is set,
 * update the accounting counters for this packet.
 */
1918void __nf_ct_refresh_acct(struct nf_conn *ct,
1919 enum ip_conntrack_info ctinfo,
1920 const struct sk_buff *skb,
1921 u32 extra_jiffies,
1922 bool do_acct)
1923{
1924
1925 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
1926 goto acct;
1927
1928
1929 if (nf_ct_is_confirmed(ct))
1930 extra_jiffies += nfct_time_stamp;
1931
1932 if (READ_ONCE(ct->timeout) != extra_jiffies)
1933 WRITE_ONCE(ct->timeout, extra_jiffies);
1934acct:
1935 if (do_acct)
1936 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
1937}
1938EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
1939
1940bool nf_ct_kill_acct(struct nf_conn *ct,
1941 enum ip_conntrack_info ctinfo,
1942 const struct sk_buff *skb)
1943{
1944 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
1945
1946 return nf_ct_delete(ct, 0, 0);
1947}
1948EXPORT_SYMBOL_GPL(nf_ct_kill_acct);
1949
1950#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
1951
1952#include <linux/netfilter/nfnetlink.h>
1953#include <linux/netfilter/nfnetlink_conntrack.h>
1954#include <linux/mutex.h>
1955
1956
1957int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
1958 const struct nf_conntrack_tuple *tuple)
1959{
1960 if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) ||
1961 nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port))
1962 goto nla_put_failure;
1963 return 0;
1964
1965nla_put_failure:
1966 return -1;
1967}
1968EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);
1969
1970const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
1971 [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 },
1972 [CTA_PROTO_DST_PORT] = { .type = NLA_U16 },
1973};
1974EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);
1975
1976int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
1977 struct nf_conntrack_tuple *t)
1978{
1979 if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT])
1980 return -EINVAL;
1981
1982 t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
1983 t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
1984
1985 return 0;
1986}
1987EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);
1988
1989unsigned int nf_ct_port_nlattr_tuple_size(void)
1990{
1991 static unsigned int size __read_mostly;
1992
1993 if (!size)
1994 size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
1995
1996 return size;
1997}
1998EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
1999#endif
2000
2001
2002static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
2003{
2004 struct nf_conn *ct;
2005 enum ip_conntrack_info ctinfo;
2006
2007
2008 ct = nf_ct_get(skb, &ctinfo);
2009 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
2010 ctinfo = IP_CT_RELATED_REPLY;
2011 else
2012 ctinfo = IP_CT_RELATED;
2013
2014
2015 nf_ct_set(nskb, ct, ctinfo);
2016 nf_conntrack_get(skb_nfct(nskb));
2017}
2018
2019static int __nf_conntrack_update(struct net *net, struct sk_buff *skb,
2020 struct nf_conn *ct,
2021 enum ip_conntrack_info ctinfo)
2022{
2023 struct nf_conntrack_tuple_hash *h;
2024 struct nf_conntrack_tuple tuple;
2025 struct nf_nat_hook *nat_hook;
2026 unsigned int status;
2027 int dataoff;
2028 u16 l3num;
2029 u8 l4num;
2030
2031 l3num = nf_ct_l3num(ct);
2032
2033 dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num);
2034 if (dataoff <= 0)
2035 return -1;
2036
2037 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
2038 l4num, net, &tuple))
2039 return -1;
2040
2041 if (ct->status & IPS_SRC_NAT) {
2042 memcpy(tuple.src.u3.all,
2043 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all,
2044 sizeof(tuple.src.u3.all));
2045 tuple.src.u.all =
2046 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all;
2047 }
2048
2049 if (ct->status & IPS_DST_NAT) {
2050 memcpy(tuple.dst.u3.all,
2051 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all,
2052 sizeof(tuple.dst.u3.all));
2053 tuple.dst.u.all =
2054 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all;
2055 }
2056
2057 h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple);
2058 if (!h)
2059 return 0;
2060
2061
2062
2063
2064 status = ct->status;
2065
2066 nf_ct_put(ct);
2067 ct = nf_ct_tuplehash_to_ctrack(h);
2068 nf_ct_set(skb, ct, ctinfo);
2069
2070 nat_hook = rcu_dereference(nf_nat_hook);
2071 if (!nat_hook)
2072 return 0;
2073
2074 if (status & IPS_SRC_NAT &&
2075 nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC,
2076 IP_CT_DIR_ORIGINAL) == NF_DROP)
2077 return -1;
2078
2079 if (status & IPS_DST_NAT &&
2080 nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST,
2081 IP_CT_DIR_ORIGINAL) == NF_DROP)
2082 return -1;
2083
2084 return 0;
2085}
2086
2087
2088
2089
2090static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct,
2091 enum ip_conntrack_info ctinfo)
2092{
2093 const struct nf_conntrack_helper *helper;
2094 const struct nf_conn_help *help;
2095 int protoff;
2096
2097 help = nfct_help(ct);
2098 if (!help)
2099 return 0;
2100
	helper = rcu_dereference(help->helper);
	if (!helper)	/* helper may have been removed concurrently */
		return 0;

	if (!(helper->flags & NF_CT_HELPER_F_USERSPACE))
		return 0;
2104
2105 switch (nf_ct_l3num(ct)) {
2106 case NFPROTO_IPV4:
2107 protoff = skb_network_offset(skb) + ip_hdrlen(skb);
2108 break;
2109#if IS_ENABLED(CONFIG_IPV6)
2110 case NFPROTO_IPV6: {
2111 __be16 frag_off;
2112 u8 pnum;
2113
2114 pnum = ipv6_hdr(skb)->nexthdr;
2115 protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
2116 &frag_off);
2117 if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
2118 return 0;
2119 break;
2120 }
2121#endif
2122 default:
2123 return 0;
2124 }
2125
2126 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
2127 !nf_is_loopback_packet(skb)) {
2128 if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
2129 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
2130 return -1;
2131 }
2132 }
2133
2134
	return nf_conntrack_confirm(skb) == NF_DROP ? -1 : 0;
2136}
2137
2138static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
2139{
2140 enum ip_conntrack_info ctinfo;
2141 struct nf_conn *ct;
2142 int err;
2143
2144 ct = nf_ct_get(skb, &ctinfo);
2145 if (!ct)
2146 return 0;
2147
2148 if (!nf_ct_is_confirmed(ct)) {
2149 err = __nf_conntrack_update(net, skb, ct, ctinfo);
2150 if (err < 0)
2151 return err;
2152 }
2153
2154 return nf_confirm_cthelper(skb, ct, ctinfo);
2155}
2156
2157static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
2158 const struct sk_buff *skb)
2159{
2160 const struct nf_conntrack_tuple *src_tuple;
2161 const struct nf_conntrack_tuple_hash *hash;
2162 struct nf_conntrack_tuple srctuple;
2163 enum ip_conntrack_info ctinfo;
2164 struct nf_conn *ct;
2165
2166 ct = nf_ct_get(skb, &ctinfo);
2167 if (ct) {
2168 src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
2169 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
2170 return true;
2171 }
2172
2173 if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
2174 NFPROTO_IPV4, dev_net(skb->dev),
2175 &srctuple))
2176 return false;
2177
2178 hash = nf_conntrack_find_get(dev_net(skb->dev),
2179 &nf_ct_zone_dflt,
2180 &srctuple);
2181 if (!hash)
2182 return false;
2183
2184 ct = nf_ct_tuplehash_to_ctrack(hash);
2185 src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
2186 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
2187 nf_ct_put(ct);
2188
2189 return true;
2190}
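/* Walk the hash table starting at *bucket and return the first conntrack
 * for which iter() returns true, with a reference taken.  Only the
 * REPLY-direction node is inspected so every connection is visited once.
 */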
2193static struct nf_conn *
2194get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
2195 void *data, unsigned int *bucket)
2196{
2197 struct nf_conntrack_tuple_hash *h;
2198 struct nf_conn *ct;
2199 struct hlist_nulls_node *n;
2200 spinlock_t *lockp;
2201
2202 for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
2203 lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
2204 local_bh_disable();
2205 nf_conntrack_lock(lockp);
2206 if (*bucket < nf_conntrack_htable_size) {
2207 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) {
2208 if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
2209 continue;
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221 ct = nf_ct_tuplehash_to_ctrack(h);
2222 if (iter(ct, data))
2223 goto found;
2224 }
2225 }
2226 spin_unlock(lockp);
2227 local_bh_enable();
2228 cond_resched();
2229 }
2230
2231 return NULL;
2232found:
2233 atomic_inc(&ct->ct_general.use);
2234 spin_unlock(lockp);
2235 local_bh_enable();
2236 return ct;
2237}
2238
2239static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
2240 void *data, u32 portid, int report)
2241{
2242 unsigned int bucket = 0, sequence;
2243 struct nf_conn *ct;
2244
2245 might_sleep();
2246
2247 for (;;) {
2248 sequence = read_seqcount_begin(&nf_conntrack_generation);
2249
2250 while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
2251
2252
2253 nf_ct_delete(ct, portid, report);
2254 nf_ct_put(ct);
2255 cond_resched();
2256 }
2257
2258 if (!read_seqcount_retry(&nf_conntrack_generation, sequence))
2259 break;
2260 bucket = 0;
2261 }
2262}
2263
2264struct iter_data {
2265 int (*iter)(struct nf_conn *i, void *data);
2266 void *data;
2267 struct net *net;
2268};
2269
2270static int iter_net_only(struct nf_conn *i, void *data)
2271{
2272 struct iter_data *d = data;
2273
2274 if (!net_eq(d->net, nf_ct_net(i)))
2275 return 0;
2276
2277 return d->iter(i, d->data);
2278}
2279
2280static void
2281__nf_ct_unconfirmed_destroy(struct net *net)
2282{
2283 int cpu;
2284
2285 for_each_possible_cpu(cpu) {
2286 struct nf_conntrack_tuple_hash *h;
2287 struct hlist_nulls_node *n;
2288 struct ct_pcpu *pcpu;
2289
2290 pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
2291
2292 spin_lock_bh(&pcpu->lock);
2293 hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
2294 struct nf_conn *ct;
2295
2296 ct = nf_ct_tuplehash_to_ctrack(h);
2297
2298
2299
2300
2301 set_bit(IPS_DYING_BIT, &ct->status);
2302 }
2303 spin_unlock_bh(&pcpu->lock);
2304 cond_resched();
2305 }
2306}
2307
2308void nf_ct_unconfirmed_destroy(struct net *net)
2309{
2310 might_sleep();
2311
2312 if (atomic_read(&net->ct.count) > 0) {
2313 __nf_ct_unconfirmed_destroy(net);
2314 nf_queue_nf_hook_drop(net);
2315 synchronize_net();
2316 }
2317}
2318EXPORT_SYMBOL_GPL(nf_ct_unconfirmed_destroy);
2319
2320void nf_ct_iterate_cleanup_net(struct net *net,
2321 int (*iter)(struct nf_conn *i, void *data),
2322 void *data, u32 portid, int report)
2323{
2324 struct iter_data d;
2325
2326 might_sleep();
2327
2328 if (atomic_read(&net->ct.count) == 0)
2329 return;
2330
2331 d.iter = iter;
2332 d.data = data;
2333 d.net = net;
2334
2335 nf_ct_iterate_cleanup(iter_net_only, &d, portid, report);
2336}
2337EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);
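
/**
 * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table
 * @iter: callback to invoke for each conntrack
 * @data: data to pass to @iter
 *
 * Like nf_ct_iterate_cleanup(), but iterates over every network namespace
 * and first marks conntracks on the unconfirmed lists as dying, so they
 * will not be inserted into the main table.
 *
 * Can only be called from the module exit path.
 */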
void
nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
{
	struct net *net;

	down_read(&net_rwsem);
	for_each_net(net) {
		if (atomic_read(&net->ct.count) == 0)
			continue;
		__nf_ct_unconfirmed_destroy(net);
		nf_queue_nf_hook_drop(net);
	}
	up_read(&net_rwsem);

	/* Wait for a possibly running netns cleanup worker to finish: it
	 * may have unlinked a namespace from the global list before we
	 * walked it, in which case __nf_ct_unconfirmed_destroy() above
	 * did not cover that namespace.
	 */
	net_ns_barrier();

	/* A conntrack could have been unlinked from the unconfirmed list
	 * before we took the pcpu lock in __nf_ct_unconfirmed_destroy();
	 * wait so that such entries are visible in the hash table walk.
	 */
	synchronize_net();

	nf_ct_iterate_cleanup(iter, data, 0, 0);
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy);

static int kill_all(struct nf_conn *i, void *data)
{
	return net_eq(nf_ct_net(i), data);
}

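/* First stage of module unload: tell the gc worker to stop rescheduling
 * itself and stop handing out the conntrack attach hook.
 */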
void nf_conntrack_cleanup_start(void)
{
	conntrack_gc_work.exiting = true;
	RCU_INIT_POINTER(ip_ct_attach, NULL);
}

void nf_conntrack_cleanup_end(void)
{
	RCU_INIT_POINTER(nf_ct_hook, NULL);
	cancel_delayed_work_sync(&conntrack_gc_work.dwork);
	kvfree(nf_conntrack_hash);

	nf_conntrack_proto_fini();
	nf_conntrack_seqadj_fini();
	nf_conntrack_labels_fini();
	nf_conntrack_helper_fini();
	nf_conntrack_timeout_fini();
	nf_conntrack_ecache_fini();
	nf_conntrack_tstamp_fini();
	nf_conntrack_acct_fini();
	nf_conntrack_expect_fini();

	kmem_cache_destroy(nf_conntrack_cachep);
}
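
/* Tear down the conntrack state of a single network namespace by
 * reusing the batched list variant below.
 */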
void nf_conntrack_cleanup_net(struct net *net)
{
	LIST_HEAD(single);

	list_add(&net->exit_list, &single);
	nf_conntrack_cleanup_net_list(&single);
}

void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
{
	int busy;
	struct net *net;

	/* Wait for packets still traversing the netfilter hooks; they may
	 * create or reference conntrack entries in the namespaces we are
	 * about to clean up.
	 */
	synchronize_net();
i_see_dead_people:
	busy = 0;
	list_for_each_entry(net, net_exit_list, exit_list) {
		nf_ct_iterate_cleanup(kill_all, net, 0, 0);
		if (atomic_read(&net->ct.count) != 0)
			busy = 1;
	}
	if (busy) {
		schedule();
		goto i_see_dead_people;
	}

	list_for_each_entry(net, net_exit_list, exit_list) {
		nf_conntrack_proto_pernet_fini(net);
		nf_conntrack_ecache_pernet_fini(net);
		nf_conntrack_expect_pernet_fini(net);
		free_percpu(net->ct.stat);
		free_percpu(net->ct.pcpu_lists);
	}
}

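/* Allocate a bucket array for at least *sizep entries, rounded up to a
 * whole number of pages; the bucket count actually used is written back
 * to *sizep.  With @nulls set, each bucket is initialized as an
 * hlist_nulls head whose nulls value is its bucket index.
 */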
void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
{
	struct hlist_nulls_head *hash;
	unsigned int nr_slots, i;

	if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
		return NULL;

	BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
	nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));

	hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL);

	if (hash && nulls)
		for (i = 0; i < nr_slots; i++)
			INIT_HLIST_NULLS_HEAD(&hash[i], i);

	return hash;
}
EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);

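/* Rehash every entry into a freshly allocated table of @hashsize buckets.
 * The move runs under the global conntrack locks and a write section of
 * nf_conntrack_generation, so concurrent iterators notice and restart.
 */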
int nf_conntrack_hash_resize(unsigned int hashsize)
{
	int i, bucket;
	unsigned int old_size;
	struct hlist_nulls_head *hash, *old_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	if (!hashsize)
		return -EINVAL;

	hash = nf_ct_alloc_hashtable(&hashsize, 1);
	if (!hash)
		return -ENOMEM;

	old_size = nf_conntrack_htable_size;
	if (old_size == hashsize) {
		kvfree(hash);
		return 0;
	}

	local_bh_disable();
	nf_conntrack_all_lock();
	write_seqcount_begin(&nf_conntrack_generation);

	/* Lookups in the old hash can still happen in parallel, so we may
	 * get false negatives while entries are being moved.  New entries
	 * created because of such a false negative cannot be inserted,
	 * though, since insertion requires the locks held here.
	 */
	for (i = 0; i < nf_conntrack_htable_size; i++) {
		while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
			h = hlist_nulls_entry(nf_conntrack_hash[i].first,
					      struct nf_conntrack_tuple_hash, hnnode);
			ct = nf_ct_tuplehash_to_ctrack(h);
			hlist_nulls_del_rcu(&h->hnnode);
			bucket = __hash_conntrack(nf_ct_net(ct),
						  &h->tuple, hashsize);
			hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
		}
	}
	old_size = nf_conntrack_htable_size;
	old_hash = nf_conntrack_hash;

	nf_conntrack_hash = hash;
	nf_conntrack_htable_size = hashsize;

	write_seqcount_end(&nf_conntrack_generation);
	nf_conntrack_all_unlock();
	local_bh_enable();

	synchronize_net();
	kvfree(old_hash);
	return 0;
}

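/* Handler for the "hashsize" module parameter; resizing is only allowed
 * from the initial network namespace.
 */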
int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
{
	unsigned int hashsize;
	int rc;

	if (current->nsproxy->net_ns != &init_net)
		return -EOPNOTSUPP;

	/* On boot, we can set this without any fancy locking. */
	if (!nf_conntrack_hash)
		return param_set_uint(val, kp);

	rc = kstrtouint(val, 0, &hashsize);
	if (rc)
		return rc;

	return nf_conntrack_hash_resize(hashsize);
}

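/* Worst-case size of all conntrack extensions combined; used in
 * nf_conntrack_init_start() to ensure the u8 offset fields of
 * struct nf_ct_ext cannot overflow.
 */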
static __always_inline unsigned int total_extension_size(void)
{
	/* keep the list below in sync when NF_CT_EXT_NUM grows */
	BUILD_BUG_ON(NF_CT_EXT_NUM > 9);

	return sizeof(struct nf_ct_ext) +
	       sizeof(struct nf_conn_help)
#if IS_ENABLED(CONFIG_NF_NAT)
		+ sizeof(struct nf_conn_nat)
#endif
		+ sizeof(struct nf_conn_seqadj)
		+ sizeof(struct nf_conn_acct)
#ifdef CONFIG_NF_CONNTRACK_EVENTS
		+ sizeof(struct nf_conntrack_ecache)
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
		+ sizeof(struct nf_conn_tstamp)
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
		+ sizeof(struct nf_conn_timeout)
#endif
#ifdef CONFIG_NF_CONNTRACK_LABELS
		+ sizeof(struct nf_conn_labels)
#endif
#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
		+ sizeof(struct nf_conn_synproxy)
#endif
		;
}

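/* One-time global initialization: conntrack hash table, nf_conn slab
 * cache and the extension subsystems.  Per-namespace state is set up
 * later in nf_conntrack_init_net().
 */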
int nf_conntrack_init_start(void)
{
	unsigned long nr_pages = totalram_pages();
	int max_factor = 8;
	int ret = -ENOMEM;
	int i;

	/* struct nf_ct_ext stores extension offsets in u8 fields */
	BUILD_BUG_ON(total_extension_size() > 255u);

	seqcount_init(&nf_conntrack_generation);

	for (i = 0; i < CONNTRACK_LOCKS; i++)
		spin_lock_init(&nf_conntrack_locks[i]);

	if (!nf_conntrack_htable_size) {
		/* Size the table from available memory, roughly 1/16384 of
		 * RAM: machines with more than 4 GB get 65536 buckets,
		 * machines with more than 1 GB get 16384, and the result
		 * never drops below 32 buckets.
		 */
		nf_conntrack_htable_size
			= (((nr_pages << PAGE_SHIFT) / 16384)
			   / sizeof(struct hlist_head));
		if (nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
			nf_conntrack_htable_size = 65536;
		else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
			nf_conntrack_htable_size = 16384;
		if (nf_conntrack_htable_size < 32)
			nf_conntrack_htable_size = 32;

		/* When we pick the table size ourselves, allow at most four
		 * times as many entries as buckets; an explicitly configured
		 * size keeps the historical factor of eight.
		 */
		max_factor = 4;
	}

	nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
	if (!nf_conntrack_hash)
		return -ENOMEM;

	nf_conntrack_max = max_factor * nf_conntrack_htable_size;

	nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
						sizeof(struct nf_conn),
						NFCT_INFOMASK + 1,
						SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
	if (!nf_conntrack_cachep)
		goto err_cachep;

	ret = nf_conntrack_expect_init();
	if (ret < 0)
		goto err_expect;

	ret = nf_conntrack_acct_init();
	if (ret < 0)
		goto err_acct;

	ret = nf_conntrack_tstamp_init();
	if (ret < 0)
		goto err_tstamp;

	ret = nf_conntrack_ecache_init();
	if (ret < 0)
		goto err_ecache;

	ret = nf_conntrack_timeout_init();
	if (ret < 0)
		goto err_timeout;

	ret = nf_conntrack_helper_init();
	if (ret < 0)
		goto err_helper;

	ret = nf_conntrack_labels_init();
	if (ret < 0)
		goto err_labels;

	ret = nf_conntrack_seqadj_init();
	if (ret < 0)
		goto err_seqadj;

	ret = nf_conntrack_proto_init();
	if (ret < 0)
		goto err_proto;

	conntrack_gc_work_init(&conntrack_gc_work);
	queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);

	return 0;

err_proto:
	nf_conntrack_seqadj_fini();
err_seqadj:
	nf_conntrack_labels_fini();
err_labels:
	nf_conntrack_helper_fini();
err_helper:
	nf_conntrack_timeout_fini();
err_timeout:
	nf_conntrack_ecache_fini();
err_ecache:
	nf_conntrack_tstamp_fini();
err_tstamp:
	nf_conntrack_acct_fini();
err_acct:
	nf_conntrack_expect_fini();
err_expect:
	kmem_cache_destroy(nf_conntrack_cachep);
err_cachep:
	kvfree(nf_conntrack_hash);
	return ret;
}

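/* Operations exposed to the netfilter core once conntrack is ready. */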
static struct nf_ct_hook nf_conntrack_hook = {
	.update		= nf_conntrack_update,
	.destroy	= destroy_conntrack,
	.get_tuple_skb	= nf_conntrack_get_tuple_skb,
};

void nf_conntrack_init_end(void)
{
	/* For use by the REJECT target (via nf_ct_attach) */
	RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
	RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook);
}

/* The per-cpu unconfirmed and dying lists need their own "nulls" end
 * markers, distinct from the values used for hash table buckets.
 */
#define UNCONFIRMED_NULLS_VAL	((1<<30)+0)
#define DYING_NULLS_VAL		((1<<30)+1)

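/* Per-namespace initialization: percpu unconfirmed/dying lists, statistic
 * counters and the pernet parts of each extension.
 */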
int nf_conntrack_init_net(struct net *net)
{
	int ret = -ENOMEM;
	int cpu;

	BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER);
	BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS);
	atomic_set(&net->ct.count, 0);

	net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
	if (!net->ct.pcpu_lists)
		goto err_stat;

	for_each_possible_cpu(cpu) {
		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);

		spin_lock_init(&pcpu->lock);
		INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
		INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
	}

	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
	if (!net->ct.stat)
		goto err_pcpu_lists;

	ret = nf_conntrack_expect_pernet_init(net);
	if (ret < 0)
		goto err_expect;

	nf_conntrack_acct_pernet_init(net);
	nf_conntrack_tstamp_pernet_init(net);
	nf_conntrack_ecache_pernet_init(net);
	nf_conntrack_helper_pernet_init(net);
	nf_conntrack_proto_pernet_init(net);

	return 0;

err_expect:
	free_percpu(net->ct.stat);
err_pcpu_lists:
	free_percpu(net->ct.pcpu_lists);
err_stat:
	return ret;
}