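/* Connection state tracking for netfilter.  This is separated from,
 * but required by, the NAT layer; it can also be used by an iptables
 * extension.
 */
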
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/siphash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/mm.h>
#include <linux/nsproxy.h>
#include <linux/rculist_nulls.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netns/hash.h>
#include <net/ip.h>

#include "nf_internals.h"

__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
EXPORT_SYMBOL_GPL(nf_conntrack_locks);

__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);

struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_hash);

struct conntrack_gc_work {
	struct delayed_work	dwork;
	u32			next_bucket;
	bool			exiting;
	bool			early_drop;
};

static __read_mostly struct kmem_cache *nf_conntrack_cachep;
static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;

static DEFINE_MUTEX(nf_conntrack_mutex);

#define GC_SCAN_INTERVAL	(120u * HZ)
#define GC_SCAN_MAX_DURATION	msecs_to_jiffies(10)

#define MIN_CHAINLEN	8u
#define MAX_CHAINLEN	(32u - MIN_CHAINLEN)

static struct conntrack_gc_work conntrack_gc_work;

void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
{
	/* 1) Acquire the lock */
	spin_lock(lock);

	/* 2) read nf_conntrack_locks_all, with ACQUIRE semantics.
	 * It pairs with the smp_store_release() in nf_conntrack_all_unlock().
	 */
	if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false))
		return;

	/* fast path failed, unlock */
	spin_unlock(lock);

	/* Slow path 1) get global lock */
	spin_lock(&nf_conntrack_locks_all_lock);

	/* Slow path 2) get the lock we want */
	spin_lock(lock);

	/* Slow path 3) release the global lock */
	spin_unlock(&nf_conntrack_locks_all_lock);
}
EXPORT_SYMBOL_GPL(nf_conntrack_lock);

static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	spin_unlock(&nf_conntrack_locks[h1]);
	if (h1 != h2)
		spin_unlock(&nf_conntrack_locks[h2]);
}

/* return true if we need to recompute hashes (in case hash table was resized) */
static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
				     unsigned int h2, unsigned int sequence)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	if (h1 <= h2) {
		nf_conntrack_lock(&nf_conntrack_locks[h1]);
		if (h1 != h2)
			spin_lock_nested(&nf_conntrack_locks[h2],
					 SINGLE_DEPTH_NESTING);
	} else {
		nf_conntrack_lock(&nf_conntrack_locks[h2]);
		spin_lock_nested(&nf_conntrack_locks[h1],
				 SINGLE_DEPTH_NESTING);
	}
	if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
		nf_conntrack_double_unlock(h1, h2);
		return true;
	}
	return false;
}

static void nf_conntrack_all_lock(void)
	__acquires(&nf_conntrack_locks_all_lock)
{
	int i;

	spin_lock(&nf_conntrack_locks_all_lock);

	/* For nf_conntrack_locks_all, only the latest time when another
	 * CPU will see an update is controlled, by the "release" of the
	 * spin_lock below.
	 * The earliest time is not controlled, and thus KCSAN could detect
	 * a race when nf_conntrack_lock() reads the variable.
	 * WRITE_ONCE() is used to ensure the compiler will not
	 * optimize the write.
	 */
	WRITE_ONCE(nf_conntrack_locks_all, true);

	for (i = 0; i < CONNTRACK_LOCKS; i++) {
		spin_lock(&nf_conntrack_locks[i]);

		/* This spin_unlock provides the "release" to ensure that
		 * nf_conntrack_locks_all==true is visible to everyone that
		 * acquired spin_lock(&nf_conntrack_locks[]).
		 */
		spin_unlock(&nf_conntrack_locks[i]);
	}
}

static void nf_conntrack_all_unlock(void)
	__releases(&nf_conntrack_locks_all_lock)
{
	/* All prior stores must be complete before we clear
	 * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
	 * might observe the false value but not the entire
	 * critical section.
	 * It pairs with the smp_load_acquire() in nf_conntrack_lock().
	 */
	smp_store_release(&nf_conntrack_locks_all, false);
	spin_unlock(&nf_conntrack_locks_all_lock);
}

unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);

unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);
seqcount_spinlock_t nf_conntrack_generation __read_mostly;
static siphash_key_t nf_conntrack_hash_rnd __read_mostly;

static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
			      unsigned int zoneid,
			      const struct net *net)
{
	struct {
		struct nf_conntrack_man src;
		union nf_inet_addr dst_addr;
		unsigned int zone;
		u32 net_mix;
		u16 dport;
		u16 proto;
	} __aligned(SIPHASH_ALIGNMENT) combined;

	get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));

	memset(&combined, 0, sizeof(combined));

	/* The direction must be ignored, so handle usable members manually. */
	combined.src = tuple->src;
	combined.dst_addr = tuple->dst.u3;
	combined.zone = zoneid;
	combined.net_mix = net_hash_mix(net);
	combined.dport = (__force __u16)tuple->dst.u.all;
	combined.proto = tuple->dst.protonum;

	return (u32)siphash(&combined, sizeof(combined), &nf_conntrack_hash_rnd);
}

static u32 scale_hash(u32 hash)
{
	return reciprocal_scale(hash, nf_conntrack_htable_size);
}

static u32 __hash_conntrack(const struct net *net,
			    const struct nf_conntrack_tuple *tuple,
			    unsigned int zoneid,
			    unsigned int size)
{
	return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size);
}

static u32 hash_conntrack(const struct net *net,
			  const struct nf_conntrack_tuple *tuple,
			  unsigned int zoneid)
{
	return scale_hash(hash_conntrack_raw(tuple, zoneid, net));
}

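/* The first four bytes of the TCP, UDP, UDPLITE, SCTP and DCCP headers
 * all carry the source and destination ports in the same layout, so a
 * single helper can extract the ports for all of them.
 */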
static bool nf_ct_get_tuple_ports(const struct sk_buff *skb,
				  unsigned int dataoff,
				  struct nf_conntrack_tuple *tuple)
{
	struct {
		__be16 sport;
		__be16 dport;
	} _inet_hdr, *inet_hdr;

	inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr);
	if (!inet_hdr)
		return false;

	tuple->src.u.udp.port = inet_hdr->sport;
	tuple->dst.u.udp.port = inet_hdr->dport;
	return true;
}

static bool
nf_ct_get_tuple(const struct sk_buff *skb,
		unsigned int nhoff,
		unsigned int dataoff,
		u_int16_t l3num,
		u_int8_t protonum,
		struct net *net,
		struct nf_conntrack_tuple *tuple)
{
	unsigned int size;
	const __be32 *ap;
	__be32 _addrs[8];

	memset(tuple, 0, sizeof(*tuple));

	tuple->src.l3num = l3num;
	switch (l3num) {
	case NFPROTO_IPV4:
		nhoff += offsetof(struct iphdr, saddr);
		size = 2 * sizeof(__be32);
		break;
	case NFPROTO_IPV6:
		nhoff += offsetof(struct ipv6hdr, saddr);
		size = sizeof(_addrs);
		break;
	default:
		return true;
	}

	ap = skb_header_pointer(skb, nhoff, size, _addrs);
	if (!ap)
		return false;

	switch (l3num) {
	case NFPROTO_IPV4:
		tuple->src.u3.ip = ap[0];
		tuple->dst.u3.ip = ap[1];
		break;
	case NFPROTO_IPV6:
		memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
		memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
		break;
	}

	tuple->dst.protonum = protonum;
	tuple->dst.dir = IP_CT_DIR_ORIGINAL;

	switch (protonum) {
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
	case IPPROTO_ICMP:
		return icmp_pkt_to_tuple(skb, dataoff, net, tuple);
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE:
		return gre_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
	case IPPROTO_TCP:
	case IPPROTO_UDP:
		return nf_ct_get_tuple_ports(skb, dataoff, tuple);
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
	case IPPROTO_UDPLITE:
		return nf_ct_get_tuple_ports(skb, dataoff, tuple);
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
	case IPPROTO_SCTP:
		return nf_ct_get_tuple_ports(skb, dataoff, tuple);
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
	case IPPROTO_DCCP:
		return nf_ct_get_tuple_ports(skb, dataoff, tuple);
#endif
	default:
		break;
	}

	return true;
}

static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
			    u_int8_t *protonum)
{
	int dataoff = -1;
	const struct iphdr *iph;
	struct iphdr _iph;

	iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
	if (!iph)
		return -1;

	/* Conntrack defragments packets, we might still see fragments
	 * inside ICMP packets though.
	 */
	if (iph->frag_off & htons(IP_OFFSET))
		return -1;

	dataoff = nhoff + (iph->ihl << 2);
	*protonum = iph->protocol;

	/* Check bogus IP headers */
	if (dataoff > skb->len) {
		pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
			 nhoff, iph->ihl << 2, skb->len);
		return -1;
	}
	return dataoff;
}

#if IS_ENABLED(CONFIG_IPV6)
static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
			    u8 *protonum)
{
	int protoff = -1;
	unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
	__be16 frag_off;
	u8 nexthdr;

	if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
			  &nexthdr, sizeof(nexthdr)) != 0) {
		pr_debug("can't get nexthdr\n");
		return -1;
	}
	protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
	/* (protoff == skb->len) means the packet has no payload, just the
	 * IPv6 header and possibly extension headers; it is still tracked.
	 */
	if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
		pr_debug("can't find proto in pkt\n");
		return -1;
	}

	*protonum = nexthdr;
	return protoff;
}
#endif

static int get_l4proto(const struct sk_buff *skb,
		       unsigned int nhoff, u8 pf, u8 *l4num)
{
	switch (pf) {
	case NFPROTO_IPV4:
		return ipv4_get_l4proto(skb, nhoff, l4num);
#if IS_ENABLED(CONFIG_IPV6)
	case NFPROTO_IPV6:
		return ipv6_get_l4proto(skb, nhoff, l4num);
#endif
	default:
		*l4num = 0;
		break;
	}
	return -1;
}

bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
		       u_int16_t l3num,
		       struct net *net, struct nf_conntrack_tuple *tuple)
{
	u8 protonum;
	int protoff;

	protoff = get_l4proto(skb, nhoff, l3num, &protonum);
	if (protoff <= 0)
		return false;

	return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple);
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);

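/* Build the tuple for the reply direction: addresses and ports are
 * swapped, the l3/l4 protocol numbers are preserved.
 */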
bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
		   const struct nf_conntrack_tuple *orig)
{
	memset(inverse, 0, sizeof(*inverse));

	inverse->src.l3num = orig->src.l3num;

	switch (orig->src.l3num) {
	case NFPROTO_IPV4:
		inverse->src.u3.ip = orig->dst.u3.ip;
		inverse->dst.u3.ip = orig->src.u3.ip;
		break;
	case NFPROTO_IPV6:
		inverse->src.u3.in6 = orig->dst.u3.in6;
		inverse->dst.u3.in6 = orig->src.u3.in6;
		break;
	default:
		break;
	}

	inverse->dst.dir = !orig->dst.dir;

	inverse->dst.protonum = orig->dst.protonum;

	switch (orig->dst.protonum) {
	case IPPROTO_ICMP:
		return nf_conntrack_invert_icmp_tuple(inverse, orig);
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return nf_conntrack_invert_icmpv6_tuple(inverse, orig);
#endif
	}

	inverse->src.u.all = orig->dst.u.all;
	inverse->dst.u.all = orig->src.u.all;
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);

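/* Generate an almost-unique pseudo-id for a given conntrack.
 *
 * Intentionally doesn't re-use any of the seeds used for hash
 * table location; we assume the id gets exposed to userspace.
 *
 * The following nf_conn items do not change throughout the lifetime
 * of the nf_conn after it has been committed to the main hash table:
 *
 * 1. nf_conn address
 * 2. nf_conn->master address (normally NULL)
 * 3. the associated net namespace
 * 4. the original direction tuple
 */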
u32 nf_ct_get_id(const struct nf_conn *ct)
{
	static __read_mostly siphash_key_t ct_id_seed;
	unsigned long a, b, c, d;

	net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));

	a = (unsigned long)ct;
	b = (unsigned long)ct->master;
	c = (unsigned long)nf_ct_net(ct);
	d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				   sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple),
				   &ct_id_seed);
#ifdef CONFIG_64BIT
	return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed);
#else
	return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed);
#endif
}
EXPORT_SYMBOL_GPL(nf_ct_get_id);

static void
clean_from_lists(struct nf_conn *ct)
{
	pr_debug("clean_from_lists(%p)\n", ct);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);

	/* Destroy all pending expectations */
	nf_ct_remove_expectations(ct);
}

/* must be called with local_bh_disable */
static void nf_ct_add_to_dying_list(struct nf_conn *ct)
{
	struct ct_pcpu *pcpu;

	/* add this conntrack to the (per cpu) dying list */
	ct->cpu = smp_processor_id();
	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);

	spin_lock(&pcpu->lock);
	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
			     &pcpu->dying);
	spin_unlock(&pcpu->lock);
}

/* must be called with local_bh_disable */
static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
{
	struct ct_pcpu *pcpu;

	/* add this conntrack to the (per cpu) unconfirmed list */
	ct->cpu = smp_processor_id();
	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);

	spin_lock(&pcpu->lock);
	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
			     &pcpu->unconfirmed);
	spin_unlock(&pcpu->lock);
}

/* must be called with local_bh_disable */
static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
{
	struct ct_pcpu *pcpu;

	/* We overload first tuple to link into unconfirmed or dying list.*/
	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);

	spin_lock(&pcpu->lock);
	BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	spin_unlock(&pcpu->lock);
}

#define NFCT_ALIGN(len)	(((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)

/* Released via destroy_conntrack() */
struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
				 const struct nf_conntrack_zone *zone,
				 gfp_t flags)
{
	struct nf_conn *tmpl, *p;

	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) {
		tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags);
		if (!tmpl)
			return NULL;

		p = tmpl;
		tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
		if (tmpl != p) {
			tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
			tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
		}
	} else {
		tmpl = kzalloc(sizeof(*tmpl), flags);
		if (!tmpl)
			return NULL;
	}

	tmpl->status = IPS_TEMPLATE;
	write_pnet(&tmpl->ct_net, net);
	nf_ct_zone_add(tmpl, zone);
	atomic_set(&tmpl->ct_general.use, 0);

	return tmpl;
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);

void nf_ct_tmpl_free(struct nf_conn *tmpl)
{
	nf_ct_ext_destroy(tmpl);

	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
		kfree((char *)tmpl - tmpl->proto.tmpl_padto);
	else
		kfree(tmpl);
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);

static void destroy_gre_conntrack(struct nf_conn *ct)
{
#ifdef CONFIG_NF_CT_PROTO_GRE
	struct nf_conn *master = ct->master;

	if (master)
		nf_ct_gre_keymap_destroy(master);
#endif
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
	struct nf_conn *ct = (struct nf_conn *)nfct;

	pr_debug("destroy_conntrack(%p)\n", ct);
	WARN_ON(atomic_read(&nfct->use) != 0);

	if (unlikely(nf_ct_is_template(ct))) {
		nf_ct_tmpl_free(ct);
		return;
	}

	if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
		destroy_gre_conntrack(ct);

	local_bh_disable();
	/* Expectations will have been removed in clean_from_lists,
	 * except TFTP can create an expectation on the first packet,
	 * before connection is in the list, so we need to clean here,
	 * too.
	 */
	nf_ct_remove_expectations(ct);

	nf_ct_del_from_dying_or_unconfirmed_list(ct);

	local_bh_enable();

	if (ct->master)
		nf_ct_put(ct->master);

	pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
	nf_conntrack_free(ct);
}

static void nf_ct_delete_from_lists(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	unsigned int sequence;

	nf_ct_helper_destroy(ct);

	local_bh_disable();
	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	clean_from_lists(ct);
	nf_conntrack_double_unlock(hash, reply_hash);

	nf_ct_add_to_dying_list(ct);

	local_bh_enable();
}

bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
{
	struct nf_conn_tstamp *tstamp;
	struct net *net;

	if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
		return false;

	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp) {
		s32 timeout = ct->timeout - nfct_time_stamp;

		tstamp->stop = ktime_get_real_ns();
		if (timeout < 0)
			tstamp->stop -= jiffies_to_nsecs(-timeout);
	}

	if (nf_conntrack_event_report(IPCT_DESTROY, ct,
				      portid, report) < 0) {
		/* destroy event was not delivered. nf_ct_put will
		 * be done by event cache worker on redelivery.
		 */
		nf_ct_delete_from_lists(ct);
		nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL);
		return false;
	}

	net = nf_ct_net(ct);
	if (nf_conntrack_ecache_dwork_pending(net))
		nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT);
	nf_ct_delete_from_lists(ct);
	nf_ct_put(ct);
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_delete);

static inline bool
nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
		const struct nf_conntrack_tuple *tuple,
		const struct nf_conntrack_zone *zone,
		const struct net *net)
{
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

	/* A conntrack can be recreated with the equal tuple,
	 * so we need to check that the conntrack is confirmed
	 */
	return nf_ct_tuple_equal(tuple, &h->tuple) &&
	       nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
	       nf_ct_is_confirmed(ct) &&
	       net_eq(net, nf_ct_net(ct));
}

static inline bool
nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
{
	return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
	       nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
				 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
	       net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
}

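/* Takes a temporary reference (so the entry cannot be freed from under
 * us) and kills the conntrack if its timeout has really expired.
 * Caller must hold the RCU read lock.
 */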
static void nf_ct_gc_expired(struct nf_conn *ct)
{
	if (!atomic_inc_not_zero(&ct->ct_general.use))
		return;

	if (nf_ct_should_gc(ct))
		nf_ct_kill(ct);

	nf_ct_put(ct);
}

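/*
 * Warning :
 * - Caller must take a reference on returned object
 *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
 */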
static struct nf_conntrack_tuple_hash *
____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple, u32 hash)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	struct hlist_nulls_node *n;
	unsigned int bucket, hsize;

begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	bucket = reciprocal_scale(hash, hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
		struct nf_conn *ct;

		ct = nf_ct_tuplehash_to_ctrack(h);
		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_key_equal(h, tuple, zone, net))
			return h;
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(n) != bucket) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	return NULL;
}

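/* Find a connection corresponding to a tuple. */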
static struct nf_conntrack_tuple_hash *
__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
			const struct nf_conntrack_tuple *tuple, u32 hash)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	rcu_read_lock();

	h = ____nf_conntrack_find(net, zone, tuple, hash);
	if (h) {
		/* We have a candidate that matches the tuple we're interested
		 * in, try to obtain a reference and re-check tuple
		 */
		ct = nf_ct_tuplehash_to_ctrack(h);
		if (likely(atomic_inc_not_zero(&ct->ct_general.use))) {
			if (likely(nf_ct_key_equal(h, tuple, zone, net)))
				goto found;

			/* TYPESAFE_BY_RCU recycled the candidate */
			nf_ct_put(ct);
		}

		h = NULL;
	}
found:
	rcu_read_unlock();

	return h;
}

struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple)
{
	unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
	struct nf_conntrack_tuple_hash *thash;

	thash = __nf_conntrack_find_get(net, zone, tuple,
					hash_conntrack_raw(tuple, zone_id, net));

	if (thash)
		return thash;

	rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
	if (rid != zone_id)
		return __nf_conntrack_find_get(net, zone, tuple,
					       hash_conntrack_raw(tuple, rid, net));
	return thash;
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);

static void __nf_conntrack_hash_insert(struct nf_conn *ct,
				       unsigned int hash,
				       unsigned int reply_hash)
{
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &nf_conntrack_hash[hash]);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[reply_hash]);
}

int
nf_conntrack_hash_check_insert(struct nf_conn *ct)
{
	const struct nf_conntrack_zone *zone;
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int max_chainlen;
	unsigned int chainlen = 0;
	unsigned int sequence;
	int err = -EEXIST;

	zone = nf_ct_zone(ct);

	local_bh_disable();
	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN);

	/* See if there's one in the list already, including reverse */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;

		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	chainlen = 0;

	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	smp_wmb();
	/* The caller holds a reference to this object */
	atomic_set(&ct->ct_general.use, 2);
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	NF_CT_STAT_INC(net, insert);
	local_bh_enable();
	return 0;
chaintoolong:
	NF_CT_STAT_INC(net, chaintoolong);
	err = -ENOSPC;
out:
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
	return err;
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);

void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets,
		    unsigned int bytes)
{
	struct nf_conn_acct *acct;

	acct = nf_conn_acct_find(ct);
	if (acct) {
		struct nf_conn_counter *counter = acct->counter;

		atomic64_add(packets, &counter[dir].packets);
		atomic64_add(bytes, &counter[dir].bytes);
	}
}
EXPORT_SYMBOL_GPL(nf_ct_acct_add);

static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
			     const struct nf_conn *loser_ct)
{
	struct nf_conn_acct *acct;

	acct = nf_conn_acct_find(loser_ct);
	if (acct) {
		struct nf_conn_counter *counter = acct->counter;
		unsigned int bytes;

		/* u32 should be fine since we must have seen one packet. */
		bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
		nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
	}
}

static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
{
	struct nf_conn_tstamp *tstamp;

	atomic_inc(&ct->ct_general.use);
	ct->status |= IPS_CONFIRMED;

	/* set conntrack timestamp, if enabled. */
	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp)
		tstamp->start = ktime_get_real_ns();
}

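/* Resolve race on insertion if this protocol allows this. */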
static int __nf_ct_resolve_clash(struct sk_buff *skb,
				 struct nf_conntrack_tuple_hash *h)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;

	loser_ct = nf_ct_get(skb, &ctinfo);

	if (nf_ct_is_dying(ct))
		return NF_DROP;

	if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
	    nf_ct_match(ct, loser_ct)) {
		struct net *net = nf_ct_net(ct);

		nf_conntrack_get(&ct->ct_general);

		nf_ct_acct_merge(ct, ctinfo, loser_ct);
		nf_ct_add_to_dying_list(loser_ct);
		nf_conntrack_put(&loser_ct->ct_general);
		nf_ct_set(skb, ct, ctinfo);

		NF_CT_STAT_INC(net, clash_resolve);
		return NF_ACCEPT;
	}

	return NF_DROP;
}

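/**
 * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry
 *
 * @skb: skb that causes the collision
 * @repl_idx: hash slot for reply direction
 *
 * Called when origin or reply direction had a clash.
 * The skb can be handled without packet drop provided the reply direction
 * is unique or the existing entry has the identical tuple in both
 * directions.
 *
 * Caller must hold conntrack table locks to prevent concurrent updates.
 *
 * Returns NF_DROP if the clash could not be handled.
 */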
static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
{
	struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	struct net *net;

	zone = nf_ct_zone(loser_ct);
	net = nf_ct_net(loser_ct);

	/* Reply direction must never result in a clash, unless both origin
	 * and reply tuples are identical.
	 */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
		if (nf_ct_key_equal(h,
				    &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			return __nf_ct_resolve_clash(skb, h);
	}

	/* We want the clashing entry to go away real soon: 1 second timeout. */
	loser_ct->timeout = nfct_time_stamp + HZ;

	/* IPS_NAT_CLASH removes the entry automatically on the first
	 * reply.  Also prevents UDP tracker from moving the entry to
	 * ASSURED state, i.e. the entry can always be evicted under
	 * pressure.
	 */
	loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;

	__nf_conntrack_insert_prepare(loser_ct);

	/* fake add for ORIGINAL dir: we want lookups to only find the entry
	 * already in the table.  This also hides the clashing entry from
	 * ctnetlink iteration, i.e. conntrack -L won't show them.
	 */
	hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);

	hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[repl_idx]);

	NF_CT_STAT_INC(net, clash_resolve);
	return NF_ACCEPT;
}

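/**
 * nf_ct_resolve_clash - attempt to handle clash without packet drop
 *
 * @skb: skb that causes the clash
 * @h: tuplehash of the clashing entry already in table
 * @reply_hash: hash slot for reply direction
 *
 * A conntrack entry can be inserted into the connection tracking table
 * if there is no existing entry with an identical tuple.
 *
 * If there is one, @skb (and the associated, unconfirmed conntrack) has
 * to be dropped.  In case @skb is retransmitted, next conntrack lookup
 * will find the already-existing entry.
 *
 * The major problem with such packet drop is the extra delay added by
 * the packet loss -- it will take some time for a retransmit to occur
 * (or the sender to time out when waiting for a reply).
 *
 * This function attempts to handle the situation without packet drop.
 *
 * If @skb has no NAT transformation or if the colliding entries are
 * exactly the same, only the to-be-confirmed conntrack entry is discarded
 * and @skb is associated with the conntrack entry already in the table.
 *
 * Failing that, the new, unconfirmed conntrack is still added to the table
 * provided that the collision only occurs in the ORIGINAL direction.
 * The new entry will be added only in the non-clashing REPLY direction,
 * and it gets a fixed timeout so it is guaranteed to expire.
 *
 * Returns NF_DROP if the clash could not be resolved.
 */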
static __cold noinline int
nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
		    u32 reply_hash)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	const struct nf_conntrack_l4proto *l4proto;
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;
	struct net *net;
	int ret;

	loser_ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(loser_ct);

	l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
	if (!l4proto->allow_clash)
		goto drop;

	ret = __nf_ct_resolve_clash(skb, h);
	if (ret == NF_ACCEPT)
		return ret;

	ret = nf_ct_resolve_clash_harder(skb, reply_hash);
	if (ret == NF_ACCEPT)
		return ret;

drop:
	nf_ct_add_to_dying_list(loser_ct);
	NF_CT_STAT_INC(net, drop);
	NF_CT_STAT_INC(net, insert_failed);
	return NF_DROP;
}

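/* Confirm a connection given skb; places it in hash table */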
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
	unsigned int chainlen = 0, sequence, max_chainlen;
	const struct nf_conntrack_zone *zone;
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct hlist_nulls_node *n;
	enum ip_conntrack_info ctinfo;
	struct net *net;
	int ret = NF_DROP;

	ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(ct);

	/* ipt_REJECT uses nf_conntrack_attach to attach related
	 * ICMP/TCP RST packets in other direction.  Actual packet
	 * which created connection will be IP_CT_NEW or for an
	 * expected connection, IP_CT_RELATED.
	 */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	zone = nf_ct_zone(ct);
	local_bh_disable();

	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		/* reuse the hash saved before */
		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
		hash = scale_hash(hash);
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	/* We're not in hash table, and we refuse to set up related
	 * connections for unconfirmed conns.  But packet copies and
	 * REJECT will give spurious warnings here.
	 */

	/* Another skb with the same unconfirmed conntrack may
	 * win the race. This may happen for bridge(br_flood)
	 * or broadcast route check.
	 */
	if (unlikely(nf_ct_is_confirmed(ct))) {
		WARN_ON_ONCE(1);
		nf_conntrack_double_unlock(hash, reply_hash);
		local_bh_enable();
		return NF_DROP;
	}

	pr_debug("Confirming conntrack %p\n", ct);

	/* We have to check the DYING flag after unlink to prevent
	 * a race against nf_ct_get_next_corpse() possibly called from
	 * user context, else we insert an already 'dead' hash, blocking
	 * further use of that particular connection -JM.
	 */
	nf_ct_del_from_dying_or_unconfirmed_list(ct);

	if (unlikely(nf_ct_is_dying(ct))) {
		nf_ct_add_to_dying_list(ct);
		NF_CT_STAT_INC(net, insert_failed);
		goto dying;
	}

	max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN);
	/* See if there's one in the list already, including reverse:
	 * NAT could have grabbed it without realizing, since we're
	 * not in the hash.  If there is, we lost race.
	 */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	chainlen = 0;
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen) {
chaintoolong:
			nf_ct_add_to_dying_list(ct);
			NF_CT_STAT_INC(net, chaintoolong);
			NF_CT_STAT_INC(net, insert_failed);
			ret = NF_DROP;
			goto dying;
		}
	}

	/* Timer relative to confirmation time, not original
	 * setting time, otherwise we'd get timer wrap in
	 * weird delay cases.
	 */
	ct->timeout += nfct_time_stamp;

	__nf_conntrack_insert_prepare(ct);

	/* Since the lookup is lockless, hash insertion must be done after
	 * starting the timer and setting the CONFIRMED bit. The RCU barriers
	 * guarantee that no other CPU can find the conntrack before the above
	 * stores are visible.
	 */
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();

	help = nfct_help(ct);
	if (help && help->helper)
		nf_conntrack_event_cache(IPCT_HELPER, ct);

	nf_conntrack_event_cache(master_ct(ct) ?
				 IPCT_RELATED : IPCT_NEW, ct);
	return NF_ACCEPT;

out:
	ret = nf_ct_resolve_clash(skb, h, reply_hash);
dying:
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
	return ret;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);

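/* Returns true if a connection corresponds to the tuple (required
 * for NAT).
 */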
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack)
{
	struct net *net = nf_ct_net(ignored_conntrack);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	unsigned int hash, hsize;
	struct hlist_nulls_node *n;
	struct nf_conn *ct;

	zone = nf_ct_zone(ignored_conntrack);

	rcu_read_lock();
begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);

		if (ct == ignored_conntrack)
			continue;

		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_key_equal(h, tuple, zone, net)) {
			/* Tuple is taken already, so caller will need to find
			 * a new source port to use.
			 *
			 * Only exception: if the *original tuples* are identical,
			 * then both conntracks refer to the same flow.
			 * This is a rare situation, it can occur e.g. when
			 * more than one UDP packet is sent from same socket
			 * in different threads.
			 *
			 * Let nf_ct_resolve_clash() deal with this later.
			 */
			if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
					      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
			    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL))
				continue;

			NF_CT_STAT_INC_ATOMIC(net, found);
			rcu_read_unlock();
			return 1;
		}
	}

	if (get_nulls_value(n) != hash) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	rcu_read_unlock();

	return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);

#define NF_CT_EVICTION_RANGE	8

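/* There's a small race here where we may free a just-assured
 * connection.  Too bad: we're in trouble anyway.
 */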
static unsigned int early_drop_list(struct net *net,
				    struct hlist_nulls_head *head)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int drops = 0;
	struct nf_conn *tmp;

	hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
		tmp = nf_ct_tuplehash_to_ctrack(h);

		if (test_bit(IPS_OFFLOAD_BIT, &tmp->status))
			continue;

		if (nf_ct_is_expired(tmp)) {
			nf_ct_gc_expired(tmp);
			continue;
		}

		if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
		    !net_eq(nf_ct_net(tmp), net) ||
		    nf_ct_is_dying(tmp))
			continue;

		if (!atomic_inc_not_zero(&tmp->ct_general.use))
			continue;

		/* kill only if still in same netns -- might have moved due to
		 * SNAT in previous round.
		 *
		 * We took a reference above; if nf_ct_delete() fails the
		 * entry is already dying, so just drop the reference and
		 * move to the next entry.
		 */
		if (net_eq(nf_ct_net(tmp), net) &&
		    nf_ct_is_confirmed(tmp) &&
		    nf_ct_delete(tmp, 0, 0))
			drops++;

		nf_ct_put(tmp);
	}

	return drops;
}

static noinline int early_drop(struct net *net, unsigned int hash)
{
	unsigned int i, bucket;

	for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
		struct hlist_nulls_head *ct_hash;
		unsigned int hsize, drops;

		rcu_read_lock();
		nf_conntrack_get_ht(&ct_hash, &hsize);
		if (!i)
			bucket = reciprocal_scale(hash, hsize);
		else
			bucket = (bucket + 1) % hsize;

		drops = early_drop_list(net, &ct_hash[bucket]);
		rcu_read_unlock();

		if (drops) {
			NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
			return true;
		}
	}

	return false;
}

static bool gc_worker_skip_ct(const struct nf_conn *ct)
{
	return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct);
}

static bool gc_worker_can_early_drop(const struct nf_conn *ct)
{
	const struct nf_conntrack_l4proto *l4proto;

	if (!test_bit(IPS_ASSURED_BIT, &ct->status))
		return true;

	l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
	if (l4proto->can_early_drop && l4proto->can_early_drop(ct))
		return true;

	return false;
}

static void gc_worker(struct work_struct *work)
{
	unsigned long end_time = jiffies + GC_SCAN_MAX_DURATION;
	unsigned int i, hashsz, nf_conntrack_max95 = 0;
	unsigned long next_run = GC_SCAN_INTERVAL;
	struct conntrack_gc_work *gc_work;
	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);

	i = gc_work->next_bucket;
	if (gc_work->early_drop)
		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;

	do {
		struct nf_conntrack_tuple_hash *h;
		struct hlist_nulls_head *ct_hash;
		struct hlist_nulls_node *n;
		struct nf_conn *tmp;

		rcu_read_lock();

		nf_conntrack_get_ht(&ct_hash, &hashsz);
		if (i >= hashsz) {
			rcu_read_unlock();
			break;
		}

		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
			struct nf_conntrack_net *cnet;
			struct net *net;

			tmp = nf_ct_tuplehash_to_ctrack(h);

			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
				nf_ct_offload_timeout(tmp);
				continue;
			}

			if (nf_ct_is_expired(tmp)) {
				nf_ct_gc_expired(tmp);
				continue;
			}

			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
				continue;

			net = nf_ct_net(tmp);
			cnet = nf_ct_pernet(net);
			if (atomic_read(&cnet->count) < nf_conntrack_max95)
				continue;

			/* need to take reference to avoid possible races */
			if (!atomic_inc_not_zero(&tmp->ct_general.use))
				continue;

			if (gc_worker_skip_ct(tmp)) {
				nf_ct_put(tmp);
				continue;
			}

			if (gc_worker_can_early_drop(tmp))
				nf_ct_kill(tmp);

			nf_ct_put(tmp);
		}

		/* could check get_nulls_value() here and restart if ct
		 * was moved to another chain.  But given gc is best-effort
		 * we will just continue with next hash slot.
		 */
		rcu_read_unlock();
		cond_resched();
		i++;

		if (time_after(jiffies, end_time) && i < hashsz) {
			gc_work->next_bucket = i;
			next_run = 0;
			break;
		}
	} while (i < hashsz);

	if (gc_work->exiting)
		return;

	/*
	 * Eviction will normally happen from the packet path, and not
	 * from this gc worker.
	 *
	 * This worker is only here to reap expired entries when system went
	 * idle after a busy period.
	 */
	if (next_run) {
		gc_work->early_drop = false;
		gc_work->next_bucket = 0;
	}
	queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
}

static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
{
	INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker);
	gc_work->exiting = false;
}

static struct nf_conn *
__nf_conntrack_alloc(struct net *net,
		     const struct nf_conntrack_zone *zone,
		     const struct nf_conntrack_tuple *orig,
		     const struct nf_conntrack_tuple *repl,
		     gfp_t gfp, u32 hash)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
	unsigned int ct_count;
	struct nf_conn *ct;

	/* We don't want any race condition at early drop stage */
	ct_count = atomic_inc_return(&cnet->count);

	if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
		if (!early_drop(net, hash)) {
			if (!conntrack_gc_work.early_drop)
				conntrack_gc_work.early_drop = true;
			atomic_dec(&cnet->count);
			net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
			return ERR_PTR(-ENOMEM);
		}
	}

	/*
	 * Do not use kmem_cache_zalloc(), as this cache uses
	 * SLAB_TYPESAFE_BY_RCU.
	 */
	ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
	if (ct == NULL)
		goto out;

	spin_lock_init(&ct->lock);
	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
	ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
	/* save hash for reusing when confirming */
	*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
	ct->status = 0;
	ct->timeout = 0;
	write_pnet(&ct->ct_net, net);
	memset(&ct->__nfct_init_offset, 0,
	       offsetof(struct nf_conn, proto) -
	       offsetof(struct nf_conn, __nfct_init_offset));

	nf_ct_zone_add(ct, zone);

	/* Because we use RCU lookups, we set ct_general.use to zero before
	 * this is inserted in any list.
	 */
	atomic_set(&ct->ct_general.use, 0);
	return ct;
out:
	atomic_dec(&cnet->count);
	return ERR_PTR(-ENOMEM);
}

struct nf_conn *nf_conntrack_alloc(struct net *net,
				   const struct nf_conntrack_zone *zone,
				   const struct nf_conntrack_tuple *orig,
				   const struct nf_conntrack_tuple *repl,
				   gfp_t gfp)
{
	return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
}
EXPORT_SYMBOL_GPL(nf_conntrack_alloc);

void nf_conntrack_free(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_net *cnet;

	/* A freed object has refcnt == 0, that's
	 * the golden rule for SLAB_TYPESAFE_BY_RCU
	 */
	WARN_ON(atomic_read(&ct->ct_general.use) != 0);

	nf_ct_ext_destroy(ct);
	kmem_cache_free(nf_conntrack_cachep, ct);
	cnet = nf_ct_pernet(net);

	smp_mb__before_atomic();
	atomic_dec(&cnet->count);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);

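/* Allocate a new conntrack: we return -ENOMEM if classification
 * failed due to stress.  Otherwise it really is unclassifiable.
 */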
static noinline struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net, struct nf_conn *tmpl,
	       const struct nf_conntrack_tuple *tuple,
	       struct sk_buff *skb,
	       unsigned int dataoff, u32 hash)
{
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct nf_conntrack_tuple repl_tuple;
	struct nf_conntrack_ecache *ecache;
	struct nf_conntrack_expect *exp = NULL;
	const struct nf_conntrack_zone *zone;
	struct nf_conn_timeout *timeout_ext;
	struct nf_conntrack_zone tmp;
	struct nf_conntrack_net *cnet;

	if (!nf_ct_invert_tuple(&repl_tuple, tuple)) {
		pr_debug("Can't invert tuple.\n");
		return NULL;
	}

	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
	ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
				  hash);
	if (IS_ERR(ct))
		return (struct nf_conntrack_tuple_hash *)ct;

	if (!nf_ct_add_synproxy(ct, tmpl)) {
		nf_conntrack_free(ct);
		return ERR_PTR(-ENOMEM);
	}

	timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;

	if (timeout_ext)
		nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
				      GFP_ATOMIC);

	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
	nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
	nf_ct_labels_ext_add(ct);

	ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
	nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
			     ecache ? ecache->expmask : 0,
			     GFP_ATOMIC);

	local_bh_disable();
	cnet = nf_ct_pernet(net);
	if (cnet->expect_count) {
		spin_lock(&nf_conntrack_expect_lock);
		exp = nf_ct_find_expectation(net, zone, tuple);
		if (exp) {
			pr_debug("expectation arrives ct=%p exp=%p\n",
				 ct, exp);
			/* Welcome, Mr. Bond.  We've been expecting you... */
			__set_bit(IPS_EXPECTED_BIT, &ct->status);
			/* exp->master safe, refcnt bumped in nf_ct_find_expectation */
			ct->master = exp->master;
			if (exp->helper) {
				help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
				if (help)
					rcu_assign_pointer(help->helper, exp->helper);
			}

#ifdef CONFIG_NF_CONNTRACK_MARK
			ct->mark = exp->master->mark;
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
			ct->secmark = exp->master->secmark;
#endif
			NF_CT_STAT_INC(net, expect_new);
		}
		spin_unlock(&nf_conntrack_expect_lock);
	}
	if (!exp)
		__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);

	/* Now it is inserted into the unconfirmed list, bump refcount */
	nf_conntrack_get(&ct->ct_general);
	nf_ct_add_to_unconfirmed_list(ct);

	local_bh_enable();

	if (exp) {
		if (exp->expectfn)
			exp->expectfn(ct, exp);
		nf_ct_expect_put(exp);
	}

	return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
}

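/* On success, returns 0, sets skb->_nfct | ctinfo */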
static int
resolve_normal_ct(struct nf_conn *tmpl,
		  struct sk_buff *skb,
		  unsigned int dataoff,
		  u_int8_t protonum,
		  const struct nf_hook_state *state)
{
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_tuple_hash *h;
	enum ip_conntrack_info ctinfo;
	struct nf_conntrack_zone tmp;
	u32 hash, zone_id, rid;
	struct nf_conn *ct;

	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
			     dataoff, state->pf, protonum, state->net,
			     &tuple)) {
		pr_debug("Can't get tuple\n");
		return 0;
	}

	/* look for tuple match */
	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);

	zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
	hash = hash_conntrack_raw(&tuple, zone_id, state->net);
	h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);

	if (!h) {
		rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
		if (zone_id != rid) {
			u32 tmp = hash_conntrack_raw(&tuple, rid, state->net);

			h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp);
		}
	}

	if (!h) {
		h = init_conntrack(state->net, tmpl, &tuple,
				   skb, dataoff, hash);
		if (!h)
			return 0;
		if (IS_ERR(h))
			return PTR_ERR(h);
	}
	ct = nf_ct_tuplehash_to_ctrack(h);

	/* It exists; we have (non-exclusive) reference. */
	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
		ctinfo = IP_CT_ESTABLISHED_REPLY;
	} else {
		/* Once we've had two way comms, always ESTABLISHED. */
		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
			pr_debug("normal packet for %p\n", ct);
			ctinfo = IP_CT_ESTABLISHED;
		} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
			pr_debug("related packet for %p\n", ct);
			ctinfo = IP_CT_RELATED;
		} else {
			pr_debug("new packet for %p\n", ct);
			ctinfo = IP_CT_NEW;
		}
	}
	nf_ct_set(skb, ct, ctinfo);
	return 0;
}

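/*
 * icmp packets need special treatment to handle error messages that are
 * related to a connection.
 *
 * Callers need to check if skb has a conntrack assigned when this
 * helper returns; in such case skb belongs to an already known connection.
 */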
static unsigned int __cold
nf_conntrack_handle_icmp(struct nf_conn *tmpl,
			 struct sk_buff *skb,
			 unsigned int dataoff,
			 u8 protonum,
			 const struct nf_hook_state *state)
{
	int ret;

	if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP)
		ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state);
#if IS_ENABLED(CONFIG_IPV6)
	else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6)
		ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state);
#endif
	else
		return NF_ACCEPT;

	if (ret <= 0)
		NF_CT_STAT_INC_ATOMIC(state->net, error);

	return ret;
}

static int generic_packet(struct nf_conn *ct, struct sk_buff *skb,
			  enum ip_conntrack_info ctinfo)
{
	const unsigned int *timeout = nf_ct_timeout_lookup(ct);

	if (!timeout)
		timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout;

	nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
	return NF_ACCEPT;
}

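/* Returns verdict for packet, or -1 for invalid. */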
static int nf_conntrack_handle_packet(struct nf_conn *ct,
				      struct sk_buff *skb,
				      unsigned int dataoff,
				      enum ip_conntrack_info ctinfo,
				      const struct nf_hook_state *state)
{
	switch (nf_ct_protonum(ct)) {
	case IPPROTO_TCP:
		return nf_conntrack_tcp_packet(ct, skb, dataoff,
					       ctinfo, state);
	case IPPROTO_UDP:
		return nf_conntrack_udp_packet(ct, skb, dataoff,
					       ctinfo, state);
	case IPPROTO_ICMP:
		return nf_conntrack_icmp_packet(ct, skb, ctinfo, state);
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
	case IPPROTO_UDPLITE:
		return nf_conntrack_udplite_packet(ct, skb, dataoff,
						   ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
	case IPPROTO_SCTP:
		return nf_conntrack_sctp_packet(ct, skb, dataoff,
						ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
	case IPPROTO_DCCP:
		return nf_conntrack_dccp_packet(ct, skb, dataoff,
						ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE:
		return nf_conntrack_gre_packet(ct, skb, dataoff,
					       ctinfo, state);
#endif
	}

	return generic_packet(ct, skb, ctinfo);
}

unsigned int
nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct, *tmpl;
	u_int8_t protonum;
	int dataoff, ret;

	tmpl = nf_ct_get(skb, &ctinfo);
	if (tmpl || ctinfo == IP_CT_UNTRACKED) {
		/* Previously seen (loopback or untracked)?  Ignore. */
		if ((tmpl && !nf_ct_is_template(tmpl)) ||
		     ctinfo == IP_CT_UNTRACKED)
			return NF_ACCEPT;
		skb->_nfct = 0;
	}

	/* rcu_read_lock()ed by nf_hook_thresh */
	dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum);
	if (dataoff <= 0) {
		pr_debug("not prepared to track yet or error occurred\n");
		NF_CT_STAT_INC_ATOMIC(state->net, invalid);
		ret = NF_ACCEPT;
		goto out;
	}

	if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) {
		ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff,
					       protonum, state);
		if (ret <= 0) {
			ret = -ret;
			goto out;
		}
		/* ICMP[v6] protocol trackers may assign one conntrack. */
		if (skb->_nfct)
			goto out;
	}
repeat:
	ret = resolve_normal_ct(tmpl, skb, dataoff,
				protonum, state);
	if (ret < 0) {
		/* Too stressed to deal. */
		NF_CT_STAT_INC_ATOMIC(state->net, drop);
		ret = NF_DROP;
		goto out;
	}

	ct = nf_ct_get(skb, &ctinfo);
	if (!ct) {
		/* Not valid part of a connection */
		NF_CT_STAT_INC_ATOMIC(state->net, invalid);
		ret = NF_ACCEPT;
		goto out;
	}

	ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state);
	if (ret <= 0) {
		/* Invalid: inverse of the return code tells
		 * the netfilter core what to do
		 */
		pr_debug("nf_conntrack_in: Can't track with proto module\n");
		nf_conntrack_put(&ct->ct_general);
		skb->_nfct = 0;
		NF_CT_STAT_INC_ATOMIC(state->net, invalid);
		if (ret == -NF_DROP)
			NF_CT_STAT_INC_ATOMIC(state->net, drop);
		/* Special case: TCP tracker reports an attempt to reopen a
		 * closed/aborted connection. We have to go back and create a
		 * fresh conntrack.
		 */
		if (ret == -NF_REPEAT)
			goto repeat;
		ret = -ret;
		goto out;
	}

	if (ctinfo == IP_CT_ESTABLISHED_REPLY &&
	    !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
		nf_conntrack_event_cache(IPCT_REPLY, ct);
out:
	if (tmpl)
		nf_ct_put(tmpl);

	return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_in);

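/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
 * implicitly racy: see __nf_conntrack_confirm.
 */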
void nf_conntrack_alter_reply(struct nf_conn *ct,
			      const struct nf_conntrack_tuple *newreply)
{
	struct nf_conn_help *help = nfct_help(ct);

	/* Should be unconfirmed, so not in hash table yet */
	WARN_ON(nf_ct_is_confirmed(ct));

	pr_debug("Altering reply tuple of %p to ", ct);
	nf_ct_dump_tuple(newreply);

	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
	if (ct->master || (help && !hlist_empty(&help->expectations)))
		return;

	rcu_read_lock();
	__nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);

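/* Refresh the conntrack timeout for this many jiffies and do accounting
 * if do_acct is true.
 */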
void __nf_ct_refresh_acct(struct nf_conn *ct,
			  enum ip_conntrack_info ctinfo,
			  const struct sk_buff *skb,
			  u32 extra_jiffies,
			  bool do_acct)
{
	/* Only update if this is not a fixed timeout */
	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
		goto acct;

	/* If not in hash table, timer will not be active yet */
	if (nf_ct_is_confirmed(ct))
		extra_jiffies += nfct_time_stamp;

	if (READ_ONCE(ct->timeout) != extra_jiffies)
		WRITE_ONCE(ct->timeout, extra_jiffies);
acct:
	if (do_acct)
		nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);

bool nf_ct_kill_acct(struct nf_conn *ct,
		     enum ip_conntrack_info ctinfo,
		     const struct sk_buff *skb)
{
	nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);

	return nf_ct_delete(ct, 0, 0);
}
EXPORT_SYMBOL_GPL(nf_ct_kill_acct);

#if IS_ENABLED(CONFIG_NF_CT_NETLINK)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
#include <linux/mutex.h>

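/* Generic function for tcp/udp/sctp/dccp and alike. */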
int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
			       const struct nf_conntrack_tuple *tuple)
{
	if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) ||
	    nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port))
		goto nla_put_failure;
	return 0;

nla_put_failure:
	return -1;
}
EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);

const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
	[CTA_PROTO_SRC_PORT]  = { .type = NLA_U16 },
	[CTA_PROTO_DST_PORT]  = { .type = NLA_U16 },
};
EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);

int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
			       struct nf_conntrack_tuple *t,
			       u_int32_t flags)
{
	if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) {
		if (!tb[CTA_PROTO_SRC_PORT])
			return -EINVAL;

		t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
	}

	if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) {
		if (!tb[CTA_PROTO_DST_PORT])
			return -EINVAL;

		t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);

unsigned int nf_ct_port_nlattr_tuple_size(void)
{
	static unsigned int size __read_mostly;

	if (!size)
		size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);

	return size;
}
EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
#endif

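/* Used by ipt_REJECT and ip6t_REJECT. */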
static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;

	/* This ICMP is in reverse direction to the packet which caused it */
	ct = nf_ct_get(skb, &ctinfo);
	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
		ctinfo = IP_CT_RELATED_REPLY;
	else
		ctinfo = IP_CT_RELATED;

	/* Attach to new skbuff, and increment count */
	nf_ct_set(nskb, ct, ctinfo);
	nf_conntrack_get(skb_nfct(nskb));
}

static int __nf_conntrack_update(struct net *net, struct sk_buff *skb,
				 struct nf_conn *ct,
				 enum ip_conntrack_info ctinfo)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conntrack_tuple tuple;
	struct nf_nat_hook *nat_hook;
	unsigned int status;
	int dataoff;
	u16 l3num;
	u8 l4num;

	l3num = nf_ct_l3num(ct);

	dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num);
	if (dataoff <= 0)
		return -1;

	if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
			     l4num, net, &tuple))
		return -1;

	if (ct->status & IPS_SRC_NAT) {
		memcpy(tuple.src.u3.all,
		       ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all,
		       sizeof(tuple.src.u3.all));
		tuple.src.u.all =
			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all;
	}

	if (ct->status & IPS_DST_NAT) {
		memcpy(tuple.dst.u3.all,
		       ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all,
		       sizeof(tuple.dst.u3.all));
		tuple.dst.u.all =
			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all;
	}

	h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple);
	if (!h)
		return 0;

	/* Store status bits of the conntrack that is clashing to re-do NAT
	 * mangling according to what it has been done already to this packet.
	 */
	status = ct->status;

	nf_ct_put(ct);
	ct = nf_ct_tuplehash_to_ctrack(h);
	nf_ct_set(skb, ct, ctinfo);

	nat_hook = rcu_dereference(nf_nat_hook);
	if (!nat_hook)
		return 0;

	if (status & IPS_SRC_NAT &&
	    nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC,
				IP_CT_DIR_ORIGINAL) == NF_DROP)
		return -1;

	if (status & IPS_DST_NAT &&
	    nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST,
				IP_CT_DIR_ORIGINAL) == NF_DROP)
		return -1;

	return 0;
}

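/* This packet is coming from userspace via nf_queue, complete the packet
 * processing after the helper invocation in nf_queue.
 */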
static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct,
			       enum ip_conntrack_info ctinfo)
{
	const struct nf_conntrack_helper *helper;
	const struct nf_conn_help *help;
	int protoff;

	help = nfct_help(ct);
	if (!help)
		return 0;

	helper = rcu_dereference(help->helper);
	if (!helper)
		return 0;

	if (!(helper->flags & NF_CT_HELPER_F_USERSPACE))
		return 0;

	switch (nf_ct_l3num(ct)) {
	case NFPROTO_IPV4:
		protoff = skb_network_offset(skb) + ip_hdrlen(skb);
		break;
#if IS_ENABLED(CONFIG_IPV6)
	case NFPROTO_IPV6: {
		__be16 frag_off;
		u8 pnum;

		pnum = ipv6_hdr(skb)->nexthdr;
		protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
					   &frag_off);
		if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
			return 0;
		break;
	}
#endif
	default:
		return 0;
	}

	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
	    !nf_is_loopback_packet(skb)) {
		if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
			NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
			return -1;
		}
	}

	/* We've seen it coming out the other side: confirm it */
	return nf_conntrack_confirm(skb) == NF_DROP ? -1 : 0;
}

static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;
	int err;

	ct = nf_ct_get(skb, &ctinfo);
	if (!ct)
		return 0;

	if (!nf_ct_is_confirmed(ct)) {
		err = __nf_conntrack_update(net, skb, ct, ctinfo);
		if (err < 0)
			return err;

		ct = nf_ct_get(skb, &ctinfo);
	}

	return nf_confirm_cthelper(skb, ct, ctinfo);
}

static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
				       const struct sk_buff *skb)
{
	const struct nf_conntrack_tuple *src_tuple;
	const struct nf_conntrack_tuple_hash *hash;
	struct nf_conntrack_tuple srctuple;
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;

	ct = nf_ct_get(skb, &ctinfo);
	if (ct) {
		src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
		memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
		return true;
	}

	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
			       NFPROTO_IPV4, dev_net(skb->dev),
			       &srctuple))
		return false;

	hash = nf_conntrack_find_get(dev_net(skb->dev),
				     &nf_ct_zone_dflt,
				     &srctuple);
	if (!hash)
		return false;

	ct = nf_ct_tuplehash_to_ctrack(hash);
	src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
	memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
	nf_ct_put(ct);

	return true;
}

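/* Bring out ya dead! */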
2259static struct nf_conn *
2260get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
2261 void *data, unsigned int *bucket)
2262{
2263 struct nf_conntrack_tuple_hash *h;
2264 struct nf_conn *ct;
2265 struct hlist_nulls_node *n;
2266 spinlock_t *lockp;
2267
2268 for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
2269 struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket];
2270
2271 if (hlist_nulls_empty(hslot))
2272 continue;
2273
2274 lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
2275 local_bh_disable();
2276 nf_conntrack_lock(lockp);
2277 hlist_nulls_for_each_entry(h, n, hslot, hnnode) {
2278 if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
2279 continue;
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291 ct = nf_ct_tuplehash_to_ctrack(h);
2292 if (iter(ct, data))
2293 goto found;
2294 }
2295 spin_unlock(lockp);
2296 local_bh_enable();
2297 cond_resched();
2298 }
2299
2300 return NULL;
2301found:
2302 atomic_inc(&ct->ct_general.use);
2303 spin_unlock(lockp);
2304 local_bh_enable();
2305 return ct;
2306}
2307
2308static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
2309 void *data, u32 portid, int report)
2310{
2311 unsigned int bucket = 0;
2312 struct nf_conn *ct;
2313
2314 might_sleep();
2315
2316 mutex_lock(&nf_conntrack_mutex);
2317 while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
2318
2319
2320 nf_ct_delete(ct, portid, report);
2321 nf_ct_put(ct);
2322 cond_resched();
2323 }
2324 mutex_unlock(&nf_conntrack_mutex);
2325}
2326
struct iter_data {
	int (*iter)(struct nf_conn *i, void *data);
	void *data;
	struct net *net;
};

static int iter_net_only(struct nf_conn *i, void *data)
{
	struct iter_data *d = data;

	if (!net_eq(d->net, nf_ct_net(i)))
		return 0;

	return d->iter(i, d->data);
}

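/* Mark every unconfirmed conntrack in @net as dying so it can never be
 * confirmed later.  The per-cpu unconfirmed lists are walked under their
 * respective locks.
 */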
static void
__nf_ct_unconfirmed_destroy(struct net *net)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct nf_conntrack_tuple_hash *h;
		struct hlist_nulls_node *n;
		struct ct_pcpu *pcpu;

		pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);

		spin_lock_bh(&pcpu->lock);
		hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
			struct nf_conn *ct;

			ct = nf_ct_tuplehash_to_ctrack(h);

			/* we cannot call iter() on the unconfirmed list,
			 * the owning cpu can reallocate ct->ext at any time.
			 */
			set_bit(IPS_DYING_BIT, &ct->status);
		}
		spin_unlock_bh(&pcpu->lock);
		cond_resched();
	}
}

void nf_ct_unconfirmed_destroy(struct net *net)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);

	might_sleep();

	if (atomic_read(&cnet->count) > 0) {
		__nf_ct_unconfirmed_destroy(net);
		nf_queue_nf_hook_drop(net);
		synchronize_net();
	}
}
EXPORT_SYMBOL_GPL(nf_ct_unconfirmed_destroy);

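/**
 * nf_ct_iterate_cleanup_net - iterate over the conntrack table in a netns
 * @net: netns to iterate
 * @iter: callback to invoke for each conntrack object
 * @data: data to pass to @iter
 * @portid: portid for event notifications (if any)
 * @report: report events via ctnetlink
 *
 * Entries for which @iter returns a non-zero value are deleted.
 */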
void nf_ct_iterate_cleanup_net(struct net *net,
			       int (*iter)(struct nf_conn *i, void *data),
			       void *data, u32 portid, int report)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
	struct iter_data d;

	might_sleep();

	if (atomic_read(&cnet->count) == 0)
		return;

	d.iter = iter;
	d.data = data;
	d.net = net;

	nf_ct_iterate_cleanup(iter_net_only, &d, portid, report);
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);

/**
 * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table
 * @iter: callback to invoke for each conntrack object
 * @data: data to pass to @iter
 *
 * Like nf_ct_iterate_cleanup, but first marks conntracks on the
 * unconfirmed lists as dying (so they will not be confirmed later).
 *
 * Can only be called in the module exit path.
 */
void
nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
{
	struct net *net;

	down_read(&net_rwsem);
	for_each_net(net) {
		struct nf_conntrack_net *cnet = nf_ct_pernet(net);

		if (atomic_read(&cnet->count) == 0)
			continue;
		__nf_ct_unconfirmed_destroy(net);
		nf_queue_nf_hook_drop(net);
	}
	up_read(&net_rwsem);

	/* Need to wait for the netns cleanup worker to finish, if it is
	 * running -- it might have deleted a net namespace from the global
	 * list, so our __nf_ct_unconfirmed_destroy() might not have
	 * affected all namespaces.
	 */
	net_ns_barrier();

	/* a conntrack could have been unlinked from the unconfirmed list
	 * before we grabbed the pcpu lock in __nf_ct_unconfirmed_destroy().
	 * This makes sure it is inserted into the conntrack table.
	 */
	synchronize_net();

	nf_ct_iterate_cleanup(iter, data, 0, 0);
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy);

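/* Callback for nf_conntrack_cleanup_net_list(): matches every conntrack
 * belonging to the netns passed in @data.
 */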
static int kill_all(struct nf_conn *i, void *data)
{
	return net_eq(nf_ct_net(i), data);
}

void nf_conntrack_cleanup_start(void)
{
	/* Tell the gc worker to stop rescheduling itself, and unhook the
	 * skb->nfct attach helper.
	 */
	conntrack_gc_work.exiting = true;
	RCU_INIT_POINTER(ip_ct_attach, NULL);
}

void nf_conntrack_cleanup_end(void)
{
	RCU_INIT_POINTER(nf_ct_hook, NULL);
	cancel_delayed_work_sync(&conntrack_gc_work.dwork);
	kvfree(nf_conntrack_hash);

	nf_conntrack_proto_fini();
	nf_conntrack_seqadj_fini();
	nf_conntrack_labels_fini();
	nf_conntrack_helper_fini();
	nf_conntrack_timeout_fini();
	nf_conntrack_ecache_fini();
	nf_conntrack_tstamp_fini();
	nf_conntrack_acct_fini();
	nf_conntrack_expect_fini();

	kmem_cache_destroy(nf_conntrack_cachep);
}

/*
 * Mishearing the voices in his head, our hero wonders how he's
 * supposed to kill the mall.
 */
void nf_conntrack_cleanup_net(struct net *net)
{
	LIST_HEAD(single);

	list_add(&net->exit_list, &single);
	nf_conntrack_cleanup_net_list(&single);
}

void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
{
	int busy;
	struct net *net;

	/*
	 * This makes sure all current packets have passed through
	 * the netfilter framework.  Roll on, two-stage module
	 * delete...
	 */
	synchronize_net();
i_see_dead_people:
	busy = 0;
	list_for_each_entry(net, net_exit_list, exit_list) {
		struct nf_conntrack_net *cnet = nf_ct_pernet(net);

		nf_ct_iterate_cleanup(kill_all, net, 0, 0);
		if (atomic_read(&cnet->count) != 0)
			busy = 1;
	}
	if (busy) {
		schedule();
		goto i_see_dead_people;
	}

	list_for_each_entry(net, net_exit_list, exit_list) {
		nf_conntrack_ecache_pernet_fini(net);
		nf_conntrack_expect_pernet_fini(net);
		free_percpu(net->ct.stat);
		free_percpu(net->ct.pcpu_lists);
	}
}

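/* Allocate a hash table with at least *@sizep buckets, rounded up so the
 * buckets fill whole pages; *@sizep is updated with the actual size.  When
 * @nulls is set, each chain is terminated with its bucket index as the
 * nulls marker, so lockless lookups can detect having been moved to
 * another chain.
 */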
void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
{
	struct hlist_nulls_head *hash;
	unsigned int nr_slots, i;

	if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
		return NULL;

	BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
	nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));

	hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL);

	if (hash && nulls)
		for (i = 0; i < nr_slots; i++)
			INIT_HLIST_NULLS_HEAD(&hash[i], i);

	return hash;
}
EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);

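/* Replace the global hash table with one of @hashsize buckets: allocate
 * the new table, take all bucket locks plus the generation seqcount to
 * freeze lookups, rehash every entry into the new table, then publish it
 * and free the old one after an RCU grace period.
 */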
int nf_conntrack_hash_resize(unsigned int hashsize)
{
	int i, bucket;
	unsigned int old_size;
	struct hlist_nulls_head *hash, *old_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	if (!hashsize)
		return -EINVAL;

	hash = nf_ct_alloc_hashtable(&hashsize, 1);
	if (!hash)
		return -ENOMEM;

	mutex_lock(&nf_conntrack_mutex);
	old_size = nf_conntrack_htable_size;
	if (old_size == hashsize) {
		mutex_unlock(&nf_conntrack_mutex);
		kvfree(hash);
		return 0;
	}

	local_bh_disable();
	nf_conntrack_all_lock();
	write_seqcount_begin(&nf_conntrack_generation);

	/* Lookups in the old hash might happen in parallel, which means we
	 * might get false negatives during connection lookup. New connections
	 * created because of a false negative won't make it into the hash
	 * though since that required taking the locks.
	 */

	for (i = 0; i < nf_conntrack_htable_size; i++) {
		while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
			unsigned int zone_id;

			h = hlist_nulls_entry(nf_conntrack_hash[i].first,
					      struct nf_conntrack_tuple_hash, hnnode);
			ct = nf_ct_tuplehash_to_ctrack(h);
			hlist_nulls_del_rcu(&h->hnnode);

			zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h));
			bucket = __hash_conntrack(nf_ct_net(ct),
						  &h->tuple, zone_id, hashsize);
			hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
		}
	}
	old_size = nf_conntrack_htable_size;
	old_hash = nf_conntrack_hash;

	nf_conntrack_hash = hash;
	nf_conntrack_htable_size = hashsize;

	write_seqcount_end(&nf_conntrack_generation);
	nf_conntrack_all_unlock();
	local_bh_enable();

	mutex_unlock(&nf_conntrack_mutex);

	synchronize_net();
	kvfree(old_hash);
	return 0;
}

int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
{
	unsigned int hashsize;
	int rc;

	if (current->nsproxy->net_ns != &init_net)
		return -EOPNOTSUPP;

	/* On boot, we can set this without any fancy locking. */
	if (!nf_conntrack_hash)
		return param_set_uint(val, kp);

	rc = kstrtouint(val, 0, &hashsize);
	if (rc)
		return rc;

	return nf_conntrack_hash_resize(hashsize);
}

static __always_inline unsigned int total_extension_size(void)
{
	/* remember to add new extensions below */
	BUILD_BUG_ON(NF_CT_EXT_NUM > 9);

	return sizeof(struct nf_ct_ext) +
	       sizeof(struct nf_conn_help)
#if IS_ENABLED(CONFIG_NF_NAT)
		+ sizeof(struct nf_conn_nat)
#endif
		+ sizeof(struct nf_conn_seqadj)
		+ sizeof(struct nf_conn_acct)
#ifdef CONFIG_NF_CONNTRACK_EVENTS
		+ sizeof(struct nf_conntrack_ecache)
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
		+ sizeof(struct nf_conn_tstamp)
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
		+ sizeof(struct nf_conn_timeout)
#endif
#ifdef CONFIG_NF_CONNTRACK_LABELS
		+ sizeof(struct nf_conn_labels)
#endif
#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
		+ sizeof(struct nf_conn_synproxy)
#endif
	;
}

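/* One-time initialization of the global conntrack state: locks, hash
 * table sizing, the nf_conn slab cache, all built-in extensions and the
 * garbage-collection worker.
 */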
int nf_conntrack_init_start(void)
{
	unsigned long nr_pages = totalram_pages();
	int max_factor = 8;
	int ret = -ENOMEM;
	int i;

	/* struct nf_ct_ext uses u8 to store offsets/size */
	BUILD_BUG_ON(total_extension_size() > 255u);

	seqcount_spinlock_init(&nf_conntrack_generation,
			       &nf_conntrack_locks_all_lock);

	for (i = 0; i < CONNTRACK_LOCKS; i++)
		spin_lock_init(&nf_conntrack_locks[i]);

	if (!nf_conntrack_htable_size) {
		/* Use roughly 1/16384 of system memory for the hash table,
		 * capped at 65536 buckets above 1 GiB of RAM (262144 on
		 * 64-bit machines above 4 GiB), with a floor of 1024.
		 */
		nf_conntrack_htable_size
			= (((nr_pages << PAGE_SHIFT) / 16384)
			   / sizeof(struct hlist_head));
		if (BITS_PER_LONG >= 64 &&
		    nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
			nf_conntrack_htable_size = 262144;
		else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
			nf_conntrack_htable_size = 65536;

		if (nf_conntrack_htable_size < 1024)
			nf_conntrack_htable_size = 1024;

		/* Use a max. factor of one by default: nf_conntrack_max then
		 * defaults to the number of hash buckets.
		 */
		max_factor = 1;
	}

	nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
	if (!nf_conntrack_hash)
		return -ENOMEM;

	nf_conntrack_max = max_factor * nf_conntrack_htable_size;

	nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
						sizeof(struct nf_conn),
						NFCT_INFOMASK + 1,
						SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
	if (!nf_conntrack_cachep)
		goto err_cachep;

	ret = nf_conntrack_expect_init();
	if (ret < 0)
		goto err_expect;

	ret = nf_conntrack_acct_init();
	if (ret < 0)
		goto err_acct;

	ret = nf_conntrack_tstamp_init();
	if (ret < 0)
		goto err_tstamp;

	ret = nf_conntrack_ecache_init();
	if (ret < 0)
		goto err_ecache;

	ret = nf_conntrack_timeout_init();
	if (ret < 0)
		goto err_timeout;

	ret = nf_conntrack_helper_init();
	if (ret < 0)
		goto err_helper;

	ret = nf_conntrack_labels_init();
	if (ret < 0)
		goto err_labels;

	ret = nf_conntrack_seqadj_init();
	if (ret < 0)
		goto err_seqadj;

	ret = nf_conntrack_proto_init();
	if (ret < 0)
		goto err_proto;

	conntrack_gc_work_init(&conntrack_gc_work);
	queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);

	return 0;

err_proto:
	nf_conntrack_seqadj_fini();
err_seqadj:
	nf_conntrack_labels_fini();
err_labels:
	nf_conntrack_helper_fini();
err_helper:
	nf_conntrack_timeout_fini();
err_timeout:
	nf_conntrack_ecache_fini();
err_ecache:
	nf_conntrack_tstamp_fini();
err_tstamp:
	nf_conntrack_acct_fini();
err_acct:
	nf_conntrack_expect_fini();
err_expect:
	kmem_cache_destroy(nf_conntrack_cachep);
err_cachep:
	kvfree(nf_conntrack_hash);
	return ret;
}

static struct nf_ct_hook nf_conntrack_hook = {
	.update		= nf_conntrack_update,
	.destroy	= destroy_conntrack,
	.get_tuple_skb	= nf_conntrack_get_tuple_skb,
};

void nf_conntrack_init_end(void)
{
	/* For use by REJECT target */
	RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
	RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook);
}

/*
 * We need to use special "null" values, not used in the hash table
 */
#define UNCONFIRMED_NULLS_VAL	((1<<30)+0)
#define DYING_NULLS_VAL		((1<<30)+1)

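/* Per-netns setup: the per-cpu unconfirmed/dying lists, statistics and
 * the per-netns state of each extension.
 */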
int nf_conntrack_init_net(struct net *net)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
	int ret = -ENOMEM;
	int cpu;

	BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER);
	BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS);
	atomic_set(&cnet->count, 0);

	net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
	if (!net->ct.pcpu_lists)
		goto err_stat;

	for_each_possible_cpu(cpu) {
		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);

		spin_lock_init(&pcpu->lock);
		INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
		INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
	}

	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
	if (!net->ct.stat)
		goto err_pcpu_lists;

	ret = nf_conntrack_expect_pernet_init(net);
	if (ret < 0)
		goto err_expect;

	nf_conntrack_acct_pernet_init(net);
	nf_conntrack_tstamp_pernet_init(net);
	nf_conntrack_ecache_pernet_init(net);
	nf_conntrack_helper_pernet_init(net);
	nf_conntrack_proto_pernet_init(net);

	return 0;

err_expect:
	free_percpu(net->ct.stat);
err_pcpu_lists:
	free_percpu(net->ct.pcpu_lists);
err_stat:
	return ret;
}
