// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

/* Per-cpu kernel socket used by tcp_v4_send_reset() and tcp_v4_send_ack()
 * to transmit replies outside the context of a full socket.
 */
static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}
108
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	if (reuse == 2) {
		/* tcp_tw_reuse == 2 means reuse TIME-WAIT sockets only
		 * for connections over loopback.
		 */
		bool loopback = false;

		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint of data integrity to
	 * reuse the port pair once the peer's timestamp clock has visibly
	 * ticked (at least one second since tw_ts_recent_stamp).
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* Pick a new initial write_seq well past the old
		 * incarnation's snd_nxt and carry over the peer's timestamp
		 * state so PAWS keeps working across the reuse.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
183
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent the BPF program called below from accessing bytes that
	 * are outside the bound specified by the user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}
198
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* The source port is still unknown at this point, so move to
	 * TCP_SYN_SENT and hash the socket before the first SYN is built:
	 * inet_hash_connect() picks the ephemeral port and inserts the
	 * socket into the established hash.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}

	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = prandom_u32();

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
332
/*
 * This routine reacts to ICMP_FRAG_NEEDED MTU indications as defined
 * in RFC 1191. It can also be called through tcp_release_cb() if the
 * socket was owned by the user when tcp_v4_err() handled the ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to go wrong. Remember the soft error
	 * in case this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	}
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
373
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
407
/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);
447
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */
464int tcp_v4_err(struct sk_buff *skb, u32 info)
465{
466 const struct iphdr *iph = (const struct iphdr *)skb->data;
467 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
468 struct tcp_sock *tp;
469 struct inet_sock *inet;
470 const int type = icmp_hdr(skb)->type;
471 const int code = icmp_hdr(skb)->code;
472 struct sock *sk;
473 struct request_sock *fastopen;
474 u32 seq, snd_una;
475 int err;
476 struct net *net = dev_net(skb->dev);
477
478 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
479 th->dest, iph->saddr, ntohs(th->source),
480 inet_iif(skb), 0);
481 if (!sk) {
482 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
483 return -ENOENT;
484 }
485 if (sk->sk_state == TCP_TIME_WAIT) {
486 inet_twsk_put(inet_twsk(sk));
487 return 0;
488 }
489 seq = ntohl(th->seq);
490 if (sk->sk_state == TCP_NEW_SYN_RECV) {
491 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
492 type == ICMP_TIME_EXCEEDED ||
493 (type == ICMP_DEST_UNREACH &&
494 (code == ICMP_NET_UNREACH ||
495 code == ICMP_HOST_UNREACH)));
496 return 0;
497 }
498
499 bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
505 if (sock_owned_by_user(sk)) {
506 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
507 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
508 }
509 if (sk->sk_state == TCP_CLOSE)
510 goto out;
511
512 if (static_branch_unlikely(&ip4_min_ttl)) {
513
514 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
515 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
516 goto out;
517 }
518 }
519
520 tp = tcp_sk(sk);
521
522 fastopen = rcu_dereference(tp->fastopen_rsk);
523 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
524 if (sk->sk_state != TCP_LISTEN &&
525 !between(seq, snd_una, tp->snd_nxt)) {
526 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
527 goto out;
528 }
529
530 switch (type) {
531 case ICMP_REDIRECT:
532 if (!sock_owned_by_user(sk))
533 do_redirect(skb, sk);
534 goto out;
535 case ICMP_SOURCE_QUENCH:
536
537 goto out;
538 case ICMP_PARAMETERPROB:
539 err = EPROTO;
540 break;
541 case ICMP_DEST_UNREACH:
542 if (code > NR_ICMP_UNREACH)
543 goto out;
544
545 if (code == ICMP_FRAG_NEEDED) {
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576 bytes, so
			 * they should go through unfragmented).
			 */
550 if (sk->sk_state == TCP_LISTEN)
551 goto out;
552
553 WRITE_ONCE(tp->mtu_info, info);
554 if (!sock_owned_by_user(sk)) {
555 tcp_v4_mtu_reduced(sk);
556 } else {
557 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
558 sock_hold(sk);
559 }
560 goto out;
561 }
562
563 err = icmp_err_convert[code].errno;
564
565
566
567 if (!fastopen &&
568 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
569 tcp_ld_RTO_revert(sk, seq);
570 break;
571 case ICMP_TIME_EXCEEDED:
572 err = EHOSTUNREACH;
573 break;
574 default:
575 goto out;
576 }
577
578 switch (sk->sk_state) {
579 case TCP_SYN_SENT:
580 case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
584 if (fastopen && !fastopen->sk)
585 break;
586
587 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
588
589 if (!sock_owned_by_user(sk)) {
590 sk->sk_err = err;
591
592 sk_error_report(sk);
593
594 tcp_done(sk);
595 } else {
596 sk->sk_err_soft = err;
597 }
598 goto out;
599 }
	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * RFC 1122 4.2.3.9 allows treating only PROTO_UNREACH and
	 * PORT_UNREACH as hard errors; everything else becomes a soft
	 * error unless the application enabled IP_RECVERR.
	 */
617 inet = inet_sk(sk);
618 if (!sock_owned_by_user(sk) && inet->recverr) {
619 sk->sk_err = err;
620 sk_error_report(sk);
621 } else {
622 sk->sk_err_soft = err;
623 }
624
625out:
626 bh_unlock_sock(sk);
627 sock_put(sk);
628 return 0;
629}
630
631void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
632{
633 struct tcphdr *th = tcp_hdr(skb);
634
635 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
636 skb->csum_start = skb_transport_header(skb) - skb->head;
637 skb->csum_offset = offsetof(struct tcphdr, check);
638}
639
640
641void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
642{
643 const struct inet_sock *inet = inet_sk(sk);
644
645 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
646}
647EXPORT_SYMBOL(tcp_v4_send_check);
648
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Socket parameters (TOS, TTL, etc.) are never used for the reset:
 *	if a packet caused a RST, it was not for a socket existing in our
 *	system; if it did match a socket, it is just a duplicate segment
 *	or a bug on the other side.
 */
662#ifdef CONFIG_TCP_MD5SIG
663#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
664#else
665#define OPTION_BYTES sizeof(__be32)
666#endif
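
/* Note: the reply built by tcp_v4_send_reset() reserves OPTION_BYTES of
 * option space: room for an aligned MD5 signature option when
 * CONFIG_TCP_MD5SIG is enabled, otherwise a single 32-bit word that can
 * carry an MPTCP reset reason option (see the mptcp_reset_option() use below).
 */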
667
668static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
669{
670 const struct tcphdr *th = tcp_hdr(skb);
671 struct {
672 struct tcphdr th;
673 __be32 opt[OPTION_BYTES / sizeof(__be32)];
674 } rep;
675 struct ip_reply_arg arg;
676#ifdef CONFIG_TCP_MD5SIG
677 struct tcp_md5sig_key *key = NULL;
678 const __u8 *hash_location = NULL;
679 unsigned char newhash[16];
680 int genhash;
681 struct sock *sk1 = NULL;
682#endif
683 u64 transmit_time = 0;
684 struct sock *ctl_sk;
685 struct net *net;
686
687
688 if (th->rst)
689 return;
690
691
692
693
694 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
695 return;
696
697
698 memset(&rep, 0, sizeof(rep));
699 rep.th.dest = th->source;
700 rep.th.source = th->dest;
701 rep.th.doff = sizeof(struct tcphdr) / 4;
702 rep.th.rst = 1;
703
704 if (th->ack) {
705 rep.th.seq = th->ack_seq;
706 } else {
707 rep.th.ack = 1;
708 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
709 skb->len - (th->doff << 2));
710 }
711
712 memset(&arg, 0, sizeof(arg));
713 arg.iov[0].iov_base = (unsigned char *)&rep;
714 arg.iov[0].iov_len = sizeof(rep.th);
715
716 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
717#ifdef CONFIG_TCP_MD5SIG
718 rcu_read_lock();
719 hash_location = tcp_parse_md5sig_option(th);
720 if (sk && sk_fullsock(sk)) {
721 const union tcp_md5_addr *addr;
722 int l3index;
723
		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
727 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
728 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
729 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
730 } else if (hash_location) {
731 const union tcp_md5_addr *addr;
732 int sdif = tcp_v4_sdif(skb);
733 int dif = inet_iif(skb);
734 int l3index;
735
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * We do not lose any security here: the incoming packet is
		 * checked with the md5 hash of the found key, and no RST is
		 * generated if the hash does not match.
		 */
743 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
744 ip_hdr(skb)->saddr,
745 th->source, ip_hdr(skb)->daddr,
746 ntohs(th->source), dif, sdif);
747
748 if (!sk1)
749 goto out;
750
751
752
753
754 l3index = sdif ? dif : 0;
755 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
756 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
757 if (!key)
758 goto out;
759
760
761 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
762 if (genhash || memcmp(hash_location, newhash, 16) != 0)
763 goto out;
764
765 }
766
767 if (key) {
768 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
769 (TCPOPT_NOP << 16) |
770 (TCPOPT_MD5SIG << 8) |
771 TCPOLEN_MD5SIG);
772
773 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
774 rep.th.doff = arg.iov[0].iov_len / 4;
775
776 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
777 key, ip_hdr(skb)->saddr,
778 ip_hdr(skb)->daddr, &rep.th);
779 }
780#endif
781
782 if (rep.opt[0] == 0) {
783 __be32 mrst = mptcp_reset_option(skb);
784
785 if (mrst) {
786 rep.opt[0] = mrst;
787 arg.iov[0].iov_len += sizeof(mrst);
788 rep.th.doff = arg.iov[0].iov_len / 4;
789 }
790 }
791
792 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
793 ip_hdr(skb)->saddr,
794 arg.iov[0].iov_len, IPPROTO_TCP, 0);
795 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
796 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
797
	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to
	 * force input interface, we will misroute in case of asymmetric route.
	 */
802 if (sk) {
803 arg.bound_dev_if = sk->sk_bound_dev_if;
804 if (sk_fullsock(sk))
805 trace_tcp_send_reset(sk, skb);
806 }
807
808 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
809 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
810
811 arg.tos = ip_hdr(skb)->tos;
812 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
813 local_bh_disable();
814 ctl_sk = this_cpu_read(ipv4_tcp_sk);
815 sock_net_set(ctl_sk, net);
816 if (sk) {
817 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
818 inet_twsk(sk)->tw_mark : sk->sk_mark;
819 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
820 inet_twsk(sk)->tw_priority : sk->sk_priority;
821 transmit_time = tcp_transmit_time(sk);
822 }
823 ip_send_unicast_reply(ctl_sk,
824 skb, &TCP_SKB_CB(skb)->header.h4.opt,
825 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
826 &arg, arg.iov[0].iov_len,
827 transmit_time);
828
829 ctl_sk->sk_mark = 0;
830 sock_net_set(ctl_sk, &init_net);
831 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
832 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
833 local_bh_enable();
834
835#ifdef CONFIG_TCP_MD5SIG
836out:
837 rcu_read_unlock();
838#endif
839}
840
/* The code following below sends ACKs in SYN-RECV and TIME-WAIT states
 * outside socket context.
 */
845static void tcp_v4_send_ack(const struct sock *sk,
846 struct sk_buff *skb, u32 seq, u32 ack,
847 u32 win, u32 tsval, u32 tsecr, int oif,
848 struct tcp_md5sig_key *key,
849 int reply_flags, u8 tos)
850{
851 const struct tcphdr *th = tcp_hdr(skb);
852 struct {
853 struct tcphdr th;
854 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
855#ifdef CONFIG_TCP_MD5SIG
856 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
857#endif
858 ];
859 } rep;
860 struct net *net = sock_net(sk);
861 struct ip_reply_arg arg;
862 struct sock *ctl_sk;
863 u64 transmit_time;
864
865 memset(&rep.th, 0, sizeof(struct tcphdr));
866 memset(&arg, 0, sizeof(arg));
867
868 arg.iov[0].iov_base = (unsigned char *)&rep;
869 arg.iov[0].iov_len = sizeof(rep.th);
870 if (tsecr) {
871 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
872 (TCPOPT_TIMESTAMP << 8) |
873 TCPOLEN_TIMESTAMP);
874 rep.opt[1] = htonl(tsval);
875 rep.opt[2] = htonl(tsecr);
876 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
877 }
878
879
880 rep.th.dest = th->source;
881 rep.th.source = th->dest;
882 rep.th.doff = arg.iov[0].iov_len / 4;
883 rep.th.seq = htonl(seq);
884 rep.th.ack_seq = htonl(ack);
885 rep.th.ack = 1;
886 rep.th.window = htons(win);
887
888#ifdef CONFIG_TCP_MD5SIG
889 if (key) {
890 int offset = (tsecr) ? 3 : 0;
891
892 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
893 (TCPOPT_NOP << 16) |
894 (TCPOPT_MD5SIG << 8) |
895 TCPOLEN_MD5SIG);
896 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
897 rep.th.doff = arg.iov[0].iov_len/4;
898
899 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
900 key, ip_hdr(skb)->saddr,
901 ip_hdr(skb)->daddr, &rep.th);
902 }
903#endif
904 arg.flags = reply_flags;
905 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
906 ip_hdr(skb)->saddr,
907 arg.iov[0].iov_len, IPPROTO_TCP, 0);
908 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
909 if (oif)
910 arg.bound_dev_if = oif;
911 arg.tos = tos;
912 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
913 local_bh_disable();
914 ctl_sk = this_cpu_read(ipv4_tcp_sk);
915 sock_net_set(ctl_sk, net);
916 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
917 inet_twsk(sk)->tw_mark : sk->sk_mark;
918 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
919 inet_twsk(sk)->tw_priority : sk->sk_priority;
920 transmit_time = tcp_transmit_time(sk);
921 ip_send_unicast_reply(ctl_sk,
922 skb, &TCP_SKB_CB(skb)->header.h4.opt,
923 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
924 &arg, arg.iov[0].iov_len,
925 transmit_time);
926
927 ctl_sk->sk_mark = 0;
928 sock_net_set(ctl_sk, &init_net);
929 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
930 local_bh_enable();
931}
932
933static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
934{
935 struct inet_timewait_sock *tw = inet_twsk(sk);
936 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
937
938 tcp_v4_send_ack(sk, skb,
939 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
940 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
941 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
942 tcptw->tw_ts_recent,
943 tw->tw_bound_dev_if,
944 tcp_twsk_md5_key(tcptw),
945 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
946 tw->tw_tos
947 );
948
949 inet_twsk_put(tw);
950}
951
952static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
953 struct request_sock *req)
954{
955 const union tcp_md5_addr *addr;
956 int l3index;
957
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
961 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
962 tcp_sk(sk)->snd_nxt;
963
	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
969 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
970 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
971 tcp_v4_send_ack(sk, skb, seq,
972 tcp_rsk(req)->rcv_nxt,
973 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
974 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
975 req->ts_recent,
976 0,
977 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
978 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
979 ip_hdr(skb)->tos);
980}
981
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
987static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
988 struct flowi *fl,
989 struct request_sock *req,
990 struct tcp_fastopen_cookie *foc,
991 enum tcp_synack_type synack_type,
992 struct sk_buff *syn_skb)
993{
994 const struct inet_request_sock *ireq = inet_rsk(req);
995 struct flowi4 fl4;
996 int err = -1;
997 struct sk_buff *skb;
998 u8 tos;
999
1000
1001 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1002 return -1;
1003
1004 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1005
1006 if (skb) {
1007 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1008
1009 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1010 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1011 (inet_sk(sk)->tos & INET_ECN_MASK) :
1012 inet_sk(sk)->tos;
1013
1014 if (!INET_ECN_is_capable(tos) &&
1015 tcp_bpf_ca_needs_ecn((struct sock *)req))
1016 tos |= INET_ECN_ECT_0;
1017
1018 rcu_read_lock();
1019 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1020 ireq->ir_rmt_addr,
1021 rcu_dereference(ireq->ireq_opt),
1022 tos);
1023 rcu_read_unlock();
1024 err = net_xmit_eval(err);
1025 }
1026
1027 return err;
1028}
1029
1030
1031
1032
1033static void tcp_v4_reqsk_destructor(struct request_sock *req)
1034{
1035 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1036}
1037
1038#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */
1045DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1046EXPORT_SYMBOL(tcp_md5_needed);
1047
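/* Key selection: a key bound to an L3 domain (nonzero l3index) always wins
 * over an unbound key; among equally bound keys the longer prefix wins.
 */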
1048static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1049{
1050 if (!old)
1051 return true;
1052
1053
1054 if (old->l3index && new->l3index == 0)
1055 return false;
1056 if (old->l3index == 0 && new->l3index)
1057 return true;
1058
1059 return old->prefixlen < new->prefixlen;
1060}
1061
1062
1063struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1064 const union tcp_md5_addr *addr,
1065 int family)
1066{
1067 const struct tcp_sock *tp = tcp_sk(sk);
1068 struct tcp_md5sig_key *key;
1069 const struct tcp_md5sig_info *md5sig;
1070 __be32 mask;
1071 struct tcp_md5sig_key *best_match = NULL;
1072 bool match;
1073
1074
1075 md5sig = rcu_dereference_check(tp->md5sig_info,
1076 lockdep_sock_is_held(sk));
1077 if (!md5sig)
1078 return NULL;
1079
1080 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1081 lockdep_sock_is_held(sk)) {
1082 if (key->family != family)
1083 continue;
1084 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1085 continue;
1086 if (family == AF_INET) {
1087 mask = inet_make_mask(key->prefixlen);
1088 match = (key->addr.a4.s_addr & mask) ==
1089 (addr->a4.s_addr & mask);
1090#if IS_ENABLED(CONFIG_IPV6)
1091 } else if (family == AF_INET6) {
1092 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1093 key->prefixlen);
1094#endif
1095 } else {
1096 match = false;
1097 }
1098
1099 if (match && better_md5_match(best_match, key))
1100 best_match = key;
1101 }
1102 return best_match;
1103}
1104EXPORT_SYMBOL(__tcp_md5_do_lookup);
1105
1106static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1107 const union tcp_md5_addr *addr,
1108 int family, u8 prefixlen,
1109 int l3index, u8 flags)
1110{
1111 const struct tcp_sock *tp = tcp_sk(sk);
1112 struct tcp_md5sig_key *key;
1113 unsigned int size = sizeof(struct in_addr);
1114 const struct tcp_md5sig_info *md5sig;
1115
1116
1117 md5sig = rcu_dereference_check(tp->md5sig_info,
1118 lockdep_sock_is_held(sk));
1119 if (!md5sig)
1120 return NULL;
1121#if IS_ENABLED(CONFIG_IPV6)
1122 if (family == AF_INET6)
1123 size = sizeof(struct in6_addr);
1124#endif
1125 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1126 lockdep_sock_is_held(sk)) {
1127 if (key->family != family)
1128 continue;
1129 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1130 continue;
1131 if (key->l3index != l3index)
1132 continue;
1133 if (!memcmp(&key->addr, addr, size) &&
1134 key->prefixlen == prefixlen)
1135 return key;
1136 }
1137 return NULL;
1138}
1139
1140struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1141 const struct sock *addr_sk)
1142{
1143 const union tcp_md5_addr *addr;
1144 int l3index;
1145
1146 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1147 addr_sk->sk_bound_dev_if);
1148 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1149 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1150}
1151EXPORT_SYMBOL(tcp_v4_md5_lookup);
1152
1153
1154int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1155 int family, u8 prefixlen, int l3index, u8 flags,
1156 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1157{
1158
1159 struct tcp_md5sig_key *key;
1160 struct tcp_sock *tp = tcp_sk(sk);
1161 struct tcp_md5sig_info *md5sig;
1162
1163 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1164 if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care about
		 * key mismatches, since changing the MD5 key on live flows
		 * can lead to packet drops.
		 */
1171 data_race(memcpy(key->key, newkey, newkeylen));
1172
		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch the new key->keylen value
		 * but the old key->key[]; this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
1178 WRITE_ONCE(key->keylen, newkeylen);
1179
1180 return 0;
1181 }
1182
1183 md5sig = rcu_dereference_protected(tp->md5sig_info,
1184 lockdep_sock_is_held(sk));
1185 if (!md5sig) {
1186 md5sig = kmalloc(sizeof(*md5sig), gfp);
1187 if (!md5sig)
1188 return -ENOMEM;
1189
1190 sk_gso_disable(sk);
1191 INIT_HLIST_HEAD(&md5sig->head);
1192 rcu_assign_pointer(tp->md5sig_info, md5sig);
1193 }
1194
1195 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1196 if (!key)
1197 return -ENOMEM;
1198 if (!tcp_alloc_md5sig_pool()) {
1199 sock_kfree_s(sk, key, sizeof(*key));
1200 return -ENOMEM;
1201 }
1202
1203 memcpy(key->key, newkey, newkeylen);
1204 key->keylen = newkeylen;
1205 key->family = family;
1206 key->prefixlen = prefixlen;
1207 key->l3index = l3index;
1208 key->flags = flags;
1209 memcpy(&key->addr, addr,
1210 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1211 sizeof(struct in_addr));
1212 hlist_add_head_rcu(&key->node, &md5sig->head);
1213 return 0;
1214}
1215EXPORT_SYMBOL(tcp_md5_do_add);
1216
1217int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1218 u8 prefixlen, int l3index, u8 flags)
1219{
1220 struct tcp_md5sig_key *key;
1221
1222 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1223 if (!key)
1224 return -ENOENT;
1225 hlist_del_rcu(&key->node);
1226 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1227 kfree_rcu(key, rcu);
1228 return 0;
1229}
1230EXPORT_SYMBOL(tcp_md5_do_del);
1231
1232static void tcp_clear_md5_list(struct sock *sk)
1233{
1234 struct tcp_sock *tp = tcp_sk(sk);
1235 struct tcp_md5sig_key *key;
1236 struct hlist_node *n;
1237 struct tcp_md5sig_info *md5sig;
1238
1239 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1240
1241 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1242 hlist_del_rcu(&key->node);
1243 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1244 kfree_rcu(key, rcu);
1245 }
1246}
1247
1248static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1249 sockptr_t optval, int optlen)
1250{
1251 struct tcp_md5sig cmd;
1252 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1253 const union tcp_md5_addr *addr;
1254 u8 prefixlen = 32;
1255 int l3index = 0;
1256 u8 flags;
1257
1258 if (optlen < sizeof(cmd))
1259 return -EINVAL;
1260
1261 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1262 return -EFAULT;
1263
1264 if (sin->sin_family != AF_INET)
1265 return -EINVAL;
1266
1267 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1268
1269 if (optname == TCP_MD5SIG_EXT &&
1270 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1271 prefixlen = cmd.tcpm_prefixlen;
1272 if (prefixlen > 32)
1273 return -EINVAL;
1274 }
1275
1276 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1277 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1278 struct net_device *dev;
1279
1280 rcu_read_lock();
1281 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1282 if (dev && netif_is_l3_master(dev))
1283 l3index = dev->ifindex;
1284
1285 rcu_read_unlock();
1286
1287
1288
1289
1290 if (!dev || !l3index)
1291 return -EINVAL;
1292 }
1293
1294 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1295
1296 if (!cmd.tcpm_keylen)
1297 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1298
1299 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1300 return -EINVAL;
1301
1302 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1303 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1304}
1305
1306static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1307 __be32 daddr, __be32 saddr,
1308 const struct tcphdr *th, int nbytes)
1309{
1310 struct tcp4_pseudohdr *bp;
1311 struct scatterlist sg;
1312 struct tcphdr *_th;
1313
1314 bp = hp->scratch;
1315 bp->saddr = saddr;
1316 bp->daddr = daddr;
1317 bp->pad = 0;
1318 bp->protocol = IPPROTO_TCP;
1319 bp->len = cpu_to_be16(nbytes);
1320
1321 _th = (struct tcphdr *)(bp + 1);
1322 memcpy(_th, th, sizeof(*th));
1323 _th->check = 0;
1324
1325 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1326 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1327 sizeof(*bp) + sizeof(*th));
1328 return crypto_ahash_update(hp->md5_req);
1329}
1330
1331static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1332 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1333{
1334 struct tcp_md5sig_pool *hp;
1335 struct ahash_request *req;
1336
1337 hp = tcp_get_md5sig_pool();
1338 if (!hp)
1339 goto clear_hash_noput;
1340 req = hp->md5_req;
1341
1342 if (crypto_ahash_init(req))
1343 goto clear_hash;
1344 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1345 goto clear_hash;
1346 if (tcp_md5_hash_key(hp, key))
1347 goto clear_hash;
1348 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1349 if (crypto_ahash_final(req))
1350 goto clear_hash;
1351
1352 tcp_put_md5sig_pool();
1353 return 0;
1354
1355clear_hash:
1356 tcp_put_md5sig_pool();
1357clear_hash_noput:
1358 memset(md5_hash, 0, 16);
1359 return 1;
1360}
1361
1362int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1363 const struct sock *sk,
1364 const struct sk_buff *skb)
1365{
1366 struct tcp_md5sig_pool *hp;
1367 struct ahash_request *req;
1368 const struct tcphdr *th = tcp_hdr(skb);
1369 __be32 saddr, daddr;
1370
1371 if (sk) {
1372 saddr = sk->sk_rcv_saddr;
1373 daddr = sk->sk_daddr;
1374 } else {
1375 const struct iphdr *iph = ip_hdr(skb);
1376 saddr = iph->saddr;
1377 daddr = iph->daddr;
1378 }
1379
1380 hp = tcp_get_md5sig_pool();
1381 if (!hp)
1382 goto clear_hash_noput;
1383 req = hp->md5_req;
1384
1385 if (crypto_ahash_init(req))
1386 goto clear_hash;
1387
1388 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1389 goto clear_hash;
1390 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1391 goto clear_hash;
1392 if (tcp_md5_hash_key(hp, key))
1393 goto clear_hash;
1394 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1395 if (crypto_ahash_final(req))
1396 goto clear_hash;
1397
1398 tcp_put_md5sig_pool();
1399 return 0;
1400
1401clear_hash:
1402 tcp_put_md5sig_pool();
1403clear_hash_noput:
1404 memset(md5_hash, 0, 16);
1405 return 1;
1406}
1407EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1408
1409#endif
1410
1411static void tcp_v4_init_req(struct request_sock *req,
1412 const struct sock *sk_listener,
1413 struct sk_buff *skb)
1414{
1415 struct inet_request_sock *ireq = inet_rsk(req);
1416 struct net *net = sock_net(sk_listener);
1417
1418 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1419 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1420 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1421}
1422
1423static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1424 struct sk_buff *skb,
1425 struct flowi *fl,
1426 struct request_sock *req)
1427{
1428 tcp_v4_init_req(req, sk, skb);
1429
1430 if (security_inet_conn_request(sk, skb, req))
1431 return NULL;
1432
1433 return inet_csk_route_req(sk, &fl->u.ip4, req);
1434}
1435
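/* Request-socket operations used while a connection is still a request_sock
 * (SYN_RECV): retransmitting SYN-ACKs, sending ACKs and resets, and freeing
 * the saved IP options on destruction.
 */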
1436struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1437 .family = PF_INET,
1438 .obj_size = sizeof(struct tcp_request_sock),
1439 .rtx_syn_ack = tcp_rtx_synack,
1440 .send_ack = tcp_v4_reqsk_send_ack,
1441 .destructor = tcp_v4_reqsk_destructor,
1442 .send_reset = tcp_v4_send_reset,
1443 .syn_ack_timeout = tcp_syn_ack_timeout,
1444};
1445
1446const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1447 .mss_clamp = TCP_MSS_DEFAULT,
1448#ifdef CONFIG_TCP_MD5SIG
1449 .req_md5_lookup = tcp_v4_md5_lookup,
1450 .calc_md5_hash = tcp_v4_md5_hash_skb,
1451#endif
1452#ifdef CONFIG_SYN_COOKIES
1453 .cookie_init_seq = cookie_v4_init_sequence,
1454#endif
1455 .route_req = tcp_v4_route_req,
1456 .init_seq = tcp_v4_init_seq,
1457 .init_ts_off = tcp_v4_init_ts_off,
1458 .send_synack = tcp_v4_send_synack,
1459};
1460
1461int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1462{
1463
1464 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1465 goto drop;
1466
1467 return tcp_conn_request(&tcp_request_sock_ops,
1468 &tcp_request_sock_ipv4_ops, sk, skb);
1469
1470drop:
1471 tcp_listendrop(sk);
1472 return 0;
1473}
1474EXPORT_SYMBOL(tcp_v4_conn_request);
1475
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
1481struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1482 struct request_sock *req,
1483 struct dst_entry *dst,
1484 struct request_sock *req_unhash,
1485 bool *own_req)
1486{
1487 struct inet_request_sock *ireq;
1488 bool found_dup_sk = false;
1489 struct inet_sock *newinet;
1490 struct tcp_sock *newtp;
1491 struct sock *newsk;
1492#ifdef CONFIG_TCP_MD5SIG
1493 const union tcp_md5_addr *addr;
1494 struct tcp_md5sig_key *key;
1495 int l3index;
1496#endif
1497 struct ip_options_rcu *inet_opt;
1498
1499 if (sk_acceptq_is_full(sk))
1500 goto exit_overflow;
1501
1502 newsk = tcp_create_openreq_child(sk, req, skb);
1503 if (!newsk)
1504 goto exit_nonewsk;
1505
1506 newsk->sk_gso_type = SKB_GSO_TCPV4;
1507 inet_sk_rx_dst_set(newsk, skb);
1508
1509 newtp = tcp_sk(newsk);
1510 newinet = inet_sk(newsk);
1511 ireq = inet_rsk(req);
1512 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1513 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1514 newsk->sk_bound_dev_if = ireq->ir_iif;
1515 newinet->inet_saddr = ireq->ir_loc_addr;
1516 inet_opt = rcu_dereference(ireq->ireq_opt);
1517 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1518 newinet->mc_index = inet_iif(skb);
1519 newinet->mc_ttl = ip_hdr(skb)->ttl;
1520 newinet->rcv_tos = ip_hdr(skb)->tos;
1521 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1522 if (inet_opt)
1523 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1524 newinet->inet_id = prandom_u32();
1525
	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
1529 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1530 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1531
1532 if (!dst) {
1533 dst = inet_csk_route_child_sock(sk, newsk, req);
1534 if (!dst)
1535 goto put_and_exit;
1536 } else {
1537
1538 }
1539 sk_setup_caps(newsk, dst);
1540
1541 tcp_ca_openreq_child(newsk, dst);
1542
1543 tcp_sync_mss(newsk, dst_mtu(dst));
1544 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1545
1546 tcp_initialize_rcv_mss(newsk);
1547
1548#ifdef CONFIG_TCP_MD5SIG
1549 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1550
1551 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1552 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1553 if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
1560 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1561 key->key, key->keylen, GFP_ATOMIC);
1562 sk_gso_disable(newsk);
1563 }
1564#endif
1565
1566 if (__inet_inherit_port(sk, newsk) < 0)
1567 goto put_and_exit;
1568 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1569 &found_dup_sk);
1570 if (likely(*own_req)) {
1571 tcp_move_syn(newtp, req);
1572 ireq->ireq_opt = NULL;
1573 } else {
1574 newinet->inet_opt = NULL;
1575
1576 if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case.
			 */
1580 bh_unlock_sock(newsk);
1581 sock_put(newsk);
1582 newsk = NULL;
1583 }
1584 }
1585 return newsk;
1586
1587exit_overflow:
1588 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1589exit_nonewsk:
1590 dst_release(dst);
1591exit:
1592 tcp_listendrop(sk);
1593 return NULL;
1594put_and_exit:
1595 newinet->inet_opt = NULL;
1596 inet_csk_prepare_forced_close(newsk);
1597 tcp_done(newsk);
1598 goto exit;
1599}
1600EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1601
1602static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1603{
1604#ifdef CONFIG_SYN_COOKIES
1605 const struct tcphdr *th = tcp_hdr(skb);
1606
1607 if (!th->syn)
1608 sk = cookie_v4_check(sk, skb);
1609#endif
1610 return sk;
1611}
1612
1613u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1614 struct tcphdr *th, u32 *cookie)
1615{
1616 u16 mss = 0;
1617#ifdef CONFIG_SYN_COOKIES
1618 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1619 &tcp_request_sock_ipv4_ops, sk, th);
1620 if (mss) {
1621 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1622 tcp_synq_overflow(sk);
1623 }
1624#endif
1625 return mss;
1626}
1627
1628INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1629 u32));
1630
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
1638int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1639{
1640 enum skb_drop_reason reason;
1641 struct sock *rsk;
1642
1643 if (sk->sk_state == TCP_ESTABLISHED) {
1644 struct dst_entry *dst;
1645
1646 dst = rcu_dereference_protected(sk->sk_rx_dst,
1647 lockdep_sock_is_held(sk));
1648
1649 sock_rps_save_rxhash(sk, skb);
1650 sk_mark_napi_id(sk, skb);
1651 if (dst) {
1652 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1653 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1654 dst, 0)) {
1655 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1656 dst_release(dst);
1657 }
1658 }
1659 tcp_rcv_established(sk, skb);
1660 return 0;
1661 }
1662
1663 reason = SKB_DROP_REASON_NOT_SPECIFIED;
1664 if (tcp_checksum_complete(skb))
1665 goto csum_err;
1666
1667 if (sk->sk_state == TCP_LISTEN) {
1668 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1669
1670 if (!nsk)
1671 goto discard;
1672 if (nsk != sk) {
1673 if (tcp_child_process(sk, nsk, skb)) {
1674 rsk = nsk;
1675 goto reset;
1676 }
1677 return 0;
1678 }
1679 } else
1680 sock_rps_save_rxhash(sk, skb);
1681
1682 if (tcp_rcv_state_process(sk, skb)) {
1683 rsk = sk;
1684 goto reset;
1685 }
1686 return 0;
1687
1688reset:
1689 tcp_v4_send_reset(rsk, skb);
1690discard:
1691 kfree_skb_reason(skb, reason);
1692
1693
1694
1695
1696
1697 return 0;
1698
1699csum_err:
1700 reason = SKB_DROP_REASON_TCP_CSUM;
1701 trace_tcp_bad_csum(skb);
1702 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1703 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1704 goto discard;
1705}
1706EXPORT_SYMBOL(tcp_v4_do_rcv);
1707
1708int tcp_v4_early_demux(struct sk_buff *skb)
1709{
1710 const struct iphdr *iph;
1711 const struct tcphdr *th;
1712 struct sock *sk;
1713
1714 if (skb->pkt_type != PACKET_HOST)
1715 return 0;
1716
1717 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1718 return 0;
1719
1720 iph = ip_hdr(skb);
1721 th = tcp_hdr(skb);
1722
1723 if (th->doff < sizeof(struct tcphdr) / 4)
1724 return 0;
1725
1726 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1727 iph->saddr, th->source,
1728 iph->daddr, ntohs(th->dest),
1729 skb->skb_iif, inet_sdif(skb));
1730 if (sk) {
1731 skb->sk = sk;
1732 skb->destructor = sock_edemux;
1733 if (sk_fullsock(sk)) {
1734 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1735
1736 if (dst)
1737 dst = dst_check(dst, 0);
1738 if (dst &&
1739 sk->sk_rx_dst_ifindex == skb->skb_iif)
1740 skb_dst_set_noref(skb, dst);
1741 }
1742 }
1743 return 0;
1744}
1745
1746bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1747 enum skb_drop_reason *reason)
1748{
1749 u32 limit, tail_gso_size, tail_gso_segs;
1750 struct skb_shared_info *shinfo;
1751 const struct tcphdr *th;
1752 struct tcphdr *thtail;
1753 struct sk_buff *tail;
1754 unsigned int hdrlen;
1755 bool fragstolen;
1756 u32 gso_segs;
1757 u32 gso_size;
1758 int delta;
1759
	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the destination
	 * socket.
	 */
1766 skb_condense(skb);
1767
1768 skb_dst_drop(skb);
1769
1770 if (unlikely(tcp_checksum_complete(skb))) {
1771 bh_unlock_sock(sk);
1772 trace_tcp_bad_csum(skb);
1773 *reason = SKB_DROP_REASON_TCP_CSUM;
1774 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1775 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1776 return true;
1777 }
1778
1779
1780
1781
1782
1783 th = (const struct tcphdr *)skb->data;
1784 hdrlen = th->doff * 4;
1785
1786 tail = sk->sk_backlog.tail;
1787 if (!tail)
1788 goto no_coalesce;
1789 thtail = (struct tcphdr *)tail->data;
1790
1791 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1792 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1793 ((TCP_SKB_CB(tail)->tcp_flags |
1794 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1795 !((TCP_SKB_CB(tail)->tcp_flags &
1796 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1797 ((TCP_SKB_CB(tail)->tcp_flags ^
1798 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1799#ifdef CONFIG_TLS_DEVICE
1800 tail->decrypted != skb->decrypted ||
1801#endif
1802 thtail->doff != th->doff ||
1803 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1804 goto no_coalesce;
1805
1806 __skb_pull(skb, hdrlen);
1807
1808 shinfo = skb_shinfo(skb);
1809 gso_size = shinfo->gso_size ?: skb->len;
1810 gso_segs = shinfo->gso_segs ?: 1;
1811
1812 shinfo = skb_shinfo(tail);
1813 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1814 tail_gso_segs = shinfo->gso_segs ?: 1;
1815
1816 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1817 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1818
1819 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1820 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1821 thtail->window = th->window;
1822 }
1823
		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *       at least for 'GRO' packets.
		 */
1832 thtail->fin |= th->fin;
1833 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1834
1835 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1836 TCP_SKB_CB(tail)->has_rxtstamp = true;
1837 tail->tstamp = skb->tstamp;
1838 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1839 }
1840
1841
1842 shinfo->gso_size = max(gso_size, tail_gso_size);
1843 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1844
1845 sk->sk_backlog.len += delta;
1846 __NET_INC_STATS(sock_net(sk),
1847 LINUX_MIB_TCPBACKLOGCOALESCE);
1848 kfree_skb_partial(skb, fragstolen);
1849 return false;
1850 }
1851 __skb_push(skb, hdrlen);
1852
1853no_coalesce:
	/* Only the socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
1858 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
1859
1860 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1861 bh_unlock_sock(sk);
1862 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1863 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1864 return true;
1865 }
1866 return false;
1867}
1868EXPORT_SYMBOL(tcp_add_backlog);
1869
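/* Run the socket filters on @skb; a filter may trim the packet, but never
 * below the TCP header length (th->doff * 4).
 */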
1870int tcp_filter(struct sock *sk, struct sk_buff *skb)
1871{
1872 struct tcphdr *th = (struct tcphdr *)skb->data;
1873
1874 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1875}
1876EXPORT_SYMBOL(tcp_filter);
1877
1878static void tcp_v4_restore_cb(struct sk_buff *skb)
1879{
1880 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1881 sizeof(struct inet_skb_parm));
1882}
1883
1884static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1885 const struct tcphdr *th)
1886{
	/* This is tricky: we move IPCB to its correct location inside
	 * TCP_SKB_CB(); the barrier() makes sure the compiler won't play
	 * aliasing games.
	 */
1890 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1891 sizeof(struct inet_skb_parm));
1892 barrier();
1893
1894 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1895 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1896 skb->len - th->doff * 4);
1897 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1898 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1899 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1900 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1901 TCP_SKB_CB(skb)->sacked = 0;
1902 TCP_SKB_CB(skb)->has_rxtstamp =
1903 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1904}
1905
/*
 *	From tcp_input.c
 */
1910int tcp_v4_rcv(struct sk_buff *skb)
1911{
1912 struct net *net = dev_net(skb->dev);
1913 enum skb_drop_reason drop_reason;
1914 int sdif = inet_sdif(skb);
1915 int dif = inet_iif(skb);
1916 const struct iphdr *iph;
1917 const struct tcphdr *th;
1918 bool refcounted;
1919 struct sock *sk;
1920 int ret;
1921
1922 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1923 if (skb->pkt_type != PACKET_HOST)
1924 goto discard_it;
1925
1926
1927 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1928
1929 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1930 goto discard_it;
1931
1932 th = (const struct tcphdr *)skb->data;
1933
1934 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1935 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1936 goto bad_packet;
1937 }
1938 if (!pskb_may_pull(skb, th->doff * 4))
1939 goto discard_it;
1940
	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
1946 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1947 goto csum_error;
1948
1949 th = (const struct tcphdr *)skb->data;
1950 iph = ip_hdr(skb);
1951lookup:
1952 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1953 th->dest, sdif, &refcounted);
1954 if (!sk)
1955 goto no_tcp_socket;
1956
1957process:
1958 if (sk->sk_state == TCP_TIME_WAIT)
1959 goto do_time_wait;
1960
1961 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1962 struct request_sock *req = inet_reqsk(sk);
1963 bool req_stolen = false;
1964 struct sock *nsk;
1965
1966 sk = req->rsk_listener;
1967 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1968 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1969 else
1970 drop_reason = tcp_inbound_md5_hash(sk, skb,
1971 &iph->saddr, &iph->daddr,
1972 AF_INET, dif, sdif);
1973 if (unlikely(drop_reason)) {
1974 sk_drops_add(sk, skb);
1975 reqsk_put(req);
1976 goto discard_it;
1977 }
1978 if (tcp_checksum_complete(skb)) {
1979 reqsk_put(req);
1980 goto csum_error;
1981 }
1982 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1983 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
1984 if (!nsk) {
1985 inet_csk_reqsk_queue_drop_and_put(sk, req);
1986 goto lookup;
1987 }
1988 sk = nsk;
1989
1990
1991
1992 } else {
1993
1994
1995
1996 sock_hold(sk);
1997 }
1998 refcounted = true;
1999 nsk = NULL;
2000 if (!tcp_filter(sk, skb)) {
2001 th = (const struct tcphdr *)skb->data;
2002 iph = ip_hdr(skb);
2003 tcp_v4_fill_cb(skb, iph, th);
2004 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2005 } else {
2006 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2007 }
2008 if (!nsk) {
2009 reqsk_put(req);
2010 if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
2016 tcp_v4_restore_cb(skb);
2017 sock_put(sk);
2018 goto lookup;
2019 }
2020 goto discard_and_relse;
2021 }
2022 nf_reset_ct(skb);
2023 if (nsk == sk) {
2024 reqsk_put(req);
2025 tcp_v4_restore_cb(skb);
2026 } else if (tcp_child_process(sk, nsk, skb)) {
2027 tcp_v4_send_reset(nsk, skb);
2028 goto discard_and_relse;
2029 } else {
2030 sock_put(sk);
2031 return 0;
2032 }
2033 }
2034
2035 if (static_branch_unlikely(&ip4_min_ttl)) {
2036
2037 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2038 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2039 goto discard_and_relse;
2040 }
2041 }
2042
2043 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2044 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2045 goto discard_and_relse;
2046 }
2047
2048 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2049 &iph->daddr, AF_INET, dif, sdif);
2050 if (drop_reason)
2051 goto discard_and_relse;
2052
2053 nf_reset_ct(skb);
2054
2055 if (tcp_filter(sk, skb)) {
2056 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2057 goto discard_and_relse;
2058 }
2059 th = (const struct tcphdr *)skb->data;
2060 iph = ip_hdr(skb);
2061 tcp_v4_fill_cb(skb, iph, th);
2062
2063 skb->dev = NULL;
2064
2065 if (sk->sk_state == TCP_LISTEN) {
2066 ret = tcp_v4_do_rcv(sk, skb);
2067 goto put_and_return;
2068 }
2069
2070 sk_incoming_cpu_update(sk);
2071
2072 bh_lock_sock_nested(sk);
2073 tcp_segs_in(tcp_sk(sk), skb);
2074 ret = 0;
2075 if (!sock_owned_by_user(sk)) {
2076 ret = tcp_v4_do_rcv(sk, skb);
2077 } else {
2078 if (tcp_add_backlog(sk, skb, &drop_reason))
2079 goto discard_and_relse;
2080 }
2081 bh_unlock_sock(sk);
2082
2083put_and_return:
2084 if (refcounted)
2085 sock_put(sk);
2086
2087 return ret;
2088
2089no_tcp_socket:
2090 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2091 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2092 goto discard_it;
2093
2094 tcp_v4_fill_cb(skb, iph, th);
2095
2096 if (tcp_checksum_complete(skb)) {
2097csum_error:
2098 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2099 trace_tcp_bad_csum(skb);
2100 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2101bad_packet:
2102 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2103 } else {
2104 tcp_v4_send_reset(NULL, skb);
2105 }
2106
2107discard_it:
2108 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2109
2110 kfree_skb_reason(skb, drop_reason);
2111 return 0;
2112
2113discard_and_relse:
2114 sk_drops_add(sk, skb);
2115 if (refcounted)
2116 sock_put(sk);
2117 goto discard_it;
2118
2119do_time_wait:
2120 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2121 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2122 inet_twsk_put(inet_twsk(sk));
2123 goto discard_it;
2124 }
2125
2126 tcp_v4_fill_cb(skb, iph, th);
2127
2128 if (tcp_checksum_complete(skb)) {
2129 inet_twsk_put(inet_twsk(sk));
2130 goto csum_error;
2131 }
2132 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2133 case TCP_TW_SYN: {
2134 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2135 &tcp_hashinfo, skb,
2136 __tcp_hdrlen(th),
2137 iph->saddr, th->source,
2138 iph->daddr, th->dest,
2139 inet_iif(skb),
2140 sdif);
2141 if (sk2) {
2142 inet_twsk_deschedule_put(inet_twsk(sk));
2143 sk = sk2;
2144 tcp_v4_restore_cb(skb);
2145 refcounted = false;
2146 goto process;
2147 }
2148 }
2149
2150 fallthrough;
2151 case TCP_TW_ACK:
2152 tcp_v4_timewait_ack(sk, skb);
2153 break;
2154 case TCP_TW_RST:
2155 tcp_v4_send_reset(sk, skb);
2156 inet_twsk_deschedule_put(inet_twsk(sk));
2157 goto discard_it;
2158 case TCP_TW_SUCCESS:;
2159 }
2160 goto discard_it;
2161}
2162
2163static struct timewait_sock_ops tcp_timewait_sock_ops = {
2164 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2165 .twsk_unique = tcp_twsk_unique,
2166 .twsk_destructor= tcp_twsk_destructor,
2167};
2168
2169void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2170{
2171 struct dst_entry *dst = skb_dst(skb);
2172
2173 if (dst && dst_hold_safe(dst)) {
2174 rcu_assign_pointer(sk->sk_rx_dst, dst);
2175 sk->sk_rx_dst_ifindex = skb->skb_iif;
2176 }
2177}
2178EXPORT_SYMBOL(inet_sk_rx_dst_set);
2179
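/* AF_INET connection-level operations used by the generic TCP code for
 * plain IPv4 sockets: transmit, checksum, route rebuild and SYN handling.
 */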
2180const struct inet_connection_sock_af_ops ipv4_specific = {
2181 .queue_xmit = ip_queue_xmit,
2182 .send_check = tcp_v4_send_check,
2183 .rebuild_header = inet_sk_rebuild_header,
2184 .sk_rx_dst_set = inet_sk_rx_dst_set,
2185 .conn_request = tcp_v4_conn_request,
2186 .syn_recv_sock = tcp_v4_syn_recv_sock,
2187 .net_header_len = sizeof(struct iphdr),
2188 .setsockopt = ip_setsockopt,
2189 .getsockopt = ip_getsockopt,
2190 .addr2sockaddr = inet_csk_addr2sockaddr,
2191 .sockaddr_len = sizeof(struct sockaddr_in),
2192 .mtu_reduced = tcp_v4_mtu_reduced,
2193};
2194EXPORT_SYMBOL(ipv4_specific);
2195
2196#ifdef CONFIG_TCP_MD5SIG
2197static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2198 .md5_lookup = tcp_v4_md5_lookup,
2199 .calc_md5_hash = tcp_v4_md5_hash_skb,
2200 .md5_parse = tcp_v4_parse_md5_keys,
2201};
2202#endif
2203
2204
2205
2206
2207static int tcp_v4_init_sock(struct sock *sk)
2208{
2209 struct inet_connection_sock *icsk = inet_csk(sk);
2210
2211 tcp_init_sock(sk);
2212
2213 icsk->icsk_af_ops = &ipv4_specific;
2214
2215#ifdef CONFIG_TCP_MD5SIG
2216 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2217#endif
2218
2219 return 0;
2220}
2221
2222void tcp_v4_destroy_sock(struct sock *sk)
2223{
2224 struct tcp_sock *tp = tcp_sk(sk);
2225
2226 trace_tcp_destroy_sock(sk);
2227
2228 tcp_clear_xmit_timers(sk);
2229
2230 tcp_cleanup_congestion_control(sk);
2231
2232 tcp_cleanup_ulp(sk);
2233
2234
2235 tcp_write_queue_purge(sk);
2236
2237
2238 tcp_fastopen_active_disable_ofo_check(sk);
2239
2240
2241 skb_rbtree_purge(&tp->out_of_order_queue);
2242
2243#ifdef CONFIG_TCP_MD5SIG
2244
2245 if (tp->md5sig_info) {
2246 tcp_clear_md5_list(sk);
2247 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2248 tp->md5sig_info = NULL;
2249 }
2250#endif
2251
2252
2253 if (inet_csk(sk)->icsk_bind_hash)
2254 inet_put_port(sk);
2255
2256 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2257
2258
2259 tcp_free_fastopen_req(tp);
2260 tcp_fastopen_destroy_cipher(sk);
2261 tcp_saved_syn_free(tp);
2262
2263 sk_sockets_allocated_dec(sk);
2264}
2265EXPORT_SYMBOL(tcp_v4_destroy_sock);
2266
2267#ifdef CONFIG_PROC_FS
2268
2269
2270static unsigned short seq_file_family(const struct seq_file *seq);
2271
2272static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2273{
2274 unsigned short family = seq_file_family(seq);
2275
2276
2277 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2278 net_eq(sock_net(sk), seq_file_net(seq)));
2279}
2280
2281
2282
2283
2284static void *listening_get_first(struct seq_file *seq)
2285{
2286 struct tcp_iter_state *st = seq->private;
2287
2288 st->offset = 0;
2289 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2290 struct inet_listen_hashbucket *ilb2;
2291 struct hlist_nulls_node *node;
2292 struct sock *sk;
2293
2294 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2295 if (hlist_nulls_empty(&ilb2->nulls_head))
2296 continue;
2297
2298 spin_lock(&ilb2->lock);
2299 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2300 if (seq_sk_match(seq, sk))
2301 return sk;
2302 }
2303 spin_unlock(&ilb2->lock);
2304 }
2305
2306 return NULL;
2307}
2308
2309
2310
2311
2312
2313
2314static void *listening_get_next(struct seq_file *seq, void *cur)
2315{
2316 struct tcp_iter_state *st = seq->private;
2317 struct inet_listen_hashbucket *ilb2;
2318 struct hlist_nulls_node *node;
2319 struct sock *sk = cur;
2320
2321 ++st->num;
2322 ++st->offset;
2323
2324 sk = sk_nulls_next(sk);
2325 sk_nulls_for_each_from(sk, node) {
2326 if (seq_sk_match(seq, sk))
2327 return sk;
2328 }
2329
2330 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2331 spin_unlock(&ilb2->lock);
2332 ++st->bucket;
2333 return listening_get_first(seq);
2334}
2335
2336static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2337{
2338 struct tcp_iter_state *st = seq->private;
2339 void *rc;
2340
2341 st->bucket = 0;
2342 st->offset = 0;
2343 rc = listening_get_first(seq);
2344
2345 while (rc && *pos) {
2346 rc = listening_get_next(seq, rc);
2347 --*pos;
2348 }
2349 return rc;
2350}
2351
2352static inline bool empty_bucket(const struct tcp_iter_state *st)
2353{
2354 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2355}
2356
2357
2358
2359
2360
2361static void *established_get_first(struct seq_file *seq)
2362{
2363 struct tcp_iter_state *st = seq->private;
2364
2365 st->offset = 0;
2366 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2367 struct sock *sk;
2368 struct hlist_nulls_node *node;
2369 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2370
2371
2372 if (empty_bucket(st))
2373 continue;
2374
2375 spin_lock_bh(lock);
2376 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2377 if (seq_sk_match(seq, sk))
2378 return sk;
2379 }
2380 spin_unlock_bh(lock);
2381 }
2382
2383 return NULL;
2384}
2385
2386static void *established_get_next(struct seq_file *seq, void *cur)
2387{
2388 struct sock *sk = cur;
2389 struct hlist_nulls_node *node;
2390 struct tcp_iter_state *st = seq->private;
2391
2392 ++st->num;
2393 ++st->offset;
2394
2395 sk = sk_nulls_next(sk);
2396
2397 sk_nulls_for_each_from(sk, node) {
2398 if (seq_sk_match(seq, sk))
2399 return sk;
2400 }
2401
2402 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2403 ++st->bucket;
2404 return established_get_first(seq);
2405}
2406
2407static void *established_get_idx(struct seq_file *seq, loff_t pos)
2408{
2409 struct tcp_iter_state *st = seq->private;
2410 void *rc;
2411
2412 st->bucket = 0;
2413 rc = established_get_first(seq);
2414
2415 while (rc && pos) {
2416 rc = established_get_next(seq, rc);
2417 --pos;
2418 }
2419 return rc;
2420}
2421
2422static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2423{
2424 void *rc;
2425 struct tcp_iter_state *st = seq->private;
2426
2427 st->state = TCP_SEQ_STATE_LISTENING;
2428 rc = listening_get_idx(seq, &pos);
2429
2430 if (!rc) {
2431 st->state = TCP_SEQ_STATE_ESTABLISHED;
2432 rc = established_get_idx(seq, pos);
2433 }
2434
2435 return rc;
2436}
2437
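/* Re-seek to the bucket/offset saved in st so that a later read()
 * resumes where the previous one stopped, without rescanning the
 * sockets that were already reported; st->num is preserved.
 */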
2438static void *tcp_seek_last_pos(struct seq_file *seq)
2439{
2440 struct tcp_iter_state *st = seq->private;
2441 int bucket = st->bucket;
2442 int offset = st->offset;
2443 int orig_num = st->num;
2444 void *rc = NULL;
2445
2446 switch (st->state) {
2447 case TCP_SEQ_STATE_LISTENING:
2448 if (st->bucket > tcp_hashinfo.lhash2_mask)
2449 break;
2450 st->state = TCP_SEQ_STATE_LISTENING;
2451 rc = listening_get_first(seq);
2452 while (offset-- && rc && bucket == st->bucket)
2453 rc = listening_get_next(seq, rc);
2454 if (rc)
2455 break;
2456 st->bucket = 0;
2457 st->state = TCP_SEQ_STATE_ESTABLISHED;
2458 fallthrough;
2459 case TCP_SEQ_STATE_ESTABLISHED:
2460 if (st->bucket > tcp_hashinfo.ehash_mask)
2461 break;
2462 rc = established_get_first(seq);
2463 while (offset-- && rc && bucket == st->bucket)
2464 rc = established_get_next(seq, rc);
2465 }
2466
2467 st->num = orig_num;
2468
2469 return rc;
2470}
2471
2472void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2473{
2474 struct tcp_iter_state *st = seq->private;
2475 void *rc;
2476
2477 if (*pos && *pos == st->last_pos) {
2478 rc = tcp_seek_last_pos(seq);
2479 if (rc)
2480 goto out;
2481 }
2482
2483 st->state = TCP_SEQ_STATE_LISTENING;
2484 st->num = 0;
2485 st->bucket = 0;
2486 st->offset = 0;
2487 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2488
2489out:
2490 st->last_pos = *pos;
2491 return rc;
2492}
2493EXPORT_SYMBOL(tcp_seq_start);
2494
2495void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2496{
2497 struct tcp_iter_state *st = seq->private;
2498 void *rc = NULL;
2499
2500 if (v == SEQ_START_TOKEN) {
2501 rc = tcp_get_idx(seq, 0);
2502 goto out;
2503 }
2504
2505 switch (st->state) {
2506 case TCP_SEQ_STATE_LISTENING:
2507 rc = listening_get_next(seq, v);
2508 if (!rc) {
2509 st->state = TCP_SEQ_STATE_ESTABLISHED;
2510 st->bucket = 0;
2511 st->offset = 0;
2512 rc = established_get_first(seq);
2513 }
2514 break;
2515 case TCP_SEQ_STATE_ESTABLISHED:
2516 rc = established_get_next(seq, v);
2517 break;
2518 }
2519out:
2520 ++*pos;
2521 st->last_pos = *pos;
2522 return rc;
2523}
2524EXPORT_SYMBOL(tcp_seq_next);
2525
2526void tcp_seq_stop(struct seq_file *seq, void *v)
2527{
2528 struct tcp_iter_state *st = seq->private;
2529
2530 switch (st->state) {
2531 case TCP_SEQ_STATE_LISTENING:
2532 if (v != SEQ_START_TOKEN)
2533 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2534 break;
2535 case TCP_SEQ_STATE_ESTABLISHED:
2536 if (v)
2537 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2538 break;
2539 }
2540}
2541EXPORT_SYMBOL(tcp_seq_stop);
2542
2543static void get_openreq4(const struct request_sock *req,
2544 struct seq_file *f, int i)
2545{
2546 const struct inet_request_sock *ireq = inet_rsk(req);
2547 long delta = req->rsk_timer.expires - jiffies;
2548
2549 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2550 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2551 i,
2552 ireq->ir_loc_addr,
2553 ireq->ir_num,
2554 ireq->ir_rmt_addr,
2555 ntohs(ireq->ir_rmt_port),
2556 TCP_SYN_RECV,
2557 0, 0, /* could print option size, but that is af dependent. */
2558 1,    /* timers active (only the expire timer) */
2559 jiffies_delta_to_clock_t(delta),
2560 req->num_timeout,
2561 from_kuid_munged(seq_user_ns(f),
2562 sock_i_uid(req->rsk_listener)),
2563 0,  /* non standard timer */
2564 0, /* open_requests have no inode */
2565 0,
2566 req);
2567}
2568
2569static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2570{
2571 int timer_active;
2572 unsigned long timer_expires;
2573 const struct tcp_sock *tp = tcp_sk(sk);
2574 const struct inet_connection_sock *icsk = inet_csk(sk);
2575 const struct inet_sock *inet = inet_sk(sk);
2576 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2577 __be32 dest = inet->inet_daddr;
2578 __be32 src = inet->inet_rcv_saddr;
2579 __u16 destp = ntohs(inet->inet_dport);
2580 __u16 srcp = ntohs(inet->inet_sport);
2581 int rx_queue;
2582 int state;
2583
2584 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2585 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2586 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2587 timer_active = 1;
2588 timer_expires = icsk->icsk_timeout;
2589 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2590 timer_active = 4;
2591 timer_expires = icsk->icsk_timeout;
2592 } else if (timer_pending(&sk->sk_timer)) {
2593 timer_active = 2;
2594 timer_expires = sk->sk_timer.expires;
2595 } else {
2596 timer_active = 0;
2597 timer_expires = jiffies;
2598 }
2599
2600 state = inet_sk_state_load(sk);
2601 if (state == TCP_LISTEN)
2602 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2603 else
2604 /* Because we don't lock the socket,
2605  * we might find a transient negative value.
2606  */
2607 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2608 READ_ONCE(tp->copied_seq), 0);
2609
2610 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2611 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2612 i, src, srcp, dest, destp, state,
2613 READ_ONCE(tp->write_seq) - tp->snd_una,
2614 rx_queue,
2615 timer_active,
2616 jiffies_delta_to_clock_t(timer_expires - jiffies),
2617 icsk->icsk_retransmits,
2618 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2619 icsk->icsk_probes_out,
2620 sock_i_ino(sk),
2621 refcount_read(&sk->sk_refcnt), sk,
2622 jiffies_to_clock_t(icsk->icsk_rto),
2623 jiffies_to_clock_t(icsk->icsk_ack.ato),
2624 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2625 tcp_snd_cwnd(tp),
2626 state == TCP_LISTEN ?
2627 fastopenq->max_qlen :
2628 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2629}
2630
2631static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2632 struct seq_file *f, int i)
2633{
2634 long delta = tw->tw_timer.expires - jiffies;
2635 __be32 dest, src;
2636 __u16 destp, srcp;
2637
2638 dest = tw->tw_daddr;
2639 src = tw->tw_rcv_saddr;
2640 destp = ntohs(tw->tw_dport);
2641 srcp = ntohs(tw->tw_sport);
2642
2643 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2644 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2645 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2646 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2647 refcount_read(&tw->tw_refcnt), tw);
2648}
2649
2650#define TMPSZ 150	/* each /proc/net/tcp line is padded to TMPSZ - 1 chars plus '\n' */
2651
2652static int tcp4_seq_show(struct seq_file *seq, void *v)
2653{
2654 struct tcp_iter_state *st;
2655 struct sock *sk = v;
2656
2657 seq_setwidth(seq, TMPSZ - 1);
2658 if (v == SEQ_START_TOKEN) {
2659 seq_puts(seq, " sl local_address rem_address st tx_queue "
2660 "rx_queue tr tm->when retrnsmt uid timeout "
2661 "inode");
2662 goto out;
2663 }
2664 st = seq->private;
2665
2666 if (sk->sk_state == TCP_TIME_WAIT)
2667 get_timewait4_sock(v, seq, st->num);
2668 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2669 get_openreq4(v, seq, st->num);
2670 else
2671 get_tcp4_sock(v, seq, st->num);
2672out:
2673 seq_pad(seq, '\n');
2674 return 0;
2675}
2676
2677#ifdef CONFIG_BPF_SYSCALL
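/* The bpf iterator batches all matching sockets of one hash bucket
 * (holding a reference on each) so the bucket lock can be dropped
 * before the bpf prog runs in ->show().  cur_sk/end_sk index into the
 * batch; st_bucket_done records whether the whole bucket was captured.
 */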
2678struct bpf_tcp_iter_state {
2679 struct tcp_iter_state state;
2680 unsigned int cur_sk;
2681 unsigned int end_sk;
2682 unsigned int max_sk;
2683 struct sock **batch;
2684 bool st_bucket_done;
2685};
2686
2687struct bpf_iter__tcp {
2688 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2689 __bpf_md_ptr(struct sock_common *, sk_common);
2690 uid_t uid __aligned(8);
2691};
2692
2693static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2694 struct sock_common *sk_common, uid_t uid)
2695{
2696 struct bpf_iter__tcp ctx;
2697
2698 meta->seq_num--;  /* skip SEQ_START_TOKEN */
2699 ctx.meta = meta;
2700 ctx.sk_common = sk_common;
2701 ctx.uid = uid;
2702 return bpf_iter_run_prog(prog, &ctx);
2703}
2704
2705static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2706{
2707 while (iter->cur_sk < iter->end_sk)
2708 sock_put(iter->batch[iter->cur_sk++]);
2709}
2710
2711static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2712 unsigned int new_batch_sz)
2713{
2714 struct sock **new_batch;
2715
2716 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2717 GFP_USER | __GFP_NOWARN);
2718 if (!new_batch)
2719 return -ENOMEM;
2720
2721 bpf_iter_tcp_put_batch(iter);
2722 kvfree(iter->batch);
2723 iter->batch = new_batch;
2724 iter->max_sk = new_batch_sz;
2725
2726 return 0;
2727}
2728
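/* Starting from start_sk, hold a reference on every matching socket
 * left in the current bucket (up to iter->max_sk), drop the bucket
 * lock, and return how many matching sockets were seen so the caller
 * can grow the batch and retry if it was too small.
 */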
2729static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2730 struct sock *start_sk)
2731{
2732 struct bpf_tcp_iter_state *iter = seq->private;
2733 struct tcp_iter_state *st = &iter->state;
2734 struct hlist_nulls_node *node;
2735 unsigned int expected = 1;
2736 struct sock *sk;
2737
2738 sock_hold(start_sk);
2739 iter->batch[iter->end_sk++] = start_sk;
2740
2741 sk = sk_nulls_next(start_sk);
2742 sk_nulls_for_each_from(sk, node) {
2743 if (seq_sk_match(seq, sk)) {
2744 if (iter->end_sk < iter->max_sk) {
2745 sock_hold(sk);
2746 iter->batch[iter->end_sk++] = sk;
2747 }
2748 expected++;
2749 }
2750 }
2751 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2752
2753 return expected;
2754}
2755
2756static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2757 struct sock *start_sk)
2758{
2759 struct bpf_tcp_iter_state *iter = seq->private;
2760 struct tcp_iter_state *st = &iter->state;
2761 struct hlist_nulls_node *node;
2762 unsigned int expected = 1;
2763 struct sock *sk;
2764
2765 sock_hold(start_sk);
2766 iter->batch[iter->end_sk++] = start_sk;
2767
2768 sk = sk_nulls_next(start_sk);
2769 sk_nulls_for_each_from(sk, node) {
2770 if (seq_sk_match(seq, sk)) {
2771 if (iter->end_sk < iter->max_sk) {
2772 sock_hold(sk);
2773 iter->batch[iter->end_sk++] = sk;
2774 }
2775 expected++;
2776 }
2777 }
2778 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2779
2780 return expected;
2781}
2782
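/* Fill the batch from the bucket found via tcp_seek_last_pos().  If the
 * batch cannot hold the whole bucket, grow it to 1.5x the required size
 * and retry once; st_bucket_done is set only when every matching socket
 * of the bucket made it into the batch.
 */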
2783static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2784{
2785 struct bpf_tcp_iter_state *iter = seq->private;
2786 struct tcp_iter_state *st = &iter->state;
2787 unsigned int expected;
2788 bool resized = false;
2789 struct sock *sk;
2790
2791 /* The st->bucket is done.  Directly advance to the next
2792  * bucket instead of having the tcp_seek_last_pos() to skip
2793  * one by one in the current bucket and eventually find out
2794  * it has to advance to the next bucket.
2795  */
2796 if (iter->st_bucket_done) {
2797 st->offset = 0;
2798 st->bucket++;
2799 if (st->state == TCP_SEQ_STATE_LISTENING &&
2800 st->bucket > tcp_hashinfo.lhash2_mask) {
2801 st->state = TCP_SEQ_STATE_ESTABLISHED;
2802 st->bucket = 0;
2803 }
2804 }
2805
2806again:
2807 /* Get a new batch from the current bucket */
2808 iter->cur_sk = 0;
2809 iter->end_sk = 0;
2810 iter->st_bucket_done = false;
2811
2812 sk = tcp_seek_last_pos(seq);
2813 if (!sk)
2814 return NULL; /* Done */
2815
2816 if (st->state == TCP_SEQ_STATE_LISTENING)
2817 expected = bpf_iter_tcp_listening_batch(seq, sk);
2818 else
2819 expected = bpf_iter_tcp_established_batch(seq, sk);
2820
2821 if (iter->end_sk == expected) {
2822 iter->st_bucket_done = true;
2823 return sk;
2824 }
2825
2826 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2827 resized = true;
2828 goto again;
2829 }
2830
2831 return sk;
2832}
2833
2834static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2835{
2836 /* bpf iter does not support lseek, so it always
2837  * continues from where it was stop()-ped.
2838  */
2839 if (*pos)
2840 return bpf_iter_tcp_batch(seq);
2841
2842 return SEQ_START_TOKEN;
2843}
2844
2845static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2846{
2847 struct bpf_tcp_iter_state *iter = seq->private;
2848 struct tcp_iter_state *st = &iter->state;
2849 struct sock *sk;
2850
2851 /* Whenever seq_next() is called, the iter->cur_sk is
2852  * done with seq_show(), so advance to the next sk in
2853  * the batch.
2854  */
2855 if (iter->cur_sk < iter->end_sk) {
2856 /* Keeping st->num consistent in tcp_iter_state.
2857  * bpf_iter_tcp does not use st->num.
2858  * meta.seq_num is used instead.
2859  */
2860 st->num++;
2861 /* Move st->offset forward so that, after a stop(),
2862  * tcp_seek_last_pos() can find its way back to the
2863  * same position within the current bucket.
2864  */
2865 st->offset++;
2866 sock_put(iter->batch[iter->cur_sk++]);
2867 }
2868
2869 if (iter->cur_sk < iter->end_sk)
2870 sk = iter->batch[iter->cur_sk];
2871 else
2872 sk = bpf_iter_tcp_batch(seq);
2873
2874 ++*pos;
2875
2876 /* Keep st->last_pos consistent with *pos, as
2877  * tcp_seq_next() does; bpf iter does not lseek. */
2878 st->last_pos = *pos;
2879 return sk;
2880}
2881
2882static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2883{
2884 struct bpf_iter_meta meta;
2885 struct bpf_prog *prog;
2886 struct sock *sk = v;
2887 bool slow;
2888 uid_t uid;
2889 int ret;
2890
2891 if (v == SEQ_START_TOKEN)
2892 return 0;
2893
2894 if (sk_fullsock(sk))
2895 slow = lock_sock_fast(sk);
2896
2897 if (unlikely(sk_unhashed(sk))) {
2898 ret = SEQ_SKIP;
2899 goto unlock;
2900 }
2901
2902 if (sk->sk_state == TCP_TIME_WAIT) {
2903 uid = 0;
2904 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2905 const struct request_sock *req = v;
2906
2907 uid = from_kuid_munged(seq_user_ns(seq),
2908 sock_i_uid(req->rsk_listener));
2909 } else {
2910 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2911 }
2912
2913 meta.seq = seq;
2914 prog = bpf_iter_get_info(&meta, false);
2915 ret = tcp_prog_seq_show(prog, &meta, v, uid);
2916
2917unlock:
2918 if (sk_fullsock(sk))
2919 unlock_sock_fast(sk, slow);
2920 return ret;
2922}
2923
2924static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2925{
2926 struct bpf_tcp_iter_state *iter = seq->private;
2927 struct bpf_iter_meta meta;
2928 struct bpf_prog *prog;
2929
2930 if (!v) {
2931 meta.seq = seq;
2932 prog = bpf_iter_get_info(&meta, true);
2933 if (prog)
2934 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2935 }
2936
2937 if (iter->cur_sk < iter->end_sk) {
2938 bpf_iter_tcp_put_batch(iter);
2939 iter->st_bucket_done = false;
2940 }
2941}
2942
2943static const struct seq_operations bpf_iter_tcp_seq_ops = {
2944 .show = bpf_iter_tcp_seq_show,
2945 .start = bpf_iter_tcp_seq_start,
2946 .next = bpf_iter_tcp_seq_next,
2947 .stop = bpf_iter_tcp_seq_stop,
2948};
2949#endif
2950static unsigned short seq_file_family(const struct seq_file *seq)
2951{
2952 const struct tcp_seq_afinfo *afinfo;
2953
2954#ifdef CONFIG_BPF_SYSCALL
2955 /* Iterated from bpf_iter; let the bpf prog filter instead. */
2956 if (seq->op == &bpf_iter_tcp_seq_ops)
2957 return AF_UNSPEC;
2958#endif
2959
2960 /* Iterated from proc fs */
2961 afinfo = pde_data(file_inode(seq->file));
2962 return afinfo->family;
2963}
2964
2965static const struct seq_operations tcp4_seq_ops = {
2966 .show = tcp4_seq_show,
2967 .start = tcp_seq_start,
2968 .next = tcp_seq_next,
2969 .stop = tcp_seq_stop,
2970};
2971
2972static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2973 .family = AF_INET,
2974};
2975
2976static int __net_init tcp4_proc_init_net(struct net *net)
2977{
2978 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2979 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2980 return -ENOMEM;
2981 return 0;
2982}
2983
2984static void __net_exit tcp4_proc_exit_net(struct net *net)
2985{
2986 remove_proc_entry("tcp", net->proc_net);
2987}
2988
2989static struct pernet_operations tcp4_net_ops = {
2990 .init = tcp4_proc_init_net,
2991 .exit = tcp4_proc_exit_net,
2992};
2993
2994int __init tcp4_proc_init(void)
2995{
2996 return register_pernet_subsys(&tcp4_net_ops);
2997}
2998
2999void tcp4_proc_exit(void)
3000{
3001 unregister_pernet_subsys(&tcp4_net_ops);
3002}
3003#endif /* CONFIG_PROC_FS */
3004
3005/* @wake is one when sk_stream_write_space() calls us.
3006 * This sends EPOLLOUT only if notsent_bytes is half the limit.
3007 * This mimics the strategy used in sock_def_write_space().
3008 */
3009bool tcp_stream_memory_free(const struct sock *sk, int wake)
3010{
3011 const struct tcp_sock *tp = tcp_sk(sk);
3012 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3013 READ_ONCE(tp->snd_nxt);
3014
3015 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3016}
3017EXPORT_SYMBOL(tcp_stream_memory_free);
3018
3019struct proto tcp_prot = {
3020 .name = "TCP",
3021 .owner = THIS_MODULE,
3022 .close = tcp_close,
3023 .pre_connect = tcp_v4_pre_connect,
3024 .connect = tcp_v4_connect,
3025 .disconnect = tcp_disconnect,
3026 .accept = inet_csk_accept,
3027 .ioctl = tcp_ioctl,
3028 .init = tcp_v4_init_sock,
3029 .destroy = tcp_v4_destroy_sock,
3030 .shutdown = tcp_shutdown,
3031 .setsockopt = tcp_setsockopt,
3032 .getsockopt = tcp_getsockopt,
3033 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3034 .keepalive = tcp_set_keepalive,
3035 .recvmsg = tcp_recvmsg,
3036 .sendmsg = tcp_sendmsg,
3037 .sendpage = tcp_sendpage,
3038 .backlog_rcv = tcp_v4_do_rcv,
3039 .release_cb = tcp_release_cb,
3040 .hash = inet_hash,
3041 .unhash = inet_unhash,
3042 .get_port = inet_csk_get_port,
3043 .put_port = inet_put_port,
3044#ifdef CONFIG_BPF_SYSCALL
3045 .psock_update_sk_prot = tcp_bpf_update_proto,
3046#endif
3047 .enter_memory_pressure = tcp_enter_memory_pressure,
3048 .leave_memory_pressure = tcp_leave_memory_pressure,
3049 .stream_memory_free = tcp_stream_memory_free,
3050 .sockets_allocated = &tcp_sockets_allocated,
3051 .orphan_count = &tcp_orphan_count,
3052 .memory_allocated = &tcp_memory_allocated,
3053 .memory_pressure = &tcp_memory_pressure,
3054 .sysctl_mem = sysctl_tcp_mem,
3055 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3056 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3057 .max_header = MAX_TCP_HEADER,
3058 .obj_size = sizeof(struct tcp_sock),
3059 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3060 .twsk_prot = &tcp_timewait_sock_ops,
3061 .rsk_prot = &tcp_request_sock_ops,
3062 .h.hashinfo = &tcp_hashinfo,
3063 .no_autobind = true,
3064 .diag_destroy = tcp_abort,
3065};
3066EXPORT_SYMBOL(tcp_prot);
3067
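/* Per-netns teardown: release the reference held on the netns'
 * congestion control module and free this netns' tcp_death_row once
 * the last time-wait socket referencing it is gone.
 */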
3068static void __net_exit tcp_sk_exit(struct net *net)
3069{
3070 struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
3071
3072 if (net->ipv4.tcp_congestion_control)
3073 bpf_module_put(net->ipv4.tcp_congestion_control,
3074 net->ipv4.tcp_congestion_control->owner);
3075 if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
3076 kfree(tcp_death_row);
3077}
3078
3079static int __net_init tcp_sk_init(struct net *net)
3080{
3081 int cnt;
3082
3083 net->ipv4.sysctl_tcp_ecn = 2;
3084 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3085
3086 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3087 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3088 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3089 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3090 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3091
3092 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3093 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3094 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3095
3096 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3097 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3098 net->ipv4.sysctl_tcp_syncookies = 1;
3099 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3100 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3101 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3102 net->ipv4.sysctl_tcp_orphan_retries = 0;
3103 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3104 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3105 net->ipv4.sysctl_tcp_tw_reuse = 2;
3106 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3107
3108 net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
3109 if (!net->ipv4.tcp_death_row)
3110 return -ENOMEM;
3111 refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
3112 cnt = tcp_hashinfo.ehash_mask + 1;
3113 net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
3114 net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
3115
3116 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3117 net->ipv4.sysctl_tcp_sack = 1;
3118 net->ipv4.sysctl_tcp_window_scaling = 1;
3119 net->ipv4.sysctl_tcp_timestamps = 1;
3120 net->ipv4.sysctl_tcp_early_retrans = 3;
3121 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3122 net->ipv4.sysctl_tcp_slow_start_after_idle = 1;
3123 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3124 net->ipv4.sysctl_tcp_max_reordering = 300;
3125 net->ipv4.sysctl_tcp_dsack = 1;
3126 net->ipv4.sysctl_tcp_app_win = 31;
3127 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3128 net->ipv4.sysctl_tcp_frto = 2;
3129 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3130 /* This limits the percentage of the congestion window which we
3131  * will allow a single TSO frame to consume.  Building TSO frames
3132  * which are too large can cause TCP streams to be bursty.
3133  */
3134 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3135 /* Default TSQ limit of 16 TSO segments */
3136 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3137 /* rfc5961 challenge ack rate limiting */
3138 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3139 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3140 net->ipv4.sysctl_tcp_tso_rtt_log = 9;
3141 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3142 net->ipv4.sysctl_tcp_autocorking = 1;
3143 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3144 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3145 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3146 if (net != &init_net) {
3147 memcpy(net->ipv4.sysctl_tcp_rmem,
3148 init_net.ipv4.sysctl_tcp_rmem,
3149 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3150 memcpy(net->ipv4.sysctl_tcp_wmem,
3151 init_net.ipv4.sysctl_tcp_wmem,
3152 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3153 }
3154 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3155 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3156 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3157 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3158 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3159 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3160
3161 /* Reno is always built in */
3162 if (!net_eq(net, &init_net) &&
3163 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3164 init_net.ipv4.tcp_congestion_control->owner))
3165 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3166 else
3167 net->ipv4.tcp_congestion_control = &tcp_reno;
3168
3169 return 0;
3170}
3171
3172static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3173{
3174 struct net *net;
3175
3176 inet_twsk_purge(&tcp_hashinfo, AF_INET);
3177
3178 list_for_each_entry(net, net_exit_list, exit_list)
3179 tcp_fastopen_ctx_destroy(net);
3180}
3181
3182static struct pernet_operations __net_initdata tcp_sk_ops = {
3183 .init = tcp_sk_init,
3184 .exit = tcp_sk_exit,
3185 .exit_batch = tcp_sk_exit_batch,
3186};
3187
3188#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3189DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3190 struct sock_common *sk_common, uid_t uid)
3191
3192#define INIT_BATCH_SZ 16
3193
3194static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3195{
3196 struct bpf_tcp_iter_state *iter = priv_data;
3197 int err;
3198
3199 err = bpf_iter_init_seq_net(priv_data, aux);
3200 if (err)
3201 return err;
3202
3203 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3204 if (err) {
3205 bpf_iter_fini_seq_net(priv_data);
3206 return err;
3207 }
3208
3209 return 0;
3210}
3211
3212static void bpf_iter_fini_tcp(void *priv_data)
3213{
3214 struct bpf_tcp_iter_state *iter = priv_data;
3215
3216 bpf_iter_fini_seq_net(priv_data);
3217 kvfree(iter->batch);
3218}
3219
3220static const struct bpf_iter_seq_info tcp_seq_info = {
3221 .seq_ops = &bpf_iter_tcp_seq_ops,
3222 .init_seq_private = bpf_iter_init_tcp,
3223 .fini_seq_private = bpf_iter_fini_tcp,
3224 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3225};
3226
3227static const struct bpf_func_proto *
3228bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3229 const struct bpf_prog *prog)
3230{
3231 switch (func_id) {
3232 case BPF_FUNC_setsockopt:
3233 return &bpf_sk_setsockopt_proto;
3234 case BPF_FUNC_getsockopt:
3235 return &bpf_sk_getsockopt_proto;
3236 default:
3237 return NULL;
3238 }
3239}
3240
3241static struct bpf_iter_reg tcp_reg_info = {
3242 .target = "tcp",
3243 .ctx_arg_info_size = 1,
3244 .ctx_arg_info = {
3245 { offsetof(struct bpf_iter__tcp, sk_common),
3246 PTR_TO_BTF_ID_OR_NULL },
3247 },
3248 .get_func_proto = bpf_iter_tcp_get_func_proto,
3249 .seq_info = &tcp_seq_info,
3250};
3251
3252static void __init bpf_iter_register(void)
3253{
3254 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3255 if (bpf_iter_reg_target(&tcp_reg_info))
3256 pr_warn("Warning: could not register bpf iterator tcp\n");
3257}
3258
3259#endif
3260
3261void __init tcp_v4_init(void)
3262{
3263 int cpu, res;
3264
3265 for_each_possible_cpu(cpu) {
3266 struct sock *sk;
3267
3268 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3269 IPPROTO_TCP, &init_net);
3270 if (res)
3271 panic("Failed to create the TCP control socket.\n");
3272 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3273
3274 /* Please enforce IP_DF and IPID==0 for RST and
3275  * ACK sent in SYN-RECV and TIME-WAIT state.
3276  */
3277 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3278
3279 per_cpu(ipv4_tcp_sk, cpu) = sk;
3280 }
3281 if (register_pernet_subsys(&tcp_sk_ops))
3282 panic("Failed to create the TCP control socket.\n");
3283
3284#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3285 bpf_iter_register();
3286#endif
3287}
3288