// SPDX-License-Identifier: GPL-2.0-only
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mss check.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;

		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without affecting the normal replacement cycle.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
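
/* Editorial note (hedged), on the write_seq bump above: reusing a TIME-WAIT
 * four-tuple restarts the sequence space at tw_snd_nxt + 65535 + 2, i.e.
 * just past the largest unscaled window the old incarnation could have
 * advertised. Worked example: tw_snd_nxt == 0xffffffff gives
 * 0xffffffff + 0x10001, which wraps to 0x00010000. Should the sum wrap to
 * exactly 0, the "if (!seq)" guard substitutes 1, since a write_seq of 0
 * doubles as "unset" in tcp_v4_connect().
 */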

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = prandom_u32();

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
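
/* Editorial sketch (hedged, not kernel code): the minimal user-space sequence
 * that exercises tcp_v4_connect(). Address and port are placeholders and
 * error handling is elided:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(8080),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * connect(2) reaches tcp_v4_connect() via inet_stream_connect(); the function
 * then routes the destination, picks a source port in inet_hash_connect(),
 * and sends the SYN from tcp_connect().
 */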

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */
int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		provided by user and protocol used, not to link packet
 *		to socket.
 */

#ifdef CONFIG_TCP_MD5SIG
#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
#else
#define OPTION_BYTES sizeof(__be32)
#endif

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[OPTION_BYTES / sizeof(__be32)];
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : sk->sk_priority;
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	const union tcp_md5_addr *addr;
	int l3index;

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
				(inet_sk(sk)->tos & INET_ECN_MASK) :
				inet_sk(sk)->tos;

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (key->l3index && key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && (!best_match ||
			      key->prefixlen > best_match->prefixlen))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);

static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (key->l3index && key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index,
		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care of
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}
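
/* Editorial sketch (hedged, not kernel code): a user-space configuration that
 * lands in tcp_v4_parse_md5_keys() above. The peer address and key are
 * placeholders:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key; TCP_MD5SIG_EXT with
 * TCP_MD5SIG_FLAG_PREFIX additionally honors tcpm_prefixlen.
 */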

static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);

		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb,
				    int dif, int sdif)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and its wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	const union tcp_md5_addr *addr;
	unsigned char newhash[16];
	int genhash, l3index;

	/* sdif set, means packet ingressed via a device
	 * in an L3 domain and dif is set to it.
	 */
	l3index = sdif ? dif : 0;

	addr = (union tcp_md5_addr *)&iph->saddr;
	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
				     : "", l3index);
		return true;
	}
	return false;
#endif
	return false;
}

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs send to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);

/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = prandom_u32();

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
			       key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case only
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/* The socket must have it's spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

int tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}

bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
	u32 tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	int delta;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	skb_dst_drop(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
	    tail->decrypted != skb->decrypted ||
#endif
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);

	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;

	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;

	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		}

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *       at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	__skb_push(skb, hdrlen);

no_coalesce:
	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64*1024;

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
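
/* Editorial note (hedged): the drop threshold above works out to
 * sk_rcvbuf + sk_sndbuf + 64 KB. With common defaults of tcp_rmem[1] = 131072
 * and tcp_wmem[1] = 16384, that is 131072 + 16384 + 65536 = 212992 bytes of
 * backlogged truesize before LINUX_MIB_TCPBACKLOGDROP is incremented;
 * coalescing into the backlog tail is attempted first, even above the limit,
 * to keep truesize growth down.
 */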

int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;

	return sk_filter_trim_cap(sk, skb, th->doff * 4);
}
EXPORT_SYMBOL(tcp_filter);

static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}

static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}

/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	struct sk_buff *skb_to_free;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
			if (!nsk) {
				inet_csk_reqsk_queue_drop_and_put(sk, req);
				goto lookup;
			}
			sk = nsk;
			/* reuseport_migrate_sock() has already held one sk_refcnt
			 * before returning.
			 */
		} else {
			/* We own a reference on the listener, increase it again
			 * as we might lose it too soon.
			 */
			sock_hold(sk);
		}
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
		goto discard_and_relse;

	nf_reset_ct(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		skb_to_free = sk->sk_rx_skb_cache;
		sk->sk_rx_skb_cache = NULL;
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		if (tcp_add_backlog(sk, skb))
			goto discard_and_relse;
		skb_to_free = NULL;
	}
	bh_unlock_sock(sk);
	if (skb_to_free)
		__kfree_skb(skb_to_free);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
csum_error:
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		fallthrough;
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup		= tcp_v4_md5_lookup,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
	.md5_parse		= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get next listener socket follow cur.  If cur is NULL, get first socket
 * starting from bucket given in st->bucket; when st->bucket is zero the
 * very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;

	if (st->bpf_seq_afinfo)
		afinfo = st->bpf_seq_afinfo;
	else
		afinfo = PDE_DATA(file_inode(seq->file));

	if (!sk) {
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_nulls_head(&ilb->nulls_head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (afinfo->family == AF_UNSPEC ||
		    sk->sk_family == afinfo->family)
			return sk;
	}
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_seq_afinfo *afinfo;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	if (st->bpf_seq_afinfo)
		afinfo = st->bpf_seq_afinfo;
	else
		afinfo = PDE_DATA(file_inode(seq->file));

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if ((afinfo->family != AF_UNSPEC &&
			     sk->sk_family != afinfo->family) ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo;
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (st->bpf_seq_afinfo)
		afinfo = st->bpf_seq_afinfo;
	else
		afinfo = PDE_DATA(file_inode(seq->file));

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if ((afinfo->family == AF_UNSPEC ||
		     sk->sk_family == afinfo->family) &&
		    net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
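
/* Resume example (illustrative numbers, not from the original source): if a
 * reader consumed three entries from listening bucket 5 and comes back for
 * more, tcp_seq_start() below sees *pos == st->last_pos and calls
 * tcp_seek_last_pos(), which re-locks bucket 5 and skips st->offset == 3
 * sockets instead of rescanning every bucket from zero.  Sockets may have
 * been added or removed in between, so the walk is best effort, as with any
 * seq_file iteration over a live hash table.
 */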

void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);

void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);

void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);
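
/* A sketch of how the seq_file core drives the three hooks above for one
 * read() of /proc/net/tcp (simplified; see fs/seq_file.c for the real loop):
 *
 *	p = tcp_seq_start(seq, &pos);
 *	while (p && !seq_has_overflowed(seq)) {
 *		tcp4_seq_show(seq, p);
 *		p = tcp_seq_next(seq, p, &pos);
 *	}
 *	tcp_seq_stop(seq, p);
 *
 * Note the lock pairing: start()/next() leave the current listening or
 * ehash bucket locked while ->show() runs, and stop() drops whichever
 * bucket lock is still held for the current st->state.
 */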

static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
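
/* Example of a line produced by get_tcp4_sock() (the values are made up
 * for illustration):
 *
 *	0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 ...
 *
 * 0100007F:0016 is 127.0.0.1 port 22: the address is the raw __be32
 * printed with %08X (so it appears byte-swapped on little-endian hosts),
 * while the port has already been converted by ntohs().  "0A" is
 * TCP_LISTEN (10), the next two hex pairs are tx_queue:rx_queue, and the
 * remaining fields carry timer, retransmit, uid, inode and congestion
 * details in the order of the seq_printf() above.
 */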

static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}
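
/* A minimal userspace consumer of this file (an illustrative sketch, not
 * part of the kernel; build it as an ordinary C program):
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char line[256];
 *		FILE *f = fopen("/proc/net/tcp", "r");
 *
 *		if (!f)
 *			return 1;
 *		while (fgets(line, sizeof(line), f))
 *			fputs(line, stdout);
 *		fclose(f);
 *		return 0;
 *	}
 *
 * Each fgets() returns one fixed-width record padded by seq_pad() above;
 * the first line is the header emitted for SEQ_START_TOKEN.
 */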

#ifdef CONFIG_BPF_SYSCALL
struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct sock_common *, sk_common);
	uid_t uid __aligned(8);
};

static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			     struct sock_common *sk_common, uid_t uid)
{
	struct bpf_iter__tcp ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.sk_common = sk_common;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}

static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;

	if (v == SEQ_START_TOKEN)
		return 0;

	if (sk->sk_state == TCP_TIME_WAIT) {
		uid = 0;
	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
		const struct request_sock *req = v;

		uid = from_kuid_munged(seq_user_ns(seq),
				       sock_i_uid(req->rsk_listener));
	} else {
		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	}

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	return tcp_prog_seq_show(prog, &meta, v, uid);
}

static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)tcp_prog_seq_show(prog, &meta, v, 0);
	}

	tcp_seq_stop(seq, v);
}

static const struct seq_operations bpf_iter_tcp_seq_ops = {
	.show		= bpf_iter_tcp_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= bpf_iter_tcp_seq_stop,
};
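
/* What a BPF program attached to this iterator might look like (a hedged
 * sketch in libbpf style; the "iter/tcp" section name matches the "tcp"
 * target registered below, everything else is illustrative):
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (!skc)	// NULL on the end-of-iteration call
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "family=%d uid=%u\n",
 *			       skc->skc_family, ctx->uid);
 *		return 0;
 *	}
 *
 * bpf_iter_tcp_seq_stop() above is what generates that final invocation
 * with sk_common == NULL, letting the program flush per-iteration state.
 */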
#endif

static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

/* @wake is one when sk_stream_write_space() calls us.
 * This sends EPOLLOUT only if notsent_bytes is half the limit.
 * This mimics the strategy used in sock_def_writable().
 */
bool tcp_stream_memory_free(const struct sock *sk, int wake)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
			    READ_ONCE(tp->snd_nxt);

	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
}
EXPORT_SYMBOL(tcp_stream_memory_free);
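
/* Worked example of the shift above (illustrative numbers): with
 * tcp_notsent_lowat == 131072 and 70000 not-yet-sent bytes, a plain poll
 * (wake == 0) reports writable since 70000 < 131072, but the
 * sk_stream_write_space() path (wake == 1) compares 140000 < 131072 and
 * stays silent, so EPOLLOUT only fires once the unsent backlog drops
 * below half the limit.
 */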

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= tcp_bpf_update_proto,
#endif
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);

static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;

	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;

	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}
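
/* The per-netns values above are the defaults that appear under
 * /proc/sys/net/ipv4/ when a network namespace is created: e.g.
 * tcp_syncookies starts at 1 and tcp_tw_reuse at 2 (the loopback-only
 * reuse mode).  Child namespaces also inherit init_net's tcp_rmem and
 * tcp_wmem through the memcpy() above rather than recomputing them from
 * system memory.
 */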

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init	   = tcp_sk_init,
       .exit	   = tcp_sk_exit,
       .exit_batch = tcp_sk_exit_batch,
};

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)

static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct tcp_iter_state *st = priv_data;
	struct tcp_seq_afinfo *afinfo;
	int ret;

	afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
	if (!afinfo)
		return -ENOMEM;

	afinfo->family = AF_UNSPEC;
	st->bpf_seq_afinfo = afinfo;
	ret = bpf_iter_init_seq_net(priv_data, aux);
	if (ret)
		kfree(afinfo);
	return ret;
}

static void bpf_iter_fini_tcp(void *priv_data)
{
	struct tcp_iter_state *st = priv_data;

	kfree(st->bpf_seq_afinfo);
	bpf_iter_fini_seq_net(priv_data);
}

static const struct bpf_iter_seq_info tcp_seq_info = {
	.seq_ops		= &bpf_iter_tcp_seq_ops,
	.init_seq_private	= bpf_iter_init_tcp,
	.fini_seq_private	= bpf_iter_fini_tcp,
	.seq_priv_size		= sizeof(struct tcp_iter_state),
};

static struct bpf_iter_reg tcp_reg_info = {
	.target			= "tcp",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info		= &tcp_seq_info,
};

static void __init bpf_iter_register(void)
{
	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
	if (bpf_iter_reg_target(&tcp_reg_info))
		pr_warn("Warning: could not register bpf iterator tcp\n");
}
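
/* Once bpf_iter_reg_target() succeeds, userspace can pin an iterator and
 * read it like a file.  A hedged usage sketch with bpftool (object and pin
 * paths are illustrative):
 *
 *	bpftool iter pin tcp_iter.o /sys/fs/bpf/tcp_dump
 *	cat /sys/fs/bpf/tcp_dump
 *
 * Each read replays the same listening/established walk as /proc/net/tcp,
 * but the output format is whatever the attached program prints.
 */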

#endif

void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif
}