// SPDX-License-Identifier: GPL-2.0-only
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}
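
/* Decide whether a TIME-WAIT socket may be reused for a new outgoing
 * connection with the same 4-tuple: returns 1 (after priming the new
 * socket's sequence number and timestamp state) when reuse is safe,
 * 0 otherwise.
 */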
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;

		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to be too magical.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}

	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = prandom_u32();

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP segment because of PMTU discovery. */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  remaining, TCP_RTO_MAX);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */
int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */
	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		provided by the caller, and not from the IP header.
 *
 *	Exception: precedence violation. We do not implement it in any case.
 */

#ifdef CONFIG_TCP_MD5SIG
#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
#else
#define OPTION_BYTES sizeof(__be32)
#endif

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[OPTION_BYTES / sizeof(__be32)];
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : sk->sk_priority;
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	const union tcp_md5_addr *addr;
	int l3index;

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
				(inet_sk(sk)->tos & INET_ECN_MASK) :
				inet_sk(sk)->tos;

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG

/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);

static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
	if (!old)
		return true;

	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
		return false;
	if (old->l3index == 0 && new->l3index)
		return true;

	return old->prefixlen < new->prefixlen;
}

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);

static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care of
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}
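
/* Feed the TCP pseudo-header plus a copy of the TCP header with its
 * checksum field zeroed into the MD5 request, since the RFC 2385
 * signature covers the pseudo-header, TCP header and payload.
 */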
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr,
			       const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);

		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb,
				    int dif, int sdif)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and its wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	const union tcp_md5_addr *addr;
	unsigned char newhash[16];
	int genhash, l3index;

	/* sdif set, means packet ingressed via a device
	 * in an L3 domain and dif is set to it.
	 */
	l3index = sdif ? dif : 0;

	addr = (union tcp_md5_addr *)&iph->saddr;
	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
				     : "", l3index);
		return true;
	}
	return false;
#endif
	return false;
}
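
/* Fill in the IPv4-specific fields of a new request sock from the incoming
 * SYN: our receive address is the SYN's destination, the peer is its source,
 * and any IP options are saved for use in the eventual SYN-ACK.
 */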
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs send to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);

/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = prandom_u32();

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
			       key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case only
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/* The socket must have it's spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles well
	 * and it seems the compiler has no problem with it. Just let it be.
	 */
	return 0;

csum_err:
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

int tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}
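
/* Queue a segment on the socket backlog while the socket is owned by user
 * context, coalescing it with the backlog tail when the two segments are
 * compatible. Returns true if the skb was dropped (the socket is unlocked
 * before returning in that case).
 */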
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
	u32 tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	int delta;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	skb_dst_drop(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing tcp segments if the tail of the backlog queue
	 * carries a compatible segment (same flags, contiguous sequence).
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
	    tail->decrypted != skb->decrypted ||
#endif
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);

	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;

	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;

	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		}

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we notice that tail of this ooo packet
		 * has FIN flag set.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	__skb_push(skb, hdrlen);

no_coalesce:
	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64 * 1024;

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
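
/* Run the installed socket filter on the skb, trimming it to no less than
 * the header length the TCP header claims (th->doff * 4 bytes).
 */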
int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;

	return sk_filter_trim_cap(sk, skb, th->doff * 4);
}
EXPORT_SYMBOL(tcp_filter);

static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}

static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}

/*
 *	From tcp_input.c
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	struct sk_buff *skb_to_free;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
			if (!nsk) {
				inet_csk_reqsk_queue_drop_and_put(sk, req);
				goto lookup;
			}
			sk = nsk;
			/* reuseport_migrate_sock() has already held one sk_refcnt
			 * before returning.
			 */
		} else {
			/* We own a reference on the listener, increase it again
			 * as we might lose it too soon.
			 */
			sock_hold(sk);
		}
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
		goto discard_and_relse;

	nf_reset_ct(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		skb_to_free = sk->sk_rx_skb_cache;
		sk->sk_rx_skb_cache = NULL;
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		if (tcp_add_backlog(sk, skb))
			goto discard_and_relse;
		skb_to_free = NULL;
	}
	bh_unlock_sock(sk);
	if (skb_to_free)
		__kfree_skb(skb_to_free);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
csum_error:
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		fallthrough;
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};
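
/* Cache the ingress dst on the socket so the established fast path can
 * reuse it without a route lookup, recording the ifindex it is valid for.
 */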
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things set to zero explicitly by call to
 *	 sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static unsigned short seq_file_family(const struct seq_file *seq);

static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
{
	unsigned short family = seq_file_family(seq);

	/* AF_UNSPEC is used as a match all */
	return ((family == AF_UNSPEC || family == sk->sk_family) &&
		net_eq(sock_net(sk), seq_file_net(seq)));
}

/* Find a non empty bucket (starting from st->bucket)
 * and return the first sk from it.
 */
static void *listening_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
		struct inet_listen_hashbucket *ilb2;
		struct inet_connection_sock *icsk;
		struct sock *sk;

		ilb2 = &tcp_hashinfo.lhash2[st->bucket];
		if (hlist_empty(&ilb2->head))
			continue;

		spin_lock(&ilb2->lock);
		inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
			sk = (struct sock *)icsk;
			if (seq_sk_match(seq, sk))
				return sk;
		}
		spin_unlock(&ilb2->lock);
	}

	return NULL;
}

/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
 * If "cur" is the last one in the st->bucket,
 * call listening_get_first() to return the first sk of the next
 * non empty bucket.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_iter_state *st = seq->private;
	struct inet_listen_hashbucket *ilb2;
	struct inet_connection_sock *icsk;
	struct sock *sk = cur;

	++st->num;
	++st->offset;

	icsk = inet_csk(sk);
	inet_lhash2_for_each_icsk_continue(icsk) {
		sk = (struct sock *)icsk;
		if (seq_sk_match(seq, sk))
			return sk;
	}

	ilb2 = &tcp_hashinfo.lhash2[st->bucket];
	spin_unlock(&ilb2->lock);
	++st->bucket;
	return listening_get_first(seq);
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_first(seq);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (seq_sk_match(seq, sk))
				return sk;
		}
		spin_unlock_bh(lock);
	}

	return NULL;
}
2416
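/* Return the next established socket after "cur" in the same bucket,
 * dropping the bucket lock and moving on to the next bucket when the
 * current chain is exhausted.
 */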
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

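/* Skip "pos" established sockets from the first hash bucket. */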
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

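/* Resolve an absolute position: listening sockets come first in the
 * output, followed by the established (and time-wait) sockets.
 */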
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}

	return rc;
}

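/* Fast path for a restarted read: resume from the bucket and offset
 * saved in the iterator state instead of rescanning from the start.
 */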
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket > tcp_hashinfo.lhash2_mask)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);

void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);

void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);

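/* Format one SYN_RECV request socket as a /proc/net/tcp line. */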
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0,  /* open_requests have no inode */
		0,
		req);
}

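/* Format one full socket as a /proc/net/tcp line. */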
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active = 1;
		timer_expires = icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active = 4;
		timer_expires = icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active = 2;
		timer_expires = sk->sk_timer.expires;
	} else {
		timer_active = 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

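/* Format one time-wait socket as a /proc/net/tcp line. */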
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest = tw->tw_daddr;
	src = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}

#ifdef CONFIG_BPF_SYSCALL
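/* State for the bpf tcp iterator: extends tcp_iter_state with a
 * refcounted batch of sockets grabbed from one hash bucket, so the
 * bpf prog can run on each socket without the bucket lock held.
 */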
struct bpf_tcp_iter_state {
	struct tcp_iter_state state;
	unsigned int cur_sk;
	unsigned int end_sk;
	unsigned int max_sk;
	struct sock **batch;
	bool st_bucket_done;
};

struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct sock_common *, sk_common);
	uid_t uid __aligned(8);
};

static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			     struct sock_common *sk_common, uid_t uid)
{
	struct bpf_iter__tcp ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.sk_common = sk_common;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}

static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
{
	while (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);
}

static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
				      unsigned int new_batch_sz)
{
	struct sock **new_batch;

	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
			     GFP_USER | __GFP_NOWARN);
	if (!new_batch)
		return -ENOMEM;

	bpf_iter_tcp_put_batch(iter);
	kvfree(iter->batch);
	iter->batch = new_batch;
	iter->max_sk = new_batch_sz;

	return 0;
}

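/* Batch every matching socket in the current listening bucket starting
 * from start_sk, then drop the bucket lock.  The return value is how
 * many sockets the bucket really contains; if it exceeds end_sk, the
 * caller grows the batch and retries.
 */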
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
						 struct sock *start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct inet_connection_sock *icsk;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	icsk = inet_csk(start_sk);
	inet_lhash2_for_each_icsk_continue(icsk) {
		sk = (struct sock *)icsk;
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++] = sk;
			}
			expected++;
		}
	}
	spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);

	return expected;
}

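/* Same as above, but for one bucket of the established hash table. */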
static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
						   struct sock *start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	sk = sk_nulls_next(start_sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++] = sk;
			}
			expected++;
		}
	}
	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));

	return expected;
}

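/* Fill iter->batch with the next bucket's worth of sockets.  If the
 * first attempt cannot hold the whole bucket, the batch array is
 * resized once and the bucket is re-read from the start.
 */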
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	/* The current bucket is fully batched.  Advance directly to the
	 * next bucket instead of letting tcp_seek_last_pos() walk the
	 * whole bucket again just to find out it is done.
	 */
	if (iter->st_bucket_done) {
		st->offset = 0;
		st->bucket++;
		if (st->state == TCP_SEQ_STATE_LISTENING &&
		    st->bucket > tcp_hashinfo.lhash2_mask) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
		}
	}

again:
	/* Get a new batch. */
	iter->cur_sk = 0;
	iter->end_sk = 0;
	iter->st_bucket_done = false;

	sk = tcp_seek_last_pos(seq);
	if (!sk)
		return NULL; /* Done */

	if (st->state == TCP_SEQ_STATE_LISTENING)
		expected = bpf_iter_tcp_listening_batch(seq, sk);
	else
		expected = bpf_iter_tcp_established_batch(seq, sk);

	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}

static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	/* bpf iter does not support lseek, so it always
	 * continues from where it was stop()-ped.
	 */
	if (*pos)
		return bpf_iter_tcp_batch(seq);

	return SEQ_START_TOKEN;
}

static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *sk;

	/* Whenever seq_next() is called, iter->cur_sk is done with
	 * seq_show(), so drop its refcount and advance to the next sk
	 * in the batch.
	 */
	if (iter->cur_sk < iter->end_sk) {
		/* st->num is kept consistent with tcp_iter_state even
		 * though bpf_iter_tcp reports positions via meta.seq_num
		 * instead.
		 */
		st->num++;
		/* Move st->offset to the next sk in the bucket so that
		 * a future start() resumes at st->offset in st->bucket;
		 * see tcp_seek_last_pos().
		 */
		st->offset++;
		sock_put(iter->batch[iter->cur_sk++]);
	}

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk];
	else
		sk = bpf_iter_tcp_batch(seq);

	++*pos;
	/* bpf iter does not do lseek, so st->last_pos
	 * always equals *pos.
	 */
	st->last_pos = *pos;
	return sk;
}

static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	bool slow;
	uid_t uid;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	if (sk_fullsock(sk))
		slow = lock_sock_fast(sk);

	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	if (sk->sk_state == TCP_TIME_WAIT) {
		uid = 0;
	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
		const struct request_sock *req = v;

		uid = from_kuid_munged(seq_user_ns(seq),
				       sock_i_uid(req->rsk_listener));
	} else {
		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	}

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = tcp_prog_seq_show(prog, &meta, v, uid);

unlock:
	if (sk_fullsock(sk))
		unlock_sock_fast(sk, slow);
	return ret;
}

static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)tcp_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk) {
		bpf_iter_tcp_put_batch(iter);
		iter->st_bucket_done = false;
	}
}

static const struct seq_operations bpf_iter_tcp_seq_ops = {
	.show = bpf_iter_tcp_seq_show,
	.start = bpf_iter_tcp_seq_start,
	.next = bpf_iter_tcp_seq_next,
	.stop = bpf_iter_tcp_seq_stop,
};
#endif

static unsigned short seq_file_family(const struct seq_file *seq)
{
	const struct tcp_seq_afinfo *afinfo;

#ifdef CONFIG_BPF_SYSCALL
	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
	if (seq->op == &bpf_iter_tcp_seq_ops)
		return AF_UNSPEC;
#endif

	/* Iterated from proc fs. */
	afinfo = PDE_DATA(file_inode(seq->file));
	return afinfo->family;
}

static const struct seq_operations tcp4_seq_ops = {
	.show = tcp4_seq_show,
	.start = tcp_seq_start,
	.next = tcp_seq_next,
	.stop = tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family = AF_INET,
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

/* @wake is one when sk_stream_write_space() calls us.
 * This sends EPOLLOUT only if notsent_bytes is half the limit.
 * This prevents generating 1 event per write, avoiding waking up
 * unnecessarily poll/select/epoll waiters.
 */
bool tcp_stream_memory_free(const struct sock *sk, int wake)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
			    READ_ONCE(tp->snd_nxt);

	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
}
EXPORT_SYMBOL(tcp_stream_memory_free);

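/* Protocol hooks for IPv4 TCP; registered from inet_init() as the
 * SOCK_STREAM/IPPROTO_TCP entry of the AF_INET family.
 */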
struct proto tcp_prot = {
	.name = "TCP",
	.owner = THIS_MODULE,
	.close = tcp_close,
	.pre_connect = tcp_v4_pre_connect,
	.connect = tcp_v4_connect,
	.disconnect = tcp_disconnect,
	.accept = inet_csk_accept,
	.ioctl = tcp_ioctl,
	.init = tcp_v4_init_sock,
	.destroy = tcp_v4_destroy_sock,
	.shutdown = tcp_shutdown,
	.setsockopt = tcp_setsockopt,
	.getsockopt = tcp_getsockopt,
	.bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
	.keepalive = tcp_set_keepalive,
	.recvmsg = tcp_recvmsg,
	.sendmsg = tcp_sendmsg,
	.sendpage = tcp_sendpage,
	.backlog_rcv = tcp_v4_do_rcv,
	.release_cb = tcp_release_cb,
	.hash = inet_hash,
	.unhash = inet_unhash,
	.get_port = inet_csk_get_port,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot = tcp_bpf_update_proto,
#endif
	.enter_memory_pressure = tcp_enter_memory_pressure,
	.leave_memory_pressure = tcp_leave_memory_pressure,
	.stream_memory_free = tcp_stream_memory_free,
	.sockets_allocated = &tcp_sockets_allocated,
	.orphan_count = &tcp_orphan_count,
	.memory_allocated = &tcp_memory_allocated,
	.memory_pressure = &tcp_memory_pressure,
	.sysctl_mem = sysctl_tcp_mem,
	.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header = MAX_TCP_HEADER,
	.obj_size = sizeof(struct tcp_sock),
	.slab_flags = SLAB_TYPESAFE_BY_RCU,
	.twsk_prot = &tcp_timewait_sock_ops,
	.rsk_prot = &tcp_request_sock_ops,
	.h.hashinfo = &tcp_hashinfo,
	.no_autobind = true,
	.diag_destroy = tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);

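/* Per-netns teardown: drop the reference on the netns' congestion
 * control module and destroy the per-cpu control sockets.
 */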
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

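/* Per-netns setup: create the per-cpu control sockets used to send
 * RSTs and ACKs, and initialize every TCP sysctl to its default.
 */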
static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK-less SYN segments in IPv4.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1;
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;

	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments. */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
	/* rfc5961 challenge ack rate limiting. */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in. */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init = tcp_sk_init,
	.exit = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

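/* Register the "tcp" bpf iterator target: a bpf program attached here
 * is invoked once per TCP socket.  seq_file_family() returns AF_UNSPEC
 * for this seq_ops, so no family is filtered out.
 */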
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)

#define INIT_BATCH_SZ 16

static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_tcp_iter_state *iter = priv_data;
	int err;

	err = bpf_iter_init_seq_net(priv_data, aux);
	if (err)
		return err;

	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
	if (err) {
		bpf_iter_fini_seq_net(priv_data);
		return err;
	}

	return 0;
}

static void bpf_iter_fini_tcp(void *priv_data)
{
	struct bpf_tcp_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}

static const struct bpf_iter_seq_info tcp_seq_info = {
	.seq_ops = &bpf_iter_tcp_seq_ops,
	.init_seq_private = bpf_iter_init_tcp,
	.fini_seq_private = bpf_iter_fini_tcp,
	.seq_priv_size = sizeof(struct bpf_tcp_iter_state),
};

static const struct bpf_func_proto *
bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
			    const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_setsockopt:
		return &bpf_sk_setsockopt_proto;
	case BPF_FUNC_getsockopt:
		return &bpf_sk_getsockopt_proto;
	default:
		return NULL;
	}
}

static struct bpf_iter_reg tcp_reg_info = {
	.target = "tcp",
	.ctx_arg_info_size = 1,
	.ctx_arg_info = {
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.get_func_proto = bpf_iter_tcp_get_func_proto,
	.seq_info = &tcp_seq_info,
};

static void __init bpf_iter_register(void)
{
	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
	if (bpf_iter_reg_target(&tcp_reg_info))
		pr_warn("Warning: could not register bpf iterator tcp\n");
}

#endif

void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif
}
