/*
 * Transmission Control Protocol (TCP): core socket-level implementation.
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/inet_diag.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/scatterlist.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/random.h>
#include <linux/memblock.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/errqueue.h>
#include <linux/static_key.h>

#include <net/icmp.h>
#include <net/inet_common.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/sock.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <net/busy_poll.h>

struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);

long sysctl_tcp_mem[3] __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_mem);

atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
EXPORT_SYMBOL(tcp_memory_allocated);

#if IS_ENABLED(CONFIG_SMC)
DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
EXPORT_SYMBOL(tcp_have_smc);
#endif

/* Current number of TCP sockets. */
struct percpu_counter tcp_sockets_allocated;
EXPORT_SYMBOL(tcp_sockets_allocated);

struct tcp_splice_state {
	struct pipe_inode_info *pipe;
	size_t len;
	unsigned int flags;
};

/* Pressure flag: set while the TCP stack is under global memory
 * pressure; it is updated locklessly, so readers only get a hint.
 */
unsigned long tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL_GPL(tcp_memory_pressure);

DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
EXPORT_SYMBOL(tcp_rx_skb_cache_key);

DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);

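/* Enter global TCP memory pressure. The jiffies value stored in
 * tcp_memory_pressure doubles as the episode start time, letting
 * tcp_leave_memory_pressure() account the duration; cmpxchg() makes
 * sure only one CPU logs the MIB event per episode.
 */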
void tcp_enter_memory_pressure(struct sock *sk)
{
	unsigned long val;

	if (tcp_memory_pressure)
		return;
	val = jiffies;

	if (!val)
		val--;
	if (!cmpxchg(&tcp_memory_pressure, 0, val))
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
}
EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure);

void tcp_leave_memory_pressure(struct sock *sk)
{
	unsigned long val;

	if (!tcp_memory_pressure)
		return;
	val = xchg(&tcp_memory_pressure, 0);
	if (val)
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
			      jiffies_to_msecs(jiffies - val));
}
EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure);

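/* Convert a seconds value into a count of retransmissions, assuming
 * the timeout doubles after every attempt and is capped at rto_max.
 */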
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
{
	u8 res = 0;

	if (seconds > 0) {
		int period = timeout;

		res = 1;
		while (seconds > period && res < 255) {
			res++;
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return res;
}

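/* Inverse of secs_to_retrans(): how many seconds a given number of
 * retransmissions takes with exponential backoff capped at rto_max.
 */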
static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
{
	int period = 0;

	if (retrans > 0) {
		period = timeout;
		while (--retrans) {
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return period;
}

static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
{
	u32 rate = READ_ONCE(tp->rate_delivered);
	u32 intv = READ_ONCE(tp->rate_interval_us);
	u64 rate64 = 0;

	if (rate && intv) {
		rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
		do_div(rate64, intv);
	}
	return rate64;
}

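/* Address-family independent initialization for a tcp_sock.
 *
 * NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */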
void tcp_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	tp->out_of_order_queue = RB_ROOT;
	sk->tcp_rtx_queue = RB_ROOT;
	tcp_init_xmit_timers(sk);
	INIT_LIST_HEAD(&tp->tsq_node);
	INIT_LIST_HEAD(&tp->tsorted_sent_queue);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
	minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);

	/* Start with the RFC 6928 initial congestion window. */
	tp->snd_cwnd = TCP_INIT_CWND;

	/* There's a bubble in the pipe until at least the first ACK. */
	tp->app_limited = ~0U;

	/* Slow start threshold starts "infinite"; only the clamp bounds
	 * cwnd growth until the first loss event.
	 */
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;

	tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
	tcp_assign_congestion_control(sk);

	tp->tsoffset = 0;
	tp->rack.reo_wnd_steps = 1;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_sync_mss = tcp_sync_mss;

	sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];

	sk_sockets_allocated_inc(sk);
	sk->sk_route_forced_caps = NETIF_F_GSO;
}
EXPORT_SYMBOL(tcp_init_sock);

static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
{
	struct sk_buff *skb = tcp_write_queue_tail(sk);

	if (tsflags && skb) {
		struct skb_shared_info *shinfo = skb_shinfo(skb);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

		sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
		if (tsflags & SOF_TIMESTAMPING_TX_ACK)
			tcb->txstamp_ack = 1;
		if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
			shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
	}
}

static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
					  int target, struct sock *sk)
{
	return (tp->rcv_nxt - tp->copied_seq >= target) ||
		(sk->sk_prot->stream_memory_read ?
		sk->sk_prot->stream_memory_read(sk) : false);
}

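/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we don't
 *	go looking at any of the socket buffers directly.
 */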
__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	__poll_t mask;
	struct sock *sk = sock->sk;
	const struct tcp_sock *tp = tcp_sk(sk);
	int state;

	sock_poll_wait(file, sock, wait);

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		return inet_csk_listen_poll(sk);

	/* Socket is not locked. We are protected from async events
	 * by poll logic and correct handling of state changes
	 * made by other threads is impossible in any case.
	 */
	mask = 0;

	/* EPOLLHUP is certainly not done right. But poll() doesn't
	 * have a notion of HUP in just one direction, and for a
	 * socket the read side is more interesting. It is safer to
	 * return too many bits than too few; applications may rely
	 * on being told that something has hung up.
	 */
	if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
		mask |= EPOLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

	/* Connected or passive Fast Open socket? */
	if (state != TCP_SYN_SENT &&
	    (state != TCP_SYN_RECV || tp->fastopen_rsk)) {
		int target = sock_rcvlowat(sk, 0, INT_MAX);

		if (tp->urg_seq == tp->copied_seq &&
		    !sock_flag(sk, SOCK_URGINLINE) &&
		    tp->urg_data)
			target++;

		if (tcp_stream_is_readable(tp, target, sk))
			mask |= EPOLLIN | EPOLLRDNORM;

		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
			if (sk_stream_is_writeable(sk)) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {  /* send SIGIO later */
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker. If space is freed after
				 * wspace test but before the flags are set,
				 * IO signal will be lost. Memory barrier
				 * pairs with the input side.
				 */
				smp_mb__after_atomic();
				if (sk_stream_is_writeable(sk))
					mask |= EPOLLOUT | EPOLLWRNORM;
			}
		} else
			mask |= EPOLLOUT | EPOLLWRNORM;

		if (tp->urg_data & TCP_URG_VALID)
			mask |= EPOLLPRI;
	} else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
		/* Active TCP fastopen socket with defer_connect
		 * Return EPOLLOUT so application can call write()
		 * in order for kernel to generate SYN+data
		 */
		mask |= EPOLLOUT | EPOLLWRNORM;
	}
	/* This barrier is coupled with smp_wmb() in tcp_reset() */
	smp_rmb();
	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
		mask |= EPOLLERR;

	return mask;
}
EXPORT_SYMBOL(tcp_poll);

int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int answ;
	bool slow;

	switch (cmd) {
	case SIOCINQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		slow = lock_sock_fast(sk);
		answ = tcp_inq(sk);
		unlock_sock_fast(sk, slow);
		break;
	case SIOCATMARK:
		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
		break;
	case SIOCOUTQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = tp->write_seq - tp->snd_una;
		break;
	case SIOCOUTQNSD:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = tp->write_seq - tp->snd_nxt;
		break;
	default:
		return -ENOIOCTLCMD;
	}

	return put_user(answ, (int __user *)arg);
}
EXPORT_SYMBOL(tcp_ioctl);

static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
	TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
	tp->pushed_seq = tp->write_seq;
}

static inline bool forced_push(const struct tcp_sock *tp)
{
	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}

static void skb_entail(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

	skb->csum = 0;
	tcb->seq = tcb->end_seq = tp->write_seq;
	tcb->tcp_flags = TCPHDR_ACK;
	tcb->sacked = 0;
	__skb_header_release(skb);
	tcp_add_write_queue_tail(sk, skb);
	sk->sk_wmem_queued += skb->truesize;
	sk_mem_charge(sk, skb->truesize);
	if (tp->nonagle & TCP_NAGLE_PUSH)
		tp->nonagle &= ~TCP_NAGLE_PUSH;

	tcp_slow_start_after_idle_check(sk);
}

static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
{
	if (flags & MSG_OOB)
		tp->snd_up = tp->write_seq;
}

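/* If a not yet transmitted skb might benefit from autocorking (it is
 * still below size_goal and earlier transmits are pending, as seen
 * from sk_wmem_alloc and a non-empty rtx queue), hold it back and let
 * the pending TX completion push a fuller packet later.
 */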
static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
				int size_goal)
{
	return skb->len < size_goal &&
	       sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
	       !tcp_rtx_queue_empty(sk) &&
	       refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
}

static void tcp_push(struct sock *sk, int flags, int mss_now,
		     int nonagle, int size_goal)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	skb = tcp_write_queue_tail(sk);
	if (!skb)
		return;
	if (!(flags & MSG_MORE) || forced_push(tp))
		tcp_mark_push(tp, skb);

	tcp_mark_urg(tp, flags);

	if (tcp_should_autocork(sk, skb, size_goal)) {

		/* avoid atomic op if TSQ_THROTTLED bit is already set */
		if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
			set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
		}
		/* It is possible TX completion already happened
		 * before we set TSQ_THROTTLED.
		 */
		if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
			return;
	}

	if (flags & MSG_MORE)
		nonagle = TCP_NAGLE_CORK;

	__tcp_push_pending_frames(sk, mss_now, nonagle);
}

static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
				unsigned int offset, size_t len)
{
	struct tcp_splice_state *tss = rd_desc->arg.data;
	int ret;

	ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
			      min(rd_desc->count, len), tss->flags);
	if (ret > 0)
		rd_desc->count -= ret;
	return ret;
}

static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
{
	/* Store TCP splice context information in read_descriptor_t. */
	read_descriptor_t rd_desc = {
		.arg.data = tss,
		.count	  = tss->len,
	};

	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
}

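/**
 *  tcp_splice_read - splice data from TCP socket to a pipe
 *  @sock:	socket to splice from
 *  @ppos:	position (not valid)
 *  @pipe:	pipe to splice to
 *  @len:	number of bytes to splice
 *  @flags:	splice modifier flags
 *
 *  Description:
 *    Will read pages from given socket and fill them into a pipe.
 *
 **/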
ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
			struct pipe_inode_info *pipe, size_t len,
			unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct tcp_splice_state tss = {
		.pipe = pipe,
		.len = len,
		.flags = flags,
	};
	long timeo;
	ssize_t spliced;
	int ret;

	sock_rps_record_flow(sk);
	/*
	 * We can't seek on a socket input
	 */
	if (unlikely(*ppos))
		return -ESPIPE;

	ret = spliced = 0;

	lock_sock(sk);

	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
	while (tss.len) {
		ret = __tcp_splice_read(sk, &tss);
		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (sock_flag(sk, SOCK_DONE))
				break;
			if (sk->sk_err) {
				ret = sock_error(sk);
				break;
			}
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;
			if (sk->sk_state == TCP_CLOSE) {
				/*
				 * This occurs when user tries to read
				 * from never connected socket.
				 */
				ret = -ENOTCONN;
				break;
			}
			if (!timeo) {
				ret = -EAGAIN;
				break;
			}
			/* if __tcp_splice_read() got nothing while we have
			 * an skb in receive queue, we do not want to loop.
			 * This might happen with URG data.
			 */
			if (!skb_queue_empty(&sk->sk_receive_queue))
				break;
			sk_wait_data(sk, &timeo, NULL);
			if (signal_pending(current)) {
				ret = sock_intr_errno(timeo);
				break;
			}
			continue;
		}
		tss.len -= ret;
		spliced += ret;

		if (!timeo)
			break;
		release_sock(sk);
		lock_sock(sk);

		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current))
			break;
	}

	release_sock(sk);

	if (spliced)
		return spliced;

	return ret;
}
EXPORT_SYMBOL(tcp_splice_read);

struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
				    bool force_schedule)
{
	struct sk_buff *skb;

	if (likely(!size)) {
		skb = sk->sk_tx_skb_cache;
		if (skb) {
			skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
			sk->sk_tx_skb_cache = NULL;
			pskb_trim(skb, 0);
			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
			skb_shinfo(skb)->tx_flags = 0;
			memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb));
			return skb;
		}
	}
	/* The TCP header must be at least 32-bit aligned.  */
	size = ALIGN(size, 4);

	if (unlikely(tcp_under_memory_pressure(sk)))
		sk_mem_reclaim_partial(sk);

	skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
	if (likely(skb)) {
		bool mem_scheduled;

		if (force_schedule) {
			mem_scheduled = true;
			sk_forced_mem_schedule(sk, skb->truesize);
		} else {
			mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
		}
		if (likely(mem_scheduled)) {
			skb_reserve(skb, sk->sk_prot->max_header);
			/*
			 * Make sure that we have exactly size bytes
			 * available to the caller, no more, no less.
			 */
			skb->reserved_tailroom = skb->end - skb->tail - size;
			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
			return skb;
		}
		__kfree_skb(skb);
	} else {
		sk->sk_prot->enter_memory_pressure(sk);
		sk_stream_moderate_sndbuf(sk);
	}
	return NULL;
}

static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
				       int large_allowed)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 new_size_goal, size_goal;

	if (!large_allowed)
		return mss_now;

	/* Note : tcp_tso_autosize() will eventually split this skb
	 * in sub-mss pieces for later retransmits */
	new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
	new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);

	/* We try hard to avoid divides here */
	size_goal = tp->gso_segs * mss_now;
	if (unlikely(new_size_goal < size_goal ||
		     new_size_goal >= size_goal + mss_now)) {
		tp->gso_segs = min_t(u16, new_size_goal / mss_now,
				     sk->sk_gso_max_segs);
		size_goal = tp->gso_segs * mss_now;
	}

	return max(size_goal, mss_now);
}

static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
{
	int mss_now;

	mss_now = tcp_current_mss(sk);
	*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));

	return mss_now;
}

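/* In some cases, both sendpage() and sendmsg() could have added
 * an skb to the write queue, but failed adding payload on it.
 * We need to remove it to consume less memory, but more
 * importantly be able to generate EPOLLOUT for Edge Triggered
 * epoll(7) users.
 */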
static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
{
	if (skb && !skb->len) {
		tcp_unlink_write_queue(skb, sk);
		if (tcp_write_queue_empty(sk))
			tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
		sk_wmem_free_skb(sk, skb);
	}
}

ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
			 size_t size, int flags)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int mss_now, size_goal;
	int err;
	ssize_t copied;
	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	if (IS_ENABLED(CONFIG_DEBUG_VM) &&
	    WARN_ONCE(PageSlab(page), "page must not be a Slab one"))
		return -EINVAL;

	/* Wait for a connection to finish. One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before a connection
	 * is fully established.
	 */
	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		err = sk_stream_wait_connect(sk, &timeo);
		if (err != 0)
			goto out_err;
	}

	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	mss_now = tcp_send_mss(sk, &size_goal, flags);
	copied = 0;

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto out_err;

	while (size > 0) {
		struct sk_buff *skb = tcp_write_queue_tail(sk);
		int copy, i;
		bool can_coalesce;

		if (!skb || (copy = size_goal - skb->len) <= 0 ||
		    !tcp_skb_can_collapse_to(skb)) {
new_segment:
			if (!sk_stream_memory_free(sk))
				goto wait_for_sndbuf;

			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
					tcp_rtx_and_write_queues_empty(sk));
			if (!skb)
				goto wait_for_memory;

#ifdef CONFIG_TLS_DEVICE
			skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
#endif
			skb_entail(sk, skb);
			copy = size_goal;
		}

		if (copy > size)
			copy = size;

		i = skb_shinfo(skb)->nr_frags;
		can_coalesce = skb_can_coalesce(skb, i, page, offset);
		if (!can_coalesce && i >= sysctl_max_skb_frags) {
			tcp_mark_push(tp, skb);
			goto new_segment;
		}
		if (!sk_wmem_schedule(sk, copy))
			goto wait_for_memory;

		if (can_coalesce) {
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
		} else {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, copy);
		}

		if (!(flags & MSG_NO_SHARED_FRAGS))
			skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;

		skb->len += copy;
		skb->data_len += copy;
		skb->truesize += copy;
		sk->sk_wmem_queued += copy;
		sk_mem_charge(sk, copy);
		skb->ip_summed = CHECKSUM_PARTIAL;
		tp->write_seq += copy;
		TCP_SKB_CB(skb)->end_seq += copy;
		tcp_skb_pcount_set(skb, 0);

		if (!copied)
			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

		copied += copy;
		offset += copy;
		size -= copy;
		if (!size)
			goto out;

		if (skb->len < size_goal || (flags & MSG_OOB))
			continue;

		if (forced_push(tp)) {
			tcp_mark_push(tp, skb);
			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
		} else if (skb == tcp_send_head(sk))
			tcp_push_one(sk, mss_now);
		continue;

wait_for_sndbuf:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
		tcp_push(sk, flags & ~MSG_MORE, mss_now,
			 TCP_NAGLE_PUSH, size_goal);

		err = sk_stream_wait_memory(sk, &timeo);
		if (err != 0)
			goto do_error;

		mss_now = tcp_send_mss(sk, &size_goal, flags);
	}

out:
	if (copied) {
		tcp_tx_timestamp(sk, sk->sk_tsflags);
		if (!(flags & MSG_SENDPAGE_NOTLAST))
			tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
	}
	return copied;

do_error:
	tcp_remove_empty_skb(sk, tcp_write_queue_tail(sk));
	if (copied)
		goto out;
out_err:
	/* make sure we wake any epoll edge trigger waiter */
	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
		     err == -EAGAIN)) {
		sk->sk_write_space(sk);
		tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
	}
	return sk_stream_error(sk, flags, err);
}
EXPORT_SYMBOL_GPL(do_tcp_sendpages);

int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
			size_t size, int flags)
{
	if (!(sk->sk_route_caps & NETIF_F_SG))
		return sock_no_sendpage_locked(sk, page, offset, size, flags);

	tcp_rate_check_app_limited(sk); /* is sending application-limited? */

	return do_tcp_sendpages(sk, page, offset, size, flags);
}
EXPORT_SYMBOL_GPL(tcp_sendpage_locked);

int tcp_sendpage(struct sock *sk, struct page *page, int offset,
		 size_t size, int flags)
{
	int ret;

	lock_sock(sk);
	ret = tcp_sendpage_locked(sk, page, offset, size, flags);
	release_sock(sk);

	return ret;
}
EXPORT_SYMBOL(tcp_sendpage);

void tcp_free_fastopen_req(struct tcp_sock *tp)
{
	if (tp->fastopen_req) {
		kfree(tp->fastopen_req);
		tp->fastopen_req = NULL;
	}
}

static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
				int *copied, size_t size,
				struct ubuf_info *uarg)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_sock *inet = inet_sk(sk);
	struct sockaddr *uaddr = msg->msg_name;
	int err, flags;

	if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
	    (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
	     uaddr->sa_family == AF_UNSPEC))
		return -EOPNOTSUPP;
	if (tp->fastopen_req)
		return -EALREADY; /* Another Fast Open is in progress */

	tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
				   sk->sk_allocation);
	if (unlikely(!tp->fastopen_req))
		return -ENOBUFS;
	tp->fastopen_req->data = msg;
	tp->fastopen_req->size = size;
	tp->fastopen_req->uarg = uarg;

	if (inet->defer_connect) {
		err = tcp_connect(sk);
		/* Same failure procedure as in tcp_v4/6_connect */
		if (err) {
			tcp_set_state(sk, TCP_CLOSE);
			inet->inet_dport = 0;
			sk->sk_route_caps = 0;
		}
	}
	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
	err = __inet_stream_connect(sk->sk_socket, uaddr,
				    msg->msg_namelen, flags, 1);
	/* fastopen_req could already be freed in __inet_stream_connect
	 * if the connection times out or gets rst
	 */
	if (tp->fastopen_req) {
		*copied = tp->fastopen_req->copied;
		tcp_free_fastopen_req(tp);
		inet->defer_connect = 0;
	}
	return err;
}

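/* Copy data from msg into the socket write queue, allocating skbs and
 * page frags as needed, honouring MSG_ZEROCOPY, Fast Open and repair
 * mode, and pushing frames out according to Nagle/autocork rules.
 * Caller must hold the socket lock.
 */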
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct ubuf_info *uarg = NULL;
	struct sk_buff *skb;
	struct sockcm_cookie sockc;
	int flags, err, copied = 0;
	int mss_now = 0, size_goal, copied_syn = 0;
	bool process_backlog = false;
	bool zc = false;
	long timeo;

	flags = msg->msg_flags;

	if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
		skb = tcp_write_queue_tail(sk);
		uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
		if (!uarg) {
			err = -ENOBUFS;
			goto out_err;
		}

		zc = sk->sk_route_caps & NETIF_F_SG;
		if (!zc)
			uarg->zerocopy = 0;
	}

	if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
	    !tp->repair) {
		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg);
		if (err == -EINPROGRESS && copied_syn > 0)
			goto out;
		else if (err)
			goto out_err;
	}

	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	tcp_rate_check_app_limited(sk); /* is sending application-limited? */

	/* Wait for a connection to finish. One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before a connection
	 * is fully established.
	 */
	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		err = sk_stream_wait_connect(sk, &timeo);
		if (err != 0)
			goto do_error;
	}

	if (unlikely(tp->repair)) {
		if (tp->repair_queue == TCP_RECV_QUEUE) {
			copied = tcp_send_rcvq(sk, msg, size);
			goto out_nopush;
		}

		err = -EINVAL;
		if (tp->repair_queue == TCP_NO_QUEUE)
			goto out_err;

		/* 'common' sending to sendq */
	}

	sockcm_init(&sockc, sk);
	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (unlikely(err)) {
			err = -EINVAL;
			goto out_err;
		}
	}

	/* This should be in poll */
	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	/* Ok commence sending. */
	copied = 0;

restart:
	mss_now = tcp_send_mss(sk, &size_goal, flags);

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto do_error;

	while (msg_data_left(msg)) {
		int copy = 0;

		skb = tcp_write_queue_tail(sk);
		if (skb)
			copy = size_goal - skb->len;

		if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
			bool first_skb;

new_segment:
			if (!sk_stream_memory_free(sk))
				goto wait_for_sndbuf;

			if (process_backlog && sk_flush_backlog(sk)) {
				process_backlog = false;
				goto restart;
			}
			first_skb = tcp_rtx_and_write_queues_empty(sk);
			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
						  first_skb);
			if (!skb)
				goto wait_for_memory;

			process_backlog = true;
			skb->ip_summed = CHECKSUM_PARTIAL;

			skb_entail(sk, skb);
			copy = size_goal;

			/* All packets are restored as if they have
			 * already been sent. skb_mss() might be non-zero.
			 */
			if (tp->repair)
				TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
		}

		/* Try to append data to the end of skb. */
		if (copy > msg_data_left(msg))
			copy = msg_data_left(msg);

		/* Where to copy to? */
		if (skb_availroom(skb) > 0 && !zc) {
			/* We have some space in skb head. Superb! */
			copy = min_t(int, copy, skb_availroom(skb));
			err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
			if (err)
				goto do_fault;
		} else if (!zc) {
			bool merge = true;
			int i = skb_shinfo(skb)->nr_frags;
			struct page_frag *pfrag = sk_page_frag(sk);

			if (!sk_page_frag_refill(sk, pfrag))
				goto wait_for_memory;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				if (i >= sysctl_max_skb_frags) {
					tcp_mark_push(tp, skb);
					goto new_segment;
				}
				merge = false;
			}

			copy = min_t(int, copy, pfrag->size - pfrag->offset);

			if (!sk_wmem_schedule(sk, copy))
				goto wait_for_memory;

			err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
						       pfrag->page,
						       pfrag->offset,
						       copy);
			if (err)
				goto do_error;

			/* Update the skb. */
			if (merge) {
				skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			} else {
				skb_fill_page_desc(skb, i, pfrag->page,
						   pfrag->offset, copy);
				page_ref_inc(pfrag->page);
			}
			pfrag->offset += copy;
		} else {
			err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
			if (err == -EMSGSIZE || err == -EEXIST) {
				tcp_mark_push(tp, skb);
				goto new_segment;
			}
			if (err < 0)
				goto do_error;
			copy = err;
		}

		if (!copied)
			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

		tp->write_seq += copy;
		TCP_SKB_CB(skb)->end_seq += copy;
		tcp_skb_pcount_set(skb, 0);

		copied += copy;
		if (!msg_data_left(msg)) {
			if (unlikely(flags & MSG_EOR))
				TCP_SKB_CB(skb)->eor = 1;
			goto out;
		}

		if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair))
			continue;

		if (forced_push(tp)) {
			tcp_mark_push(tp, skb);
			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
		} else if (skb == tcp_send_head(sk))
			tcp_push_one(sk, mss_now);
		continue;

wait_for_sndbuf:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
		if (copied)
			tcp_push(sk, flags & ~MSG_MORE, mss_now,
				 TCP_NAGLE_PUSH, size_goal);

		err = sk_stream_wait_memory(sk, &timeo);
		if (err != 0)
			goto do_error;

		mss_now = tcp_send_mss(sk, &size_goal, flags);
	}

out:
	if (copied) {
		tcp_tx_timestamp(sk, sockc.tsflags);
		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
	}
out_nopush:
	sock_zerocopy_put(uarg);
	return copied + copied_syn;

do_error:
	skb = tcp_write_queue_tail(sk);
do_fault:
	tcp_remove_empty_skb(sk, skb);

	if (copied + copied_syn)
		goto out;
out_err:
	sock_zerocopy_put_abort(uarg, true);
	err = sk_stream_error(sk, flags, err);
	/* make sure we wake any epoll edge trigger waiter */
	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
		     err == -EAGAIN)) {
		sk->sk_write_space(sk);
		tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
	}
	return err;
}
EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);

int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
	int ret;

	lock_sock(sk);
	ret = tcp_sendmsg_locked(sk, msg, size);
	release_sock(sk);

	return ret;
}
EXPORT_SYMBOL(tcp_sendmsg);

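/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 */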
static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* No URG data to read. */
	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
	    tp->urg_data == TCP_URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
		return -ENOTCONN;

	if (tp->urg_data & TCP_URG_VALID) {
		int err = 0;
		char c = tp->urg_data;

		if (!(flags & MSG_PEEK))
			tp->urg_data = TCP_URG_READ;

		/* Read urgent data. */
		msg->msg_flags |= MSG_OOB;

		if (len > 0) {
			if (!(flags & MSG_TRUNC))
				err = memcpy_to_msg(msg, &c, 1);
			len = 1;
		} else
			msg->msg_flags |= MSG_TRUNC;

		return err ? -EFAULT : len;
	}

	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
		return 0;

	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 */
	return -EAGAIN;
}

static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
{
	struct sk_buff *skb;
	int copied = 0, err = 0;

	/* XXX -- need to support SO_PEEK_OFF */

	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
		if (err)
			return err;
		copied += skb->len;
	}

	skb_queue_walk(&sk->sk_write_queue, skb) {
		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
		if (err)
			break;

		copied += skb->len;
	}

	return err ?: copied;
}

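/* Clean up the receive buffer for full frames taken by the user,
 * then send an ACK if necessary.  COPIED is the number of bytes
 * tcp_recvmsg has given to the user so far, it speeds up the
 * calculation of whether or not we must ACK for the sake of
 * a window update.
 */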
static void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
	struct tcp_sock *tp = tcp_sk(sk);
	bool time_to_ack = false;

	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);

	if (inet_csk_ack_scheduled(sk)) {
		const struct inet_connection_sock *icsk = inet_csk(sk);
		   /* Delayed ACKs frequently hit locked sockets during bulk
		    * receive. */
		if (icsk->icsk_ack.blocked ||
		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
		    /*
		     * If this read emptied read buffer, we send ACK, if
		     * connection is not bidirectional, user drained
		     * receive buffer and there was a small segment
		     * in queue.
		     */
		    (copied > 0 &&
		     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
		      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
		       !inet_csk_in_pingpong_mode(sk))) &&
		      !atomic_read(&sk->sk_rmem_alloc)))
			time_to_ack = true;
	}

	/* Even if we did not copy enough to force an ACK above, a read
	 * that opened the advertised window significantly is still worth
	 * announcing, unless no more data can be received anyway.
	 */
	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
		__u32 rcv_window_now = tcp_receive_window(tp);

		/* Optimize, __tcp_select_window() is not cheap. */
		if (2*rcv_window_now <= tp->window_clamp) {
			__u32 new_window = __tcp_select_window(sk);

			/* Send ACK now, if this read freed lots of space
			 * in our buffer. Certainly, new_window is new window.
			 * We can advertise it now, if it is not less than
			 * current one. "Lots" means "at least twice" here.
			 */
			if (new_window && new_window >= 2 * rcv_window_now)
				time_to_ack = true;
		}
	}
	if (time_to_ack)
		tcp_send_ack(sk);
}

static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
	struct sk_buff *skb;
	u32 offset;

	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
		offset = seq - TCP_SKB_CB(skb)->seq;
		if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
			pr_err_once("%s: found a SYN, please report !\n", __func__);
			offset--;
		}
		if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
			*off = offset;
			return skb;
		}
		/* This looks weird, but this can happen if TCP collapsing
		 * splitted a fat GRO packet, while we released socket lock
		 * in skb_splice_bits()
		 */
		sk_eat_skb(sk, skb);
	}
	return NULL;
}

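/*
 * This routine provides an alternative to tcp_recvmsg() for routines
 * that would like to handle copying from skbuffs directly in 'sendfile'
 * fashion.
 * Note:
 *	- It is assumed that the socket was locked by the caller.
 *	- The routine does not block.
 *	- At present, there is no support for reading OOB data
 *	  or for 'peeking' the socket using this routine
 *	  (although both would be easy to implement).
 */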
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
		  sk_read_actor_t recv_actor)
{
	struct sk_buff *skb;
	struct tcp_sock *tp = tcp_sk(sk);
	u32 seq = tp->copied_seq;
	u32 offset;
	int copied = 0;

	if (sk->sk_state == TCP_LISTEN)
		return -ENOTCONN;
	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
		if (offset < skb->len) {
			int used;
			size_t len;

			len = skb->len - offset;
			/* Stop reading if we hit a patch of urgent data */
			if (tp->urg_data) {
				u32 urg_offset = tp->urg_seq - seq;
				if (urg_offset < len)
					len = urg_offset;
				if (!len)
					break;
			}
			used = recv_actor(desc, skb, offset, len);
			if (used <= 0) {
				if (!copied)
					copied = used;
				break;
			} else if (used <= len) {
				seq += used;
				copied += used;
				offset += used;
			}
			/* If recv_actor drops the lock (e.g. TCP splice
			 * receive) the skb pointer might be invalid when
			 * getting here: tcp_collapse might have deleted it
			 * while aggregating skbs from the socket queue.
			 */
			skb = tcp_recv_skb(sk, seq - 1, &offset);
			if (!skb)
				break;
			/* TCP coalescing might have appended data to the skb.
			 * Try to splice more frags
			 */
			if (offset + 1 != skb->len)
				continue;
		}
		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
			sk_eat_skb(sk, skb);
			++seq;
			break;
		}
		sk_eat_skb(sk, skb);
		if (!desc->count)
			break;
		tp->copied_seq = seq;
	}
	tp->copied_seq = seq;

	tcp_rcv_space_adjust(sk);

	/* Clean up data we have read: This will do ACK frames. */
	if (copied > 0) {
		tcp_recv_skb(sk, seq, &offset);
		tcp_cleanup_rbuf(sk, copied);
	}
	return copied;
}
EXPORT_SYMBOL(tcp_read_sock);

int tcp_peek_len(struct socket *sock)
{
	return tcp_inq(sock->sk);
}
EXPORT_SYMBOL(tcp_peek_len);

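/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */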
int tcp_set_rcvlowat(struct sock *sk, int val)
{
	int cap;

	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
		cap = sk->sk_rcvbuf >> 1;
	else
		cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1;
	val = min(val, cap);
	sk->sk_rcvlowat = val ? : 1;

	/* Check if we need to signal EPOLLIN right now */
	tcp_data_ready(sk);

	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
		return 0;

	val <<= 1;
	if (val > sk->sk_rcvbuf) {
		sk->sk_rcvbuf = val;
		tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
	}
	return 0;
}
EXPORT_SYMBOL(tcp_set_rcvlowat);

#ifdef CONFIG_MMU
static const struct vm_operations_struct tcp_vm_ops = {
};

int tcp_mmap(struct file *file, struct socket *sock,
	     struct vm_area_struct *vma)
{
	if (vma->vm_flags & (VM_WRITE | VM_EXEC))
		return -EPERM;
	vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);

	/* Instruct vm_insert_page() to not down_read(mmap_sem) */
	vma->vm_flags |= VM_MIXEDMAP;

	vma->vm_ops = &tcp_vm_ops;
	return 0;
}
EXPORT_SYMBOL(tcp_mmap);

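/* Receive zerocopy: map the page-aligned, page-sized frags of the
 * receive queue directly into the caller's VMA instead of copying.
 * recv_skip_hint tells user space how many bytes must still be
 * fetched with a regular recvmsg().
 */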
static int tcp_zerocopy_receive(struct sock *sk,
				struct tcp_zerocopy_receive *zc)
{
	unsigned long address = (unsigned long)zc->address;
	const skb_frag_t *frags = NULL;
	u32 length = 0, seq, offset;
	struct vm_area_struct *vma;
	struct sk_buff *skb = NULL;
	struct tcp_sock *tp;
	int inq;
	int ret;

	if (address & (PAGE_SIZE - 1) || address != zc->address)
		return -EINVAL;

	if (sk->sk_state == TCP_LISTEN)
		return -ENOTCONN;

	sock_rps_record_flow(sk);

	down_read(&current->mm->mmap_sem);

	ret = -EINVAL;
	vma = find_vma(current->mm, address);
	if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops)
		goto out;
	zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);

	tp = tcp_sk(sk);
	seq = tp->copied_seq;
	inq = tcp_inq(sk);
	zc->length = min_t(u32, zc->length, inq);
	zc->length &= ~(PAGE_SIZE - 1);
	if (zc->length) {
		zap_page_range(vma, address, zc->length);
		zc->recv_skip_hint = 0;
	} else {
		zc->recv_skip_hint = inq;
	}
	ret = 0;
	while (length + PAGE_SIZE <= zc->length) {
		if (zc->recv_skip_hint < PAGE_SIZE) {
			if (skb) {
				skb = skb->next;
				offset = seq - TCP_SKB_CB(skb)->seq;
			} else {
				skb = tcp_recv_skb(sk, seq, &offset);
			}

			zc->recv_skip_hint = skb->len - offset;
			offset -= skb_headlen(skb);
			if ((int)offset < 0 || skb_has_frag_list(skb))
				break;
			frags = skb_shinfo(skb)->frags;
			while (offset) {
				if (frags->size > offset)
					goto out;
				offset -= frags->size;
				frags++;
			}
		}
		if (frags->size != PAGE_SIZE || frags->page_offset) {
			int remaining = zc->recv_skip_hint;

			while (remaining && (frags->size != PAGE_SIZE ||
					     frags->page_offset)) {
				remaining -= frags->size;
				frags++;
			}
			zc->recv_skip_hint -= remaining;
			break;
		}
		ret = vm_insert_page(vma, address + length,
				     skb_frag_page(frags));
		if (ret)
			break;
		length += PAGE_SIZE;
		seq += PAGE_SIZE;
		zc->recv_skip_hint -= PAGE_SIZE;
		frags++;
	}
out:
	up_read(&current->mm->mmap_sem);
	if (length) {
		tp->copied_seq = seq;
		tcp_rcv_space_adjust(sk);

		/* Clean up data we have read: This will do ACK frames. */
		tcp_recv_skb(sk, seq, &offset);
		tcp_cleanup_rbuf(sk, length);
		ret = 0;
		if (length == zc->length)
			zc->recv_skip_hint = 0;
	} else {
		if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
			ret = -EIO;
	}
	zc->length = length;
	return ret;
}
#endif

static void tcp_update_recv_tstamps(struct sk_buff *skb,
				    struct scm_timestamping_internal *tss)
{
	if (skb->tstamp)
		tss->ts[0] = ktime_to_timespec64(skb->tstamp);
	else
		tss->ts[0] = (struct timespec64) {0};

	if (skb_hwtstamps(skb)->hwtstamp)
		tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
	else
		tss->ts[2] = (struct timespec64) {0};
}

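/* Similar to __sock_recv_timestamp, but does not require an skb */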
static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
			       struct scm_timestamping_internal *tss)
{
	int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
	bool has_timestamping = false;

	if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
		if (sock_flag(sk, SOCK_RCVTSTAMP)) {
			if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
				if (new_tstamp) {
					struct __kernel_timespec kts = {tss->ts[0].tv_sec, tss->ts[0].tv_nsec};

					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
						 sizeof(kts), &kts);
				} else {
					struct timespec ts_old = timespec64_to_timespec(tss->ts[0]);

					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
						 sizeof(ts_old), &ts_old);
				}
			} else {
				if (new_tstamp) {
					struct __kernel_sock_timeval stv;

					stv.tv_sec = tss->ts[0].tv_sec;
					stv.tv_usec = tss->ts[0].tv_nsec / 1000;
					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
						 sizeof(stv), &stv);
				} else {
					struct __kernel_old_timeval tv;

					tv.tv_sec = tss->ts[0].tv_sec;
					tv.tv_usec = tss->ts[0].tv_nsec / 1000;
					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
						 sizeof(tv), &tv);
				}
			}
		}

		if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
			has_timestamping = true;
		else
			tss->ts[0] = (struct timespec64) {0};
	}

	if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
		if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
			has_timestamping = true;
		else
			tss->ts[2] = (struct timespec64) {0};
	}

	if (has_timestamping) {
		tss->ts[1] = (struct timespec64) {0};
		if (sock_flag(sk, SOCK_TSTAMP_NEW))
			put_cmsg_scm_timestamping64(msg, tss);
		else
			put_cmsg_scm_timestamping(msg, tss);
	}
}

static int tcp_inq_hint(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 copied_seq = READ_ONCE(tp->copied_seq);
	u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
	int inq;

	inq = rcv_nxt - copied_seq;
	if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) {
		lock_sock(sk);
		inq = tp->rcv_nxt - tp->copied_seq;
		release_sock(sk);
	}
	/* After receiving a FIN, tell the user-space to continue reading
	 * by returning a non-zero inq.
	 */
	if (inq == 0 && sock_flag(sk, SOCK_DONE))
		inq = 1;
	return inq;
}

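/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	Technical note: in 2.3 we work on _locked_ socket, so that
 *	tricks with *seq access order and skb->users are not required.
 *	Probably, code can be easily improved even more.
 */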
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
		int flags, int *addr_len)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int copied = 0;
	u32 peek_seq;
	u32 *seq;
	unsigned long used;
	int err, inq;
	int target;		/* Read at least this many bytes */
	long timeo;
	struct sk_buff *skb, *last;
	u32 urg_hole = 0;
	struct scm_timestamping_internal tss;
	bool has_tss = false;
	bool has_cmsg;

	if (unlikely(flags & MSG_ERRQUEUE))
		return inet_recv_error(sk, msg, len, addr_len);

	if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
	    (sk->sk_state == TCP_ESTABLISHED))
		sk_busy_loop(sk, nonblock);

	lock_sock(sk);

	err = -ENOTCONN;
	if (sk->sk_state == TCP_LISTEN)
		goto out;

	has_cmsg = tp->recvmsg_inq;
	timeo = sock_rcvtimeo(sk, nonblock);

	/* Urgent data needs to be handled specially. */
	if (flags & MSG_OOB)
		goto recv_urg;

	if (unlikely(tp->repair)) {
		err = -EPERM;
		if (!(flags & MSG_PEEK))
			goto out;

		if (tp->repair_queue == TCP_SEND_QUEUE)
			goto recv_sndq;

		err = -EINVAL;
		if (tp->repair_queue == TCP_NO_QUEUE)
			goto out;

		/* 'common' recv queue MSG_PEEK-ing */
	}

	seq = &tp->copied_seq;
	if (flags & MSG_PEEK) {
		peek_seq = tp->copied_seq;
		seq = &peek_seq;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);

	do {
		u32 offset;

		/* Are we at urgent data? Stop if we have read anything or
		 * have SIGURG pending.
		 */
		if (tp->urg_data && tp->urg_seq == *seq) {
			if (copied)
				break;
			if (signal_pending(current)) {
				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
				break;
			}
		}

		/* Next get a buffer. */

		last = skb_peek_tail(&sk->sk_receive_queue);
		skb_queue_walk(&sk->sk_receive_queue, skb) {
			last = skb;
			/* Now that we have two receive queues this
			 * shouldn't happen.
			 */
			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
				 "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
				 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
				 flags))
				break;

			offset = *seq - TCP_SKB_CB(skb)->seq;
			if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
				pr_err_once("%s: found a SYN, please report !\n", __func__);
				offset--;
			}
			if (offset < skb->len)
				goto found_ok_skb;
			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
				goto found_fin_ok;
			WARN(!(flags & MSG_PEEK),
			     "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
		}

		/* Well, if we have backlog, try to process it now yet. */

		if (copied >= target && !sk->sk_backlog.tail)
			break;

		if (copied) {
			if (sk->sk_err ||
			    sk->sk_state == TCP_CLOSE ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    !timeo ||
			    signal_pending(current))
				break;
		} else {
			if (sock_flag(sk, SOCK_DONE))
				break;

			if (sk->sk_err) {
				copied = sock_error(sk);
				break;
			}

			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;

			if (sk->sk_state == TCP_CLOSE) {
				/* This occurs when user tries to read
				 * from never connected socket.
				 */
				copied = -ENOTCONN;
				break;
			}

			if (!timeo) {
				copied = -EAGAIN;
				break;
			}

			if (signal_pending(current)) {
				copied = sock_intr_errno(timeo);
				break;
			}
		}

		tcp_cleanup_rbuf(sk, copied);

		if (copied >= target) {
			/* Do not sleep, just process backlog. */
			release_sock(sk);
			lock_sock(sk);
		} else {
			sk_wait_data(sk, &timeo, last);
		}

		if ((flags & MSG_PEEK) &&
		    (peek_seq - copied - urg_hole != tp->copied_seq)) {
			net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
					    current->comm,
					    task_pid_nr(current));
			peek_seq = tp->copied_seq;
		}
		continue;

found_ok_skb:
		/* Ok so how much can we use? */
		used = skb->len - offset;
		if (len < used)
			used = len;

		/* Do we have urgent data here? */
		if (tp->urg_data) {
			u32 urg_offset = tp->urg_seq - *seq;
			if (urg_offset < used) {
				if (!urg_offset) {
					if (!sock_flag(sk, SOCK_URGINLINE)) {
						++*seq;
						urg_hole++;
						offset++;
						used--;
						if (!used)
							goto skip_copy;
					}
				} else
					used = urg_offset;
			}
		}

		if (!(flags & MSG_TRUNC)) {
			err = skb_copy_datagram_msg(skb, offset, msg, used);
			if (err) {
				/* Exception. Bailout! */
				if (!copied)
					copied = -EFAULT;
				break;
			}
		}

		*seq += used;
		copied += used;
		len -= used;

		tcp_rcv_space_adjust(sk);

skip_copy:
		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
			tp->urg_data = 0;
			tcp_fast_path_check(sk);
		}
		if (used + offset < skb->len)
			continue;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			tcp_update_recv_tstamps(skb, &tss);
			has_tss = true;
			has_cmsg = true;
		}
		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
			goto found_fin_ok;
		if (!(flags & MSG_PEEK))
			sk_eat_skb(sk, skb);
		continue;

found_fin_ok:
		/* Process the FIN. */
		++*seq;
		if (!(flags & MSG_PEEK))
			sk_eat_skb(sk, skb);
		break;
	} while (len > 0);

	/* According to UNIX98, msg_name/msg_namelen are ignored
	 * on connected socket. I was just happy when found this 8) --ANK
	 */

	/* Clean up data we have read: This will do ACK frames. */
	tcp_cleanup_rbuf(sk, copied);

	release_sock(sk);

	if (has_cmsg) {
		if (has_tss)
			tcp_recv_timestamp(msg, sk, &tss);
		if (tp->recvmsg_inq) {
			inq = tcp_inq_hint(sk);
			put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
		}
	}

	return copied;

out:
	release_sock(sk);
	return err;

recv_urg:
	err = tcp_recv_urg(sk, msg, len, flags);
	goto out;

recv_sndq:
	err = tcp_peek_sndq(sk, msg, len);
	goto out;
}
EXPORT_SYMBOL(tcp_recvmsg);

void tcp_set_state(struct sock *sk, int state)
{
	int oldstate = sk->sk_state;

	/* We defined a new enum for TCP states that are exported in BPF
	 * so as not force the internal TCP states to be frozen. The
	 * following checks will detect if an internal state value ever
	 * differs from the BPF value. If this ever happens, then we will
	 * need to remap the internal value to the BPF value before calling
	 * tcp_call_bpf_2arg.
	 */
	BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
	BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
	BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
	BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
	BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
	BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
	BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
	BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
	BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
	BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
	BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);

	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
		tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);

	switch (state) {
	case TCP_ESTABLISHED:
		if (oldstate != TCP_ESTABLISHED)
			TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
		break;

	case TCP_CLOSE:
		if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
			TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);

		sk->sk_prot->unhash(sk);
		if (inet_csk(sk)->icsk_bind_hash &&
		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
			inet_put_port(sk);
		/* fall through */
	default:
		if (oldstate == TCP_ESTABLISHED)
			TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
	}

	/* Change state AFTER socket is unhashed to avoid closed
	 * socket sitting in hash tables.
	 */
	inet_sk_state_store(sk, state);
}
EXPORT_SYMBOL_GPL(tcp_set_state);

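/*
 *	State processing on a close. This implements the state shift for
 *	sending our FIN frame. Note that we only send a FIN for some
 *	states. A shutdown() may have already sent the FIN, or we may be
 *	closed.
 */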
static const unsigned char new_state[16] = {
  /* current state:        new state:      action:	*/
  [0 /* (Invalid) */]	= TCP_CLOSE,
  [TCP_ESTABLISHED]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  [TCP_SYN_SENT]	= TCP_CLOSE,
  [TCP_SYN_RECV]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  [TCP_FIN_WAIT1]	= TCP_FIN_WAIT1,
  [TCP_FIN_WAIT2]	= TCP_FIN_WAIT2,
  [TCP_TIME_WAIT]	= TCP_CLOSE,
  [TCP_CLOSE]		= TCP_CLOSE,
  [TCP_CLOSE_WAIT]	= TCP_LAST_ACK  | TCP_ACTION_FIN,
  [TCP_LAST_ACK]	= TCP_LAST_ACK,
  [TCP_LISTEN]		= TCP_CLOSE,
  [TCP_CLOSING]		= TCP_CLOSING,
  [TCP_NEW_SYN_RECV]	= TCP_CLOSE,	/* should not happen ! */
};

static int tcp_close_state(struct sock *sk)
{
	int next = (int)new_state[sk->sk_state];
	int ns = next & TCP_STATE_MASK;

	tcp_set_state(sk, ns);

	return next & TCP_ACTION_FIN;
}

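/*
 *	Shutdown the sending side of a connection. Much like close except
 *	that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
 */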
void tcp_shutdown(struct sock *sk, int how)
{
	/*	We need to grab some memory, and put together a FIN,
	 *	and then put it into the queue to be sent.
	 */
	if (!(how & SEND_SHUTDOWN))
		return;

	/* If we've already sent a FIN, or it's a closed state, skip this. */
	if ((1 << sk->sk_state) &
	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
		/* Clear out any half completed packets.  FIN if needed. */
		if (tcp_close_state(sk))
			tcp_send_fin(sk);
	}
}
EXPORT_SYMBOL(tcp_shutdown);

bool tcp_check_oom(struct sock *sk, int shift)
{
	bool too_many_orphans, out_of_socket_memory;

	too_many_orphans = tcp_too_many_orphans(sk, shift);
	out_of_socket_memory = tcp_out_of_memory(sk);

	if (too_many_orphans)
		net_info_ratelimited("too many orphaned sockets\n");
	if (out_of_socket_memory)
		net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
	return too_many_orphans || out_of_socket_memory;
}

void tcp_close(struct sock *sk, long timeout)
{
	struct sk_buff *skb;
	int data_was_unread = 0;
	int state;

	lock_sock(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;

	if (sk->sk_state == TCP_LISTEN) {
		tcp_set_state(sk, TCP_CLOSE);

		/* Special case. */
		inet_csk_listen_stop(sk);

		goto adjudge_to_death;
	}

	/*  We need to flush the recv. buffs.  We do this only on the
	 *  descriptor close, not protocol-sourced closes, because the
	 *  reader process may not have drained the data yet!
	 */
	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;

		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
			len--;
		data_was_unread += len;
		__kfree_skb(skb);
	}

	sk_mem_reclaim(sk);

	/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
	if (sk->sk_state == TCP_CLOSE)
		goto adjudge_to_death;

	/* As outlined in RFC 2525, section 2.17, we send a RST here because
	 * data was lost. To witness the awful effects of the old behavior of
	 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
	 * GET in an FTP client, suspend the process, wait for the client to
	 * advertise a zero window, then kill -9 the FTP client, wheee...
	 * Note: timeout is always zero in such a case.
	 */
	if (unlikely(tcp_sk(sk)->repair)) {
		sk->sk_prot->disconnect(sk, 0);
	} else if (data_was_unread) {
		/* Unread data was tossed, zap the connection. */
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
		tcp_set_state(sk, TCP_CLOSE);
		tcp_send_active_reset(sk, sk->sk_allocation);
	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
		/* Check zero linger _after_ checking for unread data. */
		sk->sk_prot->disconnect(sk, 0);
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
	} else if (tcp_close_state(sk)) {
		/* We FIN if the application ate all the data before
		 * zapping the connection.
		 *
		 * RED-PEN. Formally speaking, we have broken TCP state
		 * machine. State transitions:
		 *
		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
		 *
		 * are legal only when FIN has been sent (i.e. in window),
		 * rather than queued out of window. Purists blame.
		 */
		tcp_send_fin(sk);
	}

	sk_stream_wait_close(sk, timeout);

adjudge_to_death:
	state = sk->sk_state;
	sock_hold(sk);
	sock_orphan(sk);

	local_bh_disable();
	bh_lock_sock(sk);
	/* remove backlog if any, without releasing ownership. */
	__release_sock(sk);

	percpu_counter_inc(sk->sk_prot->orphan_count);

	/* Have we already been destroyed by a softirq or backlog? */
	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
		goto out;

	/*	This is a (useful) BSD violating of the RFC. There is a
	 *	problem with TCP as specified in that the other end could keep
	 *	a socket open forever with no application left this end.
	 *	We use a 1 minute timeout (about the same as BSD) then kill
	 *	our end. Such sockets are useless on http servers, for
	 *	instance, but consume significant resources. Let's do
	 *	some magic...
	 */
	if (sk->sk_state == TCP_FIN_WAIT2) {
		struct tcp_sock *tp = tcp_sk(sk);
		if (tp->linger2 < 0) {
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC);
			__NET_INC_STATS(sock_net(sk),
					LINUX_MIB_TCPABORTONLINGER);
		} else {
			const int tmo = tcp_fin_time(sk);

			if (tmo > TCP_TIMEWAIT_LEN) {
				inet_csk_reset_keepalive_timer(sk,
						tmo - TCP_TIMEWAIT_LEN);
			} else {
				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				goto out;
			}
		}
	}
	if (sk->sk_state != TCP_CLOSE) {
		sk_mem_reclaim(sk);
		if (tcp_check_oom(sk, 0)) {
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC);
			__NET_INC_STATS(sock_net(sk),
					LINUX_MIB_TCPABORTONMEMORY);
		} else if (!check_net(sock_net(sk))) {
			/* Not possible to send reset; just close */
			tcp_set_state(sk, TCP_CLOSE);
		}
	}

	if (sk->sk_state == TCP_CLOSE) {
		struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
		/* We could get here with a non-NULL req if the socket is
		 * aborted (e.g., closed with unread data) before 3WHS
		 * finishes.
		 */
		if (req)
			reqsk_fastopen_remove(sk, req, false);
		inet_csk_destroy_sock(sk);
	}
	/* Otherwise, socket is reprieved until protocol close. */

out:
	bh_unlock_sock(sk);
	local_bh_enable();
	release_sock(sk);
	sock_put(sk);
}
EXPORT_SYMBOL(tcp_close);

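/* These states need RST on ABORT according to RFC793 */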
static inline bool tcp_need_reset(int state)
{
	return (1 << state) &
	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
}

static void tcp_rtx_queue_purge(struct sock *sk)
{
	struct rb_node *p = rb_first(&sk->tcp_rtx_queue);

	while (p) {
		struct sk_buff *skb = rb_to_skb(p);

		p = rb_next(p);
		/* Since we are deleting whole queue, no need to
		 * list_del(&skb->tcp_tsorted_anchor)
		 */
		tcp_rtx_queue_unlink(skb, sk);
		sk_wmem_free_skb(sk, skb);
	}
}

void tcp_write_queue_purge(struct sock *sk)
{
	struct sk_buff *skb;

	tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		tcp_skb_tsorted_anchor_cleanup(skb);
		sk_wmem_free_skb(sk, skb);
	}
	tcp_rtx_queue_purge(sk);
	skb = sk->sk_tx_skb_cache;
	if (skb) {
		__kfree_skb(skb);
		sk->sk_tx_skb_cache = NULL;
	}
	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
	sk_mem_reclaim(sk);
	tcp_clear_all_retrans_hints(tcp_sk(sk));
	tcp_sk(sk)->packets_out = 0;
	inet_csk(sk)->icsk_backoff = 0;
}

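/* Abort the connection if needed and return the socket to a pristine
 * CLOSE state so it can be reused by connect(): flushes all queues and
 * resets congestion state, RTT estimators and repair/fastopen state.
 * Caller must hold the socket lock.
 */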
2555int tcp_disconnect(struct sock *sk, int flags)
2556{
2557 struct inet_sock *inet = inet_sk(sk);
2558 struct inet_connection_sock *icsk = inet_csk(sk);
2559 struct tcp_sock *tp = tcp_sk(sk);
2560 int old_state = sk->sk_state;
2561
2562 if (old_state != TCP_CLOSE)
2563 tcp_set_state(sk, TCP_CLOSE);
2564
2565
2566 if (old_state == TCP_LISTEN) {
2567 inet_csk_listen_stop(sk);
2568 } else if (unlikely(tp->repair)) {
2569 sk->sk_err = ECONNABORTED;
2570 } else if (tcp_need_reset(old_state) ||
2571 (tp->snd_nxt != tp->write_seq &&
2572 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2573
2574
2575
2576 tcp_send_active_reset(sk, gfp_any());
2577 sk->sk_err = ECONNRESET;
2578 } else if (old_state == TCP_SYN_SENT)
2579 sk->sk_err = ECONNRESET;
2580
2581 tcp_clear_xmit_timers(sk);
2582 __skb_queue_purge(&sk->sk_receive_queue);
2583 if (sk->sk_rx_skb_cache) {
2584 __kfree_skb(sk->sk_rx_skb_cache);
2585 sk->sk_rx_skb_cache = NULL;
2586 }
2587 tp->copied_seq = tp->rcv_nxt;
2588 tp->urg_data = 0;
2589 tcp_write_queue_purge(sk);
2590 tcp_fastopen_active_disable_ofo_check(sk);
2591 skb_rbtree_purge(&tp->out_of_order_queue);
2592
2593 inet->inet_dport = 0;
2594
2595 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2596 inet_reset_saddr(sk);
2597
2598 sk->sk_shutdown = 0;
2599 sock_reset_flag(sk, SOCK_DONE);
2600 tp->srtt_us = 0;
2601 tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
2602 tp->rcv_rtt_last_tsecr = 0;
2603 tp->write_seq += tp->max_window + 2;
2604 if (tp->write_seq == 0)
2605 tp->write_seq = 1;
2606 icsk->icsk_backoff = 0;
2607 tp->snd_cwnd = 2;
2608 icsk->icsk_probes_out = 0;
2609 icsk->icsk_rto = TCP_TIMEOUT_INIT;
2610 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
2611 tp->snd_cwnd = TCP_INIT_CWND;
2612 tp->snd_cwnd_cnt = 0;
2613 tp->window_clamp = 0;
2614 tp->delivered_ce = 0;
2615 tcp_set_ca_state(sk, TCP_CA_Open);
2616 tp->is_sack_reneg = 0;
2617 tcp_clear_retrans(tp);
2618 inet_csk_delack_init(sk);
2619
2620
2621
2622 icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
2623 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
2624 __sk_dst_reset(sk);
2625 dst_release(sk->sk_rx_dst);
2626 sk->sk_rx_dst = NULL;
2627 tcp_saved_syn_free(tp);
2628 tp->compressed_ack = 0;
2629 tp->bytes_sent = 0;
2630 tp->bytes_acked = 0;
2631 tp->bytes_received = 0;
2632 tp->bytes_retrans = 0;
2633 tp->duplicate_sack[0].start_seq = 0;
2634 tp->duplicate_sack[0].end_seq = 0;
2635 tp->dsack_dups = 0;
2636 tp->reord_seen = 0;
2637 tp->retrans_out = 0;
2638 tp->sacked_out = 0;
2639 tp->tlp_high_seq = 0;
2640 tp->last_oow_ack_time = 0;
2641
2642 tp->app_limited = ~0U;
2643 tp->rack.mstamp = 0;
2644 tp->rack.advanced = 0;
2645 tp->rack.reo_wnd_steps = 1;
2646 tp->rack.last_delivered = 0;
2647 tp->rack.reo_wnd_persist = 0;
2648 tp->rack.dsack_seen = 0;
2649 tp->syn_data_acked = 0;
2650 tp->rx_opt.saw_tstamp = 0;
2651 tp->rx_opt.dsack = 0;
2652 tp->rx_opt.num_sacks = 0;
2653
2654
2655
2656 tcp_free_fastopen_req(tp);
2657 inet->defer_connect = 0;
2658
2659 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
2660
2661 if (sk->sk_frag.page) {
2662 put_page(sk->sk_frag.page);
2663 sk->sk_frag.page = NULL;
2664 sk->sk_frag.offset = 0;
2665 }
2666
2667 sk->sk_error_report(sk);
2668 return 0;
2669}
2670EXPORT_SYMBOL(tcp_disconnect);

static inline bool tcp_can_repair_sock(const struct sock *sk)
{
	return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
		(sk->sk_state != TCP_LISTEN);
}

static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len)
{
	struct tcp_repair_window opt;

	if (!tp->repair)
		return -EPERM;

	if (len != sizeof(opt))
		return -EINVAL;

	if (copy_from_user(&opt, optbuf, sizeof(opt)))
		return -EFAULT;

	if (opt.max_window < opt.snd_wnd)
		return -EINVAL;

	if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
		return -EINVAL;

	if (after(opt.rcv_wup, tp->rcv_nxt))
		return -EINVAL;

	tp->snd_wl1 = opt.snd_wl1;
	tp->snd_wnd = opt.snd_wnd;
	tp->max_window = opt.max_window;

	tp->rcv_wnd = opt.rcv_wnd;
	tp->rcv_wup = opt.rcv_wup;

	return 0;
}
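
/* Example (hypothetical userspace sketch, CRIU-style restore): with the
 * socket already switched into repair mode, the window state validated by
 * tcp_repair_set_window() above would be written roughly like this; the
 * field values are placeholders and must satisfy the sanity checks above
 * (e.g. max_window >= snd_wnd):
 *
 *	struct tcp_repair_window w = {
 *		.snd_wl1 = ..., .snd_wnd = ..., .max_window = ...,
 *		.rcv_wnd = ..., .rcv_wup = ...,
 *	};
 *	int on = 1;
 *
 *	setsockopt(fd, SOL_TCP, TCP_REPAIR, &on, sizeof(on));
 *	setsockopt(fd, SOL_TCP, TCP_REPAIR_WINDOW, &w, sizeof(w));
 */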

static int tcp_repair_options_est(struct sock *sk,
		struct tcp_repair_opt __user *optbuf, unsigned int len)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_repair_opt opt;

	while (len >= sizeof(opt)) {
		if (copy_from_user(&opt, optbuf, sizeof(opt)))
			return -EFAULT;

		optbuf++;
		len -= sizeof(opt);

		switch (opt.opt_code) {
		case TCPOPT_MSS:
			tp->rx_opt.mss_clamp = opt.opt_val;
			tcp_mtup_init(sk);
			break;
		case TCPOPT_WINDOW:
		{
			u16 snd_wscale = opt.opt_val & 0xFFFF;
			u16 rcv_wscale = opt.opt_val >> 16;

			if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE)
				return -EFBIG;

			tp->rx_opt.snd_wscale = snd_wscale;
			tp->rx_opt.rcv_wscale = rcv_wscale;
			tp->rx_opt.wscale_ok = 1;
		}
			break;
		case TCPOPT_SACK_PERM:
			if (opt.opt_val != 0)
				return -EINVAL;

			tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
			break;
		case TCPOPT_TIMESTAMP:
			if (opt.opt_val != 0)
				return -EINVAL;

			tp->rx_opt.tstamp_ok = 1;
			break;
		}
	}

	return 0;
}
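
/* Example (hypothetical userspace sketch): TCP_REPAIR_OPTIONS consumes an
 * array of struct tcp_repair_opt. Note how TCPOPT_WINDOW packs both scales
 * into one opt_val, matching the decoding above (send scale in the low 16
 * bits, receive scale in the high 16 bits):
 *
 *	struct tcp_repair_opt opts[] = {
 *		{ TCPOPT_SACK_PERM, 0 },
 *		{ TCPOPT_WINDOW, snd_wscale | (rcv_wscale << 16) },
 *		{ TCPOPT_TIMESTAMP, 0 },
 *		{ TCPOPT_MSS, 1460 },
 *	};
 *
 *	setsockopt(fd, SOL_TCP, TCP_REPAIR_OPTIONS, opts, sizeof(opts));
 *
 * This only succeeds in repair mode on an ESTABLISHED socket, per the
 * checks in do_tcp_setsockopt() below.
 */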

DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
EXPORT_SYMBOL(tcp_tx_delay_enabled);

static void tcp_enable_tx_delay(void)
{
	if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
		static int __tcp_tx_delay_enabled = 0;

		if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
			static_branch_enable(&tcp_tx_delay_enabled);
			pr_info("TCP_TX_DELAY enabled\n");
		}
	}
}

/*
 *	Socket option code for TCP.
 */
static int do_tcp_setsockopt(struct sock *sk, int level,
		int optname, char __user *optval, unsigned int optlen)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct net *net = sock_net(sk);
	int val;
	int err = 0;

	/* These are data/string values, all the others are ints */
	switch (optname) {
	case TCP_CONGESTION: {
		char name[TCP_CA_NAME_MAX];

		if (optlen < 1)
			return -EINVAL;

		val = strncpy_from_user(name, optval,
					min_t(long, TCP_CA_NAME_MAX-1, optlen));
		if (val < 0)
			return -EFAULT;
		name[val] = 0;

		lock_sock(sk);
		err = tcp_set_congestion_control(sk, name, true, true,
						 ns_capable(sock_net(sk)->user_ns,
							    CAP_NET_ADMIN));
		release_sock(sk);
		return err;
	}
	case TCP_ULP: {
		char name[TCP_ULP_NAME_MAX];

		if (optlen < 1)
			return -EINVAL;

		val = strncpy_from_user(name, optval,
					min_t(long, TCP_ULP_NAME_MAX - 1,
					      optlen));
		if (val < 0)
			return -EFAULT;
		name[val] = 0;

		lock_sock(sk);
		err = tcp_set_ulp(sk, name);
		release_sock(sk);
		return err;
	}
	case TCP_FASTOPEN_KEY: {
		__u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
		__u8 *backup_key = NULL;

		/* Allow a backup key as well to facilitate key rotation
		 * First key is the active one.
		 */
		if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
		    optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
			return -EINVAL;

		if (copy_from_user(key, optval, optlen))
			return -EFAULT;

		if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
			backup_key = key + TCP_FASTOPEN_KEY_LENGTH;

		return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
	}
	default:
		/* fallthrough */
		break;
	}

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	lock_sock(sk);

	switch (optname) {
	case TCP_MAXSEG:
		/* Values greater than interface MTU won't take effect. However
		 * at the point when this call is done we typically don't yet
		 * know which interface is going to be used
		 */
		if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) {
			err = -EINVAL;
			break;
		}
		tp->rx_opt.user_mss = val;
		break;

	case TCP_NODELAY:
		if (val) {
			/* TCP_NODELAY is weaker than TCP_CORK, so that
			 * this option on corked socket is remembered, but
			 * it is not activated until cork is cleared.
			 *
			 * However, when TCP_NODELAY is set we make
			 * an explicit push, which overrides even TCP_CORK
			 * for currently queued segments.
			 */
			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
			tcp_push_pending_frames(sk);
		} else {
			tp->nonagle &= ~TCP_NAGLE_OFF;
		}
		break;

	case TCP_THIN_LINEAR_TIMEOUTS:
		if (val < 0 || val > 1)
			err = -EINVAL;
		else
			tp->thin_lto = val;
		break;

	case TCP_THIN_DUPACK:
		if (val < 0 || val > 1)
			err = -EINVAL;
		break;

	case TCP_REPAIR:
		if (!tcp_can_repair_sock(sk))
			err = -EPERM;
		else if (val == TCP_REPAIR_ON) {
			tp->repair = 1;
			sk->sk_reuse = SK_FORCE_REUSE;
			tp->repair_queue = TCP_NO_QUEUE;
		} else if (val == TCP_REPAIR_OFF) {
			tp->repair = 0;
			sk->sk_reuse = SK_NO_REUSE;
			tcp_send_window_probe(sk);
		} else if (val == TCP_REPAIR_OFF_NO_WP) {
			tp->repair = 0;
			sk->sk_reuse = SK_NO_REUSE;
		} else
			err = -EINVAL;

		break;

	case TCP_REPAIR_QUEUE:
		if (!tp->repair)
			err = -EPERM;
		else if ((unsigned int)val < TCP_QUEUES_NR)
			tp->repair_queue = val;
		else
			err = -EINVAL;
		break;

	case TCP_QUEUE_SEQ:
		if (sk->sk_state != TCP_CLOSE)
			err = -EPERM;
		else if (tp->repair_queue == TCP_SEND_QUEUE)
			tp->write_seq = val;
		else if (tp->repair_queue == TCP_RECV_QUEUE)
			tp->rcv_nxt = val;
		else
			err = -EINVAL;
		break;

	case TCP_REPAIR_OPTIONS:
		if (!tp->repair)
			err = -EINVAL;
		else if (sk->sk_state == TCP_ESTABLISHED)
			err = tcp_repair_options_est(sk,
					(struct tcp_repair_opt __user *)optval,
					optlen);
		else
			err = -EPERM;
		break;

	case TCP_CORK:
		/* When set indicates to always queue non-full frames.
		 * Later the user clears this option and we transmit
		 * any pending partial frames in the queue.  This is
		 * meant to be used alongside sendfile() to get properly
		 * filled frames when the user (for example) must write
		 * out headers with a write() call first and then use
		 * sendfile to send out the data parts.
		 *
		 * TCP_CORK can be set together with TCP_NODELAY and it is
		 * stronger than TCP_NODELAY.
		 */
		if (val) {
			tp->nonagle |= TCP_NAGLE_CORK;
		} else {
			tp->nonagle &= ~TCP_NAGLE_CORK;
			if (tp->nonagle&TCP_NAGLE_OFF)
				tp->nonagle |= TCP_NAGLE_PUSH;
			tcp_push_pending_frames(sk);
		}
		break;

	case TCP_KEEPIDLE:
		if (val < 1 || val > MAX_TCP_KEEPIDLE)
			err = -EINVAL;
		else {
			tp->keepalive_time = val * HZ;
			if (sock_flag(sk, SOCK_KEEPOPEN) &&
			    !((1 << sk->sk_state) &
			      (TCPF_CLOSE | TCPF_LISTEN))) {
				u32 elapsed = keepalive_time_elapsed(tp);
				if (tp->keepalive_time > elapsed)
					elapsed = tp->keepalive_time - elapsed;
				else
					elapsed = 0;
				inet_csk_reset_keepalive_timer(sk, elapsed);
			}
		}
		break;
	case TCP_KEEPINTVL:
		if (val < 1 || val > MAX_TCP_KEEPINTVL)
			err = -EINVAL;
		else
			tp->keepalive_intvl = val * HZ;
		break;
	case TCP_KEEPCNT:
		if (val < 1 || val > MAX_TCP_KEEPCNT)
			err = -EINVAL;
		else
			tp->keepalive_probes = val;
		break;
	case TCP_SYNCNT:
		if (val < 1 || val > MAX_TCP_SYNCNT)
			err = -EINVAL;
		else
			icsk->icsk_syn_retries = val;
		break;

	case TCP_SAVE_SYN:
		if (val < 0 || val > 1)
			err = -EINVAL;
		else
			tp->save_syn = val;
		break;

	case TCP_LINGER2:
		if (val < 0)
			tp->linger2 = -1;
		else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ)
			tp->linger2 = 0;
		else
			tp->linger2 = val * HZ;
		break;

	case TCP_DEFER_ACCEPT:
		/* Translate value in seconds to number of retransmits */
		icsk->icsk_accept_queue.rskq_defer_accept =
			secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
					TCP_RTO_MAX / HZ);
		break;

	case TCP_WINDOW_CLAMP:
		if (!val) {
			if (sk->sk_state != TCP_CLOSE) {
				err = -EINVAL;
				break;
			}
			tp->window_clamp = 0;
		} else
			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
						SOCK_MIN_RCVBUF / 2 : val;
		break;

	case TCP_QUICKACK:
		if (!val) {
			inet_csk_enter_pingpong_mode(sk);
		} else {
			inet_csk_exit_pingpong_mode(sk);
			if ((1 << sk->sk_state) &
			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
			    inet_csk_ack_scheduled(sk)) {
				icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
				tcp_cleanup_rbuf(sk, 1);
				if (!(val & 1))
					inet_csk_enter_pingpong_mode(sk);
			}
		}
		break;

#ifdef CONFIG_TCP_MD5SIG
	case TCP_MD5SIG:
	case TCP_MD5SIG_EXT:
		if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
			err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
		else
			err = -EINVAL;
		break;
#endif
	case TCP_USER_TIMEOUT:
		/* Cap the max time in ms TCP will retry or probe the window
		 * before giving up and aborting (ETIMEDOUT) a connection.
		 */
		if (val < 0)
			err = -EINVAL;
		else
			icsk->icsk_user_timeout = val;
		break;

	case TCP_FASTOPEN:
		if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
		    TCPF_LISTEN))) {
			tcp_fastopen_init_key_once(net);

			fastopen_queue_tune(sk, val);
		} else {
			err = -EINVAL;
		}
		break;
	case TCP_FASTOPEN_CONNECT:
		if (val > 1 || val < 0) {
			err = -EINVAL;
		} else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
			if (sk->sk_state == TCP_CLOSE)
				tp->fastopen_connect = val;
			else
				err = -EINVAL;
		} else {
			err = -EOPNOTSUPP;
		}
		break;
	case TCP_FASTOPEN_NO_COOKIE:
		if (val > 1 || val < 0)
			err = -EINVAL;
		else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
			err = -EINVAL;
		else
			tp->fastopen_no_cookie = val;
		break;
	case TCP_TIMESTAMP:
		if (!tp->repair)
			err = -EPERM;
		else
			tp->tsoffset = val - tcp_time_stamp_raw();
		break;
	case TCP_REPAIR_WINDOW:
		err = tcp_repair_set_window(tp, optval, optlen);
		break;
	case TCP_NOTSENT_LOWAT:
		tp->notsent_lowat = val;
		sk->sk_write_space(sk);
		break;
	case TCP_INQ:
		if (val > 1 || val < 0)
			err = -EINVAL;
		else
			tp->recvmsg_inq = val;
		break;
	case TCP_TX_DELAY:
		if (val)
			tcp_enable_tx_delay();
		tp->tcp_tx_delay = val;
		break;
	default:
		err = -ENOPROTOOPT;
		break;
	}

	release_sock(sk);
	return err;
}

int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
		   unsigned int optlen)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	if (level != SOL_TCP)
		return icsk->icsk_af_ops->setsockopt(sk, level, optname,
						     optval, optlen);
	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(tcp_setsockopt);
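
/* Example (hypothetical userspace sketch): the string-valued options handled
 * before the integer switch in do_tcp_setsockopt() take a byte buffer with
 * no trailing-NUL requirement, e.g. selecting a congestion control module:
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "cubic", 5);
 *
 * while the integer options take a plain int:
 *
 *	int one = 1;
 *	setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
 */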

#ifdef CONFIG_COMPAT
int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	if (level != SOL_TCP)
		return inet_csk_compat_setsockopt(sk, level, optname,
						  optval, optlen);
	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_tcp_setsockopt);
#endif

static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
				      struct tcp_info *info)
{
	u64 stats[__TCP_CHRONO_MAX], total = 0;
	enum tcp_chrono i;

	for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
		stats[i] = tp->chrono_stat[i - 1];
		if (i == tp->chrono_type)
			stats[i] += tcp_jiffies32 - tp->chrono_start;
		stats[i] *= USEC_PER_SEC / HZ;
		total += stats[i];
	}

	info->tcpi_busy_time = total;
	info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
	info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
}
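
/* Worked example for the conversion above: chrono_stat[] counts jiffies, and
 * stats[i] *= USEC_PER_SEC / HZ rescales to microseconds. With HZ == 250,
 * one jiffy is 4000 us, so a busy period of 500 jiffies is reported as
 * 2,000,000 us in tcpi_busy_time.
 */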

/* Return information about state of tcp endpoint in API format. */
void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned long rate;
	u32 now;
	u64 rate64;
	bool slow;

	memset(info, 0, sizeof(*info));
	if (sk->sk_type != SOCK_STREAM)
		return;

	info->tcpi_state = inet_sk_state_load(sk);

	/* Report meaningful fields for all TCP states, including listeners */
	rate = READ_ONCE(sk->sk_pacing_rate);
	rate64 = (rate != ~0UL) ? rate : ~0ULL;
	info->tcpi_pacing_rate = rate64;

	rate = READ_ONCE(sk->sk_max_pacing_rate);
	rate64 = (rate != ~0UL) ? rate : ~0ULL;
	info->tcpi_max_pacing_rate = rate64;

	info->tcpi_reordering = tp->reordering;
	info->tcpi_snd_cwnd = tp->snd_cwnd;

	if (info->tcpi_state == TCP_LISTEN) {
		/* listeners aliased fields :
		 * tcpi_unacked -> Number of children ready for accept()
		 * tcpi_sacked  -> max backlog
		 */
		info->tcpi_unacked = sk->sk_ack_backlog;
		info->tcpi_sacked = sk->sk_max_ack_backlog;
		return;
	}

	slow = lock_sock_fast(sk);

	info->tcpi_ca_state = icsk->icsk_ca_state;
	info->tcpi_retransmits = icsk->icsk_retransmits;
	info->tcpi_probes = icsk->icsk_probes_out;
	info->tcpi_backoff = icsk->icsk_backoff;

	if (tp->rx_opt.tstamp_ok)
		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
	if (tcp_is_sack(tp))
		info->tcpi_options |= TCPI_OPT_SACK;
	if (tp->rx_opt.wscale_ok) {
		info->tcpi_options |= TCPI_OPT_WSCALE;
		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
	}

	if (tp->ecn_flags & TCP_ECN_OK)
		info->tcpi_options |= TCPI_OPT_ECN;
	if (tp->ecn_flags & TCP_ECN_SEEN)
		info->tcpi_options |= TCPI_OPT_ECN_SEEN;
	if (tp->syn_data_acked)
		info->tcpi_options |= TCPI_OPT_SYN_DATA;

	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
	info->tcpi_snd_mss = tp->mss_cache;
	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;

	info->tcpi_unacked = tp->packets_out;
	info->tcpi_sacked = tp->sacked_out;

	info->tcpi_lost = tp->lost_out;
	info->tcpi_retrans = tp->retrans_out;

	now = tcp_jiffies32;
	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);

	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
	info->tcpi_rtt = tp->srtt_us >> 3;
	info->tcpi_rttvar = tp->mdev_us >> 2;
	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
	info->tcpi_advmss = tp->advmss;

	info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3;
	info->tcpi_rcv_space = tp->rcvq_space.space;

	info->tcpi_total_retrans = tp->total_retrans;

	info->tcpi_bytes_acked = tp->bytes_acked;
	info->tcpi_bytes_received = tp->bytes_received;
	info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
	tcp_get_info_chrono_stats(tp, info);

	info->tcpi_segs_out = tp->segs_out;
	info->tcpi_segs_in = tp->segs_in;

	info->tcpi_min_rtt = tcp_min_rtt(tp);
	info->tcpi_data_segs_in = tp->data_segs_in;
	info->tcpi_data_segs_out = tp->data_segs_out;

	info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
	rate64 = tcp_compute_delivery_rate(tp);
	if (rate64)
		info->tcpi_delivery_rate = rate64;
	info->tcpi_delivered = tp->delivered;
	info->tcpi_delivered_ce = tp->delivered_ce;
	info->tcpi_bytes_sent = tp->bytes_sent;
	info->tcpi_bytes_retrans = tp->bytes_retrans;
	info->tcpi_dsack_dups = tp->dsack_dups;
	info->tcpi_reord_seen = tp->reord_seen;
	unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);
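
/* Example (hypothetical userspace sketch): tcp_get_info() backs the TCP_INFO
 * getsockopt; note that the >>3 / >>2 shifts above mean the exported RTT
 * fields are already in plain microseconds:
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
 *		printf("srtt=%uus cwnd=%u\n", ti.tcpi_rtt, ti.tcpi_snd_cwnd);
 */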

static size_t tcp_opt_stats_get_size(void)
{
	return
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */
		nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */
		nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */
		nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */
		0;
}

struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *stats;
	struct tcp_info info;
	unsigned long rate;
	u64 rate64;

	stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
	if (!stats)
		return NULL;

	tcp_get_info_chrono_stats(tp, &info);
	nla_put_u64_64bit(stats, TCP_NLA_BUSY,
			  info.tcpi_busy_time, TCP_NLA_PAD);
	nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
			  info.tcpi_rwnd_limited, TCP_NLA_PAD);
	nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
			  info.tcpi_sndbuf_limited, TCP_NLA_PAD);
	nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
			  tp->data_segs_out, TCP_NLA_PAD);
	nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
			  tp->total_retrans, TCP_NLA_PAD);

	rate = READ_ONCE(sk->sk_pacing_rate);
	rate64 = (rate != ~0UL) ? rate : ~0ULL;
	nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);

	rate64 = tcp_compute_delivery_rate(tp);
	nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);

	nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd);
	nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
	nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));

	nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
	nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
	nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
	nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
	nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);

	nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
	nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);

	nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
			  TCP_NLA_PAD);
	nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
			  TCP_NLA_PAD);
	nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
	nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
	nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);

	return stats;
}
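
/* Consumption note: the skb built above is not read with getsockopt; it is
 * delivered alongside tx timestamps. A hypothetical userspace reader would
 * enable it roughly as:
 *
 *	unsigned int val = SOF_TIMESTAMPING_TX_ACK |
 *			   SOF_TIMESTAMPING_SOFTWARE |
 *			   SOF_TIMESTAMPING_OPT_STATS;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));
 *
 * and then parse the SCM_TIMESTAMPING_OPT_STATS cmsg (a blob of the
 * TCP_NLA_* attributes above) received via recvmsg(fd, ..., MSG_ERRQUEUE).
 * The exact flag combination is illustrative.
 */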

static int do_tcp_getsockopt(struct sock *sk, int level,
		int optname, char __user *optval, int __user *optlen)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	int val, len;

	if (get_user(len, optlen))
		return -EFAULT;

	len = min_t(unsigned int, len, sizeof(int));

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case TCP_MAXSEG:
		val = tp->mss_cache;
		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
			val = tp->rx_opt.user_mss;
		if (tp->repair)
			val = tp->rx_opt.mss_clamp;
		break;
	case TCP_NODELAY:
		val = !!(tp->nonagle&TCP_NAGLE_OFF);
		break;
	case TCP_CORK:
		val = !!(tp->nonagle&TCP_NAGLE_CORK);
		break;
	case TCP_KEEPIDLE:
		val = keepalive_time_when(tp) / HZ;
		break;
	case TCP_KEEPINTVL:
		val = keepalive_intvl_when(tp) / HZ;
		break;
	case TCP_KEEPCNT:
		val = keepalive_probes(tp);
		break;
	case TCP_SYNCNT:
		val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
		break;
	case TCP_LINGER2:
		val = tp->linger2;
		if (val >= 0)
			val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;
		break;
	case TCP_DEFER_ACCEPT:
		val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
				      TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
		break;
	case TCP_WINDOW_CLAMP:
		val = tp->window_clamp;
		break;
	case TCP_INFO: {
		struct tcp_info info;

		if (get_user(len, optlen))
			return -EFAULT;

		tcp_get_info(sk, &info);

		len = min_t(unsigned int, len, sizeof(info));
		if (put_user(len, optlen))
			return -EFAULT;
		if (copy_to_user(optval, &info, len))
			return -EFAULT;
		return 0;
	}
	case TCP_CC_INFO: {
		const struct tcp_congestion_ops *ca_ops;
		union tcp_cc_info info;
		size_t sz = 0;
		int attr;

		if (get_user(len, optlen))
			return -EFAULT;

		ca_ops = icsk->icsk_ca_ops;
		if (ca_ops && ca_ops->get_info)
			sz = ca_ops->get_info(sk, ~0U, &attr, &info);

		len = min_t(unsigned int, len, sz);
		if (put_user(len, optlen))
			return -EFAULT;
		if (copy_to_user(optval, &info, len))
			return -EFAULT;
		return 0;
	}
	case TCP_QUICKACK:
		val = !inet_csk_in_pingpong_mode(sk);
		break;

	case TCP_CONGESTION:
		if (get_user(len, optlen))
			return -EFAULT;
		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
		if (put_user(len, optlen))
			return -EFAULT;
		if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
			return -EFAULT;
		return 0;

	case TCP_ULP:
		if (get_user(len, optlen))
			return -EFAULT;
		len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
		if (!icsk->icsk_ulp_ops) {
			if (put_user(0, optlen))
				return -EFAULT;
			return 0;
		}
		if (put_user(len, optlen))
			return -EFAULT;
		if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len))
			return -EFAULT;
		return 0;

	case TCP_FASTOPEN_KEY: {
		__u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
		struct tcp_fastopen_context *ctx;
		unsigned int key_len = 0;

		if (get_user(len, optlen))
			return -EFAULT;

		rcu_read_lock();
		ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx);
		if (ctx) {
			key_len = tcp_fastopen_context_len(ctx) *
					TCP_FASTOPEN_KEY_LENGTH;
			memcpy(&key[0], &ctx->key[0], key_len);
		}
		rcu_read_unlock();

		len = min_t(unsigned int, len, key_len);
		if (put_user(len, optlen))
			return -EFAULT;
		if (copy_to_user(optval, key, len))
			return -EFAULT;
		return 0;
	}
	case TCP_THIN_LINEAR_TIMEOUTS:
		val = tp->thin_lto;
		break;

	case TCP_THIN_DUPACK:
		val = 0;
		break;

	case TCP_REPAIR:
		val = tp->repair;
		break;

	case TCP_REPAIR_QUEUE:
		if (tp->repair)
			val = tp->repair_queue;
		else
			return -EINVAL;
		break;

	case TCP_REPAIR_WINDOW: {
		struct tcp_repair_window opt;

		if (get_user(len, optlen))
			return -EFAULT;

		if (len != sizeof(opt))
			return -EINVAL;

		if (!tp->repair)
			return -EPERM;

		opt.snd_wl1 = tp->snd_wl1;
		opt.snd_wnd = tp->snd_wnd;
		opt.max_window = tp->max_window;
		opt.rcv_wnd = tp->rcv_wnd;
		opt.rcv_wup = tp->rcv_wup;

		if (copy_to_user(optval, &opt, len))
			return -EFAULT;
		return 0;
	}
	case TCP_QUEUE_SEQ:
		if (tp->repair_queue == TCP_SEND_QUEUE)
			val = tp->write_seq;
		else if (tp->repair_queue == TCP_RECV_QUEUE)
			val = tp->rcv_nxt;
		else
			return -EINVAL;
		break;

	case TCP_USER_TIMEOUT:
		val = icsk->icsk_user_timeout;
		break;

	case TCP_FASTOPEN:
		val = icsk->icsk_accept_queue.fastopenq.max_qlen;
		break;

	case TCP_FASTOPEN_CONNECT:
		val = tp->fastopen_connect;
		break;

	case TCP_FASTOPEN_NO_COOKIE:
		val = tp->fastopen_no_cookie;
		break;

	case TCP_TX_DELAY:
		val = tp->tcp_tx_delay;
		break;

	case TCP_TIMESTAMP:
		val = tcp_time_stamp_raw() + tp->tsoffset;
		break;
	case TCP_NOTSENT_LOWAT:
		val = tp->notsent_lowat;
		break;
	case TCP_INQ:
		val = tp->recvmsg_inq;
		break;
	case TCP_SAVE_SYN:
		val = tp->save_syn;
		break;
	case TCP_SAVED_SYN: {
		if (get_user(len, optlen))
			return -EFAULT;

		lock_sock(sk);
		if (tp->saved_syn) {
			if (len < tp->saved_syn[0]) {
				if (put_user(tp->saved_syn[0], optlen)) {
					release_sock(sk);
					return -EFAULT;
				}
				release_sock(sk);
				return -EINVAL;
			}
			len = tp->saved_syn[0];
			if (put_user(len, optlen)) {
				release_sock(sk);
				return -EFAULT;
			}
			if (copy_to_user(optval, tp->saved_syn + 1, len)) {
				release_sock(sk);
				return -EFAULT;
			}
			tcp_saved_syn_free(tp);
			release_sock(sk);
		} else {
			release_sock(sk);
			len = 0;
			if (put_user(len, optlen))
				return -EFAULT;
		}
		return 0;
	}
#ifdef CONFIG_MMU
	case TCP_ZEROCOPY_RECEIVE: {
		struct tcp_zerocopy_receive zc;
		int err;

		if (get_user(len, optlen))
			return -EFAULT;
		if (len != sizeof(zc))
			return -EINVAL;
		if (copy_from_user(&zc, optval, len))
			return -EFAULT;
		lock_sock(sk);
		err = tcp_zerocopy_receive(sk, &zc);
		release_sock(sk);
		if (!err && copy_to_user(optval, &zc, len))
			err = -EFAULT;
		return err;
	}
#endif
	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, &val, len))
		return -EFAULT;
	return 0;
}

int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
		   int __user *optlen)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (level != SOL_TCP)
		return icsk->icsk_af_ops->getsockopt(sk, level, optname,
						     optval, optlen);
	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(tcp_getsockopt);
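
/* Example (hypothetical userspace sketch): the TCP_SAVED_SYN branch above is
 * read-once; a too-small buffer gets the required length written back into
 * optlen along with -EINVAL, so callers typically size generously:
 *
 *	char syn[512];
 *	socklen_t len = sizeof(syn);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_SAVED_SYN, syn, &len) == 0)
 *		;	// syn[0..len) holds the headers of the received SYN
 *
 * This requires TCP_SAVE_SYN to have been set before the SYN arrived.
 */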

#ifdef CONFIG_COMPAT
int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	if (level != SOL_TCP)
		return inet_csk_compat_getsockopt(sk, level, optname,
						  optval, optlen);
	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_tcp_getsockopt);
#endif

#ifdef CONFIG_TCP_MD5SIG
static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
static DEFINE_MUTEX(tcp_md5sig_mutex);
static bool tcp_md5sig_pool_populated = false;

static void __tcp_alloc_md5sig_pool(void)
{
	struct crypto_ahash *hash;
	int cpu;

	hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
	if (IS_ERR(hash))
		return;

	for_each_possible_cpu(cpu) {
		void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch;
		struct ahash_request *req;

		if (!scratch) {
			scratch = kmalloc_node(sizeof(union tcp_md5sum_block) +
					       sizeof(struct tcphdr),
					       GFP_KERNEL,
					       cpu_to_node(cpu));
			if (!scratch)
				return;
			per_cpu(tcp_md5sig_pool, cpu).scratch = scratch;
		}
		if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
			continue;

		req = ahash_request_alloc(hash, GFP_KERNEL);
		if (!req)
			return;

		ahash_request_set_callback(req, 0, NULL, NULL);

		per_cpu(tcp_md5sig_pool, cpu).md5_req = req;
	}
	/* before setting tcp_md5sig_pool_populated, we must commit all writes
	 * to memory. See smp_rmb() in tcp_get_md5sig_pool()
	 */
	smp_wmb();
	tcp_md5sig_pool_populated = true;
}

bool tcp_alloc_md5sig_pool(void)
{
	if (unlikely(!tcp_md5sig_pool_populated)) {
		mutex_lock(&tcp_md5sig_mutex);

		if (!tcp_md5sig_pool_populated) {
			__tcp_alloc_md5sig_pool();
			if (tcp_md5sig_pool_populated)
				static_branch_inc(&tcp_md5_needed);
		}

		mutex_unlock(&tcp_md5sig_mutex);
	}
	return tcp_md5sig_pool_populated;
}
EXPORT_SYMBOL(tcp_alloc_md5sig_pool);

/**
 *	tcp_get_md5sig_pool - get md5sig_pool for this user
 *
 *	We use percpu structure, so if we succeed, we exit with preemption
 *	and BH disabled, to make sure another thread or softirq handling
 *	break current processing.
 *
 *	Caller must call tcp_put_md5sig_pool() when finished use.
 */
struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
{
	local_bh_disable();

	if (tcp_md5sig_pool_populated) {
		/* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
		smp_rmb();
		return this_cpu_ptr(&tcp_md5sig_pool);
	}
	local_bh_enable();
	return NULL;
}
EXPORT_SYMBOL(tcp_get_md5sig_pool);

int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
			  const struct sk_buff *skb, unsigned int header_len)
{
	struct scatterlist sg;
	const struct tcphdr *tp = tcp_hdr(skb);
	struct ahash_request *req = hp->md5_req;
	unsigned int i;
	const unsigned int head_data_len = skb_headlen(skb) > header_len ?
					   skb_headlen(skb) - header_len : 0;
	const struct skb_shared_info *shi = skb_shinfo(skb);
	struct sk_buff *frag_iter;

	sg_init_table(&sg, 1);

	sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
	ahash_request_set_crypt(req, &sg, NULL, head_data_len);
	if (crypto_ahash_update(req))
		return 1;

	for (i = 0; i < shi->nr_frags; ++i) {
		const struct skb_frag_struct *f = &shi->frags[i];
		unsigned int offset = f->page_offset;
		struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);

		sg_set_page(&sg, page, skb_frag_size(f),
			    offset_in_page(offset));
		ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
		if (crypto_ahash_update(req))
			return 1;
	}

	skb_walk_frags(skb, frag_iter)
		if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
			return 1;

	return 0;
}
EXPORT_SYMBOL(tcp_md5_hash_skb_data);

int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
{
	struct scatterlist sg;

	sg_init_one(&sg, key->key, key->keylen);
	ahash_request_set_crypt(hp->md5_req, &sg, NULL, key->keylen);
	return crypto_ahash_update(hp->md5_req);
}
EXPORT_SYMBOL(tcp_md5_hash_key);
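
/* Example (hypothetical userspace sketch): the per-CPU pool above services
 * the TCP-MD5 (RFC 2385) signature option, configured per peer while the
 * socket is still CLOSE or LISTEN, per the state check in
 * do_tcp_setsockopt():
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *
 *	memcpy(&md5.tcpm_addr, &peer_addr, sizeof(peer_addr));
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * peer_addr and the key are placeholders.
 */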

#endif

void tcp_done(struct sock *sk)
{
	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;

	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
		TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);

	tcp_set_state(sk, TCP_CLOSE);
	tcp_clear_xmit_timers(sk);
	if (req)
		reqsk_fastopen_remove(sk, req, false);

	sk->sk_shutdown = SHUTDOWN_MASK;

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_state_change(sk);
	else
		inet_csk_destroy_sock(sk);
}
EXPORT_SYMBOL_GPL(tcp_done);

int tcp_abort(struct sock *sk, int err)
{
	if (!sk_fullsock(sk)) {
		if (sk->sk_state == TCP_NEW_SYN_RECV) {
			struct request_sock *req = inet_reqsk(sk);

			local_bh_disable();
			inet_csk_reqsk_queue_drop(req->rsk_listener, req);
			local_bh_enable();
			return 0;
		}
		return -EOPNOTSUPP;
	}

	/* Don't race with userspace socket closes such as tcp_close. */
	lock_sock(sk);

	if (sk->sk_state == TCP_LISTEN) {
		tcp_set_state(sk, TCP_CLOSE);
		inet_csk_listen_stop(sk);
	}

	/* Don't race with BH socket closes such as inet_csk_listen_stop. */
	local_bh_disable();
	bh_lock_sock(sk);

	if (!sock_flag(sk, SOCK_DEAD)) {
		sk->sk_err = err;
		/* This barrier is coupled with smp_rmb() in tcp_poll() */
		smp_wmb();
		sk->sk_error_report(sk);
		if (tcp_need_reset(sk->sk_state))
			tcp_send_active_reset(sk, GFP_ATOMIC);
		tcp_done(sk);
	}

	bh_unlock_sock(sk);
	local_bh_enable();
	tcp_write_queue_purge(sk);
	release_sock(sk);
	return 0;
}
EXPORT_SYMBOL_GPL(tcp_abort);
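
/* Usage note: tcp_abort() is the backend for the inet_diag SOCK_DESTROY
 * command (CONFIG_INET_DIAG_DESTROY), reachable from userspace as e.g.
 * "ss -K dst 192.0.2.1" to forcibly reset matching connections; the
 * address here is a documentation placeholder.
 */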

extern struct tcp_congestion_ops tcp_reno;

static __initdata unsigned long thash_entries;
static int __init set_thash_entries(char *str)
{
	ssize_t ret;

	if (!str)
		return 0;

	ret = kstrtoul(str, 0, &thash_entries);
	if (ret)
		return 0;

	return 1;
}
__setup("thash_entries=", set_thash_entries);

static void __init tcp_init_mem(void)
{
	unsigned long limit = nr_free_buffer_pages() / 16;

	limit = max(limit, 128UL);
	sysctl_tcp_mem[0] = limit / 4 * 3;
	sysctl_tcp_mem[1] = limit;
	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
}
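
/* Worked example for tcp_init_mem(): on a box where nr_free_buffer_pages()
 * reports ~1M 4 KiB pages (~4 GiB), limit = 65536 pages, giving
 * sysctl_tcp_mem = { 49152, 65536, 98304 } pages, i.e. TCP starts applying
 * memory pressure around 192 MiB and hard-limits near 384 MiB.
 */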

void __init tcp_init(void)
{
	int max_rshare, max_wshare, cnt;
	unsigned long limit;
	unsigned int i;

	BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
		     FIELD_SIZEOF(struct sk_buff, cb));

	percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
	percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
	inet_hashinfo_init(&tcp_hashinfo);
	inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
			    thash_entries, 21,
			    0, 64 * 1024);
	tcp_hashinfo.bind_bucket_cachep =
		kmem_cache_create("tcp_bind_bucket",
				  sizeof(struct inet_bind_bucket), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Size and allocate the main established and bind bucket
	 * hash tables.
	 *
	 * The methodology is similar to that of the buffer cache.
	 */
	tcp_hashinfo.ehash =
		alloc_large_system_hash("TCP established",
					sizeof(struct inet_ehash_bucket),
					thash_entries,
					17, /* one slot per 128 KB of memory */
					0,
					NULL,
					&tcp_hashinfo.ehash_mask,
					0,
					thash_entries ? 0 : 512 * 1024);
	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);

	if (inet_ehash_locks_alloc(&tcp_hashinfo))
		panic("TCP: failed to alloc ehash_locks");
	tcp_hashinfo.bhash =
		alloc_large_system_hash("TCP bind",
					sizeof(struct inet_bind_hashbucket),
					tcp_hashinfo.ehash_mask + 1,
					17, /* one slot per 128 KB of memory */
					0,
					&tcp_hashinfo.bhash_size,
					NULL,
					0,
					64 * 1024);
	tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
	}

	cnt = tcp_hashinfo.ehash_mask + 1;
	sysctl_tcp_max_orphans = cnt / 2;

	tcp_init_mem();
	/* Set per-socket limits to no more than 1/128 the pressure threshold */
	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
	max_wshare = min(4UL*1024*1024, limit);
	max_rshare = min(6UL*1024*1024, limit);

	init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
	init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
	init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);

	init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
	init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
	init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);

	pr_info("Hash tables configured (established %u bind %u)\n",
		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);

	tcp_v4_init();
	tcp_metrics_init();
	BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
	tcp_tasklet_init();
}