/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/scatterlist.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/random.h>
#include <linux/bootmem.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/crypto.h>
#include <linux/time.h>
#include <linux/slab.h>

#include <net/icmp.h>
#include <net/inet_common.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/netdma.h>
#include <net/sock.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <net/busy_poll.h>

int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;

int sysctl_tcp_min_tso_segs __read_mostly = 2;

int sysctl_tcp_autocorking __read_mostly = 1;

struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);

long sysctl_tcp_mem[3] __read_mostly;
int sysctl_tcp_wmem[3] __read_mostly;
int sysctl_tcp_rmem[3] __read_mostly;

EXPORT_SYMBOL(sysctl_tcp_mem);
EXPORT_SYMBOL(sysctl_tcp_rmem);
EXPORT_SYMBOL(sysctl_tcp_wmem);

atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
EXPORT_SYMBOL(tcp_memory_allocated);

/*
 * Current number of TCP sockets.
 */
struct percpu_counter tcp_sockets_allocated;
EXPORT_SYMBOL(tcp_sockets_allocated);

/*
 * TCP splice context
 */
struct tcp_splice_state {
	struct pipe_inode_info *pipe;
	size_t len;
	unsigned int flags;
};

/*
 * Pressure flag: try to collapse.
 * Technical note: it is used by multiple contexts non atomically.
 * All the __sk_mem_schedule() is of this nature: accounting
 * is strict, actions are advisory and have some latency.
 */
int tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL(tcp_memory_pressure);

void tcp_enter_memory_pressure(struct sock *sk)
{
	if (!tcp_memory_pressure) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
		tcp_memory_pressure = 1;
	}
}
EXPORT_SYMBOL(tcp_enter_memory_pressure);

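/* Convert seconds to retransmits based on initial and max timeout.
 *
 * The timeout doubles after each retransmission and is clamped at rto_max,
 * so the cumulative wait after N retransmits is the sum of that series.
 * Illustrative example (values assumed, not taken from this file): with
 * timeout = 1s and rto_max = 8s the cumulative periods are 1, 3, 7, 15, ...
 * seconds, so secs_to_retrans(15, 1, 8) returns 4.
 */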
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
{
	u8 res = 0;

	if (seconds > 0) {
		int period = timeout;

		res = 1;
		while (seconds > period && res < 255) {
			res++;
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return res;
}

/* Convert retransmits to seconds based on initial and max timeout */
static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
{
	int period = 0;

	if (retrans > 0) {
		period = timeout;
		while (--retrans) {
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return period;
}

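/* Address-family independent initialization for a tcp_sock.
 *
 * NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */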
void tcp_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	__skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);
	INIT_LIST_HEAD(&tp->tsq_node);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that they require a clear break between the flow-control
	 * and the congestion control states.
	 */
	tp->snd_cwnd = TCP_INIT_CWND;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;

	tp->reordering = sysctl_tcp_reordering;
	tcp_enable_early_retrans(tp);
	icsk->icsk_ca_ops = &tcp_init_congestion_ops;

	tp->tsoffset = 0;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_sync_mss = tcp_sync_mss;

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	local_bh_disable();
	sock_update_memcg(sk);
	sk_sockets_allocated_inc(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(tcp_init_sock);

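/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 */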
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	unsigned int mask;
	struct sock *sk = sock->sk;
	const struct tcp_sock *tp = tcp_sk(sk);

	sock_rps_record_flow(sk);

	sock_poll_wait(file, sk_sleep(sk), wait);
	if (sk->sk_state == TCP_LISTEN)
		return inet_csk_listen_poll(sk);

	/* Socket is not locked. We are protected from async events
	 * by poll logic and correct handling of state changes
	 * made by other threads is impossible in any case.
	 */

	mask = 0;

	/*
	 * POLLHUP is certainly not done right. But poll() doesn't
	 * have a notion of HUP in just one direction, and for a
	 * socket the read side is more interesting.
	 *
	 * Some poll() documentation says that POLLHUP is incompatible
	 * with the POLLOUT/POLLWR flags, so somebody should check this
	 * all. But careful, it tends to be safer to return too many
	 * bits than too few, and you can easily break real applications
	 * if you don't tell them that something has hung up!
	 *
	 * Check-me.
	 *
	 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
	 * our fs/select.c). It means that after we received EOF,
	 * poll always returns immediately, making impossible poll() on write()
	 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
	 * if and only if shutdown has been made in both directions.
	 *
	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
	 * blocking on fresh not-connected or disconnected socket. --ANK
	 */
	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
		mask |= POLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLIN | POLLRDNORM | POLLRDHUP;

	/* Connected or passive Fast Open socket? */
	if (sk->sk_state != TCP_SYN_SENT &&
	    (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
		int target = sock_rcvlowat(sk, 0, INT_MAX);

		if (tp->urg_seq == tp->copied_seq &&
		    !sock_flag(sk, SOCK_URGINLINE) &&
		    tp->urg_data)
			target++;

		/* Potential race condition. If read of tp below will
		 * escape above sk->sk_state, we can be illegally awaken
		 * in SYN_* states.
		 */
		if (tp->rcv_nxt - tp->copied_seq >= target)
			mask |= POLLIN | POLLRDNORM;

		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
			if (sk_stream_is_writeable(sk)) {
				mask |= POLLOUT | POLLWRNORM;
			} else {	/* send SIGIO later */
				set_bit(SOCK_ASYNC_NOSPACE,
					&sk->sk_socket->flags);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker. If space is freed after
				 * wspace test but before the flags are set,
				 * IO signal will be lost.
				 */
				if (sk_stream_is_writeable(sk))
					mask |= POLLOUT | POLLWRNORM;
			}
		} else
			mask |= POLLOUT | POLLWRNORM;

		if (tp->urg_data & TCP_URG_VALID)
			mask |= POLLPRI;
	}
	/* This barrier is coupled with smp_wmb() in tcp_reset() */
	smp_rmb();
	if (sk->sk_err)
		mask |= POLLERR;

	return mask;
}
EXPORT_SYMBOL(tcp_poll);

int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int answ;
	bool slow;

	switch (cmd) {
	case SIOCINQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		slow = lock_sock_fast(sk);
		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else if (sock_flag(sk, SOCK_URGINLINE) ||
			 !tp->urg_data ||
			 before(tp->urg_seq, tp->copied_seq) ||
			 !before(tp->urg_seq, tp->rcv_nxt)) {

			answ = tp->rcv_nxt - tp->copied_seq;

			/* Subtract 1, if FIN was received */
			if (answ && sock_flag(sk, SOCK_DONE))
				answ--;
		} else
			answ = tp->urg_seq - tp->copied_seq;
		unlock_sock_fast(sk, slow);
		break;
	case SIOCATMARK:
		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
		break;
	case SIOCOUTQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = tp->write_seq - tp->snd_una;
		break;
	case SIOCOUTQNSD:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = tp->write_seq - tp->snd_nxt;
		break;
	default:
		return -ENOIOCTLCMD;
	}

	return put_user(answ, (int __user *)arg);
}
EXPORT_SYMBOL(tcp_ioctl);

static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
	TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
	tp->pushed_seq = tp->write_seq;
}

static inline bool forced_push(const struct tcp_sock *tp)
{
	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}

static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

	skb->csum = 0;
	tcb->seq = tcb->end_seq = tp->write_seq;
	tcb->tcp_flags = TCPHDR_ACK;
	tcb->sacked = 0;
	skb_header_release(skb);
	tcp_add_write_queue_tail(sk, skb);
	sk->sk_wmem_queued += skb->truesize;
	sk_mem_charge(sk, skb->truesize);
	if (tp->nonagle & TCP_NAGLE_PUSH)
		tp->nonagle &= ~TCP_NAGLE_PUSH;
}

static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
{
	if (flags & MSG_OOB)
		tp->snd_up = tp->write_seq;
}

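/* If a not yet filled skb is pushed, do not send it if
 * we have data packets in Qdisc or NIC queues :
 * Because TX completion will happen shortly, it gives a chance
 * to coalesce future sendmsg() payload into this skb, without
 * need for a timer, and with no latency trade off.
 * As packets containing data payload, we set the SOCK_NOSPACE
 * flag before calling the slow path.
 */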
static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
				int size_goal)
{
	return skb->len < size_goal &&
	       sysctl_tcp_autocorking &&
	       skb != tcp_write_queue_head(sk) &&
	       atomic_read(&sk->sk_wmem_alloc) > skb->truesize;
}

static void tcp_push(struct sock *sk, int flags, int mss_now,
		     int nonagle, int size_goal)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	if (!tcp_send_head(sk))
		return;

	skb = tcp_write_queue_tail(sk);
	if (!(flags & MSG_MORE) || forced_push(tp))
		tcp_mark_push(tp, skb);

	tcp_mark_urg(tp, flags);

	if (tcp_should_autocork(sk, skb, size_goal)) {

		/* avoid atomic op if TSQ_THROTTLED bit is already set */
		if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) {
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
		}
		/* It is possible TX completion already happened
		 * before we set TSQ_THROTTLED.
		 */
		if (atomic_read(&sk->sk_wmem_alloc) > skb->truesize)
			return;
	}

	if (flags & MSG_MORE)
		nonagle = TCP_NAGLE_CORK;

	__tcp_push_pending_frames(sk, mss_now, nonagle);
}

static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
				unsigned int offset, size_t len)
{
	struct tcp_splice_state *tss = rd_desc->arg.data;
	int ret;

	ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
			      tss->flags);
	if (ret > 0)
		rd_desc->count -= ret;
	return ret;
}

static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
{
	/* Store TCP splice context information in read_descriptor_t. */
	read_descriptor_t rd_desc = {
		.arg.data = tss,
		.count	  = tss->len,
	};

	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
}

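/**
 *  tcp_splice_read - splice data from TCP socket to a pipe
 * @sock:	socket to splice from
 * @ppos:	position (not valid)
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will read pages from given socket and fill them into a pipe.
 *
 **/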
ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
			struct pipe_inode_info *pipe, size_t len,
			unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct tcp_splice_state tss = {
		.pipe = pipe,
		.len = len,
		.flags = flags,
	};
	long timeo;
	ssize_t spliced;
	int ret;

	sock_rps_record_flow(sk);
	/*
	 * We can't seek on a socket input
	 */
	if (unlikely(*ppos))
		return -ESPIPE;

	ret = spliced = 0;

	lock_sock(sk);

	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
	while (tss.len) {
		ret = __tcp_splice_read(sk, &tss);
		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (sock_flag(sk, SOCK_DONE))
				break;
			if (sk->sk_err) {
				ret = sock_error(sk);
				break;
			}
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;
			if (sk->sk_state == TCP_CLOSE) {
				/*
				 * This occurs when user tries to read
				 * from never connected socket.
				 */
				if (!sock_flag(sk, SOCK_DONE))
					ret = -ENOTCONN;
				break;
			}
			if (!timeo) {
				ret = -EAGAIN;
				break;
			}
			sk_wait_data(sk, &timeo);
			if (signal_pending(current)) {
				ret = sock_intr_errno(timeo);
				break;
			}
			continue;
		}
		tss.len -= ret;
		spliced += ret;

		if (!timeo)
			break;
		release_sock(sk);
		lock_sock(sk);

		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current))
			break;
	}

	release_sock(sk);

	if (spliced)
		return spliced;

	return ret;
}
EXPORT_SYMBOL(tcp_splice_read);

struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
{
	struct sk_buff *skb;

	/* The TCP header must be at least 32-bit aligned.  */
	size = ALIGN(size, 4);

	skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
	if (skb) {
		if (sk_wmem_schedule(sk, skb->truesize)) {
			skb_reserve(skb, sk->sk_prot->max_header);
			/*
			 * Make sure that we have exactly size bytes
			 * available to the caller, no more, no less.
			 */
			skb->reserved_tailroom = skb->end - skb->tail - size;
			return skb;
		}
		__kfree_skb(skb);
	} else {
		sk->sk_prot->enter_memory_pressure(sk);
		sk_stream_moderate_sndbuf(sk);
	}
	return NULL;
}

static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
				       int large_allowed)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 xmit_size_goal, old_size_goal;

	xmit_size_goal = mss_now;

	if (large_allowed && sk_can_gso(sk)) {
		u32 gso_size, hlen;

		/* Maybe we should/could use sk->sk_prot->max_header here ? */
		hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
		       inet_csk(sk)->icsk_ext_hdr_len +
		       tp->tcp_header_len;

		/* Goal is to send at least one packet per ms,
		 * not one big TSO packet every 100 ms.
		 * This preserves ACK clocking and is consistent
		 * with tcp_tso_should_defer() heuristic.
		 */
		gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
		gso_size = max_t(u32, gso_size,
				 sysctl_tcp_min_tso_segs * mss_now);

		xmit_size_goal = min_t(u32, gso_size,
				       sk->sk_gso_max_size - 1 - hlen);

		xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);

		/* We try hard to avoid divides here */
		old_size_goal = tp->xmit_size_goal_segs * mss_now;

		if (likely(old_size_goal <= xmit_size_goal &&
			   old_size_goal + mss_now > xmit_size_goal)) {
			xmit_size_goal = old_size_goal;
		} else {
			tp->xmit_size_goal_segs =
				min_t(u16, xmit_size_goal / mss_now,
				      sk->sk_gso_max_segs);
			xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
		}
	}

	return max(xmit_size_goal, mss_now);
}

static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
{
	int mss_now;

	mss_now = tcp_current_mss(sk);
	*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));

	return mss_now;
}

static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
				size_t size, int flags)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int mss_now, size_goal;
	int err;
	ssize_t copied;
	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	/* Wait for a connection to finish. One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before a connection
	 * is fully established.
	 */
	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
			goto out_err;
	}

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	mss_now = tcp_send_mss(sk, &size_goal, flags);
	copied = 0;

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto out_err;

	while (size > 0) {
		struct sk_buff *skb = tcp_write_queue_tail(sk);
		int copy, i;
		bool can_coalesce;

		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
new_segment:
			if (!sk_stream_memory_free(sk))
				goto wait_for_sndbuf;

			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
			if (!skb)
				goto wait_for_memory;

			skb_entail(sk, skb);
			copy = size_goal;
		}

		if (copy > size)
			copy = size;

		i = skb_shinfo(skb)->nr_frags;
		can_coalesce = skb_can_coalesce(skb, i, page, offset);
		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
			tcp_mark_push(tp, skb);
			goto new_segment;
		}
		if (!sk_wmem_schedule(sk, copy))
			goto wait_for_memory;

		if (can_coalesce) {
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
		} else {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, copy);
		}
		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;

		skb->len += copy;
		skb->data_len += copy;
		skb->truesize += copy;
		sk->sk_wmem_queued += copy;
		sk_mem_charge(sk, copy);
		skb->ip_summed = CHECKSUM_PARTIAL;
		tp->write_seq += copy;
		TCP_SKB_CB(skb)->end_seq += copy;
		skb_shinfo(skb)->gso_segs = 0;

		if (!copied)
			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

		copied += copy;
		offset += copy;
		if (!(size -= copy))
			goto out;

		if (skb->len < size_goal || (flags & MSG_OOB))
			continue;

		if (forced_push(tp)) {
			tcp_mark_push(tp, skb);
			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
		} else if (skb == tcp_send_head(sk))
			tcp_push_one(sk, mss_now);
		continue;

wait_for_sndbuf:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
		tcp_push(sk, flags & ~MSG_MORE, mss_now,
			 TCP_NAGLE_PUSH, size_goal);

		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
			goto do_error;

		mss_now = tcp_send_mss(sk, &size_goal, flags);
	}

out:
	if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
	return copied;

do_error:
	if (copied)
		goto out;
out_err:
	return sk_stream_error(sk, flags, err);
}

int tcp_sendpage(struct sock *sk, struct page *page, int offset,
		 size_t size, int flags)
{
	ssize_t res;

	if (!(sk->sk_route_caps & NETIF_F_SG) ||
	    !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
		return sock_no_sendpage(sk->sk_socket, page, offset, size,
					flags);

	lock_sock(sk);
	res = do_tcp_sendpages(sk, page, offset, size, flags);
	release_sock(sk);
	return res;
}
EXPORT_SYMBOL(tcp_sendpage);

static inline int select_size(const struct sock *sk, bool sg)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	int tmp = tp->mss_cache;

	if (sg) {
		if (sk_can_gso(sk)) {
			/* Small frames wont use a full page:
			 * Payload will immediately follow tcp header.
			 */
			tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
		} else {
			int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);

			if (tmp >= pgbreak &&
			    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
				tmp = pgbreak;
		}
	}

	return tmp;
}

void tcp_free_fastopen_req(struct tcp_sock *tp)
{
	if (tp->fastopen_req != NULL) {
		kfree(tp->fastopen_req);
		tp->fastopen_req = NULL;
	}
}

static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
				int *copied, size_t size)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int err, flags;

	if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
		return -EOPNOTSUPP;
	if (tp->fastopen_req != NULL)
		return -EALREADY; /* Another Fast Open is in progress */

	tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
				   sk->sk_allocation);
	if (unlikely(tp->fastopen_req == NULL))
		return -ENOBUFS;
	tp->fastopen_req->data = msg;
	tp->fastopen_req->size = size;

	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
	err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
				    msg->msg_namelen, flags);
	*copied = tp->fastopen_req->copied;
	tcp_free_fastopen_req(tp);
	return err;
}

int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		size_t size)
{
	struct iovec *iov;
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int iovlen, flags, err, copied = 0;
	int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
	bool sg;
	long timeo;

	lock_sock(sk);

	flags = msg->msg_flags;
	if (flags & MSG_FASTOPEN) {
		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
		if (err == -EINPROGRESS && copied_syn > 0)
			goto out;
		else if (err)
			goto out_err;
		offset = copied_syn;
	}

	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	/* Wait for a connection to finish. One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before a connection
	 * is fully established.
	 */
	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
			goto do_error;
	}

	if (unlikely(tp->repair)) {
		if (tp->repair_queue == TCP_RECV_QUEUE) {
			copied = tcp_send_rcvq(sk, msg, size);
			goto out;
		}

		err = -EINVAL;
		if (tp->repair_queue == TCP_NO_QUEUE)
			goto out_err;

		/* 'common' sending to sendq */
	}

	/* This should be in poll */
	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	mss_now = tcp_send_mss(sk, &size_goal, flags);

	/* Ok commence sending. */
	iovlen = msg->msg_iovlen;
	iov = msg->msg_iov;
	copied = 0;

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto out_err;

	sg = !!(sk->sk_route_caps & NETIF_F_SG);

	while (--iovlen >= 0) {
		size_t seglen = iov->iov_len;
		unsigned char __user *from = iov->iov_base;

		iov++;
		if (unlikely(offset > 0)) {  /* Skip bytes copied in SYN */
			if (offset >= seglen) {
				offset -= seglen;
				continue;
			}
			seglen -= offset;
			from += offset;
			offset = 0;
		}

		while (seglen > 0) {
			int copy = 0;
			int max = size_goal;

			skb = tcp_write_queue_tail(sk);
			if (tcp_send_head(sk)) {
				if (skb->ip_summed == CHECKSUM_NONE)
					max = mss_now;
				copy = max - skb->len;
			}

			if (copy <= 0) {
new_segment:
				/* Allocate new segment. If the interface is SG,
				 * allocate skb fitting to single page.
				 */
				if (!sk_stream_memory_free(sk))
					goto wait_for_sndbuf;

				skb = sk_stream_alloc_skb(sk,
							  select_size(sk, sg),
							  sk->sk_allocation);
				if (!skb)
					goto wait_for_memory;

				/*
				 * All packets are restored as if they have
				 * already been sent.
				 */
				if (tp->repair)
					TCP_SKB_CB(skb)->when = tcp_time_stamp;

				/*
				 * Check whether we can use HW checksum.
				 */
				if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
					skb->ip_summed = CHECKSUM_PARTIAL;

				skb_entail(sk, skb);
				copy = size_goal;
				max = size_goal;
			}

			/* Try to append data to the end of skb. */
			if (copy > seglen)
				copy = seglen;

			/* Where to copy to? */
			if (skb_availroom(skb) > 0) {
				/* We have some space in skb head. Superb! */
				copy = min_t(int, copy, skb_availroom(skb));
				err = skb_add_data_nocache(sk, skb, from, copy);
				if (err)
					goto do_fault;
			} else {
				bool merge = true;
				int i = skb_shinfo(skb)->nr_frags;
				struct page_frag *pfrag = sk_page_frag(sk);

				if (!sk_page_frag_refill(sk, pfrag))
					goto wait_for_memory;

				if (!skb_can_coalesce(skb, i, pfrag->page,
						      pfrag->offset)) {
					if (i == MAX_SKB_FRAGS || !sg) {
						tcp_mark_push(tp, skb);
						goto new_segment;
					}
					merge = false;
				}

				copy = min_t(int, copy, pfrag->size - pfrag->offset);

				if (!sk_wmem_schedule(sk, copy))
					goto wait_for_memory;

				err = skb_copy_to_page_nocache(sk, from, skb,
							       pfrag->page,
							       pfrag->offset,
							       copy);
				if (err)
					goto do_error;

				/* Update the skb. */
				if (merge) {
					skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
				} else {
					skb_fill_page_desc(skb, i, pfrag->page,
							   pfrag->offset, copy);
					get_page(pfrag->page);
				}
				pfrag->offset += copy;
			}

			if (!copied)
				TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

			tp->write_seq += copy;
			TCP_SKB_CB(skb)->end_seq += copy;
			skb_shinfo(skb)->gso_segs = 0;

			from += copy;
			copied += copy;
			if ((seglen -= copy) == 0 && iovlen == 0)
				goto out;

			if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
				continue;

			if (forced_push(tp)) {
				tcp_mark_push(tp, skb);
				__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
			} else if (skb == tcp_send_head(sk))
				tcp_push_one(sk, mss_now);
			continue;

wait_for_sndbuf:
			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
			if (copied)
				tcp_push(sk, flags & ~MSG_MORE, mss_now,
					 TCP_NAGLE_PUSH, size_goal);

			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				goto do_error;

			mss_now = tcp_send_mss(sk, &size_goal, flags);
		}
	}

out:
	if (copied)
		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
	release_sock(sk);
	return copied + copied_syn;

do_fault:
	if (!skb->len) {
		tcp_unlink_write_queue(skb, sk);
		/* It is the one place in all of TCP, except connection
		 * reset, where we can be unlinking the send_head.
		 */
		tcp_check_send_head(sk, skb);
		sk_wmem_free_skb(sk, skb);
	}

do_error:
	if (copied + copied_syn)
		goto out;
out_err:
	err = sk_stream_error(sk, flags, err);
	release_sock(sk);
	return err;
}
EXPORT_SYMBOL(tcp_sendmsg);

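/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 */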
static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* No URG data to read. */
	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
	    tp->urg_data == TCP_URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
		return -ENOTCONN;

	if (tp->urg_data & TCP_URG_VALID) {
		int err = 0;
		char c = tp->urg_data;

		if (!(flags & MSG_PEEK))
			tp->urg_data = TCP_URG_READ;

		/* Read urgent data. */
		msg->msg_flags |= MSG_OOB;

		if (len > 0) {
			if (!(flags & MSG_TRUNC))
				err = memcpy_toiovec(msg->msg_iov, &c, 1);
			len = 1;
		} else
			msg->msg_flags |= MSG_TRUNC;

		return err ? -EFAULT : len;
	}

	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
		return 0;

	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 */
	return -EAGAIN;
}

static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
{
	struct sk_buff *skb;
	int copied = 0, err = 0;

	/* XXX -- need to support SO_PEEK_OFF */

	skb_queue_walk(&sk->sk_write_queue, skb) {
		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len);
		if (err)
			break;

		copied += skb->len;
	}

	return err ?: copied;
}

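/* Clean up the receive buffer for full frames taken by the user,
 * then send an ACK if necessary.  COPIED is the number of bytes
 * tcp_recvmsg has given to the user so far, it speeds up the
 * calculation of whether or not we must ACK for the sake of
 * a window update.
 */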
void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
	struct tcp_sock *tp = tcp_sk(sk);
	bool time_to_ack = false;

	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);

	if (inet_csk_ack_scheduled(sk)) {
		const struct inet_connection_sock *icsk = inet_csk(sk);
		   /* Delayed ACKs frequently hit locked sockets during bulk
		    * receive. */
		if (icsk->icsk_ack.blocked ||
		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
		    /*
		     * If this read emptied read buffer, we send ACK, if
		     * connection is not bidirectional, user drained
		     * receive buffer and there was a small segment
		     * in queue.
		     */
		    (copied > 0 &&
		     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
		      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
		       !icsk->icsk_ack.pingpong)) &&
		      !atomic_read(&sk->sk_rmem_alloc)))
			time_to_ack = true;
	}

	/* We send an ACK if we can now advertise a non-zero window
	 * which has been raised "significantly".
	 *
	 * Even if window raised up to infinity, do not send window open ACK
	 * in states, where we will not receive more. It is useless.
	 */
	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
		__u32 rcv_window_now = tcp_receive_window(tp);

		/* Optimize, __tcp_select_window() is not cheap. */
		if (2*rcv_window_now <= tp->window_clamp) {
			__u32 new_window = __tcp_select_window(sk);

			/* Send ACK now, if this read freed lots of space
			 * in our buffer. Certainly, new_window is new window.
			 * We can advertise it now, if it is not less than
			 * current one. "Lots" means "at least twice" here.
			 */
			if (new_window && new_window >= 2 * rcv_window_now)
				time_to_ack = true;
		}
	}
	if (time_to_ack)
		tcp_send_ack(sk);
}

static void tcp_prequeue_process(struct sock *sk)
{
	struct sk_buff *skb;
	struct tcp_sock *tp = tcp_sk(sk);

	NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED);

	/* RX process wants to run with disabled BHs, though it is not
	 * necessary */
	local_bh_disable();
	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
		sk_backlog_rcv(sk, skb);
	local_bh_enable();

	/* Clear memory counter. */
	tp->ucopy.memory = 0;
}

#ifdef CONFIG_NET_DMA
static void tcp_service_net_dma(struct sock *sk, bool wait)
{
	dma_cookie_t done, used;
	dma_cookie_t last_issued;
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tp->ucopy.dma_chan)
		return;

	last_issued = tp->ucopy.dma_cookie;
	dma_async_issue_pending(tp->ucopy.dma_chan);

	do {
		if (dma_async_is_tx_complete(tp->ucopy.dma_chan,
					     last_issued, &done,
					     &used) == DMA_COMPLETE) {
			/* Safe to free early-copied skbs now */
			__skb_queue_purge(&sk->sk_async_wait_queue);
			break;
		} else {
			struct sk_buff *skb;
			while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
			       (dma_async_is_complete(skb->dma_cookie, done,
						      used) == DMA_COMPLETE)) {
				__skb_dequeue(&sk->sk_async_wait_queue);
				kfree_skb(skb);
			}
		}
	} while (wait);
}
#endif

static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
	struct sk_buff *skb;
	u32 offset;

	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
		offset = seq - TCP_SKB_CB(skb)->seq;
		if (tcp_hdr(skb)->syn)
			offset--;
		if (offset < skb->len || tcp_hdr(skb)->fin) {
			*off = offset;
			return skb;
		}
		/* This looks weird, but this can happen if TCP collapsing
		 * splitted a fat GRO packet, while we released socket lock
		 * in skb_splice_bits()
		 */
		sk_eat_skb(sk, skb, false);
	}
	return NULL;
}

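/*
 *	This routine provides an alternative to tcp_recvmsg() for routines
 *	that would like to handle copying from skbuffs directly in 'sendfile'
 *	fashion.
 *	Note:
 *	  - It is assumed that the socket was locked by the caller.
 *	  - The routine does not block.
 *	  - At present, there is no support for reading OOB data
 *	    or for 'peeking' the socket using this routine
 *	    (although both would be easy to implement).
 */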
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
		  sk_read_actor_t recv_actor)
{
	struct sk_buff *skb;
	struct tcp_sock *tp = tcp_sk(sk);
	u32 seq = tp->copied_seq;
	u32 offset;
	int copied = 0;

	if (sk->sk_state == TCP_LISTEN)
		return -ENOTCONN;
	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
		if (offset < skb->len) {
			int used;
			size_t len;

			len = skb->len - offset;
			/* Stop reading if we hit a patch of urgent data */
			if (tp->urg_data) {
				u32 urg_offset = tp->urg_seq - seq;
				if (urg_offset < len)
					len = urg_offset;
				if (!len)
					break;
			}
			used = recv_actor(desc, skb, offset, len);
			if (used <= 0) {
				if (!copied)
					copied = used;
				break;
			} else if (used <= len) {
				seq += used;
				copied += used;
				offset += used;
			}
			/* If recv_actor drops the lock (e.g. TCP splice
			 * receive) the skb pointer might be invalid when
			 * getting here: tcp_collapse might have deleted it
			 * while aggregating skbs from the socket queue.
			 */
			skb = tcp_recv_skb(sk, seq - 1, &offset);
			if (!skb)
				break;
			/* TCP coalescing might have appended data to the skb.
			 * Try to splice more frags
			 */
			if (offset + 1 != skb->len)
				continue;
		}
		if (tcp_hdr(skb)->fin) {
			sk_eat_skb(sk, skb, false);
			++seq;
			break;
		}
		sk_eat_skb(sk, skb, false);
		if (!desc->count)
			break;
		tp->copied_seq = seq;
	}
	tp->copied_seq = seq;

	tcp_rcv_space_adjust(sk);

	/* Clean up data we have read: This will do ACK frames. */
	if (copied > 0) {
		tcp_recv_skb(sk, seq, &offset);
		tcp_cleanup_rbuf(sk, copied);
	}
	return copied;
}
EXPORT_SYMBOL(tcp_read_sock);

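/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	Technical note: in 2.3 we work on _locked_ socket, so that
 *	tricks with *seq access order and skb->users are not required.
 *	Probably, code can be easily improved even more.
 */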
int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		size_t len, int nonblock, int flags, int *addr_len)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int copied = 0;
	u32 peek_seq;
	u32 *seq;
	unsigned long used;
	int err;
	int target;		/* Read at least this many bytes */
	long timeo;
	struct task_struct *user_recv = NULL;
	bool copied_early = false;
	struct sk_buff *skb;
	u32 urg_hole = 0;

	if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
	    (sk->sk_state == TCP_ESTABLISHED))
		sk_busy_loop(sk, nonblock);

	lock_sock(sk);

	err = -ENOTCONN;
	if (sk->sk_state == TCP_LISTEN)
		goto out;

	timeo = sock_rcvtimeo(sk, nonblock);

	/* Urgent data needs to be handled specially. */
	if (flags & MSG_OOB)
		goto recv_urg;

	if (unlikely(tp->repair)) {
		err = -EPERM;
		if (!(flags & MSG_PEEK))
			goto out;

		if (tp->repair_queue == TCP_SEND_QUEUE)
			goto recv_sndq;

		err = -EINVAL;
		if (tp->repair_queue == TCP_NO_QUEUE)
			goto out;

		/* 'common' recv queue MSG_PEEK-ing */
	}

	seq = &tp->copied_seq;
	if (flags & MSG_PEEK) {
		peek_seq = tp->copied_seq;
		seq = &peek_seq;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);

#ifdef CONFIG_NET_DMA
	tp->ucopy.dma_chan = NULL;
	preempt_disable();
	skb = skb_peek_tail(&sk->sk_receive_queue);
	{
		int available = 0;

		if (skb)
			available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
		if ((available < target) &&
		    (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
		    !sysctl_tcp_low_latency &&
		    net_dma_find_channel()) {
			preempt_enable();
			tp->ucopy.pinned_list =
					dma_pin_iovec_pages(msg->msg_iov, len);
		} else {
			preempt_enable();
		}
	}
#endif

	do {
		u32 offset;

		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
		if (tp->urg_data && tp->urg_seq == *seq) {
			if (copied)
				break;
			if (signal_pending(current)) {
				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
				break;
			}
		}

		/* Next get a buffer. */

		skb_queue_walk(&sk->sk_receive_queue, skb) {
			/* Now that we have two receive queues this
			 * shouldn't happen.
			 */
			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
				 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
				 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
				 flags))
				break;

			offset = *seq - TCP_SKB_CB(skb)->seq;
			if (tcp_hdr(skb)->syn)
				offset--;
			if (offset < skb->len)
				goto found_ok_skb;
			if (tcp_hdr(skb)->fin)
				goto found_fin_ok;
			WARN(!(flags & MSG_PEEK),
			     "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
		}

		/* Well, if we have backlog, try to process it now yet. */

		if (copied >= target && !sk->sk_backlog.tail)
			break;

		if (copied) {
			if (sk->sk_err ||
			    sk->sk_state == TCP_CLOSE ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    !timeo ||
			    signal_pending(current))
				break;
		} else {
			if (sock_flag(sk, SOCK_DONE))
				break;

			if (sk->sk_err) {
				copied = sock_error(sk);
				break;
			}

			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;

			if (sk->sk_state == TCP_CLOSE) {
				if (!sock_flag(sk, SOCK_DONE)) {
					/* This occurs when user tries to read
					 * from never connected socket.
					 */
					copied = -ENOTCONN;
					break;
				}
				break;
			}

			if (!timeo) {
				copied = -EAGAIN;
				break;
			}

			if (signal_pending(current)) {
				copied = sock_intr_errno(timeo);
				break;
			}
		}

		tcp_cleanup_rbuf(sk, copied);

		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
			/* Install new reader */
			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
				user_recv = current;
				tp->ucopy.task = user_recv;
				tp->ucopy.iov = msg->msg_iov;
			}

			tp->ucopy.len = len;

			WARN_ON(tp->copied_seq != tp->rcv_nxt &&
				!(flags & (MSG_PEEK | MSG_TRUNC)));

			/* Ugly... If prequeue is not empty, we have to
			 * process it before releasing socket, otherwise
			 * order will be broken at second iteration.
			 * More elegant solution is required!!!
			 *
			 * Look: we have the following (pseudo)queues:
			 *
			 * 1. packets in flight
			 * 2. backlog
			 * 3. prequeue
			 * 4. receive_queue
			 *
			 * Each queue can be processed only if the next ones
			 * are empty. At this point we have empty receive_queue.
			 * But prequeue _can_ be not empty after 2nd iteration,
			 * when we jumped to start of loop because backlog
			 * processing added something to receive_queue.
			 * We cannot release_sock(), because backlog contains
			 * packets arrived _after_ prequeued ones.
			 *
			 * Shortly, algorithm is clear --- to process all
			 * the queues in order. We could make it more directly,
			 * requeueing packets from backlog to prequeue, if
			 * is not empty. It is more elegant, but eats cycles,
			 * unfortunately.
			 */
			if (!skb_queue_empty(&tp->ucopy.prequeue))
				goto do_prequeue;

			/* __ Set realtime policy in scheduler __ */
		}

#ifdef CONFIG_NET_DMA
		if (tp->ucopy.dma_chan) {
			if (tp->rcv_wnd == 0 &&
			    !skb_queue_empty(&sk->sk_async_wait_queue)) {
				tcp_service_net_dma(sk, true);
				tcp_cleanup_rbuf(sk, copied);
			} else
				dma_async_issue_pending(tp->ucopy.dma_chan);
		}
#endif
		if (copied >= target) {
			/* Do not sleep, just process backlog. */
			release_sock(sk);
			lock_sock(sk);
		} else
			sk_wait_data(sk, &timeo);

#ifdef CONFIG_NET_DMA
		tcp_service_net_dma(sk, false);  /* Don't block */
		tp->ucopy.wakeup = 0;
#endif

		if (user_recv) {
			int chunk;

			/* __ Restore normal policy in scheduler __ */

			if ((chunk = len - tp->ucopy.len) != 0) {
				NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
				len -= chunk;
				copied += chunk;
			}

			if (tp->rcv_nxt == tp->copied_seq &&
			    !skb_queue_empty(&tp->ucopy.prequeue)) {
do_prequeue:
				tcp_prequeue_process(sk);

				if ((chunk = len - tp->ucopy.len) != 0) {
					NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
					len -= chunk;
					copied += chunk;
				}
			}
		}
		if ((flags & MSG_PEEK) &&
		    (peek_seq - copied - urg_hole != tp->copied_seq)) {
			net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
					    current->comm,
					    task_pid_nr(current));
			peek_seq = tp->copied_seq;
		}
		continue;

	found_ok_skb:
		/* Ok so how much can we use? */
		used = skb->len - offset;
		if (len < used)
			used = len;

		/* Do we have urgent data here? */
		if (tp->urg_data) {
			u32 urg_offset = tp->urg_seq - *seq;
			if (urg_offset < used) {
				if (!urg_offset) {
					if (!sock_flag(sk, SOCK_URGINLINE)) {
						++*seq;
						urg_hole++;
						offset++;
						used--;
						if (!used)
							goto skip_copy;
					}
				} else
					used = urg_offset;
			}
		}

		if (!(flags & MSG_TRUNC)) {
#ifdef CONFIG_NET_DMA
			if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
				tp->ucopy.dma_chan = net_dma_find_channel();

			if (tp->ucopy.dma_chan) {
				tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
					tp->ucopy.dma_chan, skb, offset,
					msg->msg_iov, used,
					tp->ucopy.pinned_list);

				if (tp->ucopy.dma_cookie < 0) {

					pr_alert("%s: dma_cookie < 0\n",
						 __func__);

					/* Exception. Bailout! */
					if (!copied)
						copied = -EFAULT;
					break;
				}

				dma_async_issue_pending(tp->ucopy.dma_chan);

				if ((offset + used) == skb->len)
					copied_early = true;

			} else
#endif
			{
				err = skb_copy_datagram_iovec(skb, offset,
						msg->msg_iov, used);
				if (err) {
					/* Exception. Bailout! */
					if (!copied)
						copied = -EFAULT;
					break;
				}
			}
		}

		*seq += used;
		copied += used;
		len -= used;

		tcp_rcv_space_adjust(sk);

skip_copy:
		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
			tp->urg_data = 0;
			tcp_fast_path_check(sk);
		}
		if (used + offset < skb->len)
			continue;

		if (tcp_hdr(skb)->fin)
			goto found_fin_ok;
		if (!(flags & MSG_PEEK)) {
			sk_eat_skb(sk, skb, copied_early);
			copied_early = false;
		}
		continue;

	found_fin_ok:
		/* Process the FIN. */
		++*seq;
		if (!(flags & MSG_PEEK)) {
			sk_eat_skb(sk, skb, copied_early);
			copied_early = false;
		}
		break;
	} while (len > 0);

	if (user_recv) {
		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
			int chunk;

			tp->ucopy.len = copied > 0 ? len : 0;

			tcp_prequeue_process(sk);

			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
				NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
				len -= chunk;
				copied += chunk;
			}
		}

		tp->ucopy.task = NULL;
		tp->ucopy.len = 0;
	}

#ifdef CONFIG_NET_DMA
	tcp_service_net_dma(sk, true);	/* Wait for queue to drain */
	tp->ucopy.dma_chan = NULL;

	if (tp->ucopy.pinned_list) {
		dma_unpin_iovec_pages(tp->ucopy.pinned_list);
		tp->ucopy.pinned_list = NULL;
	}
#endif

	/* According to UNIX98, msg_name/msg_namelen are ignored
	 * on connected socket. I was just happy when found this 8) --ANK
	 */

	/* Clean up data we have read: This will do ACK frames. */
	tcp_cleanup_rbuf(sk, copied);

	release_sock(sk);
	return copied;

out:
	release_sock(sk);
	return err;

recv_urg:
	err = tcp_recv_urg(sk, msg, len, flags);
	goto out;

recv_sndq:
	err = tcp_peek_sndq(sk, msg, len);
	goto out;
}
EXPORT_SYMBOL(tcp_recvmsg);

void tcp_set_state(struct sock *sk, int state)
{
	int oldstate = sk->sk_state;

	switch (state) {
	case TCP_ESTABLISHED:
		if (oldstate != TCP_ESTABLISHED)
			TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
		break;

	case TCP_CLOSE:
		if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
			TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);

		sk->sk_prot->unhash(sk);
		if (inet_csk(sk)->icsk_bind_hash &&
		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
			inet_put_port(sk);
		/* fall through */
	default:
		if (oldstate == TCP_ESTABLISHED)
			TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
	}

	/* Change state AFTER socket is unhashed to avoid closed
	 * socket sitting in hash tables.
	 */
	sk->sk_state = state;

#ifdef STATE_TRACE
	SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
#endif
}
EXPORT_SYMBOL_GPL(tcp_set_state);

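/*
 *	State processing on a close. This implements the state shift for
 *	sending our FIN frame. Note that we only send a FIN for some
 *	states. A shutdown() may have already sent the FIN, or we may be
 *	closed.
 */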
static const unsigned char new_state[16] = {
  /* current state:        new state:      action:	*/
  /* (Invalid)		*/ TCP_CLOSE,
  /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  /* TCP_SYN_SENT	*/ TCP_CLOSE,
  /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
  /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
  /* TCP_TIME_WAIT	*/ TCP_CLOSE,
  /* TCP_CLOSE		*/ TCP_CLOSE,
  /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK | TCP_ACTION_FIN,
  /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
  /* TCP_LISTEN		*/ TCP_CLOSE,
  /* TCP_CLOSING	*/ TCP_CLOSING,
};

static int tcp_close_state(struct sock *sk)
{
	int next = (int)new_state[sk->sk_state];
	int ns = next & TCP_STATE_MASK;

	tcp_set_state(sk, ns);

	return next & TCP_ACTION_FIN;
}

/*
 *	Shutdown the sending side of a connection. Much like close except
 *	that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
 */
void tcp_shutdown(struct sock *sk, int how)
{
	/*	We need to grab some memory, and put together a FIN,
	 *	and then put it into the queue to be sent.
	 */
	if (!(how & SEND_SHUTDOWN))
		return;

	/* If we've already sent a FIN, or it's a closed state, skip this. */
	if ((1 << sk->sk_state) &
	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
		/* Clear out any half completed packets.  FIN if needed. */
		if (tcp_close_state(sk))
			tcp_send_fin(sk);
	}
}
EXPORT_SYMBOL(tcp_shutdown);

bool tcp_check_oom(struct sock *sk, int shift)
{
	bool too_many_orphans, out_of_socket_memory;

	too_many_orphans = tcp_too_many_orphans(sk, shift);
	out_of_socket_memory = tcp_out_of_memory(sk);

	if (too_many_orphans)
		net_info_ratelimited("too many orphaned sockets\n");
	if (out_of_socket_memory)
		net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
	return too_many_orphans || out_of_socket_memory;
}

void tcp_close(struct sock *sk, long timeout)
{
	struct sk_buff *skb;
	int data_was_unread = 0;
	int state;

	lock_sock(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;

	if (sk->sk_state == TCP_LISTEN) {
		tcp_set_state(sk, TCP_CLOSE);

		/* Special case. */
		inet_csk_listen_stop(sk);

		goto adjudge_to_death;
	}

	/*  We need to flush the recv. buffs.  We do this only on the
	 *  descriptor close, not protocol-sourced closes, because the
	 *  reader process may not have drained the data yet!
	 */
	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
			  tcp_hdr(skb)->fin;
		data_was_unread += len;
		__kfree_skb(skb);
	}

	sk_mem_reclaim(sk);

	/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
	if (sk->sk_state == TCP_CLOSE)
		goto adjudge_to_death;

	/* As outlined in RFC 2525, section 2.17, we send a RST here because
	 * data was lost. To witness the awful effects of the old behavior of
	 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
	 * GET in an FTP client, suspend the process, wait for the client to
	 * advertise a zero window, then kill -9 the FTP client, wheee...
	 * Note: timeout is always zero in such a case.
	 */
	if (unlikely(tcp_sk(sk)->repair)) {
		sk->sk_prot->disconnect(sk, 0);
	} else if (data_was_unread) {
		/* Unread data was tossed, zap the connection. */
		NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
		tcp_set_state(sk, TCP_CLOSE);
		tcp_send_active_reset(sk, sk->sk_allocation);
	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
		/* Check zero linger _after_ checking for unread data. */
		sk->sk_prot->disconnect(sk, 0);
		NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
	} else if (tcp_close_state(sk)) {
		/* We FIN if the application ate all the data before
		 * zapping the connection.
		 */

		/* RED-PEN. Formally speaking, we have broken TCP state
		 * machine. State transitions:
		 *
		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
		 *
		 * are legal only when FIN has been sent (i.e. in window),
		 * rather than queued out of window. Purists blame.
		 *
		 * F.e. "RFC state" is ESTABLISHED,
		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
		 *
		 * The visible declinations are that sometimes
		 * we enter time-wait state, when it is not required really
		 * (harmless), do not send active resets, when they are
		 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
		 * they look as CLOSING or LAST_ACK for Linux)
		 * Probably, I missed some more holelets.
		 * 						--ANK
		 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
		 * in a single packet! (May consider it later but will
		 * probably need API support or TCP_CORK SYN-ACK until
		 * data is delivered above.)
		 */
		tcp_send_fin(sk);
	}

	sk_stream_wait_close(sk, timeout);

adjudge_to_death:
	state = sk->sk_state;
	sock_hold(sk);
	sock_orphan(sk);

	/* It is the last release_sock in its life. It will remove backlog. */
	release_sock(sk);

	/* Now socket is owned by kernel and we acquire BH lock
	   to finish close. No need to check for user refs.
	 */
	local_bh_disable();
	bh_lock_sock(sk);
	WARN_ON(sock_owned_by_user(sk));

	percpu_counter_inc(sk->sk_prot->orphan_count);

	/* Have we already been destroyed by a softirq or backlog? */
	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
		goto out;

	/*	This is a (useful) BSD violating of the RFC. There is a
	 *	problem with TCP as specified in that the other end could keep
	 *	a socket open forever with no application left this end.
	 *	We use a 1 minute timeout (about the same as BSD) then kill
	 *	our end. If they send after that then tough - BUT: long enough
	 *	that we won't make the old 4*rto = almost no time - whoops
	 *	reset mistake.
	 *
	 *	Nope, it was not mistake. It is really desired behaviour
	 *	f.e. on http servers, when such sockets are useless, but
	 *	consume significant resources. Let's do it with special
	 *	linger2	option.					--ANK
	 */

	if (sk->sk_state == TCP_FIN_WAIT2) {
		struct tcp_sock *tp = tcp_sk(sk);
		if (tp->linger2 < 0) {
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC);
			NET_INC_STATS_BH(sock_net(sk),
					LINUX_MIB_TCPABORTONLINGER);
		} else {
			const int tmo = tcp_fin_time(sk);

			if (tmo > TCP_TIMEWAIT_LEN) {
				inet_csk_reset_keepalive_timer(sk,
						tmo - TCP_TIMEWAIT_LEN);
			} else {
				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				goto out;
			}
		}
	}
	if (sk->sk_state != TCP_CLOSE) {
		sk_mem_reclaim(sk);
		if (tcp_check_oom(sk, 0)) {
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC);
			NET_INC_STATS_BH(sock_net(sk),
					LINUX_MIB_TCPABORTONMEMORY);
		}
	}

	if (sk->sk_state == TCP_CLOSE) {
		struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
		/* We could get here with a non-NULL req if the socket is
		 * aborted (e.g., closed with unread data) before 3WHS
		 * finishes.
		 */
		if (req != NULL)
			reqsk_fastopen_remove(sk, req, false);
		inet_csk_destroy_sock(sk);
	}
	/* Otherwise, socket is reprieved until protocol close. */

out:
	bh_unlock_sock(sk);
	local_bh_enable();
	sock_put(sk);
}
EXPORT_SYMBOL(tcp_close);

/* These states need RST on ABORT according to RFC793 */

static inline bool tcp_need_reset(int state)
{
	return (1 << state) &
	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
}

int tcp_disconnect(struct sock *sk, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;
	int old_state = sk->sk_state;

	if (old_state != TCP_CLOSE)
		tcp_set_state(sk, TCP_CLOSE);

	/* ABORT function of RFC793 */
	if (old_state == TCP_LISTEN) {
		inet_csk_listen_stop(sk);
	} else if (unlikely(tp->repair)) {
		sk->sk_err = ECONNABORTED;
	} else if (tcp_need_reset(old_state) ||
		   (tp->snd_nxt != tp->write_seq &&
		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
		/* The last check adjusts for discrepancy of Linux wrt. RFC
		 * states
		 */
		tcp_send_active_reset(sk, gfp_any());
		sk->sk_err = ECONNRESET;
	} else if (old_state == TCP_SYN_SENT)
		sk->sk_err = ECONNRESET;

	tcp_clear_xmit_timers(sk);
	__skb_queue_purge(&sk->sk_receive_queue);
	tcp_write_queue_purge(sk);
	__skb_queue_purge(&tp->out_of_order_queue);
#ifdef CONFIG_NET_DMA
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	inet->inet_dport = 0;

	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
		inet_reset_saddr(sk);

	sk->sk_shutdown = 0;
	sock_reset_flag(sk, SOCK_DONE);
	tp->srtt_us = 0;
	if ((tp->write_seq += tp->max_window + 2) == 0)
		tp->write_seq = 1;
	icsk->icsk_backoff = 0;
	tp->snd_cwnd = 2;
	icsk->icsk_probes_out = 0;
	tp->packets_out = 0;
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_cnt = 0;
	tp->window_clamp = 0;
	tcp_set_ca_state(sk, TCP_CA_Open);
	tcp_clear_retrans(tp);
	inet_csk_delack_init(sk);
	tcp_init_send_head(sk);
	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
	__sk_dst_reset(sk);

	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);

	sk->sk_error_report(sk);
	return err;
}
EXPORT_SYMBOL(tcp_disconnect);

void tcp_sock_destruct(struct sock *sk)
{
	inet_sock_destruct(sk);

	kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
}

static inline bool tcp_can_repair_sock(const struct sock *sk)
{
	return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
		((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
}

static int tcp_repair_options_est(struct tcp_sock *tp,
		struct tcp_repair_opt __user *optbuf, unsigned int len)
{
	struct tcp_repair_opt opt;

	while (len >= sizeof(opt)) {
		if (copy_from_user(&opt, optbuf, sizeof(opt)))
			return -EFAULT;

		optbuf++;
		len -= sizeof(opt);

		switch (opt.opt_code) {
		case TCPOPT_MSS:
			tp->rx_opt.mss_clamp = opt.opt_val;
			break;
		case TCPOPT_WINDOW:
			{
				u16 snd_wscale = opt.opt_val & 0xFFFF;
				u16 rcv_wscale = opt.opt_val >> 16;

				if (snd_wscale > 14 || rcv_wscale > 14)
					return -EFBIG;

				tp->rx_opt.snd_wscale = snd_wscale;
				tp->rx_opt.rcv_wscale = rcv_wscale;
				tp->rx_opt.wscale_ok = 1;
			}
			break;
		case TCPOPT_SACK_PERM:
			if (opt.opt_val != 0)
				return -EINVAL;

			tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
			if (sysctl_tcp_fack)
				tcp_enable_fack(tp);
			break;
		case TCPOPT_TIMESTAMP:
			if (opt.opt_val != 0)
				return -EINVAL;

			tp->rx_opt.tstamp_ok = 1;
			break;
		}
	}

	return 0;
}

/*
 *	Socket option code for TCP.
 */
static int do_tcp_setsockopt(struct sock *sk, int level,
		int optname, char __user *optval, unsigned int optlen)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int val;
	int err = 0;

	/* These are data/string values, all the others are ints */
	switch (optname) {
	case TCP_CONGESTION: {
		char name[TCP_CA_NAME_MAX];

		if (optlen < 1)
			return -EINVAL;

		val = strncpy_from_user(name, optval,
					min_t(long, TCP_CA_NAME_MAX-1, optlen));
		if (val < 0)
			return -EFAULT;
		name[val] = 0;

		lock_sock(sk);
		err = tcp_set_congestion_control(sk, name);
		release_sock(sk);
		return err;
	}
	default:
		/* fallthru */
		break;
	}

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	lock_sock(sk);

	switch (optname) {
	case TCP_MAXSEG:
		/* Values greater than interface MTU won't take effect. However
		 * at the point when this call is done we typically don't yet
		 * know which interface is going to be used */
		if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
			err = -EINVAL;
			break;
		}
		tp->rx_opt.user_mss = val;
		break;

	case TCP_NODELAY:
		if (val) {
			/* TCP_NODELAY is weaker than TCP_CORK, so that
			 * this option on corked socket is remembered, but
			 * it is not activated until cork is cleared.
			 *
			 * However, when TCP_NODELAY is set we make
			 * an explicit push, which overrides even TCP_CORK
			 * for currently queued segments.
			 */
			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
			tcp_push_pending_frames(sk);
		} else {
			tp->nonagle &= ~TCP_NAGLE_OFF;
		}
		break;

	case TCP_THIN_LINEAR_TIMEOUTS:
		if (val < 0 || val > 1)
			err = -EINVAL;
		else
			tp->thin_lto = val;
		break;

	case TCP_THIN_DUPACK:
		if (val < 0 || val > 1)
			err = -EINVAL;
		else {
			tp->thin_dupack = val;
			if (tp->thin_dupack)
				tcp_disable_early_retrans(tp);
		}
		break;

	case TCP_REPAIR:
		if (!tcp_can_repair_sock(sk))
			err = -EPERM;
		else if (val == 1) {
			tp->repair = 1;
			sk->sk_reuse = SK_FORCE_REUSE;
			tp->repair_queue = TCP_NO_QUEUE;
		} else if (val == 0) {
			tp->repair = 0;
			sk->sk_reuse = SK_NO_REUSE;
			tcp_send_window_probe(sk);
		} else
			err = -EINVAL;

		break;

	case TCP_REPAIR_QUEUE:
		if (!tp->repair)
			err = -EPERM;
		else if (val < TCP_QUEUES_NR)
			tp->repair_queue = val;
		else
			err = -EINVAL;
		break;

	case TCP_QUEUE_SEQ:
		if (sk->sk_state != TCP_CLOSE)
			err = -EPERM;
		else if (tp->repair_queue == TCP_SEND_QUEUE)
			tp->write_seq = val;
		else if (tp->repair_queue == TCP_RECV_QUEUE)
			tp->rcv_nxt = val;
		else
			err = -EINVAL;
		break;

	case TCP_REPAIR_OPTIONS:
		if (!tp->repair)
			err = -EINVAL;
		else if (sk->sk_state == TCP_ESTABLISHED)
			err = tcp_repair_options_est(tp,
					(struct tcp_repair_opt __user *)optval,
					optlen);
		else
			err = -EPERM;
		break;

	case TCP_CORK:
		/* When set indicates to always queue non-full frames.
		 * Later the user clears this option and we transmit
		 * any pending partial frames in the queue.  This is
		 * meant to be used alongside sendfile() to get properly
		 * filled frames when the user (for example) must write
		 * out headers with a write() call first and then use
		 * sendfile to send out the data parts.
		 *
		 * TCP_CORK can be set together with TCP_NODELAY and it is
		 * stronger than TCP_NODELAY.
		 */
		if (val) {
			tp->nonagle |= TCP_NAGLE_CORK;
		} else {
			tp->nonagle &= ~TCP_NAGLE_CORK;
			if (tp->nonagle&TCP_NAGLE_OFF)
				tp->nonagle |= TCP_NAGLE_PUSH;
			tcp_push_pending_frames(sk);
		}
		break;

	case TCP_KEEPIDLE:
		if (val < 1 || val > MAX_TCP_KEEPIDLE)
			err = -EINVAL;
		else {
			tp->keepalive_time = val * HZ;
			if (sock_flag(sk, SOCK_KEEPOPEN) &&
			    !((1 << sk->sk_state) &
			      (TCPF_CLOSE | TCPF_LISTEN))) {
				u32 elapsed = keepalive_time_elapsed(tp);
				if (tp->keepalive_time > elapsed)
					elapsed = tp->keepalive_time - elapsed;
				else
					elapsed = 0;
				inet_csk_reset_keepalive_timer(sk, elapsed);
			}
		}
		break;
	case TCP_KEEPINTVL:
		if (val < 1 || val > MAX_TCP_KEEPINTVL)
			err = -EINVAL;
		else
			tp->keepalive_intvl = val * HZ;
		break;
	case TCP_KEEPCNT:
		if (val < 1 || val > MAX_TCP_KEEPCNT)
			err = -EINVAL;
		else
			tp->keepalive_probes = val;
		break;
	case TCP_SYNCNT:
		if (val < 1 || val > MAX_TCP_SYNCNT)
			err = -EINVAL;
		else
			icsk->icsk_syn_retries = val;
		break;

	case TCP_LINGER2:
		if (val < 0)
			tp->linger2 = -1;
		else if (val > sysctl_tcp_fin_timeout / HZ)
			tp->linger2 = 0;
		else
			tp->linger2 = val * HZ;
		break;

	case TCP_DEFER_ACCEPT:
		/* Translate value in seconds to number of retransmits */
		icsk->icsk_accept_queue.rskq_defer_accept =
			secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
					TCP_RTO_MAX / HZ);
		break;

	case TCP_WINDOW_CLAMP:
		if (!val) {
			if (sk->sk_state != TCP_CLOSE) {
				err = -EINVAL;
				break;
			}
			tp->window_clamp = 0;
		} else
			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
						SOCK_MIN_RCVBUF / 2 : val;
		break;

	case TCP_QUICKACK:
		if (!val) {
			icsk->icsk_ack.pingpong = 1;
		} else {
			icsk->icsk_ack.pingpong = 0;
			if ((1 << sk->sk_state) &
			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
			    inet_csk_ack_scheduled(sk)) {
				icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
				tcp_cleanup_rbuf(sk, 1);
				if (!(val & 1))
					icsk->icsk_ack.pingpong = 1;
			}
		}
		break;

#ifdef CONFIG_TCP_MD5SIG
	case TCP_MD5SIG:
		/* Read the IP->Key mappings from userspace */
		err = tp->af_specific->md5_parse(sk, optval, optlen);
		break;
#endif
	case TCP_USER_TIMEOUT:
		/* Cap the max timeout in ms TCP will retry/retrans
		 * before giving up and aborting (ETIMEDOUT) a connection.
		 */
		if (val < 0)
			err = -EINVAL;
		else
			icsk->icsk_user_timeout = msecs_to_jiffies(val);
		break;

	case TCP_FASTOPEN:
		if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
		    TCPF_LISTEN)))
			err = fastopen_init_queue(sk, val);
		else
			err = -EINVAL;
		break;
	case TCP_TIMESTAMP:
		if (!tp->repair)
			err = -EPERM;
		else
			tp->tsoffset = val - tcp_time_stamp;
		break;
	case TCP_NOTSENT_LOWAT:
		tp->notsent_lowat = val;
		sk->sk_write_space(sk);
		break;
	default:
		err = -ENOPROTOOPT;
		break;
	}

	release_sock(sk);
	return err;
}

int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
		   unsigned int optlen)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	if (level != SOL_TCP)
		return icsk->icsk_af_ops->setsockopt(sk, level, optname,
						     optval, optlen);
	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(tcp_setsockopt);
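
/* Usage sketch (userspace, illustrative only -- not part of this file):
 * a setsockopt() call on a TCP socket reaches do_tcp_setsockopt() above
 * through the generic socket layer, e.g. to disable Nagle's algorithm:
 *
 *	int one = 1;
 *	if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one)) < 0)
 *		perror("setsockopt(TCP_NODELAY)");
 */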

#ifdef CONFIG_COMPAT
int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	if (level != SOL_TCP)
		return inet_csk_compat_setsockopt(sk, level, optname,
						  optval, optlen);
	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_tcp_setsockopt);
#endif

/* Return information about state of tcp endpoint in API format. */
void tcp_get_info(const struct sock *sk, struct tcp_info *info)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	u32 now = tcp_time_stamp;

	memset(info, 0, sizeof(*info));

	info->tcpi_state = sk->sk_state;
	info->tcpi_ca_state = icsk->icsk_ca_state;
	info->tcpi_retransmits = icsk->icsk_retransmits;
	info->tcpi_probes = icsk->icsk_probes_out;
	info->tcpi_backoff = icsk->icsk_backoff;

	if (tp->rx_opt.tstamp_ok)
		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
	if (tcp_is_sack(tp))
		info->tcpi_options |= TCPI_OPT_SACK;
	if (tp->rx_opt.wscale_ok) {
		info->tcpi_options |= TCPI_OPT_WSCALE;
		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
	}

	if (tp->ecn_flags & TCP_ECN_OK)
		info->tcpi_options |= TCPI_OPT_ECN;
	if (tp->ecn_flags & TCP_ECN_SEEN)
		info->tcpi_options |= TCPI_OPT_ECN_SEEN;
	if (tp->syn_data_acked)
		info->tcpi_options |= TCPI_OPT_SYN_DATA;

	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
	info->tcpi_snd_mss = tp->mss_cache;
	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;

	if (sk->sk_state == TCP_LISTEN) {
		info->tcpi_unacked = sk->sk_ack_backlog;
		info->tcpi_sacked = sk->sk_max_ack_backlog;
	} else {
		info->tcpi_unacked = tp->packets_out;
		info->tcpi_sacked = tp->sacked_out;
	}
	info->tcpi_lost = tp->lost_out;
	info->tcpi_retrans = tp->retrans_out;
	info->tcpi_fackets = tp->fackets_out;

	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);

	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
	info->tcpi_rtt = tp->srtt_us >> 3;
	info->tcpi_rttvar = tp->mdev_us >> 2;
	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
	info->tcpi_snd_cwnd = tp->snd_cwnd;
	info->tcpi_advmss = tp->advmss;
	info->tcpi_reordering = tp->reordering;

	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
	info->tcpi_rcv_space = tp->rcvq_space.space;

	info->tcpi_total_retrans = tp->total_retrans;

	info->tcpi_pacing_rate = sk->sk_pacing_rate != ~0U ?
					sk->sk_pacing_rate : ~0ULL;
	info->tcpi_max_pacing_rate = sk->sk_max_pacing_rate != ~0U ?
					sk->sk_max_pacing_rate : ~0ULL;
}
EXPORT_SYMBOL_GPL(tcp_get_info);
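
/* Usage sketch (userspace, illustrative only -- not part of this file):
 * the structure filled in by tcp_get_info() above is exposed through the
 * TCP_INFO socket option:
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
 *		printf("rtt: %u us, cwnd: %u\n", ti.tcpi_rtt, ti.tcpi_snd_cwnd);
 */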
2806
static int do_tcp_getsockopt(struct sock *sk, int level,
			     int optname, char __user *optval, int __user *optlen)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int val, len;

	if (get_user(len, optlen))
		return -EFAULT;

	len = min_t(unsigned int, len, sizeof(int));

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case TCP_MAXSEG:
		val = tp->mss_cache;
		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
			val = tp->rx_opt.user_mss;
		if (tp->repair)
			val = tp->rx_opt.mss_clamp;
		break;
	case TCP_NODELAY:
		val = !!(tp->nonagle&TCP_NAGLE_OFF);
		break;
	case TCP_CORK:
		val = !!(tp->nonagle&TCP_NAGLE_CORK);
		break;
	case TCP_KEEPIDLE:
		val = keepalive_time_when(tp) / HZ;
		break;
	case TCP_KEEPINTVL:
		val = keepalive_intvl_when(tp) / HZ;
		break;
	case TCP_KEEPCNT:
		val = keepalive_probes(tp);
		break;
	case TCP_SYNCNT:
		val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
		break;
	case TCP_LINGER2:
		val = tp->linger2;
		if (val >= 0)
			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
		break;
	case TCP_DEFER_ACCEPT:
		val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
				      TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
		break;
	case TCP_WINDOW_CLAMP:
		val = tp->window_clamp;
		break;
	case TCP_INFO: {
		struct tcp_info info;

		if (get_user(len, optlen))
			return -EFAULT;

		tcp_get_info(sk, &info);

		len = min_t(unsigned int, len, sizeof(info));
		if (put_user(len, optlen))
			return -EFAULT;
		if (copy_to_user(optval, &info, len))
			return -EFAULT;
		return 0;
	}
	case TCP_QUICKACK:
		val = !icsk->icsk_ack.pingpong;
		break;

	case TCP_CONGESTION:
		if (get_user(len, optlen))
			return -EFAULT;
		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
		if (put_user(len, optlen))
			return -EFAULT;
		if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
			return -EFAULT;
		return 0;

	case TCP_THIN_LINEAR_TIMEOUTS:
		val = tp->thin_lto;
		break;
	case TCP_THIN_DUPACK:
		val = tp->thin_dupack;
		break;

	case TCP_REPAIR:
		val = tp->repair;
		break;

	case TCP_REPAIR_QUEUE:
		if (tp->repair)
			val = tp->repair_queue;
		else
			return -EINVAL;
		break;

	case TCP_QUEUE_SEQ:
		if (tp->repair_queue == TCP_SEND_QUEUE)
			val = tp->write_seq;
		else if (tp->repair_queue == TCP_RECV_QUEUE)
			val = tp->rcv_nxt;
		else
			return -EINVAL;
		break;

	case TCP_USER_TIMEOUT:
		val = jiffies_to_msecs(icsk->icsk_user_timeout);
		break;
	case TCP_TIMESTAMP:
		val = tcp_time_stamp + tp->tsoffset;
		break;
	case TCP_NOTSENT_LOWAT:
		val = tp->notsent_lowat;
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, &val, len))
		return -EFAULT;
	return 0;
}

int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
		   int __user *optlen)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (level != SOL_TCP)
		return icsk->icsk_af_ops->getsockopt(sk, level, optname,
						     optval, optlen);
	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(tcp_getsockopt);

#ifdef CONFIG_COMPAT
int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	if (level != SOL_TCP)
		return inet_csk_compat_getsockopt(sk, level, optname,
						  optval, optlen);
	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_tcp_getsockopt);
#endif

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool __read_mostly;
static DEFINE_MUTEX(tcp_md5sig_mutex);

static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu);

		if (p->md5_desc.tfm)
			crypto_free_hash(p->md5_desc.tfm);
	}
	free_percpu(pool);
}

static void __tcp_alloc_md5sig_pool(void)
{
	int cpu;
	struct tcp_md5sig_pool __percpu *pool;

	pool = alloc_percpu(struct tcp_md5sig_pool);
	if (!pool)
		return;

	for_each_possible_cpu(cpu) {
		struct crypto_hash *hash;

		hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
		if (IS_ERR_OR_NULL(hash))
			goto out_free;

		per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash;
	}

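	/* Before publishing the pool, commit all per-cpu writes to memory;
	 * this pairs with the ACCESS_ONCE() read in tcp_get_md5sig_pool().
	 */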
	smp_wmb();
	tcp_md5sig_pool = pool;
	return;
out_free:
	__tcp_free_md5sig_pool(pool);
}

bool tcp_alloc_md5sig_pool(void)
{
	if (unlikely(!tcp_md5sig_pool)) {
		mutex_lock(&tcp_md5sig_mutex);

		if (!tcp_md5sig_pool)
			__tcp_alloc_md5sig_pool();

		mutex_unlock(&tcp_md5sig_mutex);
	}
	return tcp_md5sig_pool != NULL;
}
EXPORT_SYMBOL(tcp_alloc_md5sig_pool);

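/**
 *	tcp_get_md5sig_pool - get md5sig_pool for this user
 *
 *	The pool is per-cpu, so on success we return with preemption and BH
 *	disabled; the caller must not sleep or hold locks, and should release
 *	the pool promptly with tcp_put_md5sig_pool(), which re-enables BH.
 */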
struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
{
	struct tcp_md5sig_pool __percpu *p;

	local_bh_disable();
	p = ACCESS_ONCE(tcp_md5sig_pool);
	if (p)
		return __this_cpu_ptr(p);

	local_bh_enable();
	return NULL;
}
EXPORT_SYMBOL(tcp_get_md5sig_pool);

int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
			const struct tcphdr *th)
{
	struct scatterlist sg;
	struct tcphdr hdr;
	int err;

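	/* We are not allowed to change tcphdr, make a local copy */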
	memcpy(&hdr, th, sizeof(hdr));
	hdr.check = 0;

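	/* options aren't included in the hash */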
	sg_init_one(&sg, &hdr, sizeof(hdr));
	err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr));
	return err;
}
EXPORT_SYMBOL(tcp_md5_hash_header);

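/* Feed skb payload past header_len into the MD5 hash: the linear head,
 * each page fragment, then any frag-list skbs (recursively). Returns
 * nonzero on error.
 */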
int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
			  const struct sk_buff *skb, unsigned int header_len)
{
	struct scatterlist sg;
	const struct tcphdr *tp = tcp_hdr(skb);
	struct hash_desc *desc = &hp->md5_desc;
	unsigned int i;
	const unsigned int head_data_len = skb_headlen(skb) > header_len ?
					   skb_headlen(skb) - header_len : 0;
	const struct skb_shared_info *shi = skb_shinfo(skb);
	struct sk_buff *frag_iter;

	sg_init_table(&sg, 1);

	sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
	if (crypto_hash_update(desc, &sg, head_data_len))
		return 1;

	for (i = 0; i < shi->nr_frags; ++i) {
		const struct skb_frag_struct *f = &shi->frags[i];
		unsigned int offset = f->page_offset;
		struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);

		sg_set_page(&sg, page, skb_frag_size(f),
			    offset_in_page(offset));
		if (crypto_hash_update(desc, &sg, skb_frag_size(f)))
			return 1;
	}

	skb_walk_frags(skb, frag_iter)
		if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
			return 1;

	return 0;
}
EXPORT_SYMBOL(tcp_md5_hash_skb_data);

int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
{
	struct scatterlist sg;

	sg_init_one(&sg, key->key, key->keylen);
	return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
}
EXPORT_SYMBOL(tcp_md5_hash_key);

#endif

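/* Finish off a connection: move it to TCP_CLOSE, stop its timers, and
 * either wake the owning process or, if the socket is already orphaned,
 * destroy it outright.
 */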
void tcp_done(struct sock *sk)
{
	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;

	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);

	tcp_set_state(sk, TCP_CLOSE);
	tcp_clear_xmit_timers(sk);
	if (req != NULL)
		reqsk_fastopen_remove(sk, req, false);

	sk->sk_shutdown = SHUTDOWN_MASK;

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_state_change(sk);
	else
		inet_csk_destroy_sock(sk);
}
EXPORT_SYMBOL_GPL(tcp_done);

extern struct tcp_congestion_ops tcp_reno;

static __initdata unsigned long thash_entries;
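/* Parse the "thash_entries=" boot parameter, which overrides the size of
 * the TCP established hash table (registered via __setup() below).
 */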
static int __init set_thash_entries(char *str)
{
	ssize_t ret;

	if (!str)
		return 0;

	ret = kstrtoul(str, 0, &thash_entries);
	if (ret)
		return 0;

	return 1;
}
__setup("thash_entries=", set_thash_entries);

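/* Default tcp_mem[]: pressure at 1/8 of free buffer pages (at least 128
 * pages), with the low mark at 3/4 and the high mark at 3/2 of that.
 */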
static void tcp_init_mem(void)
{
	unsigned long limit = nr_free_buffer_pages() / 8;

	limit = max(limit, 128UL);
	sysctl_tcp_mem[0] = limit / 4 * 3;
	sysctl_tcp_mem[1] = limit;
	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
}

void __init tcp_init(void)
{
	struct sk_buff *skb = NULL;
	unsigned long limit;
	int max_rshare, max_wshare, cnt;
	unsigned int i;

	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));

	percpu_counter_init(&tcp_sockets_allocated, 0);
	percpu_counter_init(&tcp_orphan_count, 0);
	tcp_hashinfo.bind_bucket_cachep =
		kmem_cache_create("tcp_bind_bucket",
				  sizeof(struct inet_bind_bucket), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

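	/* Size and allocate the main established and bind bucket
	 * hash tables.
	 *
	 * The methodology is similar to that of the buffer cache.
	 */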
	tcp_hashinfo.ehash =
		alloc_large_system_hash("TCP established",
					sizeof(struct inet_ehash_bucket),
					thash_entries,
					17, /* one slot per 128 KB of memory */
					0,
					NULL,
					&tcp_hashinfo.ehash_mask,
					0,
					thash_entries ? 0 : 512 * 1024);
	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);

	if (inet_ehash_locks_alloc(&tcp_hashinfo))
		panic("TCP: failed to alloc ehash_locks");
	tcp_hashinfo.bhash =
		alloc_large_system_hash("TCP bind",
					sizeof(struct inet_bind_hashbucket),
					tcp_hashinfo.ehash_mask + 1,
					17, /* one slot per 128 KB of memory */
					0,
					&tcp_hashinfo.bhash_size,
					NULL,
					0,
					64 * 1024);
	tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
	}

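	/* Derive the time-wait, orphan and SYN-backlog limits from the
	 * established-hash size computed above.
	 */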
	cnt = tcp_hashinfo.ehash_mask + 1;

	tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	sysctl_tcp_max_orphans = cnt / 2;
	sysctl_max_syn_backlog = max(128, cnt / 256);

	tcp_init_mem();

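	/* Set per-socket limits to no more than 1/128 the pressure threshold */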
	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
	max_wshare = min(4UL*1024*1024, limit);
	max_rshare = min(6UL*1024*1024, limit);

	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
	sysctl_tcp_wmem[1] = 16*1024;
	sysctl_tcp_wmem[2] = max(64*1024, max_wshare);

	sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
	sysctl_tcp_rmem[1] = 87380;
	sysctl_tcp_rmem[2] = max(87380, max_rshare);

	pr_info("Hash tables configured (established %u bind %u)\n",
		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);

	tcp_metrics_init();

	tcp_register_congestion_control(&tcp_reno);

	tcp_tasklet_init();
}