#define pr_fmt(fmt) "TCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/scatterlist.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/random.h>
#include <linux/bootmem.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/crypto.h>
#include <linux/time.h>
#include <linux/slab.h>

#include <net/icmp.h>
#include <net/inet_common.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/netdma.h>
#include <net/sock.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <net/busy_poll.h>

int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;

int sysctl_tcp_min_tso_segs __read_mostly = 2;

struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);

long sysctl_tcp_mem[3] __read_mostly;
int sysctl_tcp_wmem[3] __read_mostly;
int sysctl_tcp_rmem[3] __read_mostly;

EXPORT_SYMBOL(sysctl_tcp_mem);
EXPORT_SYMBOL(sysctl_tcp_rmem);
EXPORT_SYMBOL(sysctl_tcp_wmem);

/* Memory (in pages) currently allocated to TCP sockets, system wide. */
atomic_long_t tcp_memory_allocated;
EXPORT_SYMBOL(tcp_memory_allocated);

/* Current number of TCP sockets. */
struct percpu_counter tcp_sockets_allocated;
EXPORT_SYMBOL(tcp_sockets_allocated);

/* Per-call state for tcp_splice_read(), handed to the skb read actor. */
struct tcp_splice_state {
	struct pipe_inode_info *pipe;
	size_t len;
	unsigned int flags;
};

/* Memory pressure flag: set under allocation pressure and consulted by the
 * socket memory accounting helpers.  It is read and written without locking,
 * so it is only advisory; the accounting itself stays strict.
 */
int tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL(tcp_memory_pressure);

void tcp_enter_memory_pressure(struct sock *sk)
{
	if (!tcp_memory_pressure) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
		tcp_memory_pressure = 1;
	}
}
EXPORT_SYMBOL(tcp_enter_memory_pressure);

/* Convert seconds to retransmits based on initial and max timeout */
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
{
	u8 res = 0;

	if (seconds > 0) {
		int period = timeout;

		res = 1;
		while (seconds > period && res < 255) {
			res++;
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return res;
}

/* Convert retransmits to seconds based on initial and max timeout */
static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
{
	int period = 0;

	if (retrans > 0) {
		period = timeout;
		while (--retrans) {
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return period;
}
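
/*
 * Worked example (illustrative only, not referenced by the code): with the
 * values used for TCP_DEFER_ACCEPT below, timeout = TCP_TIMEOUT_INIT/HZ = 1s
 * and rto_max = TCP_RTO_MAX/HZ = 120s.  secs_to_retrans(10, 1, 120) walks the
 * exponential backoff 1, 2, 4, 8, so the cumulative period first exceeds 10s
 * after four timeouts and the function returns 4; retrans_to_secs(4, 1, 120)
 * sums 1 + 2 + 4 + 8 = 15s, the smallest backoff window covering the request.
 */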

/* Address-family independent initialization for a tcp_sock.
 *
 * NOTE: much of the state is already zeroed by sk_alloc(), so only the
 * non-zero defaults need to be set here.
 */
void tcp_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);
	INIT_LIST_HEAD(&tp->tsq_node);

	/* RFC 6298 initial RTO of 1 second (TCP_TIMEOUT_INIT). */
	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* Initial congestion window (TCP_INIT_CWND, currently 10 segments). */
	tp->snd_cwnd = TCP_INIT_CWND;

	/* Leave ssthresh effectively unlimited until loss is observed. */
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;

	tp->reordering = sysctl_tcp_reordering;
	tcp_enable_early_retrans(tp);
	icsk->icsk_ca_ops = &tcp_init_congestion_ops;

	tp->tsoffset = 0;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_sync_mss = tcp_sync_mss;

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	local_bh_disable();
	sock_update_memcg(sk);
	sk_sockets_allocated_inc(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(tcp_init_sock);

/*
 * Wait for a TCP event.
 *
 * Note that we don't need to lock the socket: the upper poll layers take
 * care of normal races (between the test and the event) and we do not go
 * looking at any of the socket buffers directly.
 */
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	unsigned int mask;
	struct sock *sk = sock->sk;
	const struct tcp_sock *tp = tcp_sk(sk);

	sock_rps_record_flow(sk);

	sock_poll_wait(file, sk_sleep(sk), wait);
	if (sk->sk_state == TCP_LISTEN)
		return inet_csk_listen_poll(sk);

	mask = 0;

	/* POLLHUP is reported only when the connection is closed in both
	 * directions; a half-closed connection shows up as
	 * POLLIN | POLLRDHUP on the read side instead.
	 */
	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
		mask |= POLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLIN | POLLRDNORM | POLLRDHUP;

	/* Connected or passive Fast Open socket? */
	if (sk->sk_state != TCP_SYN_SENT &&
	    (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
		int target = sock_rcvlowat(sk, 0, INT_MAX);

		if (tp->urg_seq == tp->copied_seq &&
		    !sock_flag(sk, SOCK_URGINLINE) &&
		    tp->urg_data)
			target++;

		if (tp->rcv_nxt - tp->copied_seq >= target)
			mask |= POLLIN | POLLRDNORM;

		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
			if (sk_stream_is_writeable(sk)) {
				mask |= POLLOUT | POLLWRNORM;
			} else {	/* send SIGIO later */
				set_bit(SOCK_ASYNC_NOSPACE,
					&sk->sk_socket->flags);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker: if space is freed after the
				 * writeable test but before the flags are
				 * set, the IO signal would be lost.
				 */
				if (sk_stream_is_writeable(sk))
					mask |= POLLOUT | POLLWRNORM;
			}
		} else
			mask |= POLLOUT | POLLWRNORM;

		if (tp->urg_data & TCP_URG_VALID)
			mask |= POLLPRI;
	}

	/* This barrier is coupled with smp_wmb() in tcp_reset() */
	smp_rmb();
	if (sk->sk_err)
		mask |= POLLERR;

	return mask;
}
EXPORT_SYMBOL(tcp_poll);
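
/*
 * Illustrative user-space sketch (not part of the kernel build) of how the
 * mask computed above is consumed: poll() reports POLLIN once at least
 * SO_RCVLOWAT bytes (1 by default) are queued, POLLOUT while the write queue
 * has room, and POLLRDHUP once the peer has sent its FIN.
 *
 *	#define _GNU_SOURCE		// for POLLRDHUP
 *	#include <poll.h>
 *
 *	struct pollfd pfd = {
 *		.fd = tcp_fd,		// assumed connected TCP socket
 *		.events = POLLIN | POLLOUT | POLLRDHUP,
 *	};
 *	int n = poll(&pfd, 1, 1000);	// wait up to one second
 *	if (n > 0 && (pfd.revents & POLLIN))
 *		;			// at least rcvlowat bytes readable
 *	if (n > 0 && (pfd.revents & POLLRDHUP))
 *		;			// peer closed its sending side
 */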

int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int answ;
	bool slow;

	switch (cmd) {
	case SIOCINQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		slow = lock_sock_fast(sk);
		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else if (sock_flag(sk, SOCK_URGINLINE) ||
			 !tp->urg_data ||
			 before(tp->urg_seq, tp->copied_seq) ||
			 !before(tp->urg_seq, tp->rcv_nxt)) {

			answ = tp->rcv_nxt - tp->copied_seq;

			if (answ && sock_flag(sk, SOCK_DONE))
				answ--;
		} else
			answ = tp->urg_seq - tp->copied_seq;
		unlock_sock_fast(sk, slow);
		break;
	case SIOCATMARK:
		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
		break;
	case SIOCOUTQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = tp->write_seq - tp->snd_una;
		break;
	case SIOCOUTQNSD:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = tp->write_seq - tp->snd_nxt;
		break;
	default:
		return -ENOIOCTLCMD;
	}

	return put_user(answ, (int __user *)arg);
}
EXPORT_SYMBOL(tcp_ioctl);
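
/*
 * Illustrative user-space sketch (not part of the kernel build) showing the
 * ioctls handled above: SIOCINQ returns the bytes queued for reading (urgent
 * data excluded), SIOCOUTQ the bytes not yet acknowledged, and SIOCOUTQNSD
 * the bytes not yet sent at all.
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/sockios.h>
 *
 *	int inq, outq, notsent;
 *	ioctl(tcp_fd, SIOCINQ, &inq);		// same value as FIONREAD
 *	ioctl(tcp_fd, SIOCOUTQ, &outq);		// write_seq - snd_una
 *	ioctl(tcp_fd, SIOCOUTQNSD, &notsent);	// write_seq - snd_nxt
 *
 * "tcp_fd" is an assumed connected TCP socket; on a listening socket each of
 * these fails with -EINVAL, as the code above shows.
 */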
587
588static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
589{
590 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
591 tp->pushed_seq = tp->write_seq;
592}
593
594static inline bool forced_push(const struct tcp_sock *tp)
595{
596 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
597}
598
599static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
600{
601 struct tcp_sock *tp = tcp_sk(sk);
602 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
603
604 skb->csum = 0;
605 tcb->seq = tcb->end_seq = tp->write_seq;
606 tcb->tcp_flags = TCPHDR_ACK;
607 tcb->sacked = 0;
608 skb_header_release(skb);
609 tcp_add_write_queue_tail(sk, skb);
610 sk->sk_wmem_queued += skb->truesize;
611 sk_mem_charge(sk, skb->truesize);
612 if (tp->nonagle & TCP_NAGLE_PUSH)
613 tp->nonagle &= ~TCP_NAGLE_PUSH;
614}
615
616static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
617{
618 if (flags & MSG_OOB)
619 tp->snd_up = tp->write_seq;
620}
621
622static inline void tcp_push(struct sock *sk, int flags, int mss_now,
623 int nonagle)
624{
625 if (tcp_send_head(sk)) {
626 struct tcp_sock *tp = tcp_sk(sk);
627
628 if (!(flags & MSG_MORE) || forced_push(tp))
629 tcp_mark_push(tp, tcp_write_queue_tail(sk));
630
631 tcp_mark_urg(tp, flags);
632 __tcp_push_pending_frames(sk, mss_now,
633 (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
634 }
635}
636
637static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
638 unsigned int offset, size_t len)
639{
640 struct tcp_splice_state *tss = rd_desc->arg.data;
641 int ret;
642
643 ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
644 tss->flags);
645 if (ret > 0)
646 rd_desc->count -= ret;
647 return ret;
648}
649
650static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
651{
652
653 read_descriptor_t rd_desc = {
654 .arg.data = tss,
655 .count = tss->len,
656 };
657
658 return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
659}

/*
 * tcp_splice_read - splice data from a TCP socket into a pipe without an
 * intermediate copy through user space; moves up to @len bytes, blocking
 * according to the socket's receive timeout unless the socket is
 * non-blocking.
 */
673ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
674 struct pipe_inode_info *pipe, size_t len,
675 unsigned int flags)
676{
677 struct sock *sk = sock->sk;
678 struct tcp_splice_state tss = {
679 .pipe = pipe,
680 .len = len,
681 .flags = flags,
682 };
683 long timeo;
684 ssize_t spliced;
685 int ret;
686
687 sock_rps_record_flow(sk);
688
689
690
691 if (unlikely(*ppos))
692 return -ESPIPE;
693
694 ret = spliced = 0;
695
696 lock_sock(sk);
697
698 timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
699 while (tss.len) {
700 ret = __tcp_splice_read(sk, &tss);
701 if (ret < 0)
702 break;
703 else if (!ret) {
704 if (spliced)
705 break;
706 if (sock_flag(sk, SOCK_DONE))
707 break;
708 if (sk->sk_err) {
709 ret = sock_error(sk);
710 break;
711 }
712 if (sk->sk_shutdown & RCV_SHUTDOWN)
713 break;
714 if (sk->sk_state == TCP_CLOSE) {
715
716
717
718
719 if (!sock_flag(sk, SOCK_DONE))
720 ret = -ENOTCONN;
721 break;
722 }
723 if (!timeo) {
724 ret = -EAGAIN;
725 break;
726 }
727 sk_wait_data(sk, &timeo);
728 if (signal_pending(current)) {
729 ret = sock_intr_errno(timeo);
730 break;
731 }
732 continue;
733 }
734 tss.len -= ret;
735 spliced += ret;
736
737 if (!timeo)
738 break;
739 release_sock(sk);
740 lock_sock(sk);
741
742 if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
743 (sk->sk_shutdown & RCV_SHUTDOWN) ||
744 signal_pending(current))
745 break;
746 }
747
748 release_sock(sk);
749
750 if (spliced)
751 return spliced;
752
753 return ret;
754}
755EXPORT_SYMBOL(tcp_splice_read);
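
/*
 * Illustrative user-space sketch (not part of the kernel build): splice()
 * with a TCP socket as the source ends up in tcp_splice_read(), handing page
 * references to the pipe instead of copying payload through user space.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int pfd[2];
 *	pipe(pfd);
 *	// move up to 64 KB from the socket into the pipe, then on to a file
 *	ssize_t in  = splice(tcp_fd, NULL, pfd[1], NULL, 65536, SPLICE_F_MOVE);
 *	ssize_t out = splice(pfd[0], NULL, file_fd, NULL, in, SPLICE_F_MOVE);
 *
 * "tcp_fd" and "file_fd" are assumed open descriptors; the socket side must
 * be given a NULL offset, which is what the -ESPIPE check above enforces.
 */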
756
757struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
758{
759 struct sk_buff *skb;
760
761
762 size = ALIGN(size, 4);
763
764 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
765 if (skb) {
766 if (sk_wmem_schedule(sk, skb->truesize)) {
767 skb_reserve(skb, sk->sk_prot->max_header);
768
769
770
771
772 skb->reserved_tailroom = skb->end - skb->tail - size;
773 return skb;
774 }
775 __kfree_skb(skb);
776 } else {
777 sk->sk_prot->enter_memory_pressure(sk);
778 sk_stream_moderate_sndbuf(sk);
779 }
780 return NULL;
781}
782
783static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
784 int large_allowed)
785{
786 struct tcp_sock *tp = tcp_sk(sk);
787 u32 xmit_size_goal, old_size_goal;
788
789 xmit_size_goal = mss_now;
790
791 if (large_allowed && sk_can_gso(sk)) {
792 u32 gso_size, hlen;
793
794
795 hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
796 inet_csk(sk)->icsk_ext_hdr_len +
797 tp->tcp_header_len;
798
799
800
801
802
803
804 gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
805 gso_size = max_t(u32, gso_size,
806 sysctl_tcp_min_tso_segs * mss_now);
807
808 xmit_size_goal = min_t(u32, gso_size,
809 sk->sk_gso_max_size - 1 - hlen);
810
811 xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
812
813
814 old_size_goal = tp->xmit_size_goal_segs * mss_now;
815
816 if (likely(old_size_goal <= xmit_size_goal &&
817 old_size_goal + mss_now > xmit_size_goal)) {
818 xmit_size_goal = old_size_goal;
819 } else {
820 tp->xmit_size_goal_segs =
821 min_t(u16, xmit_size_goal / mss_now,
822 sk->sk_gso_max_segs);
823 xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
824 }
825 }
826
827 return max(xmit_size_goal, mss_now);
828}
829
830static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
831{
832 int mss_now;
833
834 mss_now = tcp_current_mss(sk);
835 *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
836
837 return mss_now;
838}
839
840static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
841 size_t size, int flags)
842{
843 struct tcp_sock *tp = tcp_sk(sk);
844 int mss_now, size_goal;
845 int err;
846 ssize_t copied;
847 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
848
849
850
851
852
853 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
854 !tcp_passive_fastopen(sk)) {
855 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
856 goto out_err;
857 }
858
859 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
860
861 mss_now = tcp_send_mss(sk, &size_goal, flags);
862 copied = 0;
863
864 err = -EPIPE;
865 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
866 goto out_err;
867
868 while (size > 0) {
869 struct sk_buff *skb = tcp_write_queue_tail(sk);
870 int copy, i;
871 bool can_coalesce;
872
873 if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
874new_segment:
875 if (!sk_stream_memory_free(sk))
876 goto wait_for_sndbuf;
877
878 skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
879 if (!skb)
880 goto wait_for_memory;
881
882 skb_entail(sk, skb);
883 copy = size_goal;
884 }
885
886 if (copy > size)
887 copy = size;
888
889 i = skb_shinfo(skb)->nr_frags;
890 can_coalesce = skb_can_coalesce(skb, i, page, offset);
891 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
892 tcp_mark_push(tp, skb);
893 goto new_segment;
894 }
895 if (!sk_wmem_schedule(sk, copy))
896 goto wait_for_memory;
897
898 if (can_coalesce) {
899 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
900 } else {
901 get_page(page);
902 skb_fill_page_desc(skb, i, page, offset, copy);
903 }
904 skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
905
906 skb->len += copy;
907 skb->data_len += copy;
908 skb->truesize += copy;
909 sk->sk_wmem_queued += copy;
910 sk_mem_charge(sk, copy);
911 skb->ip_summed = CHECKSUM_PARTIAL;
912 tp->write_seq += copy;
913 TCP_SKB_CB(skb)->end_seq += copy;
914 skb_shinfo(skb)->gso_segs = 0;
915
916 if (!copied)
917 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
918
919 copied += copy;
920 offset += copy;
921 if (!(size -= copy))
922 goto out;
923
924 if (skb->len < size_goal || (flags & MSG_OOB))
925 continue;
926
927 if (forced_push(tp)) {
928 tcp_mark_push(tp, skb);
929 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
930 } else if (skb == tcp_send_head(sk))
931 tcp_push_one(sk, mss_now);
932 continue;
933
934wait_for_sndbuf:
935 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
936wait_for_memory:
937 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
938
939 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
940 goto do_error;
941
942 mss_now = tcp_send_mss(sk, &size_goal, flags);
943 }
944
945out:
946 if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
947 tcp_push(sk, flags, mss_now, tp->nonagle);
948 return copied;
949
950do_error:
951 if (copied)
952 goto out;
953out_err:
954 return sk_stream_error(sk, flags, err);
955}
956
957int tcp_sendpage(struct sock *sk, struct page *page, int offset,
958 size_t size, int flags)
959{
960 ssize_t res;
961
962 if (!(sk->sk_route_caps & NETIF_F_SG) ||
963 !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
964 return sock_no_sendpage(sk->sk_socket, page, offset, size,
965 flags);
966
967 lock_sock(sk);
968 res = do_tcp_sendpages(sk, page, offset, size, flags);
969 release_sock(sk);
970 return res;
971}
972EXPORT_SYMBOL(tcp_sendpage);
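
/*
 * Illustrative user-space sketch (not part of the kernel build): sendfile()
 * from a file to a TCP socket is the usual way to reach tcp_sendpage(),
 * which attaches page fragments to skbs rather than copying; when the route
 * lacks scatter-gather or checksum offload it falls back to
 * sock_no_sendpage() as shown above.
 *
 *	#include <sys/sendfile.h>
 *
 *	off_t off = 0;
 *	ssize_t sent = sendfile(tcp_fd, file_fd, &off, file_size);
 *
 * "tcp_fd", "file_fd" and "file_size" are assumed; coalescing across calls
 * can be requested with the TCP_CORK option handled later in this file.
 */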
973
974static inline int select_size(const struct sock *sk, bool sg)
975{
976 const struct tcp_sock *tp = tcp_sk(sk);
977 int tmp = tp->mss_cache;
978
979 if (sg) {
980 if (sk_can_gso(sk)) {
981
982
983
984 tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
985 } else {
986 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
987
988 if (tmp >= pgbreak &&
989 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
990 tmp = pgbreak;
991 }
992 }
993
994 return tmp;
995}
996
997void tcp_free_fastopen_req(struct tcp_sock *tp)
998{
999 if (tp->fastopen_req != NULL) {
1000 kfree(tp->fastopen_req);
1001 tp->fastopen_req = NULL;
1002 }
1003}
1004
1005static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size)
1006{
1007 struct tcp_sock *tp = tcp_sk(sk);
1008 int err, flags;
1009
1010 if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
1011 return -EOPNOTSUPP;
1012 if (tp->fastopen_req != NULL)
1013 return -EALREADY;
1014
1015 tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
1016 sk->sk_allocation);
1017 if (unlikely(tp->fastopen_req == NULL))
1018 return -ENOBUFS;
1019 tp->fastopen_req->data = msg;
1020
1021 flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
1022 err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
1023 msg->msg_namelen, flags);
1024 *size = tp->fastopen_req->copied;
1025 tcp_free_fastopen_req(tp);
1026 return err;
1027}
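
/*
 * Illustrative user-space sketch (not part of the kernel build) of the
 * client side of TCP Fast Open that reaches tcp_sendmsg_fastopen(): the data
 * is handed to sendto() together with the connect, and may ride on the SYN
 * when a Fast Open cookie is cached and the client bit (TFO_CLIENT_ENABLE)
 * is set in sysctl net.ipv4.tcp_fastopen.
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	ssize_t n = sendto(fd, buf, len, MSG_FASTOPEN,
 *			   (struct sockaddr *)&daddr, sizeof(daddr));
 *
 * "buf", "len" and "daddr" are assumed; without a cached cookie the kernel
 * falls back to an ordinary three-way handshake and queues the data.
 */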
1028
1029int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1030 size_t size)
1031{
1032 struct iovec *iov;
1033 struct tcp_sock *tp = tcp_sk(sk);
1034 struct sk_buff *skb;
1035 int iovlen, flags, err, copied = 0;
1036 int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
1037 bool sg;
1038 long timeo;
1039
1040 lock_sock(sk);
1041
1042 flags = msg->msg_flags;
1043 if (flags & MSG_FASTOPEN) {
1044 err = tcp_sendmsg_fastopen(sk, msg, &copied_syn);
1045 if (err == -EINPROGRESS && copied_syn > 0)
1046 goto out;
1047 else if (err)
1048 goto out_err;
1049 offset = copied_syn;
1050 }
1051
1052 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1053
1054
1055
1056
1057
1058 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1059 !tcp_passive_fastopen(sk)) {
1060 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
1061 goto do_error;
1062 }
1063
1064 if (unlikely(tp->repair)) {
1065 if (tp->repair_queue == TCP_RECV_QUEUE) {
1066 copied = tcp_send_rcvq(sk, msg, size);
1067 goto out;
1068 }
1069
1070 err = -EINVAL;
1071 if (tp->repair_queue == TCP_NO_QUEUE)
1072 goto out_err;
1073
1074
1075 }
1076
1077
1078 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1079
1080 mss_now = tcp_send_mss(sk, &size_goal, flags);
1081
1082
1083 iovlen = msg->msg_iovlen;
1084 iov = msg->msg_iov;
1085 copied = 0;
1086
1087 err = -EPIPE;
1088 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1089 goto out_err;
1090
1091 sg = !!(sk->sk_route_caps & NETIF_F_SG);
1092
1093 while (--iovlen >= 0) {
1094 size_t seglen = iov->iov_len;
1095 unsigned char __user *from = iov->iov_base;
1096
1097 iov++;
1098 if (unlikely(offset > 0)) {
1099 if (offset >= seglen) {
1100 offset -= seglen;
1101 continue;
1102 }
1103 seglen -= offset;
1104 from += offset;
1105 offset = 0;
1106 }
1107
1108 while (seglen > 0) {
1109 int copy = 0;
1110 int max = size_goal;
1111
1112 skb = tcp_write_queue_tail(sk);
1113 if (tcp_send_head(sk)) {
1114 if (skb->ip_summed == CHECKSUM_NONE)
1115 max = mss_now;
1116 copy = max - skb->len;
1117 }
1118
1119 if (copy <= 0) {
1120new_segment:
1121
1122
1123
1124 if (!sk_stream_memory_free(sk))
1125 goto wait_for_sndbuf;
1126
1127 skb = sk_stream_alloc_skb(sk,
1128 select_size(sk, sg),
1129 sk->sk_allocation);
1130 if (!skb)
1131 goto wait_for_memory;
1132
1133
1134
1135
1136
1137 if (tp->repair)
1138 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1139
1140
1141
1142
1143 if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
1144 skb->ip_summed = CHECKSUM_PARTIAL;
1145
1146 skb_entail(sk, skb);
1147 copy = size_goal;
1148 max = size_goal;
1149 }
1150
1151
1152 if (copy > seglen)
1153 copy = seglen;
1154
1155
1156 if (skb_availroom(skb) > 0) {
1157
1158 copy = min_t(int, copy, skb_availroom(skb));
1159 err = skb_add_data_nocache(sk, skb, from, copy);
1160 if (err)
1161 goto do_fault;
1162 } else {
1163 bool merge = true;
1164 int i = skb_shinfo(skb)->nr_frags;
1165 struct page_frag *pfrag = sk_page_frag(sk);
1166
1167 if (!sk_page_frag_refill(sk, pfrag))
1168 goto wait_for_memory;
1169
1170 if (!skb_can_coalesce(skb, i, pfrag->page,
1171 pfrag->offset)) {
1172 if (i == MAX_SKB_FRAGS || !sg) {
1173 tcp_mark_push(tp, skb);
1174 goto new_segment;
1175 }
1176 merge = false;
1177 }
1178
1179 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1180
1181 if (!sk_wmem_schedule(sk, copy))
1182 goto wait_for_memory;
1183
1184 err = skb_copy_to_page_nocache(sk, from, skb,
1185 pfrag->page,
1186 pfrag->offset,
1187 copy);
1188 if (err)
1189 goto do_error;
1190
1191
1192 if (merge) {
1193 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1194 } else {
1195 skb_fill_page_desc(skb, i, pfrag->page,
1196 pfrag->offset, copy);
1197 get_page(pfrag->page);
1198 }
1199 pfrag->offset += copy;
1200 }
1201
1202 if (!copied)
1203 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1204
1205 tp->write_seq += copy;
1206 TCP_SKB_CB(skb)->end_seq += copy;
1207 skb_shinfo(skb)->gso_segs = 0;
1208
1209 from += copy;
1210 copied += copy;
1211 if ((seglen -= copy) == 0 && iovlen == 0)
1212 goto out;
1213
1214 if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
1215 continue;
1216
1217 if (forced_push(tp)) {
1218 tcp_mark_push(tp, skb);
1219 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1220 } else if (skb == tcp_send_head(sk))
1221 tcp_push_one(sk, mss_now);
1222 continue;
1223
1224wait_for_sndbuf:
1225 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1226wait_for_memory:
1227 if (copied)
1228 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1229
1230 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
1231 goto do_error;
1232
1233 mss_now = tcp_send_mss(sk, &size_goal, flags);
1234 }
1235 }
1236
1237out:
1238 if (copied)
1239 tcp_push(sk, flags, mss_now, tp->nonagle);
1240 release_sock(sk);
1241 return copied + copied_syn;
1242
1243do_fault:
1244 if (!skb->len) {
1245 tcp_unlink_write_queue(skb, sk);
1246
1247
1248
1249 tcp_check_send_head(sk, skb);
1250 sk_wmem_free_skb(sk, skb);
1251 }
1252
1253do_error:
1254 if (copied + copied_syn)
1255 goto out;
1256out_err:
1257 err = sk_stream_error(sk, flags, err);
1258 release_sock(sk);
1259 return err;
1260}
1261EXPORT_SYMBOL(tcp_sendmsg);
1262
1263
1264
1265
1266
1267
1268static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1269{
1270 struct tcp_sock *tp = tcp_sk(sk);
1271
1272
1273 if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1274 tp->urg_data == TCP_URG_READ)
1275 return -EINVAL;
1276
1277 if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1278 return -ENOTCONN;
1279
1280 if (tp->urg_data & TCP_URG_VALID) {
1281 int err = 0;
1282 char c = tp->urg_data;
1283
1284 if (!(flags & MSG_PEEK))
1285 tp->urg_data = TCP_URG_READ;
1286
1287
1288 msg->msg_flags |= MSG_OOB;
1289
1290 if (len > 0) {
1291 if (!(flags & MSG_TRUNC))
1292 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1293 len = 1;
1294 } else
1295 msg->msg_flags |= MSG_TRUNC;
1296
1297 return err ? -EFAULT : len;
1298 }
1299
1300 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1301 return 0;
1302
1303
1304
1305
1306
1307
1308
1309 return -EAGAIN;
1310}
1311
1312static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1313{
1314 struct sk_buff *skb;
1315 int copied = 0, err = 0;
1316
1317
1318
1319 skb_queue_walk(&sk->sk_write_queue, skb) {
1320 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len);
1321 if (err)
1322 break;
1323
1324 copied += skb->len;
1325 }
1326
1327 return err ?: copied;
1328}
1329
1330
1331
1332
1333
1334
1335
1336void tcp_cleanup_rbuf(struct sock *sk, int copied)
1337{
1338 struct tcp_sock *tp = tcp_sk(sk);
1339 bool time_to_ack = false;
1340
1341 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1342
1343 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1344 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1345 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1346
1347 if (inet_csk_ack_scheduled(sk)) {
1348 const struct inet_connection_sock *icsk = inet_csk(sk);
1349
1350
1351 if (icsk->icsk_ack.blocked ||
1352
1353 tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1354
1355
1356
1357
1358
1359
1360 (copied > 0 &&
1361 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1362 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1363 !icsk->icsk_ack.pingpong)) &&
1364 !atomic_read(&sk->sk_rmem_alloc)))
1365 time_to_ack = true;
1366 }
1367
1368
1369
1370
1371
1372
1373
1374 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1375 __u32 rcv_window_now = tcp_receive_window(tp);
1376
1377
1378 if (2*rcv_window_now <= tp->window_clamp) {
1379 __u32 new_window = __tcp_select_window(sk);
1380
1381
1382
1383
1384
1385
1386 if (new_window && new_window >= 2 * rcv_window_now)
1387 time_to_ack = true;
1388 }
1389 }
1390 if (time_to_ack)
1391 tcp_send_ack(sk);
1392}
1393
1394static void tcp_prequeue_process(struct sock *sk)
1395{
1396 struct sk_buff *skb;
1397 struct tcp_sock *tp = tcp_sk(sk);
1398
1399 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
1400
1401
1402
1403 local_bh_disable();
1404 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1405 sk_backlog_rcv(sk, skb);
1406 local_bh_enable();
1407
1408
1409 tp->ucopy.memory = 0;
1410}
1411
1412#ifdef CONFIG_NET_DMA
1413static void tcp_service_net_dma(struct sock *sk, bool wait)
1414{
1415 dma_cookie_t done, used;
1416 dma_cookie_t last_issued;
1417 struct tcp_sock *tp = tcp_sk(sk);
1418
1419 if (!tp->ucopy.dma_chan)
1420 return;
1421
1422 last_issued = tp->ucopy.dma_cookie;
1423 dma_async_issue_pending(tp->ucopy.dma_chan);
1424
1425 do {
1426 if (dma_async_is_tx_complete(tp->ucopy.dma_chan,
1427 last_issued, &done,
1428 &used) == DMA_COMPLETE) {
1429
1430 __skb_queue_purge(&sk->sk_async_wait_queue);
1431 break;
1432 } else {
1433 struct sk_buff *skb;
1434 while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
1435 (dma_async_is_complete(skb->dma_cookie, done,
1436 used) == DMA_COMPLETE)) {
1437 __skb_dequeue(&sk->sk_async_wait_queue);
1438 kfree_skb(skb);
1439 }
1440 }
1441 } while (wait);
1442}
1443#endif
1444
1445static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1446{
1447 struct sk_buff *skb;
1448 u32 offset;
1449
1450 while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
1451 offset = seq - TCP_SKB_CB(skb)->seq;
1452 if (tcp_hdr(skb)->syn)
1453 offset--;
1454 if (offset < skb->len || tcp_hdr(skb)->fin) {
1455 *off = offset;
1456 return skb;
1457 }
1458
1459
1460
1461
1462 sk_eat_skb(sk, skb, false);
1463 }
1464 return NULL;
1465}

/*
 * tcp_read_sock - alternative to tcp_recvmsg() for callers that consume
 * receive-queue data in place: each contiguous region is handed to
 * @recv_actor, which returns how many bytes it used (or <= 0 to stop).
 */
1478int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1479 sk_read_actor_t recv_actor)
1480{
1481 struct sk_buff *skb;
1482 struct tcp_sock *tp = tcp_sk(sk);
1483 u32 seq = tp->copied_seq;
1484 u32 offset;
1485 int copied = 0;
1486
1487 if (sk->sk_state == TCP_LISTEN)
1488 return -ENOTCONN;
1489 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1490 if (offset < skb->len) {
1491 int used;
1492 size_t len;
1493
1494 len = skb->len - offset;
1495
1496 if (tp->urg_data) {
1497 u32 urg_offset = tp->urg_seq - seq;
1498 if (urg_offset < len)
1499 len = urg_offset;
1500 if (!len)
1501 break;
1502 }
1503 used = recv_actor(desc, skb, offset, len);
1504 if (used <= 0) {
1505 if (!copied)
1506 copied = used;
1507 break;
1508 } else if (used <= len) {
1509 seq += used;
1510 copied += used;
1511 offset += used;
1512 }
1513
1514
1515
1516
1517
1518 skb = tcp_recv_skb(sk, seq - 1, &offset);
1519 if (!skb)
1520 break;
1521
1522
1523
1524 if (offset + 1 != skb->len)
1525 continue;
1526 }
1527 if (tcp_hdr(skb)->fin) {
1528 sk_eat_skb(sk, skb, false);
1529 ++seq;
1530 break;
1531 }
1532 sk_eat_skb(sk, skb, false);
1533 if (!desc->count)
1534 break;
1535 tp->copied_seq = seq;
1536 }
1537 tp->copied_seq = seq;
1538
1539 tcp_rcv_space_adjust(sk);
1540
1541
1542 if (copied > 0) {
1543 tcp_recv_skb(sk, seq, &offset);
1544 tcp_cleanup_rbuf(sk, copied);
1545 }
1546 return copied;
1547}
1548EXPORT_SYMBOL(tcp_read_sock);

/*
 * tcp_recvmsg - copy data from the receive queue (and, when enabled, the
 * prequeue and backlog) into the user buffer, handling urgent data,
 * MSG_PEEK and the SO_RCVLOWAT "target" before blocking.
 */
1558int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1559 size_t len, int nonblock, int flags, int *addr_len)
1560{
1561 struct tcp_sock *tp = tcp_sk(sk);
1562 int copied = 0;
1563 u32 peek_seq;
1564 u32 *seq;
1565 unsigned long used;
1566 int err;
1567 int target;
1568 long timeo;
1569 struct task_struct *user_recv = NULL;
1570 bool copied_early = false;
1571 struct sk_buff *skb;
1572 u32 urg_hole = 0;
1573
1574 if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
1575 (sk->sk_state == TCP_ESTABLISHED))
1576 sk_busy_loop(sk, nonblock);
1577
1578 lock_sock(sk);
1579
1580 err = -ENOTCONN;
1581 if (sk->sk_state == TCP_LISTEN)
1582 goto out;
1583
1584 timeo = sock_rcvtimeo(sk, nonblock);
1585
1586
1587 if (flags & MSG_OOB)
1588 goto recv_urg;
1589
1590 if (unlikely(tp->repair)) {
1591 err = -EPERM;
1592 if (!(flags & MSG_PEEK))
1593 goto out;
1594
1595 if (tp->repair_queue == TCP_SEND_QUEUE)
1596 goto recv_sndq;
1597
1598 err = -EINVAL;
1599 if (tp->repair_queue == TCP_NO_QUEUE)
1600 goto out;
1601
1602
1603 }
1604
1605 seq = &tp->copied_seq;
1606 if (flags & MSG_PEEK) {
1607 peek_seq = tp->copied_seq;
1608 seq = &peek_seq;
1609 }
1610
1611 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1612
1613#ifdef CONFIG_NET_DMA
1614 tp->ucopy.dma_chan = NULL;
1615 preempt_disable();
1616 skb = skb_peek_tail(&sk->sk_receive_queue);
1617 {
1618 int available = 0;
1619
1620 if (skb)
1621 available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
1622 if ((available < target) &&
1623 (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
1624 !sysctl_tcp_low_latency &&
1625 net_dma_find_channel()) {
1626 preempt_enable_no_resched();
1627 tp->ucopy.pinned_list =
1628 dma_pin_iovec_pages(msg->msg_iov, len);
1629 } else {
1630 preempt_enable_no_resched();
1631 }
1632 }
1633#endif
1634
1635 do {
1636 u32 offset;
1637
1638
1639 if (tp->urg_data && tp->urg_seq == *seq) {
1640 if (copied)
1641 break;
1642 if (signal_pending(current)) {
1643 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1644 break;
1645 }
1646 }
1647
1648
1649
1650 skb_queue_walk(&sk->sk_receive_queue, skb) {
1651
1652
1653
1654 if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
1655 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
1656 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
1657 flags))
1658 break;
1659
1660 offset = *seq - TCP_SKB_CB(skb)->seq;
1661 if (tcp_hdr(skb)->syn)
1662 offset--;
1663 if (offset < skb->len)
1664 goto found_ok_skb;
1665 if (tcp_hdr(skb)->fin)
1666 goto found_fin_ok;
1667 WARN(!(flags & MSG_PEEK),
1668 "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
1669 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
1670 }
1671
1672
1673
1674 if (copied >= target && !sk->sk_backlog.tail)
1675 break;
1676
1677 if (copied) {
1678 if (sk->sk_err ||
1679 sk->sk_state == TCP_CLOSE ||
1680 (sk->sk_shutdown & RCV_SHUTDOWN) ||
1681 !timeo ||
1682 signal_pending(current))
1683 break;
1684 } else {
1685 if (sock_flag(sk, SOCK_DONE))
1686 break;
1687
1688 if (sk->sk_err) {
1689 copied = sock_error(sk);
1690 break;
1691 }
1692
1693 if (sk->sk_shutdown & RCV_SHUTDOWN)
1694 break;
1695
1696 if (sk->sk_state == TCP_CLOSE) {
1697 if (!sock_flag(sk, SOCK_DONE)) {
1698
1699
1700
1701 copied = -ENOTCONN;
1702 break;
1703 }
1704 break;
1705 }
1706
1707 if (!timeo) {
1708 copied = -EAGAIN;
1709 break;
1710 }
1711
1712 if (signal_pending(current)) {
1713 copied = sock_intr_errno(timeo);
1714 break;
1715 }
1716 }
1717
1718 tcp_cleanup_rbuf(sk, copied);
1719
1720 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1721
1722 if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1723 user_recv = current;
1724 tp->ucopy.task = user_recv;
1725 tp->ucopy.iov = msg->msg_iov;
1726 }
1727
1728 tp->ucopy.len = len;
1729
1730 WARN_ON(tp->copied_seq != tp->rcv_nxt &&
1731 !(flags & (MSG_PEEK | MSG_TRUNC)));
1759 if (!skb_queue_empty(&tp->ucopy.prequeue))
1760 goto do_prequeue;
1761
1762
1763 }
1764
1765#ifdef CONFIG_NET_DMA
1766 if (tp->ucopy.dma_chan) {
1767 if (tp->rcv_wnd == 0 &&
1768 !skb_queue_empty(&sk->sk_async_wait_queue)) {
1769 tcp_service_net_dma(sk, true);
1770 tcp_cleanup_rbuf(sk, copied);
1771 } else
1772 dma_async_issue_pending(tp->ucopy.dma_chan);
1773 }
1774#endif
1775 if (copied >= target) {
1776
1777 release_sock(sk);
1778 lock_sock(sk);
1779 } else
1780 sk_wait_data(sk, &timeo);
1781
1782#ifdef CONFIG_NET_DMA
1783 tcp_service_net_dma(sk, false);
1784 tp->ucopy.wakeup = 0;
1785#endif
1786
1787 if (user_recv) {
1788 int chunk;
1789
1790
1791
1792 if ((chunk = len - tp->ucopy.len) != 0) {
1793 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1794 len -= chunk;
1795 copied += chunk;
1796 }
1797
1798 if (tp->rcv_nxt == tp->copied_seq &&
1799 !skb_queue_empty(&tp->ucopy.prequeue)) {
1800do_prequeue:
1801 tcp_prequeue_process(sk);
1802
1803 if ((chunk = len - tp->ucopy.len) != 0) {
1804 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1805 len -= chunk;
1806 copied += chunk;
1807 }
1808 }
1809 }
1810 if ((flags & MSG_PEEK) &&
1811 (peek_seq - copied - urg_hole != tp->copied_seq)) {
1812 net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
1813 current->comm,
1814 task_pid_nr(current));
1815 peek_seq = tp->copied_seq;
1816 }
1817 continue;
1818
1819 found_ok_skb:
1820
1821 used = skb->len - offset;
1822 if (len < used)
1823 used = len;
1824
1825
1826 if (tp->urg_data) {
1827 u32 urg_offset = tp->urg_seq - *seq;
1828 if (urg_offset < used) {
1829 if (!urg_offset) {
1830 if (!sock_flag(sk, SOCK_URGINLINE)) {
1831 ++*seq;
1832 urg_hole++;
1833 offset++;
1834 used--;
1835 if (!used)
1836 goto skip_copy;
1837 }
1838 } else
1839 used = urg_offset;
1840 }
1841 }
1842
1843 if (!(flags & MSG_TRUNC)) {
1844#ifdef CONFIG_NET_DMA
1845 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1846 tp->ucopy.dma_chan = net_dma_find_channel();
1847
1848 if (tp->ucopy.dma_chan) {
1849 tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
1850 tp->ucopy.dma_chan, skb, offset,
1851 msg->msg_iov, used,
1852 tp->ucopy.pinned_list);
1853
1854 if (tp->ucopy.dma_cookie < 0) {
1855
1856 pr_alert("%s: dma_cookie < 0\n",
1857 __func__);
1858
1859
1860 if (!copied)
1861 copied = -EFAULT;
1862 break;
1863 }
1864
1865 dma_async_issue_pending(tp->ucopy.dma_chan);
1866
1867 if ((offset + used) == skb->len)
1868 copied_early = true;
1869
1870 } else
1871#endif
1872 {
1873 err = skb_copy_datagram_iovec(skb, offset,
1874 msg->msg_iov, used);
1875 if (err) {
1876
1877 if (!copied)
1878 copied = -EFAULT;
1879 break;
1880 }
1881 }
1882 }
1883
1884 *seq += used;
1885 copied += used;
1886 len -= used;
1887
1888 tcp_rcv_space_adjust(sk);
1889
1890skip_copy:
1891 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1892 tp->urg_data = 0;
1893 tcp_fast_path_check(sk);
1894 }
1895 if (used + offset < skb->len)
1896 continue;
1897
1898 if (tcp_hdr(skb)->fin)
1899 goto found_fin_ok;
1900 if (!(flags & MSG_PEEK)) {
1901 sk_eat_skb(sk, skb, copied_early);
1902 copied_early = false;
1903 }
1904 continue;
1905
1906 found_fin_ok:
1907
1908 ++*seq;
1909 if (!(flags & MSG_PEEK)) {
1910 sk_eat_skb(sk, skb, copied_early);
1911 copied_early = false;
1912 }
1913 break;
1914 } while (len > 0);
1915
1916 if (user_recv) {
1917 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1918 int chunk;
1919
1920 tp->ucopy.len = copied > 0 ? len : 0;
1921
1922 tcp_prequeue_process(sk);
1923
1924 if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1925 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1926 len -= chunk;
1927 copied += chunk;
1928 }
1929 }
1930
1931 tp->ucopy.task = NULL;
1932 tp->ucopy.len = 0;
1933 }
1934
1935#ifdef CONFIG_NET_DMA
1936 tcp_service_net_dma(sk, true);
1937 tp->ucopy.dma_chan = NULL;
1938
1939 if (tp->ucopy.pinned_list) {
1940 dma_unpin_iovec_pages(tp->ucopy.pinned_list);
1941 tp->ucopy.pinned_list = NULL;
1942 }
1943#endif
1944
1945
1946
1947
1948
1949
1950 tcp_cleanup_rbuf(sk, copied);
1951
1952 release_sock(sk);
1953 return copied;
1954
1955out:
1956 release_sock(sk);
1957 return err;
1958
1959recv_urg:
1960 err = tcp_recv_urg(sk, msg, len, flags);
1961 goto out;
1962
1963recv_sndq:
1964 err = tcp_peek_sndq(sk, msg, len);
1965 goto out;
1966}
1967EXPORT_SYMBOL(tcp_recvmsg);
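
/*
 * Illustrative user-space sketch (not part of the kernel build): the
 * "target" logic above means a plain recv() returns as soon as SO_RCVLOWAT
 * bytes (1 by default) are available, MSG_WAITALL raises the target to the
 * full buffer, and MSG_PEEK copies data without advancing copied_seq.
 *
 *	char buf[4096];
 *	ssize_t peeked = recv(tcp_fd, buf, sizeof(buf), MSG_PEEK);
 *	ssize_t got    = recv(tcp_fd, buf, sizeof(buf), MSG_WAITALL);
 *
 * "tcp_fd" is an assumed connected socket; as the rate-limited warning above
 * notes, mixing MSG_PEEK with concurrent readers on one socket is racy.
 */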
1968
1969void tcp_set_state(struct sock *sk, int state)
1970{
1971 int oldstate = sk->sk_state;
1972
1973 switch (state) {
1974 case TCP_ESTABLISHED:
1975 if (oldstate != TCP_ESTABLISHED)
1976 TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1977 break;
1978
1979 case TCP_CLOSE:
1980 if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
1981 TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
1982
1983 sk->sk_prot->unhash(sk);
1984 if (inet_csk(sk)->icsk_bind_hash &&
1985 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
1986 inet_put_port(sk);
1987
1988 default:
1989 if (oldstate == TCP_ESTABLISHED)
1990 TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1991 }
1992
1993
1994
1995
1996 sk->sk_state = state;
1997
1998#ifdef STATE_TRACE
1999 SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
2000#endif
2001}
2002EXPORT_SYMBOL_GPL(tcp_set_state);
2003
2004
2005
2006
2007
2008
2009
2010
2011static const unsigned char new_state[16] = {
2012
2013 TCP_CLOSE,
2014 TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2015 TCP_CLOSE,
2016 TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2017 TCP_FIN_WAIT1,
2018 TCP_FIN_WAIT2,
2019 TCP_CLOSE,
2020 TCP_CLOSE,
2021 TCP_LAST_ACK | TCP_ACTION_FIN,
2022 TCP_LAST_ACK,
2023 TCP_CLOSE,
2024 TCP_CLOSING,
2025};
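
/*
 * new_state[] is indexed by the current sk_state (TCP_ESTABLISHED == 1 up to
 * TCP_CLOSING == 11); each entry is the state to move to on close(), with
 * TCP_ACTION_FIN or-ed in when a FIN must be sent from that state.  For
 * example, an ESTABLISHED socket goes to FIN_WAIT1 and sends a FIN, while a
 * SYN_SENT socket can simply be closed.
 */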
2026
2027static int tcp_close_state(struct sock *sk)
2028{
2029 int next = (int)new_state[sk->sk_state];
2030 int ns = next & TCP_STATE_MASK;
2031
2032 tcp_set_state(sk, ns);
2033
2034 return next & TCP_ACTION_FIN;
2035}
2036
2037
2038
2039
2040
2041
2042void tcp_shutdown(struct sock *sk, int how)
2043{
2044
2045
2046
2047
2048 if (!(how & SEND_SHUTDOWN))
2049 return;
2050
2051
2052 if ((1 << sk->sk_state) &
2053 (TCPF_ESTABLISHED | TCPF_SYN_SENT |
2054 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
2055
2056 if (tcp_close_state(sk))
2057 tcp_send_fin(sk);
2058 }
2059}
2060EXPORT_SYMBOL(tcp_shutdown);
2061
2062bool tcp_check_oom(struct sock *sk, int shift)
2063{
2064 bool too_many_orphans, out_of_socket_memory;
2065
2066 too_many_orphans = tcp_too_many_orphans(sk, shift);
2067 out_of_socket_memory = tcp_out_of_memory(sk);
2068
2069 if (too_many_orphans)
2070 net_info_ratelimited("too many orphaned sockets\n");
2071 if (out_of_socket_memory)
2072 net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
2073 return too_many_orphans || out_of_socket_memory;
2074}
2075
2076void tcp_close(struct sock *sk, long timeout)
2077{
2078 struct sk_buff *skb;
2079 int data_was_unread = 0;
2080 int state;
2081
2082 lock_sock(sk);
2083 sk->sk_shutdown = SHUTDOWN_MASK;
2084
2085 if (sk->sk_state == TCP_LISTEN) {
2086 tcp_set_state(sk, TCP_CLOSE);
2087
2088
2089 inet_csk_listen_stop(sk);
2090
2091 goto adjudge_to_death;
2092 }
2093
2094
2095
2096
2097
2098 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
2099 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
2100 tcp_hdr(skb)->fin;
2101 data_was_unread += len;
2102 __kfree_skb(skb);
2103 }
2104
2105 sk_mem_reclaim(sk);
2106
2107
2108 if (sk->sk_state == TCP_CLOSE)
2109 goto adjudge_to_death;
2118 if (unlikely(tcp_sk(sk)->repair)) {
2119 sk->sk_prot->disconnect(sk, 0);
2120 } else if (data_was_unread) {
2121
2122 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
2123 tcp_set_state(sk, TCP_CLOSE);
2124 tcp_send_active_reset(sk, sk->sk_allocation);
2125 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
2126
2127 sk->sk_prot->disconnect(sk, 0);
2128 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
2129 } else if (tcp_close_state(sk)) {
2159 tcp_send_fin(sk);
2160 }
2161
2162 sk_stream_wait_close(sk, timeout);
2163
2164adjudge_to_death:
2165 state = sk->sk_state;
2166 sock_hold(sk);
2167 sock_orphan(sk);
2168
2169
2170 release_sock(sk);
2171
2172
2173
2174
2175
2176 local_bh_disable();
2177 bh_lock_sock(sk);
2178 WARN_ON(sock_owned_by_user(sk));
2179
2180 percpu_counter_inc(sk->sk_prot->orphan_count);
2181
2182
2183 if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
2184 goto out;
2200 if (sk->sk_state == TCP_FIN_WAIT2) {
2201 struct tcp_sock *tp = tcp_sk(sk);
2202 if (tp->linger2 < 0) {
2203 tcp_set_state(sk, TCP_CLOSE);
2204 tcp_send_active_reset(sk, GFP_ATOMIC);
2205 NET_INC_STATS_BH(sock_net(sk),
2206 LINUX_MIB_TCPABORTONLINGER);
2207 } else {
2208 const int tmo = tcp_fin_time(sk);
2209
2210 if (tmo > TCP_TIMEWAIT_LEN) {
2211 inet_csk_reset_keepalive_timer(sk,
2212 tmo - TCP_TIMEWAIT_LEN);
2213 } else {
2214 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2215 goto out;
2216 }
2217 }
2218 }
2219 if (sk->sk_state != TCP_CLOSE) {
2220 sk_mem_reclaim(sk);
2221 if (tcp_check_oom(sk, 0)) {
2222 tcp_set_state(sk, TCP_CLOSE);
2223 tcp_send_active_reset(sk, GFP_ATOMIC);
2224 NET_INC_STATS_BH(sock_net(sk),
2225 LINUX_MIB_TCPABORTONMEMORY);
2226 }
2227 }
2228
2229 if (sk->sk_state == TCP_CLOSE) {
2230 struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
2231
2232
2233
2234
2235 if (req != NULL)
2236 reqsk_fastopen_remove(sk, req, false);
2237 inet_csk_destroy_sock(sk);
2238 }
2239
2240
2241out:
2242 bh_unlock_sock(sk);
2243 local_bh_enable();
2244 sock_put(sk);
2245}
2246EXPORT_SYMBOL(tcp_close);
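
/*
 * Illustrative user-space sketch (not part of the kernel build) of the
 * SO_LINGER branch above: with l_onoff set and l_linger == 0, close() skips
 * the FIN handshake, the connection is aborted with a RST and
 * LINUX_MIB_TCPABORTONDATA is counted.
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	struct linger lg = { .l_onoff = 1, .l_linger = 0 };
 *	setsockopt(tcp_fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
 *	close(tcp_fd);		// aborts with RST instead of FIN
 *
 * Unread data at close() time has a similar effect (TCPABORTONCLOSE), per
 * the data_was_unread handling above.
 */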
2247
2248
2249
2250static inline bool tcp_need_reset(int state)
2251{
2252 return (1 << state) &
2253 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2254 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2255}
2256
2257int tcp_disconnect(struct sock *sk, int flags)
2258{
2259 struct inet_sock *inet = inet_sk(sk);
2260 struct inet_connection_sock *icsk = inet_csk(sk);
2261 struct tcp_sock *tp = tcp_sk(sk);
2262 int err = 0;
2263 int old_state = sk->sk_state;
2264
2265 if (old_state != TCP_CLOSE)
2266 tcp_set_state(sk, TCP_CLOSE);
2267
2268
2269 if (old_state == TCP_LISTEN) {
2270 inet_csk_listen_stop(sk);
2271 } else if (unlikely(tp->repair)) {
2272 sk->sk_err = ECONNABORTED;
2273 } else if (tcp_need_reset(old_state) ||
2274 (tp->snd_nxt != tp->write_seq &&
2275 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2276
2277
2278
2279 tcp_send_active_reset(sk, gfp_any());
2280 sk->sk_err = ECONNRESET;
2281 } else if (old_state == TCP_SYN_SENT)
2282 sk->sk_err = ECONNRESET;
2283
2284 tcp_clear_xmit_timers(sk);
2285 __skb_queue_purge(&sk->sk_receive_queue);
2286 tcp_write_queue_purge(sk);
2287 __skb_queue_purge(&tp->out_of_order_queue);
2288#ifdef CONFIG_NET_DMA
2289 __skb_queue_purge(&sk->sk_async_wait_queue);
2290#endif
2291
2292 inet->inet_dport = 0;
2293
2294 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2295 inet_reset_saddr(sk);
2296
2297 sk->sk_shutdown = 0;
2298 sock_reset_flag(sk, SOCK_DONE);
2299 tp->srtt = 0;
2300 if ((tp->write_seq += tp->max_window + 2) == 0)
2301 tp->write_seq = 1;
2302 icsk->icsk_backoff = 0;
2303 tp->snd_cwnd = 2;
2304 icsk->icsk_probes_out = 0;
2305 tp->packets_out = 0;
2306 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
2307 tp->snd_cwnd_cnt = 0;
2308 tp->window_clamp = 0;
2309 tcp_set_ca_state(sk, TCP_CA_Open);
2310 tcp_clear_retrans(tp);
2311 inet_csk_delack_init(sk);
2312 tcp_init_send_head(sk);
2313 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
2314 __sk_dst_reset(sk);
2315
2316 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
2317
2318 sk->sk_error_report(sk);
2319 return err;
2320}
2321EXPORT_SYMBOL(tcp_disconnect);
2322
2323void tcp_sock_destruct(struct sock *sk)
2324{
2325 inet_sock_destruct(sk);
2326
2327 kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
2328}
2329
2330static inline bool tcp_can_repair_sock(const struct sock *sk)
2331{
2332 return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
2333 ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
2334}
2335
2336static int tcp_repair_options_est(struct tcp_sock *tp,
2337 struct tcp_repair_opt __user *optbuf, unsigned int len)
2338{
2339 struct tcp_repair_opt opt;
2340
2341 while (len >= sizeof(opt)) {
2342 if (copy_from_user(&opt, optbuf, sizeof(opt)))
2343 return -EFAULT;
2344
2345 optbuf++;
2346 len -= sizeof(opt);
2347
2348 switch (opt.opt_code) {
2349 case TCPOPT_MSS:
2350 tp->rx_opt.mss_clamp = opt.opt_val;
2351 break;
2352 case TCPOPT_WINDOW:
2353 {
2354 u16 snd_wscale = opt.opt_val & 0xFFFF;
2355 u16 rcv_wscale = opt.opt_val >> 16;
2356
2357 if (snd_wscale > 14 || rcv_wscale > 14)
2358 return -EFBIG;
2359
2360 tp->rx_opt.snd_wscale = snd_wscale;
2361 tp->rx_opt.rcv_wscale = rcv_wscale;
2362 tp->rx_opt.wscale_ok = 1;
2363 }
2364 break;
2365 case TCPOPT_SACK_PERM:
2366 if (opt.opt_val != 0)
2367 return -EINVAL;
2368
2369 tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
2370 if (sysctl_tcp_fack)
2371 tcp_enable_fack(tp);
2372 break;
2373 case TCPOPT_TIMESTAMP:
2374 if (opt.opt_val != 0)
2375 return -EINVAL;
2376
2377 tp->rx_opt.tstamp_ok = 1;
2378 break;
2379 }
2380 }
2381
2382 return 0;
2383}
2384
2385
2386
2387
2388static int do_tcp_setsockopt(struct sock *sk, int level,
2389 int optname, char __user *optval, unsigned int optlen)
2390{
2391 struct tcp_sock *tp = tcp_sk(sk);
2392 struct inet_connection_sock *icsk = inet_csk(sk);
2393 int val;
2394 int err = 0;
2395
2396
2397 switch (optname) {
2398 case TCP_CONGESTION: {
2399 char name[TCP_CA_NAME_MAX];
2400
2401 if (optlen < 1)
2402 return -EINVAL;
2403
2404 val = strncpy_from_user(name, optval,
2405 min_t(long, TCP_CA_NAME_MAX-1, optlen));
2406 if (val < 0)
2407 return -EFAULT;
2408 name[val] = 0;
2409
2410 lock_sock(sk);
2411 err = tcp_set_congestion_control(sk, name);
2412 release_sock(sk);
2413 return err;
2414 }
2415 default:
2416
2417 break;
2418 }
2419
2420 if (optlen < sizeof(int))
2421 return -EINVAL;
2422
2423 if (get_user(val, (int __user *)optval))
2424 return -EFAULT;
2425
2426 lock_sock(sk);
2427
2428 switch (optname) {
2429 case TCP_MAXSEG:
2430
2431
2432
2433 if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
2434 err = -EINVAL;
2435 break;
2436 }
2437 tp->rx_opt.user_mss = val;
2438 break;
2439
2440 case TCP_NODELAY:
2441 if (val) {
			/* TCP_NODELAY is weaker than TCP_CORK: it disables
			 * Nagle on this socket and explicitly pushes any
			 * pending partial frames out immediately.
			 */
2450 tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2451 tcp_push_pending_frames(sk);
2452 } else {
2453 tp->nonagle &= ~TCP_NAGLE_OFF;
2454 }
2455 break;
2456
2457 case TCP_THIN_LINEAR_TIMEOUTS:
2458 if (val < 0 || val > 1)
2459 err = -EINVAL;
2460 else
2461 tp->thin_lto = val;
2462 break;
2463
2464 case TCP_THIN_DUPACK:
2465 if (val < 0 || val > 1)
2466 err = -EINVAL;
2467 else {
2468 tp->thin_dupack = val;
2469 if (tp->thin_dupack)
2470 tcp_disable_early_retrans(tp);
2471 }
2472 break;
2473
2474 case TCP_REPAIR:
2475 if (!tcp_can_repair_sock(sk))
2476 err = -EPERM;
2477 else if (val == 1) {
2478 tp->repair = 1;
2479 sk->sk_reuse = SK_FORCE_REUSE;
2480 tp->repair_queue = TCP_NO_QUEUE;
2481 } else if (val == 0) {
2482 tp->repair = 0;
2483 sk->sk_reuse = SK_NO_REUSE;
2484 tcp_send_window_probe(sk);
2485 } else
2486 err = -EINVAL;
2487
2488 break;
2489
2490 case TCP_REPAIR_QUEUE:
2491 if (!tp->repair)
2492 err = -EPERM;
2493 else if (val < TCP_QUEUES_NR)
2494 tp->repair_queue = val;
2495 else
2496 err = -EINVAL;
2497 break;
2498
2499 case TCP_QUEUE_SEQ:
2500 if (sk->sk_state != TCP_CLOSE)
2501 err = -EPERM;
2502 else if (tp->repair_queue == TCP_SEND_QUEUE)
2503 tp->write_seq = val;
2504 else if (tp->repair_queue == TCP_RECV_QUEUE)
2505 tp->rcv_nxt = val;
2506 else
2507 err = -EINVAL;
2508 break;
2509
2510 case TCP_REPAIR_OPTIONS:
2511 if (!tp->repair)
2512 err = -EINVAL;
2513 else if (sk->sk_state == TCP_ESTABLISHED)
2514 err = tcp_repair_options_est(tp,
2515 (struct tcp_repair_opt __user *)optval,
2516 optlen);
2517 else
2518 err = -EPERM;
2519 break;
2520
2521 case TCP_CORK:
		/* TCP_CORK holds back partial frames so only full-sized
		 * segments go out while the cork is set; clearing it pushes
		 * whatever partial frame is left (re-applying TCP_NODELAY
		 * behaviour if that was requested while corked).
		 */
2533 if (val) {
2534 tp->nonagle |= TCP_NAGLE_CORK;
2535 } else {
2536 tp->nonagle &= ~TCP_NAGLE_CORK;
2537 if (tp->nonagle&TCP_NAGLE_OFF)
2538 tp->nonagle |= TCP_NAGLE_PUSH;
2539 tcp_push_pending_frames(sk);
2540 }
2541 break;
2542
2543 case TCP_KEEPIDLE:
2544 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2545 err = -EINVAL;
2546 else {
2547 tp->keepalive_time = val * HZ;
2548 if (sock_flag(sk, SOCK_KEEPOPEN) &&
2549 !((1 << sk->sk_state) &
2550 (TCPF_CLOSE | TCPF_LISTEN))) {
2551 u32 elapsed = keepalive_time_elapsed(tp);
2552 if (tp->keepalive_time > elapsed)
2553 elapsed = tp->keepalive_time - elapsed;
2554 else
2555 elapsed = 0;
2556 inet_csk_reset_keepalive_timer(sk, elapsed);
2557 }
2558 }
2559 break;
2560 case TCP_KEEPINTVL:
2561 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2562 err = -EINVAL;
2563 else
2564 tp->keepalive_intvl = val * HZ;
2565 break;
2566 case TCP_KEEPCNT:
2567 if (val < 1 || val > MAX_TCP_KEEPCNT)
2568 err = -EINVAL;
2569 else
2570 tp->keepalive_probes = val;
2571 break;
2572 case TCP_SYNCNT:
2573 if (val < 1 || val > MAX_TCP_SYNCNT)
2574 err = -EINVAL;
2575 else
2576 icsk->icsk_syn_retries = val;
2577 break;
2578
2579 case TCP_LINGER2:
2580 if (val < 0)
2581 tp->linger2 = -1;
2582 else if (val > sysctl_tcp_fin_timeout / HZ)
2583 tp->linger2 = 0;
2584 else
2585 tp->linger2 = val * HZ;
2586 break;
2587
2588 case TCP_DEFER_ACCEPT:
2589
2590 icsk->icsk_accept_queue.rskq_defer_accept =
2591 secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
2592 TCP_RTO_MAX / HZ);
2593 break;
2594
2595 case TCP_WINDOW_CLAMP:
2596 if (!val) {
2597 if (sk->sk_state != TCP_CLOSE) {
2598 err = -EINVAL;
2599 break;
2600 }
2601 tp->window_clamp = 0;
2602 } else
2603 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2604 SOCK_MIN_RCVBUF / 2 : val;
2605 break;
2606
2607 case TCP_QUICKACK:
2608 if (!val) {
2609 icsk->icsk_ack.pingpong = 1;
2610 } else {
2611 icsk->icsk_ack.pingpong = 0;
2612 if ((1 << sk->sk_state) &
2613 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2614 inet_csk_ack_scheduled(sk)) {
2615 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
2616 tcp_cleanup_rbuf(sk, 1);
2617 if (!(val & 1))
2618 icsk->icsk_ack.pingpong = 1;
2619 }
2620 }
2621 break;
2622
2623#ifdef CONFIG_TCP_MD5SIG
2624 case TCP_MD5SIG:
2625
2626 err = tp->af_specific->md5_parse(sk, optval, optlen);
2627 break;
2628#endif
2629 case TCP_USER_TIMEOUT:
2630
2631
2632
2633 if (val < 0)
2634 err = -EINVAL;
2635 else
2636 icsk->icsk_user_timeout = msecs_to_jiffies(val);
2637 break;
2638
2639 case TCP_FASTOPEN:
2640 if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
2641 TCPF_LISTEN)))
2642 err = fastopen_init_queue(sk, val);
2643 else
2644 err = -EINVAL;
2645 break;
2646 case TCP_TIMESTAMP:
2647 if (!tp->repair)
2648 err = -EPERM;
2649 else
2650 tp->tsoffset = val - tcp_time_stamp;
2651 break;
2652 case TCP_NOTSENT_LOWAT:
2653 tp->notsent_lowat = val;
2654 sk->sk_write_space(sk);
2655 break;
2656 default:
2657 err = -ENOPROTOOPT;
2658 break;
2659 }
2660
2661 release_sock(sk);
2662 return err;
2663}
2664
2665int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2666 unsigned int optlen)
2667{
2668 const struct inet_connection_sock *icsk = inet_csk(sk);
2669
2670 if (level != SOL_TCP)
2671 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
2672 optval, optlen);
2673 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2674}
2675EXPORT_SYMBOL(tcp_setsockopt);
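
/*
 * Illustrative user-space sketch (not part of the kernel build) exercising a
 * few of the options handled in do_tcp_setsockopt(); all take IPPROTO_TCP
 * (SOL_TCP) as the level.
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *
 *	int one = 1, idle = 60, intvl = 10, cnt = 5;
 *	setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
 *	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "cubic", 5);
 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &one, sizeof(one));
 *
 * "fd" is an assumed TCP socket; keepalive probing only starts once
 * SO_KEEPALIVE is enabled, the TCP_KEEP* values merely tune it.
 */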
2676
2677#ifdef CONFIG_COMPAT
2678int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
2679 char __user *optval, unsigned int optlen)
2680{
2681 if (level != SOL_TCP)
2682 return inet_csk_compat_setsockopt(sk, level, optname,
2683 optval, optlen);
2684 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2685}
2686EXPORT_SYMBOL(compat_tcp_setsockopt);
2687#endif
2688
2689
2690void tcp_get_info(const struct sock *sk, struct tcp_info *info)
2691{
2692 const struct tcp_sock *tp = tcp_sk(sk);
2693 const struct inet_connection_sock *icsk = inet_csk(sk);
2694 u32 now = tcp_time_stamp;
2695
2696 memset(info, 0, sizeof(*info));
2697
2698 info->tcpi_state = sk->sk_state;
2699 info->tcpi_ca_state = icsk->icsk_ca_state;
2700 info->tcpi_retransmits = icsk->icsk_retransmits;
2701 info->tcpi_probes = icsk->icsk_probes_out;
2702 info->tcpi_backoff = icsk->icsk_backoff;
2703
2704 if (tp->rx_opt.tstamp_ok)
2705 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2706 if (tcp_is_sack(tp))
2707 info->tcpi_options |= TCPI_OPT_SACK;
2708 if (tp->rx_opt.wscale_ok) {
2709 info->tcpi_options |= TCPI_OPT_WSCALE;
2710 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2711 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2712 }
2713
2714 if (tp->ecn_flags & TCP_ECN_OK)
2715 info->tcpi_options |= TCPI_OPT_ECN;
2716 if (tp->ecn_flags & TCP_ECN_SEEN)
2717 info->tcpi_options |= TCPI_OPT_ECN_SEEN;
2718 if (tp->syn_data_acked)
2719 info->tcpi_options |= TCPI_OPT_SYN_DATA;
2720
2721 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2722 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
2723 info->tcpi_snd_mss = tp->mss_cache;
2724 info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
2725
2726 if (sk->sk_state == TCP_LISTEN) {
2727 info->tcpi_unacked = sk->sk_ack_backlog;
2728 info->tcpi_sacked = sk->sk_max_ack_backlog;
2729 } else {
2730 info->tcpi_unacked = tp->packets_out;
2731 info->tcpi_sacked = tp->sacked_out;
2732 }
2733 info->tcpi_lost = tp->lost_out;
2734 info->tcpi_retrans = tp->retrans_out;
2735 info->tcpi_fackets = tp->fackets_out;
2736
2737 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2738 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2739 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2740
2741 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
2742 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2743 info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2744 info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2745 info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2746 info->tcpi_snd_cwnd = tp->snd_cwnd;
2747 info->tcpi_advmss = tp->advmss;
2748 info->tcpi_reordering = tp->reordering;
2749
2750 info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2751 info->tcpi_rcv_space = tp->rcvq_space.space;
2752
2753 info->tcpi_total_retrans = tp->total_retrans;
2754}
2755EXPORT_SYMBOL_GPL(tcp_get_info);
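
/*
 * Illustrative user-space sketch (not part of the kernel build) reading the
 * structure filled in by tcp_get_info(); note that tcpi_rtt and tcpi_rttvar
 * are reported in microseconds and tcpi_snd_cwnd in segments.
 *
 *	#include <stdio.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
 *		printf("rtt %uus cwnd %u retrans %u\n",
 *		       ti.tcpi_rtt, ti.tcpi_snd_cwnd, ti.tcpi_total_retrans);
 *
 * "fd" is an assumed TCP socket; the kernel copies at most "len" bytes, so
 * older binaries keep working as new fields are appended to struct tcp_info.
 */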

static int do_tcp_getsockopt(struct sock *sk, int level,
                int optname, char __user *optval, int __user *optlen)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        int val, len;

        if (get_user(len, optlen))
                return -EFAULT;

        len = min_t(unsigned int, len, sizeof(int));

        if (len < 0)
                return -EINVAL;

        switch (optname) {
        case TCP_MAXSEG:
                val = tp->mss_cache;
                if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
                        val = tp->rx_opt.user_mss;
                if (tp->repair)
                        val = tp->rx_opt.mss_clamp;
                break;
        case TCP_NODELAY:
                val = !!(tp->nonagle&TCP_NAGLE_OFF);
                break;
        case TCP_CORK:
                val = !!(tp->nonagle&TCP_NAGLE_CORK);
                break;
        case TCP_KEEPIDLE:
                val = keepalive_time_when(tp) / HZ;
                break;
        case TCP_KEEPINTVL:
                val = keepalive_intvl_when(tp) / HZ;
                break;
        case TCP_KEEPCNT:
                val = keepalive_probes(tp);
                break;
        case TCP_SYNCNT:
                val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
                break;
        case TCP_LINGER2:
                val = tp->linger2;
                if (val >= 0)
                        val = (val ? : sysctl_tcp_fin_timeout) / HZ;
                break;
        case TCP_DEFER_ACCEPT:
                val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
                                      TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
                break;
        case TCP_WINDOW_CLAMP:
                val = tp->window_clamp;
                break;
        case TCP_INFO: {
                struct tcp_info info;

                if (get_user(len, optlen))
                        return -EFAULT;

                tcp_get_info(sk, &info);

                len = min_t(unsigned int, len, sizeof(info));
                if (put_user(len, optlen))
                        return -EFAULT;
                if (copy_to_user(optval, &info, len))
                        return -EFAULT;
                return 0;
        }
        case TCP_QUICKACK:
                val = !icsk->icsk_ack.pingpong;
                break;

        case TCP_CONGESTION:
                if (get_user(len, optlen))
                        return -EFAULT;
                len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
                if (put_user(len, optlen))
                        return -EFAULT;
                if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
                        return -EFAULT;
                return 0;

        case TCP_THIN_LINEAR_TIMEOUTS:
                val = tp->thin_lto;
                break;
        case TCP_THIN_DUPACK:
                val = tp->thin_dupack;
                break;

        case TCP_REPAIR:
                val = tp->repair;
                break;

        case TCP_REPAIR_QUEUE:
                if (tp->repair)
                        val = tp->repair_queue;
                else
                        return -EINVAL;
                break;

        case TCP_QUEUE_SEQ:
                if (tp->repair_queue == TCP_SEND_QUEUE)
                        val = tp->write_seq;
                else if (tp->repair_queue == TCP_RECV_QUEUE)
                        val = tp->rcv_nxt;
                else
                        return -EINVAL;
                break;

        case TCP_USER_TIMEOUT:
                val = jiffies_to_msecs(icsk->icsk_user_timeout);
                break;
        case TCP_TIMESTAMP:
                val = tcp_time_stamp + tp->tsoffset;
                break;
        case TCP_NOTSENT_LOWAT:
                val = tp->notsent_lowat;
                break;
        default:
                return -ENOPROTOOPT;
        }

        if (put_user(len, optlen))
                return -EFAULT;
        if (copy_to_user(optval, &val, len))
                return -EFAULT;
        return 0;
}

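/* Main getsockopt() entry point for SOL_TCP; options at any other level
 * are handed back to the address-family specific handler.
 */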
int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
                int __user *optlen)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        if (level != SOL_TCP)
                return icsk->icsk_af_ops->getsockopt(sk, level, optname,
                                                     optval, optlen);
        return do_tcp_getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(tcp_getsockopt);

#ifdef CONFIG_COMPAT
int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
                          char __user *optval, int __user *optlen)
{
        if (level != SOL_TCP)
                return inet_csk_compat_getsockopt(sk, level, optname,
                                                  optval, optlen);
        return do_tcp_getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_tcp_getsockopt);
#endif

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool __read_mostly;
static DEFINE_MUTEX(tcp_md5sig_mutex);

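/* TCP-MD5 (RFC 2385) support: a pool of per-CPU "md5" crypto_hash
 * transforms is allocated lazily the first time it is needed and then kept
 * around; tcp_md5sig_mutex serialises the one-time allocation.
 */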
static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu);

                if (p->md5_desc.tfm)
                        crypto_free_hash(p->md5_desc.tfm);
        }
        free_percpu(pool);
}

static void __tcp_alloc_md5sig_pool(void)
{
        int cpu;
        struct tcp_md5sig_pool __percpu *pool;

        pool = alloc_percpu(struct tcp_md5sig_pool);
        if (!pool)
                return;

        for_each_possible_cpu(cpu) {
                struct crypto_hash *hash;

                hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
                if (IS_ERR_OR_NULL(hash))
                        goto out_free;

                per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash;
        }
        /* Before publishing tcp_md5sig_pool, commit all writes to memory.
         * Pairs with the ACCESS_ONCE() read in tcp_get_md5sig_pool().
         */
        smp_wmb();
        tcp_md5sig_pool = pool;
        return;
out_free:
        __tcp_free_md5sig_pool(pool);
}

bool tcp_alloc_md5sig_pool(void)
{
        if (unlikely(!tcp_md5sig_pool)) {
                mutex_lock(&tcp_md5sig_mutex);

                if (!tcp_md5sig_pool)
                        __tcp_alloc_md5sig_pool();

                mutex_unlock(&tcp_md5sig_mutex);
        }
        return tcp_md5sig_pool != NULL;
}
EXPORT_SYMBOL(tcp_alloc_md5sig_pool);

/**
 * tcp_get_md5sig_pool - get md5sig_pool for this user
 *
 * We use a per-CPU structure, so on success we return with preemption and
 * BH disabled; they must not be re-enabled until the caller is done with
 * the pool.
 *
 * Callers must call tcp_put_md5sig_pool() to release the pool.
 */
struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
{
        struct tcp_md5sig_pool __percpu *p;

        local_bh_disable();
        p = ACCESS_ONCE(tcp_md5sig_pool);
        if (p)
                return __this_cpu_ptr(p);

        local_bh_enable();
        return NULL;
}
EXPORT_SYMBOL(tcp_get_md5sig_pool);

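/* Rough in-kernel usage sketch for the helpers below (the real callers live
 * in tcp_ipv4.c/tcp_ipv6.c and also hash an address-family pseudo-header);
 * error handling is omitted and the variable names are illustrative only:
 *
 *	struct tcp_md5sig_pool *hp = tcp_get_md5sig_pool();
 *
 *	if (hp) {
 *		crypto_hash_init(&hp->md5_desc);
 *		tcp_md5_hash_header(hp, th);
 *		tcp_md5_hash_skb_data(hp, skb, th->doff << 2);
 *		tcp_md5_hash_key(hp, key);
 *		crypto_hash_final(&hp->md5_desc, md5_hash);
 *		tcp_put_md5sig_pool();
 *	}
 */
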
int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
                        const struct tcphdr *th)
{
        struct scatterlist sg;
        struct tcphdr hdr;
        int err;

        /* We are not allowed to change tcphdr, make a local copy */
        memcpy(&hdr, th, sizeof(hdr));
        hdr.check = 0;

        /* The checksum field is hashed as zero; options are not included */
        sg_init_one(&sg, &hdr, sizeof(hdr));
        err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr));
        return err;
}
EXPORT_SYMBOL(tcp_md5_hash_header);

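/* Feed the TCP payload of @skb into the MD5 hash: first the linear area
 * past @header_len, then every page fragment, then any frag_list skbs
 * (recursively, with a zero header offset).
 */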
int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
                          const struct sk_buff *skb, unsigned int header_len)
{
        struct scatterlist sg;
        const struct tcphdr *tp = tcp_hdr(skb);
        struct hash_desc *desc = &hp->md5_desc;
        unsigned int i;
        const unsigned int head_data_len = skb_headlen(skb) > header_len ?
                                           skb_headlen(skb) - header_len : 0;
        const struct skb_shared_info *shi = skb_shinfo(skb);
        struct sk_buff *frag_iter;

        sg_init_table(&sg, 1);

        sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
        if (crypto_hash_update(desc, &sg, head_data_len))
                return 1;

        for (i = 0; i < shi->nr_frags; ++i) {
                const struct skb_frag_struct *f = &shi->frags[i];
                unsigned int offset = f->page_offset;
                struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);

                sg_set_page(&sg, page, skb_frag_size(f),
                            offset_in_page(offset));
                if (crypto_hash_update(desc, &sg, skb_frag_size(f)))
                        return 1;
        }

        skb_walk_frags(skb, frag_iter)
                if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
                        return 1;

        return 0;
}
EXPORT_SYMBOL(tcp_md5_hash_skb_data);

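/* Mix the configured MD5 key itself into the digest; the callers hash it
 * last, after the pseudo-header, TCP header and payload.
 */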
int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
{
        struct scatterlist sg;

        sg_init_one(&sg, key->key, key->keylen);
        return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
}
EXPORT_SYMBOL(tcp_md5_hash_key);

#endif /* CONFIG_TCP_MD5SIG */

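/* tcp_done() - move a socket to TCP_CLOSE and tear down protocol state.
 * Called when the connection is over (RST received, fatal error, etc.):
 * stops the retransmit machinery, detaches any pending Fast Open request,
 * and either wakes the owner or destroys the socket outright if it is
 * already orphaned (SOCK_DEAD).
 */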
void tcp_done(struct sock *sk)
{
        struct request_sock *req = tcp_sk(sk)->fastopen_rsk;

        if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
                TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);

        tcp_set_state(sk, TCP_CLOSE);
        tcp_clear_xmit_timers(sk);
        if (req != NULL)
                reqsk_fastopen_remove(sk, req, false);

        sk->sk_shutdown = SHUTDOWN_MASK;

        if (!sock_flag(sk, SOCK_DEAD))
                sk->sk_state_change(sk);
        else
                inet_csk_destroy_sock(sk);
}
EXPORT_SYMBOL_GPL(tcp_done);

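/* Defined in tcp_cong.c; registered unconditionally in tcp_init() below so
 * that a congestion control algorithm is always available.
 */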
extern struct tcp_congestion_ops tcp_reno;

static __initdata unsigned long thash_entries;
static int __init set_thash_entries(char *str)
{
        ssize_t ret;

        if (!str)
                return 0;

        ret = kstrtoul(str, 0, &thash_entries);
        if (ret)
                return 0;

        return 1;
}
__setup("thash_entries=", set_thash_entries);
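
/* The established-hash size can be forced at boot; e.g. a kernel command
 * line containing "thash_entries=131072" (an illustrative value) overrides
 * the memory-based sizing done in tcp_init() below.
 */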

/* Derive the tcp_mem[] thresholds (in pages) from the memory available for
 * socket buffers: tcp_mem[1] ("pressure") is 1/8 of the free buffer pages,
 * tcp_mem[0] ("low") is 3/4 of that, and tcp_mem[2] ("high") is twice
 * tcp_mem[0].
 */
static void tcp_init_mem(void)
{
        unsigned long limit = nr_free_buffer_pages() / 8;

        limit = max(limit, 128UL);
        sysctl_tcp_mem[0] = limit / 4 * 3;
        sysctl_tcp_mem[1] = limit;
        sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
}

void __init tcp_init(void)
{
        struct sk_buff *skb = NULL;
        unsigned long limit;
        int max_rshare, max_wshare, cnt;
        unsigned int i;

        BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));

        percpu_counter_init(&tcp_sockets_allocated, 0);
        percpu_counter_init(&tcp_orphan_count, 0);
        tcp_hashinfo.bind_bucket_cachep =
                kmem_cache_create("tcp_bind_bucket",
                                  sizeof(struct inet_bind_bucket), 0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

        /* Size and allocate the main established and bind bucket
         * hash tables.
         *
         * The methodology is similar to that of the buffer cache.
         */
        tcp_hashinfo.ehash =
                alloc_large_system_hash("TCP established",
                                        sizeof(struct inet_ehash_bucket),
                                        thash_entries,
                                        17, /* one slot per 128 KB of memory */
                                        0,
                                        NULL,
                                        &tcp_hashinfo.ehash_mask,
                                        0,
                                        thash_entries ? 0 : 512 * 1024);
        for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
                INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);

        if (inet_ehash_locks_alloc(&tcp_hashinfo))
                panic("TCP: failed to alloc ehash_locks");
        tcp_hashinfo.bhash =
                alloc_large_system_hash("TCP bind",
                                        sizeof(struct inet_bind_hashbucket),
                                        tcp_hashinfo.ehash_mask + 1,
                                        17, /* one slot per 128 KB of memory */
                                        0,
                                        &tcp_hashinfo.bhash_size,
                                        NULL,
                                        0,
                                        64 * 1024);
        tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
        for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
                spin_lock_init(&tcp_hashinfo.bhash[i].lock);
                INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
        }

        /* Scale the remaining TCP limits off the established-hash size. */
        cnt = tcp_hashinfo.ehash_mask + 1;

        tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
        sysctl_tcp_max_orphans = cnt / 2;
        sysctl_max_syn_backlog = max(128, cnt / 256);

        tcp_init_mem();

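        /* Set per-socket limits to no more than 1/128 the pressure threshold */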
        limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
        max_wshare = min(4UL*1024*1024, limit);
        max_rshare = min(6UL*1024*1024, limit);

        sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
        sysctl_tcp_wmem[1] = 16*1024;
        sysctl_tcp_wmem[2] = max(64*1024, max_wshare);

        sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
        sysctl_tcp_rmem[1] = 87380;
        sysctl_tcp_rmem[2] = max(87380, max_rshare);

        pr_info("Hash tables configured (established %u bind %u)\n",
                tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);

        tcp_metrics_init();

        tcp_register_congestion_control(&tcp_reno);

        tcp_tasklet_init();
}