/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/scatterlist.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/random.h>
#include <linux/bootmem.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/crypto.h>
#include <linux/time.h>
#include <linux/slab.h>

#include <net/icmp.h>
#include <net/inet_common.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/netdma.h>
#include <net/sock.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <net/busy_poll.h>

int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;

int sysctl_tcp_min_tso_segs __read_mostly = 2;

struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);

int sysctl_tcp_wmem[3] __read_mostly;
int sysctl_tcp_rmem[3] __read_mostly;

EXPORT_SYMBOL(sysctl_tcp_rmem);
EXPORT_SYMBOL(sysctl_tcp_wmem);

atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
EXPORT_SYMBOL(tcp_memory_allocated);

/*
 * Current number of TCP sockets.
 */
struct percpu_counter tcp_sockets_allocated;
EXPORT_SYMBOL(tcp_sockets_allocated);

/*
 * TCP splice context
 */
struct tcp_splice_state {
	struct pipe_inode_info *pipe;
	size_t len;
	unsigned int flags;
};

/*
 * Pressure flag: try to collapse.
 * Technical note: it is used by multiple contexts non atomically.
 * All the __sk_mem_schedule() is of this nature: accounting
 * is strict, actions are advisory and have some latency.
 */
int tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL(tcp_memory_pressure);

void tcp_enter_memory_pressure(struct sock *sk)
{
	if (!tcp_memory_pressure) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
		tcp_memory_pressure = 1;
	}
}
EXPORT_SYMBOL(tcp_enter_memory_pressure);

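/* Convert seconds to retransmits based on initial and max timeout */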
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
{
	u8 res = 0;

	if (seconds > 0) {
		int period = timeout;

		res = 1;
		while (seconds > period && res < 255) {
			res++;
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return res;
}

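/* Convert retransmits to seconds based on initial and max timeout */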
static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
{
	int period = 0;

	if (retrans > 0) {
		period = timeout;
		while (--retrans) {
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return period;
}

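/* Address-family independent initialization for a tcp_sock.
 *
 * NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */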
void tcp_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);
	INIT_LIST_HEAD(&tp->tsq_node);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = TCP_INIT_CWND;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;

	tp->reordering = sysctl_tcp_reordering;
	tcp_enable_early_retrans(tp);
	icsk->icsk_ca_ops = &tcp_init_congestion_ops;

	tp->tsoffset = 0;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_sync_mss = tcp_sync_mss;

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	local_bh_disable();
	sock_update_memcg(sk);
	sk_sockets_allocated_inc(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(tcp_init_sock);

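/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 */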
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	unsigned int mask;
	struct sock *sk = sock->sk;
	const struct tcp_sock *tp = tcp_sk(sk);

	sock_rps_record_flow(sk);

	sock_poll_wait(file, sk_sleep(sk), wait);
	if (sk->sk_state == TCP_LISTEN)
		return inet_csk_listen_poll(sk);

	/* Socket is not locked. We are protected from async events
	 * by poll logic and correct handling of state changes
	 * made by other threads is impossible in any case.
	 */
	mask = 0;

	/*
	 * POLLHUP is certainly not done right. But poll() doesn't
	 * have a notion of HUP in just one direction, and for a
	 * socket the read side is more interesting.
	 *
	 * Some poll() documentation says that POLLHUP is incompatible
	 * with the POLLOUT/POLLWR flags, so somebody should check this
	 * all. But careful, it tends to be safer to return too many
	 * bits than too few, and you can easily break real applications
	 * if you don't tell them that something has hung up!
	 *
	 * NOTE: POLLHUP is an unmaskable event (see UNIX98 and our
	 * fs/select.c); after we received EOF, poll always returns
	 * immediately.  We therefore set POLLHUP if and only if
	 * shutdown has been made in both directions.
	 */
	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
		mask |= POLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLIN | POLLRDNORM | POLLRDHUP;

	/* Connected or passive Fast Open socket? */
	if (sk->sk_state != TCP_SYN_SENT &&
	    (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
		int target = sock_rcvlowat(sk, 0, INT_MAX);

		if (tp->urg_seq == tp->copied_seq &&
		    !sock_flag(sk, SOCK_URGINLINE) &&
		    tp->urg_data)
			target++;

		/* Potential race condition. If read of tp below will
		 * escape above sk->sk_state, we can be illegally awaken
		 * in SYN_* states. */
		if (tp->rcv_nxt - tp->copied_seq >= target)
			mask |= POLLIN | POLLRDNORM;

		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
			if (sk_stream_is_writeable(sk)) {
				mask |= POLLOUT | POLLWRNORM;
			} else {	/* send SIGIO later */
				set_bit(SOCK_ASYNC_NOSPACE,
					&sk->sk_socket->flags);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker. If space is freed after
				 * wspace test but before the flags are set,
				 * IO signal will be lost.
				 */
				if (sk_stream_is_writeable(sk))
					mask |= POLLOUT | POLLWRNORM;
			}
		} else
			mask |= POLLOUT | POLLWRNORM;

		if (tp->urg_data & TCP_URG_VALID)
			mask |= POLLPRI;
	}

	/* This barrier is coupled with smp_wmb() in tcp_reset() */
	smp_rmb();
	if (sk->sk_err)
		mask |= POLLERR;

	return mask;
}
EXPORT_SYMBOL(tcp_poll);

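/* Illustrative userspace sketch (not part of this file): the ioctls
 * handled below are typically used as
 *
 *	int unread, unsent;
 *	ioctl(fd, SIOCINQ,  &unread);	-- bytes readable without blocking
 *	ioctl(fd, SIOCOUTQ, &unsent);	-- bytes written but not yet ACKed
 */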
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int answ;
	bool slow;

	switch (cmd) {
	case SIOCINQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		slow = lock_sock_fast(sk);
		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else if (sock_flag(sk, SOCK_URGINLINE) ||
			 !tp->urg_data ||
			 before(tp->urg_seq, tp->copied_seq) ||
			 !before(tp->urg_seq, tp->rcv_nxt)) {

			answ = tp->rcv_nxt - tp->copied_seq;

			/* Subtract 1, if FIN was received */
			if (answ && sock_flag(sk, SOCK_DONE))
				answ--;
		} else
			answ = tp->urg_seq - tp->copied_seq;
		unlock_sock_fast(sk, slow);
		break;
	case SIOCATMARK:
		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
		break;
	case SIOCOUTQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = tp->write_seq - tp->snd_una;
		break;
	case SIOCOUTQNSD:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = tp->write_seq - tp->snd_nxt;
		break;
	default:
		return -ENOIOCTLCMD;
	}

	return put_user(answ, (int __user *)arg);
}
EXPORT_SYMBOL(tcp_ioctl);

static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
	TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
	tp->pushed_seq = tp->write_seq;
}

static inline bool forced_push(const struct tcp_sock *tp)
{
	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}

static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

	skb->csum = 0;
	tcb->seq = tcb->end_seq = tp->write_seq;
	tcb->tcp_flags = TCPHDR_ACK;
	tcb->sacked = 0;
	skb_header_release(skb);
	tcp_add_write_queue_tail(sk, skb);
	sk->sk_wmem_queued += skb->truesize;
	sk_mem_charge(sk, skb->truesize);
	if (tp->nonagle & TCP_NAGLE_PUSH)
		tp->nonagle &= ~TCP_NAGLE_PUSH;
}

static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
{
	if (flags & MSG_OOB)
		tp->snd_up = tp->write_seq;
}

static inline void tcp_push(struct sock *sk, int flags, int mss_now,
			    int nonagle)
{
	if (tcp_send_head(sk)) {
		struct tcp_sock *tp = tcp_sk(sk);

		if (!(flags & MSG_MORE) || forced_push(tp))
			tcp_mark_push(tp, tcp_write_queue_tail(sk));

		tcp_mark_urg(tp, flags);
		__tcp_push_pending_frames(sk, mss_now,
					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
	}
}

static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
				unsigned int offset, size_t len)
{
	struct tcp_splice_state *tss = rd_desc->arg.data;
	int ret;

	ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
			      tss->flags);
	if (ret > 0)
		rd_desc->count -= ret;
	return ret;
}

static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
{
	/* Store TCP splice context information in read_descriptor_t. */
	read_descriptor_t rd_desc = {
		.arg.data = tss,
		.count = tss->len,
	};

	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
}

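/**
 *  tcp_splice_read - splice data from TCP socket to a pipe
 *  @sock:	socket to splice from
 *  @ppos:	position (not valid)
 *  @pipe:	pipe to splice to
 *  @len:	number of bytes to splice
 *  @flags:	splice modifier flags
 *
 *  Description:
 *    Will read pages from given socket and fill them into a pipe.
 *
 **/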
ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
			struct pipe_inode_info *pipe, size_t len,
			unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct tcp_splice_state tss = {
		.pipe = pipe,
		.len = len,
		.flags = flags,
	};
	long timeo;
	ssize_t spliced;
	int ret;

	sock_rps_record_flow(sk);

	/* We can't seek on a socket input */
	if (unlikely(*ppos))
		return -ESPIPE;

	ret = spliced = 0;

	lock_sock(sk);

	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
	while (tss.len) {
		ret = __tcp_splice_read(sk, &tss);
		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (sock_flag(sk, SOCK_DONE))
				break;
			if (sk->sk_err) {
				ret = sock_error(sk);
				break;
			}
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;
			if (sk->sk_state == TCP_CLOSE) {
				/*
				 * This occurs when user tries to read
				 * from never connected socket.
				 */
				if (!sock_flag(sk, SOCK_DONE))
					ret = -ENOTCONN;
				break;
			}
			if (!timeo) {
				ret = -EAGAIN;
				break;
			}
			sk_wait_data(sk, &timeo);
			if (signal_pending(current)) {
				ret = sock_intr_errno(timeo);
				break;
			}
			continue;
		}
		tss.len -= ret;
		spliced += ret;

		if (!timeo)
			break;
		release_sock(sk);
		lock_sock(sk);

		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current))
			break;
	}

	release_sock(sk);

	if (spliced)
		return spliced;

	return ret;
}
EXPORT_SYMBOL(tcp_splice_read);

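/* Allocate a new fclone-capable skb with room for the protocol headers
 * and @size bytes of payload, charging it against @sk; on allocation
 * failure, enter memory pressure and moderate the send buffer.
 */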
struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
{
	struct sk_buff *skb;

	/* The TCP header must be at least 32-bit aligned.  */
	size = ALIGN(size, 4);

	skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
	if (skb) {
		if (sk_wmem_schedule(sk, skb->truesize)) {
			skb_reserve(skb, sk->sk_prot->max_header);
			/*
			 * Make sure that we have exactly size bytes
			 * available to the caller, no more, no less.
			 */
			skb->reserved_tailroom = skb->end - skb->tail - size;
			return skb;
		}
		__kfree_skb(skb);
	} else {
		sk->sk_prot->enter_memory_pressure(sk);
		sk_stream_moderate_sndbuf(sk);
	}
	return NULL;
}

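/* Compute how much data one skb should carry: plain MSS for non-GSO
 * sockets, otherwise a TSO-sized goal bounded by the pacing rate, the
 * device gso limits and half the current window.
 */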
static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
				       int large_allowed)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 xmit_size_goal, old_size_goal;

	xmit_size_goal = mss_now;

	if (large_allowed && sk_can_gso(sk)) {
		u32 gso_size, hlen;

		/* Maybe we should/could use sk->sk_prot->max_header here ? */
		hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
		       inet_csk(sk)->icsk_ext_hdr_len +
		       tp->tcp_header_len;

		/* Goal is to send at least one packet per ms,
		 * not one big TSO packet every 100 ms.
		 * This preserves ACK clocking and is consistent
		 * with tcp_tso_should_defer() heuristic.
		 */
		gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
		gso_size = max_t(u32, gso_size,
				 sysctl_tcp_min_tso_segs * mss_now);

		xmit_size_goal = min_t(u32, gso_size,
				       sk->sk_gso_max_size - 1 - hlen);

		/* TSQ : try to have at least two segments in flight
		 * (one in NIC TX ring, another in Socket write queue)
		 */
		xmit_size_goal = min_t(u32, xmit_size_goal,
				       sysctl_tcp_limit_output_bytes >> 1);

		xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);

		/* We try hard to avoid divides here */
		old_size_goal = tp->xmit_size_goal_segs * mss_now;

		if (likely(old_size_goal <= xmit_size_goal &&
			   old_size_goal + mss_now > xmit_size_goal)) {
			xmit_size_goal = old_size_goal;
		} else {
			tp->xmit_size_goal_segs =
				min_t(u16, xmit_size_goal / mss_now,
				      sk->sk_gso_max_segs);
			xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
		}
	}

	return max(xmit_size_goal, mss_now);
}

static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
{
	int mss_now;

	mss_now = tcp_current_mss(sk);
	*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));

	return mss_now;
}

static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
				size_t size, int flags)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int mss_now, size_goal;
	int err;
	ssize_t copied;
	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	/* Wait for a connection to finish. One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before a connection
	 * is fully established.
	 */
	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
			goto out_err;
	}

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	mss_now = tcp_send_mss(sk, &size_goal, flags);
	copied = 0;

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto out_err;

	while (size > 0) {
		struct sk_buff *skb = tcp_write_queue_tail(sk);
		int copy, i;
		bool can_coalesce;

		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
new_segment:
			if (!sk_stream_memory_free(sk))
				goto wait_for_sndbuf;

			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
			if (!skb)
				goto wait_for_memory;

			skb_entail(sk, skb);
			copy = size_goal;
		}

		if (copy > size)
			copy = size;

		i = skb_shinfo(skb)->nr_frags;
		can_coalesce = skb_can_coalesce(skb, i, page, offset);
		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
			tcp_mark_push(tp, skb);
			goto new_segment;
		}
		if (!sk_wmem_schedule(sk, copy))
			goto wait_for_memory;

		if (can_coalesce) {
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
		} else {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, copy);
		}
		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;

		skb->len += copy;
		skb->data_len += copy;
		skb->truesize += copy;
		sk->sk_wmem_queued += copy;
		sk_mem_charge(sk, copy);
		skb->ip_summed = CHECKSUM_PARTIAL;
		tp->write_seq += copy;
		TCP_SKB_CB(skb)->end_seq += copy;
		skb_shinfo(skb)->gso_segs = 0;

		if (!copied)
			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

		copied += copy;
		offset += copy;
		if (!(size -= copy))
			goto out;

		if (skb->len < size_goal || (flags & MSG_OOB))
			continue;

		if (forced_push(tp)) {
			tcp_mark_push(tp, skb);
			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
		} else if (skb == tcp_send_head(sk))
			tcp_push_one(sk, mss_now);
		continue;

wait_for_sndbuf:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
		tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);

		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
			goto do_error;

		mss_now = tcp_send_mss(sk, &size_goal, flags);
	}

out:
	if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
		tcp_push(sk, flags, mss_now, tp->nonagle);
	return copied;

do_error:
	if (copied)
		goto out;
out_err:
	return sk_stream_error(sk, flags, err);
}

int tcp_sendpage(struct sock *sk, struct page *page, int offset,
		 size_t size, int flags)
{
	ssize_t res;

	if (!(sk->sk_route_caps & NETIF_F_SG) ||
	    !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
		return sock_no_sendpage(sk->sk_socket, page, offset, size,
					flags);

	lock_sock(sk);
	res = do_tcp_sendpages(sk, page, offset, size, flags);
	release_sock(sk);
	return res;
}
EXPORT_SYMBOL(tcp_sendpage);

static inline int select_size(const struct sock *sk, bool sg)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	int tmp = tp->mss_cache;

	if (sg) {
		if (sk_can_gso(sk)) {
			/* Small frames wont use a full page:
			 * Payload will immediately follow tcp header.
			 */
			tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
		} else {
			int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);

			if (tmp >= pgbreak &&
			    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
				tmp = pgbreak;
		}
	}

	return tmp;
}

void tcp_free_fastopen_req(struct tcp_sock *tp)
{
	if (tp->fastopen_req != NULL) {
		kfree(tp->fastopen_req);
		tp->fastopen_req = NULL;
	}
}

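/* Initiate a TCP Fast Open connect from sendmsg(MSG_FASTOPEN): record
 * the request so tcp_connect() can carry data in the SYN; on return,
 * *size holds how many bytes of the message were copied for the SYN.
 */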
static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int err, flags;

	if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
		return -EOPNOTSUPP;
	if (tp->fastopen_req != NULL)
		return -EALREADY; /* Another Fast Open is in progress */

	tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
				   sk->sk_allocation);
	if (unlikely(tp->fastopen_req == NULL))
		return -ENOBUFS;
	tp->fastopen_req->data = msg;

	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
	err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
				    msg->msg_namelen, flags);
	*size = tp->fastopen_req->copied;
	tcp_free_fastopen_req(tp);
	return err;
}

int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		size_t size)
{
	struct iovec *iov;
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int iovlen, flags, err, copied = 0;
	int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
	bool sg;
	long timeo;

	lock_sock(sk);

	flags = msg->msg_flags;
	if (flags & MSG_FASTOPEN) {
		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn);
		if (err == -EINPROGRESS && copied_syn > 0)
			goto out;
		else if (err)
			goto out_err;
		offset = copied_syn;
	}

	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	/* Wait for a connection to finish. One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before a connection
	 * is fully established.
	 */
	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
			goto do_error;
	}

	if (unlikely(tp->repair)) {
		if (tp->repair_queue == TCP_RECV_QUEUE) {
			copied = tcp_send_rcvq(sk, msg, size);
			goto out;
		}

		err = -EINVAL;
		if (tp->repair_queue == TCP_NO_QUEUE)
			goto out_err;

		/* 'common' sending to sendq */
	}

	/* This should be in poll */
	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	mss_now = tcp_send_mss(sk, &size_goal, flags);

	/* Ok commence sending. */
	iovlen = msg->msg_iovlen;
	iov = msg->msg_iov;
	copied = 0;

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto out_err;

	sg = !!(sk->sk_route_caps & NETIF_F_SG);

	while (--iovlen >= 0) {
		size_t seglen = iov->iov_len;
		unsigned char __user *from = iov->iov_base;

		iov++;
		if (unlikely(offset > 0)) {  /* Skip bytes copied in SYN */
			if (offset >= seglen) {
				offset -= seglen;
				continue;
			}
			seglen -= offset;
			from += offset;
			offset = 0;
		}

		while (seglen > 0) {
			int copy = 0;
			int max = size_goal;

			skb = tcp_write_queue_tail(sk);
			if (tcp_send_head(sk)) {
				if (skb->ip_summed == CHECKSUM_NONE)
					max = mss_now;
				copy = max - skb->len;
			}

			if (copy <= 0) {
new_segment:
				/* Allocate new segment. If the interface is SG,
				 * allocate skb fitting to single page.
				 */
				if (!sk_stream_memory_free(sk))
					goto wait_for_sndbuf;

				skb = sk_stream_alloc_skb(sk,
							  select_size(sk, sg),
							  sk->sk_allocation);
				if (!skb)
					goto wait_for_memory;

				/*
				 * All packets are restored as if they have
				 * already been sent.
				 */
				if (tp->repair)
					TCP_SKB_CB(skb)->when = tcp_time_stamp;

				/*
				 * Check whether we can use HW checksum.
				 */
				if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
					skb->ip_summed = CHECKSUM_PARTIAL;

				skb_entail(sk, skb);
				copy = size_goal;
				max = size_goal;
			}

			/* Try to append data to the end of skb. */
			if (copy > seglen)
				copy = seglen;

			/* Where to copy to? */
			if (skb_availroom(skb) > 0) {
				/* We have some space in skb head. Superb! */
				copy = min_t(int, copy, skb_availroom(skb));
				err = skb_add_data_nocache(sk, skb, from, copy);
				if (err)
					goto do_fault;
			} else {
				bool merge = true;
				int i = skb_shinfo(skb)->nr_frags;
				struct page_frag *pfrag = sk_page_frag(sk);

				if (!sk_page_frag_refill(sk, pfrag))
					goto wait_for_memory;

				if (!skb_can_coalesce(skb, i, pfrag->page,
						      pfrag->offset)) {
					if (i == MAX_SKB_FRAGS || !sg) {
						tcp_mark_push(tp, skb);
						goto new_segment;
					}
					merge = false;
				}

				copy = min_t(int, copy, pfrag->size - pfrag->offset);

				if (!sk_wmem_schedule(sk, copy))
					goto wait_for_memory;

				err = skb_copy_to_page_nocache(sk, from, skb,
							       pfrag->page,
							       pfrag->offset,
							       copy);
				if (err)
					goto do_error;

				/* Update the skb. */
				if (merge) {
					skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
				} else {
					skb_fill_page_desc(skb, i, pfrag->page,
							   pfrag->offset, copy);
					get_page(pfrag->page);
				}
				pfrag->offset += copy;
			}

			if (!copied)
				TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

			tp->write_seq += copy;
			TCP_SKB_CB(skb)->end_seq += copy;
			skb_shinfo(skb)->gso_segs = 0;

			from += copy;
			copied += copy;
			if ((seglen -= copy) == 0 && iovlen == 0)
				goto out;

			if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
				continue;

			if (forced_push(tp)) {
				tcp_mark_push(tp, skb);
				__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
			} else if (skb == tcp_send_head(sk))
				tcp_push_one(sk, mss_now);
			continue;

wait_for_sndbuf:
			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
			if (copied)
				tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);

			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				goto do_error;

			mss_now = tcp_send_mss(sk, &size_goal, flags);
		}
	}

out:
	if (copied)
		tcp_push(sk, flags, mss_now, tp->nonagle);
	release_sock(sk);
	return copied + copied_syn;

do_fault:
	if (!skb->len) {
		tcp_unlink_write_queue(skb, sk);
		/* It is the one place in all of TCP, except connection
		 * reset, where we can be unlinking the send_head.
		 */
		tcp_check_send_head(sk, skb);
		sk_wmem_free_skb(sk, skb);
	}

do_error:
	if (copied + copied_syn)
		goto out;
out_err:
	err = sk_stream_error(sk, flags, err);
	release_sock(sk);
	return err;
}
EXPORT_SYMBOL(tcp_sendmsg);

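/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 */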
static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* No URG data to read. */
	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
	    tp->urg_data == TCP_URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
		return -ENOTCONN;

	if (tp->urg_data & TCP_URG_VALID) {
		int err = 0;
		char c = tp->urg_data;

		if (!(flags & MSG_PEEK))
			tp->urg_data = TCP_URG_READ;

		/* Read urgent data. */
		msg->msg_flags |= MSG_OOB;

		if (len > 0) {
			if (!(flags & MSG_TRUNC))
				err = memcpy_toiovec(msg->msg_iov, &c, 1);
			len = 1;
		} else
			msg->msg_flags |= MSG_TRUNC;

		return err ? -EFAULT : len;
	}

	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
		return 0;

	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 */
	return -EAGAIN;
}

static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
{
	struct sk_buff *skb;
	int copied = 0, err = 0;

	/* XXX -- need to support SO_PEEK_OFF */

	skb_queue_walk(&sk->sk_write_queue, skb) {
		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len);
		if (err)
			break;

		copied += skb->len;
	}

	return err ?: copied;
}

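/* Clean up the receive buffer for full frames taken by the user,
 * then send an ACK if necessary.  COPIED is the number of bytes
 * tcp_recvmsg has given to the user so far, it speeds up the
 * calculation of whether or not we must ACK for the sake of
 * a window update.
 */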
void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
	struct tcp_sock *tp = tcp_sk(sk);
	bool time_to_ack = false;

	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);

	if (inet_csk_ack_scheduled(sk)) {
		const struct inet_connection_sock *icsk = inet_csk(sk);
		   /* Delayed ACKs frequently hit locked sockets during bulk
		    * receive. */
		if (icsk->icsk_ack.blocked ||
		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
		    /*
		     * If this read emptied read buffer, we send ACK, if
		     * connection is not bidirectional, user drained
		     * receive buffer and there was a small segment
		     * in queue.
		     */
		    (copied > 0 &&
		     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
		      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
		       !icsk->icsk_ack.pingpong)) &&
		      !atomic_read(&sk->sk_rmem_alloc)))
			time_to_ack = true;
	}

	/* Even if window raised up to infinity, do not send window open ACK
	 * in states, where we will not receive more. It is useless.
	 */
	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
		__u32 rcv_window_now = tcp_receive_window(tp);

		/* Optimize, __tcp_select_window() is not cheap. */
		if (2*rcv_window_now <= tp->window_clamp) {
			__u32 new_window = __tcp_select_window(sk);

			/* Send ACK now, if this read freed lots of space
			 * in our buffer. Certainly, new_window is new window.
			 * We can advertise it now, if it is not less than
			 * current one.  "Lots" means "at least twice" here.
			 */
			if (new_window && new_window >= 2 * rcv_window_now)
				time_to_ack = true;
		}
	}
	if (time_to_ack)
		tcp_send_ack(sk);
}

static void tcp_prequeue_process(struct sock *sk)
{
	struct sk_buff *skb;
	struct tcp_sock *tp = tcp_sk(sk);

	NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED);

	/* RX process wants to run with disabled BHs, though it is not
	 * necessary */
	local_bh_disable();
	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
		sk_backlog_rcv(sk, skb);
	local_bh_enable();

	/* Clear memory counter. */
	tp->ucopy.memory = 0;
}

#ifdef CONFIG_NET_DMA
static void tcp_service_net_dma(struct sock *sk, bool wait)
{
	dma_cookie_t done, used;
	dma_cookie_t last_issued;
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tp->ucopy.dma_chan)
		return;

	last_issued = tp->ucopy.dma_cookie;
	dma_async_issue_pending(tp->ucopy.dma_chan);

	do {
		if (dma_async_is_tx_complete(tp->ucopy.dma_chan,
					     last_issued, &done,
					     &used) == DMA_SUCCESS) {
			/* Safe to free early-copied skbs now */
			__skb_queue_purge(&sk->sk_async_wait_queue);
			break;
		} else {
			struct sk_buff *skb;
			while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
			       (dma_async_is_complete(skb->dma_cookie, done,
						      used) == DMA_SUCCESS)) {
				/* Safe to free early-copied skbs now */
				__skb_dequeue(&sk->sk_async_wait_queue);
				kfree_skb(skb);
			}
		}
	} while (wait);
}
#endif

static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
	struct sk_buff *skb;
	u32 offset;

	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
		offset = seq - TCP_SKB_CB(skb)->seq;
		if (tcp_hdr(skb)->syn)
			offset--;
		if (offset < skb->len || tcp_hdr(skb)->fin) {
			*off = offset;
			return skb;
		}
		/* This looks weird, but this can happen if TCP collapsing
		 * splitted a fat GRO packet, while we released socket lock
		 * in skb_splice_bits()
		 */
		sk_eat_skb(sk, skb, false);
	}
	return NULL;
}

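/*
 * This routine provides an alternative to tcp_recvmsg() for routines
 * that would like to handle copying from skbuffs directly in 'sendfile'
 * fashion.
 * Note:
 *	- It is assumed that the socket was locked by the caller.
 *	- The routine does not block.
 *	- At present, there is no support for reading OOB data
 *	  or for 'peeking' the socket using this routine
 *	  (although both would be easy to implement).
 */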
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
		  sk_read_actor_t recv_actor)
{
	struct sk_buff *skb;
	struct tcp_sock *tp = tcp_sk(sk);
	u32 seq = tp->copied_seq;
	u32 offset;
	int copied = 0;

	if (sk->sk_state == TCP_LISTEN)
		return -ENOTCONN;
	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
		if (offset < skb->len) {
			int used;
			size_t len;

			len = skb->len - offset;
			/* Stop reading if we hit a patch of urgent data */
			if (tp->urg_data) {
				u32 urg_offset = tp->urg_seq - seq;
				if (urg_offset < len)
					len = urg_offset;
				if (!len)
					break;
			}
			used = recv_actor(desc, skb, offset, len);
			if (used <= 0) {
				if (!copied)
					copied = used;
				break;
			} else if (used <= len) {
				seq += used;
				copied += used;
				offset += used;
			}
			/* If recv_actor drops the lock (e.g. TCP splice
			 * receive) the skb pointer might be invalid when
			 * getting here: tcp_collapse might have deleted it
			 * while aggregating skbs from the socket queue.
			 */
			skb = tcp_recv_skb(sk, seq - 1, &offset);
			if (!skb)
				break;
			/* TCP coalescing might have appended data to the skb.
			 * Try to splice more frags
			 */
			if (offset + 1 != skb->len)
				continue;
		}
		if (tcp_hdr(skb)->fin) {
			sk_eat_skb(sk, skb, false);
			++seq;
			break;
		}
		sk_eat_skb(sk, skb, false);
		if (!desc->count)
			break;
		tp->copied_seq = seq;
	}
	tp->copied_seq = seq;

	tcp_rcv_space_adjust(sk);

	/* Clean up data we have read: This will do ACK frames. */
	if (copied > 0) {
		tcp_recv_skb(sk, seq, &offset);
		tcp_cleanup_rbuf(sk, copied);
	}
	return copied;
}
EXPORT_SYMBOL(tcp_read_sock);

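/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	Technical note: in 2.3 we work on _locked_ socket, so that
 *	tricks with *seq access order and skb->users are not required.
 *	Probably, code can be easily improved even more.
 */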
int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		size_t len, int nonblock, int flags, int *addr_len)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int copied = 0;
	u32 peek_seq;
	u32 *seq;
	unsigned long used;
	int err;
	int target;		/* Read at least this many bytes */
	long timeo;
	struct task_struct *user_recv = NULL;
	bool copied_early = false;
	struct sk_buff *skb;
	u32 urg_hole = 0;

	if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
	    (sk->sk_state == TCP_ESTABLISHED))
		sk_busy_loop(sk, nonblock);

	lock_sock(sk);

	err = -ENOTCONN;
	if (sk->sk_state == TCP_LISTEN)
		goto out;

	timeo = sock_rcvtimeo(sk, nonblock);

	/* Urgent data needs to be handled specially. */
	if (flags & MSG_OOB)
		goto recv_urg;

	if (unlikely(tp->repair)) {
		err = -EPERM;
		if (!(flags & MSG_PEEK))
			goto out;

		if (tp->repair_queue == TCP_SEND_QUEUE)
			goto recv_sndq;

		err = -EINVAL;
		if (tp->repair_queue == TCP_NO_QUEUE)
			goto out;

		/* 'common' recv queue MSG_PEEK-ing */
	}

	seq = &tp->copied_seq;
	if (flags & MSG_PEEK) {
		peek_seq = tp->copied_seq;
		seq = &peek_seq;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);

#ifdef CONFIG_NET_DMA
	tp->ucopy.dma_chan = NULL;
	preempt_disable();
	skb = skb_peek_tail(&sk->sk_receive_queue);
	{
		int available = 0;

		if (skb)
			available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
		if ((available < target) &&
		    (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
		    !sysctl_tcp_low_latency &&
		    net_dma_find_channel()) {
			preempt_enable_no_resched();
			tp->ucopy.pinned_list =
					dma_pin_iovec_pages(msg->msg_iov, len);
		} else {
			preempt_enable_no_resched();
		}
	}
#endif

	do {
		u32 offset;

		/* Are we at urgent data? Stop if we have read anything or
		 * have SIGURG pending.
		 */
		if (tp->urg_data && tp->urg_seq == *seq) {
			if (copied)
				break;
			if (signal_pending(current)) {
				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
				break;
			}
		}

		/* Next get a buffer. */

		skb_queue_walk(&sk->sk_receive_queue, skb) {
			/* Now that we have two receive queues this
			 * shouldn't happen.
			 */
			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
				 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
				 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
				 flags))
				break;

			offset = *seq - TCP_SKB_CB(skb)->seq;
			if (tcp_hdr(skb)->syn)
				offset--;
			if (offset < skb->len)
				goto found_ok_skb;
			if (tcp_hdr(skb)->fin)
				goto found_fin_ok;
			WARN(!(flags & MSG_PEEK),
			     "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
		}

		/* Well, if we have backlog, try to process it now yet. */

		if (copied >= target && !sk->sk_backlog.tail)
			break;

		if (copied) {
			if (sk->sk_err ||
			    sk->sk_state == TCP_CLOSE ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    !timeo ||
			    signal_pending(current))
				break;
		} else {
			if (sock_flag(sk, SOCK_DONE))
				break;

			if (sk->sk_err) {
				copied = sock_error(sk);
				break;
			}

			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;

			if (sk->sk_state == TCP_CLOSE) {
				if (!sock_flag(sk, SOCK_DONE)) {
					/* This occurs when user tries to read
					 * from never connected socket.
					 */
					copied = -ENOTCONN;
					break;
				}
				break;
			}

			if (!timeo) {
				copied = -EAGAIN;
				break;
			}

			if (signal_pending(current)) {
				copied = sock_intr_errno(timeo);
				break;
			}
		}

		tcp_cleanup_rbuf(sk, copied);

		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
			/* Install new reader */
			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
				user_recv = current;
				tp->ucopy.task = user_recv;
				tp->ucopy.iov = msg->msg_iov;
			}

			tp->ucopy.len = len;

			WARN_ON(tp->copied_seq != tp->rcv_nxt &&
				!(flags & (MSG_PEEK | MSG_TRUNC)));

			/* Ugly... If prequeue is not empty, we have to
			 * process it before releasing socket, otherwise
			 * order will be broken at second iteration.
			 * More elegant solution is required!!!
			 *
			 * Look: we have the following (pseudo)queues:
			 *
			 * 1. packets in flight
			 * 2. backlog
			 * 3. prequeue
			 * 4. receive_queue
			 *
			 * Each queue can be processed only if the next ones
			 * are empty. At this point we have empty receive_queue.
			 * But prequeue _can_ be not empty after 2nd iteration,
			 * when we jumped to start of loop because backlog
			 * processing added something to receive_queue.
			 * We cannot release_sock(), because backlog contains
			 * packets arrived _after_ prequeued ones.
			 *
			 * Shortly, algorithm is clear --- to process all
			 * the queues in order. We could make it more directly,
			 * requeueing packets from backlog to prequeue, if
			 * is not empty. It is more elegant, but eats cycles,
			 * unfortunately.
			 */
			if (!skb_queue_empty(&tp->ucopy.prequeue))
				goto do_prequeue;

			/* __ Set realtime policy in scheduler __ */
		}

#ifdef CONFIG_NET_DMA
		if (tp->ucopy.dma_chan) {
			if (tp->rcv_wnd == 0 &&
			    !skb_queue_empty(&sk->sk_async_wait_queue)) {
				tcp_service_net_dma(sk, true);
				tcp_cleanup_rbuf(sk, copied);
			} else
				dma_async_issue_pending(tp->ucopy.dma_chan);
		}
#endif
		if (copied >= target) {
			/* Do not sleep, just process backlog. */
			release_sock(sk);
			lock_sock(sk);
		} else
			sk_wait_data(sk, &timeo);

#ifdef CONFIG_NET_DMA
		tcp_service_net_dma(sk, false);  /* Don't block */
		tp->ucopy.wakeup = 0;
#endif

		if (user_recv) {
			int chunk;

			/* __ Restore normal policy in scheduler __ */

			if ((chunk = len - tp->ucopy.len) != 0) {
				NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
				len -= chunk;
				copied += chunk;
			}

			if (tp->rcv_nxt == tp->copied_seq &&
			    !skb_queue_empty(&tp->ucopy.prequeue)) {
do_prequeue:
				tcp_prequeue_process(sk);

				if ((chunk = len - tp->ucopy.len) != 0) {
					NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
					len -= chunk;
					copied += chunk;
				}
			}
		}
		if ((flags & MSG_PEEK) &&
		    (peek_seq - copied - urg_hole != tp->copied_seq)) {
			net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
					    current->comm,
					    task_pid_nr(current));
			peek_seq = tp->copied_seq;
		}
		continue;

	found_ok_skb:
		/* Ok so how much can we use? */
		used = skb->len - offset;
		if (len < used)
			used = len;

		/* Do we have urgent data here? */
		if (tp->urg_data) {
			u32 urg_offset = tp->urg_seq - *seq;
			if (urg_offset < used) {
				if (!urg_offset) {
					if (!sock_flag(sk, SOCK_URGINLINE)) {
						++*seq;
						urg_hole++;
						offset++;
						used--;
						if (!used)
							goto skip_copy;
					}
				} else
					used = urg_offset;
			}
		}

		if (!(flags & MSG_TRUNC)) {
#ifdef CONFIG_NET_DMA
			if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
				tp->ucopy.dma_chan = net_dma_find_channel();

			if (tp->ucopy.dma_chan) {
				tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
					tp->ucopy.dma_chan, skb, offset,
					msg->msg_iov, used,
					tp->ucopy.pinned_list);

				if (tp->ucopy.dma_cookie < 0) {

					pr_alert("%s: dma_cookie < 0\n",
						 __func__);

					/* Exception. Bailout! */
					if (!copied)
						copied = -EFAULT;
					break;
				}

				dma_async_issue_pending(tp->ucopy.dma_chan);

				if ((offset + used) == skb->len)
					copied_early = true;

			} else
#endif
			{
				err = skb_copy_datagram_iovec(skb, offset,
						msg->msg_iov, used);
				if (err) {
					/* Exception. Bailout! */
					if (!copied)
						copied = -EFAULT;
					break;
				}
			}
		}

		*seq += used;
		copied += used;
		len -= used;

		tcp_rcv_space_adjust(sk);

skip_copy:
		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
			tp->urg_data = 0;
			tcp_fast_path_check(sk);
		}
		if (used + offset < skb->len)
			continue;

		if (tcp_hdr(skb)->fin)
			goto found_fin_ok;
		if (!(flags & MSG_PEEK)) {
			sk_eat_skb(sk, skb, copied_early);
			copied_early = false;
		}
		continue;

	found_fin_ok:
		/* Process the FIN. */
		++*seq;
		if (!(flags & MSG_PEEK)) {
			sk_eat_skb(sk, skb, copied_early);
			copied_early = false;
		}
		break;
	} while (len > 0);

	if (user_recv) {
		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
			int chunk;

			tp->ucopy.len = copied > 0 ? len : 0;

			tcp_prequeue_process(sk);

			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
				NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
				len -= chunk;
				copied += chunk;
			}
		}

		tp->ucopy.task = NULL;
		tp->ucopy.len = 0;
	}

#ifdef CONFIG_NET_DMA
	tcp_service_net_dma(sk, true);	/* Wait for queue to drain */
	tp->ucopy.dma_chan = NULL;

	if (tp->ucopy.pinned_list) {
		dma_unpin_iovec_pages(tp->ucopy.pinned_list);
		tp->ucopy.pinned_list = NULL;
	}
#endif

	/* According to UNIX98, msg_name/msg_namelen are ignored
	 * on connected socket. I was just happy when found this 8) --ANK
	 */

	/* Clean up data we have read: This will do ACK frames. */
	tcp_cleanup_rbuf(sk, copied);

	release_sock(sk);
	return copied;

out:
	release_sock(sk);
	return err;

recv_urg:
	err = tcp_recv_urg(sk, msg, len, flags);
	goto out;

recv_sndq:
	err = tcp_peek_sndq(sk, msg, len);
	goto out;
}
EXPORT_SYMBOL(tcp_recvmsg);

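/* Move the socket to a new state, keeping the ESTABLISHED MIB counters
 * and the unhash/bind-port bookkeeping consistent across transitions.
 */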
void tcp_set_state(struct sock *sk, int state)
{
	int oldstate = sk->sk_state;

	switch (state) {
	case TCP_ESTABLISHED:
		if (oldstate != TCP_ESTABLISHED)
			TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
		break;

	case TCP_CLOSE:
		if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
			TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);

		sk->sk_prot->unhash(sk);
		if (inet_csk(sk)->icsk_bind_hash &&
		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
			inet_put_port(sk);
		/* fall through */
	default:
		if (oldstate == TCP_ESTABLISHED)
			TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
	}

	/* Change state AFTER socket is unhashed to avoid closed
	 * socket sitting in hash tables.
	 */
	sk->sk_state = state;

#ifdef STATE_TRACE
	SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
#endif
}
EXPORT_SYMBOL_GPL(tcp_set_state);

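/*
 *	State processing on a close. This implements the state shift for
 *	sending our FIN frame. Note that we only send a FIN for some
 *	states. A shutdown() may have already sent the FIN, or we may be
 *	closed.
 */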
static const unsigned char new_state[16] = {
  /* current state:        new state:      action:	*/
  /* (Invalid)		*/ TCP_CLOSE,
  /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  /* TCP_SYN_SENT	*/ TCP_CLOSE,
  /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
  /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
  /* TCP_TIME_WAIT	*/ TCP_CLOSE,
  /* TCP_CLOSE		*/ TCP_CLOSE,
  /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK | TCP_ACTION_FIN,
  /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
  /* TCP_LISTEN		*/ TCP_CLOSE,
  /* TCP_CLOSING	*/ TCP_CLOSING,
};

static int tcp_close_state(struct sock *sk)
{
	int next = (int)new_state[sk->sk_state];
	int ns = next & TCP_STATE_MASK;

	tcp_set_state(sk, ns);

	return next & TCP_ACTION_FIN;
}

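/*
 *	Shutdown the sending side of a connection. Much like close except
 *	that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
 */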
void tcp_shutdown(struct sock *sk, int how)
{
	/*	We need to grab some memory, and put together a FIN,
	 *	and then put it into the queue to be sent.
	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
	 */
	if (!(how & SEND_SHUTDOWN))
		return;

	/* If we've already sent a FIN, or it's a closed state, skip this. */
	if ((1 << sk->sk_state) &
	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
		/* Clear out any half completed packets.  FIN if needed. */
		if (tcp_close_state(sk))
			tcp_send_fin(sk);
	}
}
EXPORT_SYMBOL(tcp_shutdown);

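/* Decide whether an orphaned socket should be aborted: either there are
 * too many orphans or TCP is out of socket memory.
 */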
bool tcp_check_oom(struct sock *sk, int shift)
{
	bool too_many_orphans, out_of_socket_memory;

	too_many_orphans = tcp_too_many_orphans(sk, shift);
	out_of_socket_memory = tcp_out_of_memory(sk);

	if (too_many_orphans)
		net_info_ratelimited("too many orphaned sockets\n");
	if (out_of_socket_memory)
		net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
	return too_many_orphans || out_of_socket_memory;
}

void tcp_close(struct sock *sk, long timeout)
{
	struct sk_buff *skb;
	int data_was_unread = 0;
	int state;

	lock_sock(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;

	if (sk->sk_state == TCP_LISTEN) {
		tcp_set_state(sk, TCP_CLOSE);

		/* Special case. */
		inet_csk_listen_stop(sk);

		goto adjudge_to_death;
	}

	/*  We need to flush the recv. buffs.  We do this only on the
	 *  descriptor close, not protocol-sourced closes, because the
	 *  reader process may not have drained the data yet!
	 */
	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
			  tcp_hdr(skb)->fin;
		data_was_unread += len;
		__kfree_skb(skb);
	}

	sk_mem_reclaim(sk);

	/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
	if (sk->sk_state == TCP_CLOSE)
		goto adjudge_to_death;

	/* As outlined in RFC 2525, section 2.17, we send a RST here because
	 * data was lost. To witness the awful effects of the old behavior of
	 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
	 * GET in an FTP client, suspend the process, wait for the client to
	 * advertise a zero window, then kill -9 the FTP client, wheee...
	 * Note: timeout is always zero in such a case.
	 */
	if (unlikely(tcp_sk(sk)->repair)) {
		sk->sk_prot->disconnect(sk, 0);
	} else if (data_was_unread) {
		/* Unread data was tossed, zap the connection. */
		NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
		tcp_set_state(sk, TCP_CLOSE);
		tcp_send_active_reset(sk, sk->sk_allocation);
	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
		/* Check zero linger _after_ checking for unread data. */
		sk->sk_prot->disconnect(sk, 0);
		NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
	} else if (tcp_close_state(sk)) {
		/* We FIN if the application ate all the data before
		 * zapping the connection.
		 */

		/* RED-PEN. Formally speaking, we have broken TCP state
		 * machine. State transitions:
		 *
		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
		 * TCP_SYN_SENT -> TCP_FIN_WAIT1 (forget it, it's impossible)
		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
		 *
		 * are legal only when FIN has been sent (i.e. in window),
		 * rather than queued out of window. Purists blame.
		 *
		 * F.e. "RFC state" is ESTABLISHED,
		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
		 *
		 * The visible declinations are that sometimes
		 * we enter time-wait state, when it is not required really
		 * (harmless), do not send active resets, when they are
		 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
		 * they look as CLOSING or LAST_ACK for Linux)
		 * Probably, I missed some more holelets.
		 * 						--ANK
		 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
		 * in a single packet! (May consider it later but will
		 * probably need API support or TCP_CORK support again)
		 */
		tcp_send_fin(sk);
	}

	sk_stream_wait_close(sk, timeout);

adjudge_to_death:
	state = sk->sk_state;
	sock_hold(sk);
	sock_orphan(sk);

	/* It is the last release_sock in its life. It will remove backlog. */
	release_sock(sk);

	/* Now socket is owned by kernel and we acquire BH lock
	   to finish close. No need to check for user refs.
	 */
	local_bh_disable();
	bh_lock_sock(sk);
	WARN_ON(sock_owned_by_user(sk));

	percpu_counter_inc(sk->sk_prot->orphan_count);

	/* Have we already been destroyed by a softirq or backlog? */
	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
		goto out;

	/*	This is a (useful) BSD violating of the RFC. There is a
	 *	problem with TCP as specified in that the other end could
	 *	keep a socket open forever with no application left this end.
	 *	We use a 1 minute timeout (about the same as BSD) then kill
	 *	our end. If they send after that then tough - BUT: long enough
	 *	that we won't make the old 4*rto = almost no time - whoops
	 *	reset mistake.
	 *
	 *	Nope, it was not mistake. It is really desired behaviour
	 *	f.e. on http servers, when such sockets are useless, but
	 *	consume significant resources. Let's do it with special
	 *	linger2	option.					--ANK
	 */
	if (sk->sk_state == TCP_FIN_WAIT2) {
		struct tcp_sock *tp = tcp_sk(sk);
		if (tp->linger2 < 0) {
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC);
			NET_INC_STATS_BH(sock_net(sk),
					LINUX_MIB_TCPABORTONLINGER);
		} else {
			const int tmo = tcp_fin_time(sk);

			if (tmo > TCP_TIMEWAIT_LEN) {
				inet_csk_reset_keepalive_timer(sk,
						tmo - TCP_TIMEWAIT_LEN);
			} else {
				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				goto out;
			}
		}
	}
	if (sk->sk_state != TCP_CLOSE) {
		sk_mem_reclaim(sk);
		if (tcp_check_oom(sk, 0)) {
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC);
			NET_INC_STATS_BH(sock_net(sk),
					LINUX_MIB_TCPABORTONMEMORY);
		}
	}

	if (sk->sk_state == TCP_CLOSE) {
		struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
		/* We could get here with a non-NULL req if the socket is
		 * created from a LISTEN socket with a pending TFO request
		 * (TCP Fast Open).
		 */
		if (req != NULL)
			reqsk_fastopen_remove(sk, req, false);
		inet_csk_destroy_sock(sk);
	}
	/* Otherwise, socket is reprieved until protocol close. */

out:
	bh_unlock_sock(sk);
	local_bh_enable();
	sock_put(sk);
}
EXPORT_SYMBOL(tcp_close);

/* These states need RST on ABORT according to RFC793 */

static inline bool tcp_need_reset(int state)
{
	return (1 << state) &
	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
}

int tcp_disconnect(struct sock *sk, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;
	int old_state = sk->sk_state;

	if (old_state != TCP_CLOSE)
		tcp_set_state(sk, TCP_CLOSE);

	/* ABORT function of RFC793 */
	if (old_state == TCP_LISTEN) {
		inet_csk_listen_stop(sk);
	} else if (unlikely(tp->repair)) {
		sk->sk_err = ECONNABORTED;
	} else if (tcp_need_reset(old_state) ||
		   (tp->snd_nxt != tp->write_seq &&
		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
		/* The last check adjusts for discrepancy of Linux wrt. RFC
		 * states
		 */
		tcp_send_active_reset(sk, gfp_any());
		sk->sk_err = ECONNRESET;
	} else if (old_state == TCP_SYN_SENT)
		sk->sk_err = ECONNRESET;

	tcp_clear_xmit_timers(sk);
	__skb_queue_purge(&sk->sk_receive_queue);
	tcp_write_queue_purge(sk);
	__skb_queue_purge(&tp->out_of_order_queue);
#ifdef CONFIG_NET_DMA
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	inet->inet_dport = 0;

	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
		inet_reset_saddr(sk);

	sk->sk_shutdown = 0;
	sock_reset_flag(sk, SOCK_DONE);
	tp->srtt = 0;
	if ((tp->write_seq += tp->max_window + 2) == 0)
		tp->write_seq = 1;
	icsk->icsk_backoff = 0;
	tp->snd_cwnd = 2;
	icsk->icsk_probes_out = 0;
	tp->packets_out = 0;
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_cnt = 0;
	tp->window_clamp = 0;
	tcp_set_ca_state(sk, TCP_CA_Open);
	tcp_clear_retrans(tp);
	inet_csk_delack_init(sk);
	tcp_init_send_head(sk);
	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
	__sk_dst_reset(sk);

	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);

	sk->sk_error_report(sk);
	return err;
}
EXPORT_SYMBOL(tcp_disconnect);

void tcp_sock_destruct(struct sock *sk)
{
	inet_sock_destruct(sk);

	kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
}

static inline bool tcp_can_repair_sock(const struct sock *sk)
{
	return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
		((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
}

static int tcp_repair_options_est(struct tcp_sock *tp,
		struct tcp_repair_opt __user *optbuf, unsigned int len)
{
	struct tcp_repair_opt opt;

	while (len >= sizeof(opt)) {
		if (copy_from_user(&opt, optbuf, sizeof(opt)))
			return -EFAULT;

		optbuf++;
		len -= sizeof(opt);

		switch (opt.opt_code) {
		case TCPOPT_MSS:
			tp->rx_opt.mss_clamp = opt.opt_val;
			break;
		case TCPOPT_WINDOW:
			{
				u16 snd_wscale = opt.opt_val & 0xFFFF;
				u16 rcv_wscale = opt.opt_val >> 16;

				if (snd_wscale > 14 || rcv_wscale > 14)
					return -EFBIG;

				tp->rx_opt.snd_wscale = snd_wscale;
				tp->rx_opt.rcv_wscale = rcv_wscale;
				tp->rx_opt.wscale_ok = 1;
			}
			break;
		case TCPOPT_SACK_PERM:
			if (opt.opt_val != 0)
				return -EINVAL;

			tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
			if (sysctl_tcp_fack)
				tcp_enable_fack(tp);
			break;
		case TCPOPT_TIMESTAMP:
			if (opt.opt_val != 0)
				return -EINVAL;

			tp->rx_opt.tstamp_ok = 1;
			break;
		}
	}

	return 0;
}

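/*
 *	Socket option code for TCP.
 */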
static int do_tcp_setsockopt(struct sock *sk, int level,
		int optname, char __user *optval, unsigned int optlen)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int val;
	int err = 0;

	/* These are data/string values, all the others are ints */
	switch (optname) {
	case TCP_CONGESTION: {
		char name[TCP_CA_NAME_MAX];

		if (optlen < 1)
			return -EINVAL;

		val = strncpy_from_user(name, optval,
					min_t(long, TCP_CA_NAME_MAX-1, optlen));
		if (val < 0)
			return -EFAULT;
		name[val] = 0;

		lock_sock(sk);
		err = tcp_set_congestion_control(sk, name);
		release_sock(sk);
		return err;
	}
	default:
		/* fallthru */
		break;
	}

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	lock_sock(sk);

	switch (optname) {
	case TCP_MAXSEG:
		/* Values greater than interface MTU won't take effect. However
		 * at the point when this call is done we typically don't yet
		 * know which interface is going to be used */
		if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
			err = -EINVAL;
			break;
		}
		tp->rx_opt.user_mss = val;
		break;

	case TCP_NODELAY:
		if (val) {
			/* TCP_NODELAY is weaker than TCP_CORK, so that
			 * this option on corked socket is remembered, but
			 * it is not activated until cork is cleared.
			 *
			 * However, when TCP_NODELAY is set we make
			 * an explicit push, which overrides even TCP_CORK
			 * for currently queued segments.
			 */
			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
			tcp_push_pending_frames(sk);
		} else {
			tp->nonagle &= ~TCP_NAGLE_OFF;
		}
		break;

	case TCP_THIN_LINEAR_TIMEOUTS:
		if (val < 0 || val > 1)
			err = -EINVAL;
		else
			tp->thin_lto = val;
		break;

	case TCP_THIN_DUPACK:
		if (val < 0 || val > 1)
			err = -EINVAL;
		else {
			tp->thin_dupack = val;
			if (tp->thin_dupack)
				tcp_disable_early_retrans(tp);
		}
		break;

	case TCP_REPAIR:
		if (!tcp_can_repair_sock(sk))
			err = -EPERM;
		else if (val == 1) {
			tp->repair = 1;
			sk->sk_reuse = SK_FORCE_REUSE;
			tp->repair_queue = TCP_NO_QUEUE;
		} else if (val == 0) {
			tp->repair = 0;
			sk->sk_reuse = SK_NO_REUSE;
			tcp_send_window_probe(sk);
		} else
			err = -EINVAL;

		break;

	case TCP_REPAIR_QUEUE:
		if (!tp->repair)
			err = -EPERM;
		else if (val < TCP_QUEUES_NR)
			tp->repair_queue = val;
		else
			err = -EINVAL;
		break;

	case TCP_QUEUE_SEQ:
		if (sk->sk_state != TCP_CLOSE)
			err = -EPERM;
		else if (tp->repair_queue == TCP_SEND_QUEUE)
			tp->write_seq = val;
		else if (tp->repair_queue == TCP_RECV_QUEUE)
			tp->rcv_nxt = val;
		else
			err = -EINVAL;
		break;

	case TCP_REPAIR_OPTIONS:
		if (!tp->repair)
			err = -EINVAL;
		else if (sk->sk_state == TCP_ESTABLISHED)
			err = tcp_repair_options_est(tp,
					(struct tcp_repair_opt __user *)optval,
					optlen);
		else
			err = -EPERM;
		break;

	case TCP_CORK:
		/* When set indicates to always queue non-full frames.
		 * Later the user clears this option and we transmit
		 * any pending partial frames in the queue.  This is
		 * meant to be used alongside sendfile() to get properly
		 * filled frames when the user (for example) must write
		 * out headers with a write() call first and then use
		 * sendfile to send out the data parts.
		 *
		 * TCP_CORK can be set together with TCP_NODELAY and it is
		 * stronger than TCP_NODELAY.
		 */
		if (val) {
			tp->nonagle |= TCP_NAGLE_CORK;
		} else {
			tp->nonagle &= ~TCP_NAGLE_CORK;
			if (tp->nonagle&TCP_NAGLE_OFF)
				tp->nonagle |= TCP_NAGLE_PUSH;
			tcp_push_pending_frames(sk);
		}
		break;

	case TCP_KEEPIDLE:
		if (val < 1 || val > MAX_TCP_KEEPIDLE)
			err = -EINVAL;
		else {
			tp->keepalive_time = val * HZ;
			if (sock_flag(sk, SOCK_KEEPOPEN) &&
			    !((1 << sk->sk_state) &
			      (TCPF_CLOSE | TCPF_LISTEN))) {
				u32 elapsed = keepalive_time_elapsed(tp);
				if (tp->keepalive_time > elapsed)
					elapsed = tp->keepalive_time - elapsed;
				else
					elapsed = 0;
				inet_csk_reset_keepalive_timer(sk, elapsed);
			}
		}
		break;
	case TCP_KEEPINTVL:
		if (val < 1 || val > MAX_TCP_KEEPINTVL)
			err = -EINVAL;
		else
			tp->keepalive_intvl = val * HZ;
		break;
	case TCP_KEEPCNT:
		if (val < 1 || val > MAX_TCP_KEEPCNT)
			err = -EINVAL;
		else
			tp->keepalive_probes = val;
		break;
	case TCP_SYNCNT:
		if (val < 1 || val > MAX_TCP_SYNCNT)
			err = -EINVAL;
		else
			icsk->icsk_syn_retries = val;
		break;

	case TCP_LINGER2:
		if (val < 0)
			tp->linger2 = -1;
		else if (val > sysctl_tcp_fin_timeout / HZ)
			tp->linger2 = 0;
		else
			tp->linger2 = val * HZ;
		break;

	case TCP_DEFER_ACCEPT:
		/* Translate value in seconds to number of retransmits */
		icsk->icsk_accept_queue.rskq_defer_accept =
			secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
					TCP_RTO_MAX / HZ);
		break;

	case TCP_WINDOW_CLAMP:
		if (!val) {
			if (sk->sk_state != TCP_CLOSE) {
				err = -EINVAL;
				break;
			}
			tp->window_clamp = 0;
		} else
			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
						SOCK_MIN_RCVBUF / 2 : val;
		break;

	case TCP_QUICKACK:
		if (!val) {
			icsk->icsk_ack.pingpong = 1;
		} else {
			icsk->icsk_ack.pingpong = 0;
			if ((1 << sk->sk_state) &
			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
			    inet_csk_ack_scheduled(sk)) {
				icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
				tcp_cleanup_rbuf(sk, 1);
				if (!(val & 1))
					icsk->icsk_ack.pingpong = 1;
			}
		}
		break;

#ifdef CONFIG_TCP_MD5SIG
	case TCP_MD5SIG:
		/* Read the IP->Key mappings from userspace */
		err = tp->af_specific->md5_parse(sk, optval, optlen);
		break;
#endif
	case TCP_USER_TIMEOUT:
		/* Cap the max timeout in ms TCP will retry/retrans
		 * before giving up and aborting (ETIMEDOUT) a connection.
		 */
		if (val < 0)
			err = -EINVAL;
		else
			icsk->icsk_user_timeout = msecs_to_jiffies(val);
		break;

	case TCP_FASTOPEN:
		if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
		    TCPF_LISTEN)))
			err = fastopen_init_queue(sk, val);
		else
			err = -EINVAL;
		break;
	case TCP_TIMESTAMP:
		if (!tp->repair)
			err = -EPERM;
		else
			tp->tsoffset = val - tcp_time_stamp;
		break;
	case TCP_NOTSENT_LOWAT:
		tp->notsent_lowat = val;
		sk->sk_write_space(sk);
		break;
	default:
		err = -ENOPROTOOPT;
		break;
	}

	release_sock(sk);
	return err;
}

int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
		   unsigned int optlen)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	if (level != SOL_TCP)
		return icsk->icsk_af_ops->setsockopt(sk, level, optname,
						     optval, optlen);
	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(tcp_setsockopt);

#ifdef CONFIG_COMPAT
int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	if (level != SOL_TCP)
		return inet_csk_compat_setsockopt(sk, level, optname,
						  optval, optlen);
	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_tcp_setsockopt);
#endif

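/* Return information about state of tcp endpoint in API format. */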
void tcp_get_info(const struct sock *sk, struct tcp_info *info)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	u32 now = tcp_time_stamp;

	memset(info, 0, sizeof(*info));

	info->tcpi_state = sk->sk_state;
	info->tcpi_ca_state = icsk->icsk_ca_state;
	info->tcpi_retransmits = icsk->icsk_retransmits;
	info->tcpi_probes = icsk->icsk_probes_out;
	info->tcpi_backoff = icsk->icsk_backoff;

	if (tp->rx_opt.tstamp_ok)
		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
	if (tcp_is_sack(tp))
		info->tcpi_options |= TCPI_OPT_SACK;
	if (tp->rx_opt.wscale_ok) {
		info->tcpi_options |= TCPI_OPT_WSCALE;
		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
	}

	if (tp->ecn_flags & TCP_ECN_OK)
		info->tcpi_options |= TCPI_OPT_ECN;
	if (tp->ecn_flags & TCP_ECN_SEEN)
		info->tcpi_options |= TCPI_OPT_ECN_SEEN;
	if (tp->syn_data_acked)
		info->tcpi_options |= TCPI_OPT_SYN_DATA;

	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
	info->tcpi_snd_mss = tp->mss_cache;
	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;

	if (sk->sk_state == TCP_LISTEN) {
		info->tcpi_unacked = sk->sk_ack_backlog;
		info->tcpi_sacked = sk->sk_max_ack_backlog;
	} else {
		info->tcpi_unacked = tp->packets_out;
		info->tcpi_sacked = tp->sacked_out;
	}
	info->tcpi_lost = tp->lost_out;
	info->tcpi_retrans = tp->retrans_out;
	info->tcpi_fackets = tp->fackets_out;

	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);

	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
	info->tcpi_snd_cwnd = tp->snd_cwnd;
	info->tcpi_advmss = tp->advmss;
	info->tcpi_reordering = tp->reordering;

	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
	info->tcpi_rcv_space = tp->rcvq_space.space;

	info->tcpi_total_retrans = tp->total_retrans;
}
EXPORT_SYMBOL_GPL(tcp_get_info);

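/* Illustrative userspace sketch (not part of this file): tcp_get_info()
 * above backs the TCP_INFO socket option, e.g.
 *
 *	struct tcp_info ti;
 *	socklen_t tlen = sizeof(ti);
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &tlen) == 0)
 *		printf("rtt=%uus cwnd=%u\n", ti.tcpi_rtt, ti.tcpi_snd_cwnd);
 */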
2761static int do_tcp_getsockopt(struct sock *sk, int level,
2762 int optname, char __user *optval, int __user *optlen)
2763{
2764 struct inet_connection_sock *icsk = inet_csk(sk);
2765 struct tcp_sock *tp = tcp_sk(sk);
2766 int val, len;
2767
2768 if (get_user(len, optlen))
2769 return -EFAULT;
2770
2771 len = min_t(unsigned int, len, sizeof(int));
2772
2773 if (len < 0)
2774 return -EINVAL;
2775
2776 switch (optname) {
2777 case TCP_MAXSEG:
2778 val = tp->mss_cache;
2779 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2780 val = tp->rx_opt.user_mss;
2781 if (tp->repair)
2782 val = tp->rx_opt.mss_clamp;
2783 break;
2784 case TCP_NODELAY:
2785 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2786 break;
2787 case TCP_CORK:
2788 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2789 break;
2790 case TCP_KEEPIDLE:
2791 val = keepalive_time_when(tp) / HZ;
2792 break;
2793 case TCP_KEEPINTVL:
2794 val = keepalive_intvl_when(tp) / HZ;
2795 break;
2796 case TCP_KEEPCNT:
2797 val = keepalive_probes(tp);
2798 break;
2799 case TCP_SYNCNT:
2800 val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
2801 break;
2802 case TCP_LINGER2:
2803 val = tp->linger2;
2804 if (val >= 0)
2805 val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2806 break;
2807 case TCP_DEFER_ACCEPT:
2808 val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
2809 TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
2810 break;
2811 case TCP_WINDOW_CLAMP:
2812 val = tp->window_clamp;
2813 break;
2814 case TCP_INFO: {
2815 struct tcp_info info;
2816
2817 if (get_user(len, optlen))
2818 return -EFAULT;
2819
2820 tcp_get_info(sk, &info);
2821
2822 len = min_t(unsigned int, len, sizeof(info));
2823 if (put_user(len, optlen))
2824 return -EFAULT;
2825 if (copy_to_user(optval, &info, len))
2826 return -EFAULT;
2827 return 0;
2828 }
2829 case TCP_QUICKACK:
2830 val = !icsk->icsk_ack.pingpong;
2831 break;
2832
2833 case TCP_CONGESTION:
2834 if (get_user(len, optlen))
2835 return -EFAULT;
2836 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2837 if (put_user(len, optlen))
2838 return -EFAULT;
2839 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2840 return -EFAULT;
2841 return 0;
2842
2843 case TCP_THIN_LINEAR_TIMEOUTS:
2844 val = tp->thin_lto;
2845 break;
2846 case TCP_THIN_DUPACK:
2847 val = tp->thin_dupack;
2848 break;
2849
2850 case TCP_REPAIR:
2851 val = tp->repair;
2852 break;
2853
2854 case TCP_REPAIR_QUEUE:
2855 if (tp->repair)
2856 val = tp->repair_queue;
2857 else
2858 return -EINVAL;
2859 break;
2860
2861 case TCP_QUEUE_SEQ:
2862 if (tp->repair_queue == TCP_SEND_QUEUE)
2863 val = tp->write_seq;
2864 else if (tp->repair_queue == TCP_RECV_QUEUE)
2865 val = tp->rcv_nxt;
2866 else
2867 return -EINVAL;
2868 break;
2869
2870 case TCP_USER_TIMEOUT:
2871 val = jiffies_to_msecs(icsk->icsk_user_timeout);
2872 break;
2873 case TCP_TIMESTAMP:
2874 val = tcp_time_stamp + tp->tsoffset;
2875 break;
2876 case TCP_NOTSENT_LOWAT:
2877 val = tp->notsent_lowat;
2878 break;
2879 default:
2880 return -ENOPROTOOPT;
2881 }
2882
2883 if (put_user(len, optlen))
2884 return -EFAULT;
2885 if (copy_to_user(optval, &val, len))
2886 return -EFAULT;
2887 return 0;
2888}
2889
2890int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2891 int __user *optlen)
2892{
2893 struct inet_connection_sock *icsk = inet_csk(sk);
2894
2895 if (level != SOL_TCP)
2896 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
2897 optval, optlen);
2898 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2899}
2900EXPORT_SYMBOL(tcp_getsockopt);
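
/*
 * Example (userspace sketch): unlike the integer options above,
 * TCP_CONGESTION returns a string of at most 16 (TCP_CA_NAME_MAX)
 * bytes naming the congestion control algorithm:
 *
 *	char name[16];
 *	socklen_t len = sizeof(name);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, &len) == 0)
 *		printf("%.*s\n", (int)len, name);
 */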

#ifdef CONFIG_COMPAT
int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	if (level != SOL_TCP)
		return inet_csk_compat_getsockopt(sk, level, optname,
						  optval, optlen);
	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_tcp_getsockopt);
#endif

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool __read_mostly;
static DEFINE_MUTEX(tcp_md5sig_mutex);

static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu);

		if (p->md5_desc.tfm)
			crypto_free_hash(p->md5_desc.tfm);
	}
	free_percpu(pool);
}

static void __tcp_alloc_md5sig_pool(void)
{
	int cpu;
	struct tcp_md5sig_pool __percpu *pool;

	pool = alloc_percpu(struct tcp_md5sig_pool);
	if (!pool)
		return;

	for_each_possible_cpu(cpu) {
		struct crypto_hash *hash;

		hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
		if (IS_ERR_OR_NULL(hash))
			goto out_free;

		per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash;
	}
	/* Before publishing the pool, commit all the per-cpu writes to
	 * memory. Otherwise a lockless reader of tcp_md5sig_pool could
	 * see a partially initialized pool.
	 */
	smp_wmb();
	tcp_md5sig_pool = pool;
	return;
out_free:
	__tcp_free_md5sig_pool(pool);
}

bool tcp_alloc_md5sig_pool(void)
{
	/* Fast path: the pool is allocated lazily, once, and then only
	 * its presence is checked; the mutex serializes allocators.
	 */
	if (unlikely(!tcp_md5sig_pool)) {
		mutex_lock(&tcp_md5sig_mutex);

		if (!tcp_md5sig_pool)
			__tcp_alloc_md5sig_pool();

		mutex_unlock(&tcp_md5sig_mutex);
	}
	return tcp_md5sig_pool != NULL;
}
EXPORT_SYMBOL(tcp_alloc_md5sig_pool);

/**
 *	tcp_get_md5sig_pool - get md5sig_pool for this user
 *
 *	We use percpu structure, so if we succeed, we exit with preemption
 *	and BH disabled, to make sure another thread or softirq handling
 *	will not try to get same context.
 */
struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
{
	struct tcp_md5sig_pool __percpu *p;

	local_bh_disable();
	p = ACCESS_ONCE(tcp_md5sig_pool);
	if (p)
		return __this_cpu_ptr(p);

	local_bh_enable();
	return NULL;
}
EXPORT_SYMBOL(tcp_get_md5sig_pool);
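
/*
 * Typical pairing (sketch, not a caller in this file): every successful
 * tcp_get_md5sig_pool() must be matched by tcp_put_md5sig_pool(), which
 * re-enables BHs:
 *
 *	struct tcp_md5sig_pool *hp = tcp_get_md5sig_pool();
 *
 *	if (hp) {
 *		... use hp->md5_desc ...
 *		tcp_put_md5sig_pool();
 *	}
 */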

int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
			const struct tcphdr *th)
{
	struct scatterlist sg;
	struct tcphdr hdr;
	int err;

	/* We are not allowed to change tcphdr, make a local copy */
	memcpy(&hdr, th, sizeof(hdr));
	hdr.check = 0;

	/* options aren't included in the hash */
	sg_init_one(&sg, &hdr, sizeof(hdr));
	err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr));
	return err;
}
EXPORT_SYMBOL(tcp_md5_hash_header);

int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
			  const struct sk_buff *skb, unsigned int header_len)
{
	struct scatterlist sg;
	const struct tcphdr *tp = tcp_hdr(skb);
	struct hash_desc *desc = &hp->md5_desc;
	unsigned int i;
	const unsigned int head_data_len = skb_headlen(skb) > header_len ?
					   skb_headlen(skb) - header_len : 0;
	const struct skb_shared_info *shi = skb_shinfo(skb);
	struct sk_buff *frag_iter;

	sg_init_table(&sg, 1);

	/* linear data first, then each page fragment, then sub-skbs */
	sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
	if (crypto_hash_update(desc, &sg, head_data_len))
		return 1;

	for (i = 0; i < shi->nr_frags; ++i) {
		const struct skb_frag_struct *f = &shi->frags[i];
		unsigned int offset = f->page_offset;
		struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);

		sg_set_page(&sg, page, skb_frag_size(f),
			    offset_in_page(offset));
		if (crypto_hash_update(desc, &sg, skb_frag_size(f)))
			return 1;
	}

	skb_walk_frags(skb, frag_iter)
		if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
			return 1;

	return 0;
}
EXPORT_SYMBOL(tcp_md5_hash_skb_data);

int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
{
	struct scatterlist sg;

	sg_init_one(&sg, key->key, key->keylen);
	return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
}
EXPORT_SYMBOL(tcp_md5_hash_key);
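
/*
 * The helpers above are combined by the address-family code roughly as
 * follows (a sketch; see tcp_v4_md5_hash_skb() in tcp_ipv4.c for the
 * real sequence, where tcp_v4_md5_hash_pseudoheader() is a local
 * helper):
 *
 *	struct tcp_md5sig_pool *hp = tcp_get_md5sig_pool();
 *
 *	crypto_hash_init(&hp->md5_desc);
 *	tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len);
 *	tcp_md5_hash_header(hp, th);
 *	tcp_md5_hash_skb_data(hp, skb, th->doff << 2);
 *	tcp_md5_hash_key(hp, key);
 *	crypto_hash_final(&hp->md5_desc, md5_hash);
 *	tcp_put_md5sig_pool();
 */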

#endif

void tcp_done(struct sock *sk)
{
	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;

	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);

	tcp_set_state(sk, TCP_CLOSE);
	tcp_clear_xmit_timers(sk);
	if (req != NULL)
		reqsk_fastopen_remove(sk, req, false);

	sk->sk_shutdown = SHUTDOWN_MASK;

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_state_change(sk);
	else
		inet_csk_destroy_sock(sk);
}
EXPORT_SYMBOL_GPL(tcp_done);

extern struct tcp_congestion_ops tcp_reno;

static __initdata unsigned long thash_entries;
static int __init set_thash_entries(char *str)
{
	ssize_t ret;

	if (!str)
		return 0;

	ret = kstrtoul(str, 0, &thash_entries);
	if (ret)
		return 0;

	return 1;
}
__setup("thash_entries=", set_thash_entries);
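
/*
 * Example (boot command line, illustrative only): cap the established
 * hash table at 16384 buckets instead of letting it scale with memory:
 *
 *	thash_entries=16384
 */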

void tcp_init_mem(struct net *net)
{
	unsigned long limit = nr_free_buffer_pages() / 8;

	limit = max(limit, 128UL);
	net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
	net->ipv4.sysctl_tcp_mem[1] = limit;
	net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
}
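
/*
 * Worked example (assuming roughly 1 GiB of lowmem, i.e. ~262144 free
 * buffer pages, and ignoring reserved memory): limit = 32768 pages, so
 * tcp_mem becomes {24576, 32768, 49152} pages. With 4 KiB pages that
 * means memory pressure starts at 128 MiB of TCP buffer memory and the
 * hard limit is 192 MiB.
 */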

void __init tcp_init(void)
{
	struct sk_buff *skb = NULL;
	unsigned long limit;
	int max_rshare, max_wshare, cnt;
	unsigned int i;

	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));

	percpu_counter_init(&tcp_sockets_allocated, 0);
	percpu_counter_init(&tcp_orphan_count, 0);
	tcp_hashinfo.bind_bucket_cachep =
		kmem_cache_create("tcp_bind_bucket",
				  sizeof(struct inet_bind_bucket), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Size and allocate the main established and bind bucket
	 * hash tables.
	 *
	 * The methodology is similar to that of the buffer cache.
	 */
	tcp_hashinfo.ehash =
		alloc_large_system_hash("TCP established",
					sizeof(struct inet_ehash_bucket),
					thash_entries,
					17, /* one slot per 128 KB of memory */
					0,
					NULL,
					&tcp_hashinfo.ehash_mask,
					0,
					thash_entries ? 0 : 512 * 1024);
	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
	}
	if (inet_ehash_locks_alloc(&tcp_hashinfo))
		panic("TCP: failed to alloc ehash_locks");
	tcp_hashinfo.bhash =
		alloc_large_system_hash("TCP bind",
					sizeof(struct inet_bind_hashbucket),
					tcp_hashinfo.ehash_mask + 1,
					17, /* one slot per 128 KB of memory */
					0,
					&tcp_hashinfo.bhash_size,
					NULL,
					0,
					64 * 1024);
	tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
	}

	cnt = tcp_hashinfo.ehash_mask + 1;

	tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	sysctl_tcp_max_orphans = cnt / 2;
	sysctl_max_syn_backlog = max(128, cnt / 256);

	tcp_init_mem(&init_net);
	/* Set per-socket limits to no more than 1/128 the pressure threshold */
	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
	max_wshare = min(4UL*1024*1024, limit);
	max_rshare = min(6UL*1024*1024, limit);

	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
	sysctl_tcp_wmem[1] = 16*1024;
	sysctl_tcp_wmem[2] = max(64*1024, max_wshare);

	sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
	sysctl_tcp_rmem[1] = 87380;
	sysctl_tcp_rmem[2] = max(87380, max_rshare);

	pr_info("Hash tables configured (established %u bind %u)\n",
		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);

	tcp_metrics_init();

	tcp_register_congestion_control(&tcp_reno);

	tcp_tasklet_init();
}