#define pr_fmt(fmt) "TCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/scatterlist.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/random.h>
#include <linux/bootmem.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/crypto.h>
#include <linux/time.h>
#include <linux/slab.h>

#include <net/icmp.h>
#include <net/inet_common.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/netdma.h>
#include <net/sock.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <net/busy_poll.h>

int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;

struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);

int sysctl_tcp_wmem[3] __read_mostly;
int sysctl_tcp_rmem[3] __read_mostly;

EXPORT_SYMBOL(sysctl_tcp_rmem);
EXPORT_SYMBOL(sysctl_tcp_wmem);

atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
EXPORT_SYMBOL(tcp_memory_allocated);

/* Current number of TCP sockets. */
struct percpu_counter tcp_sockets_allocated;
EXPORT_SYMBOL(tcp_sockets_allocated);

/* TCP splice context. */
struct tcp_splice_state {
        struct pipe_inode_info *pipe;
        size_t len;
        unsigned int flags;
};

/* Pressure flag: try to collapse.
 * Technical note: it is read and written by multiple contexts
 * non-atomically; accounting is strict, actions are advisory.
 */
int tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL(tcp_memory_pressure);

void tcp_enter_memory_pressure(struct sock *sk)
{
        if (!tcp_memory_pressure) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
                tcp_memory_pressure = 1;
        }
}
EXPORT_SYMBOL(tcp_enter_memory_pressure);

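/* Convert a user-supplied time in seconds into a number of retransmits,
 * given the initial timeout and the maximum RTO (exponential backoff,
 * clamped at rto_max).
 */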
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
{
        u8 res = 0;

        if (seconds > 0) {
                int period = timeout;

                res = 1;
                while (seconds > period && res < 255) {
                        res++;
                        timeout <<= 1;
                        if (timeout > rto_max)
                                timeout = rto_max;
                        period += timeout;
                }
        }
        return res;
}

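/* Inverse of the above: convert a retransmit count back into seconds. */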
static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
{
        int period = 0;

        if (retrans > 0) {
                period = timeout;
                while (--retrans) {
                        timeout <<= 1;
                        if (timeout > rto_max)
                                timeout = rto_max;
                        period += timeout;
                }
        }
        return period;
}

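/* Address-family independent initialization for a tcp_sock.
 *
 * NOTE: much of the state is already zeroed by sk_alloc(), so only the
 * fields that need a non-zero initial value are set up here.
 */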
void tcp_init_sock(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);

        skb_queue_head_init(&tp->out_of_order_queue);
        tcp_init_xmit_timers(sk);
        tcp_prequeue_init(tp);
        INIT_LIST_HEAD(&tp->tsq_node);

        icsk->icsk_rto = TCP_TIMEOUT_INIT;
        tp->mdev = TCP_TIMEOUT_INIT;

        /* Initial congestion window. */
        tp->snd_cwnd = TCP_INIT_CWND;

        /* ssthresh starts effectively unbounded; it is reduced on loss. */
        tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
        tp->snd_cwnd_clamp = ~0;
        tp->mss_cache = TCP_MSS_DEFAULT;

        tp->reordering = sysctl_tcp_reordering;
        tcp_enable_early_retrans(tp);
        icsk->icsk_ca_ops = &tcp_init_congestion_ops;

        tp->tsoffset = 0;

        sk->sk_state = TCP_CLOSE;

        sk->sk_write_space = sk_stream_write_space;
        sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

        icsk->icsk_sync_mss = tcp_sync_mss;

        /* Default buffer sizes from the tcp_{w,r}mem sysctls. */
        sk->sk_sndbuf = sysctl_tcp_wmem[1];
        sk->sk_rcvbuf = sysctl_tcp_rmem[1];

        local_bh_disable();
        sock_update_memcg(sk);
        sk_sockets_allocated_inc(sk);
        local_bh_enable();
}
EXPORT_SYMBOL(tcp_init_sock);

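/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to lock the socket here: the upper poll
 *	layers handle the race between testing state and the event
 *	arriving, and we never look into the socket buffers directly.
 */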
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
        unsigned int mask;
        struct sock *sk = sock->sk;
        const struct tcp_sock *tp = tcp_sk(sk);

        sock_rps_record_flow(sk);

        sock_poll_wait(file, sk_sleep(sk), wait);
        if (sk->sk_state == TCP_LISTEN)
                return inet_csk_listen_poll(sk);

        /* The socket is not locked; poll logic protects us from async
         * events, and state changes made by other threads cannot be
         * handled race-free here in any case.
         */
        mask = 0;

        /* POLLHUP is reported only when the connection is down in both
         * directions (or fully closed); a half-closed receive side is
         * reported via POLLIN | POLLRDHUP instead.
         */
        if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
                mask |= POLLHUP;
        if (sk->sk_shutdown & RCV_SHUTDOWN)
                mask |= POLLIN | POLLRDNORM | POLLRDHUP;

        /* Connected or passive Fast Open socket? */
        if (sk->sk_state != TCP_SYN_SENT &&
            (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
                int target = sock_rcvlowat(sk, 0, INT_MAX);

                if (tp->urg_seq == tp->copied_seq &&
                    !sock_flag(sk, SOCK_URGINLINE) &&
                    tp->urg_data)
                        target++;

                if (tp->rcv_nxt - tp->copied_seq >= target)
                        mask |= POLLIN | POLLRDNORM;

                if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
                        if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
                                mask |= POLLOUT | POLLWRNORM;
                        } else {  /* send SIGIO later */
                                set_bit(SOCK_ASYNC_NOSPACE,
                                        &sk->sk_socket->flags);
                                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

                                /* Race breaker. If space is freed after
                                 * wspace test but before the flags are set,
                                 * IO signal will be lost.
                                 */
                                if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
                                        mask |= POLLOUT | POLLWRNORM;
                        }
                } else
                        mask |= POLLOUT | POLLWRNORM;

                if (tp->urg_data & TCP_URG_VALID)
                        mask |= POLLPRI;
        }

        /* This barrier is coupled with the write barrier on the error path. */
        smp_rmb();
        if (sk->sk_err)
                mask |= POLLERR;

        return mask;
}
EXPORT_SYMBOL(tcp_poll);

int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int answ;
        bool slow;

        switch (cmd) {
        case SIOCINQ:
                if (sk->sk_state == TCP_LISTEN)
                        return -EINVAL;

                slow = lock_sock_fast(sk);
                if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
                        answ = 0;
                else if (sock_flag(sk, SOCK_URGINLINE) ||
                         !tp->urg_data ||
                         before(tp->urg_seq, tp->copied_seq) ||
                         !before(tp->urg_seq, tp->rcv_nxt)) {

                        answ = tp->rcv_nxt - tp->copied_seq;

                        /* Subtract 1, if FIN was received */
                        if (answ && sock_flag(sk, SOCK_DONE))
                                answ--;
                } else
                        answ = tp->urg_seq - tp->copied_seq;
                unlock_sock_fast(sk, slow);
                break;
        case SIOCATMARK:
                answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
                break;
        case SIOCOUTQ:
                if (sk->sk_state == TCP_LISTEN)
                        return -EINVAL;

                if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
                        answ = 0;
                else
                        answ = tp->write_seq - tp->snd_una;
                break;
        case SIOCOUTQNSD:
                if (sk->sk_state == TCP_LISTEN)
                        return -EINVAL;

                if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
                        answ = 0;
                else
                        answ = tp->write_seq - tp->snd_nxt;
                break;
        default:
                return -ENOIOCTLCMD;
        }

        return put_user(answ, (int __user *)arg);
}
EXPORT_SYMBOL(tcp_ioctl);

static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
        TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
        tp->pushed_seq = tp->write_seq;
}

static inline bool forced_push(const struct tcp_sock *tp)
{
        return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}

static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

        skb->csum = 0;
        tcb->seq = tcb->end_seq = tp->write_seq;
        tcb->tcp_flags = TCPHDR_ACK;
        tcb->sacked = 0;
        skb_header_release(skb);
        tcp_add_write_queue_tail(sk, skb);
        sk->sk_wmem_queued += skb->truesize;
        sk_mem_charge(sk, skb->truesize);
        if (tp->nonagle & TCP_NAGLE_PUSH)
                tp->nonagle &= ~TCP_NAGLE_PUSH;
}

static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
{
        if (flags & MSG_OOB)
                tp->snd_up = tp->write_seq;
}

static inline void tcp_push(struct sock *sk, int flags, int mss_now,
                            int nonagle)
{
        if (tcp_send_head(sk)) {
                struct tcp_sock *tp = tcp_sk(sk);

                if (!(flags & MSG_MORE) || forced_push(tp))
                        tcp_mark_push(tp, tcp_write_queue_tail(sk));

                tcp_mark_urg(tp, flags);
                __tcp_push_pending_frames(sk, mss_now,
                                          (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
        }
}

static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
                                unsigned int offset, size_t len)
{
        struct tcp_splice_state *tss = rd_desc->arg.data;
        int ret;

        ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
                              tss->flags);
        if (ret > 0)
                rd_desc->count -= ret;
        return ret;
}

static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
{
        /* Store TCP splice context information in read_descriptor_t. */
        read_descriptor_t rd_desc = {
                .arg.data = tss,
                .count    = tss->len,
        };

        return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
}

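/**
 *  tcp_splice_read - splice data from TCP socket to a pipe
 *  @sock:	socket to splice from
 *  @ppos:	position (not valid)
 *  @pipe:	pipe to splice to
 *  @len:	number of bytes to splice
 *  @flags:	splice modifier flags
 *
 *  Description:
 *    Will read pages from given socket and fill them into a pipe.
 */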
ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
                        struct pipe_inode_info *pipe, size_t len,
                        unsigned int flags)
{
        struct sock *sk = sock->sk;
        struct tcp_splice_state tss = {
                .pipe = pipe,
                .len = len,
                .flags = flags,
        };
        long timeo;
        ssize_t spliced;
        int ret;

        sock_rps_record_flow(sk);

        /* We can't seek on a socket input. */
        if (unlikely(*ppos))
                return -ESPIPE;

        ret = spliced = 0;

        lock_sock(sk);

        timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
        while (tss.len) {
                ret = __tcp_splice_read(sk, &tss);
                if (ret < 0)
                        break;
                else if (!ret) {
                        if (spliced)
                                break;
                        if (sock_flag(sk, SOCK_DONE))
                                break;
                        if (sk->sk_err) {
                                ret = sock_error(sk);
                                break;
                        }
                        if (sk->sk_shutdown & RCV_SHUTDOWN)
                                break;
                        if (sk->sk_state == TCP_CLOSE) {
                                /* This occurs when user tries to read
                                 * from never connected socket.
                                 */
                                if (!sock_flag(sk, SOCK_DONE))
                                        ret = -ENOTCONN;
                                break;
                        }
                        if (!timeo) {
                                ret = -EAGAIN;
                                break;
                        }
                        sk_wait_data(sk, &timeo);
                        if (signal_pending(current)) {
                                ret = sock_intr_errno(timeo);
                                break;
                        }
                        continue;
                }
                tss.len -= ret;
                spliced += ret;

                if (!timeo)
                        break;
                release_sock(sk);
                lock_sock(sk);

                if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
                    (sk->sk_shutdown & RCV_SHUTDOWN) ||
                    signal_pending(current))
                        break;
        }

        release_sock(sk);

        if (spliced)
                return spliced;

        return ret;
}
EXPORT_SYMBOL(tcp_splice_read);

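/* Allocate a new skb for the write queue, reserving room for TCP headers
 * and charging it against the socket's send-buffer accounting.  Returns
 * NULL (and enters memory pressure) if the allocation or charge fails.
 */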
struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
{
        struct sk_buff *skb;

        /* The TCP header must be at least 32-bit aligned. */
        size = ALIGN(size, 4);

        skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
        if (skb) {
                if (sk_wmem_schedule(sk, skb->truesize)) {
                        skb_reserve(skb, sk->sk_prot->max_header);
                        /* Make sure that we have exactly size bytes
                         * available to the caller, no more, no less.
                         */
                        skb->reserved_tailroom = skb->end - skb->tail - size;
                        return skb;
                }
                __kfree_skb(skb);
        } else {
                sk->sk_prot->enter_memory_pressure(sk);
                sk_stream_moderate_sndbuf(sk);
        }
        return NULL;
}

static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
                                       int large_allowed)
{
        struct tcp_sock *tp = tcp_sk(sk);
        u32 xmit_size_goal, old_size_goal;

        xmit_size_goal = mss_now;

        if (large_allowed && sk_can_gso(sk)) {
                xmit_size_goal = ((sk->sk_gso_max_size - 1) -
                                  inet_csk(sk)->icsk_af_ops->net_header_len -
                                  inet_csk(sk)->icsk_ext_hdr_len -
                                  tp->tcp_header_len);

                /* TSQ : try to have two TSO segments in flight */
                xmit_size_goal = min_t(u32, xmit_size_goal,
                                       sysctl_tcp_limit_output_bytes >> 1);

                xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);

                /* We try hard to avoid divides here */
                old_size_goal = tp->xmit_size_goal_segs * mss_now;

                if (likely(old_size_goal <= xmit_size_goal &&
                           old_size_goal + mss_now > xmit_size_goal)) {
                        xmit_size_goal = old_size_goal;
                } else {
                        tp->xmit_size_goal_segs =
                                min_t(u16, xmit_size_goal / mss_now,
                                      sk->sk_gso_max_segs);
                        xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
                }
        }

        return max(xmit_size_goal, mss_now);
}

static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
{
        int mss_now;

        mss_now = tcp_current_mss(sk);
        *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));

        return mss_now;
}

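/* Core of sendpage(): append pages from @page at @offset to the write
 * queue (coalescing with the tail skb when possible) and push frames out
 * according to @flags and the Nagle state.
 */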
static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
                                size_t size, int flags)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int mss_now, size_goal;
        int err;
        ssize_t copied;
        long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

        /* Wait for a connection to finish. One exception is TCP Fast Open
         * (passive side) where data is allowed to be sent before a connection
         * is fully established.
         */
        if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
            !tcp_passive_fastopen(sk)) {
                if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
                        goto out_err;
        }

        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

        mss_now = tcp_send_mss(sk, &size_goal, flags);
        copied = 0;

        err = -EPIPE;
        if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
                goto out_err;

        while (size > 0) {
                struct sk_buff *skb = tcp_write_queue_tail(sk);
                int copy, i;
                bool can_coalesce;

                if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
new_segment:
                        if (!sk_stream_memory_free(sk))
                                goto wait_for_sndbuf;

                        skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
                        if (!skb)
                                goto wait_for_memory;

                        skb_entail(sk, skb);
                        copy = size_goal;
                }

                if (copy > size)
                        copy = size;

                i = skb_shinfo(skb)->nr_frags;
                can_coalesce = skb_can_coalesce(skb, i, page, offset);
                if (!can_coalesce && i >= MAX_SKB_FRAGS) {
                        tcp_mark_push(tp, skb);
                        goto new_segment;
                }
                if (!sk_wmem_schedule(sk, copy))
                        goto wait_for_memory;

                if (can_coalesce) {
                        skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
                } else {
                        get_page(page);
                        skb_fill_page_desc(skb, i, page, offset, copy);
                }
                skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;

                skb->len += copy;
                skb->data_len += copy;
                skb->truesize += copy;
                sk->sk_wmem_queued += copy;
                sk_mem_charge(sk, copy);
                skb->ip_summed = CHECKSUM_PARTIAL;
                tp->write_seq += copy;
                TCP_SKB_CB(skb)->end_seq += copy;
                skb_shinfo(skb)->gso_segs = 0;

                if (!copied)
                        TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

                copied += copy;
                offset += copy;
                if (!(size -= copy))
                        goto out;

                if (skb->len < size_goal || (flags & MSG_OOB))
                        continue;

                if (forced_push(tp)) {
                        tcp_mark_push(tp, skb);
                        __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
                } else if (skb == tcp_send_head(sk))
                        tcp_push_one(sk, mss_now);
                continue;

wait_for_sndbuf:
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
                tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);

                if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
                        goto do_error;

                mss_now = tcp_send_mss(sk, &size_goal, flags);
        }

out:
        if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
                tcp_push(sk, flags, mss_now, tp->nonagle);
        return copied;

do_error:
        if (copied)
                goto out;
out_err:
        return sk_stream_error(sk, flags, err);
}

int tcp_sendpage(struct sock *sk, struct page *page, int offset,
                 size_t size, int flags)
{
        ssize_t res;

        if (!(sk->sk_route_caps & NETIF_F_SG) ||
            !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
                return sock_no_sendpage(sk->sk_socket, page, offset, size,
                                        flags);

        lock_sock(sk);
        res = do_tcp_sendpages(sk, page, offset, size, flags);
        release_sock(sk);
        return res;
}
EXPORT_SYMBOL(tcp_sendpage);

static inline int select_size(const struct sock *sk, bool sg)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        int tmp = tp->mss_cache;

        if (sg) {
                if (sk_can_gso(sk)) {
                        /* Small frames won't use a full page:
                         * payload will immediately follow the tcp header.
                         */
                        tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
                } else {
                        int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);

                        if (tmp >= pgbreak &&
                            tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
                                tmp = pgbreak;
                }
        }

        return tmp;
}

void tcp_free_fastopen_req(struct tcp_sock *tp)
{
        if (tp->fastopen_req != NULL) {
                kfree(tp->fastopen_req);
                tp->fastopen_req = NULL;
        }
}

static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int err, flags;

        if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
                return -EOPNOTSUPP;
        if (tp->fastopen_req != NULL)
                return -EALREADY; /* Another Fast Open is in progress */

        tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
                                   sk->sk_allocation);
        if (unlikely(tp->fastopen_req == NULL))
                return -ENOBUFS;
        tp->fastopen_req->data = msg;

        flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
        err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
                                    msg->msg_namelen, flags);
        *size = tp->fastopen_req->copied;
        tcp_free_fastopen_req(tp);
        return err;
}

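/* Copy data from the user's iovec into fresh or partially filled skbs on
 * the write queue, charging socket memory as we go, and push segments out
 * according to the Nagle/cork state.  Handles MSG_FASTOPEN, repair mode
 * and the usual blocking/non-blocking semantics.
 */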
int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                size_t size)
{
        struct iovec *iov;
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
        int iovlen, flags, err, copied = 0;
        int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
        bool sg;
        long timeo;

        lock_sock(sk);

        flags = msg->msg_flags;
        if (flags & MSG_FASTOPEN) {
                err = tcp_sendmsg_fastopen(sk, msg, &copied_syn);
                if (err == -EINPROGRESS && copied_syn > 0)
                        goto out;
                else if (err)
                        goto out_err;
                offset = copied_syn;
        }

        timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

        /* Wait for a connection to finish. One exception is TCP Fast Open
         * (passive side) where data is allowed to be sent before a connection
         * is fully established.
         */
        if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
            !tcp_passive_fastopen(sk)) {
                if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
                        goto do_error;
        }

        if (unlikely(tp->repair)) {
                if (tp->repair_queue == TCP_RECV_QUEUE) {
                        copied = tcp_send_rcvq(sk, msg, size);
                        goto out;
                }

                err = -EINVAL;
                if (tp->repair_queue == TCP_NO_QUEUE)
                        goto out_err;

                /* 'common' sending to sendq */
        }

        /* This should be in poll */
        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

        mss_now = tcp_send_mss(sk, &size_goal, flags);

        /* Ok commence sending. */
        iovlen = msg->msg_iovlen;
        iov = msg->msg_iov;
        copied = 0;

        err = -EPIPE;
        if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
                goto out_err;

        sg = !!(sk->sk_route_caps & NETIF_F_SG);

        while (--iovlen >= 0) {
                size_t seglen = iov->iov_len;
                unsigned char __user *from = iov->iov_base;

                iov++;
                if (unlikely(offset > 0)) {  /* Skip bytes copied in SYN */
                        if (offset >= seglen) {
                                offset -= seglen;
                                continue;
                        }
                        seglen -= offset;
                        from += offset;
                        offset = 0;
                }

                while (seglen > 0) {
                        int copy = 0;
                        int max = size_goal;

                        skb = tcp_write_queue_tail(sk);
                        if (tcp_send_head(sk)) {
                                if (skb->ip_summed == CHECKSUM_NONE)
                                        max = mss_now;
                                copy = max - skb->len;
                        }

                        if (copy <= 0) {
new_segment:
                                /* Allocate new segment. If the interface is SG,
                                 * allocate skb fitting to single page.
                                 */
                                if (!sk_stream_memory_free(sk))
                                        goto wait_for_sndbuf;

                                skb = sk_stream_alloc_skb(sk,
                                                          select_size(sk, sg),
                                                          sk->sk_allocation);
                                if (!skb)
                                        goto wait_for_memory;

                                /* All packets are restored as if they have
                                 * already been sent.
                                 */
                                if (tp->repair)
                                        TCP_SKB_CB(skb)->when = tcp_time_stamp;

                                /* Check whether we can use HW checksum. */
                                if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
                                        skb->ip_summed = CHECKSUM_PARTIAL;

                                skb_entail(sk, skb);
                                copy = size_goal;
                                max = size_goal;
                        }

                        /* Try to append data to the end of skb. */
                        if (copy > seglen)
                                copy = seglen;

                        /* Where to copy to? */
                        if (skb_availroom(skb) > 0) {
                                /* We have some space in skb head. Superb! */
                                copy = min_t(int, copy, skb_availroom(skb));
                                err = skb_add_data_nocache(sk, skb, from, copy);
                                if (err)
                                        goto do_fault;
                        } else {
                                bool merge = true;
                                int i = skb_shinfo(skb)->nr_frags;
                                struct page_frag *pfrag = sk_page_frag(sk);

                                if (!sk_page_frag_refill(sk, pfrag))
                                        goto wait_for_memory;

                                if (!skb_can_coalesce(skb, i, pfrag->page,
                                                      pfrag->offset)) {
                                        if (i == MAX_SKB_FRAGS || !sg) {
                                                tcp_mark_push(tp, skb);
                                                goto new_segment;
                                        }
                                        merge = false;
                                }

                                copy = min_t(int, copy, pfrag->size - pfrag->offset);

                                if (!sk_wmem_schedule(sk, copy))
                                        goto wait_for_memory;

                                err = skb_copy_to_page_nocache(sk, from, skb,
                                                               pfrag->page,
                                                               pfrag->offset,
                                                               copy);
                                if (err)
                                        goto do_error;

                                /* Update the skb. */
                                if (merge) {
                                        skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
                                } else {
                                        skb_fill_page_desc(skb, i, pfrag->page,
                                                           pfrag->offset, copy);
                                        get_page(pfrag->page);
                                }
                                pfrag->offset += copy;
                        }

                        if (!copied)
                                TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

                        tp->write_seq += copy;
                        TCP_SKB_CB(skb)->end_seq += copy;
                        skb_shinfo(skb)->gso_segs = 0;

                        from += copy;
                        copied += copy;
                        if ((seglen -= copy) == 0 && iovlen == 0)
                                goto out;

                        if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
                                continue;

                        if (forced_push(tp)) {
                                tcp_mark_push(tp, skb);
                                __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
                        } else if (skb == tcp_send_head(sk))
                                tcp_push_one(sk, mss_now);
                        continue;

wait_for_sndbuf:
                        set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
                        if (copied)
                                tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);

                        if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
                                goto do_error;

                        mss_now = tcp_send_mss(sk, &size_goal, flags);
                }
        }

out:
        if (copied)
                tcp_push(sk, flags, mss_now, tp->nonagle);
        release_sock(sk);
        return copied + copied_syn;

do_fault:
        if (!skb->len) {
                tcp_unlink_write_queue(skb, sk);
                /* It is the one place in all of TCP, except connection
                 * reset, where we can be unlinking the send_head.
                 */
                tcp_check_send_head(sk, skb);
                sk_wmem_free_skb(sk, skb);
        }

do_error:
        if (copied + copied_syn)
                goto out;
out_err:
        err = sk_stream_error(sk, flags, err);
        release_sock(sk);
        return err;
}
EXPORT_SYMBOL(tcp_sendmsg);

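/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this: even if the socket is closed via shutdown, the OOB byte
 *	may still be read.
 */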
static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* No URG data to read. */
        if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
            tp->urg_data == TCP_URG_READ)
                return -EINVAL;

        if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
                return -ENOTCONN;

        if (tp->urg_data & TCP_URG_VALID) {
                int err = 0;
                char c = tp->urg_data;

                if (!(flags & MSG_PEEK))
                        tp->urg_data = TCP_URG_READ;

                /* Read urgent data. */
                msg->msg_flags |= MSG_OOB;

                if (len > 0) {
                        if (!(flags & MSG_TRUNC))
                                err = memcpy_toiovec(msg->msg_iov, &c, 1);
                        len = 1;
                } else
                        msg->msg_flags |= MSG_TRUNC;

                return err ? -EFAULT : len;
        }

        if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
                return 0;

        /* Per the BSD recv(..., MSG_OOB) behaviour this call never blocks,
         * independent of the blocking state of the socket.
         */
        return -EAGAIN;
}

static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
{
        struct sk_buff *skb;
        int copied = 0, err = 0;

        skb_queue_walk(&sk->sk_write_queue, skb) {
                err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len);
                if (err)
                        break;

                copied += skb->len;
        }

        return err ?: copied;
}

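/* Clean up the receive buffer for full frames taken by the user,
 * then send an ACK if necessary.  COPIED is the number of bytes
 * tcp_recvmsg has given to the user so far; it speeds up the
 * calculation of whether or not we must ACK for the sake of
 * a window update.
 */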
void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
        struct tcp_sock *tp = tcp_sk(sk);
        bool time_to_ack = false;

        struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

        WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
             "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
             tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);

        if (inet_csk_ack_scheduled(sk)) {
                const struct inet_connection_sock *icsk = inet_csk(sk);
                /* Delayed ACKs frequently hit locked sockets during bulk
                 * receive.
                 */
                if (icsk->icsk_ack.blocked ||
                    /* Once-per-two-segments ACK was not sent by tcp_input.c */
                    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
                    /* If this read emptied the read buffer, we send an ACK
                     * when the connection is not bidirectional, the user
                     * drained the receive buffer and there was a small
                     * segment in the queue.
                     */
                    (copied > 0 &&
                     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
                      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
                       !icsk->icsk_ack.pingpong)) &&
                      !atomic_read(&sk->sk_rmem_alloc)))
                        time_to_ack = true;
        }

        /* We also send an ACK if we can now advertise a much larger window;
         * this keeps the peer's send window from stalling after the reader
         * has drained the receive buffer.
         */
        if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
                __u32 rcv_window_now = tcp_receive_window(tp);

                /* Optimize, __tcp_select_window() is not cheap. */
                if (2*rcv_window_now <= tp->window_clamp) {
                        __u32 new_window = __tcp_select_window(sk);

                        /* Send ACK now, if this read freed lots of space
                         * in our buffer.  "Lots" means "at least twice" here.
                         */
                        if (new_window && new_window >= 2 * rcv_window_now)
                                time_to_ack = true;
                }
        }
        if (time_to_ack)
                tcp_send_ack(sk);
}

static void tcp_prequeue_process(struct sock *sk)
{
        struct sk_buff *skb;
        struct tcp_sock *tp = tcp_sk(sk);

        NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED);

        /* RX process wants to run with disabled BHs, though it is not
         * necessary.
         */
        local_bh_disable();
        while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
                sk_backlog_rcv(sk, skb);
        local_bh_enable();

        /* Clear memory counter. */
        tp->ucopy.memory = 0;
}

#ifdef CONFIG_NET_DMA
static void tcp_service_net_dma(struct sock *sk, bool wait)
{
        dma_cookie_t done, used;
        dma_cookie_t last_issued;
        struct tcp_sock *tp = tcp_sk(sk);

        if (!tp->ucopy.dma_chan)
                return;

        last_issued = tp->ucopy.dma_cookie;
        dma_async_issue_pending(tp->ucopy.dma_chan);

        do {
                if (dma_async_is_tx_complete(tp->ucopy.dma_chan,
                                              last_issued, &done,
                                              &used) == DMA_SUCCESS) {
                        /* Safe to free early-copied skbs now */
                        __skb_queue_purge(&sk->sk_async_wait_queue);
                        break;
                } else {
                        struct sk_buff *skb;
                        while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
                               (dma_async_is_complete(skb->dma_cookie, done,
                                                      used) == DMA_SUCCESS)) {
                                __skb_dequeue(&sk->sk_async_wait_queue);
                                kfree_skb(skb);
                        }
                }
        } while (wait);
}
#endif

static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
        struct sk_buff *skb;
        u32 offset;

        while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
                offset = seq - TCP_SKB_CB(skb)->seq;
                if (tcp_hdr(skb)->syn)
                        offset--;
                if (offset < skb->len || tcp_hdr(skb)->fin) {
                        *off = offset;
                        return skb;
                }
                /* This looks weird, but this can happen if TCP collapsing
                 * splitted a fat GRO packet, while we released socket lock
                 * in skb_splice_bits().
                 */
                sk_eat_skb(sk, skb, false);
        }
        return NULL;
}

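/*
 * This routine provides an alternative to tcp_recvmsg() for routines
 * that would like to handle copying from skbuffs directly in 'sendfile'
 * fashion.
 * Note:
 *	- It is assumed that the socket was locked by the caller.
 *	- The routine does not block.
 *	- At present, there is no support for reading OOB data
 *	  or for urgent data in MSG_OOB mode.
 */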
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
                  sk_read_actor_t recv_actor)
{
        struct sk_buff *skb;
        struct tcp_sock *tp = tcp_sk(sk);
        u32 seq = tp->copied_seq;
        u32 offset;
        int copied = 0;

        if (sk->sk_state == TCP_LISTEN)
                return -ENOTCONN;
        while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
                if (offset < skb->len) {
                        int used;
                        size_t len;

                        len = skb->len - offset;
                        /* Stop reading if we hit a patch of urgent data */
                        if (tp->urg_data) {
                                u32 urg_offset = tp->urg_seq - seq;
                                if (urg_offset < len)
                                        len = urg_offset;
                                if (!len)
                                        break;
                        }
                        used = recv_actor(desc, skb, offset, len);
                        if (used <= 0) {
                                if (!copied)
                                        copied = used;
                                break;
                        } else if (used <= len) {
                                seq += used;
                                copied += used;
                                offset += used;
                        }
                        /* If recv_actor drops the lock (e.g. TCP splice
                         * receive) the skb pointer might be invalid when
                         * getting here: tcp_collapse might have deleted it
                         * while aggregating skbs from the socket queue.
                         */
                        skb = tcp_recv_skb(sk, seq - 1, &offset);
                        if (!skb)
                                break;
                        /* TCP coalescing might have appended data to the skb.
                         * Only move on to the FIN check once this skb has
                         * been fully consumed.
                         */
                        if (offset + 1 != skb->len)
                                continue;
                }
                if (tcp_hdr(skb)->fin) {
                        sk_eat_skb(sk, skb, false);
                        ++seq;
                        break;
                }
                sk_eat_skb(sk, skb, false);
                if (!desc->count)
                        break;
                tp->copied_seq = seq;
        }
        tp->copied_seq = seq;

        tcp_rcv_space_adjust(sk);

        /* Clean up data we have read: this will do ACK frames. */
        if (copied > 0) {
                tcp_recv_skb(sk, seq, &offset);
                tcp_cleanup_rbuf(sk, copied);
        }
        return copied;
}
EXPORT_SYMBOL(tcp_read_sock);

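/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	The socket is locked for the duration, so *seq access order and
 *	skb->users games are not required.  Handles MSG_PEEK, urgent data,
 *	the prequeue and (optionally) NET_DMA early copies.
 */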
int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                size_t len, int nonblock, int flags, int *addr_len)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int copied = 0;
        u32 peek_seq;
        u32 *seq;
        unsigned long used;
        int err;
        int target;		/* Read at least this many bytes */
        long timeo;
        struct task_struct *user_recv = NULL;
        bool copied_early = false;
        struct sk_buff *skb;
        u32 urg_hole = 0;

        if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
            (sk->sk_state == TCP_ESTABLISHED))
                sk_busy_loop(sk, nonblock);

        lock_sock(sk);

        err = -ENOTCONN;
        if (sk->sk_state == TCP_LISTEN)
                goto out;

        timeo = sock_rcvtimeo(sk, nonblock);

        /* Urgent data needs to be handled specially. */
        if (flags & MSG_OOB)
                goto recv_urg;

        if (unlikely(tp->repair)) {
                err = -EPERM;
                if (!(flags & MSG_PEEK))
                        goto out;

                if (tp->repair_queue == TCP_SEND_QUEUE)
                        goto recv_sndq;

                err = -EINVAL;
                if (tp->repair_queue == TCP_NO_QUEUE)
                        goto out;

                /* 'common' recv queue MSG_PEEK-ing */
        }

        seq = &tp->copied_seq;
        if (flags & MSG_PEEK) {
                peek_seq = tp->copied_seq;
                seq = &peek_seq;
        }

        target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);

#ifdef CONFIG_NET_DMA
        tp->ucopy.dma_chan = NULL;
        preempt_disable();
        skb = skb_peek_tail(&sk->sk_receive_queue);
        {
                int available = 0;

                if (skb)
                        available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
                if ((available < target) &&
                    (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
                    !sysctl_tcp_low_latency &&
                    net_dma_find_channel()) {
                        preempt_enable_no_resched();
                        tp->ucopy.pinned_list =
                                        dma_pin_iovec_pages(msg->msg_iov, len);
                } else {
                        preempt_enable_no_resched();
                }
        }
#endif

        do {
                u32 offset;

                /* Are we at urgent data?  Stop if we have read anything or
                 * have a signal pending.
                 */
                if (tp->urg_data && tp->urg_seq == *seq) {
                        if (copied)
                                break;
                        if (signal_pending(current)) {
                                copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
                                break;
                        }
                }

                /* Next get a buffer. */

                skb_queue_walk(&sk->sk_receive_queue, skb) {
                        /* Now that we have two receive queues this
                         * shouldn't happen.
                         */
                        if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
                                 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
                                 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
                                 flags))
                                break;

                        offset = *seq - TCP_SKB_CB(skb)->seq;
                        if (tcp_hdr(skb)->syn)
                                offset--;
                        if (offset < skb->len)
                                goto found_ok_skb;
                        if (tcp_hdr(skb)->fin)
                                goto found_fin_ok;
                        WARN(!(flags & MSG_PEEK),
                             "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
                             *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
                }

                /* Well, if we have backlog, try to process it now yet. */

                if (copied >= target && !sk->sk_backlog.tail)
                        break;

                if (copied) {
                        if (sk->sk_err ||
                            sk->sk_state == TCP_CLOSE ||
                            (sk->sk_shutdown & RCV_SHUTDOWN) ||
                            !timeo ||
                            signal_pending(current))
                                break;
                } else {
                        if (sock_flag(sk, SOCK_DONE))
                                break;

                        if (sk->sk_err) {
                                copied = sock_error(sk);
                                break;
                        }

                        if (sk->sk_shutdown & RCV_SHUTDOWN)
                                break;

                        if (sk->sk_state == TCP_CLOSE) {
                                if (!sock_flag(sk, SOCK_DONE)) {
                                        /* This occurs when user tries to read
                                         * from never connected socket.
                                         */
                                        copied = -ENOTCONN;
                                        break;
                                }
                                break;
                        }

                        if (!timeo) {
                                copied = -EAGAIN;
                                break;
                        }

                        if (signal_pending(current)) {
                                copied = sock_intr_errno(timeo);
                                break;
                        }
                }

                tcp_cleanup_rbuf(sk, copied);

                if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
                        /* Install new reader */
                        if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
                                user_recv = current;
                                tp->ucopy.task = user_recv;
                                tp->ucopy.iov = msg->msg_iov;
                        }

                        tp->ucopy.len = len;

                        WARN_ON(tp->copied_seq != tp->rcv_nxt &&
                                !(flags & (MSG_PEEK | MSG_TRUNC)));

                        /* We have to process the prequeue here, before
                         * releasing the socket: the queues (packets in
                         * flight, backlog, prequeue, receive_queue) must be
                         * drained in order, and the prequeue may be non-empty
                         * after the socket was released and reacquired on a
                         * previous iteration.
                         */
                        if (!skb_queue_empty(&tp->ucopy.prequeue))
                                goto do_prequeue;
                }

#ifdef CONFIG_NET_DMA
                if (tp->ucopy.dma_chan) {
                        if (tp->rcv_wnd == 0 &&
                            !skb_queue_empty(&sk->sk_async_wait_queue)) {
                                tcp_service_net_dma(sk, true);
                                tcp_cleanup_rbuf(sk, copied);
                        } else
                                dma_async_issue_pending(tp->ucopy.dma_chan);
                }
#endif
                if (copied >= target) {
                        /* Do not sleep, just process backlog. */
                        release_sock(sk);
                        lock_sock(sk);
                } else
                        sk_wait_data(sk, &timeo);

#ifdef CONFIG_NET_DMA
                tcp_service_net_dma(sk, false);  /* Don't block */
                tp->ucopy.wakeup = 0;
#endif

                if (user_recv) {
                        int chunk;

                        if ((chunk = len - tp->ucopy.len) != 0) {
                                NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
                                len -= chunk;
                                copied += chunk;
                        }

                        if (tp->rcv_nxt == tp->copied_seq &&
                            !skb_queue_empty(&tp->ucopy.prequeue)) {
do_prequeue:
                                tcp_prequeue_process(sk);

                                if ((chunk = len - tp->ucopy.len) != 0) {
                                        NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
                                        len -= chunk;
                                        copied += chunk;
                                }
                        }
                }
                if ((flags & MSG_PEEK) &&
                    (peek_seq - copied - urg_hole != tp->copied_seq)) {
                        net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
                                            current->comm,
                                            task_pid_nr(current));
                        peek_seq = tp->copied_seq;
                }
                continue;

        found_ok_skb:
                /* Ok so how much can we use? */
                used = skb->len - offset;
                if (len < used)
                        used = len;

                /* Do we have urgent data here? */
                if (tp->urg_data) {
                        u32 urg_offset = tp->urg_seq - *seq;
                        if (urg_offset < used) {
                                if (!urg_offset) {
                                        if (!sock_flag(sk, SOCK_URGINLINE)) {
                                                ++*seq;
                                                urg_hole++;
                                                offset++;
                                                used--;
                                                if (!used)
                                                        goto skip_copy;
                                        }
                                } else
                                        used = urg_offset;
                        }
                }

                if (!(flags & MSG_TRUNC)) {
#ifdef CONFIG_NET_DMA
                        if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
                                tp->ucopy.dma_chan = net_dma_find_channel();

                        if (tp->ucopy.dma_chan) {
                                tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
                                        tp->ucopy.dma_chan, skb, offset,
                                        msg->msg_iov, used,
                                        tp->ucopy.pinned_list);

                                if (tp->ucopy.dma_cookie < 0) {

                                        pr_alert("%s: dma_cookie < 0\n",
                                                 __func__);

                                        /* Exception. Bailout! */
                                        if (!copied)
                                                copied = -EFAULT;
                                        break;
                                }

                                dma_async_issue_pending(tp->ucopy.dma_chan);

                                if ((offset + used) == skb->len)
                                        copied_early = true;

                        } else
#endif
                        {
                                err = skb_copy_datagram_iovec(skb, offset,
                                                msg->msg_iov, used);
                                if (err) {
                                        /* Exception. Bailout! */
                                        if (!copied)
                                                copied = -EFAULT;
                                        break;
                                }
                        }
                }

                *seq += used;
                copied += used;
                len -= used;

                tcp_rcv_space_adjust(sk);

skip_copy:
                if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
                        tp->urg_data = 0;
                        tcp_fast_path_check(sk);
                }
                if (used + offset < skb->len)
                        continue;

                if (tcp_hdr(skb)->fin)
                        goto found_fin_ok;
                if (!(flags & MSG_PEEK)) {
                        sk_eat_skb(sk, skb, copied_early);
                        copied_early = false;
                }
                continue;

        found_fin_ok:
                /* Process the FIN. */
                ++*seq;
                if (!(flags & MSG_PEEK)) {
                        sk_eat_skb(sk, skb, copied_early);
                        copied_early = false;
                }
                break;
        } while (len > 0);

        if (user_recv) {
                if (!skb_queue_empty(&tp->ucopy.prequeue)) {
                        int chunk;

                        tp->ucopy.len = copied > 0 ? len : 0;

                        tcp_prequeue_process(sk);

                        if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
                                NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
                                len -= chunk;
                                copied += chunk;
                        }
                }

                tp->ucopy.task = NULL;
                tp->ucopy.len = 0;
        }

#ifdef CONFIG_NET_DMA
        tcp_service_net_dma(sk, true);	/* Wait for queue to drain */
        tp->ucopy.dma_chan = NULL;

        if (tp->ucopy.pinned_list) {
                dma_unpin_iovec_pages(tp->ucopy.pinned_list);
                tp->ucopy.pinned_list = NULL;
        }
#endif

        /* msg_name/msg_namelen are ignored on a connected socket. */

        /* Clean up data we have read: this will do ACK frames. */
        tcp_cleanup_rbuf(sk, copied);

        release_sock(sk);
        return copied;

out:
        release_sock(sk);
        return err;

recv_urg:
        err = tcp_recv_urg(sk, msg, len, flags);
        goto out;

recv_sndq:
        err = tcp_peek_sndq(sk, msg, len);
        goto out;
}
EXPORT_SYMBOL(tcp_recvmsg);

void tcp_set_state(struct sock *sk, int state)
{
        int oldstate = sk->sk_state;

        switch (state) {
        case TCP_ESTABLISHED:
                if (oldstate != TCP_ESTABLISHED)
                        TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
                break;

        case TCP_CLOSE:
                if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
                        TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);

                sk->sk_prot->unhash(sk);
                if (inet_csk(sk)->icsk_bind_hash &&
                    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
                        inet_put_port(sk);
                /* fall through */
        default:
                if (oldstate == TCP_ESTABLISHED)
                        TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
        }

        /* Change state AFTER socket is unhashed to avoid closed
         * socket sitting in hash tables.
         */
        sk->sk_state = state;

#ifdef STATE_TRACE
        SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
#endif
}
EXPORT_SYMBOL_GPL(tcp_set_state);

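/*
 *	State processing on a close. This implements the state shift for
 *	sending our FIN frame. Note that we only send a FIN for some
 *	states. A shutdown() may have already sent the FIN, or we may be
 *	closed, so we don't send any more.
 *
 *	Indexed by the current (pre-close) sk_state; the low bits give the
 *	next state and TCP_ACTION_FIN flags whether a FIN must be sent.
 */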
static const unsigned char new_state[16] = {
  /* (Invalid)		*/ TCP_CLOSE,
  /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  /* TCP_SYN_SENT	*/ TCP_CLOSE,
  /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
  /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
  /* TCP_TIME_WAIT	*/ TCP_CLOSE,
  /* TCP_CLOSE		*/ TCP_CLOSE,
  /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK | TCP_ACTION_FIN,
  /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
  /* TCP_LISTEN		*/ TCP_CLOSE,
  /* TCP_CLOSING	*/ TCP_CLOSING,
};

static int tcp_close_state(struct sock *sk)
{
        int next = (int)new_state[sk->sk_state];
        int ns = next & TCP_STATE_MASK;

        tcp_set_state(sk, ns);

        return next & TCP_ACTION_FIN;
}

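/*
 *	Shutdown the sending side of a connection. Much like close except
 *	that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
 */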
void tcp_shutdown(struct sock *sk, int how)
{
        /* We need to grab some memory, and put together a FIN,
         * and then put it into the queue to be sent.
         */
        if (!(how & SEND_SHUTDOWN))
                return;

        /* If we've already sent a FIN, or it's a closed state, skip this. */
        if ((1 << sk->sk_state) &
            (TCPF_ESTABLISHED | TCPF_SYN_SENT |
             TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
                /* Clear out any half completed packets.  FIN if needed. */
                if (tcp_close_state(sk))
                        tcp_send_fin(sk);
        }
}
EXPORT_SYMBOL(tcp_shutdown);

bool tcp_check_oom(struct sock *sk, int shift)
{
        bool too_many_orphans, out_of_socket_memory;

        too_many_orphans = tcp_too_many_orphans(sk, shift);
        out_of_socket_memory = tcp_out_of_memory(sk);

        if (too_many_orphans)
                net_info_ratelimited("too many orphaned sockets\n");
        if (out_of_socket_memory)
                net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
        return too_many_orphans || out_of_socket_memory;
}

void tcp_close(struct sock *sk, long timeout)
{
        struct sk_buff *skb;
        int data_was_unread = 0;
        int state;

        lock_sock(sk);
        sk->sk_shutdown = SHUTDOWN_MASK;

        if (sk->sk_state == TCP_LISTEN) {
                tcp_set_state(sk, TCP_CLOSE);

                /* Special case. */
                inet_csk_listen_stop(sk);

                goto adjudge_to_death;
        }

        /* We need to flush the recv. buffs.  We do this only on the
         * descriptor close, not protocol-sourced closes, because the
         * reader process may not have drained the data yet!
         */
        while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
                u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
                          tcp_hdr(skb)->fin;
                data_was_unread += len;
                __kfree_skb(skb);
        }

        sk_mem_reclaim(sk);

        /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
        if (sk->sk_state == TCP_CLOSE)
                goto adjudge_to_death;

        /* As outlined in RFC 2525, section 2.17, we send a RST here because
         * data was lost: sending a FIN instead would let the peer sit on the
         * unread data forever.  Note: timeout is always zero in such a case.
         */
        if (unlikely(tcp_sk(sk)->repair)) {
                sk->sk_prot->disconnect(sk, 0);
        } else if (data_was_unread) {
                /* Unread data was tossed, zap the connection. */
                NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
                tcp_set_state(sk, TCP_CLOSE);
                tcp_send_active_reset(sk, sk->sk_allocation);
        } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
                /* Check zero linger _after_ checking for unread data. */
                sk->sk_prot->disconnect(sk, 0);
                NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
        } else if (tcp_close_state(sk)) {
                /* We FIN if the application ate all the data before
                 * zapping the connection.
                 */
                tcp_send_fin(sk);
        }

        sk_stream_wait_close(sk, timeout);

adjudge_to_death:
        state = sk->sk_state;
        sock_hold(sk);
        sock_orphan(sk);

        /* It is the last release_sock in its life. It will remove backlog. */
        release_sock(sk);

        /* Now socket is owned by kernel and we acquire BH lock
         * to finish close. No need to check for user refs.
         */
        local_bh_disable();
        bh_lock_sock(sk);
        WARN_ON(sock_owned_by_user(sk));

        percpu_counter_inc(sk->sk_prot->orphan_count);

        /* Have we already been destroyed by a softirq or backlog? */
        if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
                goto out;

        /* If we are in FIN_WAIT2, the peer may never send the FIN that
         * would let this orphan die.  Apply the linger2 policy: reset the
         * connection, arm the keepalive timer, or go straight to TIME-WAIT.
         */
        if (sk->sk_state == TCP_FIN_WAIT2) {
                struct tcp_sock *tp = tcp_sk(sk);
                if (tp->linger2 < 0) {
                        tcp_set_state(sk, TCP_CLOSE);
                        tcp_send_active_reset(sk, GFP_ATOMIC);
                        NET_INC_STATS_BH(sock_net(sk),
                                        LINUX_MIB_TCPABORTONLINGER);
                } else {
                        const int tmo = tcp_fin_time(sk);

                        if (tmo > TCP_TIMEWAIT_LEN) {
                                inet_csk_reset_keepalive_timer(sk,
                                                tmo - TCP_TIMEWAIT_LEN);
                        } else {
                                tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
                                goto out;
                        }
                }
        }
        if (sk->sk_state != TCP_CLOSE) {
                sk_mem_reclaim(sk);
                if (tcp_check_oom(sk, 0)) {
                        tcp_set_state(sk, TCP_CLOSE);
                        tcp_send_active_reset(sk, GFP_ATOMIC);
                        NET_INC_STATS_BH(sock_net(sk),
                                        LINUX_MIB_TCPABORTONMEMORY);
                }
        }

        if (sk->sk_state == TCP_CLOSE) {
                struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
                /* We could get here with a non-NULL req if the socket is
                 * created only from a listener, so read the fastopen_rsk
                 * lock-free.
                 */
                if (req != NULL)
                        reqsk_fastopen_remove(sk, req, false);
                inet_csk_destroy_sock(sk);
        }
        /* Otherwise, socket is reprieved until protocol close. */

out:
        bh_unlock_sock(sk);
        local_bh_enable();
        sock_put(sk);
}
EXPORT_SYMBOL(tcp_close);

/* These states need RST on ABORT according to RFC793 */

static inline bool tcp_need_reset(int state)
{
        return (1 << state) &
               (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
                TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
}

int tcp_disconnect(struct sock *sk, int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        int err = 0;
        int old_state = sk->sk_state;

        if (old_state != TCP_CLOSE)
                tcp_set_state(sk, TCP_CLOSE);

        /* ABORT function of RFC793 */
        if (old_state == TCP_LISTEN) {
                inet_csk_listen_stop(sk);
        } else if (unlikely(tp->repair)) {
                sk->sk_err = ECONNABORTED;
        } else if (tcp_need_reset(old_state) ||
                   (tp->snd_nxt != tp->write_seq &&
                    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
                /* The last check adjusts for discrepancy of Linux wrt. RFC
                 * states.
                 */
                tcp_send_active_reset(sk, gfp_any());
                sk->sk_err = ECONNRESET;
        } else if (old_state == TCP_SYN_SENT)
                sk->sk_err = ECONNRESET;

        tcp_clear_xmit_timers(sk);
        __skb_queue_purge(&sk->sk_receive_queue);
        tcp_write_queue_purge(sk);
        __skb_queue_purge(&tp->out_of_order_queue);
#ifdef CONFIG_NET_DMA
        __skb_queue_purge(&sk->sk_async_wait_queue);
#endif

        inet->inet_dport = 0;

        if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
                inet_reset_saddr(sk);

        sk->sk_shutdown = 0;
        sock_reset_flag(sk, SOCK_DONE);
        tp->srtt = 0;
        if ((tp->write_seq += tp->max_window + 2) == 0)
                tp->write_seq = 1;
        icsk->icsk_backoff = 0;
        tp->snd_cwnd = 2;
        icsk->icsk_probes_out = 0;
        tp->packets_out = 0;
        tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
        tp->snd_cwnd_cnt = 0;
        tp->window_clamp = 0;
        tcp_set_ca_state(sk, TCP_CA_Open);
        tcp_clear_retrans(tp);
        inet_csk_delack_init(sk);
        tcp_init_send_head(sk);
        memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
        __sk_dst_reset(sk);

        WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);

        sk->sk_error_report(sk);
        return err;
}
EXPORT_SYMBOL(tcp_disconnect);

void tcp_sock_destruct(struct sock *sk)
{
        inet_sock_destruct(sk);

        kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
}

static inline bool tcp_can_repair_sock(const struct sock *sk)
{
        return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
                ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
}

static int tcp_repair_options_est(struct tcp_sock *tp,
                struct tcp_repair_opt __user *optbuf, unsigned int len)
{
        struct tcp_repair_opt opt;

        while (len >= sizeof(opt)) {
                if (copy_from_user(&opt, optbuf, sizeof(opt)))
                        return -EFAULT;

                optbuf++;
                len -= sizeof(opt);

                switch (opt.opt_code) {
                case TCPOPT_MSS:
                        tp->rx_opt.mss_clamp = opt.opt_val;
                        break;
                case TCPOPT_WINDOW:
                        {
                                u16 snd_wscale = opt.opt_val & 0xFFFF;
                                u16 rcv_wscale = opt.opt_val >> 16;

                                if (snd_wscale > 14 || rcv_wscale > 14)
                                        return -EFBIG;

                                tp->rx_opt.snd_wscale = snd_wscale;
                                tp->rx_opt.rcv_wscale = rcv_wscale;
                                tp->rx_opt.wscale_ok = 1;
                        }
                        break;
                case TCPOPT_SACK_PERM:
                        if (opt.opt_val != 0)
                                return -EINVAL;

                        tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
                        if (sysctl_tcp_fack)
                                tcp_enable_fack(tp);
                        break;
                case TCPOPT_TIMESTAMP:
                        if (opt.opt_val != 0)
                                return -EINVAL;

                        tp->rx_opt.tstamp_ok = 1;
                        break;
                }
        }

        return 0;
}

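/*
 *	Socket option code for TCP.
 *
 *	Illustrative userspace usage (assumes the usual <netinet/tcp.h>
 *	option names):
 *
 *		int one = 1;
 *		setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
 */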
static int do_tcp_setsockopt(struct sock *sk, int level,
                int optname, char __user *optval, unsigned int optlen)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
        int val;
        int err = 0;

        /* These are data/string values, all the others are ints */
        switch (optname) {
        case TCP_CONGESTION: {
                char name[TCP_CA_NAME_MAX];

                if (optlen < 1)
                        return -EINVAL;

                val = strncpy_from_user(name, optval,
                                        min_t(long, TCP_CA_NAME_MAX-1, optlen));
                if (val < 0)
                        return -EFAULT;
                name[val] = 0;

                lock_sock(sk);
                err = tcp_set_congestion_control(sk, name);
                release_sock(sk);
                return err;
        }
        default:
                /* fallthru */
                break;
        }

        if (optlen < sizeof(int))
                return -EINVAL;

        if (get_user(val, (int __user *)optval))
                return -EFAULT;

        lock_sock(sk);

        switch (optname) {
        case TCP_MAXSEG:
                /* Values greater than interface MTU won't take effect. However
                 * at the point when this call is done we typically don't yet
                 * know which interface is going to be used.
                 */
                if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
                        err = -EINVAL;
                        break;
                }
                tp->rx_opt.user_mss = val;
                break;

        case TCP_NODELAY:
                if (val) {
                        /* TCP_NODELAY is weaker than TCP_CORK, so that
                         * this option on corked socket is remembered, but
                         * it is not activated until cork is cleared.
                         *
                         * However, when TCP_NODELAY is set we make
                         * an explicit push, which overrides even TCP_CORK
                         * for currently queued segments.
                         */
                        tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
                        tcp_push_pending_frames(sk);
                } else {
                        tp->nonagle &= ~TCP_NAGLE_OFF;
                }
                break;

        case TCP_THIN_LINEAR_TIMEOUTS:
                if (val < 0 || val > 1)
                        err = -EINVAL;
                else
                        tp->thin_lto = val;
                break;

        case TCP_THIN_DUPACK:
                if (val < 0 || val > 1)
                        err = -EINVAL;
                else
                        tp->thin_dupack = val;
                if (tp->thin_dupack)
                        tcp_disable_early_retrans(tp);
                break;

        case TCP_REPAIR:
                if (!tcp_can_repair_sock(sk))
                        err = -EPERM;
                else if (val == 1) {
                        tp->repair = 1;
                        sk->sk_reuse = SK_FORCE_REUSE;
                        tp->repair_queue = TCP_NO_QUEUE;
                } else if (val == 0) {
                        tp->repair = 0;
                        sk->sk_reuse = SK_NO_REUSE;
                        tcp_send_window_probe(sk);
                } else
                        err = -EINVAL;

                break;

        case TCP_REPAIR_QUEUE:
                if (!tp->repair)
                        err = -EPERM;
                else if (val < TCP_QUEUES_NR)
                        tp->repair_queue = val;
                else
                        err = -EINVAL;
                break;

        case TCP_QUEUE_SEQ:
                if (sk->sk_state != TCP_CLOSE)
                        err = -EPERM;
                else if (tp->repair_queue == TCP_SEND_QUEUE)
                        tp->write_seq = val;
                else if (tp->repair_queue == TCP_RECV_QUEUE)
                        tp->rcv_nxt = val;
                else
                        err = -EINVAL;
                break;

        case TCP_REPAIR_OPTIONS:
                if (!tp->repair)
                        err = -EINVAL;
                else if (sk->sk_state == TCP_ESTABLISHED)
                        err = tcp_repair_options_est(tp,
                                        (struct tcp_repair_opt __user *)optval,
                                        optlen);
                else
                        err = -EPERM;
                break;

        case TCP_CORK:
                /* When set indicates to always queue non-full frames.
                 * Later the user clears this option and we transmit
                 * any pending partial frames in the queue.  This is
                 * meant to be used alongside sendfile() to get properly
                 * filled frames when the user (for example) must write
                 * out headers with a write() call first and then use
                 * sendfile to send out the data parts.
                 *
                 * TCP_CORK can be set together with TCP_NODELAY and it is
                 * stronger than TCP_NODELAY.
                 */
                if (val) {
                        tp->nonagle |= TCP_NAGLE_CORK;
                } else {
                        tp->nonagle &= ~TCP_NAGLE_CORK;
                        if (tp->nonagle&TCP_NAGLE_OFF)
                                tp->nonagle |= TCP_NAGLE_PUSH;
                        tcp_push_pending_frames(sk);
                }
                break;

        case TCP_KEEPIDLE:
                if (val < 1 || val > MAX_TCP_KEEPIDLE)
                        err = -EINVAL;
                else {
                        tp->keepalive_time = val * HZ;
                        if (sock_flag(sk, SOCK_KEEPOPEN) &&
                            !((1 << sk->sk_state) &
                              (TCPF_CLOSE | TCPF_LISTEN))) {
                                u32 elapsed = keepalive_time_elapsed(tp);
                                if (tp->keepalive_time > elapsed)
                                        elapsed = tp->keepalive_time - elapsed;
                                else
                                        elapsed = 0;
                                inet_csk_reset_keepalive_timer(sk, elapsed);
                        }
                }
                break;
        case TCP_KEEPINTVL:
                if (val < 1 || val > MAX_TCP_KEEPINTVL)
                        err = -EINVAL;
                else
                        tp->keepalive_intvl = val * HZ;
                break;
        case TCP_KEEPCNT:
                if (val < 1 || val > MAX_TCP_KEEPCNT)
                        err = -EINVAL;
                else
                        tp->keepalive_probes = val;
                break;
        case TCP_SYNCNT:
                if (val < 1 || val > MAX_TCP_SYNCNT)
                        err = -EINVAL;
                else
                        icsk->icsk_syn_retries = val;
                break;

        case TCP_LINGER2:
                if (val < 0)
                        tp->linger2 = -1;
                else if (val > sysctl_tcp_fin_timeout / HZ)
                        tp->linger2 = 0;
                else
                        tp->linger2 = val * HZ;
                break;

        case TCP_DEFER_ACCEPT:
                /* Translate value in seconds to number of retransmits */
                icsk->icsk_accept_queue.rskq_defer_accept =
                        secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
                                        TCP_RTO_MAX / HZ);
                break;

        case TCP_WINDOW_CLAMP:
                if (!val) {
                        if (sk->sk_state != TCP_CLOSE) {
                                err = -EINVAL;
                                break;
                        }
                        tp->window_clamp = 0;
                } else
                        tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
                                                SOCK_MIN_RCVBUF / 2 : val;
                break;

        case TCP_QUICKACK:
                if (!val) {
                        icsk->icsk_ack.pingpong = 1;
                } else {
                        icsk->icsk_ack.pingpong = 0;
                        if ((1 << sk->sk_state) &
                            (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
                            inet_csk_ack_scheduled(sk)) {
                                icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
                                tcp_cleanup_rbuf(sk, 1);
                                if (!(val & 1))
                                        icsk->icsk_ack.pingpong = 1;
                        }
                }
                break;

#ifdef CONFIG_TCP_MD5SIG
        case TCP_MD5SIG:
                /* Read the IP->Key mappings from userspace */
                err = tp->af_specific->md5_parse(sk, optval, optlen);
                break;
#endif
        case TCP_USER_TIMEOUT:
                /* Cap the max time in ms TCP will retry or probe the window
                 * before giving up and aborting (ETIMEDOUT) a connection.
                 */
                if (val < 0)
                        err = -EINVAL;
                else
                        icsk->icsk_user_timeout = msecs_to_jiffies(val);
                break;

        case TCP_FASTOPEN:
                if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
                    TCPF_LISTEN)))
                        err = fastopen_init_queue(sk, val);
                else
                        err = -EINVAL;
                break;
        case TCP_TIMESTAMP:
                if (!tp->repair)
                        err = -EPERM;
                else
                        tp->tsoffset = val - tcp_time_stamp;
                break;
        default:
                err = -ENOPROTOOPT;
                break;
        }

        release_sock(sk);
        return err;
}

int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
                   unsigned int optlen)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);

        if (level != SOL_TCP)
                return icsk->icsk_af_ops->setsockopt(sk, level, optname,
                                                     optval, optlen);
        return do_tcp_setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(tcp_setsockopt);

#ifdef CONFIG_COMPAT
int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
                          char __user *optval, unsigned int optlen)
{
        if (level != SOL_TCP)
                return inet_csk_compat_setsockopt(sk, level, optname,
                                                  optval, optlen);
        return do_tcp_setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_tcp_setsockopt);
#endif

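/* Return information about state of tcp endpoint in API format. */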
void tcp_get_info(const struct sock *sk, struct tcp_info *info)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        const struct inet_connection_sock *icsk = inet_csk(sk);
        u32 now = tcp_time_stamp;

        memset(info, 0, sizeof(*info));

        info->tcpi_state = sk->sk_state;
        info->tcpi_ca_state = icsk->icsk_ca_state;
        info->tcpi_retransmits = icsk->icsk_retransmits;
        info->tcpi_probes = icsk->icsk_probes_out;
        info->tcpi_backoff = icsk->icsk_backoff;

        if (tp->rx_opt.tstamp_ok)
                info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
        if (tcp_is_sack(tp))
                info->tcpi_options |= TCPI_OPT_SACK;
        if (tp->rx_opt.wscale_ok) {
                info->tcpi_options |= TCPI_OPT_WSCALE;
                info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
                info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
        }

        if (tp->ecn_flags & TCP_ECN_OK)
                info->tcpi_options |= TCPI_OPT_ECN;
        if (tp->ecn_flags & TCP_ECN_SEEN)
                info->tcpi_options |= TCPI_OPT_ECN_SEEN;
        if (tp->syn_data_acked)
                info->tcpi_options |= TCPI_OPT_SYN_DATA;

        info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
        info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
        info->tcpi_snd_mss = tp->mss_cache;
        info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;

        if (sk->sk_state == TCP_LISTEN) {
                info->tcpi_unacked = sk->sk_ack_backlog;
                info->tcpi_sacked = sk->sk_max_ack_backlog;
        } else {
                info->tcpi_unacked = tp->packets_out;
                info->tcpi_sacked = tp->sacked_out;
        }
        info->tcpi_lost = tp->lost_out;
        info->tcpi_retrans = tp->retrans_out;
        info->tcpi_fackets = tp->fackets_out;

        info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
        info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
        info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);

        info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
        info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
        info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
        info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
        info->tcpi_snd_ssthresh = tp->snd_ssthresh;
        info->tcpi_snd_cwnd = tp->snd_cwnd;
        info->tcpi_advmss = tp->advmss;
        info->tcpi_reordering = tp->reordering;

        info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
        info->tcpi_rcv_space = tp->rcvq_space.space;

        info->tcpi_total_retrans = tp->total_retrans;
}
EXPORT_SYMBOL_GPL(tcp_get_info);

static int do_tcp_getsockopt(struct sock *sk, int level,
		int optname, char __user *optval, int __user *optlen)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int val, len;

	if (get_user(len, optlen))
		return -EFAULT;

	len = min_t(unsigned int, len, sizeof(int));

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case TCP_MAXSEG:
		val = tp->mss_cache;
		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
			val = tp->rx_opt.user_mss;
		if (tp->repair)
			val = tp->rx_opt.mss_clamp;
		break;
	case TCP_NODELAY:
		val = !!(tp->nonagle&TCP_NAGLE_OFF);
		break;
	case TCP_CORK:
		val = !!(tp->nonagle&TCP_NAGLE_CORK);
		break;
	case TCP_KEEPIDLE:
		val = keepalive_time_when(tp) / HZ;
		break;
	case TCP_KEEPINTVL:
		val = keepalive_intvl_when(tp) / HZ;
		break;
	case TCP_KEEPCNT:
		val = keepalive_probes(tp);
		break;
	case TCP_SYNCNT:
		val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
		break;
	case TCP_LINGER2:
		val = tp->linger2;
		if (val >= 0)
			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
		break;
	case TCP_DEFER_ACCEPT:
		val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
				      TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
		break;
	case TCP_WINDOW_CLAMP:
		val = tp->window_clamp;
		break;
	case TCP_INFO: {
		struct tcp_info info;

		if (get_user(len, optlen))
			return -EFAULT;

		tcp_get_info(sk, &info);

		len = min_t(unsigned int, len, sizeof(info));
		if (put_user(len, optlen))
			return -EFAULT;
		if (copy_to_user(optval, &info, len))
			return -EFAULT;
		return 0;
	}
	case TCP_QUICKACK:
		val = !icsk->icsk_ack.pingpong;
		break;

	case TCP_CONGESTION:
		if (get_user(len, optlen))
			return -EFAULT;
		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
		if (put_user(len, optlen))
			return -EFAULT;
		if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
			return -EFAULT;
		return 0;

	case TCP_THIN_LINEAR_TIMEOUTS:
		val = tp->thin_lto;
		break;
	case TCP_THIN_DUPACK:
		val = tp->thin_dupack;
		break;

	case TCP_REPAIR:
		val = tp->repair;
		break;

	case TCP_REPAIR_QUEUE:
		if (tp->repair)
			val = tp->repair_queue;
		else
			return -EINVAL;
		break;

	case TCP_QUEUE_SEQ:
		if (tp->repair_queue == TCP_SEND_QUEUE)
			val = tp->write_seq;
		else if (tp->repair_queue == TCP_RECV_QUEUE)
			val = tp->rcv_nxt;
		else
			return -EINVAL;
		break;

	case TCP_USER_TIMEOUT:
		val = jiffies_to_msecs(icsk->icsk_user_timeout);
		break;
	case TCP_TIMESTAMP:
		val = tcp_time_stamp + tp->tsoffset;
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, &val, len))
		return -EFAULT;
	return 0;
}

int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
		   int __user *optlen)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (level != SOL_TCP)
		return icsk->icsk_af_ops->getsockopt(sk, level, optname,
						     optval, optlen);
	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(tcp_getsockopt);

#ifdef CONFIG_COMPAT
int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	if (level != SOL_TCP)
		return inet_csk_compat_getsockopt(sk, level, optname,
						  optval, optlen);
	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_tcp_getsockopt);
#endif

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool __read_mostly;
static DEFINE_MUTEX(tcp_md5sig_mutex);

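/* TCP-MD5 (RFC 2385) signature pool: one crypto_hash("md5") transform
 * per possible CPU, allocated lazily on first use.  tcp_md5sig_mutex
 * serializes allocation; readers access the pool locklessly via
 * tcp_get_md5sig_pool() with BHs disabled.
 */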
static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu);

		if (p->md5_desc.tfm)
			crypto_free_hash(p->md5_desc.tfm);
	}
	free_percpu(pool);
}

static void __tcp_alloc_md5sig_pool(void)
{
	int cpu;
	struct tcp_md5sig_pool __percpu *pool;

	pool = alloc_percpu(struct tcp_md5sig_pool);
	if (!pool)
		return;

	for_each_possible_cpu(cpu) {
		struct crypto_hash *hash;

		hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
		if (IS_ERR_OR_NULL(hash))
			goto out_free;

		per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash;
	}

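	/* Make sure all per-cpu writes above are visible before the
	 * pool pointer is published; pairs with the ACCESS_ONCE() read
	 * in tcp_get_md5sig_pool().
	 */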
	smp_wmb();
	tcp_md5sig_pool = pool;
	return;
out_free:
	__tcp_free_md5sig_pool(pool);
}

bool tcp_alloc_md5sig_pool(void)
{
	if (unlikely(!tcp_md5sig_pool)) {
		mutex_lock(&tcp_md5sig_mutex);

		if (!tcp_md5sig_pool)
			__tcp_alloc_md5sig_pool();

		mutex_unlock(&tcp_md5sig_mutex);
	}
	return tcp_md5sig_pool != NULL;
}
EXPORT_SYMBOL(tcp_alloc_md5sig_pool);

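/**
 *	tcp_get_md5sig_pool - get md5sig_pool for this user
 *
 *	Returns this cpu's md5sig_pool with bottom halves disabled, or
 *	NULL if no pool has been allocated yet.  On success the caller
 *	must re-enable BHs when done; illustrative usage (assuming the
 *	tcp_put_md5sig_pool() helper from net/tcp.h):
 *
 *		struct tcp_md5sig_pool *hp = tcp_get_md5sig_pool();
 *		if (hp) {
 *			... use &hp->md5_desc ...
 *			tcp_put_md5sig_pool();
 *		}
 */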
struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
{
	struct tcp_md5sig_pool __percpu *p;

	local_bh_disable();
	p = ACCESS_ONCE(tcp_md5sig_pool);
	if (p)
		return __this_cpu_ptr(p);

	local_bh_enable();
	return NULL;
}
EXPORT_SYMBOL(tcp_get_md5sig_pool);

int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
			const struct tcphdr *th)
{
	struct scatterlist sg;
	struct tcphdr hdr;
	int err;

	/* The TCP header must not be modified; hash a local copy
	 * with the checksum field cleared instead.
	 */
	memcpy(&hdr, th, sizeof(hdr));
	hdr.check = 0;

	/* Options are not included in the hash. */
	sg_init_one(&sg, &hdr, sizeof(hdr));
	err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr));
	return err;
}
EXPORT_SYMBOL(tcp_md5_hash_header);

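/* Feed skb payload (everything past header_len) into the MD5 hash:
 * linear head data first, then page fragments, then any frag_list
 * skbs (handled by recursing with header_len == 0).
 */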
int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
			  const struct sk_buff *skb, unsigned int header_len)
{
	struct scatterlist sg;
	const struct tcphdr *tp = tcp_hdr(skb);
	struct hash_desc *desc = &hp->md5_desc;
	unsigned int i;
	const unsigned int head_data_len = skb_headlen(skb) > header_len ?
					   skb_headlen(skb) - header_len : 0;
	const struct skb_shared_info *shi = skb_shinfo(skb);
	struct sk_buff *frag_iter;

	sg_init_table(&sg, 1);

	sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
	if (crypto_hash_update(desc, &sg, head_data_len))
		return 1;

	for (i = 0; i < shi->nr_frags; ++i) {
		const struct skb_frag_struct *f = &shi->frags[i];
		unsigned int offset = f->page_offset;
		struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);

		sg_set_page(&sg, page, skb_frag_size(f),
			    offset_in_page(offset));
		if (crypto_hash_update(desc, &sg, skb_frag_size(f)))
			return 1;
	}

	skb_walk_frags(skb, frag_iter)
		if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
			return 1;

	return 0;
}
EXPORT_SYMBOL(tcp_md5_hash_skb_data);

int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
{
	struct scatterlist sg;

	sg_init_one(&sg, key->key, key->keylen);
	return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
}
EXPORT_SYMBOL(tcp_md5_hash_key);

#endif

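/* Final transition to TCP_CLOSE: stop all timers, detach any pending
 * Fast Open request, and either wake up the owner or destroy the
 * socket if it is already orphaned.
 */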
void tcp_done(struct sock *sk)
{
	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;

	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);

	tcp_set_state(sk, TCP_CLOSE);
	tcp_clear_xmit_timers(sk);
	if (req != NULL)
		reqsk_fastopen_remove(sk, req, false);

	sk->sk_shutdown = SHUTDOWN_MASK;

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_state_change(sk);
	else
		inet_csk_destroy_sock(sk);
}
EXPORT_SYMBOL_GPL(tcp_done);

extern struct tcp_congestion_ops tcp_reno;

static __initdata unsigned long thash_entries;
static int __init set_thash_entries(char *str)
{
	ssize_t ret;

	if (!str)
		return 0;

	ret = kstrtoul(str, 0, &thash_entries);
	if (ret)
		return 0;

	return 1;
}
__setup("thash_entries=", set_thash_entries);

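/* Default tcp_mem[] thresholds, in pages: low = 3/4 * limit,
 * pressure = limit, high = 1.5 * limit, where limit is
 * nr_free_buffer_pages() / 8 (but at least 128 pages).
 * Worked example (illustrative): with 262144 free buffer pages
 * (1 GiB of 4 KiB pages), limit = 32768 and
 * tcp_mem = {24576, 32768, 49152}.
 */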
void tcp_init_mem(struct net *net)
{
	unsigned long limit = nr_free_buffer_pages() / 8;
	limit = max(limit, 128UL);
	net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
	net->ipv4.sysctl_tcp_mem[1] = limit;
	net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
}

void __init tcp_init(void)
{
	struct sk_buff *skb = NULL;
	unsigned long limit;
	int max_rshare, max_wshare, cnt;
	unsigned int i;

	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));

	percpu_counter_init(&tcp_sockets_allocated, 0);
	percpu_counter_init(&tcp_orphan_count, 0);
	tcp_hashinfo.bind_bucket_cachep =
		kmem_cache_create("tcp_bind_bucket",
				  sizeof(struct inet_bind_bucket), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

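	/* Size and allocate the main established and bind bucket
	 * hash tables.
	 */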
	tcp_hashinfo.ehash =
		alloc_large_system_hash("TCP established",
					sizeof(struct inet_ehash_bucket),
					thash_entries,
					17,
					0,
					NULL,
					&tcp_hashinfo.ehash_mask,
					0,
					thash_entries ? 0 : 512 * 1024);
	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
	}
	if (inet_ehash_locks_alloc(&tcp_hashinfo))
		panic("TCP: failed to alloc ehash_locks");
	tcp_hashinfo.bhash =
		alloc_large_system_hash("TCP bind",
					sizeof(struct inet_bind_hashbucket),
					tcp_hashinfo.ehash_mask + 1,
					17,
					0,
					&tcp_hashinfo.bhash_size,
					NULL,
					0,
					64 * 1024);
	tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
	}


	cnt = tcp_hashinfo.ehash_mask + 1;

	tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	sysctl_tcp_max_orphans = cnt / 2;
	sysctl_max_syn_backlog = max(128, cnt / 256);

	tcp_init_mem(&init_net);

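	/* Cap per-socket buffer limits: pages << (PAGE_SHIFT - 7) is
	 * 1/128 of the freeable page cache, expressed in bytes.
	 */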
	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
	max_wshare = min(4UL*1024*1024, limit);
	max_rshare = min(6UL*1024*1024, limit);

	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
	sysctl_tcp_wmem[1] = 16*1024;
	sysctl_tcp_wmem[2] = max(64*1024, max_wshare);

	sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
	sysctl_tcp_rmem[1] = 87380;
	sysctl_tcp_rmem[2] = max(87380, max_rshare);

	pr_info("Hash tables configured (established %u bind %u)\n",
		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);

	tcp_metrics_init();

	tcp_register_congestion_control(&tcp_reno);

	tcp_tasklet_init();
}