/*
 * Implementation of the Transmission Control Protocol (TCP).
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/scatterlist.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/random.h>
#include <linux/bootmem.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/crypto.h>
#include <linux/time.h>
#include <linux/slab.h>

#include <net/icmp.h>
#include <net/inet_common.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/netdma.h>
#include <net/sock.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>

int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;

struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);

int sysctl_tcp_wmem[3] __read_mostly;
int sysctl_tcp_rmem[3] __read_mostly;

EXPORT_SYMBOL(sysctl_tcp_rmem);
EXPORT_SYMBOL(sysctl_tcp_wmem);

atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
EXPORT_SYMBOL(tcp_memory_allocated);

/*
 * Current number of TCP sockets.
 */
struct percpu_counter tcp_sockets_allocated;
EXPORT_SYMBOL(tcp_sockets_allocated);

/*
 * TCP splice context
 */
struct tcp_splice_state {
	struct pipe_inode_info *pipe;
	size_t len;
	unsigned int flags;
};

/*
 * Pressure flag: try to collapse.
 * Technical note: it is used by multiple contexts non atomically.
 * All the __sk_mem_schedule() is of this nature: accounting
 * is strict, actions are advisory and have some latency.
 */
int tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL(tcp_memory_pressure);

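/* Enter TCP memory-pressure state: set the global pressure flag and count
 * the transition once per pressure episode.
 */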
void tcp_enter_memory_pressure(struct sock *sk)
{
	if (!tcp_memory_pressure) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
		tcp_memory_pressure = 1;
	}
}
EXPORT_SYMBOL(tcp_enter_memory_pressure);

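/* Convert seconds to the number of exponentially backed-off retransmits
 * (starting at @timeout, capped at @rto_max) that fit into that interval.
 */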
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
{
	u8 res = 0;

	if (seconds > 0) {
		int period = timeout;

		res = 1;
		while (seconds > period && res < 255) {
			res++;
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return res;
}

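/* Convert retransmits to seconds based on initial and max timeout */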
static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
{
	int period = 0;

	if (retrans > 0) {
		period = timeout;
		while (--retrans) {
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return period;
}

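/* Address-family independent initialization for a tcp_sock.
 *
 * NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */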
void tcp_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);
	INIT_LIST_HEAD(&tp->tsq_node);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* Start with a conservative initial congestion window
	 * (TCP_INIT_CWND segments).
	 */
	tp->snd_cwnd = TCP_INIT_CWND;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;

	tp->reordering = sysctl_tcp_reordering;
	tcp_enable_early_retrans(tp);
	icsk->icsk_ca_ops = &tcp_init_congestion_ops;

	tp->tsoffset = 0;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_sync_mss = tcp_sync_mss;

	/* TCP Cookie Transactions */
	if (sysctl_tcp_cookie_size > 0) {
		/* Default, cookies without s_data_payload. */
		tp->cookie_values =
			kzalloc(sizeof(*tp->cookie_values),
				sk->sk_allocation);
		if (tp->cookie_values != NULL)
			kref_init(&tp->cookie_values->kref);
	}

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	local_bh_disable();
	sock_update_memcg(sk);
	sk_sockets_allocated_inc(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(tcp_init_sock);

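/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 */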
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	unsigned int mask;
	struct sock *sk = sock->sk;
	const struct tcp_sock *tp = tcp_sk(sk);

	sock_poll_wait(file, sk_sleep(sk), wait);
	if (sk->sk_state == TCP_LISTEN)
		return inet_csk_listen_poll(sk);

	/* Socket is not locked. We are protected from async events
	 * by poll logic and correct handling of state changes
	 * made by other threads is impossible in any case.
	 */
	mask = 0;

	/* POLLHUP is certainly not done right. But poll() doesn't
	 * have an exception set, and POLLHUP is level-triggered, so it
	 * can only be reported together with POLLIN/POLLOUT once the
	 * connection is fully shut down in both directions.
	 */
	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
		mask |= POLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLIN | POLLRDNORM | POLLRDHUP;

	/* Connected or passive Fast Open socket? */
	if (sk->sk_state != TCP_SYN_SENT &&
	    (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
		int target = sock_rcvlowat(sk, 0, INT_MAX);

		if (tp->urg_seq == tp->copied_seq &&
		    !sock_flag(sk, SOCK_URGINLINE) &&
		    tp->urg_data)
			target++;

		/* Potential race condition. If read of tp below will
		 * escape above sk->sk_state, we can be illegally awaken
		 * when SYN is not acked yet.
		 */
		if (tp->rcv_nxt - tp->copied_seq >= target)
			mask |= POLLIN | POLLRDNORM;

		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
				mask |= POLLOUT | POLLWRNORM;
			} else {  /* send SIGIO later */
				set_bit(SOCK_ASYNC_NOSPACE,
					&sk->sk_socket->flags);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker. If space is freed after
				 * wspace test but before the flags are set,
				 * IO signal will be lost.
				 */
				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
					mask |= POLLOUT | POLLWRNORM;
			}
		} else
			mask |= POLLOUT | POLLWRNORM;

		if (tp->urg_data & TCP_URG_VALID)
			mask |= POLLPRI;
	}
	/* This barrier is coupled with smp_wmb() in tcp_reset() */
	smp_rmb();
	if (sk->sk_err)
		mask |= POLLERR;

	return mask;
}
EXPORT_SYMBOL(tcp_poll);

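/* Answer queue-occupancy ioctls (SIOCINQ, SIOCOUTQ, ...).  From userspace
 * these are reached via ioctl(2) on a TCP socket, e.g. (illustrative only):
 *
 *	int unread;
 *	if (ioctl(fd, SIOCINQ, &unread) == 0)
 *		handle(unread);
 */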
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int answ;
	bool slow;

	switch (cmd) {
	case SIOCINQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		slow = lock_sock_fast(sk);
		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else if (sock_flag(sk, SOCK_URGINLINE) ||
			 !tp->urg_data ||
			 before(tp->urg_seq, tp->copied_seq) ||
			 !before(tp->urg_seq, tp->rcv_nxt)) {

			answ = tp->rcv_nxt - tp->copied_seq;

			/* Subtract 1, if FIN was received */
			if (answ && sock_flag(sk, SOCK_DONE))
				answ--;
		} else
			answ = tp->urg_seq - tp->copied_seq;
		unlock_sock_fast(sk, slow);
		break;
	case SIOCATMARK:
		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
		break;
	case SIOCOUTQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = tp->write_seq - tp->snd_una;
		break;
	case SIOCOUTQNSD:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = tp->write_seq - tp->snd_nxt;
		break;
	default:
		return -ENOIOCTLCMD;
	}

	return put_user(answ, (int __user *)arg);
}
EXPORT_SYMBOL(tcp_ioctl);

static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
	TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
	tp->pushed_seq = tp->write_seq;
}

static inline bool forced_push(const struct tcp_sock *tp)
{
	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}

static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

	skb->csum = 0;
	tcb->seq = tcb->end_seq = tp->write_seq;
	tcb->tcp_flags = TCPHDR_ACK;
	tcb->sacked = 0;
	skb_header_release(skb);
	tcp_add_write_queue_tail(sk, skb);
	sk->sk_wmem_queued += skb->truesize;
	sk_mem_charge(sk, skb->truesize);
	if (tp->nonagle & TCP_NAGLE_PUSH)
		tp->nonagle &= ~TCP_NAGLE_PUSH;
}

static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
{
	if (flags & MSG_OOB)
		tp->snd_up = tp->write_seq;
}

static inline void tcp_push(struct sock *sk, int flags, int mss_now,
			    int nonagle)
{
	if (tcp_send_head(sk)) {
		struct tcp_sock *tp = tcp_sk(sk);

		if (!(flags & MSG_MORE) || forced_push(tp))
			tcp_mark_push(tp, tcp_write_queue_tail(sk));

		tcp_mark_urg(tp, flags);
		__tcp_push_pending_frames(sk, mss_now,
					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
	}
}

static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
				unsigned int offset, size_t len)
{
	struct tcp_splice_state *tss = rd_desc->arg.data;
	int ret;

	ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
			      tss->flags);
	if (ret > 0)
		rd_desc->count -= ret;
	return ret;
}

static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
{
	/* Store TCP splice context information in read_descriptor_t. */
	read_descriptor_t rd_desc = {
		.arg.data = tss,
		.count	  = tss->len,
	};

	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
}

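/**
 *  tcp_splice_read - splice data from TCP socket to a pipe
 * @sock:	socket to splice from
 * @ppos:	position (not valid)
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will read pages from given socket and fill them into a pipe.
 *
 **/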
ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
			struct pipe_inode_info *pipe, size_t len,
			unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct tcp_splice_state tss = {
		.pipe = pipe,
		.len = len,
		.flags = flags,
	};
	long timeo;
	ssize_t spliced;
	int ret;

	sock_rps_record_flow(sk);
	/*
	 * We can't seek on a socket input
	 */
	if (unlikely(*ppos))
		return -ESPIPE;

	ret = spliced = 0;

	lock_sock(sk);

	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
	while (tss.len) {
		ret = __tcp_splice_read(sk, &tss);
		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (sock_flag(sk, SOCK_DONE))
				break;
			if (sk->sk_err) {
				ret = sock_error(sk);
				break;
			}
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;
			if (sk->sk_state == TCP_CLOSE) {
				/*
				 * This occurs when user tries to read
				 * from never connected socket.
				 */
				if (!sock_flag(sk, SOCK_DONE))
					ret = -ENOTCONN;
				break;
			}
			if (!timeo) {
				ret = -EAGAIN;
				break;
			}
			sk_wait_data(sk, &timeo);
			if (signal_pending(current)) {
				ret = sock_intr_errno(timeo);
				break;
			}
			continue;
		}
		tss.len -= ret;
		spliced += ret;

		if (!timeo)
			break;
		release_sock(sk);
		lock_sock(sk);

		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current))
			break;
	}

	release_sock(sk);

	if (spliced)
		return spliced;

	return ret;
}
EXPORT_SYMBOL(tcp_splice_read);

struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
{
	struct sk_buff *skb;

	/* The TCP header must be at least 32-bit aligned.  */
	size = ALIGN(size, 4);

	skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
	if (skb) {
		if (sk_wmem_schedule(sk, skb->truesize)) {
			skb_reserve(skb, sk->sk_prot->max_header);
			/*
			 * Make sure that we have exactly size bytes
			 * available to the caller, no more, no less.
			 */
			skb->reserved_tailroom = skb->end - skb->tail - size;
			return skb;
		}
		__kfree_skb(skb);
	} else {
		sk->sk_prot->enter_memory_pressure(sk);
		sk_stream_moderate_sndbuf(sk);
	}
	return NULL;
}

static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
				       int large_allowed)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 xmit_size_goal, old_size_goal;

	xmit_size_goal = mss_now;

	if (large_allowed && sk_can_gso(sk)) {
		xmit_size_goal = ((sk->sk_gso_max_size - 1) -
				  inet_csk(sk)->icsk_af_ops->net_header_len -
				  inet_csk(sk)->icsk_ext_hdr_len -
				  tp->tcp_header_len);

		/* TSQ : try to have two TSO segments in flight */
		xmit_size_goal = min_t(u32, xmit_size_goal,
				       sysctl_tcp_limit_output_bytes >> 1);

		xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);

		/* We try hard to avoid divides here */
		old_size_goal = tp->xmit_size_goal_segs * mss_now;

		if (likely(old_size_goal <= xmit_size_goal &&
			   old_size_goal + mss_now > xmit_size_goal)) {
			xmit_size_goal = old_size_goal;
		} else {
			tp->xmit_size_goal_segs =
				min_t(u16, xmit_size_goal / mss_now,
				      sk->sk_gso_max_segs);
			xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
		}
	}

	return max(xmit_size_goal, mss_now);
}

static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
{
	int mss_now;

	mss_now = tcp_current_mss(sk);
	*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));

	return mss_now;
}

static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
				size_t size, int flags)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int mss_now, size_goal;
	int err;
	ssize_t copied;
	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	/* Wait for a connection to finish. One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before a connection
	 * is fully established.
	 */
	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
			goto out_err;
	}

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	mss_now = tcp_send_mss(sk, &size_goal, flags);
	copied = 0;

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto out_err;

	while (size > 0) {
		struct sk_buff *skb = tcp_write_queue_tail(sk);
		int copy, i;
		bool can_coalesce;

		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
new_segment:
			if (!sk_stream_memory_free(sk))
				goto wait_for_sndbuf;

			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
			if (!skb)
				goto wait_for_memory;

			skb_entail(sk, skb);
			copy = size_goal;
		}

		if (copy > size)
			copy = size;

		i = skb_shinfo(skb)->nr_frags;
		can_coalesce = skb_can_coalesce(skb, i, page, offset);
		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
			tcp_mark_push(tp, skb);
			goto new_segment;
		}
		if (!sk_wmem_schedule(sk, copy))
			goto wait_for_memory;

		if (can_coalesce) {
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
		} else {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, copy);
		}
		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;

		skb->len += copy;
		skb->data_len += copy;
		skb->truesize += copy;
		sk->sk_wmem_queued += copy;
		sk_mem_charge(sk, copy);
		skb->ip_summed = CHECKSUM_PARTIAL;
		tp->write_seq += copy;
		TCP_SKB_CB(skb)->end_seq += copy;
		skb_shinfo(skb)->gso_segs = 0;

		if (!copied)
			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

		copied += copy;
		offset += copy;
		if (!(size -= copy))
			goto out;

		if (skb->len < size_goal || (flags & MSG_OOB))
			continue;

		if (forced_push(tp)) {
			tcp_mark_push(tp, skb);
			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
		} else if (skb == tcp_send_head(sk))
			tcp_push_one(sk, mss_now);
		continue;

wait_for_sndbuf:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
		tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);

		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
			goto do_error;

		mss_now = tcp_send_mss(sk, &size_goal, flags);
	}

out:
	if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
		tcp_push(sk, flags, mss_now, tp->nonagle);
	return copied;

do_error:
	if (copied)
		goto out;
out_err:
	return sk_stream_error(sk, flags, err);
}

int tcp_sendpage(struct sock *sk, struct page *page, int offset,
		 size_t size, int flags)
{
	ssize_t res;

	if (!(sk->sk_route_caps & NETIF_F_SG) ||
	    !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
		return sock_no_sendpage(sk->sk_socket, page, offset, size,
					flags);

	lock_sock(sk);
	res = do_tcp_sendpages(sk, page, offset, size, flags);
	release_sock(sk);
	return res;
}
EXPORT_SYMBOL(tcp_sendpage);

static inline int select_size(const struct sock *sk, bool sg)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	int tmp = tp->mss_cache;

	if (sg) {
		if (sk_can_gso(sk)) {
			/* With GSO, keep the linear area small; the bulk
			 * of the payload goes into page fragments.
			 */
			tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
		} else {
			int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);

			if (tmp >= pgbreak &&
			    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
				tmp = pgbreak;
		}
	}

	return tmp;
}

void tcp_free_fastopen_req(struct tcp_sock *tp)
{
	if (tp->fastopen_req != NULL) {
		kfree(tp->fastopen_req);
		tp->fastopen_req = NULL;
	}
}

static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int err, flags;

	if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
		return -EOPNOTSUPP;
	if (tp->fastopen_req != NULL)
		return -EALREADY; /* Another Fast Open is in progress */

	tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
				   sk->sk_allocation);
	if (unlikely(tp->fastopen_req == NULL))
		return -ENOBUFS;
	tp->fastopen_req->data = msg;

	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
	err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
				    msg->msg_namelen, flags);
	*size = tp->fastopen_req->copied;
	tcp_free_fastopen_req(tp);
	return err;
}

int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		size_t size)
{
	struct iovec *iov;
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int iovlen, flags, err, copied = 0;
	int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
	bool sg;
	long timeo;

	lock_sock(sk);

	flags = msg->msg_flags;
	if (flags & MSG_FASTOPEN) {
		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn);
		if (err == -EINPROGRESS && copied_syn > 0)
			goto out;
		else if (err)
			goto out_err;
		offset = copied_syn;
	}

	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	/* Wait for a connection to finish. One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before a connection
	 * is fully established.
	 */
	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
			goto do_error;
	}

	if (unlikely(tp->repair)) {
		if (tp->repair_queue == TCP_RECV_QUEUE) {
			copied = tcp_send_rcvq(sk, msg, size);
			goto out;
		}

		err = -EINVAL;
		if (tp->repair_queue == TCP_NO_QUEUE)
			goto out_err;

		/* 'common' sending to sendq */
	}

	/* This should be in poll */
	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	mss_now = tcp_send_mss(sk, &size_goal, flags);

	/* Ok commence sending. */
	iovlen = msg->msg_iovlen;
	iov = msg->msg_iov;
	copied = 0;

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto out_err;

	sg = !!(sk->sk_route_caps & NETIF_F_SG);

	while (--iovlen >= 0) {
		size_t seglen = iov->iov_len;
		unsigned char __user *from = iov->iov_base;

		iov++;
		if (unlikely(offset > 0)) {  /* Skip bytes copied in SYN */
			if (offset >= seglen) {
				offset -= seglen;
				continue;
			}
			seglen -= offset;
			from += offset;
			offset = 0;
		}

		while (seglen > 0) {
			int copy = 0;
			int max = size_goal;

			skb = tcp_write_queue_tail(sk);
			if (tcp_send_head(sk)) {
				if (skb->ip_summed == CHECKSUM_NONE)
					max = mss_now;
				copy = max - skb->len;
			}

			if (copy <= 0) {
new_segment:
				/* Allocate new segment. If the interface is SG,
				 * allocate skb fitting to single page.
				 */
				if (!sk_stream_memory_free(sk))
					goto wait_for_sndbuf;

				skb = sk_stream_alloc_skb(sk,
							  select_size(sk, sg),
							  sk->sk_allocation);
				if (!skb)
					goto wait_for_memory;

				/*
				 * Check whether we can use HW checksum.
				 */
				if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
					skb->ip_summed = CHECKSUM_PARTIAL;

				skb_entail(sk, skb);
				copy = size_goal;
				max = size_goal;
			}

			/* Try to append data to the end of skb. */
			if (copy > seglen)
				copy = seglen;

			/* Where to copy to? */
			if (skb_availroom(skb) > 0) {
				/* We have some space in skb head. Superb! */
				copy = min_t(int, copy, skb_availroom(skb));
				err = skb_add_data_nocache(sk, skb, from, copy);
				if (err)
					goto do_fault;
			} else {
				bool merge = true;
				int i = skb_shinfo(skb)->nr_frags;
				struct page_frag *pfrag = sk_page_frag(sk);

				if (!sk_page_frag_refill(sk, pfrag))
					goto wait_for_memory;

				if (!skb_can_coalesce(skb, i, pfrag->page,
						      pfrag->offset)) {
					if (i == MAX_SKB_FRAGS || !sg) {
						tcp_mark_push(tp, skb);
						goto new_segment;
					}
					merge = false;
				}

				copy = min_t(int, copy, pfrag->size - pfrag->offset);

				if (!sk_wmem_schedule(sk, copy))
					goto wait_for_memory;

				err = skb_copy_to_page_nocache(sk, from, skb,
							       pfrag->page,
							       pfrag->offset,
							       copy);
				if (err)
					goto do_error;

				/* Update the skb. */
				if (merge) {
					skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
				} else {
					skb_fill_page_desc(skb, i, pfrag->page,
							   pfrag->offset, copy);
					get_page(pfrag->page);
				}
				pfrag->offset += copy;
			}

			if (!copied)
				TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

			tp->write_seq += copy;
			TCP_SKB_CB(skb)->end_seq += copy;
			skb_shinfo(skb)->gso_segs = 0;

			from += copy;
			copied += copy;
			if ((seglen -= copy) == 0 && iovlen == 0)
				goto out;

			if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
				continue;

			if (forced_push(tp)) {
				tcp_mark_push(tp, skb);
				__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
			} else if (skb == tcp_send_head(sk))
				tcp_push_one(sk, mss_now);
			continue;

wait_for_sndbuf:
			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
			if (copied)
				tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);

			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				goto do_error;

			mss_now = tcp_send_mss(sk, &size_goal, flags);
		}
	}

out:
	if (copied)
		tcp_push(sk, flags, mss_now, tp->nonagle);
	release_sock(sk);
	return copied + copied_syn;

do_fault:
	if (!skb->len) {
		tcp_unlink_write_queue(skb, sk);
		/* It is the one place in all of TCP, except connection
		 * reset, where we can be unlinking the send_head.
		 */
		tcp_check_send_head(sk, skb);
		sk_wmem_free_skb(sk, skb);
	}

do_error:
	if (copied + copied_syn)
		goto out;
out_err:
	err = sk_stream_error(sk, flags, err);
	release_sock(sk);
	return err;
}
EXPORT_SYMBOL(tcp_sendmsg);

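/*
 *	Handle reading urgent data: copy out at most one byte of out-of-band
 *	data when it is not being received inline.
 */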
static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* No URG data to read. */
	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
	    tp->urg_data == TCP_URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
		return -ENOTCONN;

	if (tp->urg_data & TCP_URG_VALID) {
		int err = 0;
		char c = tp->urg_data;

		if (!(flags & MSG_PEEK))
			tp->urg_data = TCP_URG_READ;

		/* Read urgent data. */
		msg->msg_flags |= MSG_OOB;

		if (len > 0) {
			if (!(flags & MSG_TRUNC))
				err = memcpy_toiovec(msg->msg_iov, &c, 1);
			len = 1;
		} else
			msg->msg_flags |= MSG_TRUNC;

		return err ? -EFAULT : len;
	}

	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
		return 0;

	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 * Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}

static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
{
	struct sk_buff *skb;
	int copied = 0, err = 0;

	/* XXX -- need to support SO_PEEK_OFF */

	skb_queue_walk(&sk->sk_write_queue, skb) {
		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len);
		if (err)
			break;

		copied += skb->len;
	}

	return err ?: copied;
}

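/* Clean up the receive buffer for full frames taken by the user,
 * then send an ACK if necessary.  COPIED is the number of bytes
 * tcp_recvmsg has given to the user so far, it speeds up the
 * calculation of whether or not we must ACK for the sake of
 * a window update.
 */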
void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
	struct tcp_sock *tp = tcp_sk(sk);
	bool time_to_ack = false;

	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);

	if (inet_csk_ack_scheduled(sk)) {
		const struct inet_connection_sock *icsk = inet_csk(sk);
		   /* Delayed ACKs frequently hit locked sockets during bulk
		    * receive. */
		if (icsk->icsk_ack.blocked ||
		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
		    /*
		     * If this read emptied read buffer, we send ACK, if
		     * connection is not bidirectional, user drained
		     * receive buffer and there was a small segment
		     * in queue.
		     */
		    (copied > 0 &&
		     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
		      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
		       !icsk->icsk_ack.pingpong)) &&
		      !atomic_read(&sk->sk_rmem_alloc)))
			time_to_ack = true;
	}

	/* We send an ACK if we can now advertise a window which is
	 * "significantly" bigger than the one rcvd. Even if window
	 * was raised up to infinity, do not send a window-open ACK
	 * in states where we will not receive more: it is useless.
	 */
	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
		__u32 rcv_window_now = tcp_receive_window(tp);

		/* Optimize, __tcp_select_window() is not cheap. */
		if (2*rcv_window_now <= tp->window_clamp) {
			__u32 new_window = __tcp_select_window(sk);

			/* Send ACK now, if this read freed lots of space
			 * in our buffer. Certainly, new_window is new window.
			 * We can advertise it now, if it is not less than
			 * current one. "Lots" means "at least twice" here.
			 */
			if (new_window && new_window >= 2 * rcv_window_now)
				time_to_ack = true;
		}
	}
	if (time_to_ack)
		tcp_send_ack(sk);
}

static void tcp_prequeue_process(struct sock *sk)
{
	struct sk_buff *skb;
	struct tcp_sock *tp = tcp_sk(sk);

	NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED);

	/* RX process wants to run with disabled BHs, though it is not
	 * necessary */
	local_bh_disable();
	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
		sk_backlog_rcv(sk, skb);
	local_bh_enable();

	/* Clear memory counter. */
	tp->ucopy.memory = 0;
}

#ifdef CONFIG_NET_DMA
static void tcp_service_net_dma(struct sock *sk, bool wait)
{
	dma_cookie_t done, used;
	dma_cookie_t last_issued;
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tp->ucopy.dma_chan)
		return;

	last_issued = tp->ucopy.dma_cookie;
	dma_async_issue_pending(tp->ucopy.dma_chan);

	do {
		if (dma_async_is_tx_complete(tp->ucopy.dma_chan,
					     last_issued, &done,
					     &used) == DMA_SUCCESS) {
			/* Safe to free early-copied skbs now */
			__skb_queue_purge(&sk->sk_async_wait_queue);
			break;
		} else {
			struct sk_buff *skb;
			while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
			       (dma_async_is_complete(skb->dma_cookie, done,
						      used) == DMA_SUCCESS)) {
				__skb_dequeue(&sk->sk_async_wait_queue);
				kfree_skb(skb);
			}
		}
	} while (wait);
}
#endif

static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
	struct sk_buff *skb;
	u32 offset;

	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
		offset = seq - TCP_SKB_CB(skb)->seq;
		if (tcp_hdr(skb)->syn)
			offset--;
		if (offset < skb->len || tcp_hdr(skb)->fin) {
			*off = offset;
			return skb;
		}
		/* This looks weird, but this can happen if TCP collapsing
		 * splitted a fat GRO packet, while we released socket lock
		 * in skb_splice_bits()
		 */
		sk_eat_skb(sk, skb, false);
	}
	return NULL;
}

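/*
 * This routine provides an alternative to tcp_recvmsg() for routines
 * that would like to handle copying from skbuffs directly in 'sendfile'
 * fashion.
 * Note:
 *	- It is assumed that the socket was locked by the caller.
 *	- The routine does not block.
 *	- At present, there is no support for reading OOB data
 *	  or for urgent data in MSG_PEEK mode.
 */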
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
		  sk_read_actor_t recv_actor)
{
	struct sk_buff *skb;
	struct tcp_sock *tp = tcp_sk(sk);
	u32 seq = tp->copied_seq;
	u32 offset;
	int copied = 0;

	if (sk->sk_state == TCP_LISTEN)
		return -ENOTCONN;
	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
		if (offset < skb->len) {
			int used;
			size_t len;

			len = skb->len - offset;
			/* Stop reading if we hit a patch of urgent data */
			if (tp->urg_data) {
				u32 urg_offset = tp->urg_seq - seq;
				if (urg_offset < len)
					len = urg_offset;
				if (!len)
					break;
			}
			used = recv_actor(desc, skb, offset, len);
			if (used <= 0) {
				if (!copied)
					copied = used;
				break;
			} else if (used <= len) {
				seq += used;
				copied += used;
				offset += used;
			}
			/* If recv_actor drops the lock (e.g. TCP splice
			 * receive) the skb pointer might be invalid when
			 * getting here: tcp_collapse might have deleted it
			 * while aggregating skbs from the socket queue.
			 */
			skb = tcp_recv_skb(sk, seq - 1, &offset);
			if (!skb)
				break;
			/* TCP coalescing might have appended data to the skb.
			 * Try to splice more frags
			 */
			if (offset + 1 != skb->len)
				continue;
		}
		if (tcp_hdr(skb)->fin) {
			sk_eat_skb(sk, skb, false);
			++seq;
			break;
		}
		sk_eat_skb(sk, skb, false);
		if (!desc->count)
			break;
		tp->copied_seq = seq;
	}
	tp->copied_seq = seq;

	tcp_rcv_space_adjust(sk);

	/* Clean up data we have read: This will do ACK frames. */
	if (copied > 0) {
		tcp_recv_skb(sk, seq, &offset);
		tcp_cleanup_rbuf(sk, copied);
	}
	return copied;
}
EXPORT_SYMBOL(tcp_read_sock);

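/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	Technical note: in 2.3 we work on _locked_ socket, so that
 *	tricks with *seq access order and skb->users are not required.
 *	Probably, code can be easily improved even more.
 */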
int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		size_t len, int nonblock, int flags, int *addr_len)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int copied = 0;
	u32 peek_seq;
	u32 *seq;
	unsigned long used;
	int err;
	int target;		/* Read at least this many bytes */
	long timeo;
	struct task_struct *user_recv = NULL;
	bool copied_early = false;
	struct sk_buff *skb;
	u32 urg_hole = 0;

	lock_sock(sk);

	err = -ENOTCONN;
	if (sk->sk_state == TCP_LISTEN)
		goto out;

	timeo = sock_rcvtimeo(sk, nonblock);

	/* Urgent data needs to be handled specially. */
	if (flags & MSG_OOB)
		goto recv_urg;

	if (unlikely(tp->repair)) {
		err = -EPERM;
		if (!(flags & MSG_PEEK))
			goto out;

		if (tp->repair_queue == TCP_SEND_QUEUE)
			goto recv_sndq;

		err = -EINVAL;
		if (tp->repair_queue == TCP_NO_QUEUE)
			goto out;

		/* 'common' recv queue MSG_PEEK-ing */
	}

	seq = &tp->copied_seq;
	if (flags & MSG_PEEK) {
		peek_seq = tp->copied_seq;
		seq = &peek_seq;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);

#ifdef CONFIG_NET_DMA
	tp->ucopy.dma_chan = NULL;
	preempt_disable();
	skb = skb_peek_tail(&sk->sk_receive_queue);
	{
		int available = 0;

		if (skb)
			available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
		if ((available < target) &&
		    (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
		    !sysctl_tcp_low_latency &&
		    net_dma_find_channel()) {
			preempt_enable_no_resched();
			tp->ucopy.pinned_list =
					dma_pin_iovec_pages(msg->msg_iov, len);
		} else {
			preempt_enable_no_resched();
		}
	}
#endif

	do {
		u32 offset;

		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
		if (tp->urg_data && tp->urg_seq == *seq) {
			if (copied)
				break;
			if (signal_pending(current)) {
				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
				break;
			}
		}

		/* Next get a buffer. */

		skb_queue_walk(&sk->sk_receive_queue, skb) {
			/* Now that we have two receive queues this
			 * shouldn't happen.
			 */
			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
				 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
				 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
				 flags))
				break;

			offset = *seq - TCP_SKB_CB(skb)->seq;
			if (tcp_hdr(skb)->syn)
				offset--;
			if (offset < skb->len)
				goto found_ok_skb;
			if (tcp_hdr(skb)->fin)
				goto found_fin_ok;
			WARN(!(flags & MSG_PEEK),
			     "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
		}

		/* Well, if we have backlog, try to process it now yet. */

		if (copied >= target && !sk->sk_backlog.tail)
			break;

		if (copied) {
			if (sk->sk_err ||
			    sk->sk_state == TCP_CLOSE ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    !timeo ||
			    signal_pending(current))
				break;
		} else {
			if (sock_flag(sk, SOCK_DONE))
				break;

			if (sk->sk_err) {
				copied = sock_error(sk);
				break;
			}

			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;

			if (sk->sk_state == TCP_CLOSE) {
				if (!sock_flag(sk, SOCK_DONE)) {
					/* This occurs when user tries to read
					 * from never connected socket.
					 */
					copied = -ENOTCONN;
					break;
				}
				break;
			}

			if (!timeo) {
				copied = -EAGAIN;
				break;
			}

			if (signal_pending(current)) {
				copied = sock_intr_errno(timeo);
				break;
			}
		}

		tcp_cleanup_rbuf(sk, copied);

		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
			/* Install new reader */
			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
				user_recv = current;
				tp->ucopy.task = user_recv;
				tp->ucopy.iov = msg->msg_iov;
			}

			tp->ucopy.len = len;

			WARN_ON(tp->copied_seq != tp->rcv_nxt &&
				!(flags & (MSG_PEEK | MSG_TRUNC)));

			/* Ugly... If prequeue is not empty, we have to
			 * process it before releasing socket, otherwise
			 * order will be broken at second iteration.
			 * More elegant solution is required!!!
			 *
			 * Look: we have the following (pseudo)queues:
			 *
			 * 1. packets in flight
			 * 2. backlog
			 * 3. prequeue
			 * 4. receive_queue
			 *
			 * Each queue can be processed only if the next ones
			 * are empty. At this point we have empty receive_queue.
			 * But prequeue _can_ be not empty after 2nd iteration,
			 * when we jumped to start of loop because backlog
			 * processing added something to receive_queue.
			 * We cannot release_sock(), because backlog contains
			 * packets arrived _after_ prequeued ones.
			 *
			 * Shortly, algorithm is clear --- to process all prequeued
			 * data before going to the loop body again.
			 */
			if (!skb_queue_empty(&tp->ucopy.prequeue))
				goto do_prequeue;

			/* __ Set realtime policy in scheduler __ */
		}

#ifdef CONFIG_NET_DMA
		if (tp->ucopy.dma_chan) {
			if (tp->rcv_wnd == 0 &&
			    !skb_queue_empty(&sk->sk_async_wait_queue)) {
				tcp_service_net_dma(sk, true);
				tcp_cleanup_rbuf(sk, copied);
			} else
				dma_async_issue_pending(tp->ucopy.dma_chan);
		}
#endif
		if (copied >= target) {
			/* Do not sleep, just process backlog. */
			release_sock(sk);
			lock_sock(sk);
		} else
			sk_wait_data(sk, &timeo);

#ifdef CONFIG_NET_DMA
		tcp_service_net_dma(sk, false);  /* Don't block */
		tp->ucopy.wakeup = 0;
#endif

		if (user_recv) {
			int chunk;

			/* __ Restore normal policy in scheduler __ */

			if ((chunk = len - tp->ucopy.len) != 0) {
				NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
				len -= chunk;
				copied += chunk;
			}

			if (tp->rcv_nxt == tp->copied_seq &&
			    !skb_queue_empty(&tp->ucopy.prequeue)) {
do_prequeue:
				tcp_prequeue_process(sk);

				if ((chunk = len - tp->ucopy.len) != 0) {
					NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
					len -= chunk;
					copied += chunk;
				}
			}
		}
		if ((flags & MSG_PEEK) &&
		    (peek_seq - copied - urg_hole != tp->copied_seq)) {
			net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
					    current->comm,
					    task_pid_nr(current));
			peek_seq = tp->copied_seq;
		}
		continue;

	found_ok_skb:
		/* Ok so how much can we use? */
		used = skb->len - offset;
		if (len < used)
			used = len;

		/* Do we have urgent data here? */
		if (tp->urg_data) {
			u32 urg_offset = tp->urg_seq - *seq;
			if (urg_offset < used) {
				if (!urg_offset) {
					if (!sock_flag(sk, SOCK_URGINLINE)) {
						++*seq;
						urg_hole++;
						offset++;
						used--;
						if (!used)
							goto skip_copy;
					}
				} else
					used = urg_offset;
			}
		}

		if (!(flags & MSG_TRUNC)) {
#ifdef CONFIG_NET_DMA
			if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
				tp->ucopy.dma_chan = net_dma_find_channel();

			if (tp->ucopy.dma_chan) {
				tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
					tp->ucopy.dma_chan, skb, offset,
					msg->msg_iov, used,
					tp->ucopy.pinned_list);

				if (tp->ucopy.dma_cookie < 0) {

					pr_alert("%s: dma_cookie < 0\n",
						 __func__);

					/* Exception. Bailout! */
					if (!copied)
						copied = -EFAULT;
					break;
				}

				dma_async_issue_pending(tp->ucopy.dma_chan);

				if ((offset + used) == skb->len)
					copied_early = true;

			} else
#endif
			{
				err = skb_copy_datagram_iovec(skb, offset,
						msg->msg_iov, used);
				if (err) {
					/* Exception. Bailout! */
					if (!copied)
						copied = -EFAULT;
					break;
				}
			}
		}

		*seq += used;
		copied += used;
		len -= used;

		tcp_rcv_space_adjust(sk);

skip_copy:
		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
			tp->urg_data = 0;
			tcp_fast_path_check(sk);
		}
		if (used + offset < skb->len)
			continue;

		if (tcp_hdr(skb)->fin)
			goto found_fin_ok;
		if (!(flags & MSG_PEEK)) {
			sk_eat_skb(sk, skb, copied_early);
			copied_early = false;
		}
		continue;

	found_fin_ok:
		/* Process the FIN. */
		++*seq;
		if (!(flags & MSG_PEEK)) {
			sk_eat_skb(sk, skb, copied_early);
			copied_early = false;
		}
		break;
	} while (len > 0);

	if (user_recv) {
		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
			int chunk;

			tp->ucopy.len = copied > 0 ? len : 0;

			tcp_prequeue_process(sk);

			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
				NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
				len -= chunk;
				copied += chunk;
			}
		}

		tp->ucopy.task = NULL;
		tp->ucopy.len = 0;
	}

#ifdef CONFIG_NET_DMA
	tcp_service_net_dma(sk, true);	/* Wait for queue to drain */
	tp->ucopy.dma_chan = NULL;

	if (tp->ucopy.pinned_list) {
		dma_unpin_iovec_pages(tp->ucopy.pinned_list);
		tp->ucopy.pinned_list = NULL;
	}
#endif

	/* According to UNIX98, msg_name/msg_namelen are ignored
	 * on connected socket. I was just happy when found this 8) --ANK
	 */

	/* Clean up data we have read: This will do ACK frames. */
	tcp_cleanup_rbuf(sk, copied);

	release_sock(sk);
	return copied;

out:
	release_sock(sk);
	return err;

recv_urg:
	err = tcp_recv_urg(sk, msg, len, flags);
	goto out;

recv_sndq:
	err = tcp_peek_sndq(sk, msg, len);
	goto out;
}
EXPORT_SYMBOL(tcp_recvmsg);

void tcp_set_state(struct sock *sk, int state)
{
	int oldstate = sk->sk_state;

	switch (state) {
	case TCP_ESTABLISHED:
		if (oldstate != TCP_ESTABLISHED)
			TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
		break;

	case TCP_CLOSE:
		if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
			TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);

		sk->sk_prot->unhash(sk);
		if (inet_csk(sk)->icsk_bind_hash &&
		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
			inet_put_port(sk);
		/* fall through */
	default:
		if (oldstate == TCP_ESTABLISHED)
			TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
	}

	/* Change state AFTER socket is unhashed to avoid closed
	 * socket sitting in hash tables.
	 */
	sk->sk_state = state;

#ifdef STATE_TRACE
	SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
#endif
}
EXPORT_SYMBOL_GPL(tcp_set_state);

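/*
 *	State processing on a close. This implements the state shift for
 *	sending our FIN frame. Note that we only send a FIN for some
 *	states. A shutdown() may have already sent the FIN, or we may be
 *	closed.
 */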
static const unsigned char new_state[16] = {
  /* current state:        new state:      action:	*/
  /* (Invalid)		*/ TCP_CLOSE,
  /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  /* TCP_SYN_SENT	*/ TCP_CLOSE,
  /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
  /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
  /* TCP_TIME_WAIT	*/ TCP_CLOSE,
  /* TCP_CLOSE		*/ TCP_CLOSE,
  /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK | TCP_ACTION_FIN,
  /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
  /* TCP_LISTEN		*/ TCP_CLOSE,
  /* TCP_CLOSING	*/ TCP_CLOSING,
};

static int tcp_close_state(struct sock *sk)
{
	int next = (int)new_state[sk->sk_state];
	int ns = next & TCP_STATE_MASK;

	tcp_set_state(sk, ns);

	return next & TCP_ACTION_FIN;
}

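/*
 *	Shutdown the sending side of a connection. Much like close except
 *	that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
 */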
void tcp_shutdown(struct sock *sk, int how)
{
	/*	We need to grab some memory, and put together a FIN,
	 *	and then put it into the queue to be sent.
	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
	 */
	if (!(how & SEND_SHUTDOWN))
		return;

	/* If we've already sent a FIN, or it's a closed state, skip this. */
	if ((1 << sk->sk_state) &
	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
		/* Clear out any half completed packets.  FIN if needed. */
		if (tcp_close_state(sk))
			tcp_send_fin(sk);
	}
}
EXPORT_SYMBOL(tcp_shutdown);

bool tcp_check_oom(struct sock *sk, int shift)
{
	bool too_many_orphans, out_of_socket_memory;

	too_many_orphans = tcp_too_many_orphans(sk, shift);
	out_of_socket_memory = tcp_out_of_memory(sk);

	if (too_many_orphans)
		net_info_ratelimited("too many orphaned sockets\n");
	if (out_of_socket_memory)
		net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
	return too_many_orphans || out_of_socket_memory;
}

void tcp_close(struct sock *sk, long timeout)
{
	struct sk_buff *skb;
	int data_was_unread = 0;
	int state;

	lock_sock(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;

	if (sk->sk_state == TCP_LISTEN) {
		tcp_set_state(sk, TCP_CLOSE);

		/* Special case. */
		inet_csk_listen_stop(sk);

		goto adjudge_to_death;
	}

	/*  We need to flush the recv. buffs.  We do this only on the
	 *  descriptor close, not protocol-sourced closes, because the
	 *  reader process may not have drained the data yet!
	 */
	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
			  tcp_hdr(skb)->fin;
		data_was_unread += len;
		__kfree_skb(skb);
	}

	sk_mem_reclaim(sk);

	/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
	if (sk->sk_state == TCP_CLOSE)
		goto adjudge_to_death;

	/* As outlined in RFC 2525, section 2.17, we send a RST here because
	 * data was lost. To witness the awful effects of the old behavior of
	 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
	 * GET in an FTP client, suspend the process, wait for the client to
	 * advertise a zero window, then kill -9 the FTP client, wheee...
	 * Note: timeout is always zero in such a case.
	 */
	if (unlikely(tcp_sk(sk)->repair)) {
		sk->sk_prot->disconnect(sk, 0);
	} else if (data_was_unread) {
		/* Unread data was tossed, zap the connection. */
		NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
		tcp_set_state(sk, TCP_CLOSE);
		tcp_send_active_reset(sk, sk->sk_allocation);
	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
		/* Check zero linger _after_ checking for unread data. */
		sk->sk_prot->disconnect(sk, 0);
		NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
	} else if (tcp_close_state(sk)) {
		/* We FIN if the application ate all the data before
		 * zapping the connection.
		 */

		/* RED-PEN. Formally speaking, we have broken TCP state
		 * machine. State transitions:
		 *
		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
		 *
		 * are legal only when FIN has been sent (i.e. in window),
		 * rather than queued out of window. Purists blame.
		 *
		 * F.e. "RFC state" is ESTABLISHED,
		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
		 *
		 * The visible declinations are that sometimes
		 * we enter time-wait state, when it is not required really
		 * (harmless), do not send active resets, when they are
		 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
		 * they look as CLOSING or LAST_ACK for Linux)
		 * Probably, I missed some more holelets.
		 * 						--ANK
		 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
		 * in a single packet! (May consider it later but will
		 * probably need API support or TCP_CORK SYN-ACK until
		 * data is written and socket is closed.)
		 */
		tcp_send_fin(sk);
	}

	sk_stream_wait_close(sk, timeout);

adjudge_to_death:
	state = sk->sk_state;
	sock_hold(sk);
	sock_orphan(sk);

	/* It is the last release_sock in its life. It will remove backlog. */
	release_sock(sk);

	/* Now socket is owned by kernel and we acquire BH lock
	   to finish close. No need to check for user refs.
	 */
	local_bh_disable();
	bh_lock_sock(sk);
	WARN_ON(sock_owned_by_user(sk));

	percpu_counter_inc(sk->sk_prot->orphan_count);

	/* Have we already been destroyed by a softirq or backlog? */
	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
		goto out;

	/*	This is a (useful) BSD violating of the RFC. There is a
	 *	problem with TCP as specified in that the other end could keep
	 *	a socket open forever with no application left this end.
	 *	We use a 1 minute timeout (about the same as BSD) then kill
	 *	our end. If they send after that then tough - BUT: long enough
	 *	that we won't make the old 4*rto = almost no time - whoops
	 *	reset mistake.
	 *
	 *	Nope, it was not mistake. It is really desired behaviour
	 *	f.e. on http servers, when such sockets are useless, but
	 *	consume significant resources. Let's do it with special
	 *	linger2	option.					--ANK
	 */
	if (sk->sk_state == TCP_FIN_WAIT2) {
		struct tcp_sock *tp = tcp_sk(sk);
		if (tp->linger2 < 0) {
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC);
			NET_INC_STATS_BH(sock_net(sk),
					LINUX_MIB_TCPABORTONLINGER);
		} else {
			const int tmo = tcp_fin_time(sk);

			if (tmo > TCP_TIMEWAIT_LEN) {
				inet_csk_reset_keepalive_timer(sk,
						tmo - TCP_TIMEWAIT_LEN);
			} else {
				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				goto out;
			}
		}
	}
	if (sk->sk_state != TCP_CLOSE) {
		sk_mem_reclaim(sk);
		if (tcp_check_oom(sk, 0)) {
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC);
			NET_INC_STATS_BH(sock_net(sk),
					LINUX_MIB_TCPABORTONMEMORY);
		}
	}

	if (sk->sk_state == TCP_CLOSE) {
		struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
		/* We could get here with a non-NULL req if the socket is
		 * created from a listener socket with TCP_FASTOPEN option
		 * and the request was not yet accepted.
		 */
		if (req != NULL)
			reqsk_fastopen_remove(sk, req, false);
		inet_csk_destroy_sock(sk);
	}
	/* Otherwise, socket is reprieved until protocol close. */

out:
	bh_unlock_sock(sk);
	local_bh_enable();
	sock_put(sk);
}
EXPORT_SYMBOL(tcp_close);

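/* These states need RST on ABORT according to RFC793 */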
static inline bool tcp_need_reset(int state)
{
	return (1 << state) &
	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
}

int tcp_disconnect(struct sock *sk, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;
	int old_state = sk->sk_state;

	if (old_state != TCP_CLOSE)
		tcp_set_state(sk, TCP_CLOSE);

	/* ABORT function of RFC793 */
	if (old_state == TCP_LISTEN) {
		inet_csk_listen_stop(sk);
	} else if (unlikely(tp->repair)) {
		sk->sk_err = ECONNABORTED;
	} else if (tcp_need_reset(old_state) ||
		   (tp->snd_nxt != tp->write_seq &&
		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
		/* The last check adjusts for discrepancy of Linux wrt. RFC
		 * states
		 */
		tcp_send_active_reset(sk, gfp_any());
		sk->sk_err = ECONNRESET;
	} else if (old_state == TCP_SYN_SENT)
		sk->sk_err = ECONNRESET;

	tcp_clear_xmit_timers(sk);
	__skb_queue_purge(&sk->sk_receive_queue);
	tcp_write_queue_purge(sk);
	__skb_queue_purge(&tp->out_of_order_queue);
#ifdef CONFIG_NET_DMA
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	inet->inet_dport = 0;

	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
		inet_reset_saddr(sk);

	sk->sk_shutdown = 0;
	sock_reset_flag(sk, SOCK_DONE);
	tp->srtt = 0;
	if ((tp->write_seq += tp->max_window + 2) == 0)
		tp->write_seq = 1;
	icsk->icsk_backoff = 0;
	tp->snd_cwnd = 2;
	icsk->icsk_probes_out = 0;
	tp->packets_out = 0;
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_cnt = 0;
	tp->window_clamp = 0;
	tcp_set_ca_state(sk, TCP_CA_Open);
	tcp_clear_retrans(tp);
	inet_csk_delack_init(sk);
	tcp_init_send_head(sk);
	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
	__sk_dst_reset(sk);

	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);

	sk->sk_error_report(sk);
	return err;
}
EXPORT_SYMBOL(tcp_disconnect);

void tcp_sock_destruct(struct sock *sk)
{
	inet_sock_destruct(sk);

	kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
}

static inline bool tcp_can_repair_sock(const struct sock *sk)
{
	return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
		((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
}

static int tcp_repair_options_est(struct tcp_sock *tp,
		struct tcp_repair_opt __user *optbuf, unsigned int len)
{
	struct tcp_repair_opt opt;

	while (len >= sizeof(opt)) {
		if (copy_from_user(&opt, optbuf, sizeof(opt)))
			return -EFAULT;

		optbuf++;
		len -= sizeof(opt);

		switch (opt.opt_code) {
		case TCPOPT_MSS:
			tp->rx_opt.mss_clamp = opt.opt_val;
			break;
		case TCPOPT_WINDOW:
			{
				u16 snd_wscale = opt.opt_val & 0xFFFF;
				u16 rcv_wscale = opt.opt_val >> 16;

				if (snd_wscale > 14 || rcv_wscale > 14)
					return -EFBIG;

				tp->rx_opt.snd_wscale = snd_wscale;
				tp->rx_opt.rcv_wscale = rcv_wscale;
				tp->rx_opt.wscale_ok = 1;
			}
			break;
		case TCPOPT_SACK_PERM:
			if (opt.opt_val != 0)
				return -EINVAL;

			tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
			if (sysctl_tcp_fack)
				tcp_enable_fack(tp);
			break;
		case TCPOPT_TIMESTAMP:
			if (opt.opt_val != 0)
				return -EINVAL;

			tp->rx_opt.tstamp_ok = 1;
			break;
		}
	}

	return 0;
}

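/*
 *	Socket option code for TCP.
 */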
static int do_tcp_setsockopt(struct sock *sk, int level,
		int optname, char __user *optval, unsigned int optlen)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int val;
	int err = 0;

	/* These are data/string values, all the others are ints */
	switch (optname) {
	case TCP_CONGESTION: {
		char name[TCP_CA_NAME_MAX];

		if (optlen < 1)
			return -EINVAL;

		val = strncpy_from_user(name, optval,
					min_t(long, TCP_CA_NAME_MAX-1, optlen));
		if (val < 0)
			return -EFAULT;
		name[val] = 0;

		lock_sock(sk);
		err = tcp_set_congestion_control(sk, name);
		release_sock(sk);
		return err;
	}
	case TCP_COOKIE_TRANSACTIONS: {
		struct tcp_cookie_transactions ctd;
		struct tcp_cookie_values *cvp = NULL;

		if (sizeof(ctd) > optlen)
			return -EINVAL;
		if (copy_from_user(&ctd, optval, sizeof(ctd)))
			return -EFAULT;

		if (ctd.tcpct_used > sizeof(ctd.tcpct_value) ||
		    ctd.tcpct_s_data_desired > TCP_MSS_DESIRED)
			return -EINVAL;

		if (ctd.tcpct_cookie_desired == 0) {
			/* default to global value */
		} else if ((0x1 & ctd.tcpct_cookie_desired) ||
			   ctd.tcpct_cookie_desired > TCP_COOKIE_MAX ||
			   ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) {
			return -EINVAL;
		}

		if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) {
			/* Supercedes all other values */
			lock_sock(sk);
			if (tp->cookie_values != NULL) {
				kref_put(&tp->cookie_values->kref,
					 tcp_cookie_values_release);
				tp->cookie_values = NULL;
			}
			tp->rx_opt.cookie_in_always = 0; /* false */
			tp->rx_opt.cookie_out_never = 1; /* true */
			release_sock(sk);
			return err;
		}

		/* Allocate ancillary memory before locking.
		 */
		if (ctd.tcpct_used > 0 ||
		    (tp->cookie_values == NULL &&
		     (sysctl_tcp_cookie_size > 0 ||
		      ctd.tcpct_cookie_desired > 0 ||
		      ctd.tcpct_s_data_desired > 0))) {
			cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used,
				      GFP_KERNEL);
			if (cvp == NULL)
				return -ENOMEM;

			kref_init(&cvp->kref);
		}
		lock_sock(sk);
		tp->rx_opt.cookie_in_always =
			(TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags);
		tp->rx_opt.cookie_out_never = 0; /* false */

		if (tp->cookie_values != NULL) {
			if (cvp != NULL) {
				/* Changed values are recorded by a changed
				 * pointer, ensuring the cookie will differ,
				 * without separately hashing each value later.
				 */
				kref_put(&tp->cookie_values->kref,
					 tcp_cookie_values_release);
			} else {
				cvp = tp->cookie_values;
			}
		}

		if (cvp != NULL) {
			cvp->cookie_desired = ctd.tcpct_cookie_desired;

			if (ctd.tcpct_used > 0) {
				memcpy(cvp->s_data_payload, ctd.tcpct_value,
				       ctd.tcpct_used);
				cvp->s_data_desired = ctd.tcpct_used;
				cvp->s_data_constant = 1; /* true */
			} else {
				/* No constant payload data. */
				cvp->s_data_desired = ctd.tcpct_s_data_desired;
				cvp->s_data_constant = 0; /* false */
			}

			tp->cookie_values = cvp;
		}
		release_sock(sk);
		return err;
	}
	default:
		/* fallthru */
		break;
	}

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	lock_sock(sk);

	switch (optname) {
	case TCP_MAXSEG:
		/* Values greater than interface MTU won't take effect. However
		 * at the point when this call is done we typically don't yet
		 * know which interface is going to be used
		 */
		if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
			err = -EINVAL;
			break;
		}
		tp->rx_opt.user_mss = val;
		break;

	case TCP_NODELAY:
		if (val) {
			/* TCP_NODELAY is weaker than TCP_CORK, so that
			 * this option on corked socket is remembered, but
			 * it is not activated until cork is cleared.
			 *
			 * However, when TCP_NODELAY is set we make
			 * an explicit push, which overrides even TCP_CORK
			 * for currently queued segments.
			 */
			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
			tcp_push_pending_frames(sk);
		} else {
			tp->nonagle &= ~TCP_NAGLE_OFF;
		}
		break;

	case TCP_THIN_LINEAR_TIMEOUTS:
		if (val < 0 || val > 1)
			err = -EINVAL;
		else
			tp->thin_lto = val;
		break;

	case TCP_THIN_DUPACK:
		if (val < 0 || val > 1)
			err = -EINVAL;
		else
			tp->thin_dupack = val;
		if (tp->thin_dupack)
			tcp_disable_early_retrans(tp);
		break;

	case TCP_REPAIR:
		if (!tcp_can_repair_sock(sk))
			err = -EPERM;
		else if (val == 1) {
			tp->repair = 1;
			sk->sk_reuse = SK_FORCE_REUSE;
			tp->repair_queue = TCP_NO_QUEUE;
		} else if (val == 0) {
			tp->repair = 0;
			sk->sk_reuse = SK_NO_REUSE;
			tcp_send_window_probe(sk);
		} else
			err = -EINVAL;

		break;

	case TCP_REPAIR_QUEUE:
		if (!tp->repair)
			err = -EPERM;
		else if (val < TCP_QUEUES_NR)
			tp->repair_queue = val;
		else
			err = -EINVAL;
		break;

	case TCP_QUEUE_SEQ:
		if (sk->sk_state != TCP_CLOSE)
			err = -EPERM;
		else if (tp->repair_queue == TCP_SEND_QUEUE)
			tp->write_seq = val;
		else if (tp->repair_queue == TCP_RECV_QUEUE)
			tp->rcv_nxt = val;
		else
			err = -EINVAL;
		break;

	case TCP_REPAIR_OPTIONS:
		if (!tp->repair)
			err = -EINVAL;
		else if (sk->sk_state == TCP_ESTABLISHED)
			err = tcp_repair_options_est(tp,
					(struct tcp_repair_opt __user *)optval,
					optlen);
		else
			err = -EPERM;
		break;

	case TCP_CORK:
		/* When set indicates to always queue non-full frames.
		 * Later the user clears this option and we transmit
		 * any pending partial frames in the queue.  This is
		 * meant to be used alongside sendfile() to get properly
		 * filled frames when the user (for example) must write
		 * out headers with a write() call first and then use
		 * sendfile to send out the data parts.
		 *
		 * TCP_CORK can be set together with TCP_NODELAY and it is
		 * stronger than TCP_NODELAY.
		 */
		if (val) {
			tp->nonagle |= TCP_NAGLE_CORK;
		} else {
			tp->nonagle &= ~TCP_NAGLE_CORK;
			if (tp->nonagle&TCP_NAGLE_OFF)
				tp->nonagle |= TCP_NAGLE_PUSH;
			tcp_push_pending_frames(sk);
		}
		break;

	case TCP_KEEPIDLE:
		if (val < 1 || val > MAX_TCP_KEEPIDLE)
			err = -EINVAL;
		else {
			tp->keepalive_time = val * HZ;
			if (sock_flag(sk, SOCK_KEEPOPEN) &&
			    !((1 << sk->sk_state) &
			      (TCPF_CLOSE | TCPF_LISTEN))) {
				u32 elapsed = keepalive_time_elapsed(tp);
				if (tp->keepalive_time > elapsed)
					elapsed = tp->keepalive_time - elapsed;
				else
					elapsed = 0;
				inet_csk_reset_keepalive_timer(sk, elapsed);
			}
		}
		break;
	case TCP_KEEPINTVL:
		if (val < 1 || val > MAX_TCP_KEEPINTVL)
			err = -EINVAL;
		else
			tp->keepalive_intvl = val * HZ;
		break;
	case TCP_KEEPCNT:
		if (val < 1 || val > MAX_TCP_KEEPCNT)
			err = -EINVAL;
		else
			tp->keepalive_probes = val;
		break;
	case TCP_SYNCNT:
		if (val < 1 || val > MAX_TCP_SYNCNT)
			err = -EINVAL;
		else
			icsk->icsk_syn_retries = val;
		break;

	case TCP_LINGER2:
		if (val < 0)
			tp->linger2 = -1;
		else if (val > sysctl_tcp_fin_timeout / HZ)
			tp->linger2 = 0;
		else
			tp->linger2 = val * HZ;
		break;

	case TCP_DEFER_ACCEPT:
		/* Translate value in seconds to number of retransmits */
		icsk->icsk_accept_queue.rskq_defer_accept =
			secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
					TCP_RTO_MAX / HZ);
		break;

	case TCP_WINDOW_CLAMP:
		if (!val) {
			if (sk->sk_state != TCP_CLOSE) {
				err = -EINVAL;
				break;
			}
			tp->window_clamp = 0;
		} else
			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
						SOCK_MIN_RCVBUF / 2 : val;
		break;

	case TCP_QUICKACK:
		if (!val) {
			icsk->icsk_ack.pingpong = 1;
		} else {
			icsk->icsk_ack.pingpong = 0;
			if ((1 << sk->sk_state) &
			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
			    inet_csk_ack_scheduled(sk)) {
				icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
				tcp_cleanup_rbuf(sk, 1);
				if (!(val & 1))
					icsk->icsk_ack.pingpong = 1;
			}
		}
		break;

#ifdef CONFIG_TCP_MD5SIG
	case TCP_MD5SIG:
		/* Read the IP->Key mappings from userspace */
		err = tp->af_specific->md5_parse(sk, optval, optlen);
		break;
#endif
	case TCP_USER_TIMEOUT:
		/* Cap the max timeout in ms TCP will retry/retrans
		 * before giving up and aborting (ETIMEDOUT) a connection.
		 */
		if (val < 0)
			err = -EINVAL;
		else
			icsk->icsk_user_timeout = msecs_to_jiffies(val);
		break;

	case TCP_FASTOPEN:
		if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
		    TCPF_LISTEN)))
			err = fastopen_init_queue(sk, val);
		else
			err = -EINVAL;
		break;
	case TCP_TIMESTAMP:
		if (!tp->repair)
			err = -EPERM;
		else
			tp->tsoffset = val - tcp_time_stamp;
		break;
	default:
		err = -ENOPROTOOPT;
		break;
	}

	release_sock(sk);
	return err;
}

int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
		   unsigned int optlen)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	if (level != SOL_TCP)
		return icsk->icsk_af_ops->setsockopt(sk, level, optname,
						     optval, optlen);
	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(tcp_setsockopt);

#ifdef CONFIG_COMPAT
int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	if (level != SOL_TCP)
		return inet_csk_compat_setsockopt(sk, level, optname,
						  optval, optlen);
	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_tcp_setsockopt);
#endif

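/* Return information about state of tcp endpoint in API format. */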
void tcp_get_info(const struct sock *sk, struct tcp_info *info)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	u32 now = tcp_time_stamp;

	memset(info, 0, sizeof(*info));

	info->tcpi_state = sk->sk_state;
	info->tcpi_ca_state = icsk->icsk_ca_state;
	info->tcpi_retransmits = icsk->icsk_retransmits;
	info->tcpi_probes = icsk->icsk_probes_out;
	info->tcpi_backoff = icsk->icsk_backoff;

	if (tp->rx_opt.tstamp_ok)
		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
	if (tcp_is_sack(tp))
		info->tcpi_options |= TCPI_OPT_SACK;
	if (tp->rx_opt.wscale_ok) {
		info->tcpi_options |= TCPI_OPT_WSCALE;
		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
	}

	if (tp->ecn_flags & TCP_ECN_OK)
		info->tcpi_options |= TCPI_OPT_ECN;
	if (tp->ecn_flags & TCP_ECN_SEEN)
		info->tcpi_options |= TCPI_OPT_ECN_SEEN;
	if (tp->syn_data_acked)
		info->tcpi_options |= TCPI_OPT_SYN_DATA;

	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
	info->tcpi_snd_mss = tp->mss_cache;
	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;

	if (sk->sk_state == TCP_LISTEN) {
		info->tcpi_unacked = sk->sk_ack_backlog;
		info->tcpi_sacked = sk->sk_max_ack_backlog;
	} else {
		info->tcpi_unacked = tp->packets_out;
		info->tcpi_sacked = tp->sacked_out;
	}
	info->tcpi_lost = tp->lost_out;
	info->tcpi_retrans = tp->retrans_out;
	info->tcpi_fackets = tp->fackets_out;

	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);

	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
	info->tcpi_snd_cwnd = tp->snd_cwnd;
	info->tcpi_advmss = tp->advmss;
	info->tcpi_reordering = tp->reordering;

	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
	info->tcpi_rcv_space = tp->rcvq_space.space;

	info->tcpi_total_retrans = tp->total_retrans;
}
EXPORT_SYMBOL_GPL(tcp_get_info);

2823static int do_tcp_getsockopt(struct sock *sk, int level,
2824 int optname, char __user *optval, int __user *optlen)
2825{
2826 struct inet_connection_sock *icsk = inet_csk(sk);
2827 struct tcp_sock *tp = tcp_sk(sk);
2828 int val, len;
2829
2830 if (get_user(len, optlen))
2831 return -EFAULT;

	/* Reject negative lengths before the unsigned clamp below;
	 * after min_t() casts to unsigned int this check could never
	 * fire.
	 */
	if (len < 0)
		return -EINVAL;

	len = min_t(unsigned int, len, sizeof(int));

	switch (optname) {
	case TCP_MAXSEG:
		val = tp->mss_cache;
		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
			val = tp->rx_opt.user_mss;
		if (tp->repair)
			val = tp->rx_opt.mss_clamp;
		break;
	case TCP_NODELAY:
		val = !!(tp->nonagle & TCP_NAGLE_OFF);
		break;
	case TCP_CORK:
		val = !!(tp->nonagle & TCP_NAGLE_CORK);
		break;
	case TCP_KEEPIDLE:
		val = keepalive_time_when(tp) / HZ;
		break;
	case TCP_KEEPINTVL:
		val = keepalive_intvl_when(tp) / HZ;
		break;
	case TCP_KEEPCNT:
		val = keepalive_probes(tp);
		break;
	case TCP_SYNCNT:
		val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
		break;
	case TCP_LINGER2:
		val = tp->linger2;
		if (val >= 0)
			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
		break;
	case TCP_DEFER_ACCEPT:
		val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
				      TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
		break;
	case TCP_WINDOW_CLAMP:
		val = tp->window_clamp;
		break;
	case TCP_INFO: {
		struct tcp_info info;

		if (get_user(len, optlen))
			return -EFAULT;

		tcp_get_info(sk, &info);

		len = min_t(unsigned int, len, sizeof(info));
		if (put_user(len, optlen))
			return -EFAULT;
		if (copy_to_user(optval, &info, len))
			return -EFAULT;
		return 0;
	}
	case TCP_QUICKACK:
		val = !icsk->icsk_ack.pingpong;
		break;

	case TCP_CONGESTION:
		if (get_user(len, optlen))
			return -EFAULT;
		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
		if (put_user(len, optlen))
			return -EFAULT;
		if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
			return -EFAULT;
		return 0;

	case TCP_COOKIE_TRANSACTIONS: {
		struct tcp_cookie_transactions ctd;
		struct tcp_cookie_values *cvp = tp->cookie_values;

		if (get_user(len, optlen))
			return -EFAULT;
		if (len < sizeof(ctd))
			return -EINVAL;

		memset(&ctd, 0, sizeof(ctd));
		ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ?
				   TCP_COOKIE_IN_ALWAYS : 0)
				| (tp->rx_opt.cookie_out_never ?
				   TCP_COOKIE_OUT_NEVER : 0);

		if (cvp != NULL) {
			ctd.tcpct_flags |= (cvp->s_data_in ?
					    TCP_S_DATA_IN : 0)
					 | (cvp->s_data_out ?
					    TCP_S_DATA_OUT : 0);

			ctd.tcpct_cookie_desired = cvp->cookie_desired;
			ctd.tcpct_s_data_desired = cvp->s_data_desired;

			memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0],
			       cvp->cookie_pair_size);
			ctd.tcpct_used = cvp->cookie_pair_size;
		}

		if (put_user(sizeof(ctd), optlen))
			return -EFAULT;
		if (copy_to_user(optval, &ctd, sizeof(ctd)))
			return -EFAULT;
		return 0;
	}
	case TCP_THIN_LINEAR_TIMEOUTS:
		val = tp->thin_lto;
		break;
	case TCP_THIN_DUPACK:
		val = tp->thin_dupack;
		break;

	case TCP_REPAIR:
		val = tp->repair;
		break;

	case TCP_REPAIR_QUEUE:
		if (tp->repair)
			val = tp->repair_queue;
		else
			return -EINVAL;
		break;

	case TCP_QUEUE_SEQ:
		if (tp->repair_queue == TCP_SEND_QUEUE)
			val = tp->write_seq;
		else if (tp->repair_queue == TCP_RECV_QUEUE)
			val = tp->rcv_nxt;
		else
			return -EINVAL;
		break;

	case TCP_USER_TIMEOUT:
		val = jiffies_to_msecs(icsk->icsk_user_timeout);
		break;
	case TCP_TIMESTAMP:
		val = tcp_time_stamp + tp->tsoffset;
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, &val, len))
		return -EFAULT;
	return 0;
}

int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
		   int __user *optlen)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (level != SOL_TCP)
		return icsk->icsk_af_ops->getsockopt(sk, level, optname,
						     optval, optlen);
	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(tcp_getsockopt);

#ifdef CONFIG_COMPAT
int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	if (level != SOL_TCP)
		return inet_csk_compat_getsockopt(sk, level, optname,
						  optval, optlen);
	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_tcp_getsockopt);
#endif

struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
				netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EINVAL);
	struct tcphdr *th;
	unsigned int thlen;
	unsigned int seq;
	__be32 delta;
	unsigned int oldlen;
	unsigned int mss;

	if (!pskb_may_pull(skb, sizeof(*th)))
		goto out;

	th = tcp_hdr(skb);
	thlen = th->doff * 4;
	if (thlen < sizeof(*th))
		goto out;

	if (!pskb_may_pull(skb, thlen))
		goto out;

	oldlen = (u16)~skb->len;
	__skb_pull(skb, thlen);

	mss = skb_shinfo(skb)->gso_size;
	if (unlikely(skb->len <= mss))
		goto out;

	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
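		/* Packet is from an untrusted source: validate the
		 * advertised GSO type and recompute gso_segs.
		 */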
		int type = skb_shinfo(skb)->gso_type;

		if (unlikely(type &
			     ~(SKB_GSO_TCPV4 |
			       SKB_GSO_DODGY |
			       SKB_GSO_TCP_ECN |
			       SKB_GSO_TCPV6 |
			       SKB_GSO_GRE |
			       0) ||
			     !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
			goto out;

		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);

		segs = NULL;
		goto out;
	}

	segs = skb_segment(skb, features);
	if (IS_ERR(segs))
		goto out;

	delta = htonl(oldlen + (thlen + mss));

	skb = segs;
	th = tcp_hdr(skb);
	seq = ntohl(th->seq);

	do {
		th->fin = th->psh = 0;

		th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
				       (__force u32)delta));
		if (skb->ip_summed != CHECKSUM_PARTIAL)
			th->check =
			     csum_fold(csum_partial(skb_transport_header(skb),
						    thlen, skb->csum));

		seq += mss;
		skb = skb->next;
		th = tcp_hdr(skb);

		th->seq = htonl(seq);
		th->cwr = 0;
	} while (skb->next);

	delta = htonl(oldlen + (skb->tail - skb->transport_header) +
		      skb->data_len);
	th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
				(__force u32)delta));
	if (skb->ip_summed != CHECKSUM_PARTIAL)
		th->check = csum_fold(csum_partial(skb_transport_header(skb),
						   thlen, skb->csum));

out:
	return segs;
}
EXPORT_SYMBOL(tcp_tso_segment);

struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct sk_buff *p;
	struct tcphdr *th;
	struct tcphdr *th2;
	unsigned int len;
	unsigned int thlen;
	__be32 flags;
	unsigned int mss = 1;
	unsigned int hlen;
	unsigned int off;
	int flush = 1;
	int i;

	off = skb_gro_offset(skb);
	hlen = off + sizeof(*th);
	th = skb_gro_header_fast(skb, off);
	if (skb_gro_header_hard(skb, hlen)) {
		th = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!th))
			goto out;
	}

	thlen = th->doff * 4;
	if (thlen < sizeof(*th))
		goto out;

	hlen = off + thlen;
	if (skb_gro_header_hard(skb, hlen)) {
		th = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!th))
			goto out;
	}

	skb_gro_pull(skb, thlen);

	len = skb_gro_len(skb);
	flags = tcp_flag_word(th);

	for (; (p = *head); head = &p->next) {
		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		th2 = tcp_hdr(p);

		if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}

		goto found;
	}

	goto out_check_final;

found:
	flush = NAPI_GRO_CB(p)->flush;
	flush |= (__force int)(flags & TCP_FLAG_CWR);
	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
	flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
	for (i = sizeof(*th); i < thlen; i += 4)
		flush |= *(u32 *)((u8 *)th + i) ^
			 *(u32 *)((u8 *)th2 + i);

	mss = skb_shinfo(p)->gso_size;

	flush |= (len - 1) >= mss;
	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);

	if (flush || skb_gro_receive(head, skb)) {
		mss = 1;
		goto out_check_final;
	}

	p = *head;
	th2 = tcp_hdr(p);
	tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);

out_check_final:
	flush = len < mss;
	flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
					TCP_FLAG_RST | TCP_FLAG_SYN |
					TCP_FLAG_FIN));

	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
		pp = head;

out:
	NAPI_GRO_CB(skb)->flush |= flush;

	return pp;
}
EXPORT_SYMBOL(tcp_gro_receive);

int tcp_gro_complete(struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);

	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
	skb->ip_summed = CHECKSUM_PARTIAL;

	skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;

	if (th->cwr)
		skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;

	return 0;
}
EXPORT_SYMBOL(tcp_gro_complete);

#ifdef CONFIG_TCP_MD5SIG
static unsigned long tcp_md5sig_users;
static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool;
static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);

static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu);

		if (p->md5_desc.tfm)
			crypto_free_hash(p->md5_desc.tfm);
	}
	free_percpu(pool);
}

void tcp_free_md5sig_pool(void)
{
	struct tcp_md5sig_pool __percpu *pool = NULL;

	spin_lock_bh(&tcp_md5sig_pool_lock);
	if (--tcp_md5sig_users == 0) {
		pool = tcp_md5sig_pool;
		tcp_md5sig_pool = NULL;
	}
	spin_unlock_bh(&tcp_md5sig_pool_lock);
	if (pool)
		__tcp_free_md5sig_pool(pool);
}
EXPORT_SYMBOL(tcp_free_md5sig_pool);

static struct tcp_md5sig_pool __percpu *
__tcp_alloc_md5sig_pool(struct sock *sk)
{
	int cpu;
	struct tcp_md5sig_pool __percpu *pool;

	pool = alloc_percpu(struct tcp_md5sig_pool);
	if (!pool)
		return NULL;

	for_each_possible_cpu(cpu) {
		struct crypto_hash *hash;

		hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
		if (IS_ERR_OR_NULL(hash))
			goto out_free;

		per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash;
	}
	return pool;
out_free:
	__tcp_free_md5sig_pool(pool);
	return NULL;
}

struct tcp_md5sig_pool __percpu *tcp_alloc_md5sig_pool(struct sock *sk)
{
	struct tcp_md5sig_pool __percpu *pool;
	bool alloc = false;

retry:
	spin_lock_bh(&tcp_md5sig_pool_lock);
	pool = tcp_md5sig_pool;
	if (tcp_md5sig_users++ == 0) {
		alloc = true;
		spin_unlock_bh(&tcp_md5sig_pool_lock);
	} else if (!pool) {
		tcp_md5sig_users--;
		spin_unlock_bh(&tcp_md5sig_pool_lock);
		cpu_relax();
		goto retry;
	} else
		spin_unlock_bh(&tcp_md5sig_pool_lock);

	if (alloc) {
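		/* We cannot hold the spinlock here: the per-cpu
		 * allocation below may sleep.
		 */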
		struct tcp_md5sig_pool __percpu *p;

		p = __tcp_alloc_md5sig_pool(sk);
		spin_lock_bh(&tcp_md5sig_pool_lock);
		if (!p) {
			tcp_md5sig_users--;
			spin_unlock_bh(&tcp_md5sig_pool_lock);
			return NULL;
		}
		pool = tcp_md5sig_pool;
		if (pool) {
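			/* Another caller installed a pool while we were
			 * allocating; free ours and use theirs.
			 */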
			spin_unlock_bh(&tcp_md5sig_pool_lock);
			__tcp_free_md5sig_pool(p);
		} else {
			tcp_md5sig_pool = pool = p;
			spin_unlock_bh(&tcp_md5sig_pool_lock);
		}
	}
	return pool;
}
EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
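
/**
 *	tcp_get_md5sig_pool - get md5sig_pool for this user
 *
 *	We use a percpu structure, so on success we return with preemption
 *	and BH disabled, ensuring no other thread or softirq can grab the
 *	same context.  Every successful call must be paired with
 *	tcp_put_md5sig_pool().
 *
 *	A minimal sketch of the call pattern the MD5 helpers below expect
 *	(error handling and the address-family pseudo-header step elided;
 *	th, skb, key and md5_hash are the caller's):
 *
 *		struct tcp_md5sig_pool *hp = tcp_get_md5sig_pool();
 *		if (hp) {
 *			crypto_hash_init(&hp->md5_desc);
 *			tcp_md5_hash_header(hp, th);
 *			tcp_md5_hash_skb_data(hp, skb, th->doff << 2);
 *			tcp_md5_hash_key(hp, key);
 *			crypto_hash_final(&hp->md5_desc, md5_hash);
 *			tcp_put_md5sig_pool();
 *		}
 */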
struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
{
	struct tcp_md5sig_pool __percpu *p;

	local_bh_disable();

	spin_lock(&tcp_md5sig_pool_lock);
	p = tcp_md5sig_pool;
	if (p)
		tcp_md5sig_users++;
	spin_unlock(&tcp_md5sig_pool_lock);

	if (p)
		return this_cpu_ptr(p);

	local_bh_enable();
	return NULL;
}
EXPORT_SYMBOL(tcp_get_md5sig_pool);

void tcp_put_md5sig_pool(void)
{
	local_bh_enable();
	tcp_free_md5sig_pool();
}
EXPORT_SYMBOL(tcp_put_md5sig_pool);

int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
			const struct tcphdr *th)
{
	struct scatterlist sg;
	struct tcphdr hdr;
	int err;
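
	/* We are not allowed to change tcphdr, make a local copy. */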
	memcpy(&hdr, th, sizeof(hdr));
	hdr.check = 0;
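
	/* Hash the base header only; options are not included. */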
	sg_init_one(&sg, &hdr, sizeof(hdr));
	err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr));
	return err;
}
EXPORT_SYMBOL(tcp_md5_hash_header);

int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
			  const struct sk_buff *skb, unsigned int header_len)
{
	struct scatterlist sg;
	const struct tcphdr *tp = tcp_hdr(skb);
	struct hash_desc *desc = &hp->md5_desc;
	unsigned int i;
	const unsigned int head_data_len = skb_headlen(skb) > header_len ?
					   skb_headlen(skb) - header_len : 0;
	const struct skb_shared_info *shi = skb_shinfo(skb);
	struct sk_buff *frag_iter;

	sg_init_table(&sg, 1);

	sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
	if (crypto_hash_update(desc, &sg, head_data_len))
		return 1;

	for (i = 0; i < shi->nr_frags; ++i) {
		const struct skb_frag_struct *f = &shi->frags[i];
		struct page *page = skb_frag_page(f);
		sg_set_page(&sg, page, skb_frag_size(f), f->page_offset);
		if (crypto_hash_update(desc, &sg, skb_frag_size(f)))
			return 1;
	}

	skb_walk_frags(skb, frag_iter)
		if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
			return 1;

	return 0;
}
EXPORT_SYMBOL(tcp_md5_hash_skb_data);

int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
{
	struct scatterlist sg;

	sg_init_one(&sg, key->key, key->keylen);
	return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
}
EXPORT_SYMBOL(tcp_md5_hash_key);

#endif
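
/* TCP Cookie Transactions secret management.
 *
 * Two secret values are kept concurrently, so secrets can roll over
 * without invalidating cookies issued just before the change.  The
 * four pointers below name the states a value passes through:
 *
 *   generating: produces new cookies, not yet used for primary
 *               verification (tcp_secret_generating != tcp_secret_primary);
 *   primary:    used for both generation and primary verification;
 *   retiring:   no longer generates, but cookies issued under it are
 *               still verified;
 *   secondary:  verified only as a fallback, and discarded after at
 *               most 2MSL (see TCP_SECRET_2MSL below).
 */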
struct tcp_cookie_secret {
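	/* Random 32-bit workspace the cookies are derived from:
	 * COOKIE_DIGEST_WORDS digest words followed by
	 * COOKIE_MESSAGE_WORDS message words (cf. tcp_cookie_work()),
	 * regenerated once "expires" (in jiffies) has passed.
	 */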
	u32				secrets[COOKIE_WORKSPACE_WORDS];
	unsigned long			expires;
};

#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL)
#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2)
#define TCP_SECRET_LIFE (HZ * 600)

static struct tcp_cookie_secret tcp_secret_one;
static struct tcp_cookie_secret tcp_secret_two;
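
/* Essentially a circular list, without dynamic allocation. */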
static struct tcp_cookie_secret *tcp_secret_generating;
static struct tcp_cookie_secret *tcp_secret_primary;
static struct tcp_cookie_secret *tcp_secret_retiring;
static struct tcp_cookie_secret *tcp_secret_secondary;

static DEFINE_SPINLOCK(tcp_secret_locker);
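
/* Select a pseudo-random word in the cookie workspace. */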
static inline u32 tcp_cookie_work(const u32 *ws, const int n)
{
	return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])];
}
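
/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator cookie.
 * Called in softirq context.
 * Returns: 0 for success.
 */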
int tcp_cookie_generator(u32 *bakery)
{
	unsigned long jiffy = jiffies;

	if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) {
		spin_lock_bh(&tcp_secret_locker);
		if (!time_after_eq(jiffy, tcp_secret_generating->expires)) {
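			/* refreshed by another */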
			memcpy(bakery,
			       &tcp_secret_generating->secrets[0],
			       COOKIE_WORKSPACE_WORDS);
		} else {
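			/* still needs refreshing */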
			get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS);
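
			/* The very first refresh after boot: primary and
			 * secondary still share their initial expiry, so
			 * mix in the nanosecond clock and give the new
			 * secret only a short (about 1MSL) life.
			 */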
			if (unlikely(tcp_secret_primary->expires ==
				     tcp_secret_secondary->expires)) {
				struct timespec tv;

				getnstimeofday(&tv);
				bakery[COOKIE_DIGEST_WORDS+0] ^=
					(u32)tv.tv_nsec;

				tcp_secret_secondary->expires = jiffy
					+ TCP_SECRET_1MSL
					+ (0x0f & tcp_cookie_work(bakery, 0));
			} else {
				tcp_secret_secondary->expires = jiffy
					+ TCP_SECRET_LIFE
					+ (0xff & tcp_cookie_work(bakery, 1));
				tcp_secret_primary->expires = jiffy
					+ TCP_SECRET_2MSL
					+ (0x1f & tcp_cookie_work(bakery, 2));
			}
			memcpy(&tcp_secret_secondary->secrets[0],
			       bakery, COOKIE_WORKSPACE_WORDS);

			rcu_assign_pointer(tcp_secret_generating,
					   tcp_secret_secondary);
			rcu_assign_pointer(tcp_secret_retiring,
					   tcp_secret_primary);
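			/* Neither call_rcu() nor synchronize_rcu() is
			 * needed here: retiring data is never freed, only
			 * reused after further (locked) pointer updates
			 * and an intervening quiet period.
			 */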
		}
		spin_unlock_bh(&tcp_secret_locker);
	} else {
		rcu_read_lock_bh();
		memcpy(bakery,
		       &rcu_dereference(tcp_secret_generating)->secrets[0],
		       COOKIE_WORKSPACE_WORDS);
		rcu_read_unlock_bh();
	}
	return 0;
}
EXPORT_SYMBOL(tcp_cookie_generator);

void tcp_done(struct sock *sk)
{
	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;

	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);

	tcp_set_state(sk, TCP_CLOSE);
	tcp_clear_xmit_timers(sk);
	if (req != NULL)
		reqsk_fastopen_remove(sk, req, false);

	sk->sk_shutdown = SHUTDOWN_MASK;

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_state_change(sk);
	else
		inet_csk_destroy_sock(sk);
}
EXPORT_SYMBOL_GPL(tcp_done);

extern struct tcp_congestion_ops tcp_reno;

static __initdata unsigned long thash_entries;
static int __init set_thash_entries(char *str)
{
	ssize_t ret;

	if (!str)
		return 0;

	ret = kstrtoul(str, 0, &thash_entries);
	if (ret)
		return 0;

	return 1;
}
__setup("thash_entries=", set_thash_entries);

void tcp_init_mem(struct net *net)
{
	unsigned long limit = nr_free_buffer_pages() / 8;
	limit = max(limit, 128UL);
	net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
	net->ipv4.sysctl_tcp_mem[1] = limit;
	net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
}

void __init tcp_init(void)
{
	struct sk_buff *skb = NULL;
	unsigned long limit;
	int max_rshare, max_wshare, cnt;
	unsigned int i;
	unsigned long jiffy = jiffies;

	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));

	percpu_counter_init(&tcp_sockets_allocated, 0);
	percpu_counter_init(&tcp_orphan_count, 0);
	tcp_hashinfo.bind_bucket_cachep =
		kmem_cache_create("tcp_bind_bucket",
				  sizeof(struct inet_bind_bucket), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
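
	/* Size and allocate the main established and bind bucket hash
	 * tables.
	 *
	 * The methodology is similar to that of the buffer cache.
	 */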
	tcp_hashinfo.ehash =
		alloc_large_system_hash("TCP established",
					sizeof(struct inet_ehash_bucket),
					thash_entries,
					17, /* one slot per 128 KB of memory */
					0,
					NULL,
					&tcp_hashinfo.ehash_mask,
					0,
					thash_entries ? 0 : 512 * 1024);
	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
	}
	if (inet_ehash_locks_alloc(&tcp_hashinfo))
		panic("TCP: failed to alloc ehash_locks");
	tcp_hashinfo.bhash =
		alloc_large_system_hash("TCP bind",
					sizeof(struct inet_bind_hashbucket),
					tcp_hashinfo.ehash_mask + 1,
					17, /* one slot per 128 KB of memory */
					0,
					&tcp_hashinfo.bhash_size,
					NULL,
					0,
					64 * 1024);
	tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
	}

	cnt = tcp_hashinfo.ehash_mask + 1;

	tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	sysctl_tcp_max_orphans = cnt / 2;
	sysctl_max_syn_backlog = max(128, cnt / 256);

	tcp_init_mem(&init_net);
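	/* Set per-socket limits to no more than 1/128 the pressure threshold */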
	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
	max_wshare = min(4UL*1024*1024, limit);
	max_rshare = min(6UL*1024*1024, limit);

	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
	sysctl_tcp_wmem[1] = 16*1024;
	sysctl_tcp_wmem[2] = max(64*1024, max_wshare);

	sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
	sysctl_tcp_rmem[1] = 87380;
	sysctl_tcp_rmem[2] = max(87380, max_rshare);

	pr_info("Hash tables configured (established %u bind %u)\n",
		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);

	tcp_metrics_init();

	tcp_register_congestion_control(&tcp_reno);

	memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
	memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets));
	tcp_secret_one.expires = jiffy; /* past due */
	tcp_secret_two.expires = jiffy; /* past due */
	tcp_secret_generating = &tcp_secret_one;
	tcp_secret_primary = &tcp_secret_one;
	tcp_secret_retiring = &tcp_secret_two;
	tcp_secret_secondary = &tcp_secret_two;
	tcp_tasklet_init();
}