/*
 * Implementation of the Transmission Control Protocol (TCP).
 */

248#define pr_fmt(fmt) "TCP: " fmt
249
250#include <linux/kernel.h>
251#include <linux/module.h>
252#include <linux/types.h>
253#include <linux/fcntl.h>
254#include <linux/poll.h>
255#include <linux/init.h>
256#include <linux/fs.h>
257#include <linux/skbuff.h>
258#include <linux/scatterlist.h>
259#include <linux/splice.h>
260#include <linux/net.h>
261#include <linux/socket.h>
262#include <linux/random.h>
263#include <linux/bootmem.h>
264#include <linux/highmem.h>
265#include <linux/swap.h>
266#include <linux/cache.h>
267#include <linux/err.h>
268#include <linux/crypto.h>
269#include <linux/time.h>
270#include <linux/slab.h>
271
272#include <net/icmp.h>
273#include <net/inet_common.h>
274#include <net/tcp.h>
275#include <net/xfrm.h>
276#include <net/ip.h>
277#include <net/netdma.h>
278#include <net/sock.h>
279
280#include <asm/uaccess.h>
281#include <asm/ioctls.h>
282
283int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
284
285struct percpu_counter tcp_orphan_count;
286EXPORT_SYMBOL_GPL(tcp_orphan_count);
287
288int sysctl_tcp_wmem[3] __read_mostly;
289int sysctl_tcp_rmem[3] __read_mostly;
290
291EXPORT_SYMBOL(sysctl_tcp_rmem);
292EXPORT_SYMBOL(sysctl_tcp_wmem);
293
294atomic_long_t tcp_memory_allocated;
295EXPORT_SYMBOL(tcp_memory_allocated);
296
/*
 * Current number of TCP sockets.
 */
300struct percpu_counter tcp_sockets_allocated;
301EXPORT_SYMBOL(tcp_sockets_allocated);

/* Context passed through tcp_read_sock() when splicing socket data to a pipe. */
306struct tcp_splice_state {
307 struct pipe_inode_info *pipe;
308 size_t len;
309 unsigned int flags;
310};

/*
 * TCP memory pressure flag.  Updated from several contexts without any
 * locking; strict consistency is not needed, it is enough that the flag
 * exists (see tcp_enter_memory_pressure()).
 */
318int tcp_memory_pressure __read_mostly;
319EXPORT_SYMBOL(tcp_memory_pressure);
320
321void tcp_enter_memory_pressure(struct sock *sk)
322{
323 if (!tcp_memory_pressure) {
324 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
325 tcp_memory_pressure = 1;
326 }
327}
328EXPORT_SYMBOL(tcp_enter_memory_pressure);
329
/* Convert seconds to retransmits based on initial and max timeout */
331static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
332{
333 u8 res = 0;
334
335 if (seconds > 0) {
336 int period = timeout;
337
338 res = 1;
339 while (seconds > period && res < 255) {
340 res++;
341 timeout <<= 1;
342 if (timeout > rto_max)
343 timeout = rto_max;
344 period += timeout;
345 }
346 }
347 return res;
348}
349
/* Convert retransmits to seconds based on initial and max timeout */
351static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
352{
353 int period = 0;
354
355 if (retrans > 0) {
356 period = timeout;
357 while (--retrans) {
358 timeout <<= 1;
359 if (timeout > rto_max)
360 timeout = rto_max;
361 period += timeout;
362 }
363 }
364 return period;
365}
366
/* Address-family independent initialization for a tcp_sock.
 *
 * NOTE: much of the socket is already zeroed by sk_alloc(), so only
 * non-zero fields need to be set up here.
 */
372void tcp_init_sock(struct sock *sk)
373{
374 struct inet_connection_sock *icsk = inet_csk(sk);
375 struct tcp_sock *tp = tcp_sk(sk);
376
377 skb_queue_head_init(&tp->out_of_order_queue);
378 tcp_init_xmit_timers(sk);
379 tcp_prequeue_init(tp);
380 INIT_LIST_HEAD(&tp->tsq_node);
381
382 icsk->icsk_rto = TCP_TIMEOUT_INIT;
383 tp->mdev = TCP_TIMEOUT_INIT;
384
385
386
387
388
389
390 tp->snd_cwnd = TCP_INIT_CWND;
391
392
393
394
395 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
396 tp->snd_cwnd_clamp = ~0;
397 tp->mss_cache = TCP_MSS_DEFAULT;
398
399 tp->reordering = sysctl_tcp_reordering;
400 tcp_enable_early_retrans(tp);
401 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
402
403 tp->tsoffset = 0;
404
405 sk->sk_state = TCP_CLOSE;
406
407 sk->sk_write_space = sk_stream_write_space;
408 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
409
410 icsk->icsk_sync_mss = tcp_sync_mss;
411
412
413
414
415
416 sk->sk_sndbuf = sysctl_tcp_wmem[1];
417 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
418
419 local_bh_disable();
420 sock_update_memcg(sk);
421 sk_sockets_allocated_inc(sk);
422 local_bh_enable();
423}
424EXPORT_SYMBOL(tcp_init_sock);
425
/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 */
433unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
434{
435 unsigned int mask;
436 struct sock *sk = sock->sk;
437 const struct tcp_sock *tp = tcp_sk(sk);
438
439 sock_poll_wait(file, sk_sleep(sk), wait);
440 if (sk->sk_state == TCP_LISTEN)
441 return inet_csk_listen_poll(sk);
	/* Socket is not locked.  We are protected from async events by the
	 * poll logic; correct handling of state changes made by other
	 * threads is impossible in any case.
	 */
448 mask = 0;
449
	/*
	 * POLLHUP is set only when the connection is closed in both
	 * directions (sk_shutdown == SHUTDOWN_MASK or the socket has reached
	 * TCP_CLOSE).  A receive-side shutdown alone is reported as
	 * POLLIN | POLLRDNORM | POLLRDHUP so that writing is still possible.
	 */
477 if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
478 mask |= POLLHUP;
479 if (sk->sk_shutdown & RCV_SHUTDOWN)
480 mask |= POLLIN | POLLRDNORM | POLLRDHUP;
481
482
483 if (sk->sk_state != TCP_SYN_SENT &&
484 (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
485 int target = sock_rcvlowat(sk, 0, INT_MAX);
486
487 if (tp->urg_seq == tp->copied_seq &&
488 !sock_flag(sk, SOCK_URGINLINE) &&
489 tp->urg_data)
490 target++;
491
492
493
494
495 if (tp->rcv_nxt - tp->copied_seq >= target)
496 mask |= POLLIN | POLLRDNORM;
497
498 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
499 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
500 mask |= POLLOUT | POLLWRNORM;
501 } else {
502 set_bit(SOCK_ASYNC_NOSPACE,
503 &sk->sk_socket->flags);
504 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker: memory might have been freed
				 * after the wspace test but before
				 * SOCK_NOSPACE was set, so re-check to avoid
				 * losing the wakeup.
				 */
510 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
511 mask |= POLLOUT | POLLWRNORM;
512 }
513 } else
514 mask |= POLLOUT | POLLWRNORM;
515
516 if (tp->urg_data & TCP_URG_VALID)
517 mask |= POLLPRI;
518 }
519
520 smp_rmb();
521 if (sk->sk_err)
522 mask |= POLLERR;
523
524 return mask;
525}
526EXPORT_SYMBOL(tcp_poll);
527
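/* Handle TCP-specific ioctls: SIOCINQ, SIOCATMARK, SIOCOUTQ and SIOCOUTQNSD. */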
528int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
529{
530 struct tcp_sock *tp = tcp_sk(sk);
531 int answ;
532 bool slow;
533
534 switch (cmd) {
535 case SIOCINQ:
536 if (sk->sk_state == TCP_LISTEN)
537 return -EINVAL;
538
539 slow = lock_sock_fast(sk);
540 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
541 answ = 0;
542 else if (sock_flag(sk, SOCK_URGINLINE) ||
543 !tp->urg_data ||
544 before(tp->urg_seq, tp->copied_seq) ||
545 !before(tp->urg_seq, tp->rcv_nxt)) {
546
547 answ = tp->rcv_nxt - tp->copied_seq;
548
549
550 if (answ && sock_flag(sk, SOCK_DONE))
551 answ--;
552 } else
553 answ = tp->urg_seq - tp->copied_seq;
554 unlock_sock_fast(sk, slow);
555 break;
556 case SIOCATMARK:
557 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
558 break;
559 case SIOCOUTQ:
560 if (sk->sk_state == TCP_LISTEN)
561 return -EINVAL;
562
563 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
564 answ = 0;
565 else
566 answ = tp->write_seq - tp->snd_una;
567 break;
568 case SIOCOUTQNSD:
569 if (sk->sk_state == TCP_LISTEN)
570 return -EINVAL;
571
572 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
573 answ = 0;
574 else
575 answ = tp->write_seq - tp->snd_nxt;
576 break;
577 default:
578 return -ENOIOCTLCMD;
579 }
580
581 return put_user(answ, (int __user *)arg);
582}
583EXPORT_SYMBOL(tcp_ioctl);
584
585static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
586{
587 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
588 tp->pushed_seq = tp->write_seq;
589}
590
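/* A push is forced once more than half of the largest window ever advertised
 * by the peer has been written since the last pushed sequence.
 */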
591static inline bool forced_push(const struct tcp_sock *tp)
592{
593 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
594}
595
596static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
597{
598 struct tcp_sock *tp = tcp_sk(sk);
599 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
600
601 skb->csum = 0;
602 tcb->seq = tcb->end_seq = tp->write_seq;
603 tcb->tcp_flags = TCPHDR_ACK;
604 tcb->sacked = 0;
605 skb_header_release(skb);
606 tcp_add_write_queue_tail(sk, skb);
607 sk->sk_wmem_queued += skb->truesize;
608 sk_mem_charge(sk, skb->truesize);
609 if (tp->nonagle & TCP_NAGLE_PUSH)
610 tp->nonagle &= ~TCP_NAGLE_PUSH;
611}
612
613static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
614{
615 if (flags & MSG_OOB)
616 tp->snd_up = tp->write_seq;
617}
618
619static inline void tcp_push(struct sock *sk, int flags, int mss_now,
620 int nonagle)
621{
622 if (tcp_send_head(sk)) {
623 struct tcp_sock *tp = tcp_sk(sk);
624
625 if (!(flags & MSG_MORE) || forced_push(tp))
626 tcp_mark_push(tp, tcp_write_queue_tail(sk));
627
628 tcp_mark_urg(tp, flags);
629 __tcp_push_pending_frames(sk, mss_now,
630 (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
631 }
632}
633
634static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
635 unsigned int offset, size_t len)
636{
637 struct tcp_splice_state *tss = rd_desc->arg.data;
638 int ret;
639
640 ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
641 tss->flags);
642 if (ret > 0)
643 rd_desc->count -= ret;
644 return ret;
645}
646
647static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
648{
649
650 read_descriptor_t rd_desc = {
651 .arg.data = tss,
652 .count = tss->len,
653 };
654
655 return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
656}
657
/**
 *  tcp_splice_read - splice data from TCP socket to a pipe
 *  @sock:	socket to splice from
 *  @ppos:	position (not valid)
 *  @pipe:	pipe to splice to
 *  @len:	number of bytes to splice
 *  @flags:	splice modifier flags
 *
 *  Description:
 *    Will read pages from given socket and fill them into a pipe.
 *
 **/
670ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
671 struct pipe_inode_info *pipe, size_t len,
672 unsigned int flags)
673{
674 struct sock *sk = sock->sk;
675 struct tcp_splice_state tss = {
676 .pipe = pipe,
677 .len = len,
678 .flags = flags,
679 };
680 long timeo;
681 ssize_t spliced;
682 int ret;
683
684 sock_rps_record_flow(sk);
685
686
687
688 if (unlikely(*ppos))
689 return -ESPIPE;
690
691 ret = spliced = 0;
692
693 lock_sock(sk);
694
695 timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
696 while (tss.len) {
697 ret = __tcp_splice_read(sk, &tss);
698 if (ret < 0)
699 break;
700 else if (!ret) {
701 if (spliced)
702 break;
703 if (sock_flag(sk, SOCK_DONE))
704 break;
705 if (sk->sk_err) {
706 ret = sock_error(sk);
707 break;
708 }
709 if (sk->sk_shutdown & RCV_SHUTDOWN)
710 break;
711 if (sk->sk_state == TCP_CLOSE) {
				/*
				 * This occurs when user tried to read
				 * from never connected socket.
				 */
716 if (!sock_flag(sk, SOCK_DONE))
717 ret = -ENOTCONN;
718 break;
719 }
720 if (!timeo) {
721 ret = -EAGAIN;
722 break;
723 }
724 sk_wait_data(sk, &timeo);
725 if (signal_pending(current)) {
726 ret = sock_intr_errno(timeo);
727 break;
728 }
729 continue;
730 }
731 tss.len -= ret;
732 spliced += ret;
733
734 if (!timeo)
735 break;
736 release_sock(sk);
737 lock_sock(sk);
738
739 if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
740 (sk->sk_shutdown & RCV_SHUTDOWN) ||
741 signal_pending(current))
742 break;
743 }
744
745 release_sock(sk);
746
747 if (spliced)
748 return spliced;
749
750 return ret;
751}
752EXPORT_SYMBOL(tcp_splice_read);
753
754struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
755{
756 struct sk_buff *skb;

	/* The TCP header must be at least 32-bit aligned. */
759 size = ALIGN(size, 4);
760
761 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
762 if (skb) {
763 if (sk_wmem_schedule(sk, skb->truesize)) {
764 skb_reserve(skb, sk->sk_prot->max_header);
			/*
			 * Make sure that we have exactly size bytes
			 * available to the caller, no more, no less.
			 */
769 skb->reserved_tailroom = skb->end - skb->tail - size;
770 return skb;
771 }
772 __kfree_skb(skb);
773 } else {
774 sk->sk_prot->enter_memory_pressure(sk);
775 sk_stream_moderate_sndbuf(sk);
776 }
777 return NULL;
778}
779
780static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
781 int large_allowed)
782{
783 struct tcp_sock *tp = tcp_sk(sk);
784 u32 xmit_size_goal, old_size_goal;
785
786 xmit_size_goal = mss_now;
787
788 if (large_allowed && sk_can_gso(sk)) {
789 xmit_size_goal = ((sk->sk_gso_max_size - 1) -
790 inet_csk(sk)->icsk_af_ops->net_header_len -
791 inet_csk(sk)->icsk_ext_hdr_len -
792 tp->tcp_header_len);
793
794
795 xmit_size_goal = min_t(u32, xmit_size_goal,
796 sysctl_tcp_limit_output_bytes >> 1);
797
798 xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
799
800
801 old_size_goal = tp->xmit_size_goal_segs * mss_now;
802
803 if (likely(old_size_goal <= xmit_size_goal &&
804 old_size_goal + mss_now > xmit_size_goal)) {
805 xmit_size_goal = old_size_goal;
806 } else {
807 tp->xmit_size_goal_segs =
808 min_t(u16, xmit_size_goal / mss_now,
809 sk->sk_gso_max_segs);
810 xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
811 }
812 }
813
814 return max(xmit_size_goal, mss_now);
815}
816
817static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
818{
819 int mss_now;
820
821 mss_now = tcp_current_mss(sk);
822 *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
823
824 return mss_now;
825}
826
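/* Core of tcp_sendpage(): append page fragments to the write queue,
 * coalescing with the tail skb when possible, and push the queued data out.
 */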
827static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
828 size_t size, int flags)
829{
830 struct tcp_sock *tp = tcp_sk(sk);
831 int mss_now, size_goal;
832 int err;
833 ssize_t copied;
834 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
835
	/* Wait for a connection to finish.  One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before the
	 * connection is fully established.
	 */
840 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
841 !tcp_passive_fastopen(sk)) {
842 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
843 goto out_err;
844 }
845
846 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
847
848 mss_now = tcp_send_mss(sk, &size_goal, flags);
849 copied = 0;
850
851 err = -EPIPE;
852 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
853 goto out_err;
854
855 while (size > 0) {
856 struct sk_buff *skb = tcp_write_queue_tail(sk);
857 int copy, i;
858 bool can_coalesce;
859
860 if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
861new_segment:
862 if (!sk_stream_memory_free(sk))
863 goto wait_for_sndbuf;
864
865 skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
866 if (!skb)
867 goto wait_for_memory;
868
869 skb_entail(sk, skb);
870 copy = size_goal;
871 }
872
873 if (copy > size)
874 copy = size;
875
876 i = skb_shinfo(skb)->nr_frags;
877 can_coalesce = skb_can_coalesce(skb, i, page, offset);
878 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
879 tcp_mark_push(tp, skb);
880 goto new_segment;
881 }
882 if (!sk_wmem_schedule(sk, copy))
883 goto wait_for_memory;
884
885 if (can_coalesce) {
886 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
887 } else {
888 get_page(page);
889 skb_fill_page_desc(skb, i, page, offset, copy);
890 }
891 skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
892
893 skb->len += copy;
894 skb->data_len += copy;
895 skb->truesize += copy;
896 sk->sk_wmem_queued += copy;
897 sk_mem_charge(sk, copy);
898 skb->ip_summed = CHECKSUM_PARTIAL;
899 tp->write_seq += copy;
900 TCP_SKB_CB(skb)->end_seq += copy;
901 skb_shinfo(skb)->gso_segs = 0;
902
903 if (!copied)
904 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
905
906 copied += copy;
907 offset += copy;
908 if (!(size -= copy))
909 goto out;
910
911 if (skb->len < size_goal || (flags & MSG_OOB))
912 continue;
913
914 if (forced_push(tp)) {
915 tcp_mark_push(tp, skb);
916 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
917 } else if (skb == tcp_send_head(sk))
918 tcp_push_one(sk, mss_now);
919 continue;
920
921wait_for_sndbuf:
922 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
923wait_for_memory:
924 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
925
926 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
927 goto do_error;
928
929 mss_now = tcp_send_mss(sk, &size_goal, flags);
930 }
931
932out:
933 if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
934 tcp_push(sk, flags, mss_now, tp->nonagle);
935 return copied;
936
937do_error:
938 if (copied)
939 goto out;
940out_err:
941 return sk_stream_error(sk, flags, err);
942}
943
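/* sendpage() entry point; falls back to sock_no_sendpage() when the route
 * does not support scatter-gather or checksum offload.
 */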
944int tcp_sendpage(struct sock *sk, struct page *page, int offset,
945 size_t size, int flags)
946{
947 ssize_t res;
948
949 if (!(sk->sk_route_caps & NETIF_F_SG) ||
950 !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
951 return sock_no_sendpage(sk->sk_socket, page, offset, size,
952 flags);
953
954 lock_sock(sk);
955 res = do_tcp_sendpages(sk, page, offset, size, flags);
956 release_sock(sk);
957 return res;
958}
959EXPORT_SYMBOL(tcp_sendpage);
960
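/* Size of the linear area to allocate for a new sendmsg() skb, depending on
 * whether the device can do scatter-gather and GSO.
 */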
961static inline int select_size(const struct sock *sk, bool sg)
962{
963 const struct tcp_sock *tp = tcp_sk(sk);
964 int tmp = tp->mss_cache;
965
966 if (sg) {
967 if (sk_can_gso(sk)) {
968
969
970
971 tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
972 } else {
973 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
974
975 if (tmp >= pgbreak &&
976 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
977 tmp = pgbreak;
978 }
979 }
980
981 return tmp;
982}
983
984void tcp_free_fastopen_req(struct tcp_sock *tp)
985{
986 if (tp->fastopen_req != NULL) {
987 kfree(tp->fastopen_req);
988 tp->fastopen_req = NULL;
989 }
990}
991
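/* MSG_FASTOPEN: attach @msg to a Fast Open request and issue the connect;
 * *size is set to the number of bytes copied along with the SYN.
 */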
992static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size)
993{
994 struct tcp_sock *tp = tcp_sk(sk);
995 int err, flags;
996
997 if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
998 return -EOPNOTSUPP;
999 if (tp->fastopen_req != NULL)
1000 return -EALREADY;
1001
1002 tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
1003 sk->sk_allocation);
1004 if (unlikely(tp->fastopen_req == NULL))
1005 return -ENOBUFS;
1006 tp->fastopen_req->data = msg;
1007
1008 flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
1009 err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
1010 msg->msg_namelen, flags);
1011 *size = tp->fastopen_req->copied;
1012 tcp_free_fastopen_req(tp);
1013 return err;
1014}
1015
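/* Copy user data into the write queue, growing or coalescing skbs up to the
 * current size goal, and transmit whatever can be sent.
 */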
1016int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1017 size_t size)
1018{
1019 struct iovec *iov;
1020 struct tcp_sock *tp = tcp_sk(sk);
1021 struct sk_buff *skb;
1022 int iovlen, flags, err, copied = 0;
1023 int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
1024 bool sg;
1025 long timeo;
1026
1027 lock_sock(sk);
1028
1029 flags = msg->msg_flags;
1030 if (flags & MSG_FASTOPEN) {
1031 err = tcp_sendmsg_fastopen(sk, msg, &copied_syn);
1032 if (err == -EINPROGRESS && copied_syn > 0)
1033 goto out;
1034 else if (err)
1035 goto out_err;
1036 offset = copied_syn;
1037 }
1038
1039 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1040
	/* Wait for a connection to finish.  One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before the
	 * connection is fully established.
	 */
1045 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1046 !tcp_passive_fastopen(sk)) {
1047 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
1048 goto do_error;
1049 }
1050
1051 if (unlikely(tp->repair)) {
1052 if (tp->repair_queue == TCP_RECV_QUEUE) {
1053 copied = tcp_send_rcvq(sk, msg, size);
1054 goto out;
1055 }
1056
1057 err = -EINVAL;
1058 if (tp->repair_queue == TCP_NO_QUEUE)
1059 goto out_err;
1060
1061
1062 }
1063
1064
1065 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1066
1067 mss_now = tcp_send_mss(sk, &size_goal, flags);
1068
1069
1070 iovlen = msg->msg_iovlen;
1071 iov = msg->msg_iov;
1072 copied = 0;
1073
1074 err = -EPIPE;
1075 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1076 goto out_err;
1077
1078 sg = !!(sk->sk_route_caps & NETIF_F_SG);
1079
1080 while (--iovlen >= 0) {
1081 size_t seglen = iov->iov_len;
1082 unsigned char __user *from = iov->iov_base;
1083
1084 iov++;
1085 if (unlikely(offset > 0)) {
1086 if (offset >= seglen) {
1087 offset -= seglen;
1088 continue;
1089 }
1090 seglen -= offset;
1091 from += offset;
1092 offset = 0;
1093 }
1094
1095 while (seglen > 0) {
1096 int copy = 0;
1097 int max = size_goal;
1098
1099 skb = tcp_write_queue_tail(sk);
1100 if (tcp_send_head(sk)) {
1101 if (skb->ip_summed == CHECKSUM_NONE)
1102 max = mss_now;
1103 copy = max - skb->len;
1104 }
1105
1106 if (copy <= 0) {
1107new_segment:
1108
1109
1110
1111 if (!sk_stream_memory_free(sk))
1112 goto wait_for_sndbuf;
1113
1114 skb = sk_stream_alloc_skb(sk,
1115 select_size(sk, sg),
1116 sk->sk_allocation);
1117 if (!skb)
1118 goto wait_for_memory;
1119
1120
1121
1122
1123 if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
1124 skb->ip_summed = CHECKSUM_PARTIAL;
1125
1126 skb_entail(sk, skb);
1127 copy = size_goal;
1128 max = size_goal;
1129 }
1130
1131
1132 if (copy > seglen)
1133 copy = seglen;
1134
1135
1136 if (skb_availroom(skb) > 0) {
1137
1138 copy = min_t(int, copy, skb_availroom(skb));
1139 err = skb_add_data_nocache(sk, skb, from, copy);
1140 if (err)
1141 goto do_fault;
1142 } else {
1143 bool merge = true;
1144 int i = skb_shinfo(skb)->nr_frags;
1145 struct page_frag *pfrag = sk_page_frag(sk);
1146
1147 if (!sk_page_frag_refill(sk, pfrag))
1148 goto wait_for_memory;
1149
1150 if (!skb_can_coalesce(skb, i, pfrag->page,
1151 pfrag->offset)) {
1152 if (i == MAX_SKB_FRAGS || !sg) {
1153 tcp_mark_push(tp, skb);
1154 goto new_segment;
1155 }
1156 merge = false;
1157 }
1158
1159 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1160
1161 if (!sk_wmem_schedule(sk, copy))
1162 goto wait_for_memory;
1163
1164 err = skb_copy_to_page_nocache(sk, from, skb,
1165 pfrag->page,
1166 pfrag->offset,
1167 copy);
1168 if (err)
1169 goto do_error;
1170
1171
1172 if (merge) {
1173 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1174 } else {
1175 skb_fill_page_desc(skb, i, pfrag->page,
1176 pfrag->offset, copy);
1177 get_page(pfrag->page);
1178 }
1179 pfrag->offset += copy;
1180 }
1181
1182 if (!copied)
1183 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1184
1185 tp->write_seq += copy;
1186 TCP_SKB_CB(skb)->end_seq += copy;
1187 skb_shinfo(skb)->gso_segs = 0;
1188
1189 from += copy;
1190 copied += copy;
1191 if ((seglen -= copy) == 0 && iovlen == 0)
1192 goto out;
1193
1194 if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
1195 continue;
1196
1197 if (forced_push(tp)) {
1198 tcp_mark_push(tp, skb);
1199 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1200 } else if (skb == tcp_send_head(sk))
1201 tcp_push_one(sk, mss_now);
1202 continue;
1203
1204wait_for_sndbuf:
1205 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1206wait_for_memory:
1207 if (copied)
1208 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1209
1210 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
1211 goto do_error;
1212
1213 mss_now = tcp_send_mss(sk, &size_goal, flags);
1214 }
1215 }
1216
1217out:
1218 if (copied)
1219 tcp_push(sk, flags, mss_now, tp->nonagle);
1220 release_sock(sk);
1221 return copied + copied_syn;
1222
1223do_fault:
1224 if (!skb->len) {
1225 tcp_unlink_write_queue(skb, sk);
1226
1227
1228
1229 tcp_check_send_head(sk, skb);
1230 sk_wmem_free_skb(sk, skb);
1231 }
1232
1233do_error:
1234 if (copied + copied_syn)
1235 goto out;
1236out_err:
1237 err = sk_stream_error(sk, flags, err);
1238 release_sock(sk);
1239 return err;
1240}
1241EXPORT_SYMBOL(tcp_sendmsg);
1242
/*
 *	Handle reading urgent (out-of-band) data.  Follows BSD semantics:
 *	never blocks, and reports an error when no urgent data is pending.
 */
1248static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1249{
1250 struct tcp_sock *tp = tcp_sk(sk);
1251
1252
1253 if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1254 tp->urg_data == TCP_URG_READ)
1255 return -EINVAL;
1256
1257 if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1258 return -ENOTCONN;
1259
1260 if (tp->urg_data & TCP_URG_VALID) {
1261 int err = 0;
1262 char c = tp->urg_data;
1263
1264 if (!(flags & MSG_PEEK))
1265 tp->urg_data = TCP_URG_READ;
1266
1267
1268 msg->msg_flags |= MSG_OOB;
1269
1270 if (len > 0) {
1271 if (!(flags & MSG_TRUNC))
1272 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1273 len = 1;
1274 } else
1275 msg->msg_flags |= MSG_TRUNC;
1276
1277 return err ? -EFAULT : len;
1278 }
1279
1280 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1281 return 0;
1282
1283
1284
1285
1286
1287
1288
1289 return -EAGAIN;
1290}
1291
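/* Socket repair: copy the contents of the write queue into @msg without
 * consuming it.
 */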
1292static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1293{
1294 struct sk_buff *skb;
1295 int copied = 0, err = 0;
1296
1297
1298
1299 skb_queue_walk(&sk->sk_write_queue, skb) {
1300 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len);
1301 if (err)
1302 break;
1303
1304 copied += skb->len;
1305 }
1306
1307 return err ?: copied;
1308}
1309
/* Clean up the receive buffer for full frames taken by the user,
 * then send an ACK if necessary.  COPIED is the number of bytes
 * tcp_recvmsg has given to the user so far; it speeds up the
 * calculation of whether or not we must ACK for the sake of
 * a window update.
 */
1316void tcp_cleanup_rbuf(struct sock *sk, int copied)
1317{
1318 struct tcp_sock *tp = tcp_sk(sk);
1319 bool time_to_ack = false;
1320
1321 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1322
1323 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1324 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1325 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1326
1327 if (inet_csk_ack_scheduled(sk)) {
1328 const struct inet_connection_sock *icsk = inet_csk(sk);
1329
1330
1331 if (icsk->icsk_ack.blocked ||
1332
1333 tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1334
1335
1336
1337
1338
1339
1340 (copied > 0 &&
1341 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1342 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1343 !icsk->icsk_ack.pingpong)) &&
1344 !atomic_read(&sk->sk_rmem_alloc)))
1345 time_to_ack = true;
1346 }
1347
1348
1349
1350
1351
1352
1353
1354 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1355 __u32 rcv_window_now = tcp_receive_window(tp);
1356
1357
1358 if (2*rcv_window_now <= tp->window_clamp) {
1359 __u32 new_window = __tcp_select_window(sk);
1360
1361
1362
1363
1364
1365
1366 if (new_window && new_window >= 2 * rcv_window_now)
1367 time_to_ack = true;
1368 }
1369 }
1370 if (time_to_ack)
1371 tcp_send_ack(sk);
1372}
1373
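/* Run the backlog receive handler over every skb accumulated on the
 * prequeue while the socket was being read.
 */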
1374static void tcp_prequeue_process(struct sock *sk)
1375{
1376 struct sk_buff *skb;
1377 struct tcp_sock *tp = tcp_sk(sk);
1378
1379 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
1380
1381
1382
1383 local_bh_disable();
1384 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1385 sk_backlog_rcv(sk, skb);
1386 local_bh_enable();
1387
1388
1389 tp->ucopy.memory = 0;
1390}
1391
1392#ifdef CONFIG_NET_DMA
1393static void tcp_service_net_dma(struct sock *sk, bool wait)
1394{
1395 dma_cookie_t done, used;
1396 dma_cookie_t last_issued;
1397 struct tcp_sock *tp = tcp_sk(sk);
1398
1399 if (!tp->ucopy.dma_chan)
1400 return;
1401
1402 last_issued = tp->ucopy.dma_cookie;
1403 dma_async_issue_pending(tp->ucopy.dma_chan);
1404
1405 do {
1406 if (dma_async_is_tx_complete(tp->ucopy.dma_chan,
1407 last_issued, &done,
1408 &used) == DMA_SUCCESS) {
1409
1410 __skb_queue_purge(&sk->sk_async_wait_queue);
1411 break;
1412 } else {
1413 struct sk_buff *skb;
1414 while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
1415 (dma_async_is_complete(skb->dma_cookie, done,
1416 used) == DMA_SUCCESS)) {
1417 __skb_dequeue(&sk->sk_async_wait_queue);
1418 kfree_skb(skb);
1419 }
1420 }
1421 } while (wait);
1422}
1423#endif
1424
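/* Find the skb in the receive queue that contains sequence number @seq and
 * return the offset of @seq within it, discarding any skbs that have already
 * been completely consumed.
 */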
1425static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1426{
1427 struct sk_buff *skb;
1428 u32 offset;
1429
1430 while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
1431 offset = seq - TCP_SKB_CB(skb)->seq;
1432 if (tcp_hdr(skb)->syn)
1433 offset--;
1434 if (offset < skb->len || tcp_hdr(skb)->fin) {
1435 *off = offset;
1436 return skb;
1437 }
1438
1439
1440
1441
1442 sk_eat_skb(sk, skb, false);
1443 }
1444 return NULL;
1445}
1446
/*
 * This routine provides an alternative to tcp_recvmsg() for routines
 * that would like to handle copying from skbuffs directly in 'sendfile'
 * fashion.
 * Note:
 *	- It is assumed that the socket was locked by the caller.
 *	- The routine does not block.
 *	- At present, there is no support for reading OOB data
 *	  or for 'peeking' the socket using this routine
 *	  (although both would be easy to implement).
 */
1458int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1459 sk_read_actor_t recv_actor)
1460{
1461 struct sk_buff *skb;
1462 struct tcp_sock *tp = tcp_sk(sk);
1463 u32 seq = tp->copied_seq;
1464 u32 offset;
1465 int copied = 0;
1466
1467 if (sk->sk_state == TCP_LISTEN)
1468 return -ENOTCONN;
1469 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1470 if (offset < skb->len) {
1471 int used;
1472 size_t len;
1473
1474 len = skb->len - offset;
1475
1476 if (tp->urg_data) {
1477 u32 urg_offset = tp->urg_seq - seq;
1478 if (urg_offset < len)
1479 len = urg_offset;
1480 if (!len)
1481 break;
1482 }
1483 used = recv_actor(desc, skb, offset, len);
1484 if (used <= 0) {
1485 if (!copied)
1486 copied = used;
1487 break;
1488 } else if (used <= len) {
1489 seq += used;
1490 copied += used;
1491 offset += used;
1492 }
1493
1494
1495
1496
1497
1498 skb = tcp_recv_skb(sk, seq - 1, &offset);
1499 if (!skb)
1500 break;
1501
1502
1503
1504 if (offset + 1 != skb->len)
1505 continue;
1506 }
1507 if (tcp_hdr(skb)->fin) {
1508 sk_eat_skb(sk, skb, false);
1509 ++seq;
1510 break;
1511 }
1512 sk_eat_skb(sk, skb, false);
1513 if (!desc->count)
1514 break;
1515 tp->copied_seq = seq;
1516 }
1517 tp->copied_seq = seq;
1518
1519 tcp_rcv_space_adjust(sk);
1520
1521
1522 if (copied > 0) {
1523 tcp_recv_skb(sk, seq, &offset);
1524 tcp_cleanup_rbuf(sk, copied);
1525 }
1526 return copied;
1527}
1528EXPORT_SYMBOL(tcp_read_sock);
1529
/*
 *	Copy received data from the socket into the user buffer.
 *
 *	The socket is locked for the duration of the copy; the routine
 *	handles urgent data, MSG_PEEK, the prequeue and (optionally)
 *	NET_DMA offloaded copies.
 */
1538int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1539 size_t len, int nonblock, int flags, int *addr_len)
1540{
1541 struct tcp_sock *tp = tcp_sk(sk);
1542 int copied = 0;
1543 u32 peek_seq;
1544 u32 *seq;
1545 unsigned long used;
1546 int err;
1547 int target;
1548 long timeo;
1549 struct task_struct *user_recv = NULL;
1550 bool copied_early = false;
1551 struct sk_buff *skb;
1552 u32 urg_hole = 0;
1553
1554 lock_sock(sk);
1555
1556 err = -ENOTCONN;
1557 if (sk->sk_state == TCP_LISTEN)
1558 goto out;
1559
1560 timeo = sock_rcvtimeo(sk, nonblock);
1561
1562
1563 if (flags & MSG_OOB)
1564 goto recv_urg;
1565
1566 if (unlikely(tp->repair)) {
1567 err = -EPERM;
1568 if (!(flags & MSG_PEEK))
1569 goto out;
1570
1571 if (tp->repair_queue == TCP_SEND_QUEUE)
1572 goto recv_sndq;
1573
1574 err = -EINVAL;
1575 if (tp->repair_queue == TCP_NO_QUEUE)
1576 goto out;
1577
1578
1579 }
1580
1581 seq = &tp->copied_seq;
1582 if (flags & MSG_PEEK) {
1583 peek_seq = tp->copied_seq;
1584 seq = &peek_seq;
1585 }
1586
1587 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1588
1589#ifdef CONFIG_NET_DMA
1590 tp->ucopy.dma_chan = NULL;
1591 preempt_disable();
1592 skb = skb_peek_tail(&sk->sk_receive_queue);
1593 {
1594 int available = 0;
1595
1596 if (skb)
1597 available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
1598 if ((available < target) &&
1599 (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
1600 !sysctl_tcp_low_latency &&
1601 net_dma_find_channel()) {
1602 preempt_enable_no_resched();
1603 tp->ucopy.pinned_list =
1604 dma_pin_iovec_pages(msg->msg_iov, len);
1605 } else {
1606 preempt_enable_no_resched();
1607 }
1608 }
1609#endif
1610
1611 do {
1612 u32 offset;
1613
1614
1615 if (tp->urg_data && tp->urg_seq == *seq) {
1616 if (copied)
1617 break;
1618 if (signal_pending(current)) {
1619 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1620 break;
1621 }
1622 }
1623
1624
1625
1626 skb_queue_walk(&sk->sk_receive_queue, skb) {
1627
1628
1629
1630 if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
1631 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
1632 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
1633 flags))
1634 break;
1635
1636 offset = *seq - TCP_SKB_CB(skb)->seq;
1637 if (tcp_hdr(skb)->syn)
1638 offset--;
1639 if (offset < skb->len)
1640 goto found_ok_skb;
1641 if (tcp_hdr(skb)->fin)
1642 goto found_fin_ok;
1643 WARN(!(flags & MSG_PEEK),
1644 "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
1645 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
1646 }
1647
1648
1649
1650 if (copied >= target && !sk->sk_backlog.tail)
1651 break;
1652
1653 if (copied) {
1654 if (sk->sk_err ||
1655 sk->sk_state == TCP_CLOSE ||
1656 (sk->sk_shutdown & RCV_SHUTDOWN) ||
1657 !timeo ||
1658 signal_pending(current))
1659 break;
1660 } else {
1661 if (sock_flag(sk, SOCK_DONE))
1662 break;
1663
1664 if (sk->sk_err) {
1665 copied = sock_error(sk);
1666 break;
1667 }
1668
1669 if (sk->sk_shutdown & RCV_SHUTDOWN)
1670 break;
1671
1672 if (sk->sk_state == TCP_CLOSE) {
1673 if (!sock_flag(sk, SOCK_DONE)) {
1674
1675
1676
1677 copied = -ENOTCONN;
1678 break;
1679 }
1680 break;
1681 }
1682
1683 if (!timeo) {
1684 copied = -EAGAIN;
1685 break;
1686 }
1687
1688 if (signal_pending(current)) {
1689 copied = sock_intr_errno(timeo);
1690 break;
1691 }
1692 }
1693
1694 tcp_cleanup_rbuf(sk, copied);
1695
1696 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1697
1698 if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1699 user_recv = current;
1700 tp->ucopy.task = user_recv;
1701 tp->ucopy.iov = msg->msg_iov;
1702 }
1703
1704 tp->ucopy.len = len;
1705
1706 WARN_ON(tp->copied_seq != tp->rcv_nxt &&
1707 !(flags & (MSG_PEEK | MSG_TRUNC)));
1708
			/* The prequeue must be drained before the socket is
			 * released below: the backlog holds packets that
			 * arrived after the prequeued ones, so releasing the
			 * socket first would process the queues out of order.
			 */
1735 if (!skb_queue_empty(&tp->ucopy.prequeue))
1736 goto do_prequeue;
1737
1738
1739 }
1740
1741#ifdef CONFIG_NET_DMA
1742 if (tp->ucopy.dma_chan) {
1743 if (tp->rcv_wnd == 0 &&
1744 !skb_queue_empty(&sk->sk_async_wait_queue)) {
1745 tcp_service_net_dma(sk, true);
1746 tcp_cleanup_rbuf(sk, copied);
1747 } else
1748 dma_async_issue_pending(tp->ucopy.dma_chan);
1749 }
1750#endif
1751 if (copied >= target) {
1752
1753 release_sock(sk);
1754 lock_sock(sk);
1755 } else
1756 sk_wait_data(sk, &timeo);
1757
1758#ifdef CONFIG_NET_DMA
1759 tcp_service_net_dma(sk, false);
1760 tp->ucopy.wakeup = 0;
1761#endif
1762
1763 if (user_recv) {
1764 int chunk;
1765
1766
1767
1768 if ((chunk = len - tp->ucopy.len) != 0) {
1769 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1770 len -= chunk;
1771 copied += chunk;
1772 }
1773
1774 if (tp->rcv_nxt == tp->copied_seq &&
1775 !skb_queue_empty(&tp->ucopy.prequeue)) {
1776do_prequeue:
1777 tcp_prequeue_process(sk);
1778
1779 if ((chunk = len - tp->ucopy.len) != 0) {
1780 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1781 len -= chunk;
1782 copied += chunk;
1783 }
1784 }
1785 }
1786 if ((flags & MSG_PEEK) &&
1787 (peek_seq - copied - urg_hole != tp->copied_seq)) {
1788 net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
1789 current->comm,
1790 task_pid_nr(current));
1791 peek_seq = tp->copied_seq;
1792 }
1793 continue;
1794
1795 found_ok_skb:
1796
1797 used = skb->len - offset;
1798 if (len < used)
1799 used = len;
1800
1801
1802 if (tp->urg_data) {
1803 u32 urg_offset = tp->urg_seq - *seq;
1804 if (urg_offset < used) {
1805 if (!urg_offset) {
1806 if (!sock_flag(sk, SOCK_URGINLINE)) {
1807 ++*seq;
1808 urg_hole++;
1809 offset++;
1810 used--;
1811 if (!used)
1812 goto skip_copy;
1813 }
1814 } else
1815 used = urg_offset;
1816 }
1817 }
1818
1819 if (!(flags & MSG_TRUNC)) {
1820#ifdef CONFIG_NET_DMA
1821 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1822 tp->ucopy.dma_chan = net_dma_find_channel();
1823
1824 if (tp->ucopy.dma_chan) {
1825 tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
1826 tp->ucopy.dma_chan, skb, offset,
1827 msg->msg_iov, used,
1828 tp->ucopy.pinned_list);
1829
1830 if (tp->ucopy.dma_cookie < 0) {
1831
1832 pr_alert("%s: dma_cookie < 0\n",
1833 __func__);
1834
1835
1836 if (!copied)
1837 copied = -EFAULT;
1838 break;
1839 }
1840
1841 dma_async_issue_pending(tp->ucopy.dma_chan);
1842
1843 if ((offset + used) == skb->len)
1844 copied_early = true;
1845
1846 } else
1847#endif
1848 {
1849 err = skb_copy_datagram_iovec(skb, offset,
1850 msg->msg_iov, used);
1851 if (err) {
1852
1853 if (!copied)
1854 copied = -EFAULT;
1855 break;
1856 }
1857 }
1858 }
1859
1860 *seq += used;
1861 copied += used;
1862 len -= used;
1863
1864 tcp_rcv_space_adjust(sk);
1865
1866skip_copy:
1867 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1868 tp->urg_data = 0;
1869 tcp_fast_path_check(sk);
1870 }
1871 if (used + offset < skb->len)
1872 continue;
1873
1874 if (tcp_hdr(skb)->fin)
1875 goto found_fin_ok;
1876 if (!(flags & MSG_PEEK)) {
1877 sk_eat_skb(sk, skb, copied_early);
1878 copied_early = false;
1879 }
1880 continue;
1881
1882 found_fin_ok:
1883
1884 ++*seq;
1885 if (!(flags & MSG_PEEK)) {
1886 sk_eat_skb(sk, skb, copied_early);
1887 copied_early = false;
1888 }
1889 break;
1890 } while (len > 0);
1891
1892 if (user_recv) {
1893 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1894 int chunk;
1895
1896 tp->ucopy.len = copied > 0 ? len : 0;
1897
1898 tcp_prequeue_process(sk);
1899
1900 if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1901 NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1902 len -= chunk;
1903 copied += chunk;
1904 }
1905 }
1906
1907 tp->ucopy.task = NULL;
1908 tp->ucopy.len = 0;
1909 }
1910
1911#ifdef CONFIG_NET_DMA
1912 tcp_service_net_dma(sk, true);
1913 tp->ucopy.dma_chan = NULL;
1914
1915 if (tp->ucopy.pinned_list) {
1916 dma_unpin_iovec_pages(tp->ucopy.pinned_list);
1917 tp->ucopy.pinned_list = NULL;
1918 }
1919#endif
1920
	/* Clean up data we have read: this may send ACK frames. */
1926 tcp_cleanup_rbuf(sk, copied);
1927
1928 release_sock(sk);
1929 return copied;
1930
1931out:
1932 release_sock(sk);
1933 return err;
1934
1935recv_urg:
1936 err = tcp_recv_urg(sk, msg, len, flags);
1937 goto out;
1938
1939recv_sndq:
1940 err = tcp_peek_sndq(sk, msg, len);
1941 goto out;
1942}
1943EXPORT_SYMBOL(tcp_recvmsg);
1944
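/* Switch the socket to @state, keeping the MIB counters in sync and
 * unhashing the socket when it enters TCP_CLOSE.
 */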
1945void tcp_set_state(struct sock *sk, int state)
1946{
1947 int oldstate = sk->sk_state;
1948
1949 switch (state) {
1950 case TCP_ESTABLISHED:
1951 if (oldstate != TCP_ESTABLISHED)
1952 TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1953 break;
1954
1955 case TCP_CLOSE:
1956 if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
1957 TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
1958
1959 sk->sk_prot->unhash(sk);
1960 if (inet_csk(sk)->icsk_bind_hash &&
1961 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
1962 inet_put_port(sk);
1963
1964 default:
1965 if (oldstate == TCP_ESTABLISHED)
1966 TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
1967 }
1968
	/* Change state AFTER the socket is unhashed to avoid a closed
	 * socket sitting in the hash tables.
	 */
1972 sk->sk_state = state;
1973
1974#ifdef STATE_TRACE
1975 SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
1976#endif
1977}
1978EXPORT_SYMBOL_GPL(tcp_set_state);
1979
/*
 *	State processing on a close.  This implements the state shift for
 *	sending our FIN frame.  Note that we only send a FIN for some
 *	states; a shutdown() may have already sent the FIN, or we may be
 *	closed.
 */
static const unsigned char new_state[16] = {
  /* current state:        new state:      action:	*/
  /* (Invalid)		*/ TCP_CLOSE,
  /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  /* TCP_SYN_SENT	*/ TCP_CLOSE,
  /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
  /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
  /* TCP_TIME_WAIT	*/ TCP_CLOSE,
  /* TCP_CLOSE		*/ TCP_CLOSE,
  /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK | TCP_ACTION_FIN,
  /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
  /* TCP_LISTEN		*/ TCP_CLOSE,
  /* TCP_CLOSING	*/ TCP_CLOSING,
};
2002
2003static int tcp_close_state(struct sock *sk)
2004{
2005 int next = (int)new_state[sk->sk_state];
2006 int ns = next & TCP_STATE_MASK;
2007
2008 tcp_set_state(sk, ns);
2009
2010 return next & TCP_ACTION_FIN;
2011}
2012
/*
 *	Shutdown the sending side of a connection.  Much like close except
 *	that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
 */
2018void tcp_shutdown(struct sock *sk, int how)
2019{
2020
2021
2022
2023
2024 if (!(how & SEND_SHUTDOWN))
2025 return;
2026
2027
2028 if ((1 << sk->sk_state) &
2029 (TCPF_ESTABLISHED | TCPF_SYN_SENT |
2030 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
2031
2032 if (tcp_close_state(sk))
2033 tcp_send_fin(sk);
2034 }
2035}
2036EXPORT_SYMBOL(tcp_shutdown);
2037
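/* Returns true if the socket should be aborted because there are too many
 * orphaned sockets or TCP is out of socket memory.
 */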
2038bool tcp_check_oom(struct sock *sk, int shift)
2039{
2040 bool too_many_orphans, out_of_socket_memory;
2041
2042 too_many_orphans = tcp_too_many_orphans(sk, shift);
2043 out_of_socket_memory = tcp_out_of_memory(sk);
2044
2045 if (too_many_orphans)
2046 net_info_ratelimited("too many orphaned sockets\n");
2047 if (out_of_socket_memory)
2048 net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
2049 return too_many_orphans || out_of_socket_memory;
2050}
2051
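/* Close a socket: discard unread receive data, send a FIN (or a RST when
 * data would be lost), and either destroy the socket or let the FIN_WAIT2 /
 * time-wait machinery finish the job.
 */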
2052void tcp_close(struct sock *sk, long timeout)
2053{
2054 struct sk_buff *skb;
2055 int data_was_unread = 0;
2056 int state;
2057
2058 lock_sock(sk);
2059 sk->sk_shutdown = SHUTDOWN_MASK;
2060
2061 if (sk->sk_state == TCP_LISTEN) {
2062 tcp_set_state(sk, TCP_CLOSE);
2063
2064
2065 inet_csk_listen_stop(sk);
2066
2067 goto adjudge_to_death;
2068 }
2069
	/*  We need to flush the recv. buffs.  We do this only on the
	 *  descriptor close, not protocol-sourced closes, because the
	 *  reader process may not have drained the data yet!
	 */
2074 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
2075 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
2076 tcp_hdr(skb)->fin;
2077 data_was_unread += len;
2078 __kfree_skb(skb);
2079 }
2080
2081 sk_mem_reclaim(sk);
2082
2083
2084 if (sk->sk_state == TCP_CLOSE)
2085 goto adjudge_to_death;
2086
	/* As outlined in RFC 2525, section 2.17, we send a RST here because
	 * unread data was lost: a graceful FIN would wrongly suggest that all
	 * data reached the application.
	 */
2094 if (unlikely(tcp_sk(sk)->repair)) {
2095 sk->sk_prot->disconnect(sk, 0);
2096 } else if (data_was_unread) {
2097
2098 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
2099 tcp_set_state(sk, TCP_CLOSE);
2100 tcp_send_active_reset(sk, sk->sk_allocation);
2101 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
2102
2103 sk->sk_prot->disconnect(sk, 0);
2104 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
2105 } else if (tcp_close_state(sk)) {
		/* The application consumed all the data before closing, so a
		 * FIN is sent; tcp_close_state() has already moved the socket
		 * to FIN_WAIT1 or LAST_ACK as appropriate.
		 */
2135 tcp_send_fin(sk);
2136 }
2137
2138 sk_stream_wait_close(sk, timeout);
2139
2140adjudge_to_death:
2141 state = sk->sk_state;
2142 sock_hold(sk);
2143 sock_orphan(sk);
2144
2145
2146 release_sock(sk);
2147
2148
2149
2150
2151
2152 local_bh_disable();
2153 bh_lock_sock(sk);
2154 WARN_ON(sock_owned_by_user(sk));
2155
2156 percpu_counter_inc(sk->sk_prot->orphan_count);
2157
2158
2159 if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
2160 goto out;
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176 if (sk->sk_state == TCP_FIN_WAIT2) {
2177 struct tcp_sock *tp = tcp_sk(sk);
2178 if (tp->linger2 < 0) {
2179 tcp_set_state(sk, TCP_CLOSE);
2180 tcp_send_active_reset(sk, GFP_ATOMIC);
2181 NET_INC_STATS_BH(sock_net(sk),
2182 LINUX_MIB_TCPABORTONLINGER);
2183 } else {
2184 const int tmo = tcp_fin_time(sk);
2185
2186 if (tmo > TCP_TIMEWAIT_LEN) {
2187 inet_csk_reset_keepalive_timer(sk,
2188 tmo - TCP_TIMEWAIT_LEN);
2189 } else {
2190 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2191 goto out;
2192 }
2193 }
2194 }
2195 if (sk->sk_state != TCP_CLOSE) {
2196 sk_mem_reclaim(sk);
2197 if (tcp_check_oom(sk, 0)) {
2198 tcp_set_state(sk, TCP_CLOSE);
2199 tcp_send_active_reset(sk, GFP_ATOMIC);
2200 NET_INC_STATS_BH(sock_net(sk),
2201 LINUX_MIB_TCPABORTONMEMORY);
2202 }
2203 }
2204
2205 if (sk->sk_state == TCP_CLOSE) {
2206 struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
2207
2208
2209
2210
2211 if (req != NULL)
2212 reqsk_fastopen_remove(sk, req, false);
2213 inet_csk_destroy_sock(sk);
2214 }
2215
2216
2217out:
2218 bh_unlock_sock(sk);
2219 local_bh_enable();
2220 sock_put(sk);
2221}
2222EXPORT_SYMBOL(tcp_close);
2223
2224
2225
2226static inline bool tcp_need_reset(int state)
2227{
2228 return (1 << state) &
2229 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2230 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2231}
2232
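/* Abort any established connection and return the socket to a clean,
 * unconnected state so that it can be reused.
 */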
2233int tcp_disconnect(struct sock *sk, int flags)
2234{
2235 struct inet_sock *inet = inet_sk(sk);
2236 struct inet_connection_sock *icsk = inet_csk(sk);
2237 struct tcp_sock *tp = tcp_sk(sk);
2238 int err = 0;
2239 int old_state = sk->sk_state;
2240
2241 if (old_state != TCP_CLOSE)
2242 tcp_set_state(sk, TCP_CLOSE);
2243
2244
2245 if (old_state == TCP_LISTEN) {
2246 inet_csk_listen_stop(sk);
2247 } else if (unlikely(tp->repair)) {
2248 sk->sk_err = ECONNABORTED;
2249 } else if (tcp_need_reset(old_state) ||
2250 (tp->snd_nxt != tp->write_seq &&
2251 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2252
2253
2254
2255 tcp_send_active_reset(sk, gfp_any());
2256 sk->sk_err = ECONNRESET;
2257 } else if (old_state == TCP_SYN_SENT)
2258 sk->sk_err = ECONNRESET;
2259
2260 tcp_clear_xmit_timers(sk);
2261 __skb_queue_purge(&sk->sk_receive_queue);
2262 tcp_write_queue_purge(sk);
2263 __skb_queue_purge(&tp->out_of_order_queue);
2264#ifdef CONFIG_NET_DMA
2265 __skb_queue_purge(&sk->sk_async_wait_queue);
2266#endif
2267
2268 inet->inet_dport = 0;
2269
2270 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2271 inet_reset_saddr(sk);
2272
2273 sk->sk_shutdown = 0;
2274 sock_reset_flag(sk, SOCK_DONE);
2275 tp->srtt = 0;
2276 if ((tp->write_seq += tp->max_window + 2) == 0)
2277 tp->write_seq = 1;
2278 icsk->icsk_backoff = 0;
2279 tp->snd_cwnd = 2;
2280 icsk->icsk_probes_out = 0;
2281 tp->packets_out = 0;
2282 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
2283 tp->snd_cwnd_cnt = 0;
2284 tp->window_clamp = 0;
2285 tcp_set_ca_state(sk, TCP_CA_Open);
2286 tcp_clear_retrans(tp);
2287 inet_csk_delack_init(sk);
2288 tcp_init_send_head(sk);
2289 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
2290 __sk_dst_reset(sk);
2291
2292 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
2293
2294 sk->sk_error_report(sk);
2295 return err;
2296}
2297EXPORT_SYMBOL(tcp_disconnect);
2298
2299void tcp_sock_destruct(struct sock *sk)
2300{
2301 inet_sock_destruct(sk);
2302
2303 kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
2304}
2305
2306static inline bool tcp_can_repair_sock(const struct sock *sk)
2307{
2308 return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
2309 ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
2310}
2311
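/* TCP_REPAIR_OPTIONS: replay TCP options (MSS, window scale, SACK,
 * timestamps) recorded by a checkpointing tool onto a socket under repair.
 */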
2312static int tcp_repair_options_est(struct tcp_sock *tp,
2313 struct tcp_repair_opt __user *optbuf, unsigned int len)
2314{
2315 struct tcp_repair_opt opt;
2316
2317 while (len >= sizeof(opt)) {
2318 if (copy_from_user(&opt, optbuf, sizeof(opt)))
2319 return -EFAULT;
2320
2321 optbuf++;
2322 len -= sizeof(opt);
2323
2324 switch (opt.opt_code) {
2325 case TCPOPT_MSS:
2326 tp->rx_opt.mss_clamp = opt.opt_val;
2327 break;
2328 case TCPOPT_WINDOW:
2329 {
2330 u16 snd_wscale = opt.opt_val & 0xFFFF;
2331 u16 rcv_wscale = opt.opt_val >> 16;
2332
2333 if (snd_wscale > 14 || rcv_wscale > 14)
2334 return -EFBIG;
2335
2336 tp->rx_opt.snd_wscale = snd_wscale;
2337 tp->rx_opt.rcv_wscale = rcv_wscale;
2338 tp->rx_opt.wscale_ok = 1;
2339 }
2340 break;
2341 case TCPOPT_SACK_PERM:
2342 if (opt.opt_val != 0)
2343 return -EINVAL;
2344
2345 tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
2346 if (sysctl_tcp_fack)
2347 tcp_enable_fack(tp);
2348 break;
2349 case TCPOPT_TIMESTAMP:
2350 if (opt.opt_val != 0)
2351 return -EINVAL;
2352
2353 tp->rx_opt.tstamp_ok = 1;
2354 break;
2355 }
2356 }
2357
2358 return 0;
2359}
2360
/*
 *	Socket option code for TCP.
 */
2364static int do_tcp_setsockopt(struct sock *sk, int level,
2365 int optname, char __user *optval, unsigned int optlen)
2366{
2367 struct tcp_sock *tp = tcp_sk(sk);
2368 struct inet_connection_sock *icsk = inet_csk(sk);
2369 int val;
2370 int err = 0;
2371
2372
2373 switch (optname) {
2374 case TCP_CONGESTION: {
2375 char name[TCP_CA_NAME_MAX];
2376
2377 if (optlen < 1)
2378 return -EINVAL;
2379
2380 val = strncpy_from_user(name, optval,
2381 min_t(long, TCP_CA_NAME_MAX-1, optlen));
2382 if (val < 0)
2383 return -EFAULT;
2384 name[val] = 0;
2385
2386 lock_sock(sk);
2387 err = tcp_set_congestion_control(sk, name);
2388 release_sock(sk);
2389 return err;
2390 }
2391 default:
2392
2393 break;
2394 }
2395
2396 if (optlen < sizeof(int))
2397 return -EINVAL;
2398
2399 if (get_user(val, (int __user *)optval))
2400 return -EFAULT;
2401
2402 lock_sock(sk);
2403
2404 switch (optname) {
2405 case TCP_MAXSEG:
2406
2407
2408
2409 if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
2410 err = -EINVAL;
2411 break;
2412 }
2413 tp->rx_opt.user_mss = val;
2414 break;
2415
2416 case TCP_NODELAY:
2417 if (val) {
2418
2419
2420
2421
2422
2423
2424
2425
2426 tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2427 tcp_push_pending_frames(sk);
2428 } else {
2429 tp->nonagle &= ~TCP_NAGLE_OFF;
2430 }
2431 break;
2432
2433 case TCP_THIN_LINEAR_TIMEOUTS:
2434 if (val < 0 || val > 1)
2435 err = -EINVAL;
2436 else
2437 tp->thin_lto = val;
2438 break;
2439
2440 case TCP_THIN_DUPACK:
2441 if (val < 0 || val > 1)
2442 err = -EINVAL;
2443 else
2444 tp->thin_dupack = val;
2445 if (tp->thin_dupack)
2446 tcp_disable_early_retrans(tp);
2447 break;
2448
2449 case TCP_REPAIR:
2450 if (!tcp_can_repair_sock(sk))
2451 err = -EPERM;
2452 else if (val == 1) {
2453 tp->repair = 1;
2454 sk->sk_reuse = SK_FORCE_REUSE;
2455 tp->repair_queue = TCP_NO_QUEUE;
2456 } else if (val == 0) {
2457 tp->repair = 0;
2458 sk->sk_reuse = SK_NO_REUSE;
2459 tcp_send_window_probe(sk);
2460 } else
2461 err = -EINVAL;
2462
2463 break;
2464
2465 case TCP_REPAIR_QUEUE:
2466 if (!tp->repair)
2467 err = -EPERM;
2468 else if (val < TCP_QUEUES_NR)
2469 tp->repair_queue = val;
2470 else
2471 err = -EINVAL;
2472 break;
2473
2474 case TCP_QUEUE_SEQ:
2475 if (sk->sk_state != TCP_CLOSE)
2476 err = -EPERM;
2477 else if (tp->repair_queue == TCP_SEND_QUEUE)
2478 tp->write_seq = val;
2479 else if (tp->repair_queue == TCP_RECV_QUEUE)
2480 tp->rcv_nxt = val;
2481 else
2482 err = -EINVAL;
2483 break;
2484
2485 case TCP_REPAIR_OPTIONS:
2486 if (!tp->repair)
2487 err = -EINVAL;
2488 else if (sk->sk_state == TCP_ESTABLISHED)
2489 err = tcp_repair_options_est(tp,
2490 (struct tcp_repair_opt __user *)optval,
2491 optlen);
2492 else
2493 err = -EPERM;
2494 break;
2495
2496 case TCP_CORK:
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508 if (val) {
2509 tp->nonagle |= TCP_NAGLE_CORK;
2510 } else {
2511 tp->nonagle &= ~TCP_NAGLE_CORK;
2512 if (tp->nonagle&TCP_NAGLE_OFF)
2513 tp->nonagle |= TCP_NAGLE_PUSH;
2514 tcp_push_pending_frames(sk);
2515 }
2516 break;
2517
2518 case TCP_KEEPIDLE:
2519 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2520 err = -EINVAL;
2521 else {
2522 tp->keepalive_time = val * HZ;
2523 if (sock_flag(sk, SOCK_KEEPOPEN) &&
2524 !((1 << sk->sk_state) &
2525 (TCPF_CLOSE | TCPF_LISTEN))) {
2526 u32 elapsed = keepalive_time_elapsed(tp);
2527 if (tp->keepalive_time > elapsed)
2528 elapsed = tp->keepalive_time - elapsed;
2529 else
2530 elapsed = 0;
2531 inet_csk_reset_keepalive_timer(sk, elapsed);
2532 }
2533 }
2534 break;
2535 case TCP_KEEPINTVL:
2536 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2537 err = -EINVAL;
2538 else
2539 tp->keepalive_intvl = val * HZ;
2540 break;
2541 case TCP_KEEPCNT:
2542 if (val < 1 || val > MAX_TCP_KEEPCNT)
2543 err = -EINVAL;
2544 else
2545 tp->keepalive_probes = val;
2546 break;
2547 case TCP_SYNCNT:
2548 if (val < 1 || val > MAX_TCP_SYNCNT)
2549 err = -EINVAL;
2550 else
2551 icsk->icsk_syn_retries = val;
2552 break;
2553
2554 case TCP_LINGER2:
2555 if (val < 0)
2556 tp->linger2 = -1;
2557 else if (val > sysctl_tcp_fin_timeout / HZ)
2558 tp->linger2 = 0;
2559 else
2560 tp->linger2 = val * HZ;
2561 break;
2562
2563 case TCP_DEFER_ACCEPT:
2564
2565 icsk->icsk_accept_queue.rskq_defer_accept =
2566 secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
2567 TCP_RTO_MAX / HZ);
2568 break;
2569
2570 case TCP_WINDOW_CLAMP:
2571 if (!val) {
2572 if (sk->sk_state != TCP_CLOSE) {
2573 err = -EINVAL;
2574 break;
2575 }
2576 tp->window_clamp = 0;
2577 } else
2578 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2579 SOCK_MIN_RCVBUF / 2 : val;
2580 break;
2581
2582 case TCP_QUICKACK:
2583 if (!val) {
2584 icsk->icsk_ack.pingpong = 1;
2585 } else {
2586 icsk->icsk_ack.pingpong = 0;
2587 if ((1 << sk->sk_state) &
2588 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2589 inet_csk_ack_scheduled(sk)) {
2590 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
2591 tcp_cleanup_rbuf(sk, 1);
2592 if (!(val & 1))
2593 icsk->icsk_ack.pingpong = 1;
2594 }
2595 }
2596 break;
2597
2598#ifdef CONFIG_TCP_MD5SIG
2599 case TCP_MD5SIG:
2600
2601 err = tp->af_specific->md5_parse(sk, optval, optlen);
2602 break;
2603#endif
2604 case TCP_USER_TIMEOUT:
2605
2606
2607
2608 if (val < 0)
2609 err = -EINVAL;
2610 else
2611 icsk->icsk_user_timeout = msecs_to_jiffies(val);
2612 break;
2613
2614 case TCP_FASTOPEN:
2615 if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
2616 TCPF_LISTEN)))
2617 err = fastopen_init_queue(sk, val);
2618 else
2619 err = -EINVAL;
2620 break;
2621 case TCP_TIMESTAMP:
2622 if (!tp->repair)
2623 err = -EPERM;
2624 else
2625 tp->tsoffset = val - tcp_time_stamp;
2626 break;
2627 default:
2628 err = -ENOPROTOOPT;
2629 break;
2630 }
2631
2632 release_sock(sk);
2633 return err;
2634}
2635
2636int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2637 unsigned int optlen)
2638{
2639 const struct inet_connection_sock *icsk = inet_csk(sk);
2640
2641 if (level != SOL_TCP)
2642 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
2643 optval, optlen);
2644 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2645}
2646EXPORT_SYMBOL(tcp_setsockopt);
2647
2648#ifdef CONFIG_COMPAT
2649int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
2650 char __user *optval, unsigned int optlen)
2651{
2652 if (level != SOL_TCP)
2653 return inet_csk_compat_setsockopt(sk, level, optname,
2654 optval, optlen);
2655 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2656}
2657EXPORT_SYMBOL(compat_tcp_setsockopt);
2658#endif
2659
/* Return information about state of tcp endpoint in API format. */
2661void tcp_get_info(const struct sock *sk, struct tcp_info *info)
2662{
2663 const struct tcp_sock *tp = tcp_sk(sk);
2664 const struct inet_connection_sock *icsk = inet_csk(sk);
2665 u32 now = tcp_time_stamp;
2666
2667 memset(info, 0, sizeof(*info));
2668
2669 info->tcpi_state = sk->sk_state;
2670 info->tcpi_ca_state = icsk->icsk_ca_state;
2671 info->tcpi_retransmits = icsk->icsk_retransmits;
2672 info->tcpi_probes = icsk->icsk_probes_out;
2673 info->tcpi_backoff = icsk->icsk_backoff;
2674
2675 if (tp->rx_opt.tstamp_ok)
2676 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2677 if (tcp_is_sack(tp))
2678 info->tcpi_options |= TCPI_OPT_SACK;
2679 if (tp->rx_opt.wscale_ok) {
2680 info->tcpi_options |= TCPI_OPT_WSCALE;
2681 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2682 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2683 }
2684
2685 if (tp->ecn_flags & TCP_ECN_OK)
2686 info->tcpi_options |= TCPI_OPT_ECN;
2687 if (tp->ecn_flags & TCP_ECN_SEEN)
2688 info->tcpi_options |= TCPI_OPT_ECN_SEEN;
2689 if (tp->syn_data_acked)
2690 info->tcpi_options |= TCPI_OPT_SYN_DATA;
2691
2692 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2693 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
2694 info->tcpi_snd_mss = tp->mss_cache;
2695 info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
2696
2697 if (sk->sk_state == TCP_LISTEN) {
2698 info->tcpi_unacked = sk->sk_ack_backlog;
2699 info->tcpi_sacked = sk->sk_max_ack_backlog;
2700 } else {
2701 info->tcpi_unacked = tp->packets_out;
2702 info->tcpi_sacked = tp->sacked_out;
2703 }
2704 info->tcpi_lost = tp->lost_out;
2705 info->tcpi_retrans = tp->retrans_out;
2706 info->tcpi_fackets = tp->fackets_out;
2707
2708 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2709 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2710 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2711
2712 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
2713 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2714 info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2715 info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2716 info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2717 info->tcpi_snd_cwnd = tp->snd_cwnd;
2718 info->tcpi_advmss = tp->advmss;
2719 info->tcpi_reordering = tp->reordering;
2720
2721 info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2722 info->tcpi_rcv_space = tp->rcvq_space.space;
2723
2724 info->tcpi_total_retrans = tp->total_retrans;
2725}
2726EXPORT_SYMBOL_GPL(tcp_get_info);
2727
static int do_tcp_getsockopt(struct sock *sk, int level,
		int optname, char __user *optval, int __user *optlen)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int val, len;

	if (get_user(len, optlen))
		return -EFAULT;

	len = min_t(unsigned int, len, sizeof(int));

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case TCP_MAXSEG:
		val = tp->mss_cache;
		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
			val = tp->rx_opt.user_mss;
		if (tp->repair)
			val = tp->rx_opt.mss_clamp;
		break;
	case TCP_NODELAY:
		val = !!(tp->nonagle & TCP_NAGLE_OFF);
		break;
	case TCP_CORK:
		val = !!(tp->nonagle & TCP_NAGLE_CORK);
		break;
	case TCP_KEEPIDLE:
		val = keepalive_time_when(tp) / HZ;
		break;
	case TCP_KEEPINTVL:
		val = keepalive_intvl_when(tp) / HZ;
		break;
	case TCP_KEEPCNT:
		val = keepalive_probes(tp);
		break;
	case TCP_SYNCNT:
		val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
		break;
	case TCP_LINGER2:
		val = tp->linger2;
		if (val >= 0)
			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
		break;
	case TCP_DEFER_ACCEPT:
		val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
				      TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
		break;
	case TCP_WINDOW_CLAMP:
		val = tp->window_clamp;
		break;
	case TCP_INFO: {
		struct tcp_info info;

		if (get_user(len, optlen))
			return -EFAULT;

		tcp_get_info(sk, &info);

		len = min_t(unsigned int, len, sizeof(info));
		if (put_user(len, optlen))
			return -EFAULT;
		if (copy_to_user(optval, &info, len))
			return -EFAULT;
		return 0;
	}
	case TCP_QUICKACK:
		val = !icsk->icsk_ack.pingpong;
		break;

	case TCP_CONGESTION:
		if (get_user(len, optlen))
			return -EFAULT;
		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
		if (put_user(len, optlen))
			return -EFAULT;
		if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
			return -EFAULT;
		return 0;

	case TCP_THIN_LINEAR_TIMEOUTS:
		val = tp->thin_lto;
		break;
	case TCP_THIN_DUPACK:
		val = tp->thin_dupack;
		break;

	case TCP_REPAIR:
		val = tp->repair;
		break;

	case TCP_REPAIR_QUEUE:
		if (tp->repair)
			val = tp->repair_queue;
		else
			return -EINVAL;
		break;

	case TCP_QUEUE_SEQ:
		if (tp->repair_queue == TCP_SEND_QUEUE)
			val = tp->write_seq;
		else if (tp->repair_queue == TCP_RECV_QUEUE)
			val = tp->rcv_nxt;
		else
			return -EINVAL;
		break;

	case TCP_USER_TIMEOUT:
		val = jiffies_to_msecs(icsk->icsk_user_timeout);
		break;
	case TCP_TIMESTAMP:
		val = tcp_time_stamp + tp->tsoffset;
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, &val, len))
		return -EFAULT;
	return 0;
}

int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
		   int __user *optlen)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (level != SOL_TCP)
		return icsk->icsk_af_ops->getsockopt(sk, level, optname,
						     optval, optlen);
	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(tcp_getsockopt);

#ifdef CONFIG_COMPAT
int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	if (level != SOL_TCP)
		return inet_csk_compat_getsockopt(sk, level, optname,
						  optval, optlen);
	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_tcp_getsockopt);
#endif

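/* Split a TSO/GSO skb into MSS-sized segments and fix up the TCP header of
 * each segment (sequence number, FIN/PSH/CWR flags and checksum).
 */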
struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
				netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EINVAL);
	struct tcphdr *th;
	unsigned int thlen;
	unsigned int seq;
	__be32 delta;
	unsigned int oldlen;
	unsigned int mss;
	struct sk_buff *gso_skb = skb;
	__sum16 newcheck;
	bool ooo_okay, copy_destructor;

	if (!pskb_may_pull(skb, sizeof(*th)))
		goto out;

	th = tcp_hdr(skb);
	thlen = th->doff * 4;
	if (thlen < sizeof(*th))
		goto out;

	if (!pskb_may_pull(skb, thlen))
		goto out;

	oldlen = (u16)~skb->len;
	__skb_pull(skb, thlen);

	mss = skb_shinfo(skb)->gso_size;
	if (unlikely(skb->len <= mss))
		goto out;

	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
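		/* Packet is from an untrusted source, reset gso_segs. */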
		int type = skb_shinfo(skb)->gso_type;

		if (unlikely(type &
			     ~(SKB_GSO_TCPV4 |
			       SKB_GSO_DODGY |
			       SKB_GSO_TCP_ECN |
			       SKB_GSO_TCPV6 |
			       SKB_GSO_GRE |
			       SKB_GSO_UDP_TUNNEL |
			       0) ||
			     !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
			goto out;

		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);

		segs = NULL;
		goto out;
	}

	copy_destructor = gso_skb->destructor == tcp_wfree;
	ooo_okay = gso_skb->ooo_okay;
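	/* All segments but the first are not send_head candidates */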
	skb->ooo_okay = 0;

	segs = skb_segment(skb, features);
	if (IS_ERR(segs))
		goto out;

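	/* Only the first segment might have ooo_okay set */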
	segs->ooo_okay = ooo_okay;

	delta = htonl(oldlen + (thlen + mss));

	skb = segs;
	th = tcp_hdr(skb);
	seq = ntohl(th->seq);

	newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
					       (__force u32)delta));

	do {
		th->fin = th->psh = 0;
		th->check = newcheck;

		if (skb->ip_summed != CHECKSUM_PARTIAL)
			th->check =
			     csum_fold(csum_partial(skb_transport_header(skb),
						    thlen, skb->csum));

		seq += mss;
		if (copy_destructor) {
			skb->destructor = gso_skb->destructor;
			skb->sk = gso_skb->sk;
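			/* tcp_wfree() does exact truesize accounting against
			 * sk_wmem_alloc, so split the original charge: each
			 * segment here takes mss bytes of it, and whatever
			 * remains on gso_skb is moved to the last segment by
			 * the swap below.
			 */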
			skb->truesize = mss;
			gso_skb->truesize -= mss;
		}
		skb = skb->next;
		th = tcp_hdr(skb);

		th->seq = htonl(seq);
		th->cwr = 0;
	} while (skb->next);

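	/* The following lets TCP Small Queues work well with GSO: the
	 * tcp_wfree() callback runs when the last segment is freed at TX
	 * completion, not right now when gso_skb is freed by the GSO engine.
	 */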
	if (copy_destructor) {
		swap(gso_skb->sk, skb->sk);
		swap(gso_skb->destructor, skb->destructor);
		swap(gso_skb->truesize, skb->truesize);
	}

	delta = htonl(oldlen + (skb->tail - skb->transport_header) +
		      skb->data_len);
	th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
				(__force u32)delta));
	if (skb->ip_summed != CHECKSUM_PARTIAL)
		th->check = csum_fold(csum_partial(skb_transport_header(skb),
						   thlen, skb->csum));

out:
	return segs;
}
EXPORT_SYMBOL(tcp_tso_segment);

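/* Try to coalesce the incoming skb with a held skb of the same flow (GRO).
 * Returns the position in the GRO list that must be flushed, or NULL.
 */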
struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct sk_buff *p;
	struct tcphdr *th;
	struct tcphdr *th2;
	unsigned int len;
	unsigned int thlen;
	__be32 flags;
	unsigned int mss = 1;
	unsigned int hlen;
	unsigned int off;
	int flush = 1;
	int i;

	off = skb_gro_offset(skb);
	hlen = off + sizeof(*th);
	th = skb_gro_header_fast(skb, off);
	if (skb_gro_header_hard(skb, hlen)) {
		th = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!th))
			goto out;
	}

	thlen = th->doff * 4;
	if (thlen < sizeof(*th))
		goto out;

	hlen = off + thlen;
	if (skb_gro_header_hard(skb, hlen)) {
		th = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!th))
			goto out;
	}

	skb_gro_pull(skb, thlen);

	len = skb_gro_len(skb);
	flags = tcp_flag_word(th);

	for (; (p = *head); head = &p->next) {
		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		th2 = tcp_hdr(p);

		if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}

		goto found;
	}

	goto out_check_final;

found:
	flush = NAPI_GRO_CB(p)->flush;
	flush |= (__force int)(flags & TCP_FLAG_CWR);
	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
	flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
	for (i = sizeof(*th); i < thlen; i += 4)
		flush |= *(u32 *)((u8 *)th + i) ^
			 *(u32 *)((u8 *)th2 + i);

	mss = skb_shinfo(p)->gso_size;

	flush |= (len - 1) >= mss;
	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);

	if (flush || skb_gro_receive(head, skb)) {
		mss = 1;
		goto out_check_final;
	}

	p = *head;
	th2 = tcp_hdr(p);
	tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);

out_check_final:
	flush = len < mss;
	flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
					TCP_FLAG_RST | TCP_FLAG_SYN |
					TCP_FLAG_FIN));

	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
		pp = head;

out:
	NAPI_GRO_CB(skb)->flush |= flush;

	return pp;
}
EXPORT_SYMBOL(tcp_gro_receive);

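/* Finish a GRO merge: restore checksum-offload state and GSO metadata on the
 * coalesced skb before it is handed up the stack.
 */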
int tcp_gro_complete(struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);

	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
	skb->ip_summed = CHECKSUM_PARTIAL;

	skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;

	if (th->cwr)
		skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;

	return 0;
}
EXPORT_SYMBOL(tcp_gro_complete);

#ifdef CONFIG_TCP_MD5SIG
static unsigned long tcp_md5sig_users;
static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool;
static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);

static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu);

		if (p->md5_desc.tfm)
			crypto_free_hash(p->md5_desc.tfm);
	}
	free_percpu(pool);
}

void tcp_free_md5sig_pool(void)
{
	struct tcp_md5sig_pool __percpu *pool = NULL;

	spin_lock_bh(&tcp_md5sig_pool_lock);
	if (--tcp_md5sig_users == 0) {
		pool = tcp_md5sig_pool;
		tcp_md5sig_pool = NULL;
	}
	spin_unlock_bh(&tcp_md5sig_pool_lock);
	if (pool)
		__tcp_free_md5sig_pool(pool);
}
EXPORT_SYMBOL(tcp_free_md5sig_pool);

static struct tcp_md5sig_pool __percpu *
__tcp_alloc_md5sig_pool(struct sock *sk)
{
	int cpu;
	struct tcp_md5sig_pool __percpu *pool;

	pool = alloc_percpu(struct tcp_md5sig_pool);
	if (!pool)
		return NULL;

	for_each_possible_cpu(cpu) {
		struct crypto_hash *hash;

		hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
		if (IS_ERR_OR_NULL(hash))
			goto out_free;

		per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash;
	}
	return pool;
out_free:
	__tcp_free_md5sig_pool(pool);
	return NULL;
}

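/* Take a reference on the global MD5 pool, allocating the per-cpu crypto
 * transforms on first use.  The allocation may sleep, so it is done outside
 * the pool lock, and a pool installed by a racing caller is reused.
 */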
struct tcp_md5sig_pool __percpu *tcp_alloc_md5sig_pool(struct sock *sk)
{
	struct tcp_md5sig_pool __percpu *pool;
	bool alloc = false;

retry:
	spin_lock_bh(&tcp_md5sig_pool_lock);
	pool = tcp_md5sig_pool;
	if (tcp_md5sig_users++ == 0) {
		alloc = true;
		spin_unlock_bh(&tcp_md5sig_pool_lock);
	} else if (!pool) {
		tcp_md5sig_users--;
		spin_unlock_bh(&tcp_md5sig_pool_lock);
		cpu_relax();
		goto retry;
	} else
		spin_unlock_bh(&tcp_md5sig_pool_lock);

	if (alloc) {
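		/* We cannot hold the spinlock here because the allocation
		 * may sleep.
		 */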
		struct tcp_md5sig_pool __percpu *p;

		p = __tcp_alloc_md5sig_pool(sk);
		spin_lock_bh(&tcp_md5sig_pool_lock);
		if (!p) {
			tcp_md5sig_users--;
			spin_unlock_bh(&tcp_md5sig_pool_lock);
			return NULL;
		}
		pool = tcp_md5sig_pool;
		if (pool) {
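			/* Another thread installed a pool while we slept;
			 * use theirs and free ours.
			 */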
			spin_unlock_bh(&tcp_md5sig_pool_lock);
			__tcp_free_md5sig_pool(p);
		} else {
			tcp_md5sig_pool = pool = p;
			spin_unlock_bh(&tcp_md5sig_pool_lock);
		}
	}
	return pool;
}
EXPORT_SYMBOL(tcp_alloc_md5sig_pool);

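/**
 * tcp_get_md5sig_pool - get md5sig_pool for this user
 *
 * Returns this cpu's entry of the shared pool, or NULL if the pool has not
 * been allocated.  On success BH is left disabled and a user reference is
 * held; the caller must release both with tcp_put_md5sig_pool().
 */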
struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
{
	struct tcp_md5sig_pool __percpu *p;

	local_bh_disable();

	spin_lock(&tcp_md5sig_pool_lock);
	p = tcp_md5sig_pool;
	if (p)
		tcp_md5sig_users++;
	spin_unlock(&tcp_md5sig_pool_lock);

	if (p)
		return this_cpu_ptr(p);

	local_bh_enable();
	return NULL;
}
EXPORT_SYMBOL(tcp_get_md5sig_pool);

void tcp_put_md5sig_pool(void)
{
	local_bh_enable();
	tcp_free_md5sig_pool();
}
EXPORT_SYMBOL(tcp_put_md5sig_pool);

int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
			const struct tcphdr *th)
{
	struct scatterlist sg;
	struct tcphdr hdr;
	int err;

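	/* The header is const, so hash a local copy with the checksum
	 * field zeroed out.
	 */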
	memcpy(&hdr, th, sizeof(hdr));
	hdr.check = 0;

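	/* Only the base header is hashed; TCP options are not part of the
	 * signature.
	 */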
	sg_init_one(&sg, &hdr, sizeof(hdr));
	err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr));
	return err;
}
EXPORT_SYMBOL(tcp_md5_hash_header);

int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
			  const struct sk_buff *skb, unsigned int header_len)
{
	struct scatterlist sg;
	const struct tcphdr *tp = tcp_hdr(skb);
	struct hash_desc *desc = &hp->md5_desc;
	unsigned int i;
	const unsigned int head_data_len = skb_headlen(skb) > header_len ?
					   skb_headlen(skb) - header_len : 0;
	const struct skb_shared_info *shi = skb_shinfo(skb);
	struct sk_buff *frag_iter;

	sg_init_table(&sg, 1);

	sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
	if (crypto_hash_update(desc, &sg, head_data_len))
		return 1;

	for (i = 0; i < shi->nr_frags; ++i) {
		const struct skb_frag_struct *f = &shi->frags[i];
		unsigned int offset = f->page_offset;
		struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);

		sg_set_page(&sg, page, skb_frag_size(f),
			    offset_in_page(offset));
		if (crypto_hash_update(desc, &sg, skb_frag_size(f)))
			return 1;
	}

	skb_walk_frags(skb, frag_iter)
		if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
			return 1;

	return 0;
}
EXPORT_SYMBOL(tcp_md5_hash_skb_data);

int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
{
	struct scatterlist sg;

	sg_init_one(&sg, key->key, key->keylen);
	return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
}
EXPORT_SYMBOL(tcp_md5_hash_key);

#endif

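/* Move a socket to TCP_CLOSE, stop its timers and either notify the owner
 * or, if the socket is already orphaned, destroy it.
 */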
void tcp_done(struct sock *sk)
{
	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;

	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);

	tcp_set_state(sk, TCP_CLOSE);
	tcp_clear_xmit_timers(sk);
	if (req != NULL)
		reqsk_fastopen_remove(sk, req, false);

	sk->sk_shutdown = SHUTDOWN_MASK;

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_state_change(sk);
	else
		inet_csk_destroy_sock(sk);
}
EXPORT_SYMBOL_GPL(tcp_done);

extern struct tcp_congestion_ops tcp_reno;

static __initdata unsigned long thash_entries;
static int __init set_thash_entries(char *str)
{
	ssize_t ret;

	if (!str)
		return 0;

	ret = kstrtoul(str, 0, &thash_entries);
	if (ret)
		return 0;

	return 1;
}
__setup("thash_entries=", set_thash_entries);

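/* Derive the default tcp_mem[] pressure thresholds (in pages) from the
 * memory available for buffers.
 */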
void tcp_init_mem(struct net *net)
{
	unsigned long limit = nr_free_buffer_pages() / 8;
	limit = max(limit, 128UL);
	net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
	net->ipv4.sysctl_tcp_mem[1] = limit;
	net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
}

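/* Boot-time initialisation of the TCP stack: hash tables, default limits
 * and the built-in Reno congestion control.
 */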
void __init tcp_init(void)
{
	struct sk_buff *skb = NULL;
	unsigned long limit;
	int max_rshare, max_wshare, cnt;
	unsigned int i;

	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));

	percpu_counter_init(&tcp_sockets_allocated, 0);
	percpu_counter_init(&tcp_orphan_count, 0);
	tcp_hashinfo.bind_bucket_cachep =
		kmem_cache_create("tcp_bind_bucket",
				  sizeof(struct inet_bind_bucket), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

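	/* Size and allocate the main established and bind bucket
	 * hash tables.
	 *
	 * The methodology is similar to that of the buffer cache.
	 */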
	tcp_hashinfo.ehash =
		alloc_large_system_hash("TCP established",
					sizeof(struct inet_ehash_bucket),
					thash_entries,
					17,
					0,
					NULL,
					&tcp_hashinfo.ehash_mask,
					0,
					thash_entries ? 0 : 512 * 1024);
	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
	}
	if (inet_ehash_locks_alloc(&tcp_hashinfo))
		panic("TCP: failed to alloc ehash_locks");
	tcp_hashinfo.bhash =
		alloc_large_system_hash("TCP bind",
					sizeof(struct inet_bind_hashbucket),
					tcp_hashinfo.ehash_mask + 1,
					17,
					0,
					&tcp_hashinfo.bhash_size,
					NULL,
					0,
					64 * 1024);
	tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
	}

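	/* Derive default orphan, time-wait and SYN backlog limits from the
	 * size of the established hash table.
	 */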
	cnt = tcp_hashinfo.ehash_mask + 1;

	tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	sysctl_tcp_max_orphans = cnt / 2;
	sysctl_max_syn_backlog = max(128, cnt / 256);

	tcp_init_mem(&init_net);

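	/* Set per-socket limits to no more than 1/128 the pressure threshold */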
	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
	max_wshare = min(4UL*1024*1024, limit);
	max_rshare = min(6UL*1024*1024, limit);

	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
	sysctl_tcp_wmem[1] = 16*1024;
	sysctl_tcp_wmem[2] = max(64*1024, max_wshare);

	sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
	sysctl_tcp_rmem[1] = 87380;
	sysctl_tcp_rmem[2] = max(87380, max_rshare);

	pr_info("Hash tables configured (established %u bind %u)\n",
		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);

	tcp_metrics_init();

	tcp_register_congestion_control(&tcp_reno);

	tcp_tasklet_init();
}