#define pr_fmt(fmt) "TCP: " fmt

#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/inet_diag.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/scatterlist.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/random.h>
#include <linux/bootmem.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/time.h>
#include <linux/slab.h>

#include <net/icmp.h>
#include <net/inet_common.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/sock.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <net/busy_poll.h>

int sysctl_tcp_min_tso_segs __read_mostly = 2;

int sysctl_tcp_autocorking __read_mostly = 1;

struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);

long sysctl_tcp_mem[3] __read_mostly;
int sysctl_tcp_wmem[3] __read_mostly;
int sysctl_tcp_rmem[3] __read_mostly;

EXPORT_SYMBOL(sysctl_tcp_mem);
EXPORT_SYMBOL(sysctl_tcp_rmem);
EXPORT_SYMBOL(sysctl_tcp_wmem);

atomic_long_t tcp_memory_allocated;
EXPORT_SYMBOL(tcp_memory_allocated);

struct percpu_counter tcp_sockets_allocated;
EXPORT_SYMBOL(tcp_sockets_allocated);

struct tcp_splice_state {
	struct pipe_inode_info *pipe;
	size_t len;
	unsigned int flags;
};

unsigned long tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL_GPL(tcp_memory_pressure);

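/* Enter the global TCP memory pressure state, recording (in jiffies) when
 * pressure began so that the time spent under pressure can be accounted
 * once tcp_leave_memory_pressure() clears it.
 */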
void tcp_enter_memory_pressure(struct sock *sk)
{
	unsigned long val;

	if (tcp_memory_pressure)
		return;
	val = jiffies;

	if (!val)
		val--;
	if (!cmpxchg(&tcp_memory_pressure, 0, val))
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
}
EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure);

void tcp_leave_memory_pressure(struct sock *sk)
{
	unsigned long val;

	if (!tcp_memory_pressure)
		return;
	val = xchg(&tcp_memory_pressure, 0);
	if (val)
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
			      jiffies_to_msecs(jiffies - val));
}
EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure);

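/* Convert a time in seconds to a retransmission count and back, given the
 * initial timeout and the RTO ceiling (exponential backoff capped at
 * rto_max). Used for the TCP_DEFER_ACCEPT conversion below.
 */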
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
{
	u8 res = 0;

	if (seconds > 0) {
		int period = timeout;

		res = 1;
		while (seconds > period && res < 255) {
			res++;
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return res;
}

static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
{
	int period = 0;

	if (retrans > 0) {
		period = timeout;
		while (--retrans) {
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return period;
}

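/* Address-family independent initialization for a new TCP socket: timers,
 * initial RTO, initial cwnd/ssthresh, congestion control assignment and
 * default send/receive buffer sizes are set up here.
 */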
void tcp_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	tp->out_of_order_queue = RB_ROOT;
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);
	INIT_LIST_HEAD(&tp->tsq_node);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
	minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);

	tp->snd_cwnd = TCP_INIT_CWND;

	tp->app_limited = ~0U;

	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;

	tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
	tcp_assign_congestion_control(sk);

	tp->tsoffset = 0;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_sync_mss = tcp_sync_mss;

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	sk_sockets_allocated_inc(sk);
}
EXPORT_SYMBOL(tcp_init_sock);

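/* Apply the requested transmit timestamp flags to the skb carrying the
 * last byte of this write, keying the timestamp to that byte's sequence
 * number so it can be matched against the completion report.
 */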
static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb)
{
	if (tsflags && skb) {
		struct skb_shared_info *shinfo = skb_shinfo(skb);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

		sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
		if (tsflags & SOF_TIMESTAMPING_TX_ACK)
			tcb->txstamp_ack = 1;
		if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
			shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
	}
}

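/* Wait for a TCP event. The poll mask is computed from socket state without
 * taking the socket lock, so the checks below must tolerate races with the
 * protocol processing paths.
 */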
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	unsigned int mask;
	struct sock *sk = sock->sk;
	const struct tcp_sock *tp = tcp_sk(sk);
	int state;

	sock_rps_record_flow(sk);

	sock_poll_wait(file, sk_sleep(sk), wait);

	state = sk_state_load(sk);
	if (state == TCP_LISTEN)
		return inet_csk_listen_poll(sk);

	mask = 0;

	if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
		mask |= POLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLIN | POLLRDNORM | POLLRDHUP;

	if (state != TCP_SYN_SENT &&
	    (state != TCP_SYN_RECV || tp->fastopen_rsk)) {
		int target = sock_rcvlowat(sk, 0, INT_MAX);

		if (tp->urg_seq == tp->copied_seq &&
		    !sock_flag(sk, SOCK_URGINLINE) &&
		    tp->urg_data)
			target++;

		if (tp->rcv_nxt - tp->copied_seq >= target)
			mask |= POLLIN | POLLRDNORM;

		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
			if (sk_stream_is_writeable(sk)) {
				mask |= POLLOUT | POLLWRNORM;
			} else {
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				smp_mb__after_atomic();
				if (sk_stream_is_writeable(sk))
					mask |= POLLOUT | POLLWRNORM;
			}
		} else
			mask |= POLLOUT | POLLWRNORM;

		if (tp->urg_data & TCP_URG_VALID)
			mask |= POLLPRI;
	} else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
		mask |= POLLOUT | POLLWRNORM;
	}

	smp_rmb();
	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
		mask |= POLLERR;

	return mask;
}
EXPORT_SYMBOL(tcp_poll);

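/* ioctl() handler for SIOCINQ, SIOCATMARK, SIOCOUTQ and SIOCOUTQNSD queue
 * queries. Userspace example (illustrative only):
 *
 *	int bytes;
 *	ioctl(fd, SIOCINQ, &bytes);	returns unread bytes in the rx queue
 */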
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int answ;
	bool slow;

	switch (cmd) {
	case SIOCINQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		slow = lock_sock_fast(sk);
		answ = tcp_inq(sk);
		unlock_sock_fast(sk, slow);
		break;
	case SIOCATMARK:
		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
		break;
	case SIOCOUTQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = tp->write_seq - tp->snd_una;
		break;
	case SIOCOUTQNSD:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = tp->write_seq - tp->snd_nxt;
		break;
	default:
		return -ENOIOCTLCMD;
	}

	return put_user(answ, (int __user *)arg);
}
EXPORT_SYMBOL(tcp_ioctl);

static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
	TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
	tp->pushed_seq = tp->write_seq;
}

static inline bool forced_push(const struct tcp_sock *tp)
{
	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}

static void skb_entail(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

	skb->csum = 0;
	tcb->seq = tcb->end_seq = tp->write_seq;
	tcb->tcp_flags = TCPHDR_ACK;
	tcb->sacked = 0;
	__skb_header_release(skb);
	tcp_add_write_queue_tail(sk, skb);
	sk->sk_wmem_queued += skb->truesize;
	sk_mem_charge(sk, skb->truesize);
	if (tp->nonagle & TCP_NAGLE_PUSH)
		tp->nonagle &= ~TCP_NAGLE_PUSH;

	tcp_slow_start_after_idle_check(sk);
}

static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
{
	if (flags & MSG_OOB)
		tp->snd_up = tp->write_seq;
}

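/* Autocorking: if the tail skb is still below size_goal and previously
 * queued data has not yet left the stack (sk_wmem_alloc is above the tail
 * skb's truesize), delay the push and wait for more data to coalesce.
 */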
static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
				int size_goal)
{
	return skb->len < size_goal &&
	       sysctl_tcp_autocorking &&
	       skb != tcp_write_queue_head(sk) &&
	       refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
}

static void tcp_push(struct sock *sk, int flags, int mss_now,
		     int nonagle, int size_goal)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	if (!tcp_send_head(sk))
		return;

	skb = tcp_write_queue_tail(sk);
	if (!(flags & MSG_MORE) || forced_push(tp))
		tcp_mark_push(tp, skb);

	tcp_mark_urg(tp, flags);

	if (tcp_should_autocork(sk, skb, size_goal)) {

		if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
			set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
		}

		if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
			return;
	}

	if (flags & MSG_MORE)
		nonagle = TCP_NAGLE_CORK;

	__tcp_push_pending_frames(sk, mss_now, nonagle);
}

static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
				unsigned int offset, size_t len)
{
	struct tcp_splice_state *tss = rd_desc->arg.data;
	int ret;

	ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
			      min(rd_desc->count, len), tss->flags);
	if (ret > 0)
		rd_desc->count -= ret;
	return ret;
}

static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
{
	read_descriptor_t rd_desc = {
		.arg.data = tss,
		.count	  = tss->len,
	};

	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
}

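/* tcp_splice_read - splice received data from a TCP socket into a pipe,
 * sleeping as needed unless the caller requested non-blocking behaviour.
 * Returns the number of bytes spliced or a negative error.
 */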
ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
			struct pipe_inode_info *pipe, size_t len,
			unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct tcp_splice_state tss = {
		.pipe = pipe,
		.len = len,
		.flags = flags,
	};
	long timeo;
	ssize_t spliced;
	int ret;

	sock_rps_record_flow(sk);

	if (unlikely(*ppos))
		return -ESPIPE;

	ret = spliced = 0;

	lock_sock(sk);

	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
	while (tss.len) {
		ret = __tcp_splice_read(sk, &tss);
		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (sock_flag(sk, SOCK_DONE))
				break;
			if (sk->sk_err) {
				ret = sock_error(sk);
				break;
			}
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;
			if (sk->sk_state == TCP_CLOSE) {
				if (!sock_flag(sk, SOCK_DONE))
					ret = -ENOTCONN;
				break;
			}
			if (!timeo) {
				ret = -EAGAIN;
				break;
			}

			if (!skb_queue_empty(&sk->sk_receive_queue))
				break;
			sk_wait_data(sk, &timeo, NULL);
			if (signal_pending(current)) {
				ret = sock_intr_errno(timeo);
				break;
			}
			continue;
		}
		tss.len -= ret;
		spliced += ret;

		if (!timeo)
			break;
		release_sock(sk);
		lock_sock(sk);

		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current))
			break;
	}

	release_sock(sk);

	if (spliced)
		return spliced;

	return ret;
}
EXPORT_SYMBOL(tcp_splice_read);

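/* Allocate a write-queue skb with room for max_header and charge it to the
 * socket's send buffer. Callers pass force_schedule for the first skb on
 * the write queue so a sender under memory pressure can still make forward
 * progress.
 */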
struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
				    bool force_schedule)
{
	struct sk_buff *skb;

	size = ALIGN(size, 4);

	if (unlikely(tcp_under_memory_pressure(sk)))
		sk_mem_reclaim_partial(sk);

	skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
	if (likely(skb)) {
		bool mem_scheduled;

		if (force_schedule) {
			mem_scheduled = true;
			sk_forced_mem_schedule(sk, skb->truesize);
		} else {
			mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
		}
		if (likely(mem_scheduled)) {
			skb_reserve(skb, sk->sk_prot->max_header);

			skb->reserved_tailroom = skb->end - skb->tail - size;
			return skb;
		}
		__kfree_skb(skb);
	} else {
		sk->sk_prot->enter_memory_pressure(sk);
		sk_stream_moderate_sndbuf(sk);
	}
	return NULL;
}

static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
				       int large_allowed)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 new_size_goal, size_goal;

	if (!large_allowed || !sk_can_gso(sk))
		return mss_now;

	new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
	new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);

	size_goal = tp->gso_segs * mss_now;
	if (unlikely(new_size_goal < size_goal ||
		     new_size_goal >= size_goal + mss_now)) {
		tp->gso_segs = min_t(u16, new_size_goal / mss_now,
				     sk->sk_gso_max_segs);
		size_goal = tp->gso_segs * mss_now;
	}

	return max(size_goal, mss_now);
}

static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
{
	int mss_now;

	mss_now = tcp_current_mss(sk);
	*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));

	return mss_now;
}

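/* sendpage() backend: attach page fragments to the tail skb (or to fresh
 * skbs) on the write queue and push them according to the usual Nagle,
 * autocork and send-buffer rules.
 */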
ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
			 size_t size, int flags)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int mss_now, size_goal;
	int err;
	ssize_t copied;
	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		err = sk_stream_wait_connect(sk, &timeo);
		if (err != 0)
			goto out_err;
	}

	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	mss_now = tcp_send_mss(sk, &size_goal, flags);
	copied = 0;

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto out_err;

	while (size > 0) {
		struct sk_buff *skb = tcp_write_queue_tail(sk);
		int copy, i;
		bool can_coalesce;

		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 ||
		    !tcp_skb_can_collapse_to(skb)) {
new_segment:
			if (!sk_stream_memory_free(sk))
				goto wait_for_sndbuf;

			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
						  skb_queue_empty(&sk->sk_write_queue));
			if (!skb)
				goto wait_for_memory;

			skb_entail(sk, skb);
			copy = size_goal;
		}

		if (copy > size)
			copy = size;

		i = skb_shinfo(skb)->nr_frags;
		can_coalesce = skb_can_coalesce(skb, i, page, offset);
		if (!can_coalesce && i >= sysctl_max_skb_frags) {
			tcp_mark_push(tp, skb);
			goto new_segment;
		}
		if (!sk_wmem_schedule(sk, copy))
			goto wait_for_memory;

		if (can_coalesce) {
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
		} else {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, copy);
		}
		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;

		skb->len += copy;
		skb->data_len += copy;
		skb->truesize += copy;
		sk->sk_wmem_queued += copy;
		sk_mem_charge(sk, copy);
		skb->ip_summed = CHECKSUM_PARTIAL;
		tp->write_seq += copy;
		TCP_SKB_CB(skb)->end_seq += copy;
		tcp_skb_pcount_set(skb, 0);

		if (!copied)
			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

		copied += copy;
		offset += copy;
		size -= copy;
		if (!size)
			goto out;

		if (skb->len < size_goal || (flags & MSG_OOB))
			continue;

		if (forced_push(tp)) {
			tcp_mark_push(tp, skb);
			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
		} else if (skb == tcp_send_head(sk))
			tcp_push_one(sk, mss_now);
		continue;

wait_for_sndbuf:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
		tcp_push(sk, flags & ~MSG_MORE, mss_now,
			 TCP_NAGLE_PUSH, size_goal);

		err = sk_stream_wait_memory(sk, &timeo);
		if (err != 0)
			goto do_error;

		mss_now = tcp_send_mss(sk, &size_goal, flags);
	}

out:
	if (copied) {
		tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk));
		if (!(flags & MSG_SENDPAGE_NOTLAST))
			tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
	}
	return copied;

do_error:
	if (copied)
		goto out;
out_err:
	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
		     err == -EAGAIN)) {
		sk->sk_write_space(sk);
		tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
	}
	return sk_stream_error(sk, flags, err);
}
EXPORT_SYMBOL_GPL(do_tcp_sendpages);

int tcp_sendpage(struct sock *sk, struct page *page, int offset,
		 size_t size, int flags)
{
	ssize_t res;

	if (!(sk->sk_route_caps & NETIF_F_SG) ||
	    !sk_check_csum_caps(sk))
		return sock_no_sendpage(sk->sk_socket, page, offset, size,
					flags);

	lock_sock(sk);

	tcp_rate_check_app_limited(sk);

	res = do_tcp_sendpages(sk, page, offset, size, flags);
	release_sock(sk);
	return res;
}
EXPORT_SYMBOL(tcp_sendpage);

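/* Decide how much linear (non-paged) payload room to allocate for a new skb
 * in sendmsg(): GSO-capable scatter-gather paths reserve linear payload only
 * for the first skb on the write queue, while non-SG paths fall back to an
 * mss-sized linear buffer.
 */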
static int linear_payload_sz(bool first_skb)
{
	if (first_skb)
		return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
	return 0;
}

static int select_size(const struct sock *sk, bool sg, bool first_skb)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	int tmp = tp->mss_cache;

	if (sg) {
		if (sk_can_gso(sk)) {
			tmp = linear_payload_sz(first_skb);
		} else {
			int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);

			if (tmp >= pgbreak &&
			    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
				tmp = pgbreak;
		}
	}

	return tmp;
}

void tcp_free_fastopen_req(struct tcp_sock *tp)
{
	if (tp->fastopen_req) {
		kfree(tp->fastopen_req);
		tp->fastopen_req = NULL;
	}
}

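/* TCP Fast Open client side of sendmsg(): remember the message in
 * fastopen_req so its data can be carried in the SYN, then perform the
 * (possibly deferred) connect.
 */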
static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
				int *copied, size_t size)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_sock *inet = inet_sk(sk);
	struct sockaddr *uaddr = msg->msg_name;
	int err, flags;

	if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
	    (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
	     uaddr->sa_family == AF_UNSPEC))
		return -EOPNOTSUPP;
	if (tp->fastopen_req)
		return -EALREADY;

	tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
				   sk->sk_allocation);
	if (unlikely(!tp->fastopen_req))
		return -ENOBUFS;
	tp->fastopen_req->data = msg;
	tp->fastopen_req->size = size;

	if (inet->defer_connect) {
		err = tcp_connect(sk);

		if (err) {
			tcp_set_state(sk, TCP_CLOSE);
			inet->inet_dport = 0;
			sk->sk_route_caps = 0;
		}
	}
	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
	err = __inet_stream_connect(sk->sk_socket, uaddr,
				    msg->msg_namelen, flags, 1);

	if (tp->fastopen_req) {
		*copied = tp->fastopen_req->copied;
		tcp_free_fastopen_req(tp);
		inet->defer_connect = 0;
	}
	return err;
}

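/* Main sendmsg() path: copy user data into write-queue skbs (linear head or
 * page fragments), handling MSG_FASTOPEN, repair mode, MSG_OOB and the
 * usual Nagle/autocork push decisions.
 */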
1147int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
1148{
1149 struct tcp_sock *tp = tcp_sk(sk);
1150 struct sk_buff *skb;
1151 struct sockcm_cookie sockc;
1152 int flags, err, copied = 0;
1153 int mss_now = 0, size_goal, copied_syn = 0;
1154 bool process_backlog = false;
1155 bool sg;
1156 long timeo;
1157
1158 lock_sock(sk);
1159
1160 flags = msg->msg_flags;
1161 if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) {
1162 err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
1163 if (err == -EINPROGRESS && copied_syn > 0)
1164 goto out;
1165 else if (err)
1166 goto out_err;
1167 }
1168
1169 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1170
1171 tcp_rate_check_app_limited(sk);
1172
1173
1174
1175
1176
1177 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1178 !tcp_passive_fastopen(sk)) {
1179 err = sk_stream_wait_connect(sk, &timeo);
1180 if (err != 0)
1181 goto do_error;
1182 }
1183
1184 if (unlikely(tp->repair)) {
1185 if (tp->repair_queue == TCP_RECV_QUEUE) {
1186 copied = tcp_send_rcvq(sk, msg, size);
1187 goto out_nopush;
1188 }
1189
1190 err = -EINVAL;
1191 if (tp->repair_queue == TCP_NO_QUEUE)
1192 goto out_err;
1193
1194
1195 }
1196
1197 sockc.tsflags = sk->sk_tsflags;
1198 if (msg->msg_controllen) {
1199 err = sock_cmsg_send(sk, msg, &sockc);
1200 if (unlikely(err)) {
1201 err = -EINVAL;
1202 goto out_err;
1203 }
1204 }
1205
1206
1207 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1208
1209
1210 copied = 0;
1211
1212restart:
1213 mss_now = tcp_send_mss(sk, &size_goal, flags);
1214
1215 err = -EPIPE;
1216 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1217 goto do_error;
1218
1219 sg = !!(sk->sk_route_caps & NETIF_F_SG);
1220
1221 while (msg_data_left(msg)) {
1222 int copy = 0;
1223 int max = size_goal;
1224
1225 skb = tcp_write_queue_tail(sk);
1226 if (tcp_send_head(sk)) {
1227 if (skb->ip_summed == CHECKSUM_NONE)
1228 max = mss_now;
1229 copy = max - skb->len;
1230 }
1231
1232 if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
1233 bool first_skb;
1234
1235new_segment:
1236
1237
1238
1239 if (!sk_stream_memory_free(sk))
1240 goto wait_for_sndbuf;
1241
1242 if (process_backlog && sk_flush_backlog(sk)) {
1243 process_backlog = false;
1244 goto restart;
1245 }
1246 first_skb = skb_queue_empty(&sk->sk_write_queue);
1247 skb = sk_stream_alloc_skb(sk,
1248 select_size(sk, sg, first_skb),
1249 sk->sk_allocation,
1250 first_skb);
1251 if (!skb)
1252 goto wait_for_memory;
1253
1254 process_backlog = true;
1255
1256
1257
1258 if (sk_check_csum_caps(sk))
1259 skb->ip_summed = CHECKSUM_PARTIAL;
1260
1261 skb_entail(sk, skb);
1262 copy = size_goal;
1263 max = size_goal;
1264
1265
1266
1267
1268
1269 if (tp->repair)
1270 TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
1271 }
1272
1273
1274 if (copy > msg_data_left(msg))
1275 copy = msg_data_left(msg);
1276
1277
1278 if (skb_availroom(skb) > 0) {
1279
1280 copy = min_t(int, copy, skb_availroom(skb));
1281 err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
1282 if (err)
1283 goto do_fault;
1284 } else {
1285 bool merge = true;
1286 int i = skb_shinfo(skb)->nr_frags;
1287 struct page_frag *pfrag = sk_page_frag(sk);
1288
1289 if (!sk_page_frag_refill(sk, pfrag))
1290 goto wait_for_memory;
1291
1292 if (!skb_can_coalesce(skb, i, pfrag->page,
1293 pfrag->offset)) {
1294 if (i >= sysctl_max_skb_frags || !sg) {
1295 tcp_mark_push(tp, skb);
1296 goto new_segment;
1297 }
1298 merge = false;
1299 }
1300
1301 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1302
1303 if (!sk_wmem_schedule(sk, copy))
1304 goto wait_for_memory;
1305
1306 err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
1307 pfrag->page,
1308 pfrag->offset,
1309 copy);
1310 if (err)
1311 goto do_error;
1312
1313
1314 if (merge) {
1315 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1316 } else {
1317 skb_fill_page_desc(skb, i, pfrag->page,
1318 pfrag->offset, copy);
1319 page_ref_inc(pfrag->page);
1320 }
1321 pfrag->offset += copy;
1322 }
1323
1324 if (!copied)
1325 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1326
1327 tp->write_seq += copy;
1328 TCP_SKB_CB(skb)->end_seq += copy;
1329 tcp_skb_pcount_set(skb, 0);
1330
1331 copied += copy;
1332 if (!msg_data_left(msg)) {
1333 if (unlikely(flags & MSG_EOR))
1334 TCP_SKB_CB(skb)->eor = 1;
1335 goto out;
1336 }
1337
1338 if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
1339 continue;
1340
1341 if (forced_push(tp)) {
1342 tcp_mark_push(tp, skb);
1343 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1344 } else if (skb == tcp_send_head(sk))
1345 tcp_push_one(sk, mss_now);
1346 continue;
1347
1348wait_for_sndbuf:
1349 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1350wait_for_memory:
1351 if (copied)
1352 tcp_push(sk, flags & ~MSG_MORE, mss_now,
1353 TCP_NAGLE_PUSH, size_goal);
1354
1355 err = sk_stream_wait_memory(sk, &timeo);
1356 if (err != 0)
1357 goto do_error;
1358
1359 mss_now = tcp_send_mss(sk, &size_goal, flags);
1360 }
1361
1362out:
1363 if (copied) {
1364 tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk));
1365 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1366 }
1367out_nopush:
1368 release_sock(sk);
1369 return copied + copied_syn;
1370
1371do_fault:
1372 if (!skb->len) {
1373 tcp_unlink_write_queue(skb, sk);
1374
1375
1376
1377 tcp_check_send_head(sk, skb);
1378 sk_wmem_free_skb(sk, skb);
1379 }
1380
1381do_error:
1382 if (copied + copied_syn)
1383 goto out;
1384out_err:
1385 err = sk_stream_error(sk, flags, err);
1386
1387 if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
1388 err == -EAGAIN)) {
1389 sk->sk_write_space(sk);
1390 tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
1391 }
1392 release_sock(sk);
1393 return err;
1394}
1395EXPORT_SYMBOL(tcp_sendmsg);
1396
1397
1398
1399
1400
1401
1402static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1403{
1404 struct tcp_sock *tp = tcp_sk(sk);
1405
1406
1407 if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1408 tp->urg_data == TCP_URG_READ)
1409 return -EINVAL;
1410
1411 if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1412 return -ENOTCONN;
1413
1414 if (tp->urg_data & TCP_URG_VALID) {
1415 int err = 0;
1416 char c = tp->urg_data;
1417
1418 if (!(flags & MSG_PEEK))
1419 tp->urg_data = TCP_URG_READ;
1420
1421
1422 msg->msg_flags |= MSG_OOB;
1423
1424 if (len > 0) {
1425 if (!(flags & MSG_TRUNC))
1426 err = memcpy_to_msg(msg, &c, 1);
1427 len = 1;
1428 } else
1429 msg->msg_flags |= MSG_TRUNC;
1430
1431 return err ? -EFAULT : len;
1432 }
1433
1434 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1435 return 0;
1436
1437
1438
1439
1440
1441
1442
1443 return -EAGAIN;
1444}
1445
1446static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1447{
1448 struct sk_buff *skb;
1449 int copied = 0, err = 0;
1450
1451
1452
1453 skb_queue_walk(&sk->sk_write_queue, skb) {
1454 err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
1455 if (err)
1456 break;
1457
1458 copied += skb->len;
1459 }
1460
1461 return err ?: copied;
1462}
1463
1464
1465
1466
1467
1468
1469
1470static void tcp_cleanup_rbuf(struct sock *sk, int copied)
1471{
1472 struct tcp_sock *tp = tcp_sk(sk);
1473 bool time_to_ack = false;
1474
1475 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1476
1477 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1478 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1479 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1480
1481 if (inet_csk_ack_scheduled(sk)) {
1482 const struct inet_connection_sock *icsk = inet_csk(sk);
1483
1484
1485 if (icsk->icsk_ack.blocked ||
1486
1487 tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1488
1489
1490
1491
1492
1493
1494 (copied > 0 &&
1495 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1496 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1497 !icsk->icsk_ack.pingpong)) &&
1498 !atomic_read(&sk->sk_rmem_alloc)))
1499 time_to_ack = true;
1500 }
1501
1502
1503
1504
1505
1506
1507
1508 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1509 __u32 rcv_window_now = tcp_receive_window(tp);
1510
1511
1512 if (2*rcv_window_now <= tp->window_clamp) {
1513 __u32 new_window = __tcp_select_window(sk);
1514
1515
1516
1517
1518
1519
1520 if (new_window && new_window >= 2 * rcv_window_now)
1521 time_to_ack = true;
1522 }
1523 }
1524 if (time_to_ack)
1525 tcp_send_ack(sk);
1526}
1527
1528static void tcp_prequeue_process(struct sock *sk)
1529{
1530 struct sk_buff *skb;
1531 struct tcp_sock *tp = tcp_sk(sk);
1532
1533 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
1534
1535 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1536 sk_backlog_rcv(sk, skb);
1537
1538
1539 tp->ucopy.memory = 0;
1540}
1541
1542static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1543{
1544 struct sk_buff *skb;
1545 u32 offset;
1546
1547 while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
1548 offset = seq - TCP_SKB_CB(skb)->seq;
1549 if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
1550 pr_err_once("%s: found a SYN, please report !\n", __func__);
1551 offset--;
1552 }
1553 if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
1554 *off = offset;
1555 return skb;
1556 }
1557
1558
1559
1560
1561 sk_eat_skb(sk, skb);
1562 }
1563 return NULL;
1564}
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1578 sk_read_actor_t recv_actor)
1579{
1580 struct sk_buff *skb;
1581 struct tcp_sock *tp = tcp_sk(sk);
1582 u32 seq = tp->copied_seq;
1583 u32 offset;
1584 int copied = 0;
1585
1586 if (sk->sk_state == TCP_LISTEN)
1587 return -ENOTCONN;
1588 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1589 if (offset < skb->len) {
1590 int used;
1591 size_t len;
1592
1593 len = skb->len - offset;
1594
1595 if (tp->urg_data) {
1596 u32 urg_offset = tp->urg_seq - seq;
1597 if (urg_offset < len)
1598 len = urg_offset;
1599 if (!len)
1600 break;
1601 }
1602 used = recv_actor(desc, skb, offset, len);
1603 if (used <= 0) {
1604 if (!copied)
1605 copied = used;
1606 break;
1607 } else if (used <= len) {
1608 seq += used;
1609 copied += used;
1610 offset += used;
1611 }
1612
1613
1614
1615
1616
1617 skb = tcp_recv_skb(sk, seq - 1, &offset);
1618 if (!skb)
1619 break;
1620
1621
1622
1623 if (offset + 1 != skb->len)
1624 continue;
1625 }
1626 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
1627 sk_eat_skb(sk, skb);
1628 ++seq;
1629 break;
1630 }
1631 sk_eat_skb(sk, skb);
1632 if (!desc->count)
1633 break;
1634 tp->copied_seq = seq;
1635 }
1636 tp->copied_seq = seq;
1637
1638 tcp_rcv_space_adjust(sk);
1639
1640
1641 if (copied > 0) {
1642 tcp_recv_skb(sk, seq, &offset);
1643 tcp_cleanup_rbuf(sk, copied);
1644 }
1645 return copied;
1646}
1647EXPORT_SYMBOL(tcp_read_sock);
1648
1649int tcp_peek_len(struct socket *sock)
1650{
1651 return tcp_inq(sock->sk);
1652}
1653EXPORT_SYMBOL(tcp_peek_len);
1654
1655
1656
1657
1658
1659
1660
1661
1662
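/* Main recvmsg() path: copy data from the receive queue (and the prequeue
 * and backlog) to user space, handling MSG_PEEK and urgent data, and update
 * receive-window/ACK state via tcp_cleanup_rbuf().
 */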
1663int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
1664 int flags, int *addr_len)
1665{
1666 struct tcp_sock *tp = tcp_sk(sk);
1667 int copied = 0;
1668 u32 peek_seq;
1669 u32 *seq;
1670 unsigned long used;
1671 int err;
1672 int target;
1673 long timeo;
1674 struct task_struct *user_recv = NULL;
1675 struct sk_buff *skb, *last;
1676 u32 urg_hole = 0;
1677
1678 if (unlikely(flags & MSG_ERRQUEUE))
1679 return inet_recv_error(sk, msg, len, addr_len);
1680
1681 if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
1682 (sk->sk_state == TCP_ESTABLISHED))
1683 sk_busy_loop(sk, nonblock);
1684
1685 lock_sock(sk);
1686
1687 err = -ENOTCONN;
1688 if (sk->sk_state == TCP_LISTEN)
1689 goto out;
1690
1691 timeo = sock_rcvtimeo(sk, nonblock);
1692
1693
1694 if (flags & MSG_OOB)
1695 goto recv_urg;
1696
1697 if (unlikely(tp->repair)) {
1698 err = -EPERM;
1699 if (!(flags & MSG_PEEK))
1700 goto out;
1701
1702 if (tp->repair_queue == TCP_SEND_QUEUE)
1703 goto recv_sndq;
1704
1705 err = -EINVAL;
1706 if (tp->repair_queue == TCP_NO_QUEUE)
1707 goto out;
1708
1709
1710 }
1711
1712 seq = &tp->copied_seq;
1713 if (flags & MSG_PEEK) {
1714 peek_seq = tp->copied_seq;
1715 seq = &peek_seq;
1716 }
1717
1718 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1719
1720 do {
1721 u32 offset;
1722
1723
1724 if (tp->urg_data && tp->urg_seq == *seq) {
1725 if (copied)
1726 break;
1727 if (signal_pending(current)) {
1728 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1729 break;
1730 }
1731 }
1732
1733
1734
1735 last = skb_peek_tail(&sk->sk_receive_queue);
1736 skb_queue_walk(&sk->sk_receive_queue, skb) {
1737 last = skb;
1738
1739
1740
1741 if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
1742 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
1743 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
1744 flags))
1745 break;
1746
1747 offset = *seq - TCP_SKB_CB(skb)->seq;
1748 if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
1749 pr_err_once("%s: found a SYN, please report !\n", __func__);
1750 offset--;
1751 }
1752 if (offset < skb->len)
1753 goto found_ok_skb;
1754 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1755 goto found_fin_ok;
1756 WARN(!(flags & MSG_PEEK),
1757 "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
1758 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
1759 }
1760
1761
1762
1763 if (copied >= target && !sk->sk_backlog.tail)
1764 break;
1765
1766 if (copied) {
1767 if (sk->sk_err ||
1768 sk->sk_state == TCP_CLOSE ||
1769 (sk->sk_shutdown & RCV_SHUTDOWN) ||
1770 !timeo ||
1771 signal_pending(current))
1772 break;
1773 } else {
1774 if (sock_flag(sk, SOCK_DONE))
1775 break;
1776
1777 if (sk->sk_err) {
1778 copied = sock_error(sk);
1779 break;
1780 }
1781
1782 if (sk->sk_shutdown & RCV_SHUTDOWN)
1783 break;
1784
1785 if (sk->sk_state == TCP_CLOSE) {
1786 if (!sock_flag(sk, SOCK_DONE)) {
1787
1788
1789
1790 copied = -ENOTCONN;
1791 break;
1792 }
1793 break;
1794 }
1795
1796 if (!timeo) {
1797 copied = -EAGAIN;
1798 break;
1799 }
1800
1801 if (signal_pending(current)) {
1802 copied = sock_intr_errno(timeo);
1803 break;
1804 }
1805 }
1806
1807 tcp_cleanup_rbuf(sk, copied);
1808
1809 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1810
1811 if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1812 user_recv = current;
1813 tp->ucopy.task = user_recv;
1814 tp->ucopy.msg = msg;
1815 }
1816
1817 tp->ucopy.len = len;
1818
1819 WARN_ON(tp->copied_seq != tp->rcv_nxt &&
1820 !(flags & (MSG_PEEK | MSG_TRUNC)));
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848 if (!skb_queue_empty(&tp->ucopy.prequeue))
1849 goto do_prequeue;
1850
1851
1852 }
1853
1854 if (copied >= target) {
1855
1856 release_sock(sk);
1857 lock_sock(sk);
1858 } else {
1859 sk_wait_data(sk, &timeo, last);
1860 }
1861
1862 if (user_recv) {
1863 int chunk;
1864
1865
1866
1867 chunk = len - tp->ucopy.len;
1868 if (chunk != 0) {
1869 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1870 len -= chunk;
1871 copied += chunk;
1872 }
1873
1874 if (tp->rcv_nxt == tp->copied_seq &&
1875 !skb_queue_empty(&tp->ucopy.prequeue)) {
1876do_prequeue:
1877 tcp_prequeue_process(sk);
1878
1879 chunk = len - tp->ucopy.len;
1880 if (chunk != 0) {
1881 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1882 len -= chunk;
1883 copied += chunk;
1884 }
1885 }
1886 }
1887 if ((flags & MSG_PEEK) &&
1888 (peek_seq - copied - urg_hole != tp->copied_seq)) {
1889 net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
1890 current->comm,
1891 task_pid_nr(current));
1892 peek_seq = tp->copied_seq;
1893 }
1894 continue;
1895
1896 found_ok_skb:
1897
1898 used = skb->len - offset;
1899 if (len < used)
1900 used = len;
1901
1902
1903 if (tp->urg_data) {
1904 u32 urg_offset = tp->urg_seq - *seq;
1905 if (urg_offset < used) {
1906 if (!urg_offset) {
1907 if (!sock_flag(sk, SOCK_URGINLINE)) {
1908 ++*seq;
1909 urg_hole++;
1910 offset++;
1911 used--;
1912 if (!used)
1913 goto skip_copy;
1914 }
1915 } else
1916 used = urg_offset;
1917 }
1918 }
1919
1920 if (!(flags & MSG_TRUNC)) {
1921 err = skb_copy_datagram_msg(skb, offset, msg, used);
1922 if (err) {
1923
1924 if (!copied)
1925 copied = -EFAULT;
1926 break;
1927 }
1928 }
1929
1930 *seq += used;
1931 copied += used;
1932 len -= used;
1933
1934 tcp_rcv_space_adjust(sk);
1935
1936skip_copy:
1937 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1938 tp->urg_data = 0;
1939 tcp_fast_path_check(sk);
1940 }
1941 if (used + offset < skb->len)
1942 continue;
1943
1944 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1945 goto found_fin_ok;
1946 if (!(flags & MSG_PEEK))
1947 sk_eat_skb(sk, skb);
1948 continue;
1949
1950 found_fin_ok:
1951
1952 ++*seq;
1953 if (!(flags & MSG_PEEK))
1954 sk_eat_skb(sk, skb);
1955 break;
1956 } while (len > 0);
1957
1958 if (user_recv) {
1959 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1960 int chunk;
1961
1962 tp->ucopy.len = copied > 0 ? len : 0;
1963
1964 tcp_prequeue_process(sk);
1965
1966 if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1967 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1968 len -= chunk;
1969 copied += chunk;
1970 }
1971 }
1972
1973 tp->ucopy.task = NULL;
1974 tp->ucopy.len = 0;
1975 }
1976
1977
1978
1979
1980
1981
1982 tcp_cleanup_rbuf(sk, copied);
1983
1984 release_sock(sk);
1985 return copied;
1986
1987out:
1988 release_sock(sk);
1989 return err;
1990
1991recv_urg:
1992 err = tcp_recv_urg(sk, msg, len, flags);
1993 goto out;
1994
1995recv_sndq:
1996 err = tcp_peek_sndq(sk, msg, len);
1997 goto out;
1998}
1999EXPORT_SYMBOL(tcp_recvmsg);
2000
void tcp_set_state(struct sock *sk, int state)
{
	int oldstate = sk->sk_state;

	switch (state) {
	case TCP_ESTABLISHED:
		if (oldstate != TCP_ESTABLISHED)
			TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
		break;

	case TCP_CLOSE:
		if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
			TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);

		sk->sk_prot->unhash(sk);
		if (inet_csk(sk)->icsk_bind_hash &&
		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
			inet_put_port(sk);
		/* fall through */
	default:
		if (oldstate == TCP_ESTABLISHED)
			TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
	}

	sk_state_store(sk, state);

#ifdef STATE_TRACE
	SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
#endif
}
EXPORT_SYMBOL_GPL(tcp_set_state);

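/* State an active close moves the socket to, indexed by the current state.
 * TCP_ACTION_FIN is OR-ed in when a FIN must be sent as part of the
 * transition.
 */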
static const unsigned char new_state[16] = {
	[0]			= TCP_CLOSE,
	[TCP_ESTABLISHED]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
	[TCP_SYN_SENT]		= TCP_CLOSE,
	[TCP_SYN_RECV]		= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
	[TCP_FIN_WAIT1]		= TCP_FIN_WAIT1,
	[TCP_FIN_WAIT2]		= TCP_FIN_WAIT2,
	[TCP_TIME_WAIT]		= TCP_CLOSE,
	[TCP_CLOSE]		= TCP_CLOSE,
	[TCP_CLOSE_WAIT]	= TCP_LAST_ACK | TCP_ACTION_FIN,
	[TCP_LAST_ACK]		= TCP_LAST_ACK,
	[TCP_LISTEN]		= TCP_CLOSE,
	[TCP_CLOSING]		= TCP_CLOSING,
	[TCP_NEW_SYN_RECV]	= TCP_CLOSE,
};

static int tcp_close_state(struct sock *sk)
{
	int next = (int)new_state[sk->sk_state];
	int ns = next & TCP_STATE_MASK;

	tcp_set_state(sk, ns);

	return next & TCP_ACTION_FIN;
}

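/* Shut down the sending side of a connection. Only SEND_SHUTDOWN changes
 * state and may emit a FIN; the receive side is handled elsewhere.
 */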
void tcp_shutdown(struct sock *sk, int how)
{
	if (!(how & SEND_SHUTDOWN))
		return;

	if ((1 << sk->sk_state) &
	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
		/* Clear out any half completed packets.  FIN if needed. */
		if (tcp_close_state(sk))
			tcp_send_fin(sk);
	}
}
EXPORT_SYMBOL(tcp_shutdown);

bool tcp_check_oom(struct sock *sk, int shift)
{
	bool too_many_orphans, out_of_socket_memory;

	too_many_orphans = tcp_too_many_orphans(sk, shift);
	out_of_socket_memory = tcp_out_of_memory(sk);

	if (too_many_orphans)
		net_info_ratelimited("too many orphaned sockets\n");
	if (out_of_socket_memory)
		net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
	return too_many_orphans || out_of_socket_memory;
}

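/* Close a TCP socket: flush any unread data (sending a reset if there was
 * some), walk through the FIN states if needed, then orphan the socket so
 * the remainder of the teardown can complete asynchronously.
 */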
2109void tcp_close(struct sock *sk, long timeout)
2110{
2111 struct sk_buff *skb;
2112 int data_was_unread = 0;
2113 int state;
2114
2115 lock_sock(sk);
2116 sk->sk_shutdown = SHUTDOWN_MASK;
2117
2118 if (sk->sk_state == TCP_LISTEN) {
2119 tcp_set_state(sk, TCP_CLOSE);
2120
2121
2122 inet_csk_listen_stop(sk);
2123
2124 goto adjudge_to_death;
2125 }
2126
2127
2128
2129
2130
2131 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
2132 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
2133
2134 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2135 len--;
2136 data_was_unread += len;
2137 __kfree_skb(skb);
2138 }
2139
2140 sk_mem_reclaim(sk);
2141
2142
2143 if (sk->sk_state == TCP_CLOSE)
2144 goto adjudge_to_death;
2145
2146
2147
2148
2149
2150
2151
2152
2153 if (unlikely(tcp_sk(sk)->repair)) {
2154 sk->sk_prot->disconnect(sk, 0);
2155 } else if (data_was_unread) {
2156
2157 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
2158 tcp_set_state(sk, TCP_CLOSE);
2159 tcp_send_active_reset(sk, sk->sk_allocation);
2160 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
2161
2162 sk->sk_prot->disconnect(sk, 0);
2163 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
2164 } else if (tcp_close_state(sk)) {
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194 tcp_send_fin(sk);
2195 }
2196
2197 sk_stream_wait_close(sk, timeout);
2198
2199adjudge_to_death:
2200 state = sk->sk_state;
2201 sock_hold(sk);
2202 sock_orphan(sk);
2203
2204
2205 release_sock(sk);
2206
2207
2208
2209
2210
2211 local_bh_disable();
2212 bh_lock_sock(sk);
2213 WARN_ON(sock_owned_by_user(sk));
2214
2215 percpu_counter_inc(sk->sk_prot->orphan_count);
2216
2217
2218 if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
2219 goto out;
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235 if (sk->sk_state == TCP_FIN_WAIT2) {
2236 struct tcp_sock *tp = tcp_sk(sk);
2237 if (tp->linger2 < 0) {
2238 tcp_set_state(sk, TCP_CLOSE);
2239 tcp_send_active_reset(sk, GFP_ATOMIC);
2240 __NET_INC_STATS(sock_net(sk),
2241 LINUX_MIB_TCPABORTONLINGER);
2242 } else {
2243 const int tmo = tcp_fin_time(sk);
2244
2245 if (tmo > TCP_TIMEWAIT_LEN) {
2246 inet_csk_reset_keepalive_timer(sk,
2247 tmo - TCP_TIMEWAIT_LEN);
2248 } else {
2249 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2250 goto out;
2251 }
2252 }
2253 }
2254 if (sk->sk_state != TCP_CLOSE) {
2255 sk_mem_reclaim(sk);
2256 if (tcp_check_oom(sk, 0)) {
2257 tcp_set_state(sk, TCP_CLOSE);
2258 tcp_send_active_reset(sk, GFP_ATOMIC);
2259 __NET_INC_STATS(sock_net(sk),
2260 LINUX_MIB_TCPABORTONMEMORY);
2261 }
2262 }
2263
2264 if (sk->sk_state == TCP_CLOSE) {
2265 struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
2266
2267
2268
2269
2270 if (req)
2271 reqsk_fastopen_remove(sk, req, false);
2272 inet_csk_destroy_sock(sk);
2273 }
2274
2275
2276out:
2277 bh_unlock_sock(sk);
2278 local_bh_enable();
2279 sock_put(sk);
2280}
2281EXPORT_SYMBOL(tcp_close);
2282
2283
2284
2285static inline bool tcp_need_reset(int state)
2286{
2287 return (1 << state) &
2288 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2289 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2290}
2291
2292int tcp_disconnect(struct sock *sk, int flags)
2293{
2294 struct inet_sock *inet = inet_sk(sk);
2295 struct inet_connection_sock *icsk = inet_csk(sk);
2296 struct tcp_sock *tp = tcp_sk(sk);
2297 int err = 0;
2298 int old_state = sk->sk_state;
2299
2300 if (old_state != TCP_CLOSE)
2301 tcp_set_state(sk, TCP_CLOSE);
2302
2303
2304 if (old_state == TCP_LISTEN) {
2305 inet_csk_listen_stop(sk);
2306 } else if (unlikely(tp->repair)) {
2307 sk->sk_err = ECONNABORTED;
2308 } else if (tcp_need_reset(old_state) ||
2309 (tp->snd_nxt != tp->write_seq &&
2310 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2311
2312
2313
2314 tcp_send_active_reset(sk, gfp_any());
2315 sk->sk_err = ECONNRESET;
2316 } else if (old_state == TCP_SYN_SENT)
2317 sk->sk_err = ECONNRESET;
2318
2319 tcp_clear_xmit_timers(sk);
2320 __skb_queue_purge(&sk->sk_receive_queue);
2321 tcp_write_queue_purge(sk);
2322 tcp_fastopen_active_disable_ofo_check(sk);
2323 skb_rbtree_purge(&tp->out_of_order_queue);
2324
2325 inet->inet_dport = 0;
2326
2327 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2328 inet_reset_saddr(sk);
2329
2330 sk->sk_shutdown = 0;
2331 sock_reset_flag(sk, SOCK_DONE);
2332 tp->srtt_us = 0;
2333 tp->write_seq += tp->max_window + 2;
2334 if (tp->write_seq == 0)
2335 tp->write_seq = 1;
2336 icsk->icsk_backoff = 0;
2337 tp->snd_cwnd = 2;
2338 icsk->icsk_probes_out = 0;
2339 tp->packets_out = 0;
2340 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
2341 tp->snd_cwnd_cnt = 0;
2342 tp->window_clamp = 0;
2343 tcp_set_ca_state(sk, TCP_CA_Open);
2344 tcp_clear_retrans(tp);
2345 inet_csk_delack_init(sk);
2346
2347
2348
2349 icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
2350 tcp_init_send_head(sk);
2351 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
2352 __sk_dst_reset(sk);
2353 dst_release(sk->sk_rx_dst);
2354 sk->sk_rx_dst = NULL;
2355 tcp_saved_syn_free(tp);
2356
2357
2358 tcp_free_fastopen_req(tp);
2359 inet->defer_connect = 0;
2360
2361 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
2362
2363 sk->sk_error_report(sk);
2364 return err;
2365}
2366EXPORT_SYMBOL(tcp_disconnect);
2367
2368static inline bool tcp_can_repair_sock(const struct sock *sk)
2369{
2370 return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
2371 (sk->sk_state != TCP_LISTEN);
2372}
2373
2374static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len)
2375{
2376 struct tcp_repair_window opt;
2377
2378 if (!tp->repair)
2379 return -EPERM;
2380
2381 if (len != sizeof(opt))
2382 return -EINVAL;
2383
2384 if (copy_from_user(&opt, optbuf, sizeof(opt)))
2385 return -EFAULT;
2386
2387 if (opt.max_window < opt.snd_wnd)
2388 return -EINVAL;
2389
2390 if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
2391 return -EINVAL;
2392
2393 if (after(opt.rcv_wup, tp->rcv_nxt))
2394 return -EINVAL;
2395
2396 tp->snd_wl1 = opt.snd_wl1;
2397 tp->snd_wnd = opt.snd_wnd;
2398 tp->max_window = opt.max_window;
2399
2400 tp->rcv_wnd = opt.rcv_wnd;
2401 tp->rcv_wup = opt.rcv_wup;
2402
2403 return 0;
2404}
2405
2406static int tcp_repair_options_est(struct sock *sk,
2407 struct tcp_repair_opt __user *optbuf, unsigned int len)
2408{
2409 struct tcp_sock *tp = tcp_sk(sk);
2410 struct tcp_repair_opt opt;
2411
2412 while (len >= sizeof(opt)) {
2413 if (copy_from_user(&opt, optbuf, sizeof(opt)))
2414 return -EFAULT;
2415
2416 optbuf++;
2417 len -= sizeof(opt);
2418
2419 switch (opt.opt_code) {
2420 case TCPOPT_MSS:
2421 tp->rx_opt.mss_clamp = opt.opt_val;
2422 tcp_mtup_init(sk);
2423 break;
2424 case TCPOPT_WINDOW:
2425 {
2426 u16 snd_wscale = opt.opt_val & 0xFFFF;
2427 u16 rcv_wscale = opt.opt_val >> 16;
2428
2429 if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE)
2430 return -EFBIG;
2431
2432 tp->rx_opt.snd_wscale = snd_wscale;
2433 tp->rx_opt.rcv_wscale = rcv_wscale;
2434 tp->rx_opt.wscale_ok = 1;
2435 }
2436 break;
2437 case TCPOPT_SACK_PERM:
2438 if (opt.opt_val != 0)
2439 return -EINVAL;
2440
2441 tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
2442 if (sysctl_tcp_fack)
2443 tcp_enable_fack(tp);
2444 break;
2445 case TCPOPT_TIMESTAMP:
2446 if (opt.opt_val != 0)
2447 return -EINVAL;
2448
2449 tp->rx_opt.tstamp_ok = 1;
2450 break;
2451 }
2452 }
2453
2454 return 0;
2455}
2456
2457
2458
2459
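/* Socket option handling for SOL_TCP. Userspace example (illustrative
 * only):
 *
 *	int one = 1;
 *	setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
 */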
2460static int do_tcp_setsockopt(struct sock *sk, int level,
2461 int optname, char __user *optval, unsigned int optlen)
2462{
2463 struct tcp_sock *tp = tcp_sk(sk);
2464 struct inet_connection_sock *icsk = inet_csk(sk);
2465 struct net *net = sock_net(sk);
2466 int val;
2467 int err = 0;
2468
2469
2470 switch (optname) {
2471 case TCP_CONGESTION: {
2472 char name[TCP_CA_NAME_MAX];
2473
2474 if (optlen < 1)
2475 return -EINVAL;
2476
2477 val = strncpy_from_user(name, optval,
2478 min_t(long, TCP_CA_NAME_MAX-1, optlen));
2479 if (val < 0)
2480 return -EFAULT;
2481 name[val] = 0;
2482
2483 lock_sock(sk);
2484 err = tcp_set_congestion_control(sk, name, true, true);
2485 release_sock(sk);
2486 return err;
2487 }
2488 case TCP_ULP: {
2489 char name[TCP_ULP_NAME_MAX];
2490
2491 if (optlen < 1)
2492 return -EINVAL;
2493
2494 val = strncpy_from_user(name, optval,
2495 min_t(long, TCP_ULP_NAME_MAX - 1,
2496 optlen));
2497 if (val < 0)
2498 return -EFAULT;
2499 name[val] = 0;
2500
2501 lock_sock(sk);
2502 err = tcp_set_ulp(sk, name);
2503 release_sock(sk);
2504 return err;
2505 }
2506 default:
2507
2508 break;
2509 }
2510
2511 if (optlen < sizeof(int))
2512 return -EINVAL;
2513
2514 if (get_user(val, (int __user *)optval))
2515 return -EFAULT;
2516
2517 lock_sock(sk);
2518
2519 switch (optname) {
2520 case TCP_MAXSEG:
2521
2522
2523
2524
2525 if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) {
2526 err = -EINVAL;
2527 break;
2528 }
2529 tp->rx_opt.user_mss = val;
2530 break;
2531
2532 case TCP_NODELAY:
2533 if (val) {
2534
2535
2536
2537
2538
2539
2540
2541
2542 tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2543 tcp_push_pending_frames(sk);
2544 } else {
2545 tp->nonagle &= ~TCP_NAGLE_OFF;
2546 }
2547 break;
2548
2549 case TCP_THIN_LINEAR_TIMEOUTS:
2550 if (val < 0 || val > 1)
2551 err = -EINVAL;
2552 else
2553 tp->thin_lto = val;
2554 break;
2555
2556 case TCP_THIN_DUPACK:
2557 if (val < 0 || val > 1)
2558 err = -EINVAL;
2559 break;
2560
2561 case TCP_REPAIR:
2562 if (!tcp_can_repair_sock(sk))
2563 err = -EPERM;
2564 else if (val == 1) {
2565 tp->repair = 1;
2566 sk->sk_reuse = SK_FORCE_REUSE;
2567 tp->repair_queue = TCP_NO_QUEUE;
2568 } else if (val == 0) {
2569 tp->repair = 0;
2570 sk->sk_reuse = SK_NO_REUSE;
2571 tcp_send_window_probe(sk);
2572 } else
2573 err = -EINVAL;
2574
2575 break;
2576
2577 case TCP_REPAIR_QUEUE:
2578 if (!tp->repair)
2579 err = -EPERM;
2580 else if (val < TCP_QUEUES_NR)
2581 tp->repair_queue = val;
2582 else
2583 err = -EINVAL;
2584 break;
2585
2586 case TCP_QUEUE_SEQ:
2587 if (sk->sk_state != TCP_CLOSE)
2588 err = -EPERM;
2589 else if (tp->repair_queue == TCP_SEND_QUEUE)
2590 tp->write_seq = val;
2591 else if (tp->repair_queue == TCP_RECV_QUEUE)
2592 tp->rcv_nxt = val;
2593 else
2594 err = -EINVAL;
2595 break;
2596
2597 case TCP_REPAIR_OPTIONS:
2598 if (!tp->repair)
2599 err = -EINVAL;
2600 else if (sk->sk_state == TCP_ESTABLISHED)
2601 err = tcp_repair_options_est(sk,
2602 (struct tcp_repair_opt __user *)optval,
2603 optlen);
2604 else
2605 err = -EPERM;
2606 break;
2607
2608 case TCP_CORK:
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620 if (val) {
2621 tp->nonagle |= TCP_NAGLE_CORK;
2622 } else {
2623 tp->nonagle &= ~TCP_NAGLE_CORK;
2624 if (tp->nonagle&TCP_NAGLE_OFF)
2625 tp->nonagle |= TCP_NAGLE_PUSH;
2626 tcp_push_pending_frames(sk);
2627 }
2628 break;
2629
2630 case TCP_KEEPIDLE:
2631 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2632 err = -EINVAL;
2633 else {
2634 tp->keepalive_time = val * HZ;
2635 if (sock_flag(sk, SOCK_KEEPOPEN) &&
2636 !((1 << sk->sk_state) &
2637 (TCPF_CLOSE | TCPF_LISTEN))) {
2638 u32 elapsed = keepalive_time_elapsed(tp);
2639 if (tp->keepalive_time > elapsed)
2640 elapsed = tp->keepalive_time - elapsed;
2641 else
2642 elapsed = 0;
2643 inet_csk_reset_keepalive_timer(sk, elapsed);
2644 }
2645 }
2646 break;
2647 case TCP_KEEPINTVL:
2648 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2649 err = -EINVAL;
2650 else
2651 tp->keepalive_intvl = val * HZ;
2652 break;
2653 case TCP_KEEPCNT:
2654 if (val < 1 || val > MAX_TCP_KEEPCNT)
2655 err = -EINVAL;
2656 else
2657 tp->keepalive_probes = val;
2658 break;
2659 case TCP_SYNCNT:
2660 if (val < 1 || val > MAX_TCP_SYNCNT)
2661 err = -EINVAL;
2662 else
2663 icsk->icsk_syn_retries = val;
2664 break;
2665
2666 case TCP_SAVE_SYN:
2667 if (val < 0 || val > 1)
2668 err = -EINVAL;
2669 else
2670 tp->save_syn = val;
2671 break;
2672
2673 case TCP_LINGER2:
2674 if (val < 0)
2675 tp->linger2 = -1;
2676 else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ)
2677 tp->linger2 = 0;
2678 else
2679 tp->linger2 = val * HZ;
2680 break;
2681
2682 case TCP_DEFER_ACCEPT:
2683
2684 icsk->icsk_accept_queue.rskq_defer_accept =
2685 secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
2686 TCP_RTO_MAX / HZ);
2687 break;
2688
2689 case TCP_WINDOW_CLAMP:
2690 if (!val) {
2691 if (sk->sk_state != TCP_CLOSE) {
2692 err = -EINVAL;
2693 break;
2694 }
2695 tp->window_clamp = 0;
2696 } else
2697 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2698 SOCK_MIN_RCVBUF / 2 : val;
2699 break;
2700
2701 case TCP_QUICKACK:
2702 if (!val) {
2703 icsk->icsk_ack.pingpong = 1;
2704 } else {
2705 icsk->icsk_ack.pingpong = 0;
2706 if ((1 << sk->sk_state) &
2707 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2708 inet_csk_ack_scheduled(sk)) {
2709 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
2710 tcp_cleanup_rbuf(sk, 1);
2711 if (!(val & 1))
2712 icsk->icsk_ack.pingpong = 1;
2713 }
2714 }
2715 break;
2716
2717#ifdef CONFIG_TCP_MD5SIG
2718 case TCP_MD5SIG:
2719 case TCP_MD5SIG_EXT:
2720
2721 err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
2722 break;
2723#endif
2724 case TCP_USER_TIMEOUT:
2725
2726
2727
2728 if (val < 0)
2729 err = -EINVAL;
2730 else
2731 icsk->icsk_user_timeout = msecs_to_jiffies(val);
2732 break;
2733
2734 case TCP_FASTOPEN:
2735 if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
2736 TCPF_LISTEN))) {
2737 tcp_fastopen_init_key_once(true);
2738
2739 fastopen_queue_tune(sk, val);
2740 } else {
2741 err = -EINVAL;
2742 }
2743 break;
2744 case TCP_FASTOPEN_CONNECT:
2745 if (val > 1 || val < 0) {
2746 err = -EINVAL;
2747 } else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
2748 if (sk->sk_state == TCP_CLOSE)
2749 tp->fastopen_connect = val;
2750 else
2751 err = -EINVAL;
2752 } else {
2753 err = -EOPNOTSUPP;
2754 }
2755 break;
2756 case TCP_TIMESTAMP:
2757 if (!tp->repair)
2758 err = -EPERM;
2759 else
2760 tp->tsoffset = val - tcp_time_stamp_raw();
2761 break;
2762 case TCP_REPAIR_WINDOW:
2763 err = tcp_repair_set_window(tp, optval, optlen);
2764 break;
2765 case TCP_NOTSENT_LOWAT:
2766 tp->notsent_lowat = val;
2767 sk->sk_write_space(sk);
2768 break;
2769 default:
2770 err = -ENOPROTOOPT;
2771 break;
2772 }
2773
2774 release_sock(sk);
2775 return err;
2776}
2777
2778int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2779 unsigned int optlen)
2780{
2781 const struct inet_connection_sock *icsk = inet_csk(sk);
2782
2783 if (level != SOL_TCP)
2784 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
2785 optval, optlen);
2786 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2787}
2788EXPORT_SYMBOL(tcp_setsockopt);
2789
2790#ifdef CONFIG_COMPAT
2791int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
2792 char __user *optval, unsigned int optlen)
2793{
2794 if (level != SOL_TCP)
2795 return inet_csk_compat_setsockopt(sk, level, optname,
2796 optval, optlen);
2797 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2798}
2799EXPORT_SYMBOL(compat_tcp_setsockopt);
2800#endif
2801
2802static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
2803 struct tcp_info *info)
2804{
2805 u64 stats[__TCP_CHRONO_MAX], total = 0;
2806 enum tcp_chrono i;
2807
2808 for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
2809 stats[i] = tp->chrono_stat[i - 1];
2810 if (i == tp->chrono_type)
2811 stats[i] += tcp_jiffies32 - tp->chrono_start;
2812 stats[i] *= USEC_PER_SEC / HZ;
2813 total += stats[i];
2814 }
2815
2816 info->tcpi_busy_time = total;
2817 info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
2818 info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
2819}
2820
2821
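/* Fill a struct tcp_info snapshot of the connection state, as reported via
 * the TCP_INFO getsockopt() and the inet_diag interface.
 */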
2822void tcp_get_info(struct sock *sk, struct tcp_info *info)
2823{
2824 const struct tcp_sock *tp = tcp_sk(sk);
2825 const struct inet_connection_sock *icsk = inet_csk(sk);
2826 u32 now, intv;
2827 u64 rate64;
2828 bool slow;
2829 u32 rate;
2830
2831 memset(info, 0, sizeof(*info));
2832 if (sk->sk_type != SOCK_STREAM)
2833 return;
2834
2835 info->tcpi_state = sk_state_load(sk);
2836
2837
2838 rate = READ_ONCE(sk->sk_pacing_rate);
2839 rate64 = rate != ~0U ? rate : ~0ULL;
2840 info->tcpi_pacing_rate = rate64;
2841
2842 rate = READ_ONCE(sk->sk_max_pacing_rate);
2843 rate64 = rate != ~0U ? rate : ~0ULL;
2844 info->tcpi_max_pacing_rate = rate64;
2845
2846 info->tcpi_reordering = tp->reordering;
2847 info->tcpi_snd_cwnd = tp->snd_cwnd;
2848
2849 if (info->tcpi_state == TCP_LISTEN) {
		/* Listeners alias these fields:
		 * tcpi_unacked -> number of connections ready to be accepted
		 * tcpi_sacked  -> maximum accept() backlog
		 */
2854 info->tcpi_unacked = sk->sk_ack_backlog;
2855 info->tcpi_sacked = sk->sk_max_ack_backlog;
2856 return;
2857 }
2858
2859 slow = lock_sock_fast(sk);
2860
2861 info->tcpi_ca_state = icsk->icsk_ca_state;
2862 info->tcpi_retransmits = icsk->icsk_retransmits;
2863 info->tcpi_probes = icsk->icsk_probes_out;
2864 info->tcpi_backoff = icsk->icsk_backoff;
2865
2866 if (tp->rx_opt.tstamp_ok)
2867 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2868 if (tcp_is_sack(tp))
2869 info->tcpi_options |= TCPI_OPT_SACK;
2870 if (tp->rx_opt.wscale_ok) {
2871 info->tcpi_options |= TCPI_OPT_WSCALE;
2872 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2873 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2874 }
2875
2876 if (tp->ecn_flags & TCP_ECN_OK)
2877 info->tcpi_options |= TCPI_OPT_ECN;
2878 if (tp->ecn_flags & TCP_ECN_SEEN)
2879 info->tcpi_options |= TCPI_OPT_ECN_SEEN;
2880 if (tp->syn_data_acked)
2881 info->tcpi_options |= TCPI_OPT_SYN_DATA;
2882
2883 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2884 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
2885 info->tcpi_snd_mss = tp->mss_cache;
2886 info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
2887
2888 info->tcpi_unacked = tp->packets_out;
2889 info->tcpi_sacked = tp->sacked_out;
2890
2891 info->tcpi_lost = tp->lost_out;
2892 info->tcpi_retrans = tp->retrans_out;
2893 info->tcpi_fackets = tp->fackets_out;
2894
2895 now = tcp_jiffies32;
2896 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2897 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2898 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2899
2900 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
2901 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2902 info->tcpi_rtt = tp->srtt_us >> 3;
2903 info->tcpi_rttvar = tp->mdev_us >> 2;
2904 info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2905 info->tcpi_advmss = tp->advmss;
2906
2907 info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3;
2908 info->tcpi_rcv_space = tp->rcvq_space.space;
2909
2910 info->tcpi_total_retrans = tp->total_retrans;
2911
2912 info->tcpi_bytes_acked = tp->bytes_acked;
2913 info->tcpi_bytes_received = tp->bytes_received;
2914 info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
2915 tcp_get_info_chrono_stats(tp, info);
2916
2917 info->tcpi_segs_out = tp->segs_out;
2918 info->tcpi_segs_in = tp->segs_in;
2919
2920 info->tcpi_min_rtt = tcp_min_rtt(tp);
2921 info->tcpi_data_segs_in = tp->data_segs_in;
2922 info->tcpi_data_segs_out = tp->data_segs_out;
2923
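	/* Report the most recent delivery rate sample, converted from packets
	 * per rate_interval_us to bytes per second.
	 */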
2924 info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
2925 rate = READ_ONCE(tp->rate_delivered);
2926 intv = READ_ONCE(tp->rate_interval_us);
2927 if (rate && intv) {
2928 rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
2929 do_div(rate64, intv);
2930 info->tcpi_delivery_rate = rate64;
2931 }
2932 unlock_sock_fast(sk, slow);
2933}
2934EXPORT_SYMBOL_GPL(tcp_get_info);
2935
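/* Build the skb of TCP_NLA_* attributes reported with transmit timestamps
 * when SOF_TIMESTAMPING_OPT_STATS is enabled.
 */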
2936struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
2937{
2938 const struct tcp_sock *tp = tcp_sk(sk);
2939 struct sk_buff *stats;
2940 struct tcp_info info;
2941
2942 stats = alloc_skb(5 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC);
2943 if (!stats)
2944 return NULL;
2945
2946 tcp_get_info_chrono_stats(tp, &info);
2947 nla_put_u64_64bit(stats, TCP_NLA_BUSY,
2948 info.tcpi_busy_time, TCP_NLA_PAD);
2949 nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
2950 info.tcpi_rwnd_limited, TCP_NLA_PAD);
2951 nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
2952 info.tcpi_sndbuf_limited, TCP_NLA_PAD);
2953 nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
2954 tp->data_segs_out, TCP_NLA_PAD);
2955 nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
2956 tp->total_retrans, TCP_NLA_PAD);
2957 return stats;
2958}
2959
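/* Read back a TCP-level socket option; most options copy out a single int. */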
2960static int do_tcp_getsockopt(struct sock *sk, int level,
2961 int optname, char __user *optval, int __user *optlen)
2962{
2963 struct inet_connection_sock *icsk = inet_csk(sk);
2964 struct tcp_sock *tp = tcp_sk(sk);
2965 struct net *net = sock_net(sk);
2966 int val, len;
2967
2968 if (get_user(len, optlen))
2969 return -EFAULT;
2970
2971 len = min_t(unsigned int, len, sizeof(int));
2972
2973 if (len < 0)
2974 return -EINVAL;
2975
2976 switch (optname) {
2977 case TCP_MAXSEG:
2978 val = tp->mss_cache;
2979 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2980 val = tp->rx_opt.user_mss;
2981 if (tp->repair)
2982 val = tp->rx_opt.mss_clamp;
2983 break;
2984 case TCP_NODELAY:
2985 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2986 break;
2987 case TCP_CORK:
2988 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2989 break;
2990 case TCP_KEEPIDLE:
2991 val = keepalive_time_when(tp) / HZ;
2992 break;
2993 case TCP_KEEPINTVL:
2994 val = keepalive_intvl_when(tp) / HZ;
2995 break;
2996 case TCP_KEEPCNT:
2997 val = keepalive_probes(tp);
2998 break;
2999 case TCP_SYNCNT:
3000 val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
3001 break;
3002 case TCP_LINGER2:
3003 val = tp->linger2;
3004 if (val >= 0)
3005 val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;
3006 break;
3007 case TCP_DEFER_ACCEPT:
3008 val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
3009 TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
3010 break;
3011 case TCP_WINDOW_CLAMP:
3012 val = tp->window_clamp;
3013 break;
3014 case TCP_INFO: {
3015 struct tcp_info info;
3016
3017 if (get_user(len, optlen))
3018 return -EFAULT;
3019
3020 tcp_get_info(sk, &info);
3021
3022 len = min_t(unsigned int, len, sizeof(info));
3023 if (put_user(len, optlen))
3024 return -EFAULT;
3025 if (copy_to_user(optval, &info, len))
3026 return -EFAULT;
3027 return 0;
3028 }
3029 case TCP_CC_INFO: {
3030 const struct tcp_congestion_ops *ca_ops;
3031 union tcp_cc_info info;
3032 size_t sz = 0;
3033 int attr;
3034
3035 if (get_user(len, optlen))
3036 return -EFAULT;
3037
3038 ca_ops = icsk->icsk_ca_ops;
3039 if (ca_ops && ca_ops->get_info)
3040 sz = ca_ops->get_info(sk, ~0U, &attr, &info);
3041
3042 len = min_t(unsigned int, len, sz);
3043 if (put_user(len, optlen))
3044 return -EFAULT;
3045 if (copy_to_user(optval, &info, len))
3046 return -EFAULT;
3047 return 0;
3048 }
3049 case TCP_QUICKACK:
3050 val = !icsk->icsk_ack.pingpong;
3051 break;
3052
3053 case TCP_CONGESTION:
3054 if (get_user(len, optlen))
3055 return -EFAULT;
3056 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
3057 if (put_user(len, optlen))
3058 return -EFAULT;
3059 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
3060 return -EFAULT;
3061 return 0;
3062
3063 case TCP_ULP:
3064 if (get_user(len, optlen))
3065 return -EFAULT;
3066 len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
3067 if (!icsk->icsk_ulp_ops) {
3068 if (put_user(0, optlen))
3069 return -EFAULT;
3070 return 0;
3071 }
3072 if (put_user(len, optlen))
3073 return -EFAULT;
3074 if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len))
3075 return -EFAULT;
3076 return 0;
3077
3078 case TCP_THIN_LINEAR_TIMEOUTS:
3079 val = tp->thin_lto;
3080 break;
3081
3082 case TCP_THIN_DUPACK:
3083 val = 0;
3084 break;
3085
3086 case TCP_REPAIR:
3087 val = tp->repair;
3088 break;
3089
3090 case TCP_REPAIR_QUEUE:
3091 if (tp->repair)
3092 val = tp->repair_queue;
3093 else
3094 return -EINVAL;
3095 break;
3096
3097 case TCP_REPAIR_WINDOW: {
3098 struct tcp_repair_window opt;
3099
3100 if (get_user(len, optlen))
3101 return -EFAULT;
3102
3103 if (len != sizeof(opt))
3104 return -EINVAL;
3105
3106 if (!tp->repair)
3107 return -EPERM;
3108
3109 opt.snd_wl1 = tp->snd_wl1;
3110 opt.snd_wnd = tp->snd_wnd;
3111 opt.max_window = tp->max_window;
3112 opt.rcv_wnd = tp->rcv_wnd;
3113 opt.rcv_wup = tp->rcv_wup;
3114
3115 if (copy_to_user(optval, &opt, len))
3116 return -EFAULT;
3117 return 0;
3118 }
3119 case TCP_QUEUE_SEQ:
3120 if (tp->repair_queue == TCP_SEND_QUEUE)
3121 val = tp->write_seq;
3122 else if (tp->repair_queue == TCP_RECV_QUEUE)
3123 val = tp->rcv_nxt;
3124 else
3125 return -EINVAL;
3126 break;
3127
3128 case TCP_USER_TIMEOUT:
3129 val = jiffies_to_msecs(icsk->icsk_user_timeout);
3130 break;
3131
3132 case TCP_FASTOPEN:
3133 val = icsk->icsk_accept_queue.fastopenq.max_qlen;
3134 break;
3135
3136 case TCP_FASTOPEN_CONNECT:
3137 val = tp->fastopen_connect;
3138 break;
3139
3140 case TCP_TIMESTAMP:
3141 val = tcp_time_stamp_raw() + tp->tsoffset;
3142 break;
3143 case TCP_NOTSENT_LOWAT:
3144 val = tp->notsent_lowat;
3145 break;
3146 case TCP_SAVE_SYN:
3147 val = tp->save_syn;
3148 break;
3149 case TCP_SAVED_SYN: {
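		/* saved_syn[0] holds the length of the saved SYN headers that
		 * follow; a successful read releases the saved data.
		 */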
3150 if (get_user(len, optlen))
3151 return -EFAULT;
3152
3153 lock_sock(sk);
3154 if (tp->saved_syn) {
3155 if (len < tp->saved_syn[0]) {
3156 if (put_user(tp->saved_syn[0], optlen)) {
3157 release_sock(sk);
3158 return -EFAULT;
3159 }
3160 release_sock(sk);
3161 return -EINVAL;
3162 }
3163 len = tp->saved_syn[0];
3164 if (put_user(len, optlen)) {
3165 release_sock(sk);
3166 return -EFAULT;
3167 }
3168 if (copy_to_user(optval, tp->saved_syn + 1, len)) {
3169 release_sock(sk);
3170 return -EFAULT;
3171 }
3172 tcp_saved_syn_free(tp);
3173 release_sock(sk);
3174 } else {
3175 release_sock(sk);
3176 len = 0;
3177 if (put_user(len, optlen))
3178 return -EFAULT;
3179 }
3180 return 0;
3181 }
3182 default:
3183 return -ENOPROTOOPT;
3184 }
3185
3186 if (put_user(len, optlen))
3187 return -EFAULT;
3188 if (copy_to_user(optval, &val, len))
3189 return -EFAULT;
3190 return 0;
3191}
3192
3193int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
3194 int __user *optlen)
3195{
3196 struct inet_connection_sock *icsk = inet_csk(sk);
3197
3198 if (level != SOL_TCP)
3199 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
3200 optval, optlen);
3201 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
3202}
3203EXPORT_SYMBOL(tcp_getsockopt);
3204
3205#ifdef CONFIG_COMPAT
3206int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
3207 char __user *optval, int __user *optlen)
3208{
3209 if (level != SOL_TCP)
3210 return inet_csk_compat_getsockopt(sk, level, optname,
3211 optval, optlen);
3212 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
3213}
3214EXPORT_SYMBOL(compat_tcp_getsockopt);
3215#endif
3216
3217#ifdef CONFIG_TCP_MD5SIG
3218static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
3219static DEFINE_MUTEX(tcp_md5sig_mutex);
3220static bool tcp_md5sig_pool_populated = false;
3221
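/* Allocate the MD5 transform plus a per-CPU scratch buffer and ahash request.
 * On allocation failure the function simply returns; anything already set up
 * is kept and reused by a later attempt.
 */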
3222static void __tcp_alloc_md5sig_pool(void)
3223{
3224 struct crypto_ahash *hash;
3225 int cpu;
3226
3227 hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
3228 if (IS_ERR(hash))
3229 return;
3230
3231 for_each_possible_cpu(cpu) {
3232 void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch;
3233 struct ahash_request *req;
3234
3235 if (!scratch) {
3236 scratch = kmalloc_node(sizeof(union tcp_md5sum_block) +
3237 sizeof(struct tcphdr),
3238 GFP_KERNEL,
3239 cpu_to_node(cpu));
3240 if (!scratch)
3241 return;
3242 per_cpu(tcp_md5sig_pool, cpu).scratch = scratch;
3243 }
3244 if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
3245 continue;
3246
3247 req = ahash_request_alloc(hash, GFP_KERNEL);
3248 if (!req)
3249 return;
3250
3251 ahash_request_set_callback(req, 0, NULL, NULL);
3252
3253 per_cpu(tcp_md5sig_pool, cpu).md5_req = req;
3254 }
3255
	/* All writes to the pool must be committed to memory before
	 * tcp_md5sig_pool_populated is set; paired with the smp_rmb()
	 * in tcp_get_md5sig_pool().
	 */
3258 smp_wmb();
3259 tcp_md5sig_pool_populated = true;
3260}
3261
3262bool tcp_alloc_md5sig_pool(void)
3263{
3264 if (unlikely(!tcp_md5sig_pool_populated)) {
3265 mutex_lock(&tcp_md5sig_mutex);
3266
3267 if (!tcp_md5sig_pool_populated)
3268 __tcp_alloc_md5sig_pool();
3269
3270 mutex_unlock(&tcp_md5sig_mutex);
3271 }
3272 return tcp_md5sig_pool_populated;
3273}
3274EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
3275
3276
/**
 *	tcp_get_md5sig_pool - get the per-CPU MD5 signing pool
 *
 *	On success, returns the pool for the local CPU with bottom halves
 *	disabled so the caller has exclusive use of it; BH is re-enabled
 *	when the caller is done (tcp_put_md5sig_pool()).  Returns NULL if
 *	the pool has not been populated.
 */
3284struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
3285{
3286 local_bh_disable();
3287
3288 if (tcp_md5sig_pool_populated) {
		/* Paired with the smp_wmb() in __tcp_alloc_md5sig_pool() */
3290 smp_rmb();
3291 return this_cpu_ptr(&tcp_md5sig_pool);
3292 }
3293 local_bh_enable();
3294 return NULL;
3295}
3296EXPORT_SYMBOL(tcp_get_md5sig_pool);
3297
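/* Feed an skb's payload into the pending MD5 request: the linear head past
 * header_len, every page fragment, and any frag_list skbs.  Returns non-zero
 * on error.
 */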
3298int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3299 const struct sk_buff *skb, unsigned int header_len)
3300{
3301 struct scatterlist sg;
3302 const struct tcphdr *tp = tcp_hdr(skb);
3303 struct ahash_request *req = hp->md5_req;
3304 unsigned int i;
3305 const unsigned int head_data_len = skb_headlen(skb) > header_len ?
3306 skb_headlen(skb) - header_len : 0;
3307 const struct skb_shared_info *shi = skb_shinfo(skb);
3308 struct sk_buff *frag_iter;
3309
3310 sg_init_table(&sg, 1);
3311
3312 sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
3313 ahash_request_set_crypt(req, &sg, NULL, head_data_len);
3314 if (crypto_ahash_update(req))
3315 return 1;
3316
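	/* Hash each page fragment in place, without copying. */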
3317 for (i = 0; i < shi->nr_frags; ++i) {
3318 const struct skb_frag_struct *f = &shi->frags[i];
3319 unsigned int offset = f->page_offset;
3320 struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
3321
3322 sg_set_page(&sg, page, skb_frag_size(f),
3323 offset_in_page(offset));
3324 ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
3325 if (crypto_ahash_update(req))
3326 return 1;
3327 }
3328
3329 skb_walk_frags(skb, frag_iter)
3330 if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
3331 return 1;
3332
3333 return 0;
3334}
3335EXPORT_SYMBOL(tcp_md5_hash_skb_data);
3336
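/* Mix the configured MD5 key itself into the pending hash request. */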
3337int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
3338{
3339 struct scatterlist sg;
3340
3341 sg_init_one(&sg, key->key, key->keylen);
3342 ahash_request_set_crypt(hp->md5_req, &sg, NULL, key->keylen);
3343 return crypto_ahash_update(hp->md5_req);
3344}
3345EXPORT_SYMBOL(tcp_md5_hash_key);
3346
3347#endif
3348
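/* Move a socket to TCP_CLOSE: count failed connection attempts, stop the
 * transmit timers, detach any pending Fast Open request, then either wake
 * the owner or destroy an already-orphaned socket.
 */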
3349void tcp_done(struct sock *sk)
3350{
3351 struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
3352
3353 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
3354 TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3355
3356 tcp_set_state(sk, TCP_CLOSE);
3357 tcp_clear_xmit_timers(sk);
3358 if (req)
3359 reqsk_fastopen_remove(sk, req, false);
3360
3361 sk->sk_shutdown = SHUTDOWN_MASK;
3362
3363 if (!sock_flag(sk, SOCK_DEAD))
3364 sk->sk_state_change(sk);
3365 else
3366 inet_csk_destroy_sock(sk);
3367}
3368EXPORT_SYMBOL_GPL(tcp_done);
3369
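/* Forcibly close a socket; used by the inet_diag SOCK_DESTROY operation.
 * Request sockets are simply dropped, full sockets get the error reported
 * and a RST if their state requires one.
 */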
3370int tcp_abort(struct sock *sk, int err)
3371{
3372 if (!sk_fullsock(sk)) {
3373 if (sk->sk_state == TCP_NEW_SYN_RECV) {
3374 struct request_sock *req = inet_reqsk(sk);
3375
3376 local_bh_disable();
3377 inet_csk_reqsk_queue_drop_and_put(req->rsk_listener,
3378 req);
3379 local_bh_enable();
3380 return 0;
3381 }
3382 return -EOPNOTSUPP;
3383 }
3384
	/* Don't race with userspace socket closes such as tcp_close(). */
3386 lock_sock(sk);
3387
3388 if (sk->sk_state == TCP_LISTEN) {
3389 tcp_set_state(sk, TCP_CLOSE);
3390 inet_csk_listen_stop(sk);
3391 }
3392
	/* Don't race with BH socket closes such as inet_csk_listen_stop(). */
3394 local_bh_disable();
3395 bh_lock_sock(sk);
3396
3397 if (!sock_flag(sk, SOCK_DEAD)) {
3398 sk->sk_err = err;
		/* Make the error visible before the wakeup; readers such as
		 * tcp_poll() issue a matching smp_rmb() before testing sk_err.
		 */
3400 smp_wmb();
3401 sk->sk_error_report(sk);
3402 if (tcp_need_reset(sk->sk_state))
3403 tcp_send_active_reset(sk, GFP_ATOMIC);
3404 tcp_done(sk);
3405 }
3406
3407 bh_unlock_sock(sk);
3408 local_bh_enable();
3409 release_sock(sk);
3410 return 0;
3411}
3412EXPORT_SYMBOL_GPL(tcp_abort);
3413
3414extern struct tcp_congestion_ops tcp_reno;
3415
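/* "thash_entries=" boot parameter: override the size of the established
 * hash table.
 */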
3416static __initdata unsigned long thash_entries;
3417static int __init set_thash_entries(char *str)
3418{
3419 ssize_t ret;
3420
3421 if (!str)
3422 return 0;
3423
3424 ret = kstrtoul(str, 0, &thash_entries);
3425 if (ret)
3426 return 0;
3427
3428 return 1;
3429}
3430__setup("thash_entries=", set_thash_entries);
3431
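/* Derive the default tcp_mem[] thresholds (low, pressure, high) from the
 * amount of memory available for buffers.
 */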
3432static void __init tcp_init_mem(void)
3433{
3434 unsigned long limit = nr_free_buffer_pages() / 16;
3435
3436 limit = max(limit, 128UL);
3437 sysctl_tcp_mem[0] = limit / 4 * 3;
3438 sysctl_tcp_mem[1] = limit;
3439 sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
3440}
3441
3442void __init tcp_init(void)
3443{
3444 int max_rshare, max_wshare, cnt;
3445 unsigned long limit;
3446 unsigned int i;
3447
3448 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
3449 FIELD_SIZEOF(struct sk_buff, cb));
3450
3451 percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
3452 percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
3453 inet_hashinfo_init(&tcp_hashinfo);
3454 tcp_hashinfo.bind_bucket_cachep =
3455 kmem_cache_create("tcp_bind_bucket",
3456 sizeof(struct inet_bind_bucket), 0,
3457 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3458
	/* Size and allocate the main established and bind bucket
	 * hash tables.
	 *
	 * The methodology is similar to that of the buffer cache.
	 */
3464 tcp_hashinfo.ehash =
3465 alloc_large_system_hash("TCP established",
3466 sizeof(struct inet_ehash_bucket),
3467 thash_entries,
					17, /* one slot per 128 KB of memory */
3469 0,
3470 NULL,
3471 &tcp_hashinfo.ehash_mask,
3472 0,
3473 thash_entries ? 0 : 512 * 1024);
3474 for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
3475 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
3476
3477 if (inet_ehash_locks_alloc(&tcp_hashinfo))
3478 panic("TCP: failed to alloc ehash_locks");
3479 tcp_hashinfo.bhash =
3480 alloc_large_system_hash("TCP bind",
3481 sizeof(struct inet_bind_hashbucket),
3482 tcp_hashinfo.ehash_mask + 1,
					17, /* one slot per 128 KB of memory */
3484 0,
3485 &tcp_hashinfo.bhash_size,
3486 NULL,
3487 0,
3488 64 * 1024);
3489 tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
3490 for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
3491 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
3492 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
3493 }
3494
	/* Allow at most half as many orphans as established hash slots. */
3496 cnt = tcp_hashinfo.ehash_mask + 1;
3497 sysctl_tcp_max_orphans = cnt / 2;
3498
3499 tcp_init_mem();
	/* Set per-socket limits to no more than 1/128 the pressure threshold */
3501 limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
3502 max_wshare = min(4UL*1024*1024, limit);
3503 max_rshare = min(6UL*1024*1024, limit);
3504
3505 sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
3506 sysctl_tcp_wmem[1] = 16*1024;
3507 sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
3508
3509 sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
3510 sysctl_tcp_rmem[1] = 87380;
3511 sysctl_tcp_rmem[2] = max(87380, max_rshare);
3512
3513 pr_info("Hash tables configured (established %u bind %u)\n",
3514 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
3515
3516 tcp_v4_init();
3517 tcp_metrics_init();
3518 BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
3519 tcp_tasklet_init();
3520}
3521