/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/scatterlist.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/random.h>
#include <linux/bootmem.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/crypto.h>

#include <net/icmp.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/netdma.h>
#include <net/sock.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>
int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;

struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);

int sysctl_tcp_mem[3] __read_mostly;
int sysctl_tcp_wmem[3] __read_mostly;
int sysctl_tcp_rmem[3] __read_mostly;

EXPORT_SYMBOL(sysctl_tcp_mem);
EXPORT_SYMBOL(sysctl_tcp_rmem);
EXPORT_SYMBOL(sysctl_tcp_wmem);

/* Current allocated TCP memory, in pages. */
atomic_t tcp_memory_allocated;
EXPORT_SYMBOL(tcp_memory_allocated);

/* Current number of TCP sockets. */
struct percpu_counter tcp_sockets_allocated;
EXPORT_SYMBOL(tcp_sockets_allocated);

/* State carried through the ->splice_read() actor. */
struct tcp_splice_state {
	struct pipe_inode_info *pipe;
	size_t len;
	unsigned int flags;
};

/*
 * Pressure flag: try to collapse.
 * Technical note: it is used by multiple contexts non atomically.
 * All the __sk_mem_schedule() is of this nature: accounting
 * is strict, actions are advisory and have some latency.
 */
int tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL(tcp_memory_pressure);

void tcp_enter_memory_pressure(struct sock *sk)
{
	if (!tcp_memory_pressure) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
		tcp_memory_pressure = 1;
	}
}
EXPORT_SYMBOL(tcp_enter_memory_pressure);
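/*
 * Convert a seconds count into a retransmission count, assuming the
 * usual exponential backoff: the timeout doubles after every attempt
 * until it is clamped at rto_max.  For example, with timeout = 1s and
 * a large rto_max, three retransmissions cover 1 + 2 + 4 = 7 seconds,
 * so secs_to_retrans(7, 1, ...) returns 3.  retrans_to_secs() below is
 * the inverse mapping.  Both are used to translate the TCP_DEFER_ACCEPT
 * socket option to and from a retransmission count.
 */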
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
{
	u8 res = 0;

	if (seconds > 0) {
		int period = timeout;

		res = 1;
		while (seconds > period && res < 255) {
			res++;
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return res;
}

static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
{
	int period = 0;

	if (retrans > 0) {
		period = timeout;
		while (--retrans) {
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return period;
}

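/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we don't
 *	go looking at any of the socket buffers directly.
 */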
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	unsigned int mask;
	struct sock *sk = sock->sk;
	struct tcp_sock *tp = tcp_sk(sk);

	sock_poll_wait(file, sk->sk_sleep, wait);
	if (sk->sk_state == TCP_LISTEN)
		return inet_csk_listen_poll(sk);

	mask = 0;
	if (sk->sk_err)
		mask = POLLERR;

	/*
	 * POLLHUP is level-triggered and cannot be masked out, so it is
	 * only reported once both directions are shut down or the socket
	 * is closed; a half-closed receive side reports
	 * POLLIN | POLLRDHUP instead.
	 */
	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
		mask |= POLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLIN | POLLRDNORM | POLLRDHUP;

	/* Connected? */
	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
		int target = sock_rcvlowat(sk, 0, INT_MAX);

		if (tp->urg_seq == tp->copied_seq &&
		    !sock_flag(sk, SOCK_URGINLINE) &&
		    tp->urg_data)
			target--;

		if (tp->rcv_nxt - tp->copied_seq >= target)
			mask |= POLLIN | POLLRDNORM;

		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
				mask |= POLLOUT | POLLWRNORM;
			} else {  /* send SIGIO later */
				set_bit(SOCK_ASYNC_NOSPACE,
					&sk->sk_socket->flags);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker. If space is freed after
				 * wspace test but before the flags are set,
				 * IO signal will be lost.
				 */
				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
					mask |= POLLOUT | POLLWRNORM;
			}
		}

		if (tp->urg_data & TCP_URG_VALID)
			mask |= POLLPRI;
	}
	return mask;
}

int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int answ;

	switch (cmd) {
	case SIOCINQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		lock_sock(sk);
		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else if (sock_flag(sk, SOCK_URGINLINE) ||
			 !tp->urg_data ||
			 before(tp->urg_seq, tp->copied_seq) ||
			 !before(tp->urg_seq, tp->rcv_nxt)) {
			struct sk_buff *skb;

			answ = tp->rcv_nxt - tp->copied_seq;

			/* Subtract 1, if FIN is in queue. */
			skb = skb_peek_tail(&sk->sk_receive_queue);
			if (answ && skb)
				answ -= tcp_hdr(skb)->fin;
		} else
			answ = tp->urg_seq - tp->copied_seq;
		release_sock(sk);
		break;
	case SIOCATMARK:
		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
		break;
	case SIOCOUTQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = tp->write_seq - tp->snd_una;
		break;
	default:
		return -ENOIOCTLCMD;
	}

	return put_user(answ, (int __user *)arg);
}

static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
	tp->pushed_seq = tp->write_seq;
}

static inline int forced_push(struct tcp_sock *tp)
{
	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}

static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

	skb->csum = 0;
	tcb->seq = tcb->end_seq = tp->write_seq;
	tcb->flags = TCPCB_FLAG_ACK;
	tcb->sacked = 0;
	skb_header_release(skb);
	tcp_add_write_queue_tail(sk, skb);
	sk->sk_wmem_queued += skb->truesize;
	sk_mem_charge(sk, skb->truesize);
	if (tp->nonagle & TCP_NAGLE_PUSH)
		tp->nonagle &= ~TCP_NAGLE_PUSH;
}

static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
				struct sk_buff *skb)
{
	if (flags & MSG_OOB)
		tp->snd_up = tp->write_seq;
}

static inline void tcp_push(struct sock *sk, int flags, int mss_now,
			    int nonagle)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tcp_send_head(sk)) {
		struct sk_buff *skb = tcp_write_queue_tail(sk);
		if (!(flags & MSG_MORE) || forced_push(tp))
			tcp_mark_push(tp, skb);
		tcp_mark_urg(tp, flags, skb);
		__tcp_push_pending_frames(sk, mss_now,
					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
	}
}

static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
				unsigned int offset, size_t len)
{
	struct tcp_splice_state *tss = rd_desc->arg.data;
	int ret;

	ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
			      tss->flags);
	if (ret > 0)
		rd_desc->count -= ret;
	return ret;
}

static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
{
	/* Store TCP splice context information in read_descriptor_t. */
	read_descriptor_t rd_desc = {
		.arg.data = tss,
		.count	  = tss->len,
	};

	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
}

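/**
 *  tcp_splice_read - splice data from TCP socket to a pipe
 *  @sock:	socket to splice from
 *  @ppos:	position (not valid)
 *  @pipe:	pipe to splice to
 *  @len:	number of bytes to splice
 *  @flags:	splice modifier flags
 *
 * Description:
 *    Will read pages from given socket and fill them into a pipe.
 *
 *    Illustrative userspace sketch only (assumes a connected TCP socket
 *    "sock_fd" and an output file "file_fd"; error handling elided):
 *
 *	int pfd[2];
 *	pipe(pfd);
 *	// Move up to 64KiB from the socket into the pipe without a
 *	// userspace copy, then push it on to the file.
 *	ssize_t n = splice(sock_fd, NULL, pfd[1], NULL, 65536,
 *			   SPLICE_F_MOVE);
 *	if (n > 0)
 *		splice(pfd[0], NULL, file_fd, NULL, n, SPLICE_F_MOVE);
 */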
ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
			struct pipe_inode_info *pipe, size_t len,
			unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct tcp_splice_state tss = {
		.pipe = pipe,
		.len = len,
		.flags = flags,
	};
	long timeo;
	ssize_t spliced;
	int ret;

	/*
	 * We can't seek on a socket input.
	 */
	if (unlikely(*ppos))
		return -ESPIPE;

	ret = spliced = 0;

	lock_sock(sk);

	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
	while (tss.len) {
		ret = __tcp_splice_read(sk, &tss);
		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (sock_flag(sk, SOCK_DONE))
				break;
			if (sk->sk_err) {
				ret = sock_error(sk);
				break;
			}
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;
			if (sk->sk_state == TCP_CLOSE) {
				/*
				 * This occurs when user tries to read
				 * from never connected socket.
				 */
				if (!sock_flag(sk, SOCK_DONE))
					ret = -ENOTCONN;
				break;
			}
			if (!timeo) {
				ret = -EAGAIN;
				break;
			}
			sk_wait_data(sk, &timeo);
			if (signal_pending(current)) {
				ret = sock_intr_errno(timeo);
				break;
			}
			continue;
		}
		tss.len -= ret;
		spliced += ret;

		if (!timeo)
			break;
		release_sock(sk);
		lock_sock(sk);

		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current))
			break;
	}

	release_sock(sk);

	if (spliced)
		return spliced;

	return ret;
}

struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
{
	struct sk_buff *skb;

	/* The TCP header must be at least 32-bit aligned.  */
	size = ALIGN(size, 4);

	skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
	if (skb) {
		if (sk_wmem_schedule(sk, skb->truesize)) {
			/*
			 * Make sure that we have exactly size bytes
			 * available to the caller, no more, no less.
			 */
			skb_reserve(skb, skb_tailroom(skb) - size);
			return skb;
		}
		__kfree_skb(skb);
	} else {
		sk->sk_prot->enter_memory_pressure(sk);
		sk_stream_moderate_sndbuf(sk);
	}
	return NULL;
}

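/*
 * Compute how large a chunk of data we should try to collect per skb.
 * Without GSO this is simply the current MSS; with GSO we aim for a
 * multiple of the MSS close to the largest super-packet the NIC will
 * accept (sk_gso_max_size minus protocol headers), bounded to half the
 * peer's maximum advertised window.  The previous goal is reused when
 * it still rounds to the same number of segments, avoiding needless
 * recalculation when the MSS estimate jitters.
 */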
static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
				       int large_allowed)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 xmit_size_goal, old_size_goal;

	xmit_size_goal = mss_now;

	if (large_allowed && sk_can_gso(sk)) {
		xmit_size_goal = ((sk->sk_gso_max_size - 1) -
				  inet_csk(sk)->icsk_af_ops->net_header_len -
				  inet_csk(sk)->icsk_ext_hdr_len -
				  tp->tcp_header_len);

		xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);

		/* We try hard to avoid divides here */
		old_size_goal = tp->xmit_size_goal_segs * mss_now;

		if (likely(old_size_goal <= xmit_size_goal &&
			   old_size_goal + mss_now > xmit_size_goal)) {
			xmit_size_goal = old_size_goal;
		} else {
			tp->xmit_size_goal_segs = xmit_size_goal / mss_now;
			xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
		}
	}

	return max(xmit_size_goal, mss_now);
}

static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
{
	int mss_now;

	mss_now = tcp_current_mss(sk);
	*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));

	return mss_now;
}
static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
				size_t psize, int flags)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int mss_now, size_goal;
	int err;
	ssize_t copied;
	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	/* Wait for a connection to finish. */
	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
			goto out_err;

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	mss_now = tcp_send_mss(sk, &size_goal, flags);
	copied = 0;

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto out_err;

	while (psize > 0) {
		struct sk_buff *skb = tcp_write_queue_tail(sk);
		struct page *page = pages[poffset / PAGE_SIZE];
		int copy, i, can_coalesce;
		int offset = poffset % PAGE_SIZE;
		int size = min_t(size_t, psize, PAGE_SIZE - offset);

		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
new_segment:
			if (!sk_stream_memory_free(sk))
				goto wait_for_sndbuf;

			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
			if (!skb)
				goto wait_for_memory;

			skb_entail(sk, skb);
			copy = size_goal;
		}

		if (copy > size)
			copy = size;

		i = skb_shinfo(skb)->nr_frags;
		can_coalesce = skb_can_coalesce(skb, i, page, offset);
		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
			tcp_mark_push(tp, skb);
			goto new_segment;
		}
		if (!sk_wmem_schedule(sk, copy))
			goto wait_for_memory;

		if (can_coalesce) {
			skb_shinfo(skb)->frags[i - 1].size += copy;
		} else {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, copy);
		}

		skb->len += copy;
		skb->data_len += copy;
		skb->truesize += copy;
		sk->sk_wmem_queued += copy;
		sk_mem_charge(sk, copy);
		skb->ip_summed = CHECKSUM_PARTIAL;
		tp->write_seq += copy;
		TCP_SKB_CB(skb)->end_seq += copy;
		skb_shinfo(skb)->gso_segs = 0;

		if (!copied)
			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;

		copied += copy;
		poffset += copy;
		if (!(psize -= copy))
			goto out;

		if (skb->len < size_goal || (flags & MSG_OOB))
			continue;

		if (forced_push(tp)) {
			tcp_mark_push(tp, skb);
			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
		} else if (skb == tcp_send_head(sk))
			tcp_push_one(sk, mss_now);
		continue;

wait_for_sndbuf:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
		if (copied)
			tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);

		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
			goto do_error;

		mss_now = tcp_send_mss(sk, &size_goal, flags);
	}

out:
	if (copied)
		tcp_push(sk, flags, mss_now, tp->nonagle);
	return copied;

do_error:
	if (copied)
		goto out;
out_err:
	return sk_stream_error(sk, flags, err);
}

ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
		     size_t size, int flags)
{
	ssize_t res;
	struct sock *sk = sock->sk;

	if (!(sk->sk_route_caps & NETIF_F_SG) ||
	    !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
		return sock_no_sendpage(sock, page, offset, size, flags);

	lock_sock(sk);
	TCP_CHECK_TIMER(sk);
	res = do_tcp_sendpages(sk, &page, offset, size, flags);
	TCP_CHECK_TIMER(sk);
	release_sock(sk);
	return res;
}

#define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
#define TCP_OFF(sk)	(sk->sk_sndmsg_off)

static inline int select_size(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int tmp = tp->mss_cache;

	if (sk->sk_route_caps & NETIF_F_SG) {
		if (sk_can_gso(sk))
			tmp = 0;
		else {
			int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);

			if (tmp >= pgbreak &&
			    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
				tmp = pgbreak;
		}
	}

	return tmp;
}
int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
		size_t size)
{
	struct sock *sk = sock->sk;
	struct iovec *iov;
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int iovlen, flags;
	int mss_now, size_goal;
	int err, copied;
	long timeo;

	lock_sock(sk);
	TCP_CHECK_TIMER(sk);

	flags = msg->msg_flags;
	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	/* Wait for a connection to finish. */
	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
			goto out_err;

	/* This should be in poll */
	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	mss_now = tcp_send_mss(sk, &size_goal, flags);

	/* Ok commence sending. */
	iovlen = msg->msg_iovlen;
	iov = msg->msg_iov;
	copied = 0;

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto out_err;

	while (--iovlen >= 0) {
		int seglen = iov->iov_len;
		unsigned char __user *from = iov->iov_base;

		iov++;

		while (seglen > 0) {
			int copy = 0;
			int max = size_goal;

			skb = tcp_write_queue_tail(sk);
			if (tcp_send_head(sk)) {
				if (skb->ip_summed == CHECKSUM_NONE)
					max = mss_now;
				copy = max - skb->len;
			}

			if (copy <= 0) {
new_segment:
				/* Allocate new segment. If the interface is SG,
				 * allocate skb fitting to single page.
				 */
				if (!sk_stream_memory_free(sk))
					goto wait_for_sndbuf;

				skb = sk_stream_alloc_skb(sk, select_size(sk),
						sk->sk_allocation);
				if (!skb)
					goto wait_for_memory;

				/*
				 * Check whether we can use HW checksum.
				 */
				if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
					skb->ip_summed = CHECKSUM_PARTIAL;

				skb_entail(sk, skb);
				copy = size_goal;
				max = size_goal;
			}

			/* Try to append data to the end of skb. */
			if (copy > seglen)
				copy = seglen;

			/* Where to copy to? */
			if (skb_tailroom(skb) > 0) {
				/* We have some space in skb head. Superb! */
				if (copy > skb_tailroom(skb))
					copy = skb_tailroom(skb);
				if ((err = skb_add_data(skb, from, copy)) != 0)
					goto do_fault;
			} else {
				int merge = 0;
				int i = skb_shinfo(skb)->nr_frags;
				struct page *page = TCP_PAGE(sk);
				int off = TCP_OFF(sk);

				if (skb_can_coalesce(skb, i, page, off) &&
				    off != PAGE_SIZE) {
					/* We can extend the last page
					 * fragment. */
					merge = 1;
				} else if (i == MAX_SKB_FRAGS ||
					   (!i &&
					   !(sk->sk_route_caps & NETIF_F_SG))) {
					/* Need to add new fragment and cannot
					 * do this because interface is non-SG,
					 * or because all the page slots are
					 * busy. */
					tcp_mark_push(tp, skb);
					goto new_segment;
				} else if (page) {
					if (off == PAGE_SIZE) {
						put_page(page);
						TCP_PAGE(sk) = page = NULL;
						off = 0;
					}
				} else
					off = 0;

				if (copy > PAGE_SIZE - off)
					copy = PAGE_SIZE - off;

				if (!sk_wmem_schedule(sk, copy))
					goto wait_for_memory;

				if (!page) {
					/* Allocate new cache page. */
					if (!(page = sk_stream_alloc_page(sk)))
						goto wait_for_memory;
				}

				/* Time to copy data. We are close to
				 * the end! */
				err = skb_copy_to_page(sk, from, skb, page,
						       off, copy);
				if (err) {
					/* If this page was new, give it to the
					 * socket so it does not get leaked.
					 */
					if (!TCP_PAGE(sk)) {
						TCP_PAGE(sk) = page;
						TCP_OFF(sk) = 0;
					}
					goto do_error;
				}

				/* Update the skb. */
				if (merge) {
					skb_shinfo(skb)->frags[i - 1].size +=
									copy;
				} else {
					skb_fill_page_desc(skb, i, page, off, copy);
					if (TCP_PAGE(sk)) {
						get_page(page);
					} else if (off + copy < PAGE_SIZE) {
						get_page(page);
						TCP_PAGE(sk) = page;
					}
				}

				TCP_OFF(sk) = off + copy;
			}

			if (!copied)
				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;

			tp->write_seq += copy;
			TCP_SKB_CB(skb)->end_seq += copy;
			skb_shinfo(skb)->gso_segs = 0;

			from += copy;
			copied += copy;
			if ((seglen -= copy) == 0 && iovlen == 0)
				goto out;

			if (skb->len < max || (flags & MSG_OOB))
				continue;

			if (forced_push(tp)) {
				tcp_mark_push(tp, skb);
				__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
			} else if (skb == tcp_send_head(sk))
				tcp_push_one(sk, mss_now);
			continue;

wait_for_sndbuf:
			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
			if (copied)
				tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);

			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				goto do_error;

			mss_now = tcp_send_mss(sk, &size_goal, flags);
		}
	}

out:
	if (copied)
		tcp_push(sk, flags, mss_now, tp->nonagle);
	TCP_CHECK_TIMER(sk);
	release_sock(sk);
	return copied;

do_fault:
	if (!skb->len) {
		tcp_unlink_write_queue(skb, sk);
		/* It is the one place in all of TCP, except connection
		 * reset, where we can be unlinking the send_head.
		 */
		tcp_check_send_head(sk, skb);
		sk_wmem_free_skb(sk, skb);
	}

do_error:
	if (copied)
		goto out;
out_err:
	err = sk_stream_error(sk, flags, err);
	TCP_CHECK_TIMER(sk);
	release_sock(sk);
	return err;
}

/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, even if the implementation is currently somewhat lacking.
 */
static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* No URG data to read. */
	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
	    tp->urg_data == TCP_URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
		return -ENOTCONN;

	if (tp->urg_data & TCP_URG_VALID) {
		int err = 0;
		char c = tp->urg_data;

		if (!(flags & MSG_PEEK))
			tp->urg_data = TCP_URG_READ;

		/* Read urgent data. */
		msg->msg_flags |= MSG_OOB;

		if (len > 0) {
			if (!(flags & MSG_TRUNC))
				err = memcpy_toiovec(msg->msg_iov, &c, 1);
			len = 1;
		} else
			msg->msg_flags |= MSG_TRUNC;

		return err ? -EFAULT : len;
	}

	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
		return 0;

	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 */
	return -EAGAIN;
}

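/* Clean up the receive buffer for full frames taken by the user,
 * then send an ACK if necessary.  COPIED is the number of bytes
 * tcp_recvmsg has given to the user so far, it speeds up the
 * calculation of whether or not we must ACK for the sake of
 * a window update.
 */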
void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int time_to_ack = 0;

#if TCP_DEBUG
	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
	     KERN_INFO "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
#endif

	if (inet_csk_ack_scheduled(sk)) {
		const struct inet_connection_sock *icsk = inet_csk(sk);
		   /* Delayed ACKs frequently hit locked sockets during bulk
		    * receive. */
		if (icsk->icsk_ack.blocked ||
		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
		    /*
		     * If this read emptied read buffer, we send ACK, if
		     * connection is not bidirectional, user drained
		     * receive buffer and there was a small segment
		     * in queue.
		     */
		    (copied > 0 &&
		     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
		      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
		       !icsk->icsk_ack.pingpong)) &&
		      !atomic_read(&sk->sk_rmem_alloc)))
			time_to_ack = 1;
	}

	/* We send an ACK if we can now advertise a non-zero window
	 * which has been raised "significantly".
	 *
	 * Even if window raised up to infinity, do not send window open ACK
	 * in states, where we will not receive more. It is useless.
	 */
	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
		__u32 rcv_window_now = tcp_receive_window(tp);

		/* Optimize, __tcp_select_window() is not cheap. */
		if (2*rcv_window_now <= tp->window_clamp) {
			__u32 new_window = __tcp_select_window(sk);

			/* Send ACK now, if this read freed lots of space
			 * in our buffer. Certainly, new_window is new window.
			 * We can advertise it now, if it is not less than
			 * current one. "Lots" means "at least twice" here.
			 */
			if (new_window && new_window >= 2 * rcv_window_now)
				time_to_ack = 1;
		}
	}
	if (time_to_ack)
		tcp_send_ack(sk);
}

static void tcp_prequeue_process(struct sock *sk)
{
	struct sk_buff *skb;
	struct tcp_sock *tp = tcp_sk(sk);

	NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED);

	/* RX process wants to run with disabled BHs, though it is not
	 * necessary */
	local_bh_disable();
	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
		sk_backlog_rcv(sk, skb);
	local_bh_enable();

	/* Clear memory counter. */
	tp->ucopy.memory = 0;
}

static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
	struct sk_buff *skb;
	u32 offset;

	skb_queue_walk(&sk->sk_receive_queue, skb) {
		offset = seq - TCP_SKB_CB(skb)->seq;
		if (tcp_hdr(skb)->syn)
			offset--;
		if (offset < skb->len || tcp_hdr(skb)->fin) {
			*off = offset;
			return skb;
		}
	}
	return NULL;
}

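/*
 *	This routine provides an alternative to tcp_recvmsg() for routines
 *	that would like to handle copying from skbuffs directly in 'sendfile'
 *	fashion.
 *	Note:
 *	 - It is assumed that the socket was locked by the caller.
 *	 - The routine does not block.
 *	 - At present, there is no support for reading OOB data
 *	   or for urgent data in MSG_OOB mode.
 */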
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
		  sk_read_actor_t recv_actor)
{
	struct sk_buff *skb;
	struct tcp_sock *tp = tcp_sk(sk);
	u32 seq = tp->copied_seq;
	u32 offset;
	int copied = 0;

	if (sk->sk_state == TCP_LISTEN)
		return -ENOTCONN;
	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
		if (offset < skb->len) {
			int used;
			size_t len;

			len = skb->len - offset;
			/* Stop reading if we hit a patch of urgent data. */
			if (tp->urg_data) {
				u32 urg_offset = tp->urg_seq - seq;
				if (urg_offset < len)
					len = urg_offset;
				if (!len)
					break;
			}
			used = recv_actor(desc, skb, offset, len);
			if (used < 0) {
				if (!copied)
					copied = used;
				break;
			} else if (used <= len) {
				seq += used;
				copied += used;
				offset += used;
			}
			/*
			 * If recv_actor drops the lock (e.g. TCP splice
			 * receive) the skb pointer might be invalid when
			 * getting here: tcp_collapse might have deleted it
			 * while aggregating skbs from the socket queue.
			 */
			skb = tcp_recv_skb(sk, seq-1, &offset);
			if (!skb || (offset+1 != skb->len))
				break;
		}
		if (tcp_hdr(skb)->fin) {
			sk_eat_skb(sk, skb, 0);
			++seq;
			break;
		}
		sk_eat_skb(sk, skb, 0);
		if (!desc->count)
			break;
	}
	tp->copied_seq = seq;

	tcp_rcv_space_adjust(sk);

	/* Clean up data we have read: This will do ACK frames. */
	if (copied > 0)
		tcp_cleanup_rbuf(sk, copied);
	return copied;
}

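/*
 *	This routine copies from a sock struct into the user buffer.
 *	It runs with the socket locked and handles urgent data, MSG_PEEK,
 *	the prequeue fast path and (when CONFIG_NET_DMA is set) early
 *	asynchronous copies into pinned user pages.
 */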
int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		size_t len, int nonblock, int flags, int *addr_len)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int copied = 0;
	u32 peek_seq;
	u32 *seq;
	unsigned long used;
	int err;
	int target;		/* Read at least this many bytes */
	long timeo;
	struct task_struct *user_recv = NULL;
	int copied_early = 0;
	struct sk_buff *skb;
	u32 urg_hole = 0;

	lock_sock(sk);

	TCP_CHECK_TIMER(sk);

	err = -ENOTCONN;
	if (sk->sk_state == TCP_LISTEN)
		goto out;

	timeo = sock_rcvtimeo(sk, nonblock);

	/* Urgent data needs to be handled specially. */
	if (flags & MSG_OOB)
		goto recv_urg;

	seq = &tp->copied_seq;
	if (flags & MSG_PEEK) {
		peek_seq = tp->copied_seq;
		seq = &peek_seq;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);

#ifdef CONFIG_NET_DMA
	tp->ucopy.dma_chan = NULL;
	preempt_disable();
	skb = skb_peek_tail(&sk->sk_receive_queue);
	{
		int available = 0;

		if (skb)
			available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
		if ((available < target) &&
		    (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
		    !sysctl_tcp_low_latency &&
		    dma_find_channel(DMA_MEMCPY)) {
			preempt_enable_no_resched();
			tp->ucopy.pinned_list =
					dma_pin_iovec_pages(msg->msg_iov, len);
		} else {
			preempt_enable_no_resched();
		}
	}
#endif

	do {
		u32 offset;

		/* Are we at urgent data? Stop if we have read anything or
		 * have SIGURG pending. */
		if (tp->urg_data && tp->urg_seq == *seq) {
			if (copied)
				break;
			if (signal_pending(current)) {
				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
				break;
			}
		}

		/* Next get a buffer. */

		skb_queue_walk(&sk->sk_receive_queue, skb) {
			/* Now that we have two receive queues this
			 * shouldn't happen.
			 */
			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
				 KERN_INFO "recvmsg bug: copied %X "
					   "seq %X rcvnxt %X fl %X\n", *seq,
					   TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
					   flags))
				break;

			offset = *seq - TCP_SKB_CB(skb)->seq;
			if (tcp_hdr(skb)->syn)
				offset--;
			if (offset < skb->len)
				goto found_ok_skb;
			if (tcp_hdr(skb)->fin)
				goto found_fin_ok;
			WARN(!(flags & MSG_PEEK), KERN_INFO "recvmsg bug 2: "
					"copied %X seq %X rcvnxt %X fl %X\n",
					*seq, TCP_SKB_CB(skb)->seq,
					tp->rcv_nxt, flags);
		}

		/* Well, if we have backlog, try to process it now yet. */

		if (copied >= target && !sk->sk_backlog.tail)
			break;

		if (copied) {
			if (sk->sk_err ||
			    sk->sk_state == TCP_CLOSE ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    !timeo ||
			    signal_pending(current))
				break;
		} else {
			if (sock_flag(sk, SOCK_DONE))
				break;

			if (sk->sk_err) {
				copied = sock_error(sk);
				break;
			}

			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;

			if (sk->sk_state == TCP_CLOSE) {
				if (!sock_flag(sk, SOCK_DONE)) {
					/* This occurs when user tries to read
					 * from never connected socket.
					 */
					copied = -ENOTCONN;
					break;
				}
				break;
			}

			if (!timeo) {
				copied = -EAGAIN;
				break;
			}

			if (signal_pending(current)) {
				copied = sock_intr_errno(timeo);
				break;
			}
		}

		tcp_cleanup_rbuf(sk, copied);

		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
			/* Install new reader */
			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
				user_recv = current;
				tp->ucopy.task = user_recv;
				tp->ucopy.iov = msg->msg_iov;
			}

			tp->ucopy.len = len;

			WARN_ON(tp->copied_seq != tp->rcv_nxt &&
				!(flags & (MSG_PEEK | MSG_TRUNC)));

			/* If the prequeue is not empty, it has to be
			 * processed before we release the socket, otherwise
			 * data could be delivered out of order on the next
			 * iteration: packets flow through (1) packets in
			 * flight, (2) backlog, (3) prequeue and (4) the
			 * receive queue, and each queue may only be
			 * processed when the later ones are empty.
			 */
			if (!skb_queue_empty(&tp->ucopy.prequeue))
				goto do_prequeue;
		}

		if (copied >= target) {
			/* Do not sleep, just process backlog. */
			release_sock(sk);
			lock_sock(sk);
		} else
			sk_wait_data(sk, &timeo);

#ifdef CONFIG_NET_DMA
		tp->ucopy.wakeup = 0;
#endif

		if (user_recv) {
			int chunk;

			if ((chunk = len - tp->ucopy.len) != 0) {
				NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
				len -= chunk;
				copied += chunk;
			}

			if (tp->rcv_nxt == tp->copied_seq &&
			    !skb_queue_empty(&tp->ucopy.prequeue)) {
do_prequeue:
				tcp_prequeue_process(sk);

				if ((chunk = len - tp->ucopy.len) != 0) {
					NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
					len -= chunk;
					copied += chunk;
				}
			}
		}
		if ((flags & MSG_PEEK) &&
		    (peek_seq - copied - urg_hole != tp->copied_seq)) {
			if (net_ratelimit())
				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
				       current->comm, task_pid_nr(current));
			peek_seq = tp->copied_seq;
		}
		continue;

	found_ok_skb:
		/* Ok so how much can we use? */
		used = skb->len - offset;
		if (len < used)
			used = len;

		/* Do we have urgent data here? */
		if (tp->urg_data) {
			u32 urg_offset = tp->urg_seq - *seq;
			if (urg_offset < used) {
				if (!urg_offset) {
					if (!sock_flag(sk, SOCK_URGINLINE)) {
						++*seq;
						urg_hole++;
						offset++;
						used--;
						if (!used)
							goto skip_copy;
					}
				} else
					used = urg_offset;
			}
		}

		if (!(flags & MSG_TRUNC)) {
#ifdef CONFIG_NET_DMA
			if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
				tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);

			if (tp->ucopy.dma_chan) {
				tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
					tp->ucopy.dma_chan, skb, offset,
					msg->msg_iov, used,
					tp->ucopy.pinned_list);

				if (tp->ucopy.dma_cookie < 0) {

					printk(KERN_ALERT "dma_cookie < 0\n");

					/* Exception. Bailout! */
					if (!copied)
						copied = -EFAULT;
					break;
				}
				if ((offset + used) == skb->len)
					copied_early = 1;

			} else
#endif
			{
				err = skb_copy_datagram_iovec(skb, offset,
						msg->msg_iov, used);
				if (err) {
					/* Exception. Bailout! */
					if (!copied)
						copied = -EFAULT;
					break;
				}
			}
		}

		*seq += used;
		copied += used;
		len -= used;

		tcp_rcv_space_adjust(sk);

skip_copy:
		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
			tp->urg_data = 0;
			tcp_fast_path_check(sk);
		}
		if (used + offset < skb->len)
			continue;

		if (tcp_hdr(skb)->fin)
			goto found_fin_ok;
		if (!(flags & MSG_PEEK)) {
			sk_eat_skb(sk, skb, copied_early);
			copied_early = 0;
		}
		continue;

	found_fin_ok:
		/* Process the FIN. */
		++*seq;
		if (!(flags & MSG_PEEK)) {
			sk_eat_skb(sk, skb, copied_early);
			copied_early = 0;
		}
		break;
	} while (len > 0);

	if (user_recv) {
		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
			int chunk;

			tp->ucopy.len = copied > 0 ? len : 0;

			tcp_prequeue_process(sk);

			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
				NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
				len -= chunk;
				copied += chunk;
			}
		}

		tp->ucopy.task = NULL;
		tp->ucopy.len = 0;
	}

#ifdef CONFIG_NET_DMA
	if (tp->ucopy.dma_chan) {
		dma_cookie_t done, used;

		dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);

		while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
						 tp->ucopy.dma_cookie, &done,
						 &used) == DMA_IN_PROGRESS) {
			/* do partial cleanup of sk_async_wait_queue */
			while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
			       (dma_async_is_complete(skb->dma_cookie, done,
						      used) == DMA_SUCCESS)) {
				__skb_dequeue(&sk->sk_async_wait_queue);
				kfree_skb(skb);
			}
		}

		/* Safe to free early-copied skbs now */
		__skb_queue_purge(&sk->sk_async_wait_queue);
		tp->ucopy.dma_chan = NULL;
	}
	if (tp->ucopy.pinned_list) {
		dma_unpin_iovec_pages(tp->ucopy.pinned_list);
		tp->ucopy.pinned_list = NULL;
	}
#endif

	/* According to UNIX98, msg_name/msg_namelen are ignored
	 * on connected sockets.
	 */

	/* Clean up data we have read: This will do ACK frames. */
	tcp_cleanup_rbuf(sk, copied);

	TCP_CHECK_TIMER(sk);
	release_sock(sk);
	return copied;

out:
	TCP_CHECK_TIMER(sk);
	release_sock(sk);
	return err;

recv_urg:
	err = tcp_recv_urg(sk, msg, len, flags);
	goto out;
}

void tcp_set_state(struct sock *sk, int state)
{
	int oldstate = sk->sk_state;

	switch (state) {
	case TCP_ESTABLISHED:
		if (oldstate != TCP_ESTABLISHED)
			TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
		break;

	case TCP_CLOSE:
		if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
			TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);

		sk->sk_prot->unhash(sk);
		if (inet_csk(sk)->icsk_bind_hash &&
		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
			inet_put_port(sk);
		/* fall through */
	default:
		if (oldstate == TCP_ESTABLISHED)
			TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
	}

	/* Change state AFTER socket is unhashed to avoid closed
	 * socket sitting in hash tables.
	 */
	sk->sk_state = state;

#ifdef STATE_TRACE
	SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
#endif
}
EXPORT_SYMBOL_GPL(tcp_set_state);

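/*
 *	State transition table on close(), indexed by the current TCP
 *	state.  The low bits (TCP_STATE_MASK) give the next state and
 *	TCP_ACTION_FIN is OR'd in where the transition requires sending
 *	a FIN; tcp_close_state() below performs the lookup.
 */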
static const unsigned char new_state[16] = {
  /* current state:	   new state:		action:		*/
  /* (Invalid)		*/ TCP_CLOSE,
  /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  /* TCP_SYN_SENT	*/ TCP_CLOSE,
  /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
  /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
  /* TCP_TIME_WAIT	*/ TCP_CLOSE,
  /* TCP_CLOSE		*/ TCP_CLOSE,
  /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK | TCP_ACTION_FIN,
  /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
  /* TCP_LISTEN		*/ TCP_CLOSE,
  /* TCP_CLOSING	*/ TCP_CLOSING,
};

static int tcp_close_state(struct sock *sk)
{
	int next = (int)new_state[sk->sk_state];
	int ns = next & TCP_STATE_MASK;

	tcp_set_state(sk, ns);

	return next & TCP_ACTION_FIN;
}

/*
 *	Shutdown the sending side of a connection. Much like close except
 *	that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
 */
void tcp_shutdown(struct sock *sk, int how)
{
	/*	We need to grab some memory, and put together a FIN,
	 *	and then put it into the queue to be sent.
	 */
	if (!(how & SEND_SHUTDOWN))
		return;

	/* If we've already sent a FIN, or it's a closed state, skip this. */
	if ((1 << sk->sk_state) &
	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
		/* Clear out any half completed packets.  FIN if needed. */
		if (tcp_close_state(sk))
			tcp_send_fin(sk);
	}
}
void tcp_close(struct sock *sk, long timeout)
{
	struct sk_buff *skb;
	int data_was_unread = 0;
	int state;

	lock_sock(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;

	if (sk->sk_state == TCP_LISTEN) {
		tcp_set_state(sk, TCP_CLOSE);

		/* Special case. */
		inet_csk_listen_stop(sk);

		goto adjudge_to_death;
	}

	/*  We need to flush the recv. buffs.  We do this only on the
	 *  descriptor close, not protocol-sourced closes, because the
	 *  reader process may not have drained the data yet!
	 */
	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
			  tcp_hdr(skb)->fin;
		data_was_unread += len;
		__kfree_skb(skb);
	}

	sk_mem_reclaim(sk);

	/* As outlined in RFC 2525, section 2.17, we send a RST here
	 * because data was lost.
	 */
	if (data_was_unread) {
		/* Unread data was tossed, zap the connection. */
		NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
		tcp_set_state(sk, TCP_CLOSE);
		tcp_send_active_reset(sk, sk->sk_allocation);
	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
		/* Check zero linger _after_ checking for unread data. */
		sk->sk_prot->disconnect(sk, 0);
		NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
	} else if (tcp_close_state(sk)) {
		/* We FIN if the application ate all the data before
		 * zapping the connection.
		 */
		tcp_send_fin(sk);
	}

	sk_stream_wait_close(sk, timeout);

adjudge_to_death:
	state = sk->sk_state;
	sock_hold(sk);
	sock_orphan(sk);

	/* It is the last release_sock in its life. It will remove backlog. */
	release_sock(sk);

	/* Now socket is owned by kernel and we acquire BH lock
	   to finish close. No need to check for user refs.
	 */
	local_bh_disable();
	bh_lock_sock(sk);
	WARN_ON(sock_owned_by_user(sk));

	percpu_counter_inc(sk->sk_prot->orphan_count);

	/* Have we already been destroyed by a softirq or backlog? */
	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
		goto out;

	/*	This is a (useful) BSD violating of the RFC. There is a
	 *	problem with TCP as specified in that the other end could keep
	 *	a socket open forever with no application left this end.
	 *	We use a 3 minute timeout (about the same as BSD) then kill
	 *	our end.
	 */
	if (sk->sk_state == TCP_FIN_WAIT2) {
		struct tcp_sock *tp = tcp_sk(sk);
		if (tp->linger2 < 0) {
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC);
			NET_INC_STATS_BH(sock_net(sk),
					LINUX_MIB_TCPABORTONLINGER);
		} else {
			const int tmo = tcp_fin_time(sk);

			if (tmo > TCP_TIMEWAIT_LEN) {
				inet_csk_reset_keepalive_timer(sk,
						tmo - TCP_TIMEWAIT_LEN);
			} else {
				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				goto out;
			}
		}
	}
	if (sk->sk_state != TCP_CLOSE) {
		int orphan_count = percpu_counter_read_positive(
						sk->sk_prot->orphan_count);

		sk_mem_reclaim(sk);
		if (tcp_too_many_orphans(sk, orphan_count)) {
			if (net_ratelimit())
				printk(KERN_INFO "TCP: too many orphaned "
				       "sockets\n");
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC);
			NET_INC_STATS_BH(sock_net(sk),
					LINUX_MIB_TCPABORTONMEMORY);
		}
	}

	if (sk->sk_state == TCP_CLOSE)
		inet_csk_destroy_sock(sk);
	/* Otherwise, socket is reprieved until protocol close. */

out:
	bh_unlock_sock(sk);
	local_bh_enable();
	sock_put(sk);
}

/* These states need RST on ABORT according to RFC793 */

static inline int tcp_need_reset(int state)
{
	return (1 << state) &
	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
}

int tcp_disconnect(struct sock *sk, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;
	int old_state = sk->sk_state;

	if (old_state != TCP_CLOSE)
		tcp_set_state(sk, TCP_CLOSE);

	/* ABORT function of RFC793 */
	if (old_state == TCP_LISTEN) {
		inet_csk_listen_stop(sk);
	} else if (tcp_need_reset(old_state) ||
		   (tp->snd_nxt != tp->write_seq &&
		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
		/* The last check adjusts for discrepancy of Linux wrt. RFC
		 * states
		 */
		tcp_send_active_reset(sk, gfp_any());
		sk->sk_err = ECONNRESET;
	} else if (old_state == TCP_SYN_SENT)
		sk->sk_err = ECONNRESET;

	tcp_clear_xmit_timers(sk);
	__skb_queue_purge(&sk->sk_receive_queue);
	tcp_write_queue_purge(sk);
	__skb_queue_purge(&tp->out_of_order_queue);
#ifdef CONFIG_NET_DMA
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	inet->dport = 0;

	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
		inet_reset_saddr(sk);

	sk->sk_shutdown = 0;
	sock_reset_flag(sk, SOCK_DONE);
	tp->srtt = 0;
	if ((tp->write_seq += tp->max_window + 2) == 0)
		tp->write_seq = 1;
	icsk->icsk_backoff = 0;
	tp->snd_cwnd = 2;
	icsk->icsk_probes_out = 0;
	tp->packets_out = 0;
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_cnt = 0;
	tp->bytes_acked = 0;
	tcp_set_ca_state(sk, TCP_CA_Open);
	tcp_clear_retrans(tp);
	inet_csk_delack_init(sk);
	tcp_init_send_head(sk);
	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
	__sk_dst_reset(sk);

	WARN_ON(inet->num && !icsk->icsk_bind_hash);

	sk->sk_error_report(sk);
	return err;
}

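/*
 *	Socket option code for TCP.
 *
 *	Illustrative userspace sketch only (assumes a connected TCP socket
 *	"fd"; error handling elided) exercising two of the options handled
 *	below:
 *
 *		int one = 1;
 *		setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
 *		setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "reno",
 *			   strlen("reno"));
 */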
static int do_tcp_setsockopt(struct sock *sk, int level,
		int optname, char __user *optval, unsigned int optlen)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int val;
	int err = 0;

	/* This is a string value all the others are int's */
	if (optname == TCP_CONGESTION) {
		char name[TCP_CA_NAME_MAX];

		if (optlen < 1)
			return -EINVAL;

		val = strncpy_from_user(name, optval,
					min_t(long, TCP_CA_NAME_MAX-1, optlen));
		if (val < 0)
			return -EFAULT;
		name[val] = 0;

		lock_sock(sk);
		err = tcp_set_congestion_control(sk, name);
		release_sock(sk);
		return err;
	}

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	lock_sock(sk);

	switch (optname) {
	case TCP_MAXSEG:
		/* Values greater than interface MTU won't take effect. However
		 * at the point when this call is done we typically don't yet
		 * know which interface is going to be used.
		 */
		if (val < 8 || val > MAX_TCP_WINDOW) {
			err = -EINVAL;
			break;
		}
		tp->rx_opt.user_mss = val;
		break;

	case TCP_NODELAY:
		if (val) {
			/* TCP_NODELAY is weaker than TCP_CORK, so that
			 * this option on corked socket is remembered, but
			 * it is not activated until cork is cleared.
			 *
			 * However, when TCP_NODELAY is set we make
			 * an explicit push, which overrides even TCP_CORK
			 * for currently queued segments.
			 */
			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
			tcp_push_pending_frames(sk);
		} else {
			tp->nonagle &= ~TCP_NAGLE_OFF;
		}
		break;

	case TCP_CORK:
		/* When set indicates to always queue non-full frames.
		 * Later the user clears this option and we transmit
		 * any pending partial frames in the queue.  This is
		 * meant to be used alongside sendfile() to get properly
		 * filled frames when the user (for example) must write
		 * out headers with a write() call first and then use
		 * sendfile to send out the data parts.
		 *
		 * TCP_CORK can be set together with TCP_NODELAY and it is
		 * stronger than TCP_NODELAY.
		 */
		if (val) {
			tp->nonagle |= TCP_NAGLE_CORK;
		} else {
			tp->nonagle &= ~TCP_NAGLE_CORK;
			if (tp->nonagle&TCP_NAGLE_OFF)
				tp->nonagle |= TCP_NAGLE_PUSH;
			tcp_push_pending_frames(sk);
		}
		break;

	case TCP_KEEPIDLE:
		if (val < 1 || val > MAX_TCP_KEEPIDLE)
			err = -EINVAL;
		else {
			tp->keepalive_time = val * HZ;
			if (sock_flag(sk, SOCK_KEEPOPEN) &&
			    !((1 << sk->sk_state) &
			      (TCPF_CLOSE | TCPF_LISTEN))) {
				__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
				if (tp->keepalive_time > elapsed)
					elapsed = tp->keepalive_time - elapsed;
				else
					elapsed = 0;
				inet_csk_reset_keepalive_timer(sk, elapsed);
			}
		}
		break;
	case TCP_KEEPINTVL:
		if (val < 1 || val > MAX_TCP_KEEPINTVL)
			err = -EINVAL;
		else
			tp->keepalive_intvl = val * HZ;
		break;
	case TCP_KEEPCNT:
		if (val < 1 || val > MAX_TCP_KEEPCNT)
			err = -EINVAL;
		else
			tp->keepalive_probes = val;
		break;
	case TCP_SYNCNT:
		if (val < 1 || val > MAX_TCP_SYNCNT)
			err = -EINVAL;
		else
			icsk->icsk_syn_retries = val;
		break;

	case TCP_LINGER2:
		if (val < 0)
			tp->linger2 = -1;
		else if (val > sysctl_tcp_fin_timeout / HZ)
			tp->linger2 = 0;
		else
			tp->linger2 = val * HZ;
		break;

	case TCP_DEFER_ACCEPT:
		/* Translate value in seconds to number of retransmits */
		icsk->icsk_accept_queue.rskq_defer_accept =
			secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
					TCP_RTO_MAX / HZ);
		break;

	case TCP_WINDOW_CLAMP:
		if (!val) {
			if (sk->sk_state != TCP_CLOSE) {
				err = -EINVAL;
				break;
			}
			tp->window_clamp = 0;
		} else
			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
						SOCK_MIN_RCVBUF / 2 : val;
		break;

	case TCP_QUICKACK:
		if (!val) {
			icsk->icsk_ack.pingpong = 1;
		} else {
			icsk->icsk_ack.pingpong = 0;
			if ((1 << sk->sk_state) &
			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
			    inet_csk_ack_scheduled(sk)) {
				icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
				tcp_cleanup_rbuf(sk, 1);
				if (!(val & 1))
					icsk->icsk_ack.pingpong = 1;
			}
		}
		break;

#ifdef CONFIG_TCP_MD5SIG
	case TCP_MD5SIG:
		/* Read the IP->Key mappings from userspace */
		err = tp->af_specific->md5_parse(sk, optval, optlen);
		break;
#endif

	default:
		err = -ENOPROTOOPT;
		break;
	}

	release_sock(sk);
	return err;
}

int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
		   unsigned int optlen)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (level != SOL_TCP)
		return icsk->icsk_af_ops->setsockopt(sk, level, optname,
						     optval, optlen);
	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
}

#ifdef CONFIG_COMPAT
int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	if (level != SOL_TCP)
		return inet_csk_compat_setsockopt(sk, level, optname,
						  optval, optlen);
	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL(compat_tcp_setsockopt);
#endif

/* Return information about state of tcp endpoint in API format. */
void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
	struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	u32 now = tcp_time_stamp;

	memset(info, 0, sizeof(*info));

	info->tcpi_state = sk->sk_state;
	info->tcpi_ca_state = icsk->icsk_ca_state;
	info->tcpi_retransmits = icsk->icsk_retransmits;
	info->tcpi_probes = icsk->icsk_probes_out;
	info->tcpi_backoff = icsk->icsk_backoff;

	if (tp->rx_opt.tstamp_ok)
		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
	if (tcp_is_sack(tp))
		info->tcpi_options |= TCPI_OPT_SACK;
	if (tp->rx_opt.wscale_ok) {
		info->tcpi_options |= TCPI_OPT_WSCALE;
		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
	}

	if (tp->ecn_flags&TCP_ECN_OK)
		info->tcpi_options |= TCPI_OPT_ECN;

	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
	info->tcpi_snd_mss = tp->mss_cache;
	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;

	if (sk->sk_state == TCP_LISTEN) {
		info->tcpi_unacked = sk->sk_ack_backlog;
		info->tcpi_sacked = sk->sk_max_ack_backlog;
	} else {
		info->tcpi_unacked = tp->packets_out;
		info->tcpi_sacked = tp->sacked_out;
	}
	info->tcpi_lost = tp->lost_out;
	info->tcpi_retrans = tp->retrans_out;
	info->tcpi_fackets = tp->fackets_out;

	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);

	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
	info->tcpi_snd_cwnd = tp->snd_cwnd;
	info->tcpi_advmss = tp->advmss;
	info->tcpi_reordering = tp->reordering;

	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
	info->tcpi_rcv_space = tp->rcvq_space.space;

	info->tcpi_total_retrans = tp->total_retrans;
}

EXPORT_SYMBOL_GPL(tcp_get_info);

static int do_tcp_getsockopt(struct sock *sk, int level,
		int optname, char __user *optval, int __user *optlen)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int val, len;

	if (get_user(len, optlen))
		return -EFAULT;

	len = min_t(unsigned int, len, sizeof(int));

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case TCP_MAXSEG:
		val = tp->mss_cache;
		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
			val = tp->rx_opt.user_mss;
		break;
	case TCP_NODELAY:
		val = !!(tp->nonagle&TCP_NAGLE_OFF);
		break;
	case TCP_CORK:
		val = !!(tp->nonagle&TCP_NAGLE_CORK);
		break;
	case TCP_KEEPIDLE:
		val = keepalive_time_when(tp) / HZ;
		break;
	case TCP_KEEPINTVL:
		val = keepalive_intvl_when(tp) / HZ;
		break;
	case TCP_KEEPCNT:
		val = keepalive_probes(tp);
		break;
	case TCP_SYNCNT:
		val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
		break;
	case TCP_LINGER2:
		val = tp->linger2;
		if (val >= 0)
			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
		break;
	case TCP_DEFER_ACCEPT:
		val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
				      TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
		break;
	case TCP_WINDOW_CLAMP:
		val = tp->window_clamp;
		break;
	case TCP_INFO: {
		struct tcp_info info;

		if (get_user(len, optlen))
			return -EFAULT;

		tcp_get_info(sk, &info);

		len = min_t(unsigned int, len, sizeof(info));
		if (put_user(len, optlen))
			return -EFAULT;
		if (copy_to_user(optval, &info, len))
			return -EFAULT;
		return 0;
	}
	case TCP_QUICKACK:
		val = !icsk->icsk_ack.pingpong;
		break;

	case TCP_CONGESTION:
		if (get_user(len, optlen))
			return -EFAULT;
		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
		if (put_user(len, optlen))
			return -EFAULT;
		if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
			return -EFAULT;
		return 0;
	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, &val, len))
		return -EFAULT;
	return 0;
}

int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
		   int __user *optlen)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (level != SOL_TCP)
		return icsk->icsk_af_ops->getsockopt(sk, level, optname,
						     optval, optlen);
	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
}

#ifdef CONFIG_COMPAT
int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	if (level != SOL_TCP)
		return inet_csk_compat_getsockopt(sk, level, optname,
						  optval, optlen);
	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL(compat_tcp_getsockopt);
#endif

struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
{
	struct sk_buff *segs = ERR_PTR(-EINVAL);
	struct tcphdr *th;
	unsigned thlen;
	unsigned int seq;
	__be32 delta;
	unsigned int oldlen;
	unsigned int mss;

	if (!pskb_may_pull(skb, sizeof(*th)))
		goto out;

	th = tcp_hdr(skb);
	thlen = th->doff * 4;
	if (thlen < sizeof(*th))
		goto out;

	if (!pskb_may_pull(skb, thlen))
		goto out;

	oldlen = (u16)~skb->len;
	__skb_pull(skb, thlen);

	mss = skb_shinfo(skb)->gso_size;
	if (unlikely(skb->len <= mss))
		goto out;

	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
		/* Packet is from an untrusted source, reset gso_segs. */
		int type = skb_shinfo(skb)->gso_type;

		if (unlikely(type &
			     ~(SKB_GSO_TCPV4 |
			       SKB_GSO_DODGY |
			       SKB_GSO_TCP_ECN |
			       SKB_GSO_TCPV6 |
			       0) ||
			     !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
			goto out;

		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);

		segs = NULL;
		goto out;
	}

	segs = skb_segment(skb, features);
	if (IS_ERR(segs))
		goto out;

	delta = htonl(oldlen + (thlen + mss));

	skb = segs;
	th = tcp_hdr(skb);
	seq = ntohl(th->seq);

	do {
		th->fin = th->psh = 0;

		th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
				       (__force u32)delta));
		if (skb->ip_summed != CHECKSUM_PARTIAL)
			th->check =
			     csum_fold(csum_partial(skb_transport_header(skb),
						    thlen, skb->csum));

		seq += mss;
		skb = skb->next;
		th = tcp_hdr(skb);

		th->seq = htonl(seq);
		th->cwr = 0;
	} while (skb->next);

	delta = htonl(oldlen + (skb->tail - skb->transport_header) +
		      skb->data_len);
	th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
				(__force u32)delta));
	if (skb->ip_summed != CHECKSUM_PARTIAL)
		th->check = csum_fold(csum_partial(skb_transport_header(skb),
						   thlen, skb->csum));

out:
	return segs;
}
EXPORT_SYMBOL(tcp_tso_segment);

struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct sk_buff *p;
	struct tcphdr *th;
	struct tcphdr *th2;
	unsigned int len;
	unsigned int thlen;
	unsigned int flags;
	unsigned int mss = 1;
	unsigned int hlen;
	unsigned int off;
	int flush = 1;
	int i;

	off = skb_gro_offset(skb);
	hlen = off + sizeof(*th);
	th = skb_gro_header_fast(skb, off);
	if (skb_gro_header_hard(skb, hlen)) {
		th = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!th))
			goto out;
	}

	thlen = th->doff * 4;
	if (thlen < sizeof(*th))
		goto out;

	hlen = off + thlen;
	if (skb_gro_header_hard(skb, hlen)) {
		th = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!th))
			goto out;
	}

	skb_gro_pull(skb, thlen);

	len = skb_gro_len(skb);
	flags = tcp_flag_word(th);

	for (; (p = *head); head = &p->next) {
		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		th2 = tcp_hdr(p);

		if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}

		goto found;
	}

	goto out_check_final;

found:
	flush = NAPI_GRO_CB(p)->flush;
	flush |= flags & TCP_FLAG_CWR;
	flush |= (flags ^ tcp_flag_word(th2)) &
		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH);
	flush |= th->ack_seq ^ th2->ack_seq;
	for (i = sizeof(*th); i < thlen; i += 4)
		flush |= *(u32 *)((u8 *)th + i) ^
			 *(u32 *)((u8 *)th2 + i);

	mss = skb_shinfo(p)->gso_size;

	flush |= (len - 1) >= mss;
	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);

	if (flush || skb_gro_receive(head, skb)) {
		mss = 1;
		goto out_check_final;
	}

	p = *head;
	th2 = tcp_hdr(p);
	tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);

out_check_final:
	flush = len < mss;
	flush |= flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST |
			  TCP_FLAG_SYN | TCP_FLAG_FIN);

	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
		pp = head;

out:
	NAPI_GRO_CB(skb)->flush |= flush;

	return pp;
}
EXPORT_SYMBOL(tcp_gro_receive);

int tcp_gro_complete(struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);

	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
	skb->ip_summed = CHECKSUM_PARTIAL;

	skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;

	if (th->cwr)
		skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;

	return 0;
}
EXPORT_SYMBOL(tcp_gro_complete);

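/*
 * Per-cpu pools of crypto contexts used to compute TCP MD5 signatures
 * (RFC 2385).  A single pool is shared by all sockets and reference
 * counted via tcp_md5sig_users: the first user allocates it, the last
 * one frees it, and tcp_md5sig_pool_lock serializes the transitions.
 */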
#ifdef CONFIG_TCP_MD5SIG
static unsigned long tcp_md5sig_users;
static struct tcp_md5sig_pool **tcp_md5sig_pool;
static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);

static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
{
	int cpu;
	for_each_possible_cpu(cpu) {
		struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu);
		if (p) {
			if (p->md5_desc.tfm)
				crypto_free_hash(p->md5_desc.tfm);
			kfree(p);
		}
	}
	free_percpu(pool);
}

void tcp_free_md5sig_pool(void)
{
	struct tcp_md5sig_pool **pool = NULL;

	spin_lock_bh(&tcp_md5sig_pool_lock);
	if (--tcp_md5sig_users == 0) {
		pool = tcp_md5sig_pool;
		tcp_md5sig_pool = NULL;
	}
	spin_unlock_bh(&tcp_md5sig_pool_lock);
	if (pool)
		__tcp_free_md5sig_pool(pool);
}

EXPORT_SYMBOL(tcp_free_md5sig_pool);

static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(struct sock *sk)
{
	int cpu;
	struct tcp_md5sig_pool **pool;

	pool = alloc_percpu(struct tcp_md5sig_pool *);
	if (!pool)
		return NULL;

	for_each_possible_cpu(cpu) {
		struct tcp_md5sig_pool *p;
		struct crypto_hash *hash;

		p = kzalloc(sizeof(*p), sk->sk_allocation);
		if (!p)
			goto out_free;
		*per_cpu_ptr(pool, cpu) = p;

		hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
		if (!hash || IS_ERR(hash))
			goto out_free;

		p->md5_desc.tfm = hash;
	}
	return pool;
out_free:
	__tcp_free_md5sig_pool(pool);
	return NULL;
}

struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(struct sock *sk)
{
	struct tcp_md5sig_pool **pool;
	int alloc = 0;

retry:
	spin_lock_bh(&tcp_md5sig_pool_lock);
	pool = tcp_md5sig_pool;
	if (tcp_md5sig_users++ == 0) {
		alloc = 1;
		spin_unlock_bh(&tcp_md5sig_pool_lock);
	} else if (!pool) {
		tcp_md5sig_users--;
		spin_unlock_bh(&tcp_md5sig_pool_lock);
		cpu_relax();
		goto retry;
	} else
		spin_unlock_bh(&tcp_md5sig_pool_lock);

	if (alloc) {
		/* we cannot hold spinlock here because this may sleep. */
		struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool(sk);
		spin_lock_bh(&tcp_md5sig_pool_lock);
		if (!p) {
			tcp_md5sig_users--;
			spin_unlock_bh(&tcp_md5sig_pool_lock);
			return NULL;
		}
		pool = tcp_md5sig_pool;
		if (pool) {
			/* oops, it has already been assigned. */
			spin_unlock_bh(&tcp_md5sig_pool_lock);
			__tcp_free_md5sig_pool(p);
		} else {
			tcp_md5sig_pool = pool = p;
			spin_unlock_bh(&tcp_md5sig_pool_lock);
		}
	}
	return pool;
}

EXPORT_SYMBOL(tcp_alloc_md5sig_pool);

struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu)
{
	struct tcp_md5sig_pool **p;
	spin_lock_bh(&tcp_md5sig_pool_lock);
	p = tcp_md5sig_pool;
	if (p)
		tcp_md5sig_users++;
	spin_unlock_bh(&tcp_md5sig_pool_lock);
	return (p ? *per_cpu_ptr(p, cpu) : NULL);
}

EXPORT_SYMBOL(__tcp_get_md5sig_pool);

void __tcp_put_md5sig_pool(void)
{
	tcp_free_md5sig_pool();
}

EXPORT_SYMBOL(__tcp_put_md5sig_pool);

int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
			struct tcphdr *th)
{
	struct scatterlist sg;
	int err;

	__sum16 old_checksum = th->check;
	th->check = 0;
	/* options aren't included in the hash */
	sg_init_one(&sg, th, sizeof(struct tcphdr));
	err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(struct tcphdr));
	th->check = old_checksum;
	return err;
}

EXPORT_SYMBOL(tcp_md5_hash_header);

int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
			  struct sk_buff *skb, unsigned header_len)
{
	struct scatterlist sg;
	const struct tcphdr *tp = tcp_hdr(skb);
	struct hash_desc *desc = &hp->md5_desc;
	unsigned i;
	const unsigned head_data_len = skb_headlen(skb) > header_len ?
				       skb_headlen(skb) - header_len : 0;
	const struct skb_shared_info *shi = skb_shinfo(skb);

	sg_init_table(&sg, 1);

	sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
	if (crypto_hash_update(desc, &sg, head_data_len))
		return 1;

	for (i = 0; i < shi->nr_frags; ++i) {
		const struct skb_frag_struct *f = &shi->frags[i];
		sg_set_page(&sg, f->page, f->size, f->page_offset);
		if (crypto_hash_update(desc, &sg, f->size))
			return 1;
	}

	return 0;
}

EXPORT_SYMBOL(tcp_md5_hash_skb_data);

int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key)
{
	struct scatterlist sg;

	sg_init_one(&sg, key->key, key->keylen);
	return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
}

EXPORT_SYMBOL(tcp_md5_hash_key);

#endif

void tcp_done(struct sock *sk)
{
	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);

	tcp_set_state(sk, TCP_CLOSE);
	tcp_clear_xmit_timers(sk);

	sk->sk_shutdown = SHUTDOWN_MASK;

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_state_change(sk);
	else
		inet_csk_destroy_sock(sk);
}
EXPORT_SYMBOL_GPL(tcp_done);

extern struct tcp_congestion_ops tcp_reno;

static __initdata unsigned long thash_entries;
static int __init set_thash_entries(char *str)
{
	if (!str)
		return 0;
	thash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("thash_entries=", set_thash_entries);
void __init tcp_init(void)
{
	struct sk_buff *skb = NULL;
	unsigned long nr_pages, limit;
	int order, i, max_share;

	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));

	percpu_counter_init(&tcp_sockets_allocated, 0);
	percpu_counter_init(&tcp_orphan_count, 0);
	tcp_hashinfo.bind_bucket_cachep =
		kmem_cache_create("tcp_bind_bucket",
				  sizeof(struct inet_bind_bucket), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Size and allocate the main established and bind bucket
	 * hash tables.
	 *
	 * The methodology is similar to that of the buffer cache.
	 */
	tcp_hashinfo.ehash =
		alloc_large_system_hash("TCP established",
					sizeof(struct inet_ehash_bucket),
					thash_entries,
					(totalram_pages >= 128 * 1024) ?
					13 : 15,
					0,
					&tcp_hashinfo.ehash_size,
					NULL,
					thash_entries ? 0 : 512 * 1024);
	tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
	for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
	}
	if (inet_ehash_locks_alloc(&tcp_hashinfo))
		panic("TCP: failed to alloc ehash_locks");
	tcp_hashinfo.bhash =
		alloc_large_system_hash("TCP bind",
					sizeof(struct inet_bind_hashbucket),
					tcp_hashinfo.ehash_size,
					(totalram_pages >= 128 * 1024) ?
					13 : 15,
					0,
					&tcp_hashinfo.bhash_size,
					NULL,
					64 * 1024);
	tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
	}

	/* Try to be a bit smarter and adjust defaults depending
	 * on available memory.
	 */
	for (order = 0; ((1 << order) << PAGE_SHIFT) <
			(tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
			order++)
		;
	if (order >= 4) {
		tcp_death_row.sysctl_max_tw_buckets = 180000;
		sysctl_tcp_max_orphans = 4096 << (order - 4);
		sysctl_max_syn_backlog = 1024;
	} else if (order < 3) {
		tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
		sysctl_tcp_max_orphans >>= (3 - order);
		sysctl_max_syn_backlog = 128;
	}

	/* Set the pressure threshold to be a fraction of global memory,
	 * decreasing with the amount of memory, with a floor of 128
	 * pages.
	 */
	nr_pages = totalram_pages - totalhigh_pages;
	limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
	limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
	limit = max(limit, 128UL);
	sysctl_tcp_mem[0] = limit / 4 * 3;
	sysctl_tcp_mem[1] = limit;
	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;

	/* Set per-socket limits to no more than 1/128 the pressure
	 * threshold.
	 */
	limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
	max_share = min(4UL*1024*1024, limit);

	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
	sysctl_tcp_wmem[1] = 16*1024;
	sysctl_tcp_wmem[2] = max(64*1024, max_share);

	sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
	sysctl_tcp_rmem[1] = 87380;
	sysctl_tcp_rmem[2] = max(87380, max_share);

	printk(KERN_INFO "TCP: Hash tables configured "
	       "(established %d bind %d)\n",
	       tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size);

	tcp_register_congestion_control(&tcp_reno);
}

EXPORT_SYMBOL(tcp_close);
EXPORT_SYMBOL(tcp_disconnect);
EXPORT_SYMBOL(tcp_getsockopt);
EXPORT_SYMBOL(tcp_ioctl);
EXPORT_SYMBOL(tcp_poll);
EXPORT_SYMBOL(tcp_read_sock);
EXPORT_SYMBOL(tcp_recvmsg);
EXPORT_SYMBOL(tcp_sendmsg);
EXPORT_SYMBOL(tcp_splice_read);
EXPORT_SYMBOL(tcp_sendpage);
EXPORT_SYMBOL(tcp_setsockopt);
EXPORT_SYMBOL(tcp_shutdown);