1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/math64.h>
#include <net/tcp.h>
#include <linux/inet_diag.h>
47
/* Upper bound of dctcp_alpha. Alpha is kept as a 10-bit fixed-point
 * fraction, so 1024 is the largest value the estimate may take.
 */
#define DCTCP_MAX_ALPHA	1024U
49
50struct dctcp {
51 u32 acked_bytes_ecn;
52 u32 acked_bytes_total;
53 u32 prior_snd_una;
54 u32 prior_rcv_nxt;
55 u32 dctcp_alpha;
56 u32 next_seq;
57 u32 ce_state;
58 u32 delayed_ack_reserved;
59};
60
61static unsigned int dctcp_shift_g __read_mostly = 4;
62module_param(dctcp_shift_g, uint, 0644);
63MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha");
64
65static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA;
66module_param(dctcp_alpha_on_init, uint, 0644);
67MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value");
68
69static unsigned int dctcp_clamp_alpha_on_loss __read_mostly;
70module_param(dctcp_clamp_alpha_on_loss, uint, 0644);
71MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss,
72 "parameter for clamping alpha on loss");
73
74static struct tcp_congestion_ops dctcp_reno;
75
76static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca)
77{
78 ca->next_seq = tp->snd_nxt;
79
80 ca->acked_bytes_ecn = 0;
81 ca->acked_bytes_total = 0;
82}
83
84static void dctcp_init(struct sock *sk)
85{
86 const struct tcp_sock *tp = tcp_sk(sk);
87
88 if ((tp->ecn_flags & TCP_ECN_OK) ||
89 (sk->sk_state == TCP_LISTEN ||
90 sk->sk_state == TCP_CLOSE)) {
91 struct dctcp *ca = inet_csk_ca(sk);
92
93 ca->prior_snd_una = tp->snd_una;
94 ca->prior_rcv_nxt = tp->rcv_nxt;
95
96 ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
97
98 ca->delayed_ack_reserved = 0;
99 ca->ce_state = 0;
100
101 dctcp_reset(tp, ca);
102 return;
103 }
104
105
106
107
108 inet_csk(sk)->icsk_ca_ops = &dctcp_reno;
109 INET_ECN_dontxmit(sk);
110}
111
112static u32 dctcp_ssthresh(struct sock *sk)
113{
114 const struct dctcp *ca = inet_csk_ca(sk);
115 struct tcp_sock *tp = tcp_sk(sk);
116
117 return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
118}
119
120
121
122
123
124
125
126static void dctcp_ce_state_0_to_1(struct sock *sk)
127{
128 struct dctcp *ca = inet_csk_ca(sk);
129 struct tcp_sock *tp = tcp_sk(sk);
130
131
132
133
134 if (!ca->ce_state && ca->delayed_ack_reserved) {
135 u32 tmp_rcv_nxt;
136
137
138 tmp_rcv_nxt = tp->rcv_nxt;
139
140
141 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
142 tp->rcv_nxt = ca->prior_rcv_nxt;
143
144 tcp_send_ack(sk);
145
146
147 tp->rcv_nxt = tmp_rcv_nxt;
148 }
149
150 ca->prior_rcv_nxt = tp->rcv_nxt;
151 ca->ce_state = 1;
152
153 tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
154}
155
156static void dctcp_ce_state_1_to_0(struct sock *sk)
157{
158 struct dctcp *ca = inet_csk_ca(sk);
159 struct tcp_sock *tp = tcp_sk(sk);
160
161
162
163
164 if (ca->ce_state && ca->delayed_ack_reserved) {
165 u32 tmp_rcv_nxt;
166
167
168 tmp_rcv_nxt = tp->rcv_nxt;
169
170
171 tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
172 tp->rcv_nxt = ca->prior_rcv_nxt;
173
174 tcp_send_ack(sk);
175
176
177 tp->rcv_nxt = tmp_rcv_nxt;
178 }
179
180 ca->prior_rcv_nxt = tp->rcv_nxt;
181 ca->ce_state = 0;
182
183 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
184}
185
186static void dctcp_update_alpha(struct sock *sk, u32 flags)
187{
188 const struct tcp_sock *tp = tcp_sk(sk);
189 struct dctcp *ca = inet_csk_ca(sk);
190 u32 acked_bytes = tp->snd_una - ca->prior_snd_una;
191
192
193
194
195 if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE))
196 acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss;
197 if (acked_bytes) {
198 ca->acked_bytes_total += acked_bytes;
199 ca->prior_snd_una = tp->snd_una;
200
201 if (flags & CA_ACK_ECE)
202 ca->acked_bytes_ecn += acked_bytes;
203 }
204
205
206 if (!before(tp->snd_una, ca->next_seq)) {
207
208 if (ca->acked_bytes_total == 0)
209 ca->acked_bytes_total = 1;
210
211
212 ca->dctcp_alpha = ca->dctcp_alpha -
213 (ca->dctcp_alpha >> dctcp_shift_g) +
214 (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) /
215 ca->acked_bytes_total;
216
217 if (ca->dctcp_alpha > DCTCP_MAX_ALPHA)
218
219 ca->dctcp_alpha = DCTCP_MAX_ALPHA;
220
221 dctcp_reset(tp, ca);
222 }
223}
224
225static void dctcp_state(struct sock *sk, u8 new_state)
226{
227 if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) {
228 struct dctcp *ca = inet_csk_ca(sk);
229
230
231
232
233
234
235
236
237
238 ca->dctcp_alpha = DCTCP_MAX_ALPHA;
239 }
240}
241
242static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev)
243{
244 struct dctcp *ca = inet_csk_ca(sk);
245
246 switch (ev) {
247 case CA_EVENT_DELAYED_ACK:
248 if (!ca->delayed_ack_reserved)
249 ca->delayed_ack_reserved = 1;
250 break;
251 case CA_EVENT_NON_DELAYED_ACK:
252 if (ca->delayed_ack_reserved)
253 ca->delayed_ack_reserved = 0;
254 break;
255 default:
256
257 break;
258 }
259}
260
261static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
262{
263 switch (ev) {
264 case CA_EVENT_ECN_IS_CE:
265 dctcp_ce_state_0_to_1(sk);
266 break;
267 case CA_EVENT_ECN_NO_CE:
268 dctcp_ce_state_1_to_0(sk);
269 break;
270 case CA_EVENT_DELAYED_ACK:
271 case CA_EVENT_NON_DELAYED_ACK:
272 dctcp_update_ack_reserved(sk, ev);
273 break;
274 default:
275
276 break;
277 }
278}
279
280static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
281{
282 const struct dctcp *ca = inet_csk_ca(sk);
283
284
285
286
287 if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) ||
288 ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
289 struct tcp_dctcp_info info;
290
291 memset(&info, 0, sizeof(info));
292 if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) {
293 info.dctcp_enabled = 1;
294 info.dctcp_ce_state = (u16) ca->ce_state;
295 info.dctcp_alpha = ca->dctcp_alpha;
296 info.dctcp_ab_ecn = ca->acked_bytes_ecn;
297 info.dctcp_ab_tot = ca->acked_bytes_total;
298 }
299
300 nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info);
301 }
302}
303
304static struct tcp_congestion_ops dctcp __read_mostly = {
305 .init = dctcp_init,
306 .in_ack_event = dctcp_update_alpha,
307 .cwnd_event = dctcp_cwnd_event,
308 .ssthresh = dctcp_ssthresh,
309 .cong_avoid = tcp_reno_cong_avoid,
310 .set_state = dctcp_state,
311 .get_info = dctcp_get_info,
312 .flags = TCP_CONG_NEEDS_ECN,
313 .owner = THIS_MODULE,
314 .name = "dctcp",
315};
316
317static struct tcp_congestion_ops dctcp_reno __read_mostly = {
318 .ssthresh = tcp_reno_ssthresh,
319 .cong_avoid = tcp_reno_cong_avoid,
320 .get_info = dctcp_get_info,
321 .owner = THIS_MODULE,
322 .name = "dctcp-reno",
323};
324
325static int __init dctcp_register(void)
326{
327 BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE);
328 return tcp_register_congestion_control(&dctcp);
329}
330
331static void __exit dctcp_unregister(void)
332{
333 tcp_unregister_congestion_control(&dctcp);
334}
335
336module_init(dctcp_register);
337module_exit(dctcp_unregister);
338
339MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
340MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
341MODULE_AUTHOR("Glenn Judd <glenn.judd@morganstanley.com>");
342
343MODULE_LICENSE("GPL v2");
344MODULE_DESCRIPTION("DataCenter TCP (DCTCP)");
345