/*
 * TCP CUBIC: Binary Increase Congestion control for TCP v2.3
 *
 * Implementation of the CUBIC congestion control algorithm described by
 * Sangtae Ha, Injong Rhee and Lisong Xu in
 * "CUBIC: A New TCP-Friendly High-Speed TCP Variant".
 *
 * CUBIC grows the congestion window as a cubic function of the time
 * elapsed since the last congestion event, with the plateau of the curve
 * anchored at the window size reached just before that loss.  The module
 * also implements HyStart (hybrid slow start), which leaves slow start
 * early when either the length of an ACK train or an increase in the
 * per-round RTT indicates that the path is close to saturation.
 */
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/math64.h>
#include <net/tcp.h>

#define BICTCP_BETA_SCALE 1024	/* scale factor for beta:
				 * ssthresh = snd_cwnd * beta / BICTCP_BETA_SCALE
				 */

#define BICTCP_HZ 10	/* BIC HZ 2^10 = 1024 */

/* Two methods of hybrid slow start */
#define HYSTART_ACK_TRAIN 0x1
#define HYSTART_DELAY 0x2

/* Number of delay samples for detecting the increase of delay */
#define HYSTART_MIN_SAMPLES 8
/* Delay thresholds are kept in (msec << 3) units: 4 ms and 16 ms */
#define HYSTART_DELAY_MIN (4U<<3)
#define HYSTART_DELAY_MAX (16U<<3)
#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)

static int fast_convergence __read_mostly = 1;
static int beta __read_mostly = 717;	/* = 717/1024 (BICTCP_BETA_SCALE) */
static int initial_ssthresh __read_mostly;
static int bic_scale __read_mostly = 41;
static int tcp_friendliness __read_mostly = 1;

static int hystart __read_mostly = 1;
static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
static int hystart_low_window __read_mostly = 16;
static int hystart_ack_delta __read_mostly = 2;

/* Scaling factors precomputed from the parameters above at registration time */
static u32 cube_rtt_scale __read_mostly;
static u32 beta_scale __read_mostly;
static u64 cube_factor __read_mostly;
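
/*
 * How the module parameters map onto the constants of the CUBIC
 * algorithm (a rough guide, derived from the arithmetic in this file):
 *
 *   beta = 717      ->  multiplicative decrease factor
 *                       beta / BICTCP_BETA_SCALE = 717/1024 ~= 0.7
 *   bic_scale = 41  ->  cubic coefficient; the growth term computed in
 *                       bictcp_update() works out to roughly
 *                       0.4 * (t - K)^3 segments (t in seconds), since
 *                       cube_rtt_scale = bic_scale * 10 = 410 and
 *                       410 / 1024 ~= 0.4
 */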

module_param(fast_convergence, int, 0644);
MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
module_param(beta, int, 0644);
MODULE_PARM_DESC(beta, "beta for multiplicative decrease");
module_param(initial_ssthresh, int, 0644);
MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
module_param(bic_scale, int, 0444);
MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
module_param(tcp_friendliness, int, 0644);
MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
module_param(hystart, int, 0644);
MODULE_PARM_DESC(hystart, "turn on/off hybrid slow start algorithm");
module_param(hystart_detect, int, 0644);
MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms"
		 " 1: packet-train 2: delay 3: both packet-train and delay");
module_param(hystart_low_window, int, 0644);
MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
module_param(hystart_ack_delta, int, 0644);
MODULE_PARM_DESC(hystart_ack_delta, "spacing between ACKs indicating train (msecs)");

/* BIC TCP Parameters */
struct bictcp {
	u32	cnt;		/* increase cwnd by 1 after this many ACKs */
	u32	last_max_cwnd;	/* last maximum snd_cwnd */
	u32	loss_cwnd;	/* congestion window at last loss */
	u32	last_cwnd;	/* the last snd_cwnd */
	u32	last_time;	/* time when last_cwnd was updated */
	u32	bic_origin_point;/* origin point of the bic function */
	u32	bic_K;		/* time to origin point from the
				 * beginning of the current epoch */
	u32	delay_min;	/* min delay (msec << 3) */
	u32	epoch_start;	/* beginning of an epoch */
	u32	ack_cnt;	/* number of ACKs */
	u32	tcp_cwnd;	/* estimated tcp cwnd */
#define ACK_RATIO_SHIFT 4
#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT)
	u16	delayed_ack;	/* estimated ratio of packets/ACKs << 4 */
	u8	sample_cnt;	/* number of samples used for curr_rtt */
	u8	found;		/* has the exit point been found? */
	u32	round_start;	/* beginning of each round */
	u32	end_seq;	/* end_seq of the round */
	u32	last_ack;	/* last time the ACK spacing was close */
	u32	curr_rtt;	/* minimum rtt of the current round */
};
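
/*
 * Note on delayed_ack: it is a fixed-point (<< ACK_RATIO_SHIFT) moving
 * average of the number of segments covered by each ACK, maintained in
 * bictcp_acked().  bictcp_update() divides its computed cnt by this
 * ratio so that receivers using delayed ACKs do not see slower window
 * growth.
 */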

static inline void bictcp_reset(struct bictcp *ca)
{
	ca->cnt = 0;
	ca->last_max_cwnd = 0;
	ca->last_cwnd = 0;
	ca->last_time = 0;
	ca->bic_origin_point = 0;
	ca->bic_K = 0;
	ca->delay_min = 0;
	ca->epoch_start = 0;
	ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
	ca->ack_cnt = 0;
	ca->tcp_cwnd = 0;
	ca->found = 0;
}

static inline u32 bictcp_clock(void)
{
#if HZ < 1000
	return ktime_to_ms(ktime_get_real());
#else
	return jiffies_to_msecs(jiffies);
#endif
}
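
/*
 * The HyStart bookkeeping below uses bictcp_clock(), a millisecond
 * timestamp.  When HZ < 1000 the jiffies clock is too coarse for the
 * ACK-spacing test (hystart_ack_delta defaults to 2 ms), which is
 * presumably why the real-time clock is used in that configuration.
 */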

static inline void bictcp_hystart_reset(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	ca->round_start = ca->last_ack = bictcp_clock();
	ca->end_seq = tp->snd_nxt;
	ca->curr_rtt = 0;
	ca->sample_cnt = 0;
}

static void bictcp_init(struct sock *sk)
{
	struct bictcp *ca = inet_csk_ca(sk);

	bictcp_reset(ca);
	ca->loss_cwnd = 0;

	if (hystart)
		bictcp_hystart_reset(sk);

	if (!hystart && initial_ssthresh)
		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}

/*
 * Calculate the cubic root of x using a table lookup followed by one
 * Newton-Raphson iteration.
 */
static u32 cubic_root(u64 a)
{
	u32 x, b, shift;

	/*
	 * Precomputed initial estimates of the cube root, indexed by the
	 * most significant bits of the argument; the estimate is refined
	 * by the Newton-Raphson step below.
	 */
	static const u8 v[] = {
		/* 0x00 */    0,   54,   54,   54,  118,  118,  118,  118,
		/* 0x08 */  123,  129,  134,  138,  143,  147,  151,  156,
		/* 0x10 */  157,  161,  164,  168,  170,  173,  176,  179,
		/* 0x18 */  181,  185,  187,  190,  192,  194,  197,  199,
		/* 0x20 */  200,  202,  204,  206,  209,  211,  213,  215,
		/* 0x28 */  217,  219,  221,  222,  224,  225,  227,  229,
		/* 0x30 */  231,  232,  234,  236,  237,  239,  240,  242,
		/* 0x38 */  244,  245,  246,  248,  250,  251,  252,  254,
	};

	b = fls64(a);
	if (b < 7) {
		/* a in [0..63]: table lookup only */
		return ((u32)v[(u32)a] + 35) >> 6;
	}

	b = ((b * 84) >> 8) - 1;	/* 84/256 ~= 1/3: b ~= msb(a)/3 - 1 */
	shift = (a >> (b * 3));

	x = ((u32)(((u32)v[shift] + 10) << b)) >> 6;

	/*
	 * One Newton-Raphson refinement step for the cube root,
	 *   x_next = (2 * x + a / x^2) / 3,
	 * with a / x^2 approximated by a / (x * (x - 1)) and the final
	 * division by 3 approximated by * 341 >> 10 (341/1024 ~= 1/3).
	 */
	x = (2 * x + (u32)div64_u64(a, (u64)x * (u64)(x - 1)));
	x = ((x * 341) >> 10);
	return x;
}
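
/*
 * The cubic window growth implemented by bictcp_update() below is,
 * in effect (a sketch using the defaults bic_scale = 41, beta = 717):
 *
 *   W(t) = C * (t - K)^3 + Wmax
 *
 * where Wmax is the window at the last loss (bic_origin_point),
 * C = bic_scale * 10 / 2^10 ~= 0.4, and K = cbrt((Wmax - cwnd) / C) is
 * the time at which the curve returns to Wmax.  Time is kept in fixed
 * point, 2^BICTCP_HZ units per second, so bic_K and the time offsets
 * below share a resolution of roughly 1 ms.
 */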

/*
 * Compute the congestion window pacing count, ca->cnt: the number of
 * ACKed packets per one snd_cwnd increment.
 */
static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
{
	u32 delta, bic_target, max_cnt;
	u64 offs, t;

	ca->ack_cnt++;	/* count the number of ACKs */

	if (ca->last_cwnd == cwnd &&
	    (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
		return;

	ca->last_cwnd = cwnd;
	ca->last_time = tcp_time_stamp;

	if (ca->epoch_start == 0) {
		ca->epoch_start = tcp_time_stamp;	/* record beginning */
		ca->ack_cnt = 1;			/* start counting */
		ca->tcp_cwnd = cwnd;			/* sync with cubic */

		if (ca->last_max_cwnd <= cwnd) {
			ca->bic_K = 0;
			ca->bic_origin_point = cwnd;
		} else {
			/* K = cbrt((wmax - cwnd) / C) in 2^BICTCP_HZ units
			 * per second; cube_factor = 2^(3*BICTCP_HZ) / C is
			 * precomputed in cubictcp_register().
			 */
			ca->bic_K = cubic_root(cube_factor
					       * (ca->last_max_cwnd - cwnd));
			ca->bic_origin_point = ca->last_max_cwnd;
		}
	}

	/* cubic function: calculate C * (t - K)^3, taking care that the
	 * cube does not overflow; time is converted to units of
	 * 2^-BICTCP_HZ seconds (1/1024 s) before being cubed.
	 */

	/* change the unit from HZ (jiffies) to BICTCP_HZ */
	t = (s32)(tcp_time_stamp - ca->epoch_start);
	t += msecs_to_jiffies(ca->delay_min >> 3);	/* add min RTT */
	t <<= BICTCP_HZ;
	do_div(t, HZ);

	if (t < ca->bic_K)		/* |t - K| */
		offs = ca->bic_K - t;
	else
		offs = t - ca->bic_K;

	/* C * |t - K|^3 */
	delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
	if (t < ca->bic_K)				/* below origin */
		bic_target = ca->bic_origin_point - delta;
	else						/* above origin */
		bic_target = ca->bic_origin_point + delta;

	/* cubic function: calculate the pacing count towards bic_target */
	if (bic_target > cwnd) {
		ca->cnt = cwnd / (bic_target - cwnd);
	} else {
		ca->cnt = 100 * cwnd;		/* very small increment */
	}

	/*
	 * The initial growth of the cubic function may be too conservative
	 * when the available bandwidth is still unknown.
	 */
	if (ca->last_max_cwnd == 0 && ca->cnt > 20)
		ca->cnt = 20;	/* increase cwnd 1/20 of the time */

	/* TCP friendliness: track the window a Reno-like AIMD flow would
	 * have in ca->tcp_cwnd and never grow more slowly than it.
	 */
	if (tcp_friendliness) {
		u32 scale = beta_scale;

		delta = (cwnd * scale) >> 3;
		while (ca->ack_cnt > delta) {		/* update tcp cwnd */
			ca->ack_cnt -= delta;
			ca->tcp_cwnd++;
		}

		if (ca->tcp_cwnd > cwnd) {	/* if bic is slower than tcp */
			delta = ca->tcp_cwnd - cwnd;
			max_cnt = cwnd / delta;
			if (ca->cnt > max_cnt)
				ca->cnt = max_cnt;
		}
	}

	/* scale by the packets-per-ACK ratio so delayed ACKs do not slow growth */
	ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
	if (ca->cnt == 0)		/* cannot be zero */
		ca->cnt = 1;
}
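
/*
 * The cnt value computed above is handed to tcp_cong_avoid_ai() by
 * bictcp_cong_avoid(): snd_cwnd then grows by roughly one segment for
 * every ca->cnt packets that are ACKed, so a smaller cnt means faster
 * convergence towards bic_target.
 */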

static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	if (!tcp_is_cwnd_limited(sk))
		return;

	if (tp->snd_cwnd <= tp->snd_ssthresh) {
		if (hystart && after(ack, ca->end_seq))
			bictcp_hystart_reset(sk);
		tcp_slow_start(tp, acked);
	} else {
		bictcp_update(ca, tp->snd_cwnd);
		tcp_cong_avoid_ai(tp, ca->cnt);
	}
}

static u32 bictcp_recalc_ssthresh(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	ca->epoch_start = 0;	/* end of epoch */

	/* Wmax and fast convergence */
	if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
		ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
			/ (2 * BICTCP_BETA_SCALE);
	else
		ca->last_max_cwnd = tp->snd_cwnd;

	ca->loss_cwnd = tp->snd_cwnd;

	return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
}
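
/*
 * With the default beta = 717, the new ssthresh is snd_cwnd * 717/1024,
 * i.e. roughly a 0.7 multiplicative decrease.  When fast convergence is
 * enabled and the window failed to reach the previous Wmax, the
 * remembered Wmax is additionally reduced to
 * snd_cwnd * (1024 + 717) / 2048, about 0.85 * snd_cwnd, to release
 * bandwidth to newer flows sooner.
 */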

static u32 bictcp_undo_cwnd(struct sock *sk)
{
	struct bictcp *ca = inet_csk_ca(sk);

	return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
}

static void bictcp_state(struct sock *sk, u8 new_state)
{
	if (new_state == TCP_CA_Loss) {
		bictcp_reset(inet_csk_ca(sk));
		bictcp_hystart_reset(sk);
	}
}

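/*
 * HyStart exit heuristics (a summary of the checks coded below; delay
 * values are stored in (msec << 3) fixed point):
 *
 *  - ACK train: while ACKs keep arriving within hystart_ack_delta msecs
 *    of each other, the train length (now - round_start) is compared
 *    against delay_min / 2, an estimate of the forward one-way delay.
 *    Exceeding it suggests the pipe is nearly full.
 *
 *  - Delay increase: the minimum RTT over the first HYSTART_MIN_SAMPLES
 *    samples of the current round is compared against
 *    delay_min + clamp(delay_min / 16, 4 ms, 16 ms).
 *
 * Either event sets ssthresh to the current cwnd, ending slow start.
 */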
static void hystart_update(struct sock *sk, u32 delay)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	if (!(ca->found & hystart_detect)) {
		u32 now = bictcp_clock();

		/* first detection parameter - ack-train detection */
		if ((s32)(now - ca->last_ack) <= hystart_ack_delta) {
			ca->last_ack = now;
			if ((s32)(now - ca->round_start) > ca->delay_min >> 4)
				ca->found |= HYSTART_ACK_TRAIN;
		}

		/* second detection parameter - delay increase detection:
		 * take the minimum delay over the first samples of the round
		 */
		if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
			if (ca->curr_rtt == 0 || ca->curr_rtt > delay)
				ca->curr_rtt = delay;

			ca->sample_cnt++;
		} else {
			if (ca->curr_rtt > ca->delay_min +
			    HYSTART_DELAY_THRESH(ca->delay_min >> 4))
				ca->found |= HYSTART_DELAY;
		}

		/*
		 * Once either of the two conditions is met,
		 * exit from slow start immediately.
		 */
		if (ca->found & hystart_detect)
			tp->snd_ssthresh = tp->snd_cwnd;
	}
}

/* Track the delayed acknowledgment ratio using a sliding-window average:
 *	ratio = (15 * ratio + sample) / 16
 */
static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);
	u32 delay;

	if (icsk->icsk_ca_state == TCP_CA_Open) {
		u32 ratio = ca->delayed_ack;

		ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT;
		ratio += cnt;

		ca->delayed_ack = clamp(ratio, 1U, ACK_RATIO_LIMIT);
	}

	/* Some calls are for duplicates without timestamps */
	if (rtt_us < 0)
		return;

	/* Discard delay samples right after fast recovery */
	if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
		return;

	delay = (rtt_us << 3) / USEC_PER_MSEC;	/* RTT in (msec << 3) units */
	if (delay == 0)
		delay = 1;

	/* first time call or link delay decreases */
	if (ca->delay_min == 0 || ca->delay_min > delay)
		ca->delay_min = delay;

	/* hystart triggers when cwnd is larger than some threshold */
	if (hystart && tp->snd_cwnd <= tp->snd_ssthresh &&
	    tp->snd_cwnd >= hystart_low_window)
		hystart_update(sk, delay);
}

static struct tcp_congestion_ops cubictcp __read_mostly = {
	.init		= bictcp_init,
	.ssthresh	= bictcp_recalc_ssthresh,
	.cong_avoid	= bictcp_cong_avoid,
	.set_state	= bictcp_state,
	.undo_cwnd	= bictcp_undo_cwnd,
	.pkts_acked	= bictcp_acked,
	.owner		= THIS_MODULE,
	.name		= "cubic",
};

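/*
 * Worked numbers for the constants precomputed below, assuming the
 * default parameters (beta = 717, bic_scale = 41):
 *
 *   beta_scale  = 8 * (1024 + 717) / 3 / (1024 - 717) = 15
 *     bictcp_update() uses (cwnd * beta_scale) >> 3, i.e. roughly
 *     cwnd * (1 + beta) / (3 * (1 - beta)) ACKs, as the spacing at which
 *     the emulated Reno window (tcp_cwnd) is bumped by one segment.
 *
 *   cube_factor = 2^(10 + 3*BICTCP_HZ) / (bic_scale * 10) = 2^40 / 410
 *     so that cubic_root(cube_factor * (Wmax - cwnd)) yields
 *     K = cbrt((Wmax - cwnd) / 0.4) in units of 2^-BICTCP_HZ seconds,
 *     matching the time base used in bictcp_update().
 */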
static int __init cubictcp_register(void)
{
	BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);

	/* Precompute the scaling factors that are used on every ACK,
	 * based on an assumed SRTT of 100 ms.
	 */
	beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3
		/ (BICTCP_BETA_SCALE - beta);

	cube_rtt_scale = (bic_scale * 10);	/* 1024 * c / rtt */

	/* Calculate the factor used to derive "K" for
	 * (wmax - cwnd) = c/rtt * K^3, i.e.
	 * K = cubic_root((wmax - cwnd) * rtt / c);
	 * the unit of K is BICTCP_HZ = 2^10 per second, not HZ.
	 */
	cube_factor = 1ull << (10+3*BICTCP_HZ);		/* 2^40 */

	/* divide by bic_scale and by the constant srtt (100 ms) */
	do_div(cube_factor, bic_scale * 10);

	return tcp_register_congestion_control(&cubictcp);
}

static void __exit cubictcp_unregister(void)
{
	tcp_unregister_congestion_control(&cubictcp);
}

module_init(cubictcp_register);
module_exit(cubictcp_unregister);

MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("CUBIC TCP");
MODULE_VERSION("2.3");