1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26#include <linux/mm.h>
27#include <linux/module.h>
28#include <linux/math64.h>
29#include <net/tcp.h>
30
31#define BICTCP_BETA_SCALE 1024
32
33
34#define BICTCP_HZ 10
35
36
37#define HYSTART_ACK_TRAIN 0x1
38#define HYSTART_DELAY 0x2
39
40
41#define HYSTART_MIN_SAMPLES 8
42#define HYSTART_DELAY_MIN (4U<<3)
43#define HYSTART_DELAY_MAX (16U<<3)
44#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
45
46static int fast_convergence __read_mostly = 1;
47static int beta __read_mostly = 717;
48static int initial_ssthresh __read_mostly;
49static int bic_scale __read_mostly = 41;
50static int tcp_friendliness __read_mostly = 1;
51
52static int hystart __read_mostly = 1;
53static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
54static int hystart_low_window __read_mostly = 16;
55static int hystart_ack_delta __read_mostly = 2;
56
57static u32 cube_rtt_scale __read_mostly;
58static u32 beta_scale __read_mostly;
59static u64 cube_factor __read_mostly;
60
61
62module_param(fast_convergence, int, 0644);
63MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
64module_param(beta, int, 0644);
65MODULE_PARM_DESC(beta, "beta for multiplicative increase");
66module_param(initial_ssthresh, int, 0644);
67MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
68module_param(bic_scale, int, 0444);
69MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
70module_param(tcp_friendliness, int, 0644);
71MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
72module_param(hystart, int, 0644);
73MODULE_PARM_DESC(hystart, "turn on/off hybrid slow start algorithm");
74module_param(hystart_detect, int, 0644);
75MODULE_PARM_DESC(hystart_detect, "hyrbrid slow start detection mechanisms"
76 " 1: packet-train 2: delay 3: both packet-train and delay");
77module_param(hystart_low_window, int, 0644);
78MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
79module_param(hystart_ack_delta, int, 0644);
80MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)");
81
82
83struct bictcp {
84 u32 cnt;
85 u32 last_max_cwnd;
86 u32 loss_cwnd;
87 u32 last_cwnd;
88 u32 last_time;
89 u32 bic_origin_point;
90 u32 bic_K;
91 u32 delay_min;
92 u32 epoch_start;
93 u32 ack_cnt;
94 u32 tcp_cwnd;
95#define ACK_RATIO_SHIFT 4
96#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT)
97 u16 delayed_ack;
98 u8 sample_cnt;
99 u8 found;
100 u32 round_start;
101 u32 end_seq;
102 u32 last_ack;
103 u32 curr_rtt;
104};
105
106static inline void bictcp_reset(struct bictcp *ca)
107{
108 ca->cnt = 0;
109 ca->last_max_cwnd = 0;
110 ca->loss_cwnd = 0;
111 ca->last_cwnd = 0;
112 ca->last_time = 0;
113 ca->bic_origin_point = 0;
114 ca->bic_K = 0;
115 ca->delay_min = 0;
116 ca->epoch_start = 0;
117 ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
118 ca->ack_cnt = 0;
119 ca->tcp_cwnd = 0;
120 ca->found = 0;
121}
122
123static inline u32 bictcp_clock(void)
124{
125#if HZ < 1000
126 return ktime_to_ms(ktime_get_real());
127#else
128 return jiffies_to_msecs(jiffies);
129#endif
130}
131
132static inline void bictcp_hystart_reset(struct sock *sk)
133{
134 struct tcp_sock *tp = tcp_sk(sk);
135 struct bictcp *ca = inet_csk_ca(sk);
136
137 ca->round_start = ca->last_ack = bictcp_clock();
138 ca->end_seq = tp->snd_nxt;
139 ca->curr_rtt = 0;
140 ca->sample_cnt = 0;
141}
142
143static void bictcp_init(struct sock *sk)
144{
145 bictcp_reset(inet_csk_ca(sk));
146
147 if (hystart)
148 bictcp_hystart_reset(sk);
149
150 if (!hystart && initial_ssthresh)
151 tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
152}
153
154
155
156
157
158static u32 cubic_root(u64 a)
159{
160 u32 x, b, shift;
161
162
163
164
165
166
167
168
169 static const u8 v[] = {
170 0, 54, 54, 54, 118, 118, 118, 118,
171 123, 129, 134, 138, 143, 147, 151, 156,
172 157, 161, 164, 168, 170, 173, 176, 179,
173 181, 185, 187, 190, 192, 194, 197, 199,
174 200, 202, 204, 206, 209, 211, 213, 215,
175 217, 219, 221, 222, 224, 225, 227, 229,
176 231, 232, 234, 236, 237, 239, 240, 242,
177 244, 245, 246, 248, 250, 251, 252, 254,
178 };
179
180 b = fls64(a);
181 if (b < 7) {
182
183 return ((u32)v[(u32)a] + 35) >> 6;
184 }
185
186 b = ((b * 84) >> 8) - 1;
187 shift = (a >> (b * 3));
188
189 x = ((u32)(((u32)v[shift] + 10) << b)) >> 6;
190
191
192
193
194
195
196
197 x = (2 * x + (u32)div64_u64(a, (u64)x * (u64)(x - 1)));
198 x = ((x * 341) >> 10);
199 return x;
200}
201
202
203
204
205static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
206{
207 u64 offs;
208 u32 delta, t, bic_target, max_cnt;
209
210 ca->ack_cnt++;
211
212 if (ca->last_cwnd == cwnd &&
213 (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
214 return;
215
216 ca->last_cwnd = cwnd;
217 ca->last_time = tcp_time_stamp;
218
219 if (ca->epoch_start == 0) {
220 ca->epoch_start = tcp_time_stamp;
221 ca->ack_cnt = 1;
222 ca->tcp_cwnd = cwnd;
223
224 if (ca->last_max_cwnd <= cwnd) {
225 ca->bic_K = 0;
226 ca->bic_origin_point = cwnd;
227 } else {
228
229
230
231 ca->bic_K = cubic_root(cube_factor
232 * (ca->last_max_cwnd - cwnd));
233 ca->bic_origin_point = ca->last_max_cwnd;
234 }
235 }
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252 t = ((tcp_time_stamp + msecs_to_jiffies(ca->delay_min>>3)
253 - ca->epoch_start) << BICTCP_HZ) / HZ;
254
255 if (t < ca->bic_K)
256 offs = ca->bic_K - t;
257 else
258 offs = t - ca->bic_K;
259
260
261 delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
262 if (t < ca->bic_K)
263 bic_target = ca->bic_origin_point - delta;
264 else
265 bic_target = ca->bic_origin_point + delta;
266
267
268 if (bic_target > cwnd) {
269 ca->cnt = cwnd / (bic_target - cwnd);
270 } else {
271 ca->cnt = 100 * cwnd;
272 }
273
274
275
276
277
278 if (ca->loss_cwnd == 0 && ca->cnt > 20)
279 ca->cnt = 20;
280
281
282 if (tcp_friendliness) {
283 u32 scale = beta_scale;
284 delta = (cwnd * scale) >> 3;
285 while (ca->ack_cnt > delta) {
286 ca->ack_cnt -= delta;
287 ca->tcp_cwnd++;
288 }
289
290 if (ca->tcp_cwnd > cwnd){
291 delta = ca->tcp_cwnd - cwnd;
292 max_cnt = cwnd / delta;
293 if (ca->cnt > max_cnt)
294 ca->cnt = max_cnt;
295 }
296 }
297
298 ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
299 if (ca->cnt == 0)
300 ca->cnt = 1;
301}
302
303static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
304{
305 struct tcp_sock *tp = tcp_sk(sk);
306 struct bictcp *ca = inet_csk_ca(sk);
307
308 if (!tcp_is_cwnd_limited(sk, in_flight))
309 return;
310
311 if (tp->snd_cwnd <= tp->snd_ssthresh) {
312 if (hystart && after(ack, ca->end_seq))
313 bictcp_hystart_reset(sk);
314 tcp_slow_start(tp);
315 } else {
316 bictcp_update(ca, tp->snd_cwnd);
317 tcp_cong_avoid_ai(tp, ca->cnt);
318 }
319
320}
321
322static u32 bictcp_recalc_ssthresh(struct sock *sk)
323{
324 const struct tcp_sock *tp = tcp_sk(sk);
325 struct bictcp *ca = inet_csk_ca(sk);
326
327 ca->epoch_start = 0;
328
329
330 if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
331 ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
332 / (2 * BICTCP_BETA_SCALE);
333 else
334 ca->last_max_cwnd = tp->snd_cwnd;
335
336 ca->loss_cwnd = tp->snd_cwnd;
337
338 return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
339}
340
341static u32 bictcp_undo_cwnd(struct sock *sk)
342{
343 struct bictcp *ca = inet_csk_ca(sk);
344
345 return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd);
346}
347
348static void bictcp_state(struct sock *sk, u8 new_state)
349{
350 if (new_state == TCP_CA_Loss) {
351 bictcp_reset(inet_csk_ca(sk));
352 bictcp_hystart_reset(sk);
353 }
354}
355
356static void hystart_update(struct sock *sk, u32 delay)
357{
358 struct tcp_sock *tp = tcp_sk(sk);
359 struct bictcp *ca = inet_csk_ca(sk);
360
361 if (!(ca->found & hystart_detect)) {
362 u32 now = bictcp_clock();
363
364
365 if ((s32)(now - ca->last_ack) <= hystart_ack_delta) {
366 ca->last_ack = now;
367 if ((s32)(now - ca->round_start) > ca->delay_min >> 4)
368 ca->found |= HYSTART_ACK_TRAIN;
369 }
370
371
372 if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
373 if (ca->curr_rtt == 0 || ca->curr_rtt > delay)
374 ca->curr_rtt = delay;
375
376 ca->sample_cnt++;
377 } else {
378 if (ca->curr_rtt > ca->delay_min +
379 HYSTART_DELAY_THRESH(ca->delay_min>>4))
380 ca->found |= HYSTART_DELAY;
381 }
382
383
384
385
386 if (ca->found & hystart_detect)
387 tp->snd_ssthresh = tp->snd_cwnd;
388 }
389}
390
391
392
393
394static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
395{
396 const struct inet_connection_sock *icsk = inet_csk(sk);
397 const struct tcp_sock *tp = tcp_sk(sk);
398 struct bictcp *ca = inet_csk_ca(sk);
399 u32 delay;
400
401 if (icsk->icsk_ca_state == TCP_CA_Open) {
402 u32 ratio = ca->delayed_ack;
403
404 ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT;
405 ratio += cnt;
406
407 ca->delayed_ack = min(ratio, ACK_RATIO_LIMIT);
408 }
409
410
411 if (rtt_us < 0)
412 return;
413
414
415 if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ)
416 return;
417
418 delay = (rtt_us << 3) / USEC_PER_MSEC;
419 if (delay == 0)
420 delay = 1;
421
422
423 if (ca->delay_min == 0 || ca->delay_min > delay)
424 ca->delay_min = delay;
425
426
427 if (hystart && tp->snd_cwnd <= tp->snd_ssthresh &&
428 tp->snd_cwnd >= hystart_low_window)
429 hystart_update(sk, delay);
430}
431
432static struct tcp_congestion_ops cubictcp __read_mostly = {
433 .init = bictcp_init,
434 .ssthresh = bictcp_recalc_ssthresh,
435 .cong_avoid = bictcp_cong_avoid,
436 .set_state = bictcp_state,
437 .undo_cwnd = bictcp_undo_cwnd,
438 .pkts_acked = bictcp_acked,
439 .owner = THIS_MODULE,
440 .name = "cubic",
441};
442
443static int __init cubictcp_register(void)
444{
445 BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
446
447
448
449
450
451 beta_scale = 8*(BICTCP_BETA_SCALE+beta)/ 3 / (BICTCP_BETA_SCALE - beta);
452
453 cube_rtt_scale = (bic_scale * 10);
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469 cube_factor = 1ull << (10+3*BICTCP_HZ);
470
471
472 do_div(cube_factor, bic_scale * 10);
473
474
475 if (hystart && HZ < 1000)
476 cubictcp.flags |= TCP_CONG_RTT_STAMP;
477
478 return tcp_register_congestion_control(&cubictcp);
479}
480
481static void __exit cubictcp_unregister(void)
482{
483 tcp_unregister_congestion_control(&cubictcp);
484}
485
486module_init(cubictcp_register);
487module_exit(cubictcp_unregister);
488
489MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
490MODULE_LICENSE("GPL");
491MODULE_DESCRIPTION("CUBIC TCP");
492MODULE_VERSION("2.3");
493