1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26#include <linux/mm.h>
27#include <linux/module.h>
28#include <linux/math64.h>
29#include <net/tcp.h>
30
#define BICTCP_BETA_SCALE 1024	/* Scale factor for beta calculation:
				 * max_cwnd = snd_cwnd * beta / BICTCP_BETA_SCALE
				 */

#define BICTCP_HZ 10	/* BIC HZ 2^10 = 1024 (internal time unit) */

/* Two methods of hybrid slow start (bit flags in hystart_detect/found) */
#define HYSTART_ACK_TRAIN 0x1
#define HYSTART_DELAY 0x2

/* Number of delay samples for detecting the increase of delay */
#define HYSTART_MIN_SAMPLES 8
#define HYSTART_DELAY_MIN (4U<<3)	/* 4 ms, in msec << 3 fixed point */
#define HYSTART_DELAY_MAX (16U<<3)	/* 16 ms, in msec << 3 fixed point */
#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
45
static int fast_convergence __read_mostly = 1;
static int beta __read_mostly = 717;	/* = 717/1024 (BICTCP_BETA_SCALE) */
static int initial_ssthresh __read_mostly;
static int bic_scale __read_mostly = 41;	/* c/1024; cube_factor depends on it */
static int tcp_friendliness __read_mostly = 1;

static int hystart __read_mostly = 1;
static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
static int hystart_low_window __read_mostly = 16;	/* min cwnd before HyStart kicks in */
static int hystart_ack_delta __read_mostly = 2;	/* max ack spacing (ms) within a train */

/* Derived constants, precomputed once in cubictcp_register() */
static u32 cube_rtt_scale __read_mostly;
static u32 beta_scale __read_mostly;
static u64 cube_factor __read_mostly;
60
61
62module_param(fast_convergence, int, 0644);
63MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
64module_param(beta, int, 0644);
65MODULE_PARM_DESC(beta, "beta for multiplicative increase");
66module_param(initial_ssthresh, int, 0644);
67MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
68module_param(bic_scale, int, 0444);
69MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
70module_param(tcp_friendliness, int, 0644);
71MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
72module_param(hystart, int, 0644);
73MODULE_PARM_DESC(hystart, "turn on/off hybrid slow start algorithm");
74module_param(hystart_detect, int, 0644);
75MODULE_PARM_DESC(hystart_detect, "hyrbrid slow start detection mechanisms"
76 " 1: packet-train 2: delay 3: both packet-train and delay");
77module_param(hystart_low_window, int, 0644);
78MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
79module_param(hystart_ack_delta, int, 0644);
80MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)");
81
82
/* BIC TCP Parameters (per-connection state, lives in icsk_ca_priv) */
struct bictcp {
	u32	cnt;		/* increase cwnd by 1 after ACKs */
	u32	last_max_cwnd;	/* last maximum snd_cwnd (W_max) */
	u32	loss_cwnd;	/* congestion window at last loss */
	u32	last_cwnd;	/* the last snd_cwnd */
	u32	last_time;	/* time when updated last_cwnd */
	u32	bic_origin_point;/* origin point of bic function */
	u32	bic_K;		/* time to origin point
				 * from the beginning of the current epoch
				 * (unit: 1/2^BICTCP_HZ sec)
				 */
	u32	delay_min;	/* min delay (msec << 3) */
	u32	epoch_start;	/* beginning of an epoch (jiffies; 0 = none) */
	u32	ack_cnt;	/* number of acks in this epoch */
	u32	tcp_cwnd;	/* estimated reno-equivalent cwnd */
	u16	unused;
	u8	sample_cnt;	/* number of samples to decide curr_rtt */
	u8	found;		/* detectors that already fired (HYSTART_* mask) */
	u32	round_start;	/* beginning of each round (bictcp_clock ms) */
	u32	end_seq;	/* end_seq of the round */
	u32	last_ack;	/* last time when the ACK spacing is close */
	u32	curr_rtt;	/* the minimum rtt of current round */
};
104
105static inline void bictcp_reset(struct bictcp *ca)
106{
107 ca->cnt = 0;
108 ca->last_max_cwnd = 0;
109 ca->last_cwnd = 0;
110 ca->last_time = 0;
111 ca->bic_origin_point = 0;
112 ca->bic_K = 0;
113 ca->delay_min = 0;
114 ca->epoch_start = 0;
115 ca->ack_cnt = 0;
116 ca->tcp_cwnd = 0;
117 ca->found = 0;
118}
119
/* Millisecond-resolution clock for HyStart ACK-train timing.
 * With HZ < 1000 jiffies are too coarse for msec spacing, so fall back
 * to the real-time clock; otherwise jiffies are cheaper.
 */
static inline u32 bictcp_clock(void)
{
#if HZ < 1000
	return ktime_to_ms(ktime_get_real());
#else
	return jiffies_to_msecs(jiffies);
#endif
}
128
129static inline void bictcp_hystart_reset(struct sock *sk)
130{
131 struct tcp_sock *tp = tcp_sk(sk);
132 struct bictcp *ca = inet_csk_ca(sk);
133
134 ca->round_start = ca->last_ack = bictcp_clock();
135 ca->end_seq = tp->snd_nxt;
136 ca->curr_rtt = 0;
137 ca->sample_cnt = 0;
138}
139
140static void bictcp_init(struct sock *sk)
141{
142 struct bictcp *ca = inet_csk_ca(sk);
143
144 bictcp_reset(ca);
145 ca->loss_cwnd = 0;
146
147 if (hystart)
148 bictcp_hystart_reset(sk);
149
150 if (!hystart && initial_ssthresh)
151 tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
152}
153
static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
{
	if (event == CA_EVENT_TX_START) {
		struct bictcp *ca = inet_csk_ca(sk);
		u32 now = tcp_time_stamp;
		s32 delta;

		/* time since last send; positive if we were idle */
		delta = now - tcp_sk(sk)->lsndtime;

		/* We were application limited (idle) for a while.
		 * Shift epoch_start forward by the idle time so cwnd
		 * keeps growing along the cubic curve instead of
		 * jumping ahead as if the idle time had elapsed on-curve.
		 */
		if (ca->epoch_start && delta > 0) {
			ca->epoch_start += delta;
			/* never move epoch_start into the future */
			if (after(ca->epoch_start, now))
				ca->epoch_start = now;
		}
		return;
	}
}
174
175
176
177
178
/* Calculate the cubic root of x using a table lookup followed by one
 * Newton-Raphson iteration.  Average error is about 0.195%.
 */
static u32 cubic_root(u64 a)
{
	u32 x, b, shift;
	/*
	 * cbrt(x) MSB values for x MSB values in [0..63].
	 * Precomputed then refined by hand - Willy Tarreau
	 *
	 * For x in [0..63],
	 *   v = cbrt(x << 18) - 1
	 *   cbrt(x) = (v[x] + 10) >> 6
	 */
	static const u8 v[] = {
		0, 54, 54, 54, 118, 118, 118, 118,
		123, 129, 134, 138, 143, 147, 151, 156,
		157, 161, 164, 168, 170, 173, 176, 179,
		181, 185, 187, 190, 192, 194, 197, 199,
		200, 202, 204, 206, 209, 211, 213, 215,
		217, 219, 221, 222, 224, 225, 227, 229,
		231, 232, 234, 236, 237, 239, 240, 242,
		244, 245, 246, 248, 250, 251, 252, 254,
	};

	b = fls64(a);
	if (b < 7) {
		/* a in [0..63]: exact result straight from the table */
		return ((u32)v[(u32)a] + 35) >> 6;
	}

	b = ((b * 84) >> 8) - 1;	/* b = floor(fls64(a)/3) approx. */
	shift = (a >> (b * 3));

	/* initial estimate from the table entry for the top bits */
	x = ((u32)(((u32)v[shift] + 10) << b)) >> 6;

	/*
	 * Newton-Raphson iteration
	 *                         2
	 * x    = ( 2 * x  +  a / x  ) / 3
	 *  k+1          k         k
	 */
	x = (2 * x + (u32)div64_u64(a, (u64)x * (u64)(x - 1)));
	x = ((x * 341) >> 10);	/* divide by 3 via multiply (341/1024 ~= 1/3) */
	return x;
}
222
223
224
225
/*
 * Compute the congestion window increase rate (ca->cnt): cwnd is
 * increased by one packet for every ca->cnt ACKed packets.
 */
static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
{
	u32 delta, bic_target, max_cnt;
	u64 offs, t;

	ca->ack_cnt += acked;	/* count the number of ACKed packets */

	/* nothing changed recently enough to matter: keep previous cnt */
	if (ca->last_cwnd == cwnd &&
	    (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
		return;

	/* The CUBIC function can update ca->cnt at most once per jiffy.
	 * On all cwnd reduction events, ca->epoch_start is set to 0,
	 * which will force a recalculation of ca->cnt below.
	 */
	if (ca->epoch_start && tcp_time_stamp == ca->last_time)
		goto tcp_friendliness;

	ca->last_cwnd = cwnd;
	ca->last_time = tcp_time_stamp;

	if (ca->epoch_start == 0) {
		ca->epoch_start = tcp_time_stamp;	/* record beginning */
		ca->ack_cnt = acked;			/* start counting */
		ca->tcp_cwnd = cwnd;			/* sync with cubic */

		if (ca->last_max_cwnd <= cwnd) {
			/* already at/above W_max: origin is here, K = 0 */
			ca->bic_K = 0;
			ca->bic_origin_point = cwnd;
		} else {
			/* Compute new K based on
			 * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)
			 */
			ca->bic_K = cubic_root(cube_factor
					       * (ca->last_max_cwnd - cwnd));
			ca->bic_origin_point = ca->last_max_cwnd;
		}
	}

	/* cubic function - calc */
	/* calculate c * time^3 / rtt,
	 *  while considering overflow in calculation of time^3
	 * (so time^3 is done by using 64 bit)
	 * and without the support of division of 64bit numbers
	 * (so all divisions are done by using 32 bit)
	 *  also NOTE the unit of those variables
	 *	  time = (t - K) / 2^bictcp_HZ
	 *	  c = bic_scale >> 10
	 *	  rtt = (srtt >> 3) / HZ
	 * !!! The following code does not have overflow problems,
	 * if the cwnd < 1 million packets !!!
	 */

	t = (s32)(tcp_time_stamp - ca->epoch_start);
	t += msecs_to_jiffies(ca->delay_min >> 3);
	/* change the unit from HZ to bictcp_HZ */
	t <<= BICTCP_HZ;
	do_div(t, HZ);

	if (t < ca->bic_K)		/* |t - K| */
		offs = ca->bic_K - t;
	else
		offs = t - ca->bic_K;

	/* c/rtt * (t-K)^3 */
	delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
	if (t < ca->bic_K)				/* below origin */
		bic_target = ca->bic_origin_point - delta;
	else						/* above origin */
		bic_target = ca->bic_origin_point + delta;

	/* cubic function - calc bictcp_cnt */
	if (bic_target > cwnd) {
		ca->cnt = cwnd / (bic_target - cwnd);
	} else {
		ca->cnt = 100 * cwnd;		/* very small increment */
	}

	/*
	 * The initial growth of the cubic function may be too conservative
	 * when the available bandwidth is still unknown.
	 */
	if (ca->last_max_cwnd == 0 && ca->cnt > 20)
		ca->cnt = 20;	/* increase cwnd 5% per RTT */

tcp_friendliness:
	/* TCP Friendly: emulate Reno's AIMD and never grow slower than it */
	if (tcp_friendliness) {
		u32 scale = beta_scale;

		delta = (cwnd * scale) >> 3;
		while (ca->ack_cnt > delta) {		/* update tcp cwnd */
			ca->ack_cnt -= delta;
			ca->tcp_cwnd++;
		}

		if (ca->tcp_cwnd > cwnd) {	/* if bic is slower than tcp */
			delta = ca->tcp_cwnd - cwnd;
			max_cnt = cwnd / delta;
			if (ca->cnt > max_cnt)
				ca->cnt = max_cnt;
		}
	}

	/* The maximum rate of cwnd increase CUBIC allows is 1 packet per
	 * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT.
	 */
	ca->cnt = max(ca->cnt, 2U);
}
335
/* Main congestion-avoidance hook: run slow start (tracking HyStart
 * rounds) first; any ACKed packets left over feed CUBIC growth.
 * Order matters: tcp_slow_start() mutates snd_cwnd, which
 * bictcp_update() then reads.
 */
static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	if (!tcp_is_cwnd_limited(sk))
		return;

	if (tcp_in_slow_start(tp)) {
		/* a new HyStart round begins once the ack passes end_seq */
		if (hystart && after(ack, ca->end_seq))
			bictcp_hystart_reset(sk);
		acked = tcp_slow_start(tp, acked);
		if (!acked)
			return;	/* all acked segments consumed by slow start */
	}
	bictcp_update(ca, tp->snd_cwnd, acked);
	tcp_cong_avoid_ai(tp, ca->cnt, acked);
}
354
355static u32 bictcp_recalc_ssthresh(struct sock *sk)
356{
357 const struct tcp_sock *tp = tcp_sk(sk);
358 struct bictcp *ca = inet_csk_ca(sk);
359
360 ca->epoch_start = 0;
361
362
363 if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
364 ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
365 / (2 * BICTCP_BETA_SCALE);
366 else
367 ca->last_max_cwnd = tp->snd_cwnd;
368
369 ca->loss_cwnd = tp->snd_cwnd;
370
371 return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
372}
373
374static u32 bictcp_undo_cwnd(struct sock *sk)
375{
376 struct bictcp *ca = inet_csk_ca(sk);
377
378 return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
379}
380
381static void bictcp_state(struct sock *sk, u8 new_state)
382{
383 if (new_state == TCP_CA_Loss) {
384 bictcp_reset(inet_csk_ca(sk));
385 bictcp_hystart_reset(sk);
386 }
387}
388
/* Hybrid slow start: decide whether to exit slow start early by setting
 * ssthresh to the current cwnd.  Two independent detectors, selected by
 * hystart_detect: ACK-train length approaching the path's min delay,
 * and a rise in per-round minimum RTT above delay_min plus a threshold.
 */
static void hystart_update(struct sock *sk, u32 delay)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	/* a configured detector already fired for this connection */
	if (ca->found & hystart_detect)
		return;

	if (hystart_detect & HYSTART_ACK_TRAIN) {
		u32 now = bictcp_clock();

		/* first detection parameter - ack-train detection */
		if ((s32)(now - ca->last_ack) <= hystart_ack_delta) {
			ca->last_ack = now;
			/* train longer than half of delay_min (msec << 3,
			 * so >> 4 halves it): cwnd is near the BDP
			 */
			if ((s32)(now - ca->round_start) > ca->delay_min >> 4) {
				ca->found |= HYSTART_ACK_TRAIN;
				NET_INC_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTTRAINDETECT);
				NET_ADD_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTTRAINCWND,
					      tp->snd_cwnd);
				tp->snd_ssthresh = tp->snd_cwnd;
			}
		}
	}

	if (hystart_detect & HYSTART_DELAY) {
		/* obtain the minimum delay of more than sampling packets */
		if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
			if (ca->curr_rtt == 0 || ca->curr_rtt > delay)
				ca->curr_rtt = delay;

			ca->sample_cnt++;
		} else {
			/* enough samples: exit if this round's min RTT
			 * rose above delay_min by a clamped threshold
			 */
			if (ca->curr_rtt > ca->delay_min +
			    HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
				ca->found |= HYSTART_DELAY;
				NET_INC_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTDELAYDETECT);
				NET_ADD_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTDELAYCWND,
					      tp->snd_cwnd);
				tp->snd_ssthresh = tp->snd_cwnd;
			}
		}
	}
}
436
437
438
439
/* Per-ACK RTT sampling: maintain delay_min and feed HyStart while in
 * slow start above the low-window threshold.
 */
static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);
	u32 delay;

	/* Some calls are for duplicates without timestamps */
	if (sample->rtt_us < 0)
		return;

	/* Discard delay samples right after fast recovery */
	if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
		return;

	/* convert usec RTT to msec << 3 fixed point; never store 0,
	 * since 0 means "unset" for delay_min/curr_rtt
	 */
	delay = (sample->rtt_us << 3) / USEC_PER_MSEC;
	if (delay == 0)
		delay = 1;

	/* first time call or link delay decreases */
	if (ca->delay_min == 0 || ca->delay_min > delay)
		ca->delay_min = delay;

	/* hystart triggers when cwnd is larger than some threshold */
	if (hystart && tcp_in_slow_start(tp) &&
	    tp->snd_cwnd >= hystart_low_window)
		hystart_update(sk, delay);
}
467
/* Congestion-control operations registered with the TCP stack */
static struct tcp_congestion_ops cubictcp __read_mostly = {
	.init		= bictcp_init,
	.ssthresh	= bictcp_recalc_ssthresh,
	.cong_avoid	= bictcp_cong_avoid,
	.set_state	= bictcp_state,
	.undo_cwnd	= bictcp_undo_cwnd,
	.cwnd_event	= bictcp_cwnd_event,
	.pkts_acked	= bictcp_acked,
	.owner		= THIS_MODULE,
	.name		= "cubic",
};
479
static int __init cubictcp_register(void)
{
	/* per-connection state must fit in the socket's CA private area */
	BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);

	/* Precompute a bunch of the scaling factors that are used per-packet
	 * based on SRTT of 100ms
	 */

	beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3
		/ (BICTCP_BETA_SCALE - beta);

	cube_rtt_scale = (bic_scale * 10);	/* 1024*c/rtt in binary */

	/* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
	 *  so K = cubic_root( (wmax-cwnd)*rtt/c )
	 * the unit of K is BICTCP_HZ=2^10, not HZ
	 *
	 *  c = bic_scale >> 10
	 *  rtt = 100ms
	 *
	 * the following code has been designed and tested for
	 * cwnd < 1 million packets
	 * RTT < 100 seconds
	 * HZ < 1,000,00  (corresponding to 10 nano-second)
	 */

	/* 1/c * 2^2*bictcp_HZ * srtt */
	cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */

	/* divide by bic_scale and by constant Srtt (100ms) */
	do_div(cube_factor, bic_scale * 10);

	return tcp_register_congestion_control(&cubictcp);
}
514
/* Module exit: remove the "cubic" congestion control from the stack */
static void __exit cubictcp_unregister(void)
{
	tcp_unregister_congestion_control(&cubictcp);
}
519
module_init(cubictcp_register);
module_exit(cubictcp_unregister);

/* Module metadata */
MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("CUBIC TCP");
MODULE_VERSION("2.3");
527