27#include <linux/mm.h>
28#include <linux/module.h>
29#include <linux/math64.h>
30#include <net/tcp.h>
31
/* Scaling constants for the cubic window-growth computation. */
#define BICTCP_BETA_SCALE 1024	/* beta is expressed in units of 1/1024 */


#define BICTCP_HZ 10	/* internal time unit: 2^10 "ticks" per second */


/* Bit flags selecting the HyStart slow-start exit detection methods */
#define HYSTART_ACK_TRAIN 0x1	/* exit when the ACK train spans ~RTT */
#define HYSTART_DELAY 0x2	/* exit when the sampled RTT rises */


#define HYSTART_MIN_SAMPLES 8	/* RTT samples needed before delay check */
#define HYSTART_DELAY_MIN (4000U)	/* usec */
#define HYSTART_DELAY_MAX (16000U)	/* usec */
#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
46
/* Tunable parameters; exposed via module_param() below. */
static int fast_convergence __read_mostly = 1;
static int beta __read_mostly = 717;	/* = 717/1024 (~0.7) */
static int initial_ssthresh __read_mostly;
static int bic_scale __read_mostly = 41;	/* cubic "C" scaled by 1024 */
static int tcp_friendliness __read_mostly = 1;

static int hystart __read_mostly = 1;
static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
static int hystart_low_window __read_mostly = 16;	/* min cwnd for HyStart */
static int hystart_ack_delta_us __read_mostly = 2000;

/* Derived constants, computed once at load time in cubictcp_register() */
static u32 cube_rtt_scale __read_mostly;
static u32 beta_scale __read_mostly;
static u64 cube_factor __read_mostly;
61
62
/* Parameter registration; 0644 entries are writable at runtime via sysfs,
 * bic_scale is read-only (0444) because cube_factor is derived from it
 * once at module load.
 */
module_param(fast_convergence, int, 0644);
MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
module_param(beta, int, 0644);
MODULE_PARM_DESC(beta, "beta for multiplicative increase");
module_param(initial_ssthresh, int, 0644);
MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
module_param(bic_scale, int, 0444);
MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
module_param(tcp_friendliness, int, 0644);
MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
module_param(hystart, int, 0644);
MODULE_PARM_DESC(hystart, "turn on/off hybrid slow start algorithm");
module_param(hystart_detect, int, 0644);
MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms"
		 " 1: packet-train 2: delay 3: both packet-train and delay");
module_param(hystart_low_window, int, 0644);
MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
module_param(hystart_ack_delta_us, int, 0644);
MODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)");
82
83
/* Per-socket CUBIC state, stored in the inet_csk_ca() private area. */
struct bictcp {
	u32 cnt;		/* increase cwnd by 1 after this many ACKs */
	u32 last_max_cwnd;	/* last maximum snd_cwnd (Wmax) */
	u32 last_cwnd;		/* the last snd_cwnd seen by bictcp_update() */
	u32 last_time;		/* time (jiffies) when last_cwnd was updated */
	u32 bic_origin_point;	/* origin point of the cubic function */
	u32 bic_K;		/* time to origin point from epoch start,
				 * in units of 2^BICTCP_HZ per second */
	u32 delay_min;		/* minimum observed delay (usec) */
	u32 epoch_start;	/* beginning of the current epoch (jiffies) */
	u32 ack_cnt;		/* packets ACKed this epoch (friendliness) */
	u32 tcp_cwnd;		/* estimated Reno-equivalent cwnd */
	u16 unused;		/* marks end of the memset region in reset */
	u8 sample_cnt;		/* RTT samples taken to decide curr_rtt */
	u8 found;		/* HyStart exit point found? */
	u32 round_start;	/* beginning of each round (usec clock) */
	u32 end_seq;		/* snd_nxt at the start of the round */
	u32 last_ack;		/* last time the ACK spacing was close */
	u32 curr_rtt;		/* minimum RTT of the current round (usec) */
};
104
/* Zero the cubic state fields that precede @unused; the HyStart fields
 * (round_start..curr_rtt) keep their values, and @found — which lies
 * after @unused and so is NOT covered by the memset — is cleared
 * explicitly so slow-start exit detection can run again.
 */
static inline void bictcp_reset(struct bictcp *ca)
{
	memset(ca, 0, offsetof(struct bictcp, unused));
	ca->found = 0;
}
110
/* Return the socket's most recent timestamp (tcp_mstamp, usec resolution),
 * truncated to 32 bits; callers compare values with wrap-safe subtraction.
 */
static inline u32 bictcp_clock_us(const struct sock *sk)
{
	return tcp_sk(sk)->tcp_mstamp;
}
115
/* Start a new HyStart round: stamp the round start, mark the sequence
 * number that ends the round, and reset the per-round RTT sampling.
 */
static inline void bictcp_hystart_reset(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	ca->round_start = ca->last_ack = bictcp_clock_us(sk);
	ca->end_seq = tp->snd_nxt;	/* round ends when this is ACKed */
	ca->curr_rtt = ~0U;		/* min-tracking starts from "infinite" */
	ca->sample_cnt = 0;
}
126
127static void cubictcp_init(struct sock *sk)
128{
129 struct bictcp *ca = inet_csk_ca(sk);
130
131 bictcp_reset(ca);
132
133 if (hystart)
134 bictcp_hystart_reset(sk);
135
136 if (!hystart && initial_ssthresh)
137 tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
138}
139
140static void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
141{
142 if (event == CA_EVENT_TX_START) {
143 struct bictcp *ca = inet_csk_ca(sk);
144 u32 now = tcp_jiffies32;
145 s32 delta;
146
147 delta = now - tcp_sk(sk)->lsndtime;
148
149
150
151
152 if (ca->epoch_start && delta > 0) {
153 ca->epoch_start += delta;
154 if (after(ca->epoch_start, now))
155 ca->epoch_start = now;
156 }
157 return;
158 }
159}
160
161
162
163
164
/* Compute the integer cube root of @a using a small lookup table for an
 * initial estimate followed by one Newton-Raphson iteration.
 */
static u32 cubic_root(u64 a)
{
	u32 x, b, shift;

	/* Estimate table: for x in [0..63],
	 *   v[x] ~ 64 * cbrt(x) - 10   (scaled so (v[x] + 10) >> 6 ~ cbrt(x))
	 * NOTE(review): exact derivation of the constants is assumed from
	 * the table's use below — confirm against the table generator.
	 */
	static const u8 v[] = {
		/* 0x00 */    0,   54,   54,   54,  118,  118,  118,  118,
		/* 0x08 */  123,  129,  134,  138,  143,  147,  151,  156,
		/* 0x10 */  157,  161,  164,  168,  170,  173,  176,  179,
		/* 0x18 */  181,  185,  187,  190,  192,  194,  197,  199,
		/* 0x20 */  200,  202,  204,  206,  209,  211,  213,  215,
		/* 0x28 */  217,  219,  221,  222,  224,  225,  227,  229,
		/* 0x30 */  231,  232,  234,  236,  237,  239,  240,  242,
		/* 0x38 */  244,  245,  246,  248,  250,  251,  252,  254,
	};

	b = fls64(a);
	if (b < 7) {
		/* a is small (fits in 6 bits): table lookup alone suffices */
		return ((u32)v[(u32)a] + 35) >> 6;
	}

	/* b = ceil(b/3) - 1, computed as b*84/256 ~ b/3.05 */
	b = ((b * 84) >> 8) - 1;
	shift = (a >> (b * 3));

	/* Initial estimate: cbrt(a) ~ cbrt(a >> 3b) << b */
	x = ((u32)(((u32)v[shift] + 10) << b)) >> 6;

	/* One Newton-Raphson step:
	 *   x_{k+1} = (2 * x_k + a / x_k^2) / 3
	 * using x*(x-1) in place of x^2 and 341/1024 ~ 1/3.
	 */
	x = (2 * x + (u32)div64_u64(a, (u64)x * (u64)(x - 1)));
	x = ((x * 341) >> 10);
	return x;
}
208
209
210
211
/* Compute ca->cnt (ACKs needed per cwnd increment) from the cubic
 * window-growth function, with a TCP-friendliness lower bound.
 */
static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
{
	u32 delta, bic_target, max_cnt;
	u64 offs, t;

	ca->ack_cnt += acked;	/* count the number of ACKed packets */

	/* Rate-limit recomputation while cwnd is unchanged */
	if (ca->last_cwnd == cwnd &&
	    (s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32)
		return;

	/* The CUBIC function can update ca->cnt at most once per jiffy.
	 * On cwnd reduction, ca->epoch_start is cleared (see
	 * cubictcp_recalc_ssthresh), which forces a recalculation here.
	 */
	if (ca->epoch_start && tcp_jiffies32 == ca->last_time)
		goto tcp_friendliness;

	ca->last_cwnd = cwnd;
	ca->last_time = tcp_jiffies32;

	if (ca->epoch_start == 0) {
		ca->epoch_start = tcp_jiffies32;	/* record beginning */
		ca->ack_cnt = acked;			/* start counting */
		ca->tcp_cwnd = cwnd;			/* sync with cubic */

		if (ca->last_max_cwnd <= cwnd) {
			/* Already at/above Wmax: origin is here, K = 0 */
			ca->bic_K = 0;
			ca->bic_origin_point = cwnd;
		} else {
			/* Time to reach Wmax:
			 *   K = cubic_root((Wmax - cwnd) / C)
			 * with cube_factor = 2^(10+3*BICTCP_HZ) / (C*2^10)
			 * precomputed at module load.
			 */
			ca->bic_K = cubic_root(cube_factor
					* (ca->last_max_cwnd - cwnd));
			ca->bic_origin_point = ca->last_max_cwnd;
		}
	}

	/* Evaluate the cubic W(t) = C*(t-K)^3 + Wmax in fixed point.
	 * t is time since epoch start plus the minimum delay, converted
	 * from jiffies to 2^BICTCP_HZ units per second.  All cubing is
	 * done in 64-bit to avoid overflow (valid for cwnd < ~1M packets).
	 */
	t = (s32)(tcp_jiffies32 - ca->epoch_start);
	t += usecs_to_jiffies(ca->delay_min);
	/* change the unit from HZ to BICTCP_HZ */
	t <<= BICTCP_HZ;
	do_div(t, HZ);

	if (t < ca->bic_K)		/* |t - K| */
		offs = ca->bic_K - t;
	else
		offs = t - ca->bic_K;

	/* C/rtt * (t-K)^3, rescaled back from fixed point */
	delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
	if (t < ca->bic_K)		/* below origin */
		bic_target = ca->bic_origin_point - delta;
	else				/* above origin */
		bic_target = ca->bic_origin_point + delta;

	/* Translate target into an ACK count per cwnd increment */
	if (bic_target > cwnd) {
		ca->cnt = cwnd / (bic_target - cwnd);
	} else {
		ca->cnt = 100 * cwnd;	/* very small increment */
	}

	/* The initial growth of the cubic function may be too conservative
	 * when the available bandwidth is still unknown (no Wmax yet).
	 */
	if (ca->last_max_cwnd == 0 && ca->cnt > 20)
		ca->cnt = 20;	/* cap: increase cwnd 5% per RTT */

tcp_friendliness:
	/* Never grow slower than an equivalent Reno flow would */
	if (tcp_friendliness) {
		u32 scale = beta_scale;

		delta = (cwnd * scale) >> 3;
		while (ca->ack_cnt > delta) {	/* update estimated tcp cwnd */
			ca->ack_cnt -= delta;
			ca->tcp_cwnd++;
		}

		if (ca->tcp_cwnd > cwnd) {	/* cubic is slower than reno */
			delta = ca->tcp_cwnd - cwnd;
			max_cnt = cwnd / delta;
			if (ca->cnt > max_cnt)
				ca->cnt = max_cnt;
		}
	}

	/* Allow at most 1 packet of cwnd growth per 2 packets ACKed,
	 * i.e. cwnd grows at most 1.5x per RTT.
	 */
	ca->cnt = max(ca->cnt, 2U);
}
321
/* Main congestion-avoidance callback: run slow start (with HyStart round
 * bookkeeping) until ssthresh, then grow cwnd per the cubic function.
 */
static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	if (!tcp_is_cwnd_limited(sk))
		return;

	if (tcp_in_slow_start(tp)) {
		/* A full round of data was ACKed: start a new HyStart round */
		if (hystart && after(ack, ca->end_seq))
			bictcp_hystart_reset(sk);
		acked = tcp_slow_start(tp, acked);
		/* acked is now the leftover beyond ssthresh, if any */
		if (!acked)
			return;
	}
	bictcp_update(ca, tp->snd_cwnd, acked);
	tcp_cong_avoid_ai(tp, ca->cnt, acked);
}
340
341static u32 cubictcp_recalc_ssthresh(struct sock *sk)
342{
343 const struct tcp_sock *tp = tcp_sk(sk);
344 struct bictcp *ca = inet_csk_ca(sk);
345
346 ca->epoch_start = 0;
347
348
349 if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
350 ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
351 / (2 * BICTCP_BETA_SCALE);
352 else
353 ca->last_max_cwnd = tp->snd_cwnd;
354
355 return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
356}
357
358static void cubictcp_state(struct sock *sk, u8 new_state)
359{
360 if (new_state == TCP_CA_Loss) {
361 bictcp_reset(inet_csk_ca(sk));
362 bictcp_hystart_reset(sk);
363 }
364}
365
366
367
368
369
370
371
372
373
374
/* Extra ACK-train slack (usec) to account for TSO/GRO aggregation:
 * roughly the time four max-size GSO packets take at the current pacing
 * rate, capped at 1 ms.  Returns 0 when no pacing rate is known yet.
 */
static u32 hystart_ack_delay(struct sock *sk)
{
	unsigned long rate;

	rate = READ_ONCE(sk->sk_pacing_rate);
	if (!rate)
		return 0;
	return min_t(u64, USEC_PER_MSEC,
		     div64_ul((u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate));
}
385
/* HyStart: decide whether to exit slow start early, using either the
 * ACK-train length heuristic, the RTT-increase heuristic, or both
 * (selected by hystart_detect).  On detection, ssthresh is set to the
 * current cwnd so slow start ends at the next opportunity.
 */
static void hystart_update(struct sock *sk, u32 delay)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);
	u32 threshold;

	if (hystart_detect & HYSTART_ACK_TRAIN) {
		u32 now = bictcp_clock_us(sk);

		/* Is this ACK close enough to the previous one to extend
		 * the train?  (wrap-safe usec comparison)
		 */
		if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
			ca->last_ack = now;

			threshold = ca->delay_min + hystart_ack_delay(sk);

			/* Without pacing, ACKs arrive bunched two-by-two
			 * (GRO/delayed ACKs), so the train covers the path
			 * in about half the time — halve the threshold.
			 */
			if (sk->sk_pacing_status == SK_PACING_NONE)
				threshold >>= 1;

			/* Train has spanned (roughly) the min RTT: the
			 * pipe is full, stop slow start.
			 */
			if ((s32)(now - ca->round_start) > threshold) {
				ca->found = 1;
				pr_debug("hystart_ack_train (%u > %u) delay_min %u (+ ack_delay %u) cwnd %u\n",
					 now - ca->round_start, threshold,
					 ca->delay_min, hystart_ack_delay(sk), tp->snd_cwnd);
				NET_INC_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTTRAINDETECT);
				NET_ADD_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTTRAINCWND,
					      tp->snd_cwnd);
				tp->snd_ssthresh = tp->snd_cwnd;
			}
		}
	}

	if (hystart_detect & HYSTART_DELAY) {
		/* Track the minimum RTT seen this round */
		if (ca->curr_rtt > delay)
			ca->curr_rtt = delay;
		if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
			ca->sample_cnt++;
		} else {
			/* Enough samples: if the round's min RTT exceeds
			 * delay_min by a clamped delay_min/8, queues are
			 * building — stop slow start.
			 */
			if (ca->curr_rtt > ca->delay_min +
			    HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
				ca->found = 1;
				NET_INC_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTDELAYDETECT);
				NET_ADD_STATS(sock_net(sk),
					      LINUX_MIB_TCPHYSTARTDELAYCWND,
					      tp->snd_cwnd);
				tp->snd_ssthresh = tp->snd_cwnd;
			}
		}
	}
}
444
/* Per-ACK RTT sample hook: maintain delay_min and feed HyStart while in
 * slow start.
 */
static void cubictcp_acked(struct sock *sk, const struct ack_sample *sample)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);
	u32 delay;

	/* Some calls are for duplicates without timestamps */
	if (sample->rtt_us < 0)
		return;

	/* Discard delay samples right after fast recovery (within 1 sec
	 * of the new epoch) — they are inflated by retransmissions.
	 */
	if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ)
		return;

	delay = sample->rtt_us;
	if (delay == 0)
		delay = 1;	/* 0 is reserved for "unset" delay_min */

	/* first time call or link delay decreases */
	if (ca->delay_min == 0 || ca->delay_min > delay)
		ca->delay_min = delay;

	/* HyStart triggers only above a minimum cwnd, until an exit
	 * point has been found.
	 */
	if (!ca->found && tcp_in_slow_start(tp) && hystart &&
	    tp->snd_cwnd >= hystart_low_window)
		hystart_update(sk, delay);
}
472
/* CUBIC's congestion-control callbacks, registered with the TCP stack
 * under the name "cubic".
 */
static struct tcp_congestion_ops cubictcp __read_mostly = {
	.init		= cubictcp_init,
	.ssthresh	= cubictcp_recalc_ssthresh,
	.cong_avoid	= cubictcp_cong_avoid,
	.set_state	= cubictcp_state,
	.undo_cwnd	= tcp_reno_undo_cwnd,
	.cwnd_event	= cubictcp_cwnd_event,
	.pkts_acked	= cubictcp_acked,
	.owner		= THIS_MODULE,
	.name		= "cubic",
};
484
/* Module init: precompute the per-packet scaling factors from the load
 * time values of beta and bic_scale, then register the algorithm.
 */
static int __init cubictcp_register(void)
{
	/* Private state must fit in the socket's CA scratch area */
	BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);

	/* beta_scale = 8/3 * BICTCP_BETA_SCALE * (1+beta)/(1-beta),
	 * used (>> 3) in the TCP-friendliness estimate.
	 */
	beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3
		/ (BICTCP_BETA_SCALE - beta);

	cube_rtt_scale = (bic_scale * 10);	/* 1024*C/rtt */

	/* cube_factor converts (Wmax - cwnd) into the argument of
	 * cubic_root() when computing K:
	 *   K = cubic_root((Wmax-cwnd) / C)  in 2^BICTCP_HZ units,
	 * so cube_factor = 2^(10+3*BICTCP_HZ) / (bic_scale * 10).
	 * NOTE(review): the fixed-point budget assumes cwnd < ~1M packets
	 * (see bictcp_update) — confirm before changing BICTCP_HZ.
	 */
	cube_factor = 1ull << (10+3*BICTCP_HZ);	/* 2^40 */

	/* divide by bic scale */
	do_div(cube_factor, bic_scale * 10);

	return tcp_register_congestion_control(&cubictcp);
}
519
/* Module exit: remove "cubic" from the available congestion controls. */
static void __exit cubictcp_unregister(void)
{
	tcp_unregister_congestion_control(&cubictcp);
}
524
module_init(cubictcp_register);
module_exit(cubictcp_unregister);

MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("CUBIC TCP");
MODULE_VERSION("2.3");
532