// SPDX-License-Identifier: GPL-2.0-only

/* CUBIC congestion control as a BPF struct_ops program.
 * Ported from net/ipv4/tcp_cubic.c mainly to exercise the kernel's
 * BPF TCP congestion-control hooks; it is not meant for production use.
 */

#include <linux/bpf.h>
#include <linux/stddef.h>
#include <linux/tcp.h>
#include "bpf_tcp_helpers.h"

char _license[] SEC("license") = "GPL";

#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)

#define BICTCP_BETA_SCALE 1024	/* Scale factor for beta calculation:
				 * max_cwnd = snd_cwnd * beta
				 */
#define BICTCP_HZ 10	/* BIC HZ 2^10 = 1024 */

/* Two methods of hybrid slow start */
#define HYSTART_ACK_TRAIN 0x1
#define HYSTART_DELAY 0x2

/* Number of delay samples for detecting the increase of delay */
#define HYSTART_MIN_SAMPLES 8
#define HYSTART_DELAY_MIN (4000U)	/* 4 ms */
#define HYSTART_DELAY_MAX (16000U)	/* 16 ms */
#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)

static int fast_convergence = 1;
static const int beta = 717;	/* = 717/1024 (BICTCP_BETA_SCALE) */
static int initial_ssthresh;
static const int bic_scale = 41;
static int tcp_friendliness = 1;

static int hystart = 1;
static int hystart_detect = HYSTART_ACK_TRAIN | HYSTART_DELAY;
static int hystart_low_window = 16;
static int hystart_ack_delta_us = 2000;

static const __u32 cube_rtt_scale = (bic_scale * 10);	/* 1024*c/rtt */
static const __u32 beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3
				/ (BICTCP_BETA_SCALE - beta);
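
/* K is the time for the cubic function to grow back to Wmax, the window
 * where the last loss occurred:
 *
 *	W(t) = C * (t - K)^3 + Wmax,  so  K = cbrt((Wmax - cwnd) / C)
 *
 * where C = (bic_scale >> 10) / rtt with rtt fixed at 100 ms, i.e.
 * C = bic_scale * 10 / 2^10 ~= 0.4.  bic_K is computed as
 * cubic_root(cube_factor * (Wmax - cwnd)) and is carried in units of
 * 2^BICTCP_HZ per second, not HZ.  With cube_factor below equal to
 * 2^(10 + 3*BICTCP_HZ) / (bic_scale * 10), the 64-bit math stays free of
 * overflow as long as cwnd stays below roughly one million packets.
 */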
static const __u64 cube_factor = (__u64)(1ull << (10+3*BICTCP_HZ))
				/ (bic_scale * 10);

struct bictcp {
	__u32 cnt;		/* increase cwnd by 1 after this many ACKs */
	__u32 last_max_cwnd;	/* last maximum snd_cwnd */
	__u32 last_cwnd;	/* the last snd_cwnd */
	__u32 last_time;	/* time when last_cwnd was updated */
	__u32 bic_origin_point;	/* origin point of the bic function */
	__u32 bic_K;		/* time to origin point
				 * from the beginning of the current epoch
				 */
	__u32 delay_min;	/* min delay (usec) */
	__u32 epoch_start;	/* beginning of an epoch */
	__u32 ack_cnt;		/* number of acks */
	__u32 tcp_cwnd;		/* estimated tcp cwnd */
	__u16 unused;
	__u8 sample_cnt;	/* number of samples to decide curr_rtt */
	__u8 found;		/* the exit point is found? */
	__u32 round_start;	/* beginning of each round */
	__u32 end_seq;		/* end_seq of the round */
	__u32 last_ack;		/* last time when the ACK spacing is close */
	__u32 curr_rtt;		/* the minimum rtt of current round */
};

static inline void bictcp_reset(struct bictcp *ca)
{
	ca->cnt = 0;
	ca->last_max_cwnd = 0;
	ca->last_cwnd = 0;
	ca->last_time = 0;
	ca->bic_origin_point = 0;
	ca->bic_K = 0;
	ca->delay_min = 0;
	ca->epoch_start = 0;
	ca->ack_cnt = 0;
	ca->tcp_cwnd = 0;
	ca->found = 0;
}

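/* HZ is taken from the running kernel's configuration: libbpf resolves
 * the __kconfig extern at load time, so USEC_PER_JIFFY below matches
 * whatever kernel this program is attached to.
 */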
extern unsigned long CONFIG_HZ __kconfig;
#define HZ CONFIG_HZ
#define USEC_PER_MSEC 1000UL
#define USEC_PER_SEC 1000000UL
#define USEC_PER_JIFFY (USEC_PER_SEC / HZ)

/* BPF supports 64-bit division directly; this helper only mirrors the
 * kernel's div64_u64()/div64_ul() names used by the original code.
 */
static __always_inline __u64 div64_u64(__u64 dividend, __u64 divisor)
{
	return dividend / divisor;
}

#define div64_ul div64_u64

#define BITS_PER_U64 (sizeof(__u64) * 8)

/* Find last (most significant) set bit, 1-based: fls64(0) == 0,
 * fls64(1) == 1, fls64(1ULL << 63) == 64.
 */
static __always_inline int fls64(__u64 x)
{
	int num = BITS_PER_U64 - 1;

	if (x == 0)
		return 0;

	if (!(x & (~0ull << (BITS_PER_U64-32)))) {
		num -= 32;
		x <<= 32;
	}
	if (!(x & (~0ull << (BITS_PER_U64-16)))) {
		num -= 16;
		x <<= 16;
	}
	if (!(x & (~0ull << (BITS_PER_U64-8)))) {
		num -= 8;
		x <<= 8;
	}
	if (!(x & (~0ull << (BITS_PER_U64-4)))) {
		num -= 4;
		x <<= 4;
	}
	if (!(x & (~0ull << (BITS_PER_U64-2)))) {
		num -= 2;
		x <<= 2;
	}
	if (!(x & (~0ull << (BITS_PER_U64-1))))
		num -= 1;

	return num + 1;
}

/* tcp_mstamp is the usec-resolution timestamp of the most recent packet. */
static __always_inline __u32 bictcp_clock_us(const struct sock *sk)
{
	return tcp_sk(sk)->tcp_mstamp;
}

/* Start a new HyStart round: one round covers roughly one cwnd of data,
 * ending once snd_nxt at the start of the round has been cumulatively ACKed.
 */
static __always_inline void bictcp_hystart_reset(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	ca->round_start = ca->last_ack = bictcp_clock_us(sk);
	ca->end_seq = tp->snd_nxt;
	ca->curr_rtt = ~0U;
	ca->sample_cnt = 0;
}

SEC("struct_ops/bpf_cubic_init")
void BPF_PROG(bpf_cubic_init, struct sock *sk)
{
	struct bictcp *ca = inet_csk_ca(sk);

	bictcp_reset(ca);

	if (hystart)
		bictcp_hystart_reset(sk);

	if (!hystart && initial_ssthresh)
		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}

SEC("struct_ops/bpf_cubic_cwnd_event")
void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event)
{
	if (event == CA_EVENT_TX_START) {
		struct bictcp *ca = inet_csk_ca(sk);
		__u32 now = tcp_jiffies32;
		__s32 delta;

		delta = now - tcp_sk(sk)->lsndtime;

		/* We were application limited (idle) for a while.
		 * Shift epoch_start to keep cwnd growth on the cubic curve.
		 */
		if (ca->epoch_start && delta > 0) {
			ca->epoch_start += delta;
			if (after(ca->epoch_start, now))
				ca->epoch_start = now;
		}
		return;
	}
}

/*
 * cbrt(x) MSB values for x MSB values in [0..63].
 * Precomputed then refined by hand - Willy Tarreau
 *
 * For x in [0..63],
 *   v = cbrt(x << 18) - 1
 *   cbrt(x) = (v[x] + 10) >> 6
 */
static const __u8 v[] = {
	0, 54, 54, 54, 118, 118, 118, 118,
	123, 129, 134, 138, 143, 147, 151, 156,
	157, 161, 164, 168, 170, 173, 176, 179,
	181, 185, 187, 190, 192, 194, 197, 199,
	200, 202, 204, 206, 209, 211, 213, 215,
	217, 219, 221, 222, 224, 225, 227, 229,
	231, 232, 234, 236, 237, 239, 240, 242,
	244, 245, 246, 248, 250, 251, 252, 254,
};

/* Calculate the cubic root of x using a table lookup followed by one
 * Newton-Raphson iteration.
 * Avg err ~= 0.195%
 */
static __always_inline __u32 cubic_root(__u64 a)
{
	__u32 x, b, shift;

	if (a < 64) {
		/* a in [0..63] */
		return ((__u32)v[(__u32)a] + 35) >> 6;
	}

	b = fls64(a);
	b = ((b * 84) >> 8) - 1;
	shift = (a >> (b * 3));

	/* Needed for the verifier's bounds check on v[] */
	if (shift >= 64)
		return 0;

	x = ((__u32)(((__u32)v[shift] + 10) << b)) >> 6;

	/*
	 * Newton-Raphson iteration
	 *                         2
	 * x    = ( 2 * x  +  a / x  ) / 3
	 *  k+1          k         k
	 */
	x = (2 * x + (__u32)div64_u64(a, (__u64)x * (__u64)(x - 1)));
	x = ((x * 341) >> 10);	/* 341/1024 ~= 1/3 */
	return x;
}

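/* The CUBIC growth function in its usual form, with t in seconds since
 * the start of the current epoch:
 *
 *	W(t) = C * (t - K)^3 + Wmax
 *
 * bictcp_update() evaluates it in fixed point: t carries 2^BICTCP_HZ
 * units per second, delta is C*(t-K)^3 scaled back down by
 * 2^(10+3*BICTCP_HZ), and the resulting bic_target is turned into
 * ca->cnt, the number of ACKs required per 1-packet cwnd increase
 * consumed by tcp_cong_avoid_ai().
 */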
/*
 * Compute the congestion window to use.
 */
static __always_inline void bictcp_update(struct bictcp *ca, __u32 cwnd,
					  __u32 acked)
{
	__u32 delta, bic_target, max_cnt;
	__u64 offs, t;

	ca->ack_cnt += acked;	/* count the number of ACKed packets */

	if (ca->last_cwnd == cwnd &&
	    (__s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32)
		return;

	/* The CUBIC function can update ca->cnt at most once per jiffy.
	 * On all cwnd reduction events, ca->epoch_start is set to 0,
	 * which will force a recalculation of ca->cnt.
	 */
	if (ca->epoch_start && tcp_jiffies32 == ca->last_time)
		goto tcp_friendliness;

	ca->last_cwnd = cwnd;
	ca->last_time = tcp_jiffies32;

	if (ca->epoch_start == 0) {
		ca->epoch_start = tcp_jiffies32;	/* record beginning */
		ca->ack_cnt = acked;			/* start counting */
		ca->tcp_cwnd = cwnd;			/* sync tcp_cwnd */

		if (ca->last_max_cwnd <= cwnd) {
			ca->bic_K = 0;
			ca->bic_origin_point = cwnd;
		} else {
			/* Compute new K based on
			 * (Wmax - cwnd) and cube_factor, see above.
			 */
			ca->bic_K = cubic_root(cube_factor
					       * (ca->last_max_cwnd - cwnd));
			ca->bic_origin_point = ca->last_max_cwnd;
		}
	}

	/* cubic function - calc */
	/* calculate t = (t - K) / 2^BICTCP_HZ */

	/* Elapsed time since the start of the epoch, in usec, plus one
	 * min-RTT so we aim at where cwnd should be after the next RTT.
	 */
	t = (__s32)(tcp_jiffies32 - ca->epoch_start) * USEC_PER_JIFFY;
	t += ca->delay_min;
	/* change the unit from usec to bictcp_HZ */
	t <<= BICTCP_HZ;
	t /= USEC_PER_SEC;

	if (t < ca->bic_K)		/* t - K */
		offs = ca->bic_K - t;
	else
		offs = t - ca->bic_K;

	/* c/rtt * (t-K)^3 */
	delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
	if (t < ca->bic_K)			/* below origin */
		bic_target = ca->bic_origin_point - delta;
	else					/* above origin */
		bic_target = ca->bic_origin_point + delta;

	/* cubic function - calc bictcp_cnt */
	if (bic_target > cwnd) {
		ca->cnt = cwnd / (bic_target - cwnd);
	} else {
		ca->cnt = 100 * cwnd;	/* very small increment */
	}

	/*
	 * The initial growth of the cubic function may be too conservative
	 * when the available bandwidth is still unknown.
	 */
	if (ca->last_max_cwnd == 0 && ca->cnt > 20)
		ca->cnt = 20;	/* increase cwnd 5% per RTT */

tcp_friendliness:
	/* TCP Friendly */
	if (tcp_friendliness) {
		__u32 scale = beta_scale;
		__u32 n;

		/* update tcp cwnd */
		delta = (cwnd * scale) >> 3;
		if (ca->ack_cnt > delta && delta) {
			n = ca->ack_cnt / delta;
			ca->ack_cnt -= n * delta;
			ca->tcp_cwnd += n;
		}

		if (ca->tcp_cwnd > cwnd) {	/* if bic is slower than tcp */
			delta = ca->tcp_cwnd - cwnd;
			max_cnt = cwnd / delta;
			if (ca->cnt > max_cnt)
				ca->cnt = max_cnt;
		}
	}

	/* The maximum rate of cwnd increase CUBIC allows is 1 packet per
	 * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT.
	 */
	ca->cnt = max(ca->cnt, 2U);
}

void BPF_STRUCT_OPS(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	if (!tcp_is_cwnd_limited(sk))
		return;

	if (tcp_in_slow_start(tp)) {
		if (hystart && after(ack, ca->end_seq))
			bictcp_hystart_reset(sk);
		acked = tcp_slow_start(tp, acked);
		if (!acked)
			return;
	}
	bictcp_update(ca, tp->snd_cwnd, acked);
	tcp_cong_avoid_ai(tp, ca->cnt, acked);
}

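/* On packet loss, CUBIC's multiplicative decrease uses beta/1024: with
 * beta = 717 the new ssthresh is ~70% of cwnd (e.g. cwnd = 100 gives
 * ssthresh = max(100 * 717 / 1024, 2) = 70).  With fast convergence,
 * Wmax is additionally lowered to cwnd * (1024 + 717) / 2048 ~= 85% of
 * the old cwnd, releasing bandwidth to newer flows.
 */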
__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	ca->epoch_start = 0;	/* end of epoch */

	/* Wmax and fast convergence */
	if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
		ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
			/ (2 * BICTCP_BETA_SCALE);
	else
		ca->last_max_cwnd = tp->snd_cwnd;

	return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
}

void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state)
{
	if (new_state == TCP_CA_Loss) {
		bictcp_reset(inet_csk_ca(sk));
		bictcp_hystart_reset(sk);
	}
}

#define GSO_MAX_SIZE 65536

/* Account for TSO/GRO delays.
 * Otherwise short RTT flows could get too small an ssthresh, since during
 * slow start we begin with small TSO packets and ca->delay_min would
 * account for long aggregation delay when TSO packets get bigger.
 * The estimate is how long it takes to send 4 * GSO_MAX_SIZE bytes at
 * the current pacing rate, capped at 1 ms.
 */
static __always_inline __u32 hystart_ack_delay(struct sock *sk)
{
	unsigned long rate;

	rate = sk->sk_pacing_rate;
	if (!rate)
		return 0;
	return min((__u64)USEC_PER_MSEC,
		   div64_ul((__u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate));
}
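
/* HyStart: try to leave slow start before losses occur, using two
 * heuristics.  ACK-train detection fires when the ACKs of one round have
 * stretched over more than the threshold (roughly the minimum RTT,
 * halved for non-paced flows).  Delay detection fires when the smallest
 * RTT of at least HYSTART_MIN_SAMPLES samples in a round exceeds
 * delay_min by delay_min/8, clamped to 4..16 ms.
 */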
static __always_inline void hystart_update(struct sock *sk, __u32 delay)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);
	__u32 threshold;

	if (hystart_detect & HYSTART_ACK_TRAIN) {
		__u32 now = bictcp_clock_us(sk);

		/* first detection parameter - ack-train detection */
		if ((__s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
			ca->last_ack = now;

			threshold = ca->delay_min + hystart_ack_delay(sk);

			/* Hystart ack train triggers if we get ack past
			 * ca->delay_min/2.
			 * Pacing might have delayed packets up to RTT/2
			 * during slow start.
			 */
			if (sk->sk_pacing_status == SK_PACING_NONE)
				threshold >>= 1;

			if ((__s32)(now - ca->round_start) > threshold) {
				ca->found = 1;
				tp->snd_ssthresh = tp->snd_cwnd;
			}
		}
	}

	if (hystart_detect & HYSTART_DELAY) {
		/* obtain the minimum delay of more than sampling packets */
		if (ca->curr_rtt > delay)
			ca->curr_rtt = delay;
		if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
			ca->sample_cnt++;
		} else {
			if (ca->curr_rtt > ca->delay_min +
			    HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
				ca->found = 1;
				tp->snd_ssthresh = tp->snd_cwnd;
			}
		}
	}
}

void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk,
		    const struct ack_sample *sample)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);
	__u32 delay;

	/* Some calls are for duplicates without timestamps */
	if (sample->rtt_us < 0)
		return;

	/* Discard delay samples right after fast recovery */
	if (ca->epoch_start && (__s32)(tcp_jiffies32 - ca->epoch_start) < HZ)
		return;

	delay = sample->rtt_us;
	if (delay == 0)
		delay = 1;

	/* first time call or link delay decreases */
	if (ca->delay_min == 0 || ca->delay_min > delay)
		ca->delay_min = delay;

	/* hystart triggers when cwnd is larger than some threshold */
	if (!ca->found && tcp_in_slow_start(tp) && hystart &&
	    tp->snd_cwnd >= hystart_low_window)
		hystart_update(sk, delay);
}

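/* CUBIC reuses Reno's undo logic: tcp_reno_undo_cwnd() is the kernel's
 * own function, reached through a __ksym extern rather than being
 * reimplemented in BPF.
 */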
extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym;

__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk)
{
	return tcp_reno_undo_cwnd(sk);
}

SEC(".struct_ops")
struct tcp_congestion_ops cubic = {
	.init		= (void *)bpf_cubic_init,
	.ssthresh	= (void *)bpf_cubic_recalc_ssthresh,
	.cong_avoid	= (void *)bpf_cubic_cong_avoid,
	.set_state	= (void *)bpf_cubic_state,
	.undo_cwnd	= (void *)bpf_cubic_undo_cwnd,
	.cwnd_event	= (void *)bpf_cubic_cwnd_event,
	.pkts_acked	= (void *)bpf_cubic_acked,
	.name		= "bpf_cubic",
};