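/* BPF struct_ops implementation of the CUBIC congestion control
 * algorithm with hybrid slow start (HyStart), registered below under
 * the name "bpf_cubic".  The logic closely follows the kernel's native
 * CUBIC (net/ipv4/tcp_cubic.c), but runs as a BPF program and reads
 * CONFIG_HZ through the .kconfig map.
 */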
#include <linux/bpf.h>
#include <linux/stddef.h>
#include <linux/tcp.h>
#include "bpf_tcp_helpers.h"

char _license[] SEC("license") = "GPL";

#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)

#define BICTCP_BETA_SCALE 1024	/* scale factor for beta calculation */

#define BICTCP_HZ 10	/* BIC HZ 2^10 = 1024 */

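/* Two detection heuristics used by hybrid slow start (HyStart) */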
#define HYSTART_ACK_TRAIN	0x1
#define HYSTART_DELAY		0x2

/* number of delay samples used to detect an RTT increase */
#define HYSTART_MIN_SAMPLES	8
#define HYSTART_DELAY_MIN	(4000U)		/* 4 ms */
#define HYSTART_DELAY_MAX	(16000U)	/* 16 ms */
#define HYSTART_DELAY_THRESH(x)	clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)

static int fast_convergence = 1;
static const int beta = 717;	/* = 717/1024 (BICTCP_BETA_SCALE) */
static int initial_ssthresh;
static const int bic_scale = 41;
static int tcp_friendliness = 1;

static int hystart = 1;
static int hystart_detect = HYSTART_ACK_TRAIN | HYSTART_DELAY;
static int hystart_low_window = 16;
static int hystart_ack_delta_us = 2000;

static const __u32 cube_rtt_scale = (bic_scale * 10);	/* 1024*c/rtt */
static const __u32 beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3
				/ (BICTCP_BETA_SCALE - beta);

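/* cube_factor = 2^(10+3*BICTCP_HZ) / (bic_scale * 10).  bictcp_update()
 * uses it to derive K = cubic_root(cube_factor * (Wmax - cwnd)), the time
 * needed to reach the previous maximum window, expressed in seconds
 * scaled by 2^BICTCP_HZ.
 */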
static const __u64 cube_factor = (__u64)(1ull << (10+3*BICTCP_HZ))
				/ (bic_scale * 10);

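/* Per-connection CUBIC state, kept in the socket's inet_csk_ca() area */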
struct bictcp {
	__u32	cnt;		/* ACKs needed to increase cwnd by 1 */
	__u32	last_max_cwnd;	/* last maximum snd_cwnd (Wmax) */
	__u32	last_cwnd;	/* the last snd_cwnd */
	__u32	last_time;	/* time when last_cwnd was updated */
	__u32	bic_origin_point; /* origin point of the cubic function */
	__u32	bic_K;		/* time to origin point from the beginning
				 * of the current epoch
				 */
	__u32	delay_min;	/* min delay (usec) */
	__u32	epoch_start;	/* beginning of an epoch */
	__u32	ack_cnt;	/* number of acks */
	__u32	tcp_cwnd;	/* estimated tcp cwnd */
	__u16	unused;
	__u8	sample_cnt;	/* number of samples to decide curr_rtt */
	__u8	found;		/* the slow-start exit point is found */
	__u32	round_start;	/* beginning of each HyStart round */
	__u32	end_seq;	/* end_seq of the round */
	__u32	last_ack;	/* last time when the ACK spacing was close */
	__u32	curr_rtt;	/* minimum rtt of the current round */
};

static inline void bictcp_reset(struct bictcp *ca)
{
	ca->cnt = 0;
	ca->last_max_cwnd = 0;
	ca->last_cwnd = 0;
	ca->last_time = 0;
	ca->bic_origin_point = 0;
	ca->bic_K = 0;
	ca->delay_min = 0;
	ca->epoch_start = 0;
	ca->ack_cnt = 0;
	ca->tcp_cwnd = 0;
	ca->found = 0;
}

extern unsigned long CONFIG_HZ __kconfig;	/* resolved from the running kernel's config at load time */
#define HZ CONFIG_HZ
#define USEC_PER_MSEC	1000UL
#define USEC_PER_SEC	1000000UL
#define USEC_PER_JIFFY	(USEC_PER_SEC / HZ)

static __always_inline __u64 div64_u64(__u64 dividend, __u64 divisor)
{
	return dividend / divisor;
}

#define div64_ul div64_u64

#define BITS_PER_U64 (sizeof(__u64) * 8)

/* Find last (most significant) set bit, 1-based; returns 0 for x == 0. */
static __always_inline int fls64(__u64 x)
{
	int num = BITS_PER_U64 - 1;

	if (x == 0)
		return 0;

	if (!(x & (~0ull << (BITS_PER_U64-32)))) {
		num -= 32;
		x <<= 32;
	}
	if (!(x & (~0ull << (BITS_PER_U64-16)))) {
		num -= 16;
		x <<= 16;
	}
	if (!(x & (~0ull << (BITS_PER_U64-8)))) {
		num -= 8;
		x <<= 8;
	}
	if (!(x & (~0ull << (BITS_PER_U64-4)))) {
		num -= 4;
		x <<= 4;
	}
	if (!(x & (~0ull << (BITS_PER_U64-2)))) {
		num -= 2;
		x <<= 2;
	}
	if (!(x & (~0ull << (BITS_PER_U64-1))))
		num -= 1;

	return num + 1;
}

static __always_inline __u32 bictcp_clock_us(const struct sock *sk)
{
	return tcp_sk(sk)->tcp_mstamp;	/* usec timestamp */
}

static __always_inline void bictcp_hystart_reset(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	ca->round_start = ca->last_ack = bictcp_clock_us(sk);
	ca->end_seq = tp->snd_nxt;
	ca->curr_rtt = ~0U;
	ca->sample_cnt = 0;
}

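/* Connection setup: reset the CUBIC state and, when HyStart is disabled,
 * optionally seed snd_ssthresh from initial_ssthresh.
 */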
SEC("struct_ops/bpf_cubic_init")
void BPF_PROG(bpf_cubic_init, struct sock *sk)
{
	struct bictcp *ca = inet_csk_ca(sk);

	bictcp_reset(ca);

	if (hystart)
		bictcp_hystart_reset(sk);

	if (!hystart && initial_ssthresh)
		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}

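/* The SEC name below intentionally omits the usual "struct_ops/" prefix;
 * the program is still wired up through the .cwnd_event field of the
 * struct_ops map at the end of this file.
 */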
SEC("no-sec-prefix-bictcp_cwnd_event")
void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event)
{
	if (event == CA_EVENT_TX_START) {
		struct bictcp *ca = inet_csk_ca(sk);
		__u32 now = tcp_jiffies32;
		__s32 delta;

		delta = now - tcp_sk(sk)->lsndtime;

		/* We were application limited (idle) for a while.
		 * Shift epoch_start to keep cwnd growth on the cubic curve.
		 */
		if (ca->epoch_start && delta > 0) {
			ca->epoch_start += delta;
			if (after(ca->epoch_start, now))
				ca->epoch_start = now;
		}
		return;
	}
}

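/* Lookup table: most-significant cube-root bits for the top 6 bits of the
 * argument; cubic_root() refines the result with one Newton-Raphson step.
 */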
static const __u8 v[] = {
	/* 0x00 */    0,   54,   54,   54,  118,  118,  118,  118,
	/* 0x08 */  123,  129,  134,  138,  143,  147,  151,  156,
	/* 0x10 */  157,  161,  164,  168,  170,  173,  176,  179,
	/* 0x18 */  181,  185,  187,  190,  192,  194,  197,  199,
	/* 0x20 */  200,  202,  204,  206,  209,  211,  213,  215,
	/* 0x28 */  217,  219,  221,  222,  224,  225,  227,  229,
	/* 0x30 */  231,  232,  234,  236,  237,  239,  240,  242,
	/* 0x38 */  244,  245,  246,  248,  250,  251,  252,  254,
};

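/* Calculate the cubic root of a 64-bit value using the lookup table above
 * followed by a single Newton-Raphson iteration.
 */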
static __always_inline __u32 cubic_root(__u64 a)
{
	__u32 x, b, shift;

	if (a < 64) {
		/* a in [0..63] */
		return ((__u32)v[(__u32)a] + 35) >> 6;
	}

	b = fls64(a);
	b = ((b * 84) >> 8) - 1;
	shift = (a >> (b * 3));

	/* Explicit bound check on v[] to satisfy the BPF verifier */
	if (shift >= 64)
		return 0;

	x = ((__u32)(((__u32)v[shift] + 10) << b)) >> 6;

	/* One Newton-Raphson iteration:
	 *   x_{k+1} = (2 * x_k + a / x_k^2) / 3
	 */
	x = (2 * x + (__u32)div64_u64(a, (__u64)x * (__u64)(x - 1)));
	x = ((x * 341) >> 10);
	return x;
}

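/* Compute the cwnd growth rate: ca->cnt is the number of ACKed packets
 * required before snd_cwnd is increased by one, chosen so the window
 * follows the cubic curve (subject to the TCP-friendliness lower bound).
 */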
static __always_inline void bictcp_update(struct bictcp *ca, __u32 cwnd,
					  __u32 acked)
{
	__u32 delta, bic_target, max_cnt;
	__u64 offs, t;

	ca->ack_cnt += acked;	/* count the number of ACKed packets */

	if (ca->last_cwnd == cwnd &&
	    (__s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32)
		return;

	/* The CUBIC function can update ca->cnt at most once per jiffy.
	 * On all cwnd reduction events, ca->epoch_start is set to 0,
	 * which will force a recalculation of ca->cnt.
	 */
	if (ca->epoch_start && tcp_jiffies32 == ca->last_time)
		goto tcp_friendliness;

	ca->last_cwnd = cwnd;
	ca->last_time = tcp_jiffies32;

	if (ca->epoch_start == 0) {
		ca->epoch_start = tcp_jiffies32;	/* record beginning */
		ca->ack_cnt = acked;			/* start counting */
		ca->tcp_cwnd = cwnd;			/* sync with cubic */

		if (ca->last_max_cwnd <= cwnd) {
			ca->bic_K = 0;
			ca->bic_origin_point = cwnd;
		} else {
			/* Compute the new K from the distance to the
			 * previous maximum: K = cbrt((Wmax - cwnd) / C).
			 */
			ca->bic_K = cubic_root(cube_factor
					       * (ca->last_max_cwnd - cwnd));
			ca->bic_origin_point = ca->last_max_cwnd;
		}
	}

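	/* Evaluate the cubic function at the current time:
	 *
	 *   W(t) = C * (t - K)^3 + Wmax
	 *
	 * t is the time since the start of the epoch plus delay_min,
	 * expressed in seconds scaled by 2^BICTCP_HZ; the arithmetic is
	 * kept in 32/64-bit fixed point and stays overflow-free for cwnd
	 * below roughly one million packets.
	 */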
	t = (__s32)(tcp_jiffies32 - ca->epoch_start) * USEC_PER_JIFFY;
	t += ca->delay_min;
	/* change the unit from usec to bictcp_HZ */
	t <<= BICTCP_HZ;
	t /= USEC_PER_SEC;

	if (t < ca->bic_K)		/* t - K */
		offs = ca->bic_K - t;
	else
		offs = t - ca->bic_K;

	/* c/rtt * (t-K)^3 */
	delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
	if (t < ca->bic_K)		/* below origin */
		bic_target = ca->bic_origin_point - delta;
	else				/* above origin */
		bic_target = ca->bic_origin_point + delta;

	/* cubic function - calc bictcp_cnt */
	if (bic_target > cwnd) {
		ca->cnt = cwnd / (bic_target - cwnd);
	} else {
		ca->cnt = 100 * cwnd;	/* very small increment */
	}

	/* The initial growth of the cubic function may be too conservative
	 * when the available bandwidth is still unknown.
	 */
	if (ca->last_max_cwnd == 0 && ca->cnt > 20)
		ca->cnt = 20;	/* increase cwnd 5% per RTT */

tcp_friendliness:
	/* TCP friendly: estimate what standard (Reno) TCP would have
	 * reached and never grow slower than that.
	 */
	if (tcp_friendliness) {
		__u32 scale = beta_scale;
		__u32 n;

		/* update tcp_cwnd */
		delta = (cwnd * scale) >> 3;
		if (ca->ack_cnt > delta && delta) {
			n = ca->ack_cnt / delta;
			ca->ack_cnt -= n * delta;
			ca->tcp_cwnd += n;
		}

		if (ca->tcp_cwnd > cwnd) {	/* if bic is slower than tcp */
			delta = ca->tcp_cwnd - cwnd;
			max_cnt = cwnd / delta;
			if (ca->cnt > max_cnt)
				ca->cnt = max_cnt;
		}
	}

	/* The maximum rate of cwnd increase CUBIC allows is 1 packet per
	 * 2 packets ACKed, meaning cwnd grows at most 1.5x per RTT.
	 */
	ca->cnt = max(ca->cnt, 2U);
}

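/* Main congestion-avoidance hook: run slow start (with HyStart round
 * tracking) while below ssthresh, otherwise grow cwnd along the cubic
 * curve via bictcp_update() and tcp_cong_avoid_ai().
 */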
void BPF_STRUCT_OPS(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	if (!tcp_is_cwnd_limited(sk))
		return;

	if (tcp_in_slow_start(tp)) {
		if (hystart && after(ack, ca->end_seq))
			bictcp_hystart_reset(sk);
		acked = tcp_slow_start(tp, acked);
		if (!acked)
			return;
	}
	bictcp_update(ca, tp->snd_cwnd, acked);
	tcp_cong_avoid_ai(tp, ca->cnt, acked);
}

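/* On loss, remember the window at which the loss occurred (Wmax, scaled
 * down when fast convergence is enabled) and return the new ssthresh:
 * cwnd scaled by beta/BICTCP_BETA_SCALE (roughly 0.7).
 */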
__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	ca->epoch_start = 0;	/* end of epoch */

	/* Wmax and fast convergence */
	if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
		ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
			/ (2 * BICTCP_BETA_SCALE);
	else
		ca->last_max_cwnd = tp->snd_cwnd;

	return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
}

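/* Entering the Loss state starts a fresh epoch: clear all CUBIC and
 * HyStart state.
 */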
void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state)
{
	if (new_state == TCP_CA_Loss) {
		bictcp_reset(inet_csk_ca(sk));
		bictcp_hystart_reset(sk);
	}
}

#define GSO_MAX_SIZE 65536

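/* Account for TSO/GSO bursts when looking at ACK spacing: estimate the
 * extra delay a burst of up to 4 * GSO_MAX_SIZE bytes adds at the current
 * pacing rate, capped at 1 ms.
 */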
static __always_inline __u32 hystart_ack_delay(struct sock *sk)
{
	unsigned long rate;

	rate = sk->sk_pacing_rate;
	if (!rate)
		return 0;
	return min((__u64)USEC_PER_MSEC,
		   div64_ul((__u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate));
}

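/* HyStart: leave slow start early when either the ACK train has stretched
 * past (roughly) the minimum RTT or the per-round RTT samples show a
 * clear delay increase.
 */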
static __always_inline void hystart_update(struct sock *sk, __u32 delay)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);
	__u32 threshold;

	if (hystart_detect & HYSTART_ACK_TRAIN) {
		__u32 now = bictcp_clock_us(sk);

		/* first detection parameter - ack-train detection */
		if ((__s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
			ca->last_ack = now;

			threshold = ca->delay_min + hystart_ack_delay(sk);

			/* Hystart ack train triggers if we get an ack past
			 * ca->delay_min/2.
			 * Pacing might have delayed packets up to RTT/2
			 * during slow start.
			 */
			if (sk->sk_pacing_status == SK_PACING_NONE)
				threshold >>= 1;

			if ((__s32)(now - ca->round_start) > threshold) {
				ca->found = 1;
				tp->snd_ssthresh = tp->snd_cwnd;
			}
		}
	}

	if (hystart_detect & HYSTART_DELAY) {
		/* obtain the minimum delay of at least sampling packets */
		if (ca->curr_rtt > delay)
			ca->curr_rtt = delay;
		if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
			ca->sample_cnt++;
		} else {
			if (ca->curr_rtt > ca->delay_min +
			    HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
				ca->found = 1;
				tp->snd_ssthresh = tp->snd_cwnd;
			}
		}
	}
}

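/* Per-ACK sampling: track the minimum RTT (delay_min) and, while in slow
 * start above hystart_low_window, feed the sample to HyStart.
 */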
void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk,
		    const struct ack_sample *sample)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);
	__u32 delay;

	/* Some calls are for duplicates without timestamps */
	if (sample->rtt_us < 0)
		return;

	/* Discard delay samples right after fast recovery */
	if (ca->epoch_start && (__s32)(tcp_jiffies32 - ca->epoch_start) < HZ)
		return;

	delay = sample->rtt_us;
	if (delay == 0)
		delay = 1;

	/* first time call or link delay decreases */
	if (ca->delay_min == 0 || ca->delay_min > delay)
		ca->delay_min = delay;

	/* hystart triggers when cwnd is larger than some threshold */
	if (!ca->found && tcp_in_slow_start(tp) && hystart &&
	    tp->snd_cwnd >= hystart_low_window)
		hystart_update(sk, delay);
}

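/* Window undo is delegated to the kernel's Reno helper, resolved as a
 * ksym when the program is loaded.
 */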
extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym;

__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk)
{
	return tcp_reno_undo_cwnd(sk);
}

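/* Loading and attaching this struct_ops map registers a tcp_congestion_ops
 * named "bpf_cubic" that sockets can then select as their congestion
 * control.
 */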
SEC(".struct_ops")
struct tcp_congestion_ops cubic = {
	.init		= (void *)bpf_cubic_init,
	.ssthresh	= (void *)bpf_cubic_recalc_ssthresh,
	.cong_avoid	= (void *)bpf_cubic_cong_avoid,
	.set_state	= (void *)bpf_cubic_state,
	.undo_cwnd	= (void *)bpf_cubic_undo_cwnd,
	.cwnd_event	= (void *)bpf_cubic_cwnd_event,
	.pkts_acked	= (void *)bpf_cubic_acked,
	.name		= "bpf_cubic",
};