// SPDX-License-Identifier: GPL-2.0-only
/* WARNING: This implementation is not necessarily the same
 * as the tcp_cubic.c.  The purpose is mainly for testing
 * the kernel BPF logic.
 *
 * Highlights:
 * 1. CONFIG_HZ .kconfig map is used.
 * 2. In bictcp_update(), calculation is changed to use usec
 *    resolution (i.e. USEC_PER_JIFFY) instead of using jiffies.
 *    Thus, usecs_to_jiffies() is not used in the bpf_cubic.c.
 * 3. In bictcp_update() [under tcp_friendliness], the original
 *    "while" loop is changed to equivalent "if" statement.
 */
17#include <linux/bpf.h>
18#include "bpf_tcp_helpers.h"
19
char _license[] SEC("license") = "GPL";

/* Pin val into [lo, hi]; min()/max() are provided by bpf_tcp_helpers.h. */
#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)

/* Scale factor beta calculation: max_cwnd = snd_cwnd * beta */
#define BICTCP_BETA_SCALE 1024

/* BIC HZ 2^10 = 1024 */
#define BICTCP_HZ 10

/* Two methods of hybrid slow start */
#define HYSTART_ACK_TRAIN 0x1
#define HYSTART_DELAY 0x2

/* Number of delay samples for detecting the increase of delay */
#define HYSTART_MIN_SAMPLES 8
#define HYSTART_DELAY_MIN (4000U)	/* 4 ms (usec, see hystart_ack_delta_us) */
#define HYSTART_DELAY_MAX (16000U)	/* 16 ms */
#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)

/* Tunables mirroring tcp_cubic.c's module parameters, fixed at defaults. */
static int fast_convergence = 1;
static const int beta = 717;	/* = 717/1024 (BICTCP_BETA_SCALE) */
static int initial_ssthresh;
static const int bic_scale = 41;
static int tcp_friendliness = 1;

static int hystart = 1;
static int hystart_detect = HYSTART_ACK_TRAIN | HYSTART_DELAY;
static int hystart_low_window = 16;
static int hystart_ack_delta_us = 2000;

/* Derived constants, evaluated at compile time. */
static const __u32 cube_rtt_scale = (bic_scale * 10);	/* 1024*c/rtt */
static const __u32 beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3
	/ (BICTCP_BETA_SCALE - beta);
53
54
55
56
57
58
59
60
61
62
63
64
65
66
/* 1/c * 2^(2*BICTCP_HZ), precomputed; "1ull" keeps the shift in 64 bits. */
static const __u64 cube_factor = (__u64)(1ull << (10+3*BICTCP_HZ))
	/ (bic_scale * 10);
69
70
/* BIC TCP Parameters */
struct bictcp {
	__u32 cnt;		/* increase cwnd by 1 after ACKs */
	__u32 last_max_cwnd;	/* last maximum snd_cwnd */
	__u32 last_cwnd;	/* the last snd_cwnd */
	__u32 last_time;	/* time when updated last_cwnd */
	__u32 bic_origin_point;	/* origin point of bic function */
	__u32 bic_K;		/* time to origin point
				 * from the beginning of the current epoch
				 */
	__u32 delay_min;	/* min delay (usec) */
	__u32 epoch_start;	/* beginning of an epoch */
	__u32 ack_cnt;		/* number of acks */
	__u32 tcp_cwnd;		/* estimated tcp cwnd */
	__u16 unused;
	__u8 sample_cnt;	/* number of samples to decide curr_rtt */
	__u8 found;		/* the exit point is found? */
	__u32 round_start;	/* beginning of each round */
	__u32 end_seq;		/* end_seq of the round */
	__u32 last_ack;		/* last time when the ACK spacing is close */
	__u32 curr_rtt;		/* the minimum rtt of current round */
};
91
92static inline void bictcp_reset(struct bictcp *ca)
93{
94 ca->cnt = 0;
95 ca->last_max_cwnd = 0;
96 ca->last_cwnd = 0;
97 ca->last_time = 0;
98 ca->bic_origin_point = 0;
99 ca->bic_K = 0;
100 ca->delay_min = 0;
101 ca->epoch_start = 0;
102 ca->ack_cnt = 0;
103 ca->tcp_cwnd = 0;
104 ca->found = 0;
105}
106
/* HZ is resolved from the running kernel's CONFIG_HZ through libbpf's
 * __kconfig extern mechanism at load time, so the same BPF object works
 * regardless of the kernel's tick configuration.
 */
extern unsigned long CONFIG_HZ __kconfig;
#define HZ CONFIG_HZ
#define USEC_PER_MSEC 1000UL
#define USEC_PER_SEC 1000000UL
#define USEC_PER_JIFFY (USEC_PER_SEC / HZ)
112
/* Unsigned 64-bit division.  BPF programs run on 64-bit kernels where
 * native u64 division is available, so no library helper is needed.
 */
static __always_inline __u64 div64_u64(__u64 dividend, __u64 divisor)
{
	__u64 quotient = dividend / divisor;

	return quotient;
}

#define div64_ul div64_u64
119
#define BITS_PER_U64 (sizeof(__u64) * 8)

/* Find last (most-significant) set bit, 1-based: fls64(0) == 0,
 * fls64(1) == 1, fls64(1ULL << 63) == 64.  Binary search, fully
 * unrolled so the BPF verifier sees straight-line code.
 */
static __always_inline int fls64(__u64 x)
{
	int pos = BITS_PER_U64;

	if (x == 0)
		return 0;

	if (!(x >> 32)) {	/* top 32 bits clear? */
		pos -= 32;
		x <<= 32;
	}
	if (!(x >> 48)) {	/* top 16 of what remains */
		pos -= 16;
		x <<= 16;
	}
	if (!(x >> 56)) {
		pos -= 8;
		x <<= 8;
	}
	if (!(x >> 60)) {
		pos -= 4;
		x <<= 4;
	}
	if (!(x >> 62)) {
		pos -= 2;
		x <<= 2;
	}
	if (!(x >> 63))
		pos -= 1;

	return pos;
}
153
154static __always_inline __u32 bictcp_clock_us(const struct sock *sk)
155{
156 return tcp_sk(sk)->tcp_mstamp;
157}
158
159static __always_inline void bictcp_hystart_reset(struct sock *sk)
160{
161 struct tcp_sock *tp = tcp_sk(sk);
162 struct bictcp *ca = inet_csk_ca(sk);
163
164 ca->round_start = ca->last_ack = bictcp_clock_us(sk);
165 ca->end_seq = tp->snd_nxt;
166 ca->curr_rtt = ~0U;
167 ca->sample_cnt = 0;
168}
169
170
171
172
173
174
175SEC("struct_ops/bictcp_init")
176void BPF_PROG(bictcp_init, struct sock *sk)
177{
178 struct bictcp *ca = inet_csk_ca(sk);
179
180 bictcp_reset(ca);
181
182 if (hystart)
183 bictcp_hystart_reset(sk);
184
185 if (!hystart && initial_ssthresh)
186 tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
187}
188
189
190
191
192SEC("no-sec-prefix-bictcp_cwnd_event")
193void BPF_PROG(bictcp_cwnd_event, struct sock *sk, enum tcp_ca_event event)
194{
195 if (event == CA_EVENT_TX_START) {
196 struct bictcp *ca = inet_csk_ca(sk);
197 __u32 now = tcp_jiffies32;
198 __s32 delta;
199
200 delta = now - tcp_sk(sk)->lsndtime;
201
202
203
204
205 if (ca->epoch_start && delta > 0) {
206 ca->epoch_start += delta;
207 if (after(ca->epoch_start, now))
208 ca->epoch_start = now;
209 }
210 return;
211 }
212}
213
214
215
216
217
218
219
220
221
/* Precomputed lookup table used by cubic_root(): v[n] approximates the
 * cube root for small inputs; larger inputs are range-reduced first.
 */
static const __u8 v[] = {
	0, 54, 54, 54, 118, 118, 118, 118,
	123, 129, 134, 138, 143, 147, 151, 156,
	157, 161, 164, 168, 170, 173, 176, 179,
	181, 185, 187, 190, 192, 194, 197, 199,
	200, 202, 204, 206, 209, 211, 213, 215,
	217, 219, 221, 222, 224, 225, 227, 229,
	231, 232, 234, 236, 237, 239, 240, 242,
	244, 245, 246, 248, 250, 251, 252, 254,
};
232
233
234
235
236
/* Calculate the cubic root of a, using a table lookup followed by one
 * Newton-Raphson iteration.
 */
static __always_inline __u32 cubic_root(__u64 a)
{
	__u32 x, b, shift;

	if (a < 64) {
		/* a in [0..63]: table lookup is exact enough */
		return ((__u32)v[(__u32)a] + 35) >> 6;
	}

	b = fls64(a);
	b = ((b * 84) >> 8) - 1;	/* b = (fls64(a) - 1) / 3, approx */
	shift = (a >> (b * 3));

	/* Bound check for the v[] access; also satisfies the BPF verifier. */
	if (shift >= 64)
		return 0;

	/* First estimate from the range-reduced table value. */
	x = ((__u32)(((__u32)v[shift] + 10) << b)) >> 6;

	/*
	 * Newton-Raphson iteration
	 *                         2
	 * x    = ( 2 * x  +  a / x  ) / 3
	 *  k+1          k         k
	 */
	x = (2 * x + (__u32)div64_u64(a, (__u64)x * (__u64)(x - 1)));
	x = ((x * 341) >> 10);	/* divide by 3 via multiply-shift */
	return x;
}
266
267
268
269
/* Compute congestion window to use.  Updates ca->cnt, the per-ACK
 * increment divisor consumed by tcp_cong_avoid_ai().
 */
static __always_inline void bictcp_update(struct bictcp *ca, __u32 cwnd,
					  __u32 acked)
{
	__u32 delta, bic_target, max_cnt;
	__u64 offs, t;

	ca->ack_cnt += acked;	/* count the number of ACKed packets */

	/* At most one recalculation per cwnd per HZ/32 interval. */
	if (ca->last_cwnd == cwnd &&
	    (__s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32)
		return;

	/* The CUBIC function can update ca->cnt at most once per jiffy.
	 * On all cwnd reduction events, ca->epoch_start is set to 0,
	 * which will force a recalculation of ca->cnt.
	 */
	if (ca->epoch_start && tcp_jiffies32 == ca->last_time)
		goto tcp_friendliness;

	ca->last_cwnd = cwnd;
	ca->last_time = tcp_jiffies32;

	if (ca->epoch_start == 0) {
		ca->epoch_start = tcp_jiffies32;	/* record beginning */
		ca->ack_cnt = acked;			/* start counting */
		ca->tcp_cwnd = cwnd;			/* sync with cubic */

		if (ca->last_max_cwnd <= cwnd) {
			ca->bic_K = 0;
			ca->bic_origin_point = cwnd;
		} else {
			/* Compute new K based on
			 * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)
			 */
			ca->bic_K = cubic_root(cube_factor
					       * (ca->last_max_cwnd - cwnd));
			ca->bic_origin_point = ca->last_max_cwnd;
		}
	}

	/* cubic function - calc */
	/* calculate c * time^3 / rtt,
	 *  while considering overflow in calculation of time^3
	 * (so time^3 is done by using 64 bit)
	 * and without the support of division of 64bit numbers
	 * (so all divisions are done by using 32 bit)
	 *  also NOTE the unit of those variables
	 *	  time  = (t - K) / 2^bictcp_HZ
	 *	  c = bic_scale >> 10
	 * rtt  = (srtt >> 3) / HZ
	 * !!! The following code does not have overflow problems,
	 * if the cwnd < 1 million packets !!!
	 */

	/* elapsed time in usec resolution (this differs from tcp_cubic.c,
	 * which works in jiffies via usecs_to_jiffies())
	 */
	t = (__s32)(tcp_jiffies32 - ca->epoch_start) * USEC_PER_JIFFY;
	t += ca->delay_min;
	/* change the unit from usec to bictcp_HZ */
	t <<= BICTCP_HZ;
	t /= USEC_PER_SEC;

	if (t < ca->bic_K)		/* t - K */
		offs = ca->bic_K - t;
	else
		offs = t - ca->bic_K;

	/* c/rtt * (t-K)^3 */
	delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
	if (t < ca->bic_K)			/* below origin */
		bic_target = ca->bic_origin_point - delta;
	else					/* above origin */
		bic_target = ca->bic_origin_point + delta;

	/* cubic function - calc bictcp_cnt */
	if (bic_target > cwnd) {
		ca->cnt = cwnd / (bic_target - cwnd);
	} else {
		ca->cnt = 100 * cwnd;		/* very small increment */
	}

	/*
	 * The initial growth of cubic function may be too conservative
	 * when the available bandwidth is still unknown.
	 */
	if (ca->last_max_cwnd == 0 && ca->cnt > 20)
		ca->cnt = 20;	/* increase cwnd 5% per RTT */

tcp_friendliness:
	/* TCP Friendly: never grow slower than standard TCP would. */
	if (tcp_friendliness) {
		__u32 scale = beta_scale;
		__u32 n;

		/* update tcp cwnd (original "while" loop in tcp_cubic.c
		 * replaced by this equivalent "if" computation)
		 */
		delta = (cwnd * scale) >> 3;
		if (ca->ack_cnt > delta && delta) {
			n = ca->ack_cnt / delta;
			ca->ack_cnt -= n * delta;
			ca->tcp_cwnd += n;
		}

		if (ca->tcp_cwnd > cwnd) {	/* if bic is slower than tcp */
			delta = ca->tcp_cwnd - cwnd;
			max_cnt = cwnd / delta;
			if (ca->cnt > max_cnt)
				ca->cnt = max_cnt;
		}
	}

	/* The maximum rate of cwnd increase CUBIC allows is 1 packet per
	 * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT.
	 */
	ca->cnt = max(ca->cnt, 2U);
}
383
384
385void BPF_STRUCT_OPS(bictcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
386{
387 struct tcp_sock *tp = tcp_sk(sk);
388 struct bictcp *ca = inet_csk_ca(sk);
389
390 if (!tcp_is_cwnd_limited(sk))
391 return;
392
393 if (tcp_in_slow_start(tp)) {
394 if (hystart && after(ack, ca->end_seq))
395 bictcp_hystart_reset(sk);
396 acked = tcp_slow_start(tp, acked);
397 if (!acked)
398 return;
399 }
400 bictcp_update(ca, tp->snd_cwnd, acked);
401 tcp_cong_avoid_ai(tp, ca->cnt, acked);
402}
403
404__u32 BPF_STRUCT_OPS(bictcp_recalc_ssthresh, struct sock *sk)
405{
406 const struct tcp_sock *tp = tcp_sk(sk);
407 struct bictcp *ca = inet_csk_ca(sk);
408
409 ca->epoch_start = 0;
410
411
412 if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
413 ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
414 / (2 * BICTCP_BETA_SCALE);
415 else
416 ca->last_max_cwnd = tp->snd_cwnd;
417
418 return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
419}
420
421void BPF_STRUCT_OPS(bictcp_state, struct sock *sk, __u8 new_state)
422{
423 if (new_state == TCP_CA_Loss) {
424 bictcp_reset(inet_csk_ca(sk));
425 bictcp_hystart_reset(sk);
426 }
427}
428
429#define GSO_MAX_SIZE 65536
430
431
432
433
434
435
436
437
438
439
440static __always_inline __u32 hystart_ack_delay(struct sock *sk)
441{
442 unsigned long rate;
443
444 rate = sk->sk_pacing_rate;
445 if (!rate)
446 return 0;
447 return min((__u64)USEC_PER_MSEC,
448 div64_ul((__u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate));
449}
450
/* Hybrid slow start: detect the point to exit slow start early, using
 * either ACK-train length or a delay increase, per hystart_detect.
 */
static __always_inline void hystart_update(struct sock *sk, __u32 delay)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);
	__u32 threshold;

	if (hystart_detect & HYSTART_ACK_TRAIN) {
		__u32 now = bictcp_clock_us(sk);

		/* first detection parameter - ack-train detection */
		if ((__s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
			ca->last_ack = now;

			threshold = ca->delay_min + hystart_ack_delay(sk);

			/* Hystart ack train triggers if we get ack past
			 * ca->delay_min/2.
			 * Pacing might have delayed packets up to
			 * hystart_ack_delay() ago.
			 */
			if (sk->sk_pacing_status == SK_PACING_NONE)
				threshold >>= 1;

			if ((__s32)(now - ca->round_start) > threshold) {
				ca->found = 1;
				tp->snd_ssthresh = tp->snd_cwnd;
			}
		}
	}

	if (hystart_detect & HYSTART_DELAY) {
		/* obtain the minimum delay of more than sampling packets */
		if (ca->curr_rtt > delay)
			ca->curr_rtt = delay;
		if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
			ca->sample_cnt++;
		} else {
			/* second detection parameter - delay increase */
			if (ca->curr_rtt > ca->delay_min +
			    HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
				ca->found = 1;
				tp->snd_ssthresh = tp->snd_cwnd;
			}
		}
	}
}
496
497void BPF_STRUCT_OPS(bictcp_acked, struct sock *sk,
498 const struct ack_sample *sample)
499{
500 const struct tcp_sock *tp = tcp_sk(sk);
501 struct bictcp *ca = inet_csk_ca(sk);
502 __u32 delay;
503
504
505 if (sample->rtt_us < 0)
506 return;
507
508
509 if (ca->epoch_start && (__s32)(tcp_jiffies32 - ca->epoch_start) < HZ)
510 return;
511
512 delay = sample->rtt_us;
513 if (delay == 0)
514 delay = 1;
515
516
517 if (ca->delay_min == 0 || ca->delay_min > delay)
518 ca->delay_min = delay;
519
520
521 if (!ca->found && tcp_in_slow_start(tp) && hystart &&
522 tp->snd_cwnd >= hystart_low_window)
523 hystart_update(sk, delay);
524}
525
526__u32 BPF_STRUCT_OPS(tcp_reno_undo_cwnd, struct sock *sk)
527{
528 const struct tcp_sock *tp = tcp_sk(sk);
529
530 return max(tp->snd_cwnd, tp->prior_cwnd);
531}
532
/* Register the congestion control callbacks with the kernel via BPF
 * struct_ops; the algorithm is selectable as "bpf_cubic".
 */
SEC(".struct_ops")
struct tcp_congestion_ops cubic = {
	.init = (void *)bictcp_init,
	.ssthresh = (void *)bictcp_recalc_ssthresh,
	.cong_avoid = (void *)bictcp_cong_avoid,
	.set_state = (void *)bictcp_state,
	.undo_cwnd = (void *)tcp_reno_undo_cwnd,
	.cwnd_event = (void *)bictcp_cwnd_event,
	.pkts_acked = (void *)bictcp_acked,
	.name = "bpf_cubic",
};
544