1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32#define _GNU_SOURCE
33
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdbool.h>
#include <string.h>
#include <linux/unistd.h>

#include <linux/bpf.h>
#include <bpf/bpf.h>
#include <getopt.h>

#include "bpf_load.h"
#include "bpf_rlimit.h"
#include "cgroup_helpers.h"
#include "hbm.h"
#include "bpf_util.h"
#include "bpf.h"
#include "libbpf.h"
55
/* Directory holding the kernel tracing files; trace_pipe is read from here. */
#define DEBUGFS "/sys/kernel/debug/tracing/"

/* Forward declarations of file-local helpers defined further down. */
static void Usage(void);
static void read_trace_pipe2(void);
static void do_error(char *msg, bool errno_flag);

/*
 * Run-time configuration, filled in from the command line by main() and
 * consumed by run_bpf_prog().
 */

/* true: attach as egress; false: attach as ingress. */
bool outFlag = true;
/* Lower bound the work-conserving loop clamps "rate" to. */
int minRate = 1000;
/* Current rate limit written to the queue_stats map ("-r", in Mbps). */
int rate = 1000;
/* How long to run before collecting stats ("-t", in seconds). */
int dur = 1;
/* "-s": ask the BPF program for stats and write them to a file. */
bool stats_flag;
/* "-l": copied into the "loopback" field of the map entry. */
bool loopback_flag;
/* "-d": dump the trace pipe once the run is over. */
bool debugFlag;
/* "-w": adjust "rate" dynamically based on eth0 throughput. */
bool work_conserving_flag;
/* "--no_cn": copied into the "no_cn" field of the map entry. */
bool no_cn_flag;
/* "--edt": selects hbm_edt_kern.o and the credit_ms stats format. */
bool edt_flag;

/* Handles produced by bpf_prog_load_xattr() in prog_load(). */
struct bpf_object *obj;
int bpfprog_fd;
/* Not referenced by the code visible in this file. */
int cgroup_storage_fd;
76
77static void read_trace_pipe2(void)
78{
79 int trace_fd;
80 FILE *outf;
81 char *outFname = "hbm_out.log";
82
83 trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
84 if (trace_fd < 0) {
85 printf("Error opening trace_pipe\n");
86 return;
87 }
88
89
90
91
92 outf = fopen(outFname, "w");
93
94 if (outf == NULL)
95 printf("Error creating %s\n", outFname);
96
97 while (1) {
98 static char buf[4097];
99 ssize_t sz;
100
101 sz = read(trace_fd, buf, sizeof(buf) - 1);
102 if (sz > 0) {
103 buf[sz] = 0;
104 puts(buf);
105 if (outf != NULL) {
106 fprintf(outf, "%s\n", buf);
107 fflush(outf);
108 }
109 }
110 }
111}
112
/*
 * Print a fatal error message on stdout and terminate with exit status 1.
 *
 * msg:        text printed after the "ERROR: " prefix.
 * errno_flag: when true, the current numeric errno value is appended.
 *
 * Never returns.
 */
static void do_error(char *msg, bool errno_flag)
{
	if (!errno_flag)
		printf("ERROR: %s\n", msg);
	else
		printf("ERROR: %s, errno: %d\n", msg, errno);

	exit(1);
}
121
122static int prog_load(char *prog)
123{
124 struct bpf_prog_load_attr prog_load_attr = {
125 .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
126 .file = prog,
127 .expected_attach_type = BPF_CGROUP_INET_EGRESS,
128 };
129 int map_fd;
130 struct bpf_map *map;
131
132 int ret = 0;
133
134 if (access(prog, O_RDONLY) < 0) {
135 printf("Error accessing file %s: %s\n", prog, strerror(errno));
136 return 1;
137 }
138 if (bpf_prog_load_xattr(&prog_load_attr, &obj, &bpfprog_fd))
139 ret = 1;
140 if (!ret) {
141 map = bpf_object__find_map_by_name(obj, "queue_stats");
142 map_fd = bpf_map__fd(map);
143 if (map_fd < 0) {
144 printf("Map not found: %s\n", strerror(map_fd));
145 ret = 1;
146 }
147 }
148
149 if (ret) {
150 printf("ERROR: bpf_prog_load_xattr failed for: %s\n", prog);
151 printf(" Output from verifier:\n%s\n------\n", bpf_log_buf);
152 ret = -1;
153 } else {
154 ret = map_fd;
155 }
156
157 return ret;
158}
159
160static int run_bpf_prog(char *prog, int cg_id)
161{
162 int map_fd;
163 int rc = 0;
164 int key = 0;
165 int cg1 = 0;
166 int type = BPF_CGROUP_INET_EGRESS;
167 char cg_dir[100];
168 struct hbm_queue_stats qstats = {0};
169
170 sprintf(cg_dir, "/hbm%d", cg_id);
171 map_fd = prog_load(prog);
172 if (map_fd == -1)
173 return 1;
174
175 if (setup_cgroup_environment()) {
176 printf("ERROR: setting cgroup environment\n");
177 goto err;
178 }
179 cg1 = create_and_get_cgroup(cg_dir);
180 if (!cg1) {
181 printf("ERROR: create_and_get_cgroup\n");
182 goto err;
183 }
184 if (join_cgroup(cg_dir)) {
185 printf("ERROR: join_cgroup\n");
186 goto err;
187 }
188
189 qstats.rate = rate;
190 qstats.stats = stats_flag ? 1 : 0;
191 qstats.loopback = loopback_flag ? 1 : 0;
192 qstats.no_cn = no_cn_flag ? 1 : 0;
193 if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) {
194 printf("ERROR: Could not update map element\n");
195 goto err;
196 }
197
198 if (!outFlag)
199 type = BPF_CGROUP_INET_INGRESS;
200 if (bpf_prog_attach(bpfprog_fd, cg1, type, 0)) {
201 printf("ERROR: bpf_prog_attach fails!\n");
202 log_err("Attaching prog");
203 goto err;
204 }
205
206 if (work_conserving_flag) {
207 struct timeval t0, t_last, t_new;
208 FILE *fin;
209 unsigned long long last_eth_tx_bytes, new_eth_tx_bytes;
210 signed long long last_cg_tx_bytes, new_cg_tx_bytes;
211 signed long long delta_time, delta_bytes, delta_rate;
212 int delta_ms;
213#define DELTA_RATE_CHECK 10000
214#define RATE_THRESHOLD 9500000000
215
216 bpf_map_lookup_elem(map_fd, &key, &qstats);
217 if (gettimeofday(&t0, NULL) < 0)
218 do_error("gettimeofday failed", true);
219 t_last = t0;
220 fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r");
221 if (fscanf(fin, "%llu", &last_eth_tx_bytes) != 1)
222 do_error("fscanf fails", false);
223 fclose(fin);
224 last_cg_tx_bytes = qstats.bytes_total;
225 while (true) {
226 usleep(DELTA_RATE_CHECK);
227 if (gettimeofday(&t_new, NULL) < 0)
228 do_error("gettimeofday failed", true);
229 delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 +
230 (t_new.tv_usec - t0.tv_usec)/1000;
231 if (delta_ms > dur * 1000)
232 break;
233 delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 +
234 (t_new.tv_usec - t_last.tv_usec);
235 if (delta_time == 0)
236 continue;
237 t_last = t_new;
238 fin = fopen("/sys/class/net/eth0/statistics/tx_bytes",
239 "r");
240 if (fscanf(fin, "%llu", &new_eth_tx_bytes) != 1)
241 do_error("fscanf fails", false);
242 fclose(fin);
243 printf(" new_eth_tx_bytes:%llu\n",
244 new_eth_tx_bytes);
245 bpf_map_lookup_elem(map_fd, &key, &qstats);
246 new_cg_tx_bytes = qstats.bytes_total;
247 delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes;
248 last_eth_tx_bytes = new_eth_tx_bytes;
249 delta_rate = (delta_bytes * 8000000) / delta_time;
250 printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps",
251 delta_ms, delta_rate/1000000000.0,
252 rate/1000.0);
253 if (delta_rate < RATE_THRESHOLD) {
254
255
256
257
258
259 int rate_diff100;
260
261 delta_bytes = new_cg_tx_bytes -
262 last_cg_tx_bytes;
263 last_cg_tx_bytes = new_cg_tx_bytes;
264 delta_rate = (delta_bytes * 8000000) /
265 delta_time;
266 printf(" rate:%.3fGbps",
267 delta_rate/1000000000.0);
268 rate_diff100 = (((long long)rate)*1000000 -
269 delta_rate) * 100 /
270 (((long long) rate) * 1000000);
271 printf(" rdiff:%d", rate_diff100);
272 if (rate_diff100 <= 3) {
273 rate += (rate >> 4);
274 if (rate > RATE_THRESHOLD / 1000000)
275 rate = RATE_THRESHOLD / 1000000;
276 qstats.rate = rate;
277 printf(" INC\n");
278 } else {
279 printf("\n");
280 }
281 } else {
282
283
284
285
286 printf(" DEC\n");
287 rate -= (rate >> 3);
288 if (rate < minRate)
289 rate = minRate;
290 qstats.rate = rate;
291 }
292 if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY))
293 do_error("update map element fails", false);
294 }
295 } else {
296 sleep(dur);
297 }
298
299 if (stats_flag && bpf_map_lookup_elem(map_fd, &key, &qstats)) {
300 char fname[100];
301 FILE *fout;
302
303 if (!outFlag)
304 sprintf(fname, "hbm.%d.in", cg_id);
305 else
306 sprintf(fname, "hbm.%d.out", cg_id);
307 fout = fopen(fname, "w");
308 fprintf(fout, "id:%d\n", cg_id);
309 fprintf(fout, "ERROR: Could not lookup queue_stats\n");
310 } else if (stats_flag && qstats.lastPacketTime >
311 qstats.firstPacketTime) {
312 long long delta_us = (qstats.lastPacketTime -
313 qstats.firstPacketTime)/1000;
314 unsigned int rate_mbps = ((qstats.bytes_total -
315 qstats.bytes_dropped) * 8 /
316 delta_us);
317 double percent_pkts, percent_bytes;
318 char fname[100];
319 FILE *fout;
320 int k;
321 static const char *returnValNames[] = {
322 "DROP_PKT",
323 "ALLOW_PKT",
324 "DROP_PKT_CWR",
325 "ALLOW_PKT_CWR"
326 };
327#define RET_VAL_COUNT 4
328
329
330
331
332
333 sprintf(fname, "hbm.%d.out", cg_id);
334 fout = fopen(fname, "w");
335 fprintf(fout, "id:%d\n", cg_id);
336 fprintf(fout, "rate_mbps:%d\n", rate_mbps);
337 fprintf(fout, "duration:%.1f secs\n",
338 (qstats.lastPacketTime - qstats.firstPacketTime) /
339 1000000000.0);
340 fprintf(fout, "packets:%d\n", (int)qstats.pkts_total);
341 fprintf(fout, "bytes_MB:%d\n", (int)(qstats.bytes_total /
342 1000000));
343 fprintf(fout, "pkts_dropped:%d\n", (int)qstats.pkts_dropped);
344 fprintf(fout, "bytes_dropped_MB:%d\n",
345 (int)(qstats.bytes_dropped /
346 1000000));
347
348 percent_pkts = (qstats.pkts_marked * 100.0) /
349 (qstats.pkts_total + 1);
350 percent_bytes = (qstats.bytes_marked * 100.0) /
351 (qstats.bytes_total + 1);
352 fprintf(fout, "pkts_marked_percent:%6.2f\n", percent_pkts);
353 fprintf(fout, "bytes_marked_percent:%6.2f\n", percent_bytes);
354
355
356 percent_pkts = (qstats.pkts_dropped * 100.0) /
357 (qstats.pkts_total + 1);
358 percent_bytes = (qstats.bytes_dropped * 100.0) /
359 (qstats.bytes_total + 1);
360 fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts);
361 fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes);
362
363
364 percent_pkts = (qstats.pkts_ecn_ce * 100.0) /
365 (qstats.pkts_total + 1);
366 fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts,
367 (int)qstats.pkts_ecn_ce);
368
369
370 fprintf(fout, "avg cwnd:%d\n",
371 (int)(qstats.sum_cwnd / (qstats.sum_cwnd_cnt + 1)));
372
373 fprintf(fout, "avg rtt:%d\n",
374 (int)(qstats.sum_rtt / (qstats.pkts_total + 1)));
375
376 if (edt_flag)
377 fprintf(fout, "avg credit_ms:%.03f\n",
378 (qstats.sum_credit /
379 (qstats.pkts_total + 1.0)) / 1000000.0);
380 else
381 fprintf(fout, "avg credit:%d\n",
382 (int)(qstats.sum_credit /
383 (1500 * ((int)qstats.pkts_total ) + 1)));
384
385
386 for (k = 0; k < RET_VAL_COUNT; k++) {
387 percent_pkts = (qstats.returnValCount[k] * 100.0) /
388 (qstats.pkts_total + 1);
389 fprintf(fout, "%s:%6.2f (%d)\n", returnValNames[k],
390 percent_pkts, (int)qstats.returnValCount[k]);
391 }
392 fclose(fout);
393 }
394
395 if (debugFlag)
396 read_trace_pipe2();
397 return rc;
398err:
399 rc = 1;
400
401 if (cg1)
402 close(cg1);
403 cleanup_cgroup_environment();
404
405 return rc;
406}
407
/*
 * Print the command line help on stdout.
 * The text lists every option accepted by main() together with its default
 * (the duration default matches the initial value of "dur", 1 second).
 */
static void Usage(void)
{
	printf("This program loads a cgroup skb BPF program to enforce\n"
	       "cgroup output (egress) bandwidth limits.\n\n"
	       "USAGE: hbm [-o] [-d] [--edt] [-l] [-n <id>] [--no_cn] [-r <rate>]\n"
	       " [-s] [-t <secs>] [-w] [-h] [prog]\n"
	       " Where:\n"
	       " -o indicates egress direction (default)\n"
	       " -d print BPF trace debug buffer\n"
	       " --edt use fq's Earliest Departure Time\n"
	       " -l also limit flows using loopback\n"
	       " -n <#> to create cgroup \"/hbm#\" and attach prog\n"
	       " Default is /hbm1\n"
	       " --no_cn disable CN notifications\n"
	       " -r <rate> Rate in Mbps\n"
	       " -s Update HBM stats\n"
	       " -t <time> Exit after specified seconds (default is 1)\n"
	       " -w Work conserving flag. cgroup can increase\n"
	       " bandwidth beyond the rate limit specified\n"
	       " while there is available bandwidth. Current\n"
	       " implementation assumes there is only eth0\n"
	       " but can be extended to support multiple NICs\n"
	       " -h print this info\n"
	       " prog BPF program file name. Name defaults to\n"
	       " hbm_out_kern.o\n");
}
434
435int main(int argc, char **argv)
436{
437 char *prog = "hbm_out_kern.o";
438 int k;
439 int cg_id = 1;
440 char *optstring = "iodln:r:st:wh";
441 struct option loptions[] = {
442 {"no_cn", 0, NULL, 1},
443 {"edt", 0, NULL, 2},
444 {NULL, 0, NULL, 0}
445 };
446
447 while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) {
448 switch (k) {
449 case 1:
450 no_cn_flag = true;
451 break;
452 case 2:
453 prog = "hbm_edt_kern.o";
454 edt_flag = true;
455 break;
456 case'o':
457 break;
458 case 'd':
459 debugFlag = true;
460 break;
461 case 'l':
462 loopback_flag = true;
463 break;
464 case 'n':
465 cg_id = atoi(optarg);
466 break;
467 case 'r':
468 minRate = atoi(optarg) * 1.024;
469 rate = minRate;
470 break;
471 case 's':
472 stats_flag = true;
473 break;
474 case 't':
475 dur = atoi(optarg);
476 break;
477 case 'w':
478 work_conserving_flag = true;
479 break;
480 case '?':
481 if (optopt == 'n' || optopt == 'r' || optopt == 't')
482 fprintf(stderr,
483 "Option -%c requires an argument.\n\n",
484 optopt);
485 case 'h':
486
487 default:
488 Usage();
489 return 0;
490 }
491 }
492
493 if (optind < argc)
494 prog = argv[optind];
495 printf("HBM prog: %s\n", prog != NULL ? prog : "NULL");
496
497 return run_bpf_prog(prog, cg_id);
498}
499