1
2
3
4
5
6#include "test.h"
7
8#ifdef RTE_EXEC_ENV_WINDOWS
/* Stub used on Windows builds: the perf test is not ported there, so
 * report it as skipped instead of running. */
static int
test_lpm_perf(void)
{
	printf("lpm_perf not supported on Windows, skipping test\n");
	return TEST_SKIPPED;
}
15
16#else
17#include <stdio.h>
18#include <stdint.h>
19#include <stdlib.h>
20#include <math.h>
21
22#include <rte_cycles.h>
23#include <rte_random.h>
24#include <rte_branch_prediction.h>
25#include <rte_malloc.h>
26#include <rte_ip.h>
27#include <rte_lpm.h>
28
29#include "test_xmmt_ops.h"
30
/* LPM table shared between the writer and reader threads. */
struct rte_lpm *lpm;
/* RCU QSBR variable used when RCU integration is enabled. */
static struct rte_rcu_qsbr *rv;
/* Set by the main test thread to tell reader threads to stop. */
static volatile uint8_t writer_done;
/* Next reader thread id to hand out; incremented atomically. */
static volatile uint32_t thr_id;
/* Sum of the cycle counts measured by all writer threads. */
static uint64_t gwrite_cycles;
/* Number of writer threads in the current test round. */
static uint32_t num_writers;

/* Serializes rte_lpm_add/delete when more than one writer runs. */
static pthread_mutex_t lpm_mutex = PTHREAD_MUTEX_INITIALIZER;
39
40
41
42
/* Number of lookups a reader performs between quiescent state reports. */
#define QSBR_REPORTING_INTERVAL 1024

/* Fail the enclosing test function with a line-number diagnostic. */
#define TEST_LPM_ASSERT(cond) do { \
	if (!(cond)) { \
		printf("Error at line %d: \n", __LINE__); \
		return -1; \
	} \
} while(0)

/* Number of measurement batches per lookup benchmark. */
#define ITERATIONS (1 << 10)
/* Add/delete passes performed by each RCU perf writer. */
#define RCU_ITERATIONS 10
/* Addresses looked up per batch. */
#define BATCH_SIZE (1 << 12)
/* Addresses handed to each rte_lpm_lookup_bulk() call. */
#define BULK_SIZE 32

/* Capacity of the generated route tables below. */
#define MAX_RULE_NUM (1200000)
58
/* One IPv4 route: network address plus prefix length. */
struct route_rule {
	uint32_t ip;
	uint8_t depth;	/* prefix length, 1..32 */
};

/* Routes of all depths, used by the single-threaded perf test. */
static struct route_rule large_route_table[MAX_RULE_NUM];

/* Subset holding only routes with depth > 24; used by the RCU writers. */
struct route_rule large_ldepth_route_table[MAX_RULE_NUM];

/* Number of valid entries in each of the tables above. */
static uint32_t num_route_entries;
static uint32_t num_ldepth_route_entries;
#define NUM_ROUTE_ENTRIES num_route_entries
#define NUM_LDEPTH_ROUTE_ENTRIES num_ldepth_route_entries

/* Total adds (and deletes) performed across all writers per test round. */
#define TOTAL_WRITES (RCU_ITERATIONS * NUM_LDEPTH_ROUTE_ENTRIES)
74
/* Address classes used to partition the generated route table. */
enum {
	IP_CLASS_A,
	IP_CLASS_B,
	IP_CLASS_C
};

/* Number of routes to generate per prefix depth (index = depth - 1),
 * for each address class. */
struct route_rule_count {
	uint32_t a[RTE_LPM_MAX_DEPTH];
	uint32_t b[RTE_LPM_MAX_DEPTH];
	uint32_t c[RTE_LPM_MAX_DEPTH];
};
90
91
92
93
94
95
96
97
98
/*
 * Per-class, per-depth route counts used to size the generated table.
 * Index i holds the number of routes of depth i+1.
 * NOTE(review): the distribution looks like a histogram taken from a
 * real routing table (depths 24 and 32 dominate) — source not recorded
 * here, so treat the exact values as arbitrary test data.
 */
static struct route_rule_count rule_count = {
	.a = {
		0,
		0,
		1,
		0,
		2,
		1,
		3,
		185,
		26,
		16,
		39,
		144,
		233,
		528,
		866,
		3856,
		3268,
		5662,
		17301,
		22226,
		11147,
		16746,
		17120,
		77578,
		401,
		656,
		1107,
		1121,
		2316,
		717,
		10,
		66
	},
	.b = {
		0,
		0,
		0,
		0,
		1,
		1,
		1,
		3,
		3,
		30,
		25,
		168,
		305,
		569,
		1129,
		50800,
		1645,
		1820,
		3506,
		3258,
		3424,
		4971,
		6885,
		39771,
		424,
		170,
		433,
		92,
		366,
		377,
		2,
		200
	},
	.c = {
		0,
		0,
		0,
		0,
		0,
		0,
		0,
		12,
		8,
		9,
		33,
		69,
		237,
		1007,
		1717,
		14663,
		8070,
		16185,
		48261,
		36870,
		33960,
		50638,
		61422,
		466549,
		1829,
		4824,
		4927,
		5914,
		10254,
		4905,
		1,
		716
	}
};
203
/*
 * Generate rule_count.{a,b,c}[depth - 1] prefixes of the given depth
 * inside the given address class and append them to large_route_table.
 * Prefixes deeper than /24 are also appended to
 * large_ldepth_route_table. Updates the global entry counters.
 */
static void generate_random_rule_prefix(uint32_t ip_class, uint8_t depth)
{
/* Fixed leading bits identifying each class:
 * A = 0..., B = 10..., C = 110... (mask plus number of fixed bits). */
#define IP_HEAD_MASK_A 0x00000000
#define IP_HEAD_BIT_NUM_A 1

#define IP_HEAD_MASK_B 0x80000000
#define IP_HEAD_BIT_NUM_B 2

#define IP_HEAD_MASK_C 0xC0000000
#define IP_HEAD_BIT_NUM_C 3

	uint32_t class_depth;	/* prefix bits free to vary within the class */
	uint32_t range;		/* number of distinct prefixes of this depth */
	uint32_t mask;		/* mask covering the variable prefix bits */
	uint32_t step;		/* stride between consecutive prefixes */
	uint32_t start;		/* random starting prefix value */
	uint32_t fixed_bit_num;
	uint32_t ip_head_mask;
	uint32_t rule_num;
	uint32_t k;
	struct route_rule *ptr_rule, *ptr_ldepth_rule;

	if (ip_class == IP_CLASS_A) {
		fixed_bit_num = IP_HEAD_BIT_NUM_A;
		ip_head_mask = IP_HEAD_MASK_A;
		rule_num = rule_count.a[depth - 1];
	} else if (ip_class == IP_CLASS_B) {
		fixed_bit_num = IP_HEAD_BIT_NUM_B;
		ip_head_mask = IP_HEAD_MASK_B;
		rule_num = rule_count.b[depth - 1];
	} else {
		fixed_bit_num = IP_HEAD_BIT_NUM_C;
		ip_head_mask = IP_HEAD_MASK_C;
		rule_num = rule_count.c[depth - 1];
	}

	if (rule_num == 0)
		return;

	/* Bits of the prefix not pinned by the class's fixed head bits. */
	class_depth = depth - fixed_bit_num;

	/* How many distinct prefixes of this depth exist in the class. */
	range = 1 << class_depth;

	/* Mask selecting the variable bits of the prefix. */
	mask = range - 1;

	/* Spread rule_num prefixes evenly across the available range. */
	if (range <= rule_num)
		step = 1;
	else
		step = round((double)range / rule_num);

	/* Random starting point; subsequent prefixes advance by step,
	 * wrapping within the range. */
	start = lrand48() & mask;
	ptr_rule = &large_route_table[num_route_entries];
	ptr_ldepth_rule = &large_ldepth_route_table[num_ldepth_route_entries];
	for (k = 0; k < rule_num; k++) {
		ptr_rule->ip = (start << (RTE_LPM_MAX_DEPTH - depth))
			| ip_head_mask;
		ptr_rule->depth = depth;

		/* Routes deeper than /24 also feed the RCU writer table. */
		if (depth > 24) {
			ptr_ldepth_rule->ip = ptr_rule->ip;
			ptr_ldepth_rule->depth = ptr_rule->depth;
			ptr_ldepth_rule++;
			num_ldepth_route_entries++;
		}
		ptr_rule++;
		start = (start + step) & mask;
	}
	num_route_entries += rule_num;
}
291
292static void insert_rule_in_random_pos(uint32_t ip, uint8_t depth)
293{
294 uint32_t pos;
295 int try_count = 0;
296 struct route_rule tmp;
297
298 do {
299 pos = lrand48();
300 try_count++;
301 } while ((try_count < 10) && (pos > num_route_entries));
302
303 if ((pos > num_route_entries) || (pos >= MAX_RULE_NUM))
304 pos = num_route_entries >> 1;
305
306 tmp = large_route_table[pos];
307 large_route_table[pos].ip = ip;
308 large_route_table[pos].depth = depth;
309 if (num_route_entries < MAX_RULE_NUM)
310 large_route_table[num_route_entries++] = tmp;
311}
312
313static void generate_large_route_rule_table(void)
314{
315 uint32_t ip_class;
316 uint8_t depth;
317
318 num_route_entries = 0;
319 num_ldepth_route_entries = 0;
320 memset(large_route_table, 0, sizeof(large_route_table));
321
322 for (ip_class = IP_CLASS_A; ip_class <= IP_CLASS_C; ip_class++) {
323 for (depth = 1; depth <= RTE_LPM_MAX_DEPTH; depth++) {
324 generate_random_rule_prefix(ip_class, depth);
325 }
326 }
327
328
329
330
331
332 insert_rule_in_random_pos(RTE_IPV4(0, 0, 0, 0), 8);
333 insert_rule_in_random_pos(RTE_IPV4(10, 2, 23, 147), 32);
334 insert_rule_in_random_pos(RTE_IPV4(192, 168, 100, 10), 24);
335 insert_rule_in_random_pos(RTE_IPV4(192, 168, 25, 100), 24);
336 insert_rule_in_random_pos(RTE_IPV4(192, 168, 129, 124), 32);
337}
338
339static void
340print_route_distribution(const struct route_rule *table, uint32_t n)
341{
342 unsigned i, j;
343
344 printf("Route distribution per prefix width: \n");
345 printf("DEPTH QUANTITY (PERCENT)\n");
346 printf("--------------------------- \n");
347
348
349 for (i = 1; i <= 32; i++) {
350 unsigned depth_counter = 0;
351 double percent_hits;
352
353 for (j = 0; j < n; j++)
354 if (table[j].depth == (uint8_t) i)
355 depth_counter++;
356
357 percent_hits = ((double)depth_counter)/((double)n) * 100;
358 printf("%.2u%15u (%.2f)\n", i, depth_counter, percent_hits);
359 }
360 printf("\n");
361}
362
363
/* Worker lcore ids discovered at test start, and how many were found. */
static uint16_t enabled_core_ids[RTE_MAX_LCORE];
static unsigned int num_cores;
366
367
368static inline uint32_t
369alloc_thread_id(void)
370{
371 uint32_t tmp_thr_id;
372
373 tmp_thr_id = __atomic_fetch_add(&thr_id, 1, __ATOMIC_RELAXED);
374 if (tmp_thr_id >= RTE_MAX_LCORE)
375 printf("Invalid thread id %u\n", tmp_thr_id);
376
377 return tmp_thr_id;
378}
379
380
381
382
383static int
384test_lpm_reader(void *arg)
385{
386 int i;
387 uint32_t ip_batch[QSBR_REPORTING_INTERVAL];
388 uint32_t next_hop_return = 0;
389
390 RTE_SET_USED(arg);
391 do {
392 for (i = 0; i < QSBR_REPORTING_INTERVAL; i++)
393 ip_batch[i] = rte_rand();
394
395 for (i = 0; i < QSBR_REPORTING_INTERVAL; i++)
396 rte_lpm_lookup(lpm, ip_batch[i], &next_hop_return);
397
398 } while (!writer_done);
399
400 return 0;
401}
402
403
404
405
406static int
407test_lpm_rcu_qsbr_reader(void *arg)
408{
409 int i;
410 uint32_t thread_id = alloc_thread_id();
411 uint32_t ip_batch[QSBR_REPORTING_INTERVAL];
412 uint32_t next_hop_return = 0;
413
414 RTE_SET_USED(arg);
415
416 rte_rcu_qsbr_thread_register(rv, thread_id);
417 rte_rcu_qsbr_thread_online(rv, thread_id);
418
419 do {
420 for (i = 0; i < QSBR_REPORTING_INTERVAL; i++)
421 ip_batch[i] = rte_rand();
422
423 for (i = 0; i < QSBR_REPORTING_INTERVAL; i++)
424 rte_lpm_lookup(lpm, ip_batch[i], &next_hop_return);
425
426
427 rte_rcu_qsbr_quiescent(rv, thread_id);
428 } while (!writer_done);
429
430 rte_rcu_qsbr_thread_offline(rv, thread_id);
431 rte_rcu_qsbr_thread_unregister(rv, thread_id);
432
433 return 0;
434}
435
436
437
438
439static int
440test_lpm_rcu_qsbr_writer(void *arg)
441{
442 unsigned int i, j, si, ei;
443 uint64_t begin, total_cycles;
444 uint32_t next_hop_add = 0xAA;
445 uint8_t pos_core = (uint8_t)((uintptr_t)arg);
446
447 si = (pos_core * NUM_LDEPTH_ROUTE_ENTRIES) / num_writers;
448 ei = ((pos_core + 1) * NUM_LDEPTH_ROUTE_ENTRIES) / num_writers;
449
450
451 begin = rte_rdtsc_precise();
452 for (i = 0; i < RCU_ITERATIONS; i++) {
453
454 for (j = si; j < ei; j++) {
455 if (num_writers > 1)
456 pthread_mutex_lock(&lpm_mutex);
457 if (rte_lpm_add(lpm, large_ldepth_route_table[j].ip,
458 large_ldepth_route_table[j].depth,
459 next_hop_add) != 0) {
460 printf("Failed to add iteration %d, route# %d\n",
461 i, j);
462 goto error;
463 }
464 if (num_writers > 1)
465 pthread_mutex_unlock(&lpm_mutex);
466 }
467
468
469 for (j = si; j < ei; j++) {
470 if (num_writers > 1)
471 pthread_mutex_lock(&lpm_mutex);
472 if (rte_lpm_delete(lpm, large_ldepth_route_table[j].ip,
473 large_ldepth_route_table[j].depth) != 0) {
474 printf("Failed to delete iteration %d, route# %d\n",
475 i, j);
476 goto error;
477 }
478 if (num_writers > 1)
479 pthread_mutex_unlock(&lpm_mutex);
480 }
481 }
482
483 total_cycles = rte_rdtsc_precise() - begin;
484
485 __atomic_fetch_add(&gwrite_cycles, total_cycles, __ATOMIC_RELAXED);
486
487 return 0;
488
489error:
490 if (num_writers > 1)
491 pthread_mutex_unlock(&lpm_mutex);
492 return -1;
493}
494
495
496
497
498
499static int
500test_lpm_rcu_perf_multi_writer(uint8_t use_rcu)
501{
502 struct rte_lpm_config config;
503 size_t sz;
504 unsigned int i, j;
505 uint16_t core_id;
506 struct rte_lpm_rcu_config rcu_cfg = {0};
507 int (*reader_f)(void *arg) = NULL;
508
509 if (rte_lcore_count() < 3) {
510 printf("Not enough cores for lpm_rcu_perf_autotest, expecting at least 3\n");
511 return TEST_SKIPPED;
512 }
513
514 num_cores = 0;
515 RTE_LCORE_FOREACH_WORKER(core_id) {
516 enabled_core_ids[num_cores] = core_id;
517 num_cores++;
518 }
519
520 for (j = 1; j < 3; j++) {
521 if (use_rcu)
522 printf("\nPerf test: %d writer(s), %d reader(s),"
523 " RCU integration enabled\n", j, num_cores - j);
524 else
525 printf("\nPerf test: %d writer(s), %d reader(s),"
526 " RCU integration disabled\n", j, num_cores - j);
527
528 num_writers = j;
529
530
531 config.max_rules = NUM_LDEPTH_ROUTE_ENTRIES;
532 config.number_tbl8s = NUM_LDEPTH_ROUTE_ENTRIES;
533 config.flags = 0;
534 lpm = rte_lpm_create(__func__, SOCKET_ID_ANY, &config);
535 TEST_LPM_ASSERT(lpm != NULL);
536
537
538 if (use_rcu) {
539 sz = rte_rcu_qsbr_get_memsize(num_cores);
540 rv = (struct rte_rcu_qsbr *)rte_zmalloc("rcu0", sz,
541 RTE_CACHE_LINE_SIZE);
542 rte_rcu_qsbr_init(rv, num_cores);
543
544 rcu_cfg.v = rv;
545
546 if (rte_lpm_rcu_qsbr_add(lpm, &rcu_cfg) != 0) {
547 printf("RCU variable assignment failed\n");
548 goto error;
549 }
550
551 reader_f = test_lpm_rcu_qsbr_reader;
552 } else
553 reader_f = test_lpm_reader;
554
555 writer_done = 0;
556 __atomic_store_n(&gwrite_cycles, 0, __ATOMIC_RELAXED);
557
558 __atomic_store_n(&thr_id, 0, __ATOMIC_SEQ_CST);
559
560
561 for (i = j; i < num_cores; i++)
562 rte_eal_remote_launch(reader_f, NULL,
563 enabled_core_ids[i]);
564
565
566 for (i = 0; i < j; i++)
567 rte_eal_remote_launch(test_lpm_rcu_qsbr_writer,
568 (void *)(uintptr_t)i,
569 enabled_core_ids[i]);
570
571
572 for (i = 0; i < j; i++)
573 if (rte_eal_wait_lcore(enabled_core_ids[i]) < 0)
574 goto error;
575
576 printf("Total LPM Adds: %d\n", TOTAL_WRITES);
577 printf("Total LPM Deletes: %d\n", TOTAL_WRITES);
578 printf("Average LPM Add/Del: %"PRIu64" cycles\n",
579 __atomic_load_n(&gwrite_cycles, __ATOMIC_RELAXED)
580 / TOTAL_WRITES);
581
582 writer_done = 1;
583
584 for (i = j; i < num_cores; i++)
585 rte_eal_wait_lcore(enabled_core_ids[i]);
586
587 rte_lpm_free(lpm);
588 rte_free(rv);
589 lpm = NULL;
590 rv = NULL;
591 }
592
593 return 0;
594
595error:
596 writer_done = 1;
597
598 rte_eal_mp_wait_lcore();
599
600 rte_lpm_free(lpm);
601 rte_free(rv);
602
603 return -1;
604}
605
606static int
607test_lpm_perf(void)
608{
609 struct rte_lpm_config config;
610
611 config.max_rules = 2000000;
612 config.number_tbl8s = 2048;
613 config.flags = 0;
614 uint64_t begin, total_time, lpm_used_entries = 0;
615 unsigned i, j;
616 uint32_t next_hop_add = 0xAA, next_hop_return = 0;
617 int status = 0;
618 uint64_t cache_line_counter = 0;
619 int64_t count = 0;
620
621 rte_srand(rte_rdtsc());
622
623 generate_large_route_rule_table();
624
625 printf("No. routes = %u\n", (unsigned) NUM_ROUTE_ENTRIES);
626
627 print_route_distribution(large_route_table, (uint32_t) NUM_ROUTE_ENTRIES);
628
629 lpm = rte_lpm_create(__func__, SOCKET_ID_ANY, &config);
630 TEST_LPM_ASSERT(lpm != NULL);
631
632
633 begin = rte_rdtsc();
634
635 for (i = 0; i < NUM_ROUTE_ENTRIES; i++) {
636 if (rte_lpm_add(lpm, large_route_table[i].ip,
637 large_route_table[i].depth, next_hop_add) == 0)
638 status++;
639 }
640
641 total_time = rte_rdtsc() - begin;
642
643 printf("Unique added entries = %d\n", status);
644
645 for (i = 0; i < RTE_LPM_TBL24_NUM_ENTRIES; i++) {
646 if (lpm->tbl24[i].valid)
647 lpm_used_entries++;
648
649 if (i % 32 == 0) {
650 if ((uint64_t)count < lpm_used_entries) {
651 cache_line_counter++;
652 count = lpm_used_entries;
653 }
654 }
655 }
656
657 printf("Used table 24 entries = %u (%g%%)\n",
658 (unsigned) lpm_used_entries,
659 (lpm_used_entries * 100.0) / RTE_LPM_TBL24_NUM_ENTRIES);
660 printf("64 byte Cache entries used = %u (%u bytes)\n",
661 (unsigned) cache_line_counter, (unsigned) cache_line_counter * 64);
662
663 printf("Average LPM Add: %g cycles\n",
664 (double)total_time / NUM_ROUTE_ENTRIES);
665
666
667 total_time = 0;
668 count = 0;
669
670 for (i = 0; i < ITERATIONS; i++) {
671 static uint32_t ip_batch[BATCH_SIZE];
672
673 for (j = 0; j < BATCH_SIZE; j++)
674 ip_batch[j] = rte_rand();
675
676
677 begin = rte_rdtsc();
678
679 for (j = 0; j < BATCH_SIZE; j++) {
680 if (rte_lpm_lookup(lpm, ip_batch[j], &next_hop_return) != 0)
681 count++;
682 }
683
684 total_time += rte_rdtsc() - begin;
685
686 }
687 printf("Average LPM Lookup: %.1f cycles (fails = %.1f%%)\n",
688 (double)total_time / ((double)ITERATIONS * BATCH_SIZE),
689 (count * 100.0) / (double)(ITERATIONS * BATCH_SIZE));
690
691
692 total_time = 0;
693 count = 0;
694 for (i = 0; i < ITERATIONS; i++) {
695 static uint32_t ip_batch[BATCH_SIZE];
696 uint32_t next_hops[BULK_SIZE];
697
698
699 for (j = 0; j < BATCH_SIZE; j++)
700 ip_batch[j] = rte_rand();
701
702
703 begin = rte_rdtsc();
704 for (j = 0; j < BATCH_SIZE; j += BULK_SIZE) {
705 unsigned k;
706 rte_lpm_lookup_bulk(lpm, &ip_batch[j], next_hops, BULK_SIZE);
707 for (k = 0; k < BULK_SIZE; k++)
708 if (unlikely(!(next_hops[k] & RTE_LPM_LOOKUP_SUCCESS)))
709 count++;
710 }
711
712 total_time += rte_rdtsc() - begin;
713 }
714 printf("BULK LPM Lookup: %.1f cycles (fails = %.1f%%)\n",
715 (double)total_time / ((double)ITERATIONS * BATCH_SIZE),
716 (count * 100.0) / (double)(ITERATIONS * BATCH_SIZE));
717
718
719 total_time = 0;
720 count = 0;
721 for (i = 0; i < ITERATIONS; i++) {
722 static uint32_t ip_batch[BATCH_SIZE];
723 uint32_t next_hops[4];
724
725
726 for (j = 0; j < BATCH_SIZE; j++)
727 ip_batch[j] = rte_rand();
728
729
730 begin = rte_rdtsc();
731 for (j = 0; j < BATCH_SIZE; j += RTE_DIM(next_hops)) {
732 unsigned k;
733 xmm_t ipx4;
734
735 ipx4 = vect_loadu_sil128((xmm_t *)(ip_batch + j));
736 ipx4 = *(xmm_t *)(ip_batch + j);
737 rte_lpm_lookupx4(lpm, ipx4, next_hops, UINT32_MAX);
738 for (k = 0; k < RTE_DIM(next_hops); k++)
739 if (unlikely(next_hops[k] == UINT32_MAX))
740 count++;
741 }
742
743 total_time += rte_rdtsc() - begin;
744 }
745 printf("LPM LookupX4: %.1f cycles (fails = %.1f%%)\n",
746 (double)total_time / ((double)ITERATIONS * BATCH_SIZE),
747 (count * 100.0) / (double)(ITERATIONS * BATCH_SIZE));
748
749
750 status = 0;
751 begin = rte_rdtsc();
752
753 for (i = 0; i < NUM_ROUTE_ENTRIES; i++) {
754
755 status += rte_lpm_delete(lpm, large_route_table[i].ip,
756 large_route_table[i].depth);
757 }
758
759 total_time = rte_rdtsc() - begin;
760
761 printf("Average LPM Delete: %g cycles\n",
762 (double)total_time / NUM_ROUTE_ENTRIES);
763
764 rte_lpm_delete_all(lpm);
765 rte_lpm_free(lpm);
766
767 if (test_lpm_rcu_perf_multi_writer(0) < 0)
768 return -1;
769
770 if (test_lpm_rcu_perf_multi_writer(1) < 0)
771 return -1;
772
773 return 0;
774}
775
776#endif
777
778REGISTER_TEST_COMMAND(lpm_perf_autotest, test_lpm_perf);
779