1/* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2010-2014 Intel Corporation 3 */ 4 5#ifndef __INCLUDE_RTE_SCHED_H__ 6#define __INCLUDE_RTE_SCHED_H__ 7 8#ifdef __cplusplus 9extern "C" { 10#endif 11 12/** 13 * @file 14 * RTE Hierarchical Scheduler 15 * 16 * The hierarchical scheduler prioritizes the transmission of packets 17 * from different users and traffic classes according to the Service 18 * Level Agreements (SLAs) defined for the current network node. 19 * 20 * The scheduler supports thousands of packet queues grouped under a 21 * 5-level hierarchy: 22 * 1. Port: 23 * - Typical usage: output Ethernet port; 24 * - Multiple ports are scheduled in round robin order with 25 * equal priority; 26 * 2. Subport: 27 * - Typical usage: group of users; 28 * - Traffic shaping using the token bucket algorithm 29 * (one bucket per subport); 30 * - Upper limit enforced per traffic class at subport level; 31 * - Lower priority traffic classes able to reuse subport 32 * bandwidth currently unused by higher priority traffic 33 * classes of the same subport; 34 * - When any subport traffic class is oversubscribed 35 * (configuration time event), the usage of subport member 36 * pipes with high demand for that traffic class pipes is 37 * truncated to a dynamically adjusted value with no 38 * impact to low demand pipes; 39 * 3. Pipe: 40 * - Typical usage: individual user/subscriber; 41 * - Traffic shaping using the token bucket algorithm 42 * (one bucket per pipe); 43 * 4. Traffic class: 44 * - Traffic classes of the same pipe handled in strict 45 * priority order; 46 * - Upper limit enforced per traffic class at the pipe level; 47 * - Lower priority traffic classes able to reuse pipe 48 * bandwidth currently unused by higher priority traffic 49 * classes of the same pipe; 50 * 5. Queue: 51 * - Typical usage: queue hosting packets from one or 52 * multiple connections of same traffic class belonging to 53 * the same user; 54 * - Weighted Round Robin (WRR) is used to service the 55 * queues within same pipe lowest priority traffic class (best-effort). 56 * 57 */ 58 59#include <rte_compat.h> 60#include <rte_mbuf.h> 61#include <rte_meter.h> 62 63/** Congestion Management */ 64#include "rte_red.h" 65#include "rte_pie.h" 66 67/** Maximum number of queues per pipe. 68 * Note that the multiple queues (power of 2) can only be assigned to 69 * lowest priority (best-effort) traffic class. Other higher priority traffic 70 * classes can only have one queue. 71 * Can not change. 72 * 73 * @see struct rte_sched_port_params 74 */ 75#define RTE_SCHED_QUEUES_PER_PIPE 16 76 77/** Number of WRR queues for best-effort traffic class per pipe. 78 * 79 * @see struct rte_sched_pipe_params 80 */ 81#define RTE_SCHED_BE_QUEUES_PER_PIPE 4 82 83/** Number of traffic classes per pipe (as well as subport). 84 * @see struct rte_sched_subport_params 85 * @see struct rte_sched_pipe_params 86 */ 87#define RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE \ 88(RTE_SCHED_QUEUES_PER_PIPE - RTE_SCHED_BE_QUEUES_PER_PIPE + 1) 89 90/** Best-effort traffic class ID 91 * Can not change. 92 */ 93#define RTE_SCHED_TRAFFIC_CLASS_BE (RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE - 1) 94 95/* 96 * Ethernet framing overhead. Overhead fields per Ethernet frame: 97 * 1. Preamble: 7 bytes; 98 * 2. Start of Frame Delimiter (SFD): 1 byte; 99 * 3. Frame Check Sequence (FCS): 4 bytes; 100 * 4. Inter Frame Gap (IFG): 12 bytes. 101 * 102 * The FCS is considered overhead only if not included in the packet 103 * length (field pkt_len of struct rte_mbuf). 104 * 105 * @see struct rte_sched_port_params 106 */ 107#ifndef RTE_SCHED_FRAME_OVERHEAD_DEFAULT 108#define RTE_SCHED_FRAME_OVERHEAD_DEFAULT 24 109#endif 110 111/** 112 * Congestion Management (CMAN) mode 113 * 114 * This is used for controlling the admission of packets into a packet queue or 115 * group of packet queues on congestion. 116 * 117 * The *Random Early Detection (RED)* algorithm works by proactively dropping 118 * more and more input packets as the queue occupancy builds up. When the queue 119 * is full or almost full, RED effectively works as *tail drop*. The *Weighted 120 * RED* algorithm uses a separate set of RED thresholds for each packet color. 121 * 122 * Similar to RED, Proportional Integral Controller Enhanced (PIE) randomly 123 * drops a packet at the onset of the congestion and tries to control the 124 * latency around the target value. The congestion detection, however, is based 125 * on the queueing latency instead of the queue length like RED. For more 126 * information, refer RFC8033. 127 */ 128enum rte_sched_cman_mode { 129 RTE_SCHED_CMAN_RED, /**< Random Early Detection (RED) */ 130 RTE_SCHED_CMAN_PIE, /**< Proportional Integral Controller Enhanced (PIE) */ 131}; 132 133/* 134 * Pipe configuration parameters. The period and credits_per_period 135 * parameters are measured in bytes, with one byte meaning the time 136 * duration associated with the transmission of one byte on the 137 * physical medium of the output port, with pipe or pipe traffic class 138 * rate (measured as percentage of output port rate) determined as 139 * credits_per_period divided by period. One credit represents one 140 * byte. 141 */ 142struct rte_sched_pipe_params { 143 /** Token bucket rate (measured in bytes per second) */ 144 uint64_t tb_rate; 145 146 /** Token bucket size (measured in credits) */ 147 uint64_t tb_size; 148 149 /** Traffic class rates (measured in bytes per second) */ 150 uint64_t tc_rate[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; 151 152 /** Enforcement period (measured in milliseconds) */ 153 uint64_t tc_period; 154 155 /** Best-effort traffic class oversubscription weight */ 156 uint8_t tc_ov_weight; 157 158 /** WRR weights of best-effort traffic class queues */ 159 uint8_t wrr_weights[RTE_SCHED_BE_QUEUES_PER_PIPE]; 160}; 161 162/* 163 * Congestion Management configuration parameters. 164 */ 165struct rte_sched_cman_params { 166 /** Congestion Management mode */ 167 enum rte_sched_cman_mode cman_mode; 168 169 union { 170 /** RED parameters */ 171 struct rte_red_params red_params[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE][RTE_COLORS]; 172 173 /** PIE parameters */ 174 struct rte_pie_params pie_params[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; 175 }; 176}; 177 178/* 179 * Subport configuration parameters. The period and credits_per_period 180 * parameters are measured in bytes, with one byte meaning the time 181 * duration associated with the transmission of one byte on the 182 * physical medium of the output port, with pipe or pipe traffic class 183 * rate (measured as percentage of output port rate) determined as 184 * credits_per_period divided by period. One credit represents one 185 * byte. 186 */ 187struct rte_sched_subport_params { 188 /** Number of subport pipes. 189 * The subport can enable/allocate fewer pipes than the maximum 190 * number set through struct port_params::n_max_pipes_per_subport, 191 * as needed, to avoid memory allocation for the queues of the 192 * pipes that are not really needed. 193 */ 194 uint32_t n_pipes_per_subport_enabled; 195 196 /** Packet queue size for each traffic class. 197 * All the pipes within the same subport share the similar 198 * configuration for the queues. 199 */ 200 uint16_t qsize[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; 201 202 /** Pipe profile table. 203 * Every pipe is configured using one of the profiles from this table. 204 */ 205 struct rte_sched_pipe_params *pipe_profiles; 206 207 /** Profiles in the pipe profile table */ 208 uint32_t n_pipe_profiles; 209 210 /** Max allowed profiles in the pipe profile table */ 211 uint32_t n_max_pipe_profiles; 212 213 /** Congestion Management parameters 214 * If NULL the congestion management is disabled for the subport, 215 * otherwise proper parameters need to be provided. 216 */ 217 struct rte_sched_cman_params *cman_params; 218}; 219 220struct rte_sched_subport_profile_params { 221 /** Token bucket rate (measured in bytes per second) */ 222 uint64_t tb_rate; 223 224 /** Token bucket size (measured in credits) */ 225 uint64_t tb_size; 226 227 /** Traffic class rates (measured in bytes per second) */ 228 uint64_t tc_rate[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; 229 230 /** Enforcement period for rates (measured in milliseconds) */ 231 uint64_t tc_period; 232}; 233 234/** Subport statistics */ 235struct rte_sched_subport_stats { 236 /** Number of packets successfully written */ 237 uint64_t n_pkts_tc[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; 238 239 /** Number of packets dropped */ 240 uint64_t n_pkts_tc_dropped[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; 241 242 /** Number of bytes successfully written for each traffic class */ 243 uint64_t n_bytes_tc[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; 244 245 /** Number of bytes dropped for each traffic class */ 246 uint64_t n_bytes_tc_dropped[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; 247 248 /** Number of packets dropped by congestion management scheme */ 249 uint64_t n_pkts_cman_dropped[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; 250}; 251 252/** Queue statistics */ 253struct rte_sched_queue_stats { 254 /** Packets successfully written */ 255 uint64_t n_pkts; 256 257 /** Packets dropped */ 258 uint64_t n_pkts_dropped; 259 260 /** Packets dropped by congestion management scheme */ 261 uint64_t n_pkts_cman_dropped; 262 263 /** Bytes successfully written */ 264 uint64_t n_bytes; 265 266 /** Bytes dropped */ 267 uint64_t n_bytes_dropped; 268}; 269 270/** Port configuration parameters. */ 271struct rte_sched_port_params { 272 /** Name of the port to be associated */ 273 const char *name; 274 275 /** CPU socket ID */ 276 int socket; 277 278 /** Output port rate (measured in bytes per second) */ 279 uint64_t rate; 280 281 /** Maximum Ethernet frame size (measured in bytes). 282 * Should not include the framing overhead. 283 */ 284 uint32_t mtu; 285 286 /** Framing overhead per packet (measured in bytes) */ 287 uint32_t frame_overhead; 288 289 /** Number of subports */ 290 uint32_t n_subports_per_port; 291 292 /** subport profile table. 293 * Every pipe is configured using one of the profiles from this table. 294 */ 295 struct rte_sched_subport_profile_params *subport_profiles; 296 297 /** Profiles in the pipe profile table */ 298 uint32_t n_subport_profiles; 299 300 /** Max allowed profiles in the pipe profile table */ 301 uint32_t n_max_subport_profiles; 302 303 /** Maximum number of subport pipes. 304 * This parameter is used to reserve a fixed number of bits 305 * in struct rte_mbuf::sched.queue_id for the pipe_id for all 306 * the subports of the same port. 307 */ 308 uint32_t n_pipes_per_subport; 309}; 310 311/* 312 * Configuration 313 * 314 ***/ 315 316/** 317 * Hierarchical scheduler port configuration 318 * 319 * @param params 320 * Port scheduler configuration parameter structure 321 * @return 322 * Handle to port scheduler instance upon success or NULL otherwise. 323 */ 324struct rte_sched_port * 325rte_sched_port_config(struct rte_sched_port_params *params); 326 327/** 328 * Hierarchical scheduler port free 329 * 330 * @param port 331 * Handle to port scheduler instance. 332 * If port is NULL, no operation is performed. 333 */ 334void 335rte_sched_port_free(struct rte_sched_port *port); 336 337/** 338 * Hierarchical scheduler pipe profile add 339 * 340 * @param port 341 * Handle to port scheduler instance 342 * @param subport_id 343 * Subport ID 344 * @param params 345 * Pipe profile parameters 346 * @param pipe_profile_id 347 * Set to valid profile id when profile is added successfully. 348 * @return 349 * 0 upon success, error code otherwise 350 */ 351int 352rte_sched_subport_pipe_profile_add(struct rte_sched_port *port, 353 uint32_t subport_id, 354 struct rte_sched_pipe_params *params, 355 uint32_t *pipe_profile_id); 356 357/** 358 * @warning 359 * @b EXPERIMENTAL: this API may change without prior notice. 360 * 361 * Hierarchical scheduler subport bandwidth profile add 362 * Note that this function is safe to use in runtime for adding new 363 * subport bandwidth profile as it doesn't have any impact on hierarchical 364 * structure of the scheduler. 365 * @param port 366 * Handle to port scheduler instance 367 * @param profile 368 * Subport bandwidth profile 369 * @param subport_profile_id 370 * Subport profile id 371 * @return 372 * 0 upon success, error code otherwise 373 */ 374__rte_experimental 375int 376rte_sched_port_subport_profile_add(struct rte_sched_port *port, 377 struct rte_sched_subport_profile_params *profile, 378 uint32_t *subport_profile_id); 379 380/** 381 * Hierarchical scheduler subport configuration 382 * Note that this function is safe to use at runtime 383 * to configure subport bandwidth profile. 384 * @param port 385 * Handle to port scheduler instance 386 * @param subport_id 387 * Subport ID 388 * @param params 389 * Subport configuration parameters. Must be non-NULL 390 * for first invocation (i.e initialization) for a given 391 * subport. Ignored (recommended value is NULL) for all 392 * subsequent invocation on the same subport. 393 * @param subport_profile_id 394 * ID of subport bandwidth profile 395 * @return 396 * 0 upon success, error code otherwise 397 */ 398int 399rte_sched_subport_config(struct rte_sched_port *port, 400 uint32_t subport_id, 401 struct rte_sched_subport_params *params, 402 uint32_t subport_profile_id); 403 404/** 405 * Hierarchical scheduler pipe configuration 406 * 407 * @param port 408 * Handle to port scheduler instance 409 * @param subport_id 410 * Subport ID 411 * @param pipe_id 412 * Pipe ID within subport 413 * @param pipe_profile 414 * ID of subport-level pre-configured pipe profile 415 * @return 416 * 0 upon success, error code otherwise 417 */ 418int 419rte_sched_pipe_config(struct rte_sched_port *port, 420 uint32_t subport_id, 421 uint32_t pipe_id, 422 int32_t pipe_profile); 423 424/** 425 * Hierarchical scheduler memory footprint size per port 426 * 427 * @param port_params 428 * Port scheduler configuration parameter structure 429 * @param subport_params 430 * Array of subport parameter structures 431 * @return 432 * Memory footprint size in bytes upon success, 0 otherwise 433 */ 434uint32_t 435rte_sched_port_get_memory_footprint(struct rte_sched_port_params *port_params, 436 struct rte_sched_subport_params **subport_params); 437/* 438 * Statistics 439 * 440 ***/ 441 442/** 443 * Hierarchical scheduler subport statistics read 444 * 445 * @param port 446 * Handle to port scheduler instance 447 * @param subport_id 448 * Subport ID 449 * @param stats 450 * Pointer to pre-allocated subport statistics structure where the statistics 451 * counters should be stored 452 * @param tc_ov 453 * Pointer to pre-allocated RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE-entry array 454 * where the oversubscription status for each of the subport traffic classes 455 * should be stored. 456 * @return 457 * 0 upon success, error code otherwise 458 */ 459int 460rte_sched_subport_read_stats(struct rte_sched_port *port, 461 uint32_t subport_id, 462 struct rte_sched_subport_stats *stats, 463 uint32_t *tc_ov); 464 465/** 466 * Hierarchical scheduler queue statistics read 467 * 468 * @param port 469 * Handle to port scheduler instance 470 * @param queue_id 471 * Queue ID within port scheduler 472 * @param stats 473 * Pointer to pre-allocated subport statistics structure where the statistics 474 * counters should be stored 475 * @param qlen 476 * Pointer to pre-allocated variable where the current queue length 477 * should be stored. 478 * @return 479 * 0 upon success, error code otherwise 480 */ 481int 482rte_sched_queue_read_stats(struct rte_sched_port *port, 483 uint32_t queue_id, 484 struct rte_sched_queue_stats *stats, 485 uint16_t *qlen); 486 487/** 488 * Scheduler hierarchy path write to packet descriptor. Typically 489 * called by the packet classification stage. 490 * 491 * @param port 492 * Handle to port scheduler instance 493 * @param pkt 494 * Packet descriptor handle 495 * @param subport 496 * Subport ID 497 * @param pipe 498 * Pipe ID within subport 499 * @param traffic_class 500 * Traffic class ID within pipe (0 .. RTE_SCHED_TRAFFIC_CLASS_BE) 501 * @param queue 502 * Queue ID within pipe traffic class, 0 for high priority TCs, and 503 * 0 .. (RTE_SCHED_BE_QUEUES_PER_PIPE - 1) for best-effort TC 504 * @param color 505 * Packet color set 506 */ 507void 508rte_sched_port_pkt_write(struct rte_sched_port *port, 509 struct rte_mbuf *pkt, 510 uint32_t subport, uint32_t pipe, uint32_t traffic_class, 511 uint32_t queue, enum rte_color color); 512 513/** 514 * Scheduler hierarchy path read from packet descriptor (struct 515 * rte_mbuf). Typically called as part of the hierarchical scheduler 516 * enqueue operation. The subport, pipe, traffic class and queue 517 * parameters need to be pre-allocated by the caller. 518 * 519 * @param port 520 * Handle to port scheduler instance 521 * @param pkt 522 * Packet descriptor handle 523 * @param subport 524 * Subport ID 525 * @param pipe 526 * Pipe ID within subport 527 * @param traffic_class 528 * Traffic class ID within pipe (0 .. RTE_SCHED_TRAFFIC_CLASS_BE) 529 * @param queue 530 * Queue ID within pipe traffic class, 0 for high priority TCs, and 531 * 0 .. (RTE_SCHED_BE_QUEUES_PER_PIPE - 1) for best-effort TC 532 */ 533void 534rte_sched_port_pkt_read_tree_path(struct rte_sched_port *port, 535 const struct rte_mbuf *pkt, 536 uint32_t *subport, uint32_t *pipe, 537 uint32_t *traffic_class, uint32_t *queue); 538 539enum rte_color 540rte_sched_port_pkt_read_color(const struct rte_mbuf *pkt); 541 542/** 543 * Hierarchical scheduler port enqueue. Writes up to n_pkts to port 544 * scheduler and returns the number of packets actually written. For 545 * each packet, the port scheduler queue to write the packet to is 546 * identified by reading the hierarchy path from the packet 547 * descriptor; if the queue is full or congested and the packet is not 548 * written to the queue, then the packet is automatically dropped 549 * without any action required from the caller. 550 * 551 * @param port 552 * Handle to port scheduler instance 553 * @param pkts 554 * Array storing the packet descriptor handles 555 * @param n_pkts 556 * Number of packets to enqueue from the pkts array into the port scheduler 557 * @return 558 * Number of packets successfully enqueued 559 */ 560int 561rte_sched_port_enqueue(struct rte_sched_port *port, struct rte_mbuf **pkts, uint32_t n_pkts); 562 563/** 564 * Hierarchical scheduler port dequeue. Reads up to n_pkts from the 565 * port scheduler and stores them in the pkts array and returns the 566 * number of packets actually read. The pkts array needs to be 567 * pre-allocated by the caller with at least n_pkts entries. 568 * 569 * @param port 570 * Handle to port scheduler instance 571 * @param pkts 572 * Pre-allocated packet descriptor array where the packets dequeued 573 * from the port 574 * scheduler should be stored 575 * @param n_pkts 576 * Number of packets to dequeue from the port scheduler 577 * @return 578 * Number of packets successfully dequeued and placed in the pkts array 579 */ 580int 581rte_sched_port_dequeue(struct rte_sched_port *port, struct rte_mbuf **pkts, uint32_t n_pkts); 582 583/** 584 * Hierarchical scheduler subport traffic class 585 * oversubscription enable/disable. 586 * This function should be called at the time of subport initialization. 587 * 588 * @param port 589 * Handle to port scheduler instance 590 * @param subport_id 591 * Subport ID 592 * @param tc_ov_enable 593 * Boolean flag to enable/disable TC OV 594 * @return 595 * 0 upon success, error code otherwise 596 */ 597__rte_experimental 598int 599rte_sched_subport_tc_ov_config(struct rte_sched_port *port, uint32_t subport_id, bool tc_ov_enable); 600 601#ifdef __cplusplus 602} 603#endif 604 605#endif /* __INCLUDE_RTE_SCHED_H__ */ 606