linux/net/netfilter/ipvs/ip_vs_wrr.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * IPVS:        Weighted Round-Robin Scheduling module
   4 *
   5 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
   6 *
   7 * Changes:
   8 *     Wensong Zhang            :     changed the ip_vs_wrr_schedule to return dest
   9 *     Wensong Zhang            :     changed some comestics things for debugging
  10 *     Wensong Zhang            :     changed for the d-linked destination list
  11 *     Wensong Zhang            :     added the ip_vs_wrr_update_svc
  12 *     Julian Anastasov         :     fixed the bug of returning destination
  13 *                                    with weight 0 when all weights are zero
  14 */
  15
  16#define KMSG_COMPONENT "IPVS"
  17#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  18
  19#include <linux/module.h>
  20#include <linux/kernel.h>
  21#include <linux/slab.h>
  22#include <linux/net.h>
  23#include <linux/gcd.h>
  24
  25#include <net/ip_vs.h>
  26
  27/* The WRR algorithm depends on some caclulations:
  28 * - mw: maximum weight
  29 * - di: weight step, greatest common divisor from all weights
  30 * - cw: current required weight
  31 * As result, all weights are in the [di..mw] range with a step=di.
  32 *
  33 * First, we start with cw = mw and select dests with weight >= cw.
  34 * Then cw is reduced with di and all dests are checked again.
  35 * Last pass should be with cw = di. We have mw/di passes in total:
  36 *
  37 * pass 1: cw = max weight
  38 * pass 2: cw = max weight - di
  39 * pass 3: cw = max weight - 2 * di
  40 * ...
  41 * last pass: cw = di
  42 *
  43 * Weights are supposed to be >= di but we run in parallel with
  44 * weight changes, it is possible some dest weight to be reduced
  45 * below di, bad if it is the only available dest.
  46 *
  47 * So, we modify how mw is calculated, now it is reduced with (di - 1),
  48 * so that last cw is 1 to catch such dests with weight below di:
  49 * pass 1: cw = max weight - (di - 1)
  50 * pass 2: cw = max weight - di - (di - 1)
  51 * pass 3: cw = max weight - 2 * di - (di - 1)
  52 * ...
  53 * last pass: cw = 1
  54 *
  55 */
  56
  57/*
  58 * current destination pointer for weighted round-robin scheduling
  59 */
  60struct ip_vs_wrr_mark {
  61        struct ip_vs_dest *cl;  /* current dest or head */
  62        int cw;                 /* current weight */
  63        int mw;                 /* maximum weight */
  64        int di;                 /* decreasing interval */
  65        struct rcu_head         rcu_head;
  66};
  67
  68
  69static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
  70{
  71        struct ip_vs_dest *dest;
  72        int weight;
  73        int g = 0;
  74
  75        list_for_each_entry(dest, &svc->destinations, n_list) {
  76                weight = atomic_read(&dest->weight);
  77                if (weight > 0) {
  78                        if (g > 0)
  79                                g = gcd(weight, g);
  80                        else
  81                                g = weight;
  82                }
  83        }
  84        return g ? g : 1;
  85}
  86
  87
  88/*
  89 *    Get the maximum weight of the service destinations.
  90 */
  91static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
  92{
  93        struct ip_vs_dest *dest;
  94        int new_weight, weight = 0;
  95
  96        list_for_each_entry(dest, &svc->destinations, n_list) {
  97                new_weight = atomic_read(&dest->weight);
  98                if (new_weight > weight)
  99                        weight = new_weight;
 100        }
 101
 102        return weight;
 103}
 104
 105
 106static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
 107{
 108        struct ip_vs_wrr_mark *mark;
 109
 110        /*
 111         *    Allocate the mark variable for WRR scheduling
 112         */
 113        mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_KERNEL);
 114        if (mark == NULL)
 115                return -ENOMEM;
 116
 117        mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list);
 118        mark->di = ip_vs_wrr_gcd_weight(svc);
 119        mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1);
 120        mark->cw = mark->mw;
 121        svc->sched_data = mark;
 122
 123        return 0;
 124}
 125
 126
 127static void ip_vs_wrr_done_svc(struct ip_vs_service *svc)
 128{
 129        struct ip_vs_wrr_mark *mark = svc->sched_data;
 130
 131        /*
 132         *    Release the mark variable
 133         */
 134        kfree_rcu(mark, rcu_head);
 135}
 136
 137
 138static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc,
 139                                  struct ip_vs_dest *dest)
 140{
 141        struct ip_vs_wrr_mark *mark = svc->sched_data;
 142
 143        spin_lock_bh(&svc->sched_lock);
 144        mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list);
 145        mark->di = ip_vs_wrr_gcd_weight(svc);
 146        mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1);
 147        if (mark->cw > mark->mw || !mark->cw)
 148                mark->cw = mark->mw;
 149        else if (mark->di > 1)
 150                mark->cw = (mark->cw / mark->di) * mark->di + 1;
 151        spin_unlock_bh(&svc->sched_lock);
 152        return 0;
 153}
 154
 155
 156/*
 157 *    Weighted Round-Robin Scheduling
 158 */
 159static struct ip_vs_dest *
 160ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
 161                   struct ip_vs_iphdr *iph)
 162{
 163        struct ip_vs_dest *dest, *last, *stop = NULL;
 164        struct ip_vs_wrr_mark *mark = svc->sched_data;
 165        bool last_pass = false, restarted = false;
 166
 167        IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
 168
 169        spin_lock_bh(&svc->sched_lock);
 170        dest = mark->cl;
 171        /* No available dests? */
 172        if (mark->mw == 0)
 173                goto err_noavail;
 174        last = dest;
 175        /* Stop only after all dests were checked for weight >= 1 (last pass) */
 176        while (1) {
 177                list_for_each_entry_continue_rcu(dest,
 178                                                 &svc->destinations,
 179                                                 n_list) {
 180                        if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
 181                            atomic_read(&dest->weight) >= mark->cw)
 182                                goto found;
 183                        if (dest == stop)
 184                                goto err_over;
 185                }
 186                mark->cw -= mark->di;
 187                if (mark->cw <= 0) {
 188                        mark->cw = mark->mw;
 189                        /* Stop if we tried last pass from first dest:
 190                         * 1. last_pass: we started checks when cw > di but
 191                         *      then all dests were checked for w >= 1
 192                         * 2. last was head: the first and only traversal
 193                         *      was for weight >= 1, for all dests.
 194                         */
 195                        if (last_pass ||
 196                            &last->n_list == &svc->destinations)
 197                                goto err_over;
 198                        restarted = true;
 199                }
 200                last_pass = mark->cw <= mark->di;
 201                if (last_pass && restarted &&
 202                    &last->n_list != &svc->destinations) {
 203                        /* First traversal was for w >= 1 but only
 204                         * for dests after 'last', now do the same
 205                         * for all dests up to 'last'.
 206                         */
 207                        stop = last;
 208                }
 209        }
 210
 211found:
 212        IP_VS_DBG_BUF(6, "WRR: server %s:%u "
 213                      "activeconns %d refcnt %d weight %d\n",
 214                      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
 215                      atomic_read(&dest->activeconns),
 216                      refcount_read(&dest->refcnt),
 217                      atomic_read(&dest->weight));
 218        mark->cl = dest;
 219
 220  out:
 221        spin_unlock_bh(&svc->sched_lock);
 222        return dest;
 223
 224err_noavail:
 225        mark->cl = dest;
 226        dest = NULL;
 227        ip_vs_scheduler_err(svc, "no destination available");
 228        goto out;
 229
 230err_over:
 231        mark->cl = dest;
 232        dest = NULL;
 233        ip_vs_scheduler_err(svc, "no destination available: "
 234                            "all destinations are overloaded");
 235        goto out;
 236}
 237
 238
 239static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
 240        .name =                 "wrr",
 241        .refcnt =               ATOMIC_INIT(0),
 242        .module =               THIS_MODULE,
 243        .n_list =               LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list),
 244        .init_service =         ip_vs_wrr_init_svc,
 245        .done_service =         ip_vs_wrr_done_svc,
 246        .add_dest =             ip_vs_wrr_dest_changed,
 247        .del_dest =             ip_vs_wrr_dest_changed,
 248        .upd_dest =             ip_vs_wrr_dest_changed,
 249        .schedule =             ip_vs_wrr_schedule,
 250};
 251
 252static int __init ip_vs_wrr_init(void)
 253{
 254        return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ;
 255}
 256
 257static void __exit ip_vs_wrr_cleanup(void)
 258{
 259        unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
 260        synchronize_rcu();
 261}
 262
 263module_init(ip_vs_wrr_init);
 264module_exit(ip_vs_wrr_cleanup);
 265MODULE_LICENSE("GPL");
 266