LXR linux/net/netfilter/ipvs/ip_vs

   1/*
   2 * IPVS:        Shortest Expected Delay scheduling module
   3 *
   4 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
   5 *
   6 *              This program is free software; you can redistribute it and/or
   7 *              modify it under the terms of the GNU General Public License
   8 *              as published by the Free Software Foundation; either version
   9 *              2 of the License, or (at your option) any later version.
  10 *
  11 * Changes:
  12 *
  13 */
  14
  15/*
  16 * The SED algorithm attempts to minimize each job's expected delay until
  17 * completion. The expected delay that the job will experience is
  18 * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
  19 * jobs on the ith server and Ui is the fixed service rate (weight) of
  20 * the ith server. The SED algorithm adopts a greedy policy that each does
  21 * what is in its own best interest, i.e. to join the queue which would
  22 * minimize its expected delay of completion.
  23 *
  24 * See the following paper for more information:
  25 * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
  26 * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
  27 * pages 986-994, 1988.
  28 *
  29 * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me.
  30 *
  31 * The difference between SED and WLC is that SED includes the incoming
  32 * job in the cost function (the increment of 1). SED may outperform
  33 * WLC, while scheduling big jobs under larger heterogeneous systems
  34 * (the server weight varies a lot).
  35 *
  36 */
  37
  38#define KMSG_COMPONENT "IPVS"
  39#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  40
  41#include <linux/module.h>
  42#include <linux/kernel.h>
  43
  44#include <net/ip_vs.h>
  45
  46
  47static inline int
  48ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
  49{
  50        /*
  51         * We only use the active connection number in the cost
  52         * calculation here.
  53         */
  54        return atomic_read(&dest->activeconns) + 1;
  55}
  56
  57
  58/*
  59 *      Weighted Least Connection scheduling
  60 */
  61static struct ip_vs_dest *
  62ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
  63                   struct ip_vs_iphdr *iph)
  64{
  65        struct ip_vs_dest *dest, *least;
  66        int loh, doh;
  67
  68        IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
  69
  70        /*
  71         * We calculate the load of each dest server as follows:
  72         *      (server expected overhead) / dest->weight
  73         *
  74         * Remember -- no floats in kernel mode!!!
  75         * The comparison of h1*w2 > h2*w1 is equivalent to that of
  76         *                h1/w1 > h2/w2
  77         * if every weight is larger than zero.
  78         *
  79         * The server with weight=0 is quiesced and will not receive any
  80         * new connections.
  81         */
  82
  83        list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
  84                if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
  85                    atomic_read(&dest->weight) > 0) {
  86                        least = dest;
  87                        loh = ip_vs_sed_dest_overhead(least);
  88                        goto nextstage;
  89                }
  90        }
  91        ip_vs_scheduler_err(svc, "no destination available");
  92        return NULL;
  93
  94        /*
  95         *    Find the destination with the least load.
  96         */
  97  nextstage:
  98        list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
  99                if (dest->flags & IP_VS_DEST_F_OVERLOAD)
 100                        continue;
 101                doh = ip_vs_sed_dest_overhead(dest);
 102                if ((__s64)loh * atomic_read(&dest->weight) >
 103                    (__s64)doh * atomic_read(&least->weight)) {
 104                        least = dest;
 105                        loh = doh;
 106                }
 107        }
 108
 109        IP_VS_DBG_BUF(6, "SED: server %s:%u "
 110                      "activeconns %d refcnt %d weight %d overhead %d\n",
 111                      IP_VS_DBG_ADDR(least->af, &least->addr),
 112                      ntohs(least->port),
 113                      atomic_read(&least->activeconns),
 114                      atomic_read(&least->refcnt),
 115                      atomic_read(&least->weight), loh);
 116
 117        return least;
 118}
 119
 120
 121static struct ip_vs_scheduler ip_vs_sed_scheduler =
 122{
 123        .name =                 "sed",
 124        .refcnt =               ATOMIC_INIT(0),
 125        .module =               THIS_MODULE,
 126        .n_list =               LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list),
 127        .schedule =             ip_vs_sed_schedule,
 128};
 129
 130
 131static int __init ip_vs_sed_init(void)
 132{
 133        return register_ip_vs_scheduler(&ip_vs_sed_scheduler);
 134}
 135
 136static void __exit ip_vs_sed_cleanup(void)
 137{
 138        unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
 139        synchronize_rcu();
 140}
 141
 142module_init(ip_vs_sed_init);
 143module_exit(ip_vs_sed_cleanup);
 144MODULE_LICENSE("GPL");
 145