// SPDX-License-Identifier: GPL-2.0
/* IPVS:        Maglev Hashing scheduling module
 *
 * Authors:     Inju Song <inju.song@navercorp.com>
 *
 */

/* The mh algorithm assigns each destination a preference list over all
 * lookup table positions and populates the table with each destination's
 * most-preferred available position.  A destination is then selected by
 * hashing the source IP address and looking the key up in the lookup table.
 *
 * The algorithm is detailed in:
 * [3.4 Consistent Hashing]
 * https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf
 *
 */
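
/* Worked example of the population step (sizes and hash values below are
 * made up for illustration; the real table size comes from primes[]): with a
 * 7-entry table and two dests A and B whose hashes give A offset=3, skip=4
 * and B offset=0, skip=2 (turns=1 each), filling alternates between their
 * preference lists:
 *
 *   pass 1: A takes slot 3, B takes slot 0
 *   pass 2: A retries 3 (taken), walks 3+4=7%7=0 (taken), then 4 -> takes 4;
 *           B retries 0 (taken), walks to 2 -> takes 2
 *   pass 3: A takes 1, B takes 6; pass 4: A takes 5 and the table is full
 *
 * Each dest ends up with a share of the table roughly proportional to its
 * turns value, and most slots keep their owner when a dest is added or
 * removed.
 */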

#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/ip.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>

#include <net/ip_vs.h>

#include <linux/siphash.h>
#include <linux/bitops.h>
#include <linux/gcd.h>

#define IP_VS_SVC_F_SCHED_MH_FALLBACK   IP_VS_SVC_F_SCHED1 /* MH fallback */
#define IP_VS_SVC_F_SCHED_MH_PORT       IP_VS_SVC_F_SCHED2 /* MH use port */

struct ip_vs_mh_lookup {
        struct ip_vs_dest __rcu *dest;  /* real server (cache) */
};

struct ip_vs_mh_dest_setup {
        unsigned int    offset; /* starting offset */
        unsigned int    skip;   /* skip */
        unsigned int    perm;   /* next_offset */
        int             turns;  /* weight / gcd() and rshift */
};

/* Available prime numbers for MH table */
static int primes[] = {251, 509, 1021, 2039, 4093,
                       8191, 16381, 32749, 65521, 131071};

/* For IPVS MH entry hash table */
#ifndef CONFIG_IP_VS_MH_TAB_INDEX
#define CONFIG_IP_VS_MH_TAB_INDEX       12
#endif
#define IP_VS_MH_TAB_BITS               (CONFIG_IP_VS_MH_TAB_INDEX / 2)
#define IP_VS_MH_TAB_INDEX              (CONFIG_IP_VS_MH_TAB_INDEX - 8)
#define IP_VS_MH_TAB_SIZE               primes[IP_VS_MH_TAB_INDEX]
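
/* For example, with the fallback CONFIG_IP_VS_MH_TAB_INDEX of 12 above,
 * IP_VS_MH_TAB_INDEX = 12 - 8 = 4, so IP_VS_MH_TAB_SIZE = primes[4] = 4093
 * lookup entries, and IP_VS_MH_TAB_BITS = 12 / 2 = 6 is used by
 * ip_vs_mh_shift_weight() below to bound the per-dest "turns" values.
 */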

struct ip_vs_mh_state {
        struct rcu_head                 rcu_head;
        struct ip_vs_mh_lookup          *lookup;
        struct ip_vs_mh_dest_setup      *dest_setup;
        hsiphash_key_t                  hash1, hash2;
        int                             gcd;
        int                             rshift;
};

static inline void generate_hash_secret(hsiphash_key_t *hash1,
                                        hsiphash_key_t *hash2)
{
        hash1->key[0] = 2654435761UL;
        hash1->key[1] = 2654435761UL;

        hash2->key[0] = 2654446892UL;
        hash2->key[1] = 2654446892UL;
}

/* Helper function to determine if server is unavailable */
static inline bool is_unavailable(struct ip_vs_dest *dest)
{
        return atomic_read(&dest->weight) <= 0 ||
               dest->flags & IP_VS_DEST_F_OVERLOAD;
}

/* Returns hash value for IPVS MH entry */
static inline unsigned int
ip_vs_mh_hashkey(int af, const union nf_inet_addr *addr,
                 __be16 port, hsiphash_key_t *key, unsigned int offset)
{
        unsigned int v;
        __be32 addr_fold = addr->ip;

#ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6)
                addr_fold = addr->ip6[0] ^ addr->ip6[1] ^
                            addr->ip6[2] ^ addr->ip6[3];
#endif
        v = (offset + ntohs(port) + ntohl(addr_fold));
        return hsiphash(&v, sizeof(v), key);
}
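
/* Note on usage (descriptive only): lookups pass offset == 0, so a flow maps
 * to a fixed bucket; ip_vs_mh_get_fallback() below reuses this helper with a
 * non-zero offset to rehash into alternate buckets when the primary server
 * is unavailable.  Because generate_hash_secret() installs fixed keys, the
 * mapping is deterministic and identical across module reloads.
 */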

/* Reset all the hash buckets of the specified table. */
static void ip_vs_mh_reset(struct ip_vs_mh_state *s)
{
        int i;
        struct ip_vs_mh_lookup *l;
        struct ip_vs_dest *dest;

        l = &s->lookup[0];
        for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
                dest = rcu_dereference_protected(l->dest, 1);
                if (dest) {
                        ip_vs_dest_put(dest);
                        RCU_INIT_POINTER(l->dest, NULL);
                }
                l++;
        }
}

static int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
                              struct ip_vs_service *svc)
{
        struct list_head *p;
        struct ip_vs_mh_dest_setup *ds;
        struct ip_vs_dest *dest;
        int lw;

        /* If gcd is smaller than 1, the number of dests is zero or
         * all dests have a last_weight of zero.  So, skip the
         * permutation for the dests.
         */
        if (s->gcd < 1)
                return 0;

        /* Set dest_setup for the dests permutation */
        p = &svc->destinations;
        ds = &s->dest_setup[0];
        while ((p = p->next) != &svc->destinations) {
                dest = list_entry(p, struct ip_vs_dest, n_list);

                ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr,
                                              dest->port, &s->hash1, 0) %
                                              IP_VS_MH_TAB_SIZE;
                ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
                                            dest->port, &s->hash2, 0) %
                                            (IP_VS_MH_TAB_SIZE - 1) + 1;
                ds->perm = ds->offset;

                lw = atomic_read(&dest->last_weight);
                ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0);
                ds++;
        }

        return 0;
}
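
/* Worked example for the "turns" computed above (weights chosen for
 * illustration): with last_weights {2, 4, 6}, gcd is 2, the reduced maximum
 * weight is 3 and rshift stays 0, so turns become {1, 2, 3} and
 * ip_vs_mh_populate() hands the three dests roughly 1/6, 2/6 and 3/6 of the
 * lookup table.  A dest whose reduced weight shifts down to 0 still gets one
 * turn via the (lw != 0) fallback, as long as its weight is non-zero.
 */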

static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
                             struct ip_vs_service *svc)
{
        int n, c, dt_count;
        unsigned long *table;
        struct list_head *p;
        struct ip_vs_mh_dest_setup *ds;
        struct ip_vs_dest *dest, *new_dest;

        /* If gcd is smaller than 1, the number of dests is zero or
         * all dests have a last_weight of zero.  So, skip the
         * population for the dests and reset the lookup table.
         */
        if (s->gcd < 1) {
                ip_vs_mh_reset(s);
                return 0;
        }

        table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE),
                        sizeof(unsigned long), GFP_KERNEL);
        if (!table)
                return -ENOMEM;

        p = &svc->destinations;
        n = 0;
        dt_count = 0;
        while (n < IP_VS_MH_TAB_SIZE) {
                if (p == &svc->destinations)
                        p = p->next;

                ds = &s->dest_setup[0];
                while (p != &svc->destinations) {
                        /* Ignore added server with zero weight */
                        if (ds->turns < 1) {
                                p = p->next;
                                ds++;
                                continue;
                        }

                        c = ds->perm;
                        while (test_bit(c, table)) {
                                /* Add skip, mod IP_VS_MH_TAB_SIZE */
                                ds->perm += ds->skip;
                                if (ds->perm >= IP_VS_MH_TAB_SIZE)
                                        ds->perm -= IP_VS_MH_TAB_SIZE;
                                c = ds->perm;
                        }

                        __set_bit(c, table);

                        dest = rcu_dereference_protected(s->lookup[c].dest, 1);
                        new_dest = list_entry(p, struct ip_vs_dest, n_list);
                        if (dest != new_dest) {
                                if (dest)
                                        ip_vs_dest_put(dest);
                                ip_vs_dest_hold(new_dest);
                                RCU_INIT_POINTER(s->lookup[c].dest, new_dest);
                        }

                        if (++n == IP_VS_MH_TAB_SIZE)
                                goto out;

                        if (++dt_count >= ds->turns) {
                                dt_count = 0;
                                p = p->next;
                                ds++;
                        }
                }
        }

out:
        kfree(table);
        return 0;
}
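
/* A short note on the loop above: each pass over the destination list lets a
 * dest claim up to ds->turns free slots before dt_count is reset and the next
 * dest is visited.  With two dests whose turns are 1 and 2 (illustrative
 * figures), every pass fills three slots, one for the first dest and two for
 * the second, so the second dest ends up owning roughly two thirds of the
 * lookup table once all IP_VS_MH_TAB_SIZE entries are taken.
 */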

/* Get ip_vs_dest associated with supplied parameters. */
static inline struct ip_vs_dest *
ip_vs_mh_get(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
             const union nf_inet_addr *addr, __be16 port)
{
        unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0)
                                             % IP_VS_MH_TAB_SIZE;
        struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest);

        return (!dest || is_unavailable(dest)) ? NULL : dest;
}

/* As ip_vs_mh_get, but with fallback if selected server is unavailable */
static inline struct ip_vs_dest *
ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
                      const union nf_inet_addr *addr, __be16 port)
{
        unsigned int offset, roffset;
        unsigned int hash, ihash;
        struct ip_vs_dest *dest;

        /* First try the dest it's supposed to go to */
        ihash = ip_vs_mh_hashkey(svc->af, addr, port,
                                 &s->hash1, 0) % IP_VS_MH_TAB_SIZE;
        dest = rcu_dereference(s->lookup[ihash].dest);
        if (!dest)
                return NULL;
        if (!is_unavailable(dest))
                return dest;

        IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%u, reselecting",
                      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));

        /* If the original dest is unavailable, loop around the table
         * starting from ihash to find a new dest
         */
        for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) {
                roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE;
                hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1,
                                        roffset) % IP_VS_MH_TAB_SIZE;
                dest = rcu_dereference(s->lookup[hash].dest);
                if (!dest)
                        break;
                if (!is_unavailable(dest))
                        return dest;
                IP_VS_DBG_BUF(6,
                              "MH: selected unavailable server %s:%u (offset %u), reselecting",
                              IP_VS_DBG_ADDR(dest->af, &dest->addr),
                              ntohs(dest->port), roffset);
        }

        return NULL;
}
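
/* Fallback behaviour in brief: the loop above retries with
 * roffset = ihash, ihash + 1, ..., feeding each value into the hash as the
 * offset, so the probe sequence jumps pseudo-randomly around the table rather
 * than scanning neighbouring buckets.  Only flows whose primary bucket points
 * at an unavailable server are remapped; all other flows keep their original
 * destination.
 */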

/* Assign all the hash buckets of the specified table for the service. */
static int ip_vs_mh_reassign(struct ip_vs_mh_state *s,
                             struct ip_vs_service *svc)
{
        int ret;

        if (svc->num_dests > IP_VS_MH_TAB_SIZE)
                return -EINVAL;

        if (svc->num_dests >= 1) {
                s->dest_setup = kcalloc(svc->num_dests,
                                        sizeof(struct ip_vs_mh_dest_setup),
                                        GFP_KERNEL);
                if (!s->dest_setup)
                        return -ENOMEM;
        }

        ip_vs_mh_permutate(s, svc);

        ret = ip_vs_mh_populate(s, svc);
        if (ret < 0)
                goto out;

        IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%u\n",
                      IP_VS_DBG_ADDR(svc->af, &svc->addr),
                      ntohs(svc->port));

out:
        if (svc->num_dests >= 1) {
                kfree(s->dest_setup);
                s->dest_setup = NULL;
        }
        return ret;
}

static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc)
{
        struct ip_vs_dest *dest;
        int weight;
        int g = 0;

        list_for_each_entry(dest, &svc->destinations, n_list) {
                weight = atomic_read(&dest->last_weight);
                if (weight > 0) {
                        if (g > 0)
                                g = gcd(weight, g);
                        else
                                g = weight;
                }
        }
        return g;
}

/* To avoid assigning a huge number of turns in the MH table,
 * calculate a shift value from the gcd-reduced weights.
 */
static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd)
{
        struct ip_vs_dest *dest;
        int new_weight, weight = 0;
        int mw, shift;

        /* If gcd is smaller than 1, the number of dests is zero or
         * all dests have a last_weight of zero.  So, return a
         * shift value of zero.
         */
        if (gcd < 1)
                return 0;

        list_for_each_entry(dest, &svc->destinations, n_list) {
                new_weight = atomic_read(&dest->last_weight);
                if (new_weight > weight)
                        weight = new_weight;
        }

        /* Because gcd is greater than zero here, the maximum
         * weight is also greater than zero, so mw is at least 1.
         */
        mw = weight / gcd;

        /* shift = occupied bits of weight/gcd - MH highest bits */
        shift = fls(mw) - IP_VS_MH_TAB_BITS;
        return (shift >= 0) ? shift : 0;
}
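
/* Worked example for the two helpers above (weights chosen for illustration):
 * with last_weights {100, 200, 300}, the gcd is 100, mw = 300 / 100 = 3 and
 * fls(3) = 2 is below IP_VS_MH_TAB_BITS, so no shift is needed.  With weights
 * {1, 1000}, the gcd is 1 and mw = 1000; fls(1000) = 10, so with 6 table bits
 * the shift is 4 and the largest turns value becomes 1000 >> 4 = 62, which
 * bounds how many slots a single dest claims per populate pass (very small
 * weights then fall back to a single turn).
 */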

static void ip_vs_mh_state_free(struct rcu_head *head)
{
        struct ip_vs_mh_state *s;

        s = container_of(head, struct ip_vs_mh_state, rcu_head);
        kfree(s->lookup);
        kfree(s);
}

static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
{
        int ret;
        struct ip_vs_mh_state *s;

        /* Allocate the MH table for this service */
        s = kzalloc(sizeof(*s), GFP_KERNEL);
        if (!s)
                return -ENOMEM;

        s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup),
                            GFP_KERNEL);
        if (!s->lookup) {
                kfree(s);
                return -ENOMEM;
        }

        generate_hash_secret(&s->hash1, &s->hash2);
        s->gcd = ip_vs_mh_gcd_weight(svc);
        s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);

        IP_VS_DBG(6,
                  "MH lookup table (memory=%zdbytes) allocated for current service\n",
                  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);

        /* Assign the lookup table with current dests */
        ret = ip_vs_mh_reassign(s, svc);
        if (ret < 0) {
                ip_vs_mh_reset(s);
                ip_vs_mh_state_free(&s->rcu_head);
                return ret;
        }

        /* No more failures, attach state */
        svc->sched_data = s;
        return 0;
}

static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
{
        struct ip_vs_mh_state *s = svc->sched_data;

        /* Got to clean up lookup entry here */
        ip_vs_mh_reset(s);

        call_rcu(&s->rcu_head, ip_vs_mh_state_free);
        IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n",
                  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
}

static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
                                 struct ip_vs_dest *dest)
{
        struct ip_vs_mh_state *s = svc->sched_data;

        s->gcd = ip_vs_mh_gcd_weight(svc);
        s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);

        /* Assign the lookup table with the updated service */
        return ip_vs_mh_reassign(s, svc);
}

/* Helper function to get port number */
static inline __be16
ip_vs_mh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
{
        __be16 _ports[2], *ports;

        /* At this point we know that we have a valid packet of some kind.
         * Because ICMP packets are only guaranteed to have the first 8
         * bytes, let's just grab the ports.  Fortunately they're in the
         * same position for all three of the protocols we care about.
         */
        switch (iph->protocol) {
        case IPPROTO_TCP:
        case IPPROTO_UDP:
        case IPPROTO_SCTP:
                ports = skb_header_pointer(skb, iph->len, sizeof(_ports),
                                           &_ports);
                if (unlikely(!ports))
                        return 0;

                if (likely(!ip_vs_iph_inverse(iph)))
                        return ports[0];
                else
                        return ports[1];
        default:
                return 0;
        }
}

/* Maglev Hashing scheduling */
static struct ip_vs_dest *
ip_vs_mh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
                  struct ip_vs_iphdr *iph)
{
        struct ip_vs_dest *dest;
        struct ip_vs_mh_state *s;
        __be16 port = 0;
        const union nf_inet_addr *hash_addr;

        hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr;

        IP_VS_DBG(6, "%s : Scheduling...\n", __func__);

        if (svc->flags & IP_VS_SVC_F_SCHED_MH_PORT)
                port = ip_vs_mh_get_port(skb, iph);

        s = (struct ip_vs_mh_state *)svc->sched_data;

        if (svc->flags & IP_VS_SVC_F_SCHED_MH_FALLBACK)
                dest = ip_vs_mh_get_fallback(svc, s, hash_addr, port);
        else
                dest = ip_vs_mh_get(svc, s, hash_addr, port);

        if (!dest) {
                ip_vs_scheduler_err(svc, "no destination available");
                return NULL;
        }

        IP_VS_DBG_BUF(6, "MH: source IP address %s:%u --> server %s:%u\n",
                      IP_VS_DBG_ADDR(svc->af, hash_addr),
                      ntohs(port),
                      IP_VS_DBG_ADDR(dest->af, &dest->addr),
                      ntohs(dest->port));

        return dest;
}

/* IPVS MH Scheduler structure */
static struct ip_vs_scheduler ip_vs_mh_scheduler = {
        .name =                 "mh",
        .refcnt =               ATOMIC_INIT(0),
        .module =               THIS_MODULE,
        .n_list  =              LIST_HEAD_INIT(ip_vs_mh_scheduler.n_list),
        .init_service =         ip_vs_mh_init_svc,
        .done_service =         ip_vs_mh_done_svc,
        .add_dest =             ip_vs_mh_dest_changed,
        .del_dest =             ip_vs_mh_dest_changed,
        .upd_dest =             ip_vs_mh_dest_changed,
        .schedule =             ip_vs_mh_schedule,
};
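
/* Usage sketch (assumption: an ipvsadm build that knows the mh scheduler and
 * its named scheduler flags; the VIP below is made up):
 *
 *   ipvsadm -A -t 10.0.0.1:80 -s mh --sched-flags mh-fallback,mh-port
 *
 * Builds without the named mh flags can set the same bits with the generic
 * names flag-1 and flag-2, since IP_VS_SVC_F_SCHED_MH_FALLBACK and
 * IP_VS_SVC_F_SCHED_MH_PORT alias IP_VS_SVC_F_SCHED1 and IP_VS_SVC_F_SCHED2
 * above.
 */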

static int __init ip_vs_mh_init(void)
{
        return register_ip_vs_scheduler(&ip_vs_mh_scheduler);
}

static void __exit ip_vs_mh_cleanup(void)
{
        unregister_ip_vs_scheduler(&ip_vs_mh_scheduler);
        rcu_barrier();
}

module_init(ip_vs_mh_init);
module_exit(ip_vs_mh_cleanup);
MODULE_DESCRIPTION("Maglev hashing ipvs scheduler");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Inju Song <inju.song@navercorp.com>");