1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/* Prefix every pr_*() message emitted by this file with "IPVS: ".
 * Must stay ahead of the #include lines so the headers pick it up.
 */
#define KMSG_COMPONENT "IPVS"
#define pr_fmt(msg) KMSG_COMPONENT ": " msg
22
23#include <linux/ip.h>
24#include <linux/slab.h>
25#include <linux/module.h>
26#include <linux/kernel.h>
27#include <linux/skbuff.h>
28
29#include <net/ip_vs.h>
30
31#include <linux/siphash.h>
32#include <linux/bitops.h>
33#include <linux/gcd.h>
34
/* Scheduler-specific service flags, mapped onto the generic SCHED1/SCHED2
 * bits:
 *   FALLBACK - look for another destination when the selected one is
 *              unavailable (see ip_vs_mh_get_fallback())
 *   PORT     - add the port to the hash input (see ip_vs_mh_schedule())
 */
#define IP_VS_SVC_F_SCHED_MH_FALLBACK	IP_VS_SVC_F_SCHED1
#define IP_VS_SVC_F_SCHED_MH_PORT	IP_VS_SVC_F_SCHED2
37
38struct ip_vs_mh_lookup {
39 struct ip_vs_dest __rcu *dest;
40};
41
/* Per-destination scratch data used while (re)building the lookup table. */
struct ip_vs_mh_dest_setup {
	unsigned int	offset,	/* first slot tried: hash1 % table size */
			skip,	/* step between tries: hash2 % (size - 1) + 1 */
			perm;	/* slot currently being tried */
	int		turns;	/* consecutive slots claimed per visit,
				 * derived from the destination weight
				 */
};
48
49
/* Candidate lookup table sizes: primes just below 2^8 .. 2^17.
 * Indexed by IP_VS_MH_TAB_INDEX; the table is never written to, so keep
 * it const.
 */
static const int primes[] = {251, 509, 1021, 2039, 4093,
			     8191, 16381, 32749, 65521, 131071};
52
53
/* Lookup table geometry, all derived from CONFIG_IP_VS_MH_TAB_INDEX
 * (12 when the configuration does not provide it):
 *   IP_VS_MH_TAB_BITS  - bits of weight/gcd kept by ip_vs_mh_shift_weight()
 *   IP_VS_MH_TAB_INDEX - index into primes[]
 *   IP_VS_MH_TAB_SIZE  - number of slots in the lookup table
 */
#ifndef CONFIG_IP_VS_MH_TAB_INDEX
#define CONFIG_IP_VS_MH_TAB_INDEX	12
#endif
#define IP_VS_MH_TAB_BITS		(CONFIG_IP_VS_MH_TAB_INDEX / 2)
#define IP_VS_MH_TAB_INDEX		(CONFIG_IP_VS_MH_TAB_INDEX - 8)
#define IP_VS_MH_TAB_SIZE		(primes[IP_VS_MH_TAB_INDEX])
60
61struct ip_vs_mh_state {
62 struct rcu_head rcu_head;
63 struct ip_vs_mh_lookup *lookup;
64 struct ip_vs_mh_dest_setup *dest_setup;
65 hsiphash_key_t hash1, hash2;
66 int gcd;
67 int rshift;
68};
69
70static inline void generate_hash_secret(hsiphash_key_t *hash1,
71 hsiphash_key_t *hash2)
72{
73 hash1->key[0] = 2654435761UL;
74 hash1->key[1] = 2654435761UL;
75
76 hash2->key[0] = 2654446892UL;
77 hash2->key[1] = 2654446892UL;
78}
79
80
81static inline bool is_unavailable(struct ip_vs_dest *dest)
82{
83 return atomic_read(&dest->weight) <= 0 ||
84 dest->flags & IP_VS_DEST_F_OVERLOAD;
85}
86
87
88static inline unsigned int
89ip_vs_mh_hashkey(int af, const union nf_inet_addr *addr,
90 __be16 port, hsiphash_key_t *key, unsigned int offset)
91{
92 unsigned int v;
93 __be32 addr_fold = addr->ip;
94
95#ifdef CONFIG_IP_VS_IPV6
96 if (af == AF_INET6)
97 addr_fold = addr->ip6[0] ^ addr->ip6[1] ^
98 addr->ip6[2] ^ addr->ip6[3];
99#endif
100 v = (offset + ntohs(port) + ntohl(addr_fold));
101 return hsiphash(&v, sizeof(v), key);
102}
103
104
105static void ip_vs_mh_reset(struct ip_vs_mh_state *s)
106{
107 int i;
108 struct ip_vs_mh_lookup *l;
109 struct ip_vs_dest *dest;
110
111 l = &s->lookup[0];
112 for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
113 dest = rcu_dereference_protected(l->dest, 1);
114 if (dest) {
115 ip_vs_dest_put(dest);
116 RCU_INIT_POINTER(l->dest, NULL);
117 }
118 l++;
119 }
120}
121
122static int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
123 struct ip_vs_service *svc)
124{
125 struct list_head *p;
126 struct ip_vs_mh_dest_setup *ds;
127 struct ip_vs_dest *dest;
128 int lw;
129
130
131
132
133
134 if (s->gcd < 1)
135 return 0;
136
137
138 p = &svc->destinations;
139 ds = &s->dest_setup[0];
140 while ((p = p->next) != &svc->destinations) {
141 dest = list_entry(p, struct ip_vs_dest, n_list);
142
143 ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr,
144 dest->port, &s->hash1, 0) %
145 IP_VS_MH_TAB_SIZE;
146 ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
147 dest->port, &s->hash2, 0) %
148 (IP_VS_MH_TAB_SIZE - 1) + 1;
149 ds->perm = ds->offset;
150
151 lw = atomic_read(&dest->last_weight);
152 ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0);
153 ds++;
154 }
155
156 return 0;
157}
158
159static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
160 struct ip_vs_service *svc)
161{
162 int n, c, dt_count;
163 unsigned long *table;
164 struct list_head *p;
165 struct ip_vs_mh_dest_setup *ds;
166 struct ip_vs_dest *dest, *new_dest;
167
168
169
170
171
172 if (s->gcd < 1) {
173 ip_vs_mh_reset(s);
174 return 0;
175 }
176
177 table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE),
178 sizeof(unsigned long), GFP_KERNEL);
179 if (!table)
180 return -ENOMEM;
181
182 p = &svc->destinations;
183 n = 0;
184 dt_count = 0;
185 while (n < IP_VS_MH_TAB_SIZE) {
186 if (p == &svc->destinations)
187 p = p->next;
188
189 ds = &s->dest_setup[0];
190 while (p != &svc->destinations) {
191
192 if (ds->turns < 1) {
193 p = p->next;
194 ds++;
195 continue;
196 }
197
198 c = ds->perm;
199 while (test_bit(c, table)) {
200
201 ds->perm += ds->skip;
202 if (ds->perm >= IP_VS_MH_TAB_SIZE)
203 ds->perm -= IP_VS_MH_TAB_SIZE;
204 c = ds->perm;
205 }
206
207 __set_bit(c, table);
208
209 dest = rcu_dereference_protected(s->lookup[c].dest, 1);
210 new_dest = list_entry(p, struct ip_vs_dest, n_list);
211 if (dest != new_dest) {
212 if (dest)
213 ip_vs_dest_put(dest);
214 ip_vs_dest_hold(new_dest);
215 RCU_INIT_POINTER(s->lookup[c].dest, new_dest);
216 }
217
218 if (++n == IP_VS_MH_TAB_SIZE)
219 goto out;
220
221 if (++dt_count >= ds->turns) {
222 dt_count = 0;
223 p = p->next;
224 ds++;
225 }
226 }
227 }
228
229out:
230 kfree(table);
231 return 0;
232}
233
234
235static inline struct ip_vs_dest *
236ip_vs_mh_get(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
237 const union nf_inet_addr *addr, __be16 port)
238{
239 unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0)
240 % IP_VS_MH_TAB_SIZE;
241 struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest);
242
243 return (!dest || is_unavailable(dest)) ? NULL : dest;
244}
245
246
247static inline struct ip_vs_dest *
248ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
249 const union nf_inet_addr *addr, __be16 port)
250{
251 unsigned int offset, roffset;
252 unsigned int hash, ihash;
253 struct ip_vs_dest *dest;
254
255
256 ihash = ip_vs_mh_hashkey(svc->af, addr, port,
257 &s->hash1, 0) % IP_VS_MH_TAB_SIZE;
258 dest = rcu_dereference(s->lookup[ihash].dest);
259 if (!dest)
260 return NULL;
261 if (!is_unavailable(dest))
262 return dest;
263
264 IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%u, reselecting",
265 IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));
266
267
268
269
270 for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) {
271 roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE;
272 hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1,
273 roffset) % IP_VS_MH_TAB_SIZE;
274 dest = rcu_dereference(s->lookup[hash].dest);
275 if (!dest)
276 break;
277 if (!is_unavailable(dest))
278 return dest;
279 IP_VS_DBG_BUF(6,
280 "MH: selected unavailable server %s:%u (offset %u), reselecting",
281 IP_VS_DBG_ADDR(dest->af, &dest->addr),
282 ntohs(dest->port), roffset);
283 }
284
285 return NULL;
286}
287
288
289static int ip_vs_mh_reassign(struct ip_vs_mh_state *s,
290 struct ip_vs_service *svc)
291{
292 int ret;
293
294 if (svc->num_dests > IP_VS_MH_TAB_SIZE)
295 return -EINVAL;
296
297 if (svc->num_dests >= 1) {
298 s->dest_setup = kcalloc(svc->num_dests,
299 sizeof(struct ip_vs_mh_dest_setup),
300 GFP_KERNEL);
301 if (!s->dest_setup)
302 return -ENOMEM;
303 }
304
305 ip_vs_mh_permutate(s, svc);
306
307 ret = ip_vs_mh_populate(s, svc);
308 if (ret < 0)
309 goto out;
310
311 IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%u\n",
312 IP_VS_DBG_ADDR(svc->af, &svc->addr),
313 ntohs(svc->port));
314
315out:
316 if (svc->num_dests >= 1) {
317 kfree(s->dest_setup);
318 s->dest_setup = NULL;
319 }
320 return ret;
321}
322
323static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc)
324{
325 struct ip_vs_dest *dest;
326 int weight;
327 int g = 0;
328
329 list_for_each_entry(dest, &svc->destinations, n_list) {
330 weight = atomic_read(&dest->last_weight);
331 if (weight > 0) {
332 if (g > 0)
333 g = gcd(weight, g);
334 else
335 g = weight;
336 }
337 }
338 return g;
339}
340
341
342
343
344static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd)
345{
346 struct ip_vs_dest *dest;
347 int new_weight, weight = 0;
348 int mw, shift;
349
350
351
352
353
354 if (gcd < 1)
355 return 0;
356
357 list_for_each_entry(dest, &svc->destinations, n_list) {
358 new_weight = atomic_read(&dest->last_weight);
359 if (new_weight > weight)
360 weight = new_weight;
361 }
362
363
364
365
366 mw = weight / gcd;
367
368
369 shift = fls(mw) - IP_VS_MH_TAB_BITS;
370 return (shift >= 0) ? shift : 0;
371}
372
373static void ip_vs_mh_state_free(struct rcu_head *head)
374{
375 struct ip_vs_mh_state *s;
376
377 s = container_of(head, struct ip_vs_mh_state, rcu_head);
378 kfree(s->lookup);
379 kfree(s);
380}
381
382static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
383{
384 int ret;
385 struct ip_vs_mh_state *s;
386
387
388 s = kzalloc(sizeof(*s), GFP_KERNEL);
389 if (!s)
390 return -ENOMEM;
391
392 s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup),
393 GFP_KERNEL);
394 if (!s->lookup) {
395 kfree(s);
396 return -ENOMEM;
397 }
398
399 generate_hash_secret(&s->hash1, &s->hash2);
400 s->gcd = ip_vs_mh_gcd_weight(svc);
401 s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
402
403 IP_VS_DBG(6,
404 "MH lookup table (memory=%zdbytes) allocated for current service\n",
405 sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
406
407
408 ret = ip_vs_mh_reassign(s, svc);
409 if (ret < 0) {
410 ip_vs_mh_reset(s);
411 ip_vs_mh_state_free(&s->rcu_head);
412 return ret;
413 }
414
415
416 svc->sched_data = s;
417 return 0;
418}
419
420static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
421{
422 struct ip_vs_mh_state *s = svc->sched_data;
423
424
425 ip_vs_mh_reset(s);
426
427 call_rcu(&s->rcu_head, ip_vs_mh_state_free);
428 IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n",
429 sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
430}
431
432static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
433 struct ip_vs_dest *dest)
434{
435 struct ip_vs_mh_state *s = svc->sched_data;
436
437 s->gcd = ip_vs_mh_gcd_weight(svc);
438 s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
439
440
441 return ip_vs_mh_reassign(s, svc);
442}
443
444
445static inline __be16
446ip_vs_mh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
447{
448 __be16 _ports[2], *ports;
449
450
451
452
453
454
455 switch (iph->protocol) {
456 case IPPROTO_TCP:
457 case IPPROTO_UDP:
458 case IPPROTO_SCTP:
459 ports = skb_header_pointer(skb, iph->len, sizeof(_ports),
460 &_ports);
461 if (unlikely(!ports))
462 return 0;
463
464 if (likely(!ip_vs_iph_inverse(iph)))
465 return ports[0];
466 else
467 return ports[1];
468 default:
469 return 0;
470 }
471}
472
473
474static struct ip_vs_dest *
475ip_vs_mh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
476 struct ip_vs_iphdr *iph)
477{
478 struct ip_vs_dest *dest;
479 struct ip_vs_mh_state *s;
480 __be16 port = 0;
481 const union nf_inet_addr *hash_addr;
482
483 hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr;
484
485 IP_VS_DBG(6, "%s : Scheduling...\n", __func__);
486
487 if (svc->flags & IP_VS_SVC_F_SCHED_MH_PORT)
488 port = ip_vs_mh_get_port(skb, iph);
489
490 s = (struct ip_vs_mh_state *)svc->sched_data;
491
492 if (svc->flags & IP_VS_SVC_F_SCHED_MH_FALLBACK)
493 dest = ip_vs_mh_get_fallback(svc, s, hash_addr, port);
494 else
495 dest = ip_vs_mh_get(svc, s, hash_addr, port);
496
497 if (!dest) {
498 ip_vs_scheduler_err(svc, "no destination available");
499 return NULL;
500 }
501
502 IP_VS_DBG_BUF(6, "MH: source IP address %s:%u --> server %s:%u\n",
503 IP_VS_DBG_ADDR(svc->af, hash_addr),
504 ntohs(port),
505 IP_VS_DBG_ADDR(dest->af, &dest->addr),
506 ntohs(dest->port));
507
508 return dest;
509}
510
511
512static struct ip_vs_scheduler ip_vs_mh_scheduler = {
513 .name = "mh",
514 .refcnt = ATOMIC_INIT(0),
515 .module = THIS_MODULE,
516 .n_list = LIST_HEAD_INIT(ip_vs_mh_scheduler.n_list),
517 .init_service = ip_vs_mh_init_svc,
518 .done_service = ip_vs_mh_done_svc,
519 .add_dest = ip_vs_mh_dest_changed,
520 .del_dest = ip_vs_mh_dest_changed,
521 .upd_dest = ip_vs_mh_dest_changed,
522 .schedule = ip_vs_mh_schedule,
523};
524
525static int __init ip_vs_mh_init(void)
526{
527 return register_ip_vs_scheduler(&ip_vs_mh_scheduler);
528}
529
530static void __exit ip_vs_mh_cleanup(void)
531{
532 unregister_ip_vs_scheduler(&ip_vs_mh_scheduler);
533 rcu_barrier();
534}
535
536module_init(ip_vs_mh_init);
537module_exit(ip_vs_mh_cleanup);
538MODULE_DESCRIPTION("Maglev hashing ipvs scheduler");
539MODULE_LICENSE("GPL v2");
540MODULE_AUTHOR("Inju Song <inju.song@navercorp.com>");
541