1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35#define KMSG_COMPONENT "IPVS"
36#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
37
38#include <linux/module.h>
39#include <linux/slab.h>
40#include <linux/inetdevice.h>
41#include <linux/net.h>
42#include <linux/completion.h>
43#include <linux/delay.h>
44#include <linux/skbuff.h>
45#include <linux/in.h>
46#include <linux/igmp.h>
47#include <linux/udp.h>
48#include <linux/err.h>
49#include <linux/kthread.h>
50#include <linux/wait.h>
51#include <linux/kernel.h>
52#include <linux/sched/signal.h>
53
54#include <asm/unaligned.h>
55
56#include <net/ip.h>
57#include <net/sock.h>
58
59#include <net/ip_vs.h>
60
61#define IP_VS_SYNC_GROUP 0xe0000051
62#define IP_VS_SYNC_PORT 8848
63
64#define SYNC_PROTO_VER 1
65
66static struct lock_class_key __ipvs_sync_key;
67
68
69
70
71struct ip_vs_sync_conn_v0 {
72 __u8 reserved;
73
74
75 __u8 protocol;
76 __be16 cport;
77 __be16 vport;
78 __be16 dport;
79 __be32 caddr;
80 __be32 vaddr;
81 __be32 daddr;
82
83
84 __be16 flags;
85 __be16 state;
86
87
88};
89
90struct ip_vs_sync_conn_options {
91 struct ip_vs_seq in_seq;
92 struct ip_vs_seq out_seq;
93};
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133struct ip_vs_sync_v4 {
134 __u8 type;
135 __u8 protocol;
136 __be16 ver_size;
137
138 __be32 flags;
139 __be16 state;
140
141 __be16 cport;
142 __be16 vport;
143 __be16 dport;
144 __be32 fwmark;
145 __be32 timeout;
146 __be32 caddr;
147 __be32 vaddr;
148 __be32 daddr;
149
150
151};
152
153
154
155struct ip_vs_sync_v6 {
156 __u8 type;
157 __u8 protocol;
158 __be16 ver_size;
159
160 __be32 flags;
161 __be16 state;
162
163 __be16 cport;
164 __be16 vport;
165 __be16 dport;
166 __be32 fwmark;
167 __be32 timeout;
168 struct in6_addr caddr;
169 struct in6_addr vaddr;
170 struct in6_addr daddr;
171
172
173};
174
175union ip_vs_sync_conn {
176 struct ip_vs_sync_v4 v4;
177 struct ip_vs_sync_v6 v6;
178};
179
180
/* Bit number / mask in the record "type" byte: addresses are IPv6. */
#define STYPE_INET6		0
#define STYPE_F_INET6		(1 << STYPE_INET6)

/*
 * Split of the 16-bit ver_size field: the bits selected by SVER_MASK
 * hold the record size, the bits above SVER_SHIFT hold a version that
 * the receiver requires to be zero.
 */
#define SVER_SHIFT		12
#define SVER_MASK		0x0fff

/* Option type codes of the type/length/value trailer of a v1 record. */
#define IPVS_OPT_SEQ_DATA	1
#define IPVS_OPT_PE_DATA	2
#define IPVS_OPT_PE_NAME	3
#define IPVS_OPT_PARAM		7

/*
 * Flag bit for each code (code n maps to bit n - 1).  IPVS_OPT_F_PARAM is
 * also tested directly on the received type byte: unknown types without
 * that bit are treated as mandatory and cause the record to be dropped.
 */
#define IPVS_OPT_F_SEQ_DATA	(1 << (IPVS_OPT_SEQ_DATA - 1))
#define IPVS_OPT_F_PE_DATA	(1 << (IPVS_OPT_PE_DATA - 1))
#define IPVS_OPT_F_PE_NAME	(1 << (IPVS_OPT_PE_NAME - 1))
#define IPVS_OPT_F_PARAM	(1 << (IPVS_OPT_PARAM - 1))
196
/*
 * Per-thread bookkeeping of a sync daemon thread.
 * NOTE(review): only ->task is used in the visible code (woken from
 * sb_queue_tail()); the remaining field roles are presumed from their
 * names - confirm against the thread functions.
 */
struct ip_vs_sync_thread_data {
	struct task_struct	*task;
	struct netns_ipvs	*ipvs;
	struct socket		*sock;
	char			*buf;
	int			id;
};
204
205
/* Size of one version 0 record without / with the trailing seq options. */
#define SIMPLE_CONN_SIZE	(sizeof(struct ip_vs_sync_conn_v0))
#define FULL_CONN_SIZE							\
	(SIMPLE_CONN_SIZE + sizeof(struct ip_vs_sync_conn_options))
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
/*
 * NOTE(review): neither macro is referenced in the visible code.  The
 * values match the 4-byte version 0 header and the range of its 8-bit
 * nr_conns counter - confirm intended use.
 */
#define SYNC_MESG_HEADER_LEN	4
#define MAX_CONNS_PER_SYNCBUFF	255
247
248
249struct ip_vs_sync_mesg_v0 {
250 __u8 nr_conns;
251 __u8 syncid;
252 __be16 size;
253
254
255};
256
257
258struct ip_vs_sync_mesg {
259 __u8 reserved;
260 __u8 syncid;
261 __be16 size;
262 __u8 nr_conns;
263 __s8 version;
264 __u16 spare;
265
266};
267
268union ipvs_sockaddr {
269 struct sockaddr_in in;
270 struct sockaddr_in6 in6;
271};
272
273struct ip_vs_sync_buff {
274 struct list_head list;
275 unsigned long firstuse;
276
277
278 struct ip_vs_sync_mesg *mesg;
279 unsigned char *head;
280 unsigned char *end;
281};
282
283
284
285
286
287static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho)
288{
289 memset(ho, 0, sizeof(*ho));
290 ho->init_seq = get_unaligned_be32(&no->init_seq);
291 ho->delta = get_unaligned_be32(&no->delta);
292 ho->previous_delta = get_unaligned_be32(&no->previous_delta);
293}
294
295
296
297
298
299static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
300{
301 put_unaligned_be32(ho->init_seq, &no->init_seq);
302 put_unaligned_be32(ho->delta, &no->delta);
303 put_unaligned_be32(ho->previous_delta, &no->previous_delta);
304}
305
306static inline struct ip_vs_sync_buff *
307sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
308{
309 struct ip_vs_sync_buff *sb;
310
311 spin_lock_bh(&ipvs->sync_lock);
312 if (list_empty(&ms->sync_queue)) {
313 sb = NULL;
314 __set_current_state(TASK_INTERRUPTIBLE);
315 } else {
316 sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff,
317 list);
318 list_del(&sb->list);
319 ms->sync_queue_len--;
320 if (!ms->sync_queue_len)
321 ms->sync_queue_delay = 0;
322 }
323 spin_unlock_bh(&ipvs->sync_lock);
324
325 return sb;
326}
327
328
329
330
331static inline struct ip_vs_sync_buff *
332ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len)
333{
334 struct ip_vs_sync_buff *sb;
335
336 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
337 return NULL;
338
339 len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg),
340 ipvs->mcfg.sync_maxlen);
341 sb->mesg = kmalloc(len, GFP_ATOMIC);
342 if (!sb->mesg) {
343 kfree(sb);
344 return NULL;
345 }
346 sb->mesg->reserved = 0;
347 sb->mesg->version = SYNC_PROTO_VER;
348 sb->mesg->syncid = ipvs->mcfg.syncid;
349 sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg));
350 sb->mesg->nr_conns = 0;
351 sb->mesg->spare = 0;
352 sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
353 sb->end = (unsigned char *)sb->mesg + len;
354
355 sb->firstuse = jiffies;
356 return sb;
357}
358
359static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
360{
361 kfree(sb->mesg);
362 kfree(sb);
363}
364
365static inline void sb_queue_tail(struct netns_ipvs *ipvs,
366 struct ipvs_master_sync_state *ms)
367{
368 struct ip_vs_sync_buff *sb = ms->sync_buff;
369
370 spin_lock(&ipvs->sync_lock);
371 if (ipvs->sync_state & IP_VS_STATE_MASTER &&
372 ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) {
373 if (!ms->sync_queue_len)
374 schedule_delayed_work(&ms->master_wakeup_work,
375 max(IPVS_SYNC_SEND_DELAY, 1));
376 ms->sync_queue_len++;
377 list_add_tail(&sb->list, &ms->sync_queue);
378 if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) {
379 int id = (int)(ms - ipvs->ms);
380
381 wake_up_process(ipvs->master_tinfo[id].task);
382 }
383 } else
384 ip_vs_sync_buff_release(sb);
385 spin_unlock(&ipvs->sync_lock);
386}
387
388
389
390
391
392static inline struct ip_vs_sync_buff *
393get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms,
394 unsigned long time)
395{
396 struct ip_vs_sync_buff *sb;
397
398 spin_lock_bh(&ipvs->sync_buff_lock);
399 sb = ms->sync_buff;
400 if (sb && time_after_eq(jiffies - sb->firstuse, time)) {
401 ms->sync_buff = NULL;
402 __set_current_state(TASK_RUNNING);
403 } else
404 sb = NULL;
405 spin_unlock_bh(&ipvs->sync_buff_lock);
406 return sb;
407}
408
409static inline int
410select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp)
411{
412 return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask;
413}
414
415
416
417
418static inline struct ip_vs_sync_buff *
419ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len)
420{
421 struct ip_vs_sync_buff *sb;
422 struct ip_vs_sync_mesg_v0 *mesg;
423
424 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
425 return NULL;
426
427 len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0),
428 ipvs->mcfg.sync_maxlen);
429 sb->mesg = kmalloc(len, GFP_ATOMIC);
430 if (!sb->mesg) {
431 kfree(sb);
432 return NULL;
433 }
434 mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
435 mesg->nr_conns = 0;
436 mesg->syncid = ipvs->mcfg.syncid;
437 mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0));
438 sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0);
439 sb->end = (unsigned char *)mesg + len;
440 sb->firstuse = jiffies;
441 return sb;
442}
443
444
445static inline bool in_persistence(struct ip_vs_conn *cp)
446{
447 for (cp = cp->control; cp; cp = cp->control) {
448 if (cp->flags & IP_VS_CONN_F_TEMPLATE)
449 return true;
450 }
451 return false;
452}
453
454
455
456
457
458
459
460
461
462
463static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs,
464 struct ip_vs_conn *cp, int pkts)
465{
466 unsigned long orig = READ_ONCE(cp->sync_endtime);
467 unsigned long now = jiffies;
468 unsigned long n = (now + cp->timeout) & ~3UL;
469 unsigned int sync_refresh_period;
470 int sync_period;
471 int force;
472
473
474 if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE))
475 force = 0;
476 else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp)))
477 return 0;
478 else if (likely(cp->protocol == IPPROTO_TCP)) {
479 if (!((1 << cp->state) &
480 ((1 << IP_VS_TCP_S_ESTABLISHED) |
481 (1 << IP_VS_TCP_S_FIN_WAIT) |
482 (1 << IP_VS_TCP_S_CLOSE) |
483 (1 << IP_VS_TCP_S_CLOSE_WAIT) |
484 (1 << IP_VS_TCP_S_TIME_WAIT))))
485 return 0;
486 force = cp->state != cp->old_state;
487 if (force && cp->state != IP_VS_TCP_S_ESTABLISHED)
488 goto set;
489 } else if (unlikely(cp->protocol == IPPROTO_SCTP)) {
490 if (!((1 << cp->state) &
491 ((1 << IP_VS_SCTP_S_ESTABLISHED) |
492 (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) |
493 (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) |
494 (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) |
495 (1 << IP_VS_SCTP_S_CLOSED))))
496 return 0;
497 force = cp->state != cp->old_state;
498 if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED)
499 goto set;
500 } else {
501
502 force = 0;
503 }
504
505 sync_refresh_period = sysctl_sync_refresh_period(ipvs);
506 if (sync_refresh_period > 0) {
507 long diff = n - orig;
508 long min_diff = max(cp->timeout >> 1, 10UL * HZ);
509
510
511
512
513 if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) {
514 int retries = orig & 3;
515
516 if (retries >= sysctl_sync_retries(ipvs))
517 return 0;
518 if (time_before(now, orig - cp->timeout +
519 (sync_refresh_period >> 3)))
520 return 0;
521 n |= retries + 1;
522 }
523 }
524 sync_period = sysctl_sync_period(ipvs);
525 if (sync_period > 0) {
526 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) &&
527 pkts % sync_period != sysctl_sync_threshold(ipvs))
528 return 0;
529 } else if (!sync_refresh_period &&
530 pkts != sysctl_sync_threshold(ipvs))
531 return 0;
532
533set:
534 cp->old_state = cp->state;
535 n = cmpxchg(&cp->sync_endtime, orig, n);
536 return n == orig || force;
537}
538
539
540
541
542
543static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp,
544 int pkts)
545{
546 struct ip_vs_sync_mesg_v0 *m;
547 struct ip_vs_sync_conn_v0 *s;
548 struct ip_vs_sync_buff *buff;
549 struct ipvs_master_sync_state *ms;
550 int id;
551 unsigned int len;
552
553 if (unlikely(cp->af != AF_INET))
554 return;
555
556 if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
557 return;
558
559 if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
560 return;
561
562 spin_lock_bh(&ipvs->sync_buff_lock);
563 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
564 spin_unlock_bh(&ipvs->sync_buff_lock);
565 return;
566 }
567
568 id = select_master_thread_id(ipvs, cp);
569 ms = &ipvs->ms[id];
570 buff = ms->sync_buff;
571 len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
572 SIMPLE_CONN_SIZE;
573 if (buff) {
574 m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
575
576 if (buff->head + len > buff->end || !m->nr_conns) {
577 sb_queue_tail(ipvs, ms);
578 ms->sync_buff = NULL;
579 buff = NULL;
580 }
581 }
582 if (!buff) {
583 buff = ip_vs_sync_buff_create_v0(ipvs, len);
584 if (!buff) {
585 spin_unlock_bh(&ipvs->sync_buff_lock);
586 pr_err("ip_vs_sync_buff_create failed.\n");
587 return;
588 }
589 ms->sync_buff = buff;
590 }
591
592 m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
593 s = (struct ip_vs_sync_conn_v0 *) buff->head;
594
595
596 s->reserved = 0;
597 s->protocol = cp->protocol;
598 s->cport = cp->cport;
599 s->vport = cp->vport;
600 s->dport = cp->dport;
601 s->caddr = cp->caddr.ip;
602 s->vaddr = cp->vaddr.ip;
603 s->daddr = cp->daddr.ip;
604 s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
605 s->state = htons(cp->state);
606 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
607 struct ip_vs_sync_conn_options *opt =
608 (struct ip_vs_sync_conn_options *)&s[1];
609 memcpy(opt, &cp->in_seq, sizeof(*opt));
610 }
611
612 m->nr_conns++;
613 m->size = htons(ntohs(m->size) + len);
614 buff->head += len;
615 spin_unlock_bh(&ipvs->sync_buff_lock);
616
617
618 cp = cp->control;
619 if (cp) {
620 if (cp->flags & IP_VS_CONN_F_TEMPLATE)
621 pkts = atomic_add_return(1, &cp->in_pkts);
622 else
623 pkts = sysctl_sync_threshold(ipvs);
624 ip_vs_sync_conn(ipvs, cp, pkts);
625 }
626}
627
628
629
630
631
632
633void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts)
634{
635 struct ip_vs_sync_mesg *m;
636 union ip_vs_sync_conn *s;
637 struct ip_vs_sync_buff *buff;
638 struct ipvs_master_sync_state *ms;
639 int id;
640 __u8 *p;
641 unsigned int len, pe_name_len, pad;
642
643
644 if (sysctl_sync_ver(ipvs) == 0) {
645 ip_vs_sync_conn_v0(ipvs, cp, pkts);
646 return;
647 }
648
649 if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
650 goto control;
651sloop:
652 if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
653 goto control;
654
655
656 pe_name_len = 0;
657 if (cp->pe_data_len) {
658 if (!cp->pe_data || !cp->dest) {
659 IP_VS_ERR_RL("SYNC, connection pe_data invalid\n");
660 return;
661 }
662 pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
663 }
664
665 spin_lock_bh(&ipvs->sync_buff_lock);
666 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
667 spin_unlock_bh(&ipvs->sync_buff_lock);
668 return;
669 }
670
671 id = select_master_thread_id(ipvs, cp);
672 ms = &ipvs->ms[id];
673
674#ifdef CONFIG_IP_VS_IPV6
675 if (cp->af == AF_INET6)
676 len = sizeof(struct ip_vs_sync_v6);
677 else
678#endif
679 len = sizeof(struct ip_vs_sync_v4);
680
681 if (cp->flags & IP_VS_CONN_F_SEQ_MASK)
682 len += sizeof(struct ip_vs_sync_conn_options) + 2;
683
684 if (cp->pe_data_len)
685 len += cp->pe_data_len + 2;
686 if (pe_name_len)
687 len += pe_name_len + 2;
688
689
690 pad = 0;
691 buff = ms->sync_buff;
692 if (buff) {
693 m = buff->mesg;
694 pad = (4 - (size_t) buff->head) & 3;
695
696 if (buff->head + len + pad > buff->end || m->reserved) {
697 sb_queue_tail(ipvs, ms);
698 ms->sync_buff = NULL;
699 buff = NULL;
700 pad = 0;
701 }
702 }
703
704 if (!buff) {
705 buff = ip_vs_sync_buff_create(ipvs, len);
706 if (!buff) {
707 spin_unlock_bh(&ipvs->sync_buff_lock);
708 pr_err("ip_vs_sync_buff_create failed.\n");
709 return;
710 }
711 ms->sync_buff = buff;
712 m = buff->mesg;
713 }
714
715 p = buff->head;
716 buff->head += pad + len;
717 m->size = htons(ntohs(m->size) + pad + len);
718
719 while (pad--)
720 *(p++) = 0;
721
722 s = (union ip_vs_sync_conn *)p;
723
724
725 s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0);
726 s->v4.ver_size = htons(len & SVER_MASK);
727 s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED);
728 s->v4.state = htons(cp->state);
729 s->v4.protocol = cp->protocol;
730 s->v4.cport = cp->cport;
731 s->v4.vport = cp->vport;
732 s->v4.dport = cp->dport;
733 s->v4.fwmark = htonl(cp->fwmark);
734 s->v4.timeout = htonl(cp->timeout / HZ);
735 m->nr_conns++;
736
737#ifdef CONFIG_IP_VS_IPV6
738 if (cp->af == AF_INET6) {
739 p += sizeof(struct ip_vs_sync_v6);
740 s->v6.caddr = cp->caddr.in6;
741 s->v6.vaddr = cp->vaddr.in6;
742 s->v6.daddr = cp->daddr.in6;
743 } else
744#endif
745 {
746 p += sizeof(struct ip_vs_sync_v4);
747 s->v4.caddr = cp->caddr.ip;
748 s->v4.vaddr = cp->vaddr.ip;
749 s->v4.daddr = cp->daddr.ip;
750 }
751 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
752 *(p++) = IPVS_OPT_SEQ_DATA;
753 *(p++) = sizeof(struct ip_vs_sync_conn_options);
754 hton_seq((struct ip_vs_seq *)p, &cp->in_seq);
755 p += sizeof(struct ip_vs_seq);
756 hton_seq((struct ip_vs_seq *)p, &cp->out_seq);
757 p += sizeof(struct ip_vs_seq);
758 }
759
760 if (cp->pe_data_len && cp->pe_data) {
761 *(p++) = IPVS_OPT_PE_DATA;
762 *(p++) = cp->pe_data_len;
763 memcpy(p, cp->pe_data, cp->pe_data_len);
764 p += cp->pe_data_len;
765 if (pe_name_len) {
766
767 *(p++) = IPVS_OPT_PE_NAME;
768 *(p++) = pe_name_len;
769 memcpy(p, cp->pe->name, pe_name_len);
770 p += pe_name_len;
771 }
772 }
773
774 spin_unlock_bh(&ipvs->sync_buff_lock);
775
776control:
777
778 cp = cp->control;
779 if (!cp)
780 return;
781 if (cp->flags & IP_VS_CONN_F_TEMPLATE)
782 pkts = atomic_add_return(1, &cp->in_pkts);
783 else
784 pkts = sysctl_sync_threshold(ipvs);
785 goto sloop;
786}
787
788
789
790
791static inline int
792ip_vs_conn_fill_param_sync(struct netns_ipvs *ipvs, int af, union ip_vs_sync_conn *sc,
793 struct ip_vs_conn_param *p,
794 __u8 *pe_data, unsigned int pe_data_len,
795 __u8 *pe_name, unsigned int pe_name_len)
796{
797#ifdef CONFIG_IP_VS_IPV6
798 if (af == AF_INET6)
799 ip_vs_conn_fill_param(ipvs, af, sc->v6.protocol,
800 (const union nf_inet_addr *)&sc->v6.caddr,
801 sc->v6.cport,
802 (const union nf_inet_addr *)&sc->v6.vaddr,
803 sc->v6.vport, p);
804 else
805#endif
806 ip_vs_conn_fill_param(ipvs, af, sc->v4.protocol,
807 (const union nf_inet_addr *)&sc->v4.caddr,
808 sc->v4.cport,
809 (const union nf_inet_addr *)&sc->v4.vaddr,
810 sc->v4.vport, p);
811
812 if (pe_data_len) {
813 if (pe_name_len) {
814 char buff[IP_VS_PENAME_MAXLEN+1];
815
816 memcpy(buff, pe_name, pe_name_len);
817 buff[pe_name_len]=0;
818 p->pe = __ip_vs_pe_getbyname(buff);
819 if (!p->pe) {
820 IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n",
821 buff);
822 return 1;
823 }
824 } else {
825 IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n");
826 return 1;
827 }
828
829 p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC);
830 if (!p->pe_data) {
831 module_put(p->pe->module);
832 return -ENOMEM;
833 }
834 p->pe_data_len = pe_data_len;
835 }
836 return 0;
837}
838
839
840
841
842
843
844
845static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *param,
846 unsigned int flags, unsigned int state,
847 unsigned int protocol, unsigned int type,
848 const union nf_inet_addr *daddr, __be16 dport,
849 unsigned long timeout, __u32 fwmark,
850 struct ip_vs_sync_conn_options *opt)
851{
852 struct ip_vs_dest *dest;
853 struct ip_vs_conn *cp;
854
855 if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
856 cp = ip_vs_conn_in_get(param);
857 if (cp && ((cp->dport != dport) ||
858 !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) {
859 if (!(flags & IP_VS_CONN_F_INACTIVE)) {
860 ip_vs_conn_expire_now(cp);
861 __ip_vs_conn_put(cp);
862 cp = NULL;
863 } else {
864
865
866
867
868 __ip_vs_conn_put(cp);
869 kfree(param->pe_data);
870 return;
871 }
872 }
873 } else {
874 cp = ip_vs_ct_in_get(param);
875 }
876
877 if (cp) {
878
879 kfree(param->pe_data);
880
881 dest = cp->dest;
882 spin_lock_bh(&cp->lock);
883 if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE &&
884 !(flags & IP_VS_CONN_F_TEMPLATE) && dest) {
885 if (flags & IP_VS_CONN_F_INACTIVE) {
886 atomic_dec(&dest->activeconns);
887 atomic_inc(&dest->inactconns);
888 } else {
889 atomic_inc(&dest->activeconns);
890 atomic_dec(&dest->inactconns);
891 }
892 }
893 flags &= IP_VS_CONN_F_BACKUP_UPD_MASK;
894 flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK;
895 cp->flags = flags;
896 spin_unlock_bh(&cp->lock);
897 if (!dest)
898 ip_vs_try_bind_dest(cp);
899 } else {
900
901
902
903
904
905 rcu_read_lock();
906
907
908
909
910
911 dest = ip_vs_find_dest(ipvs, type, type, daddr, dport,
912 param->vaddr, param->vport, protocol,
913 fwmark, flags);
914
915 cp = ip_vs_conn_new(param, type, daddr, dport, flags, dest,
916 fwmark);
917 rcu_read_unlock();
918 if (!cp) {
919 kfree(param->pe_data);
920 IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
921 return;
922 }
923 if (!(flags & IP_VS_CONN_F_TEMPLATE))
924 kfree(param->pe_data);
925 }
926
927 if (opt) {
928 cp->in_seq = opt->in_seq;
929 cp->out_seq = opt->out_seq;
930 }
931 atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs));
932 cp->state = state;
933 cp->old_state = cp->state;
934
935
936
937
938
939
940
941
942
943 if (timeout) {
944 if (timeout > MAX_SCHEDULE_TIMEOUT / HZ)
945 timeout = MAX_SCHEDULE_TIMEOUT / HZ;
946 cp->timeout = timeout*HZ;
947 } else {
948 struct ip_vs_proto_data *pd;
949
950 pd = ip_vs_proto_data_get(ipvs, protocol);
951 if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table)
952 cp->timeout = pd->timeout_table[state];
953 else
954 cp->timeout = (3*60*HZ);
955 }
956 ip_vs_conn_put(cp);
957}
958
959
960
961
962static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer,
963 const size_t buflen)
964{
965 struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer;
966 struct ip_vs_sync_conn_v0 *s;
967 struct ip_vs_sync_conn_options *opt;
968 struct ip_vs_protocol *pp;
969 struct ip_vs_conn_param param;
970 char *p;
971 int i;
972
973 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0);
974 for (i=0; i<m->nr_conns; i++) {
975 unsigned int flags, state;
976
977 if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
978 IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
979 return;
980 }
981 s = (struct ip_vs_sync_conn_v0 *) p;
982 flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
983 flags &= ~IP_VS_CONN_F_HASHED;
984 if (flags & IP_VS_CONN_F_SEQ_MASK) {
985 opt = (struct ip_vs_sync_conn_options *)&s[1];
986 p += FULL_CONN_SIZE;
987 if (p > buffer+buflen) {
988 IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n");
989 return;
990 }
991 } else {
992 opt = NULL;
993 p += SIMPLE_CONN_SIZE;
994 }
995
996 state = ntohs(s->state);
997 if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
998 pp = ip_vs_proto_get(s->protocol);
999 if (!pp) {
1000 IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n",
1001 s->protocol);
1002 continue;
1003 }
1004 if (state >= pp->num_states) {
1005 IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n",
1006 pp->name, state);
1007 continue;
1008 }
1009 } else {
1010
1011 if (state > 0) {
1012 IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",
1013 state);
1014 state = 0;
1015 }
1016 }
1017
1018 ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol,
1019 (const union nf_inet_addr *)&s->caddr,
1020 s->cport,
1021 (const union nf_inet_addr *)&s->vaddr,
1022 s->vport, ¶m);
1023
1024
1025 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->protocol, AF_INET,
1026 (union nf_inet_addr *)&s->daddr, s->dport,
1027 0, 0, opt);
1028 }
1029}
1030
1031
1032
1033
1034static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen,
1035 __u32 *opt_flags,
1036 struct ip_vs_sync_conn_options *opt)
1037{
1038 struct ip_vs_sync_conn_options *topt;
1039
1040 topt = (struct ip_vs_sync_conn_options *)p;
1041
1042 if (plen != sizeof(struct ip_vs_sync_conn_options)) {
1043 IP_VS_DBG(2, "BACKUP, bogus conn options length\n");
1044 return -EINVAL;
1045 }
1046 if (*opt_flags & IPVS_OPT_F_SEQ_DATA) {
1047 IP_VS_DBG(2, "BACKUP, conn options found twice\n");
1048 return -EINVAL;
1049 }
1050 ntoh_seq(&topt->in_seq, &opt->in_seq);
1051 ntoh_seq(&topt->out_seq, &opt->out_seq);
1052 *opt_flags |= IPVS_OPT_F_SEQ_DATA;
1053 return 0;
1054}
1055
1056static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len,
1057 __u8 **data, unsigned int maxlen,
1058 __u32 *opt_flags, __u32 flag)
1059{
1060 if (plen > maxlen) {
1061 IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen);
1062 return -EINVAL;
1063 }
1064 if (*opt_flags & flag) {
1065 IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag);
1066 return -EINVAL;
1067 }
1068 *data_len = plen;
1069 *data = p;
1070 *opt_flags |= flag;
1071 return 0;
1072}
1073
1074
1075
1076static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *msg_end)
1077{
1078 struct ip_vs_sync_conn_options opt;
1079 union ip_vs_sync_conn *s;
1080 struct ip_vs_protocol *pp;
1081 struct ip_vs_conn_param param;
1082 __u32 flags;
1083 unsigned int af, state, pe_data_len=0, pe_name_len=0;
1084 __u8 *pe_data=NULL, *pe_name=NULL;
1085 __u32 opt_flags=0;
1086 int retc=0;
1087
1088 s = (union ip_vs_sync_conn *) p;
1089
1090 if (s->v6.type & STYPE_F_INET6) {
1091#ifdef CONFIG_IP_VS_IPV6
1092 af = AF_INET6;
1093 p += sizeof(struct ip_vs_sync_v6);
1094#else
1095 IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n");
1096 retc = 10;
1097 goto out;
1098#endif
1099 } else if (!s->v4.type) {
1100 af = AF_INET;
1101 p += sizeof(struct ip_vs_sync_v4);
1102 } else {
1103 return -10;
1104 }
1105 if (p > msg_end)
1106 return -20;
1107
1108
1109 while (p < msg_end) {
1110 int ptype;
1111 int plen;
1112
1113 if (p+2 > msg_end)
1114 return -30;
1115 ptype = *(p++);
1116 plen = *(p++);
1117
1118 if (!plen || ((p + plen) > msg_end))
1119 return -40;
1120
1121 switch (ptype & ~IPVS_OPT_F_PARAM) {
1122 case IPVS_OPT_SEQ_DATA:
1123 if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt))
1124 return -50;
1125 break;
1126
1127 case IPVS_OPT_PE_DATA:
1128 if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data,
1129 IP_VS_PEDATA_MAXLEN, &opt_flags,
1130 IPVS_OPT_F_PE_DATA))
1131 return -60;
1132 break;
1133
1134 case IPVS_OPT_PE_NAME:
1135 if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name,
1136 IP_VS_PENAME_MAXLEN, &opt_flags,
1137 IPVS_OPT_F_PE_NAME))
1138 return -70;
1139 break;
1140
1141 default:
1142
1143 if (!(ptype & IPVS_OPT_F_PARAM)) {
1144 IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n",
1145 ptype & ~IPVS_OPT_F_PARAM);
1146 retc = 20;
1147 goto out;
1148 }
1149 }
1150 p += plen;
1151 }
1152
1153
1154 flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK;
1155 flags |= IP_VS_CONN_F_SYNC;
1156 state = ntohs(s->v4.state);
1157
1158 if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
1159 pp = ip_vs_proto_get(s->v4.protocol);
1160 if (!pp) {
1161 IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n",
1162 s->v4.protocol);
1163 retc = 30;
1164 goto out;
1165 }
1166 if (state >= pp->num_states) {
1167 IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n",
1168 pp->name, state);
1169 retc = 40;
1170 goto out;
1171 }
1172 } else {
1173
1174 if (state > 0) {
1175 IP_VS_DBG(3, "BACKUP, Invalid template state %u\n",
1176 state);
1177 state = 0;
1178 }
1179 }
1180 if (ip_vs_conn_fill_param_sync(ipvs, af, s, ¶m, pe_data,
1181 pe_data_len, pe_name, pe_name_len)) {
1182 retc = 50;
1183 goto out;
1184 }
1185
1186 if (af == AF_INET)
1187 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v4.protocol, af,
1188 (union nf_inet_addr *)&s->v4.daddr, s->v4.dport,
1189 ntohl(s->v4.timeout), ntohl(s->v4.fwmark),
1190 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
1191 );
1192#ifdef CONFIG_IP_VS_IPV6
1193 else
1194 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v6.protocol, af,
1195 (union nf_inet_addr *)&s->v6.daddr, s->v6.dport,
1196 ntohl(s->v6.timeout), ntohl(s->v6.fwmark),
1197 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
1198 );
1199#endif
1200 ip_vs_pe_put(param.pe);
1201 return 0;
1202
1203out:
1204 IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc);
1205 return retc;
1206
1207}
1208
1209
1210
1211
1212
1213static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer,
1214 const size_t buflen)
1215{
1216 struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
1217 __u8 *p, *msg_end;
1218 int i, nr_conns;
1219
1220 if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) {
1221 IP_VS_DBG(2, "BACKUP, message header too short\n");
1222 return;
1223 }
1224
1225 if (buflen != ntohs(m2->size)) {
1226 IP_VS_DBG(2, "BACKUP, bogus message size\n");
1227 return;
1228 }
1229
1230 if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) {
1231 IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
1232 return;
1233 }
1234
1235 if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0)
1236 && (m2->spare == 0)) {
1237
1238 msg_end = buffer + sizeof(struct ip_vs_sync_mesg);
1239 nr_conns = m2->nr_conns;
1240
1241 for (i=0; i<nr_conns; i++) {
1242 union ip_vs_sync_conn *s;
1243 unsigned int size;
1244 int retc;
1245
1246 p = msg_end;
1247 if (p + sizeof(s->v4) > buffer+buflen) {
1248 IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n");
1249 return;
1250 }
1251 s = (union ip_vs_sync_conn *)p;
1252 size = ntohs(s->v4.ver_size) & SVER_MASK;
1253 msg_end = p + size;
1254
1255 if (msg_end > buffer+buflen) {
1256 IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");
1257 return;
1258 }
1259 if (ntohs(s->v4.ver_size) >> SVER_SHIFT) {
1260 IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n",
1261 ntohs(s->v4.ver_size) >> SVER_SHIFT);
1262 return;
1263 }
1264
1265 retc = ip_vs_proc_sync_conn(ipvs, p, msg_end);
1266 if (retc < 0) {
1267 IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
1268 retc);
1269 return;
1270 }
1271
1272 msg_end = p + ((size + 3) & ~3);
1273 }
1274 } else {
1275
1276 ip_vs_process_message_v0(ipvs, buffer, buflen);
1277 return;
1278 }
1279}
1280
1281
1282
1283
1284
1285static void set_sock_size(struct sock *sk, int mode, int val)
1286{
1287
1288
1289 lock_sock(sk);
1290 if (mode) {
1291 val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2,
1292 sysctl_wmem_max);
1293 sk->sk_sndbuf = val * 2;
1294 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1295 } else {
1296 val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2,
1297 sysctl_rmem_max);
1298 sk->sk_rcvbuf = val * 2;
1299 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
1300 }
1301 release_sock(sk);
1302}
1303
1304
1305
1306
1307static void set_mcast_loop(struct sock *sk, u_char loop)
1308{
1309 struct inet_sock *inet = inet_sk(sk);
1310
1311
1312 lock_sock(sk);
1313 inet->mc_loop = loop ? 1 : 0;
1314#ifdef CONFIG_IP_VS_IPV6
1315 if (sk->sk_family == AF_INET6) {
1316 struct ipv6_pinfo *np = inet6_sk(sk);
1317
1318
1319 np->mc_loop = loop ? 1 : 0;
1320 }
1321#endif
1322 release_sock(sk);
1323}
1324
1325
1326
1327
1328static void set_mcast_ttl(struct sock *sk, u_char ttl)
1329{
1330 struct inet_sock *inet = inet_sk(sk);
1331
1332
1333 lock_sock(sk);
1334 inet->mc_ttl = ttl;
1335#ifdef CONFIG_IP_VS_IPV6
1336 if (sk->sk_family == AF_INET6) {
1337 struct ipv6_pinfo *np = inet6_sk(sk);
1338
1339
1340 np->mcast_hops = ttl;
1341 }
1342#endif
1343 release_sock(sk);
1344}
1345
1346
1347static void set_mcast_pmtudisc(struct sock *sk, int val)
1348{
1349 struct inet_sock *inet = inet_sk(sk);
1350
1351
1352 lock_sock(sk);
1353 inet->pmtudisc = val;
1354#ifdef CONFIG_IP_VS_IPV6
1355 if (sk->sk_family == AF_INET6) {
1356 struct ipv6_pinfo *np = inet6_sk(sk);
1357
1358
1359 np->pmtudisc = val;
1360 }
1361#endif
1362 release_sock(sk);
1363}
1364
1365
1366
1367
1368static int set_mcast_if(struct sock *sk, struct net_device *dev)
1369{
1370 struct inet_sock *inet = inet_sk(sk);
1371
1372 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
1373 return -EINVAL;
1374
1375 lock_sock(sk);
1376 inet->mc_index = dev->ifindex;
1377
1378#ifdef CONFIG_IP_VS_IPV6
1379 if (sk->sk_family == AF_INET6) {
1380 struct ipv6_pinfo *np = inet6_sk(sk);
1381
1382
1383 np->mcast_oif = dev->ifindex;
1384 }
1385#endif
1386 release_sock(sk);
1387
1388 return 0;
1389}
1390
1391
1392
1393
1394
1395
1396
1397static int
1398join_mcast_group(struct sock *sk, struct in_addr *addr, struct net_device *dev)
1399{
1400 struct ip_mreqn mreq;
1401 int ret;
1402
1403 memset(&mreq, 0, sizeof(mreq));
1404 memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
1405
1406 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
1407 return -EINVAL;
1408
1409 mreq.imr_ifindex = dev->ifindex;
1410
1411 lock_sock(sk);
1412 ret = ip_mc_join_group(sk, &mreq);
1413 release_sock(sk);
1414
1415 return ret;
1416}
1417
#ifdef CONFIG_IP_VS_IPV6
/*
 * Join the IPv6 multicast group @addr on interface @dev.
 *
 * Returns -EINVAL when the socket is bound to a different device,
 * otherwise the result of ipv6_sock_mc_join().
 */
static int join_mcast_group6(struct sock *sk, struct in6_addr *addr,
			     struct net_device *dev)
{
	int err;

	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
		return -EINVAL;

	lock_sock(sk);
	err = ipv6_sock_mc_join(sk, dev->ifindex, addr);
	release_sock(sk);

	return err;
}
#endif
1434
1435static int bind_mcastif_addr(struct socket *sock, struct net_device *dev)
1436{
1437 __be32 addr;
1438 struct sockaddr_in sin;
1439
1440 addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1441 if (!addr)
1442 pr_err("You probably need to specify IP address on "
1443 "multicast interface.\n");
1444
1445 IP_VS_DBG(7, "binding socket with (%s) %pI4\n",
1446 dev->name, &addr);
1447
1448
1449 sin.sin_family = AF_INET;
1450 sin.sin_addr.s_addr = addr;
1451 sin.sin_port = 0;
1452
1453 return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
1454}
1455
1456static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen,
1457 struct ipvs_sync_daemon_cfg *c, int id)
1458{
1459 if (AF_INET6 == c->mcast_af) {
1460 sa->in6 = (struct sockaddr_in6) {
1461 .sin6_family = AF_INET6,
1462 .sin6_port = htons(c->mcast_port + id),
1463 };
1464 sa->in6.sin6_addr = c->mcast_group.in6;
1465 *salen = sizeof(sa->in6);
1466 } else {
1467 sa->in = (struct sockaddr_in) {
1468 .sin_family = AF_INET,
1469 .sin_port = htons(c->mcast_port + id),
1470 };
1471 sa->in.sin_addr = c->mcast_group.in;
1472 *salen = sizeof(sa->in);
1473 }
1474}
1475
1476
1477
1478
1479static int make_send_sock(struct netns_ipvs *ipvs, int id,
1480 struct net_device *dev, struct socket **sock_ret)
1481{
1482
1483 union ipvs_sockaddr mcast_addr;
1484 struct socket *sock;
1485 int result, salen;
1486
1487
1488 result = sock_create_kern(ipvs->net, ipvs->mcfg.mcast_af, SOCK_DGRAM,
1489 IPPROTO_UDP, &sock);
1490 if (result < 0) {
1491 pr_err("Error during creation of socket; terminating\n");
1492 goto error;
1493 }
1494 *sock_ret = sock;
1495 result = set_mcast_if(sock->sk, dev);
1496 if (result < 0) {
1497 pr_err("Error setting outbound mcast interface\n");
1498 goto error;
1499 }
1500
1501 set_mcast_loop(sock->sk, 0);
1502 set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl);
1503
1504 set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT);
1505 result = sysctl_sync_sock_size(ipvs);
1506 if (result > 0)
1507 set_sock_size(sock->sk, 1, result);
1508
1509 if (AF_INET == ipvs->mcfg.mcast_af)
1510 result = bind_mcastif_addr(sock, dev);
1511 else
1512 result = 0;
1513 if (result < 0) {
1514 pr_err("Error binding address of the mcast interface\n");
1515 goto error;
1516 }
1517
1518 get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id);
1519 result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
1520 salen, 0);
1521 if (result < 0) {
1522 pr_err("Error connecting to the multicast addr\n");
1523 goto error;
1524 }
1525
1526 return 0;
1527
1528error:
1529 return result;
1530}
1531
1532
1533
1534
1535
1536static int make_receive_sock(struct netns_ipvs *ipvs, int id,
1537 struct net_device *dev, struct socket **sock_ret)
1538{
1539
1540 union ipvs_sockaddr mcast_addr;
1541 struct socket *sock;
1542 int result, salen;
1543
1544
1545 result = sock_create_kern(ipvs->net, ipvs->bcfg.mcast_af, SOCK_DGRAM,
1546 IPPROTO_UDP, &sock);
1547 if (result < 0) {
1548 pr_err("Error during creation of socket; terminating\n");
1549 goto error;
1550 }
1551 *sock_ret = sock;
1552
1553 sock->sk->sk_reuse = SK_CAN_REUSE;
1554 result = sysctl_sync_sock_size(ipvs);
1555 if (result > 0)
1556 set_sock_size(sock->sk, 0, result);
1557
1558 get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id);
1559 sock->sk->sk_bound_dev_if = dev->ifindex;
1560 result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen);
1561 if (result < 0) {
1562 pr_err("Error binding to the multicast addr\n");
1563 goto error;
1564 }
1565
1566
1567#ifdef CONFIG_IP_VS_IPV6
1568 if (ipvs->bcfg.mcast_af == AF_INET6)
1569 result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr,
1570 dev);
1571 else
1572#endif
1573 result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr,
1574 dev);
1575 if (result < 0) {
1576 pr_err("Error joining to the multicast group\n");
1577 goto error;
1578 }
1579
1580 return 0;
1581
1582error:
1583 return result;
1584}
1585
1586
1587static int
1588ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
1589{
1590 struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
1591 struct kvec iov;
1592 int len;
1593
1594 EnterFunction(7);
1595 iov.iov_base = (void *)buffer;
1596 iov.iov_len = length;
1597
1598 len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
1599
1600 LeaveFunction(7);
1601 return len;
1602}
1603
1604static int
1605ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
1606{
1607 int msize;
1608 int ret;
1609
1610 msize = ntohs(msg->size);
1611
1612 ret = ip_vs_send_async(sock, (char *)msg, msize);
1613 if (ret >= 0 || ret == -EAGAIN)
1614 return ret;
1615 pr_err("ip_vs_send_async error %d\n", ret);
1616 return 0;
1617}
1618
1619static int
1620ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
1621{
1622 struct msghdr msg = {NULL,};
1623 struct kvec iov = {buffer, buflen};
1624 int len;
1625
1626 EnterFunction(7);
1627
1628
1629 iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, buflen);
1630 len = sock_recvmsg(sock, &msg, MSG_DONTWAIT);
1631 if (len < 0)
1632 return len;
1633
1634 LeaveFunction(7);
1635 return len;
1636}
1637
1638
1639static void master_wakeup_work_handler(struct work_struct *work)
1640{
1641 struct ipvs_master_sync_state *ms =
1642 container_of(work, struct ipvs_master_sync_state,
1643 master_wakeup_work.work);
1644 struct netns_ipvs *ipvs = ms->ipvs;
1645
1646 spin_lock_bh(&ipvs->sync_lock);
1647 if (ms->sync_queue_len &&
1648 ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) {
1649 int id = (int)(ms - ipvs->ms);
1650
1651 ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE;
1652 wake_up_process(ipvs->master_tinfo[id].task);
1653 }
1654 spin_unlock_bh(&ipvs->sync_lock);
1655}
1656
1657
1658static inline struct ip_vs_sync_buff *
1659next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
1660{
1661 struct ip_vs_sync_buff *sb;
1662
1663 sb = sb_dequeue(ipvs, ms);
1664 if (sb)
1665 return sb;
1666
1667 return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME);
1668}
1669
/*
 * Kthread body of one master sync thread.
 *
 * @data is the thread's struct ip_vs_sync_thread_data. The thread keeps
 * fetching sync buffers with next_sync_buff(), sends each one through
 * its socket and releases it. When asked to stop it releases the buffer
 * it may still hold, everything left in its queue and the current sync
 * buffer, then returns 0.
 */
static int sync_thread_master(void *data)
{
	struct ip_vs_sync_thread_data *tinfo = data;
	struct netns_ipvs *ipvs = tinfo->ipvs;
	struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id];
	struct sock *sk = tinfo->sock->sk;
	struct ip_vs_sync_buff *sb;

	pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
		"syncid = %d, id = %d\n",
		ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id);

	for (;;) {
		sb = next_sync_buff(ipvs, ms);
		/* Checked after the fetch, so sb may be non-NULL at "done" */
		if (unlikely(kthread_should_stop()))
			break;
		if (!sb) {
			/* NOTE(review): no task state is set here before
			 * sleeping; presumably next_sync_buff()/sb_dequeue()
			 * leaves the task in a sleeping state when nothing
			 * is queued - confirm.
			 */
			schedule_timeout(IPVS_SYNC_CHECK_PERIOD);
			continue;
		}
		/* ip_vs_send_sync_msg() is negative only for -EAGAIN, so
		 * this retries until the send no longer would block.
		 */
		while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) {
			/* Sleep until the socket is writeable again or the
			 * thread is asked to stop.
			 */
			__wait_event_interruptible(*sk_sleep(sk),
						   sock_writeable(sk) ||
						   kthread_should_stop());
			if (unlikely(kthread_should_stop()))
				goto done;
		}
		ip_vs_sync_buff_release(sb);
	}

done:
	__set_current_state(TASK_RUNNING);
	/* Buffer fetched but not (fully) sent before the stop request */
	if (sb)
		ip_vs_sync_buff_release(sb);

	/* Clean up the sync buffer queue */
	while ((sb = sb_dequeue(ipvs, ms)))
		ip_vs_sync_buff_release(sb);
	/* NOTE(review): state is reset a second time, presumably because
	 * sb_dequeue() may change the task state - confirm.
	 */
	__set_current_state(TASK_RUNNING);

	/* Clean up the current sync buffer */
	sb = get_curr_sync_buff(ipvs, ms, 0);
	if (sb)
		ip_vs_sync_buff_release(sb);

	return 0;
}
1720
1721
1722static int sync_thread_backup(void *data)
1723{
1724 struct ip_vs_sync_thread_data *tinfo = data;
1725 struct netns_ipvs *ipvs = tinfo->ipvs;
1726 int len;
1727
1728 pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
1729 "syncid = %d, id = %d\n",
1730 ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id);
1731
1732 while (!kthread_should_stop()) {
1733 wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
1734 !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
1735 || kthread_should_stop());
1736
1737
1738 while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
1739 len = ip_vs_receive(tinfo->sock, tinfo->buf,
1740 ipvs->bcfg.sync_maxlen);
1741 if (len <= 0) {
1742 if (len != -EAGAIN)
1743 pr_err("receiving message error\n");
1744 break;
1745 }
1746
1747 ip_vs_process_message(ipvs, tinfo->buf, len);
1748 }
1749 }
1750
1751 return 0;
1752}
1753
1754
/*
 * Start the sync daemon threads for one role.
 *
 * @ipvs:  per-netns IPVS state
 * @c:     daemon configuration; unset fields (address family/group, port,
 *         TTL, sync_maxlen) are filled in or clamped IN PLACE before the
 *         config is copied to ipvs->mcfg or ipvs->bcfg
 * @state: IP_VS_STATE_MASTER or IP_VS_STATE_BACKUP
 *
 * Returns 0 on success, or a negative error code:
 *   -ENOPROTOOPT  ip_vs_use_count_inc() failed
 *   -ENODEV       c->mcast_ifn names no device in this netns
 *   -EEXIST       the requested role is already set up
 *   -EINVAL       @state is neither MASTER nor BACKUP
 *   -ENOMEM       an allocation failed
 *   or the error from socket setup / kthread_run().
 *
 * The use count taken at entry is kept on success and dropped on every
 * failure path.
 */
int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
		      int state)
{
	struct ip_vs_sync_thread_data *ti = NULL, *tinfo;
	struct task_struct *task;
	struct net_device *dev;
	char *name;
	int (*threadfn)(void *data);
	int id = 0, count, hlen;
	int result = -ENOMEM;
	u16 mtu, min_mtu;

	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n",
		  sizeof(struct ip_vs_sync_conn_v0));

	/* Take the use count first; it is dropped again on any failure */
	if (!ip_vs_use_count_inc())
		return -ENOPROTOOPT;

	/* Acquire both RTNL and sync_mutex without ever blocking on one
	 * while holding the other: block on one, try-lock the other, and
	 * start over with the roles swapped if the try-lock fails.
	 */
	for (;;) {
		rtnl_lock();
		if (mutex_trylock(&ipvs->sync_mutex))
			break;
		rtnl_unlock();
		mutex_lock(&ipvs->sync_mutex);
		if (rtnl_trylock())
			break;
		mutex_unlock(&ipvs->sync_mutex);
	}

	/* The thread count is fixed by whichever role starts first; a role
	 * started while another one is active reuses the stored value.
	 */
	if (!ipvs->sync_state) {
		count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX);
		ipvs->threads_mask = count - 1;
	} else
		count = ipvs->threads_mask + 1;

	/* Fill in defaults for everything the caller left unset */
	if (c->mcast_af == AF_UNSPEC) {
		c->mcast_af = AF_INET;
		c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP);
	}
	if (!c->mcast_port)
		c->mcast_port = IP_VS_SYNC_PORT;
	if (!c->mcast_ttl)
		c->mcast_ttl = 1;

	dev = __dev_get_by_name(ipvs->net, c->mcast_ifn);
	if (!dev) {
		pr_err("Unknown mcast interface: %s\n", c->mcast_ifn);
		result = -ENODEV;
		goto out_early;
	}
	/* IP + UDP header size, subtracted from the MTU / 64K limit below */
	hlen = (AF_INET6 == c->mcast_af) ?
	       sizeof(struct ipv6hdr) + sizeof(struct udphdr) :
	       sizeof(struct iphdr) + sizeof(struct udphdr);
	/* Backup uses the device MTU (at least 1500), master a fixed 1500 */
	mtu = (state == IP_VS_STATE_BACKUP) ?
		  clamp(dev->mtu, 1500U, 65535U) : 1500U;
	min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1;

	/* A configured sync_maxlen is only clamped; otherwise derive it */
	if (c->sync_maxlen)
		c->sync_maxlen = clamp_t(unsigned int,
					 c->sync_maxlen, min_mtu,
					 65535 - hlen);
	else
		c->sync_maxlen = mtu - hlen;

	/* Refuse a second instance of the same role, then latch the config
	 * and pick thread name format and thread function.
	 */
	if (state == IP_VS_STATE_MASTER) {
		result = -EEXIST;
		if (ipvs->ms)
			goto out_early;

		ipvs->mcfg = *c;
		name = "ipvs-m:%d:%d";
		threadfn = sync_thread_master;
	} else if (state == IP_VS_STATE_BACKUP) {
		result = -EEXIST;
		if (ipvs->backup_tinfo)
			goto out_early;

		ipvs->bcfg = *c;
		name = "ipvs-b:%d:%d";
		threadfn = sync_thread_backup;
	} else {
		result = -EINVAL;
		goto out_early;
	}

	/* Master only: one queue state per thread */
	if (state == IP_VS_STATE_MASTER) {
		struct ipvs_master_sync_state *ms;

		result = -ENOMEM;
		ipvs->ms = kcalloc(count, sizeof(ipvs->ms[0]), GFP_KERNEL);
		if (!ipvs->ms)
			goto out;
		ms = ipvs->ms;
		/* Note: this loop leaves id == count */
		for (id = 0; id < count; id++, ms++) {
			INIT_LIST_HEAD(&ms->sync_queue);
			ms->sync_queue_len = 0;
			ms->sync_queue_delay = 0;
			INIT_DELAYED_WORK(&ms->master_wakeup_work,
					  master_wakeup_work_handler);
			ms->ipvs = ipvs;
		}
	}
	result = -ENOMEM;
	/* Zeroed, so unset task/sock/buf fields are safe to test at "out" */
	ti = kcalloc(count, sizeof(struct ip_vs_sync_thread_data),
		     GFP_KERNEL);
	if (!ti)
		goto out;

	/* Per thread: receive buffer (backup only), socket, then kthread */
	for (id = 0; id < count; id++) {
		tinfo = &ti[id];
		tinfo->ipvs = ipvs;
		if (state == IP_VS_STATE_BACKUP) {
			result = -ENOMEM;
			tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen,
					     GFP_KERNEL);
			if (!tinfo->buf)
				goto out;
		}
		tinfo->id = id;
		if (state == IP_VS_STATE_MASTER)
			result = make_send_sock(ipvs, id, dev, &tinfo->sock);
		else
			result = make_receive_sock(ipvs, id, dev, &tinfo->sock);
		if (result < 0)
			goto out;

		task = kthread_run(threadfn, tinfo, name, ipvs->gen, id);
		if (IS_ERR(task)) {
			result = PTR_ERR(task);
			goto out;
		}
		tinfo->task = task;
	}

	/* Mark as active: publish the thread array, then set the role bit */

	if (state == IP_VS_STATE_MASTER)
		ipvs->master_tinfo = ti;
	else
		ipvs->backup_tinfo = ti;
	spin_lock_bh(&ipvs->sync_buff_lock);
	ipvs->sync_state |= state;
	spin_unlock_bh(&ipvs->sync_buff_lock);

	mutex_unlock(&ipvs->sync_mutex);
	rtnl_unlock();

	return 0;

out:
	/* RTNL is dropped before the cleanup below.
	 * NOTE(review): presumably so that sock_release() can take RTNL
	 * itself when leaving the multicast group - confirm.
	 */
	rtnl_unlock();
	/* id may equal count here (see the ms init loop); clamp it to the
	 * last valid index of ti[].
	 */
	id = min(id, count - 1);
	if (ti) {
		/* Stop the threads that were already started */
		for (tinfo = ti + id; tinfo >= ti; tinfo--) {
			if (tinfo->task)
				kthread_stop(tinfo->task);
		}
	}
	/* The MASTER bit is only set after a successful start, so if it is
	 * set ipvs->ms belongs to a running master and must be kept.
	 */
	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
		kfree(ipvs->ms);
		ipvs->ms = NULL;
	}
	mutex_unlock(&ipvs->sync_mutex);

	/* No more mutexes held: release sockets and buffers */
	if (ti) {
		for (tinfo = ti + id; tinfo >= ti; tinfo--) {
			if (tinfo->sock)
				sock_release(tinfo->sock);
			kfree(tinfo->buf);
		}
		kfree(ti);
	}

	/* Drop the use count taken at entry */
	ip_vs_use_count_dec();
	return result;

out_early:
	/* Nothing allocated yet: just drop both locks */
	mutex_unlock(&ipvs->sync_mutex);
	rtnl_unlock();

	/* Drop the use count taken at entry */
	ip_vs_use_count_dec();
	return result;
}
1947
1948
/*
 * Stop the sync daemon threads of one role and free their resources.
 *
 * @ipvs:  per-netns IPVS state
 * @state: IP_VS_STATE_MASTER or IP_VS_STATE_BACKUP
 *
 * Returns -EINVAL for any other @state and -ESRCH when the requested
 * role is not set up. Otherwise returns the kthread_stop() results
 * folded together: the first negative one if any, else the last one.
 * The use count is dropped only when threads were actually stopped.
 */
int stop_sync_thread(struct netns_ipvs *ipvs, int state)
{
	struct ip_vs_sync_thread_data *ti, *tinfo;
	int id;
	int retc = -EINVAL;

	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));

	mutex_lock(&ipvs->sync_mutex);
	if (state == IP_VS_STATE_MASTER) {
		retc = -ESRCH;
		if (!ipvs->ms)
			goto err;
		ti = ipvs->master_tinfo;

		/* Clear the MASTER bit while holding both sync_buff_lock
		 * and sync_lock.
		 * NOTE(review): presumably so that code queueing sync
		 * buffers under these locks cannot add more once the
		 * daemon is being stopped - confirm against the queueing
		 * helpers.
		 */
		spin_lock_bh(&ipvs->sync_buff_lock);
		spin_lock(&ipvs->sync_lock);
		ipvs->sync_state &= ~IP_VS_STATE_MASTER;
		spin_unlock(&ipvs->sync_lock);
		spin_unlock_bh(&ipvs->sync_buff_lock);

		retc = 0;
		for (id = ipvs->threads_mask; id >= 0; id--) {
			struct ipvs_master_sync_state *ms = &ipvs->ms[id];
			int ret;

			tinfo = &ti[id];
			pr_info("stopping master sync thread %d ...\n",
				task_pid_nr(tinfo->task));
			/* Make sure the wakeup work cannot run any more
			 * before its target thread goes away.
			 */
			cancel_delayed_work_sync(&ms->master_wakeup_work);
			ret = kthread_stop(tinfo->task);
			/* Once negative, retc is not overwritten again */
			if (retc >= 0)
				retc = ret;
		}
		kfree(ipvs->ms);
		ipvs->ms = NULL;
		ipvs->master_tinfo = NULL;
	} else if (state == IP_VS_STATE_BACKUP) {
		retc = -ESRCH;
		if (!ipvs->backup_tinfo)
			goto err;
		ti = ipvs->backup_tinfo;

		ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
		retc = 0;
		for (id = ipvs->threads_mask; id >= 0; id--) {
			int ret;

			tinfo = &ti[id];
			pr_info("stopping backup sync thread %d ...\n",
				task_pid_nr(tinfo->task));
			ret = kthread_stop(tinfo->task);
			/* Once negative, retc is not overwritten again */
			if (retc >= 0)
				retc = ret;
		}
		ipvs->backup_tinfo = NULL;
	} else {
		goto err;
	}
	/* Read the thread count while sync_mutex is still held */
	id = ipvs->threads_mask;
	mutex_unlock(&ipvs->sync_mutex);

	/* No more mutexes held: release sockets and buffers */
	for (tinfo = ti + id; tinfo >= ti; tinfo--) {
		if (tinfo->sock)
			sock_release(tinfo->sock);
		kfree(tinfo->buf);
	}
	kfree(ti);

	/* Drop the module use count */
	ip_vs_use_count_dec();
	return retc;

err:
	mutex_unlock(&ipvs->sync_mutex);
	return retc;
}
2033
2034
2035
2036
2037int __net_init ip_vs_sync_net_init(struct netns_ipvs *ipvs)
2038{
2039 __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key);
2040 spin_lock_init(&ipvs->sync_lock);
2041 spin_lock_init(&ipvs->sync_buff_lock);
2042 return 0;
2043}
2044
2045void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs)
2046{
2047 int retc;
2048
2049 retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER);
2050 if (retc && retc != -ESRCH)
2051 pr_err("Failed to stop Master Daemon\n");
2052
2053 retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP);
2054 if (retc && retc != -ESRCH)
2055 pr_err("Failed to stop Backup Daemon\n");
2056}
2057