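/*
 * ip_vs_sync.c: sync connection info from master load balancer to backups
 *               through multicast.
 *
 * The sync daemon speaks two wire formats: Version 0, the original
 * IPv4-only format, and Version 1, which adds IPv6 addresses,
 * persistence engine data, timeouts and firewall marks.  Version 0
 * receivers simply drop Version 1 messages.  Version 0 can be selected
 * with sysctl -w net.ipv4.vs.sync_version=0.
 */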
#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/inetdevice.h>
#include <linux/net.h>
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/igmp.h>			/* for ip_mc_join_group */
#include <linux/udp.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>

#include <asm/unaligned.h>		/* Used for ntoh_seq and hton_seq */

#include <net/ip.h>
#include <net/sock.h>

#include <net/ip_vs.h>

#define IP_VS_SYNC_GROUP 0xe0000051    /* multicast addr - 224.0.0.81 */
#define IP_VS_SYNC_PORT  8848          /* multicast port */

#define SYNC_PROTO_VER  1		/* Protocol version in header */

static struct lock_class_key __ipvs_sync_key;
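
/*
 *	IPVS sync connection entry
 *	Version 0, i.e. original version.
 */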
struct ip_vs_sync_conn_v0 {
	__u8			reserved;

	/* Protocol, addresses and port numbers */
	__u8			protocol;	/* Which protocol (TCP/UDP) */
	__be16			cport;
	__be16			vport;
	__be16			dport;
	__be32			caddr;		/* client address */
	__be32			vaddr;		/* virtual address */
	__be32			daddr;		/* destination address */

	/* Flags and state transition */
	__be16			flags;		/* status flags */
	__be16			state;		/* state info */

	/* The sequence options start here */
};

struct ip_vs_sync_conn_options {
	struct ip_vs_seq	in_seq;		/* incoming seq. struct */
	struct ip_vs_seq	out_seq;	/* outgoing seq. struct */
};
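
/*
 * Sync connection format (sync_conn), Version 1:
 *
 *	Type		- STYPE_F_INET6 set for IPv6 entries, 0 for IPv4
 *	Protocol	- TCP/UDP/SCTP
 *	Ver./Size	- version in the 4 msb, entry size in the 12 lsb
 *	Flags, State	- connection flags and state
 *	cport/vport/dport - client, virtual and destination ports
 *	fwmark		- firewall mark from skb
 *	timeout		- in seconds
 *	Addresses	- client, virtual and destination (v4 or v6)
 *
 * The fixed part is followed by optional parameters in
 * Type-Length-Value form; the last parameter data is padded
 * for 32 bit alignment.
 */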
struct ip_vs_sync_v4 {
	__u8			type;
	__u8			protocol;	/* Which protocol (TCP/UDP) */
	__be16			ver_size;	/* Version msb 4 bits */
	/* Flags and state transition */
	__be32			flags;		/* status flags */
	__be16			state;		/* state info */
	/* Protocol, addresses and port numbers */
	__be16			cport;
	__be16			vport;
	__be16			dport;
	__be32			fwmark;		/* Firewall mark from skb */
	__be32			timeout;	/* cp timeout */
	__be32			caddr;		/* client address */
	__be32			vaddr;		/* virtual address */
	__be32			daddr;		/* destination address */
	/* The sequence options start here */
	/* PE data padded to 32bit alignment after seq. options */
};

struct ip_vs_sync_v6 {
	__u8			type;
	__u8			protocol;	/* Which protocol (TCP/UDP) */
	__be16			ver_size;	/* Version msb 4 bits */
	/* Flags and state transition */
	__be32			flags;		/* status flags */
	__be16			state;		/* state info */
	/* Protocol, addresses and port numbers */
	__be16			cport;
	__be16			vport;
	__be16			dport;
	__be32			fwmark;		/* Firewall mark from skb */
	__be32			timeout;	/* cp timeout */
	struct in6_addr		caddr;		/* client address */
	struct in6_addr		vaddr;		/* virtual address */
	struct in6_addr		daddr;		/* destination address */
	/* The sequence options start here */
	/* PE data padded to 32bit alignment after seq. options */
};

union ip_vs_sync_conn {
	struct ip_vs_sync_v4	v4;
	struct ip_vs_sync_v6	v6;
};

/* Bits in Type field in above message */
#define STYPE_INET6		0
#define STYPE_F_INET6		(1 << STYPE_INET6)

#define SVER_SHIFT		12	/* Shift to get version */
#define SVER_MASK		0x0fff	/* Mask to strip version */

#define IPVS_OPT_SEQ_DATA	1
#define IPVS_OPT_PE_DATA	2
#define IPVS_OPT_PE_NAME	3
#define IPVS_OPT_PARAM		7

#define IPVS_OPT_F_SEQ_DATA	(1 << (IPVS_OPT_SEQ_DATA-1))
#define IPVS_OPT_F_PE_DATA	(1 << (IPVS_OPT_PE_DATA-1))
#define IPVS_OPT_F_PE_NAME	(1 << (IPVS_OPT_PE_NAME-1))
#define IPVS_OPT_F_PARAM	(1 << (IPVS_OPT_PARAM-1))

struct ip_vs_sync_thread_data {
	struct netns_ipvs *ipvs;
	struct socket *sock;
	char *buf;
	int id;
};

/* Version 0 definition of packet sizes */
#define SIMPLE_CONN_SIZE  (sizeof(struct ip_vs_sync_conn_v0))
#define FULL_CONN_SIZE  \
(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))
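
/*
 * The master multicasts messages (datagrams) to the backup load
 * balancers in the following format:
 *
 * Version 1 (the first byte is zero, so Version 0 receivers, which
 * expect a non-zero connection count there, will drop the packet):
 *	Zero (1) | SyncID (1) | Size (2)
 *	Count Conns (1) | Version (1) | Reserved, set to zero (2)
 *	followed by nr_conns ip_vs_sync_conn entries
 *
 * Version 0 header:
 *	Count Conns (1) | SyncID (1) | Size (2)
 *	followed by nr_conns ip_vs_sync_conn_v0 entries
 */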
#define SYNC_MESG_HEADER_LEN	4
#define MAX_CONNS_PER_SYNCBUFF	255 /* nr_conns in ip_vs_sync_mesg is 8 bit */

/* Version 0 header */
struct ip_vs_sync_mesg_v0 {
	__u8			nr_conns;
	__u8			syncid;
	__be16			size;

	/* ip_vs_sync_conn entries start here */
};

/* Version 1 header */
struct ip_vs_sync_mesg {
	__u8			reserved;	/* must be zero */
	__u8			syncid;
	__be16			size;
	__u8			nr_conns;
	__s8			version;	/* SYNC_PROTO_VER  */
	__u16			spare;
	/* ip_vs_sync_conn entries start here */
};

union ipvs_sockaddr {
	struct sockaddr_in	in;
	struct sockaddr_in6	in6;
};

struct ip_vs_sync_buff {
	struct list_head	list;
	unsigned long		firstuse;

	/* pointers for the message data */
	struct ip_vs_sync_mesg	*mesg;
	unsigned char		*head;
	unsigned char		*end;
};

/*
 * Copy of struct ip_vs_seq
 * From unaligned network order to aligned host order
 */
static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho)
{
	memset(ho, 0, sizeof(*ho));
	ho->init_seq       = get_unaligned_be32(&no->init_seq);
	ho->delta          = get_unaligned_be32(&no->delta);
	ho->previous_delta = get_unaligned_be32(&no->previous_delta);
}

/*
 * Copy of struct ip_vs_seq
 * From aligned host order to unaligned network order
 */
static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
{
	put_unaligned_be32(ho->init_seq, &no->init_seq);
	put_unaligned_be32(ho->delta, &no->delta);
	put_unaligned_be32(ho->previous_delta, &no->previous_delta);
}

/* Get the next buffer from the master sync queue; leaves the task in
 * TASK_INTERRUPTIBLE state when the queue is empty, so the master
 * thread can go to sleep.
 */
static inline struct ip_vs_sync_buff *
sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
{
	struct ip_vs_sync_buff *sb;

	spin_lock_bh(&ipvs->sync_lock);
	if (list_empty(&ms->sync_queue)) {
		sb = NULL;
		__set_current_state(TASK_INTERRUPTIBLE);
	} else {
		sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff,
				list);
		list_del(&sb->list);
		ms->sync_queue_len--;
		if (!ms->sync_queue_len)
			ms->sync_queue_delay = 0;
	}
	spin_unlock_bh(&ipvs->sync_lock);

	return sb;
}

/*
 * Create a new sync buffer for Version 1 proto.
 */
static inline struct ip_vs_sync_buff *
ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len)
{
	struct ip_vs_sync_buff *sb;

	if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
		return NULL;

	len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg),
		    ipvs->mcfg.sync_maxlen);
	sb->mesg = kmalloc(len, GFP_ATOMIC);
	if (!sb->mesg) {
		kfree(sb);
		return NULL;
	}
	sb->mesg->reserved = 0;  /* old nr_conns i.e. must be zero now */
	sb->mesg->version = SYNC_PROTO_VER;
	sb->mesg->syncid = ipvs->mcfg.syncid;
	sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg));
	sb->mesg->nr_conns = 0;
	sb->mesg->spare = 0;
	sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
	sb->end = (unsigned char *)sb->mesg + len;

	sb->firstuse = jiffies;
	return sb;
}

static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
{
	kfree(sb->mesg);
	kfree(sb);
}

static inline void sb_queue_tail(struct netns_ipvs *ipvs,
				 struct ipvs_master_sync_state *ms)
{
	struct ip_vs_sync_buff *sb = ms->sync_buff;

	spin_lock(&ipvs->sync_lock);
	if (ipvs->sync_state & IP_VS_STATE_MASTER &&
	    ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) {
		if (!ms->sync_queue_len)
			schedule_delayed_work(&ms->master_wakeup_work,
					      max(IPVS_SYNC_SEND_DELAY, 1));
		ms->sync_queue_len++;
		list_add_tail(&sb->list, &ms->sync_queue);
		if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE)
			wake_up_process(ms->master_thread);
	} else
		ip_vs_sync_buff_release(sb);
	spin_unlock(&ipvs->sync_lock);
}

/*
 *	Get the current sync buffer if it has been created for more
 *	than the specified time or the specified time is zero.
 */
static inline struct ip_vs_sync_buff *
get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms,
		   unsigned long time)
{
	struct ip_vs_sync_buff *sb;

	spin_lock_bh(&ipvs->sync_buff_lock);
	sb = ms->sync_buff;
	if (sb && time_after_eq(jiffies - sb->firstuse, time)) {
		ms->sync_buff = NULL;
		__set_current_state(TASK_RUNNING);
	} else
		sb = NULL;
	spin_unlock_bh(&ipvs->sync_buff_lock);
	return sb;
}

static inline int
select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp)
{
	return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask;
}

/*
 * Create a new sync buffer for Version 0 proto.
 */
static inline struct ip_vs_sync_buff *
ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len)
{
	struct ip_vs_sync_buff *sb;
	struct ip_vs_sync_mesg_v0 *mesg;

	if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
		return NULL;

	len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0),
		    ipvs->mcfg.sync_maxlen);
	sb->mesg = kmalloc(len, GFP_ATOMIC);
	if (!sb->mesg) {
		kfree(sb);
		return NULL;
	}
	mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
	mesg->nr_conns = 0;
	mesg->syncid = ipvs->mcfg.syncid;
	mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0));
	sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0);
	sb->end = (unsigned char *)mesg + len;
	sb->firstuse = jiffies;
	return sb;
}

/* Check if connection is controlled by persistence */
static inline bool in_persistence(struct ip_vs_conn *cp)
{
	for (cp = cp->control; cp; cp = cp->control) {
		if (cp->flags & IP_VS_CONN_F_TEMPLATE)
			return true;
	}
	return false;
}
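
/* Check if conn should be synced.
 * pkts: conn packets, use sysctl_sync_threshold to avoid packet check
 * - (1) sync_refresh_period: reduce sync rate. Additionally, retry
 *	sync_retries times with period of sync_refresh_period/8
 * - (2) if both sync_refresh_period and sync_period are 0 send sync only
 *	for state changes or only once when pkts matches sync_threshold
 * - (3) templates: rate can be reduced only with sync_refresh_period or
 *	with (2)
 */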
static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs,
				  struct ip_vs_conn *cp, int pkts)
{
	unsigned long orig = READ_ONCE(cp->sync_endtime);
	unsigned long now = jiffies;
	unsigned long n = (now + cp->timeout) & ~3UL;
	unsigned int sync_refresh_period;
	int sync_period;
	int force;

	/* Check if we sync in current state */
	if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE))
		force = 0;
	else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp)))
		return 0;
	else if (likely(cp->protocol == IPPROTO_TCP)) {
		if (!((1 << cp->state) &
		      ((1 << IP_VS_TCP_S_ESTABLISHED) |
		       (1 << IP_VS_TCP_S_FIN_WAIT) |
		       (1 << IP_VS_TCP_S_CLOSE) |
		       (1 << IP_VS_TCP_S_CLOSE_WAIT) |
		       (1 << IP_VS_TCP_S_TIME_WAIT))))
			return 0;
		force = cp->state != cp->old_state;
		if (force && cp->state != IP_VS_TCP_S_ESTABLISHED)
			goto set;
	} else if (unlikely(cp->protocol == IPPROTO_SCTP)) {
		if (!((1 << cp->state) &
		      ((1 << IP_VS_SCTP_S_ESTABLISHED) |
		       (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) |
		       (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) |
		       (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) |
		       (1 << IP_VS_SCTP_S_CLOSED))))
			return 0;
		force = cp->state != cp->old_state;
		if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED)
			goto set;
	} else {
		/* UDP or another protocol with single state */
		force = 0;
	}

	sync_refresh_period = sysctl_sync_refresh_period(ipvs);
	if (sync_refresh_period > 0) {
		long diff = n - orig;
		long min_diff = max(cp->timeout >> 1, 10UL * HZ);

		/* Avoid sync if difference is below sync_refresh_period
		 * and below the half timeout.
		 */
		if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) {
			int retries = orig & 3;

			if (retries >= sysctl_sync_retries(ipvs))
				return 0;
			if (time_before(now, orig - cp->timeout +
					(sync_refresh_period >> 3)))
				return 0;
			n |= retries + 1;
		}
	}
	sync_period = sysctl_sync_period(ipvs);
	if (sync_period > 0) {
		if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) &&
		    pkts % sync_period != sysctl_sync_threshold(ipvs))
			return 0;
	} else if (!sync_refresh_period &&
		   pkts != sysctl_sync_threshold(ipvs))
		return 0;

set:
	cp->old_state = cp->state;
	n = cmpxchg(&cp->sync_endtime, orig, n);
	return n == orig || force;
}
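
/*
 *      Version 0, can be switched in by sysctl.
 *      Add an ip_vs_conn information into the current sync_buff.
 */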
static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp,
			       int pkts)
{
	struct ip_vs_sync_mesg_v0 *m;
	struct ip_vs_sync_conn_v0 *s;
	struct ip_vs_sync_buff *buff;
	struct ipvs_master_sync_state *ms;
	int id;
	unsigned int len;

	if (unlikely(cp->af != AF_INET))
		return;
	/* Do not sync ONE PACKET */
	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
		return;

	if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
		return;

	spin_lock_bh(&ipvs->sync_buff_lock);
	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
		spin_unlock_bh(&ipvs->sync_buff_lock);
		return;
	}

	id = select_master_thread_id(ipvs, cp);
	ms = &ipvs->ms[id];
	buff = ms->sync_buff;
	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
		SIMPLE_CONN_SIZE;
	if (buff) {
		m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
		/* Send buffer if it is for v1 */
		if (buff->head + len > buff->end || !m->nr_conns) {
			sb_queue_tail(ipvs, ms);
			ms->sync_buff = NULL;
			buff = NULL;
		}
	}
	if (!buff) {
		buff = ip_vs_sync_buff_create_v0(ipvs, len);
		if (!buff) {
			spin_unlock_bh(&ipvs->sync_buff_lock);
			pr_err("ip_vs_sync_buff_create failed.\n");
			return;
		}
		ms->sync_buff = buff;
	}

	m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
	s = (struct ip_vs_sync_conn_v0 *) buff->head;

	/* copy members */
	s->reserved = 0;
	s->protocol = cp->protocol;
	s->cport = cp->cport;
	s->vport = cp->vport;
	s->dport = cp->dport;
	s->caddr = cp->caddr.ip;
	s->vaddr = cp->vaddr.ip;
	s->daddr = cp->daddr.ip;
	s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
	s->state = htons(cp->state);
	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
		struct ip_vs_sync_conn_options *opt =
			(struct ip_vs_sync_conn_options *)&s[1];
		memcpy(opt, &cp->in_seq, sizeof(*opt));
	}

	m->nr_conns++;
	m->size = htons(ntohs(m->size) + len);
	buff->head += len;
	spin_unlock_bh(&ipvs->sync_buff_lock);

	/* synchronize its controller if it has */
	cp = cp->control;
	if (cp) {
		if (cp->flags & IP_VS_CONN_F_TEMPLATE)
			pkts = atomic_add_return(1, &cp->in_pkts);
		else
			pkts = sysctl_sync_threshold(ipvs);
		ip_vs_sync_conn(ipvs, cp, pkts);
	}
}
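
/*
 *      Add an ip_vs_conn information into the current sync_buff.
 *      Called by ip_vs_in.
 *      Sending Version 1 messages
 */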
void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts)
{
	struct ip_vs_sync_mesg *m;
	union ip_vs_sync_conn *s;
	struct ip_vs_sync_buff *buff;
	struct ipvs_master_sync_state *ms;
	int id;
	__u8 *p;
	unsigned int len, pe_name_len, pad;

	/* Handle old version of the protocol */
	if (sysctl_sync_ver(ipvs) == 0) {
		ip_vs_sync_conn_v0(ipvs, cp, pkts);
		return;
	}
	/* Do not sync ONE PACKET */
	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
		goto control;
sloop:
	if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
		goto control;

	/* Sanity checks */
	pe_name_len = 0;
	if (cp->pe_data_len) {
		if (!cp->pe_data || !cp->dest) {
			IP_VS_ERR_RL("SYNC, connection pe_data invalid\n");
			return;
		}
		pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
	}

	spin_lock_bh(&ipvs->sync_buff_lock);
	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
		spin_unlock_bh(&ipvs->sync_buff_lock);
		return;
	}

	id = select_master_thread_id(ipvs, cp);
	ms = &ipvs->ms[id];

#ifdef CONFIG_IP_VS_IPV6
	if (cp->af == AF_INET6)
		len = sizeof(struct ip_vs_sync_v6);
	else
#endif
		len = sizeof(struct ip_vs_sync_v4);

	if (cp->flags & IP_VS_CONN_F_SEQ_MASK)
		len += sizeof(struct ip_vs_sync_conn_options) + 2;

	if (cp->pe_data_len)
		len += cp->pe_data_len + 2;	/* + Param hdr field */
	if (pe_name_len)
		len += pe_name_len + 2;

	/* check if there is a space for this one */
	pad = 0;
	buff = ms->sync_buff;
	if (buff) {
		m = buff->mesg;
		pad = (4 - (size_t) buff->head) & 3;
		/* Send buffer if it is for v0 */
		if (buff->head + len + pad > buff->end || m->reserved) {
			sb_queue_tail(ipvs, ms);
			ms->sync_buff = NULL;
			buff = NULL;
			pad = 0;
		}
	}

	if (!buff) {
		buff = ip_vs_sync_buff_create(ipvs, len);
		if (!buff) {
			spin_unlock_bh(&ipvs->sync_buff_lock);
			pr_err("ip_vs_sync_buff_create failed.\n");
			return;
		}
		ms->sync_buff = buff;
		m = buff->mesg;
	}

	p = buff->head;
	buff->head += pad + len;
	m->size = htons(ntohs(m->size) + pad + len);
	/* Add ev. padding from prev. sync_conn */
	while (pad--)
		*(p++) = 0;

	s = (union ip_vs_sync_conn *)p;

	/* Set message type & copy members */
	s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0);
	s->v4.ver_size = htons(len & SVER_MASK);	/* Version 0 */
	s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED);
	s->v4.state = htons(cp->state);
	s->v4.protocol = cp->protocol;
	s->v4.cport = cp->cport;
	s->v4.vport = cp->vport;
	s->v4.dport = cp->dport;
	s->v4.fwmark = htonl(cp->fwmark);
	s->v4.timeout = htonl(cp->timeout / HZ);
	m->nr_conns++;

#ifdef CONFIG_IP_VS_IPV6
	if (cp->af == AF_INET6) {
		p += sizeof(struct ip_vs_sync_v6);
		s->v6.caddr = cp->caddr.in6;
		s->v6.vaddr = cp->vaddr.in6;
		s->v6.daddr = cp->daddr.in6;
	} else
#endif
	{
		p += sizeof(struct ip_vs_sync_v4);	/* options ptr */
		s->v4.caddr = cp->caddr.ip;
		s->v4.vaddr = cp->vaddr.ip;
		s->v4.daddr = cp->daddr.ip;
	}
	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
		*(p++) = IPVS_OPT_SEQ_DATA;
		*(p++) = sizeof(struct ip_vs_sync_conn_options);
		hton_seq((struct ip_vs_seq *)p, &cp->in_seq);
		p += sizeof(struct ip_vs_seq);
		hton_seq((struct ip_vs_seq *)p, &cp->out_seq);
		p += sizeof(struct ip_vs_seq);
	}
	/* Handle pe data */
	if (cp->pe_data_len && cp->pe_data) {
		*(p++) = IPVS_OPT_PE_DATA;
		*(p++) = cp->pe_data_len;
		memcpy(p, cp->pe_data, cp->pe_data_len);
		p += cp->pe_data_len;
		if (pe_name_len) {
			/* Add PE_NAME */
			*(p++) = IPVS_OPT_PE_NAME;
			*(p++) = pe_name_len;
			memcpy(p, cp->pe->name, pe_name_len);
			p += pe_name_len;
		}
	}

	spin_unlock_bh(&ipvs->sync_buff_lock);

control:
	/* synchronize its controller if it has */
	cp = cp->control;
	if (!cp)
		return;
	if (cp->flags & IP_VS_CONN_F_TEMPLATE)
		pkts = atomic_add_return(1, &cp->in_pkts);
	else
		pkts = sysctl_sync_threshold(ipvs);
	goto sloop;
}

/*
 *  fill_param used by version 1
 */
static inline int
ip_vs_conn_fill_param_sync(struct netns_ipvs *ipvs, int af, union ip_vs_sync_conn *sc,
			   struct ip_vs_conn_param *p,
			   __u8 *pe_data, unsigned int pe_data_len,
			   __u8 *pe_name, unsigned int pe_name_len)
{
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		ip_vs_conn_fill_param(ipvs, af, sc->v6.protocol,
				      (const union nf_inet_addr *)&sc->v6.caddr,
				      sc->v6.cport,
				      (const union nf_inet_addr *)&sc->v6.vaddr,
				      sc->v6.vport, p);
	else
#endif
		ip_vs_conn_fill_param(ipvs, af, sc->v4.protocol,
				      (const union nf_inet_addr *)&sc->v4.caddr,
				      sc->v4.cport,
				      (const union nf_inet_addr *)&sc->v4.vaddr,
				      sc->v4.vport, p);
	/* Handle pe data */
	if (pe_data_len) {
		if (pe_name_len) {
			char buff[IP_VS_PENAME_MAXLEN+1];

			memcpy(buff, pe_name, pe_name_len);
			buff[pe_name_len]=0;
			p->pe = __ip_vs_pe_getbyname(buff);
			if (!p->pe) {
				IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n",
					  buff);
				return 1;
			}
		} else {
			IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n");
			return 1;
		}

		p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC);
		if (!p->pe_data) {
			module_put(p->pe->module);
			return -ENOMEM;
		}
		p->pe_data_len = pe_data_len;
	}
	return 0;
}
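
/*
 *  Connection Add / Update.
 *  Common for version 0 and 1 reception of backup sync_conns.
 *  timeout is in seconds.
 */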
static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *param,
			    unsigned int flags, unsigned int state,
			    unsigned int protocol, unsigned int type,
			    const union nf_inet_addr *daddr, __be16 dport,
			    unsigned long timeout, __u32 fwmark,
			    struct ip_vs_sync_conn_options *opt)
{
	struct ip_vs_dest *dest;
	struct ip_vs_conn *cp;

	if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
		cp = ip_vs_conn_in_get(param);
		if (cp && ((cp->dport != dport) ||
			   !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) {
			if (!(flags & IP_VS_CONN_F_INACTIVE)) {
				ip_vs_conn_expire_now(cp);
				__ip_vs_conn_put(cp);
				cp = NULL;
			} else {
				/* This is the expiration message for the
				 * connection that was already replaced, so we
				 * just ignore it.
				 */
				__ip_vs_conn_put(cp);
				kfree(param->pe_data);
				return;
			}
		}
	} else {
		cp = ip_vs_ct_in_get(param);
	}

	if (cp) {
		/* Free pe_data */
		kfree(param->pe_data);

		dest = cp->dest;
		spin_lock_bh(&cp->lock);
		if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE &&
		    !(flags & IP_VS_CONN_F_TEMPLATE) && dest) {
			if (flags & IP_VS_CONN_F_INACTIVE) {
				atomic_dec(&dest->activeconns);
				atomic_inc(&dest->inactconns);
			} else {
				atomic_inc(&dest->activeconns);
				atomic_dec(&dest->inactconns);
			}
		}
		flags &= IP_VS_CONN_F_BACKUP_UPD_MASK;
		flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK;
		cp->flags = flags;
		spin_unlock_bh(&cp->lock);
		if (!dest)
			ip_vs_try_bind_dest(cp);
	} else {
		/*
		 * Find the appropriate destination for the connection.
		 * If it is not found the connection will remain unbound
		 * but still handled.
		 */
		rcu_read_lock();
		/* This function is only invoked by the synchronization
		 * code. We do not currently support heterogeneous pools
		 * with synchronization, so we can make the assumption that
		 * the svc_af is the same as the dest_af
		 */
		dest = ip_vs_find_dest(ipvs, type, type, daddr, dport,
				       param->vaddr, param->vport, protocol,
				       fwmark, flags);

		cp = ip_vs_conn_new(param, type, daddr, dport, flags, dest,
				    fwmark);
		rcu_read_unlock();
		if (!cp) {
			kfree(param->pe_data);
			IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
			return;
		}
		if (!(flags & IP_VS_CONN_F_TEMPLATE))
			kfree(param->pe_data);
	}

	if (opt) {
		cp->in_seq = opt->in_seq;
		cp->out_seq = opt->out_seq;
	}
	atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs));
	cp->state = state;
	cp->old_state = cp->state;
	/*
	 * For Ver 0 messages style
	 *  - Not possible to recover the right timeout for templates
	 *  - can not find the right fwmark virtual service.
	 *    If needed, we can do it for non-fwmark persistent services.
	 * For Ver 1 messages style
	 *  - No problem.
	 */
	if (timeout) {
		if (timeout > MAX_SCHEDULE_TIMEOUT / HZ)
			timeout = MAX_SCHEDULE_TIMEOUT / HZ;
		cp->timeout = timeout*HZ;
	} else {
		struct ip_vs_proto_data *pd;

		pd = ip_vs_proto_data_get(ipvs, protocol);
		if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table)
			cp->timeout = pd->timeout_table[state];
		else
			cp->timeout = (3*60*HZ);
	}
	ip_vs_conn_put(cp);
}
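
/*
 *  Process received multicast message for Version 0
 */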
static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer,
				     const size_t buflen)
{
	struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer;
	struct ip_vs_sync_conn_v0 *s;
	struct ip_vs_sync_conn_options *opt;
	struct ip_vs_protocol *pp;
	struct ip_vs_conn_param param;
	char *p;
	int i;

	p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0);
	for (i=0; i<m->nr_conns; i++) {
		unsigned int flags, state;

		if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
			IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
			return;
		}
		s = (struct ip_vs_sync_conn_v0 *) p;
		flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
		flags &= ~IP_VS_CONN_F_HASHED;
		if (flags & IP_VS_CONN_F_SEQ_MASK) {
			opt = (struct ip_vs_sync_conn_options *)&s[1];
			p += FULL_CONN_SIZE;
			if (p > buffer+buflen) {
				IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n");
				return;
			}
		} else {
			opt = NULL;
			p += SIMPLE_CONN_SIZE;
		}

		state = ntohs(s->state);
		if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
			pp = ip_vs_proto_get(s->protocol);
			if (!pp) {
				IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n",
					  s->protocol);
				continue;
			}
			if (state >= pp->num_states) {
				IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n",
					  pp->name, state);
				continue;
			}
		} else {
			if (state >= IP_VS_CTPL_S_LAST)
				IP_VS_DBG(7, "BACKUP v0, Invalid tpl state %u\n",
					  state);
		}

		ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol,
				      (const union nf_inet_addr *)&s->caddr,
				      s->cport,
				      (const union nf_inet_addr *)&s->vaddr,
				      s->vport, &param);

		/* Send timeout as Zero */
		ip_vs_proc_conn(ipvs, &param, flags, state, s->protocol, AF_INET,
				(union nf_inet_addr *)&s->daddr, s->dport,
				0, 0, opt);
	}
}

/*
 * Handle options
 */
static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen,
				    __u32 *opt_flags,
				    struct ip_vs_sync_conn_options *opt)
{
	struct ip_vs_sync_conn_options *topt;

	topt = (struct ip_vs_sync_conn_options *)p;

	if (plen != sizeof(struct ip_vs_sync_conn_options)) {
		IP_VS_DBG(2, "BACKUP, bogus conn options length\n");
		return -EINVAL;
	}
	if (*opt_flags & IPVS_OPT_F_SEQ_DATA) {
		IP_VS_DBG(2, "BACKUP, conn options found twice\n");
		return -EINVAL;
	}
	ntoh_seq(&topt->in_seq, &opt->in_seq);
	ntoh_seq(&topt->out_seq, &opt->out_seq);
	*opt_flags |= IPVS_OPT_F_SEQ_DATA;
	return 0;
}

static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len,
			  __u8 **data, unsigned int maxlen,
			  __u32 *opt_flags, __u32 flag)
{
	if (plen > maxlen) {
		IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen);
		return -EINVAL;
	}
	if (*opt_flags & flag) {
		IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag);
		return -EINVAL;
	}
	*data_len = plen;
	*data = p;
	*opt_flags |= flag;
	return 0;
}
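
/*
 *   Process a Version 1 sync. connection
 */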
static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *msg_end)
{
	struct ip_vs_sync_conn_options opt;
	union ip_vs_sync_conn *s;
	struct ip_vs_protocol *pp;
	struct ip_vs_conn_param param;
	__u32 flags;
	unsigned int af, state, pe_data_len=0, pe_name_len=0;
	__u8 *pe_data=NULL, *pe_name=NULL;
	__u32 opt_flags=0;
	int retc=0;

	s = (union ip_vs_sync_conn *) p;

	if (s->v6.type & STYPE_F_INET6) {
#ifdef CONFIG_IP_VS_IPV6
		af = AF_INET6;
		p += sizeof(struct ip_vs_sync_v6);
#else
		IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n");
		retc = 10;
		goto out;
#endif
	} else if (!s->v4.type) {
		af = AF_INET;
		p += sizeof(struct ip_vs_sync_v4);
	} else {
		return -10;
	}
	if (p > msg_end)
		return -20;

	/* Process optional params check Type & Len. */
	while (p < msg_end) {
		int ptype;
		int plen;

		if (p+2 > msg_end)
			return -30;
		ptype = *(p++);
		plen  = *(p++);

		if (!plen || ((p + plen) > msg_end))
			return -40;
		/* Handle seq option  p = param data */
		switch (ptype & ~IPVS_OPT_F_PARAM) {
		case IPVS_OPT_SEQ_DATA:
			if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt))
				return -50;
			break;

		case IPVS_OPT_PE_DATA:
			if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data,
					   IP_VS_PEDATA_MAXLEN, &opt_flags,
					   IPVS_OPT_F_PE_DATA))
				return -60;
			break;

		case IPVS_OPT_PE_NAME:
			if (ip_vs_proc_str(p, plen, &pe_name_len, &pe_name,
					   IP_VS_PENAME_MAXLEN, &opt_flags,
					   IPVS_OPT_F_PE_NAME))
				return -70;
			break;

		default:
			/* Param data mandatory ? */
			if (!(ptype & IPVS_OPT_F_PARAM)) {
				IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n",
					  ptype & ~IPVS_OPT_F_PARAM);
				retc = 20;
				goto out;
			}
		}
		p += plen;  /* Next option */
	}

	/* Get flags and Mask off unsupported */
	flags  = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK;
	flags |= IP_VS_CONN_F_SYNC;
	state = ntohs(s->v4.state);

	if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
		pp = ip_vs_proto_get(s->v4.protocol);
		if (!pp) {
			IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n",
				s->v4.protocol);
			retc = 30;
			goto out;
		}
		if (state >= pp->num_states) {
			IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n",
				pp->name, state);
			retc = 40;
			goto out;
		}
	} else {
		if (state >= IP_VS_CTPL_S_LAST)
			IP_VS_DBG(7, "BACKUP, Invalid tpl state %u\n",
				  state);
	}
	if (ip_vs_conn_fill_param_sync(ipvs, af, s, &param, pe_data,
				       pe_data_len, pe_name, pe_name_len)) {
		retc = 50;
		goto out;
	}
	/* If only IPv4, just silent skip IPv6 */
	if (af == AF_INET)
		ip_vs_proc_conn(ipvs, &param, flags, state, s->v4.protocol, af,
				(union nf_inet_addr *)&s->v4.daddr, s->v4.dport,
				ntohl(s->v4.timeout), ntohl(s->v4.fwmark),
				(opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
				);
#ifdef CONFIG_IP_VS_IPV6
	else
		ip_vs_proc_conn(ipvs, &param, flags, state, s->v6.protocol, af,
				(union nf_inet_addr *)&s->v6.daddr, s->v6.dport,
				ntohl(s->v6.timeout), ntohl(s->v6.fwmark),
				(opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
				);
#endif
	ip_vs_pe_put(param.pe);
	return 0;
	/* Error exit */
out:
	IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc);
	return retc;
}
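
/*
 *      Process received multicast message and create the corresponding
 *      ip_vs_conn entries.
 *      Handles Version 0 & 1
 */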
static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer,
				  const size_t buflen)
{
	struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
	__u8 *p, *msg_end;
	int i, nr_conns;

	if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) {
		IP_VS_DBG(2, "BACKUP, message header too short\n");
		return;
	}

	if (buflen != ntohs(m2->size)) {
		IP_VS_DBG(2, "BACKUP, bogus message size\n");
		return;
	}
	/* SyncID sanity check */
	if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) {
		IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
		return;
	}
	/* Handle version 1 message */
	if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0)
	    && (m2->spare == 0)) {

		msg_end = buffer + sizeof(struct ip_vs_sync_mesg);
		nr_conns = m2->nr_conns;

		for (i=0; i<nr_conns; i++) {
			union ip_vs_sync_conn *s;
			unsigned int size;
			int retc;

			p = msg_end;
			if (p + sizeof(s->v4) > buffer+buflen) {
				IP_VS_ERR_RL("BACKUP, Dropping buffer, too small\n");
				return;
			}
			s = (union ip_vs_sync_conn *)p;
			size = ntohs(s->v4.ver_size) & SVER_MASK;
			msg_end = p + size;
			/* Basic sanity checks */
			if (msg_end > buffer+buflen) {
				IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");
				return;
			}
			if (ntohs(s->v4.ver_size) >> SVER_SHIFT) {
				IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n",
					      ntohs(s->v4.ver_size) >> SVER_SHIFT);
				return;
			}
			/* Process a single sync_conn */
			retc = ip_vs_proc_sync_conn(ipvs, p, msg_end);
			if (retc < 0) {
				IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
					     retc);
				return;
			}
			/* Make sure we have 32 bit alignment */
			msg_end = p + ((size + 3) & ~3);
		}
	} else {
		/* Old type of message */
		ip_vs_process_message_v0(ipvs, buffer, buflen);
		return;
	}
}

/*
 *      Setup sndbuf (mode=1) or rcvbuf (mode=0)
 */
static void set_sock_size(struct sock *sk, int mode, int val)
{
	/* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */
	/* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */
	lock_sock(sk);
	if (mode) {
		val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2,
			      sysctl_wmem_max);
		sk->sk_sndbuf = val * 2;
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
	} else {
		val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2,
			      sysctl_rmem_max);
		sk->sk_rcvbuf = val * 2;
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
	}
	release_sock(sk);
}

/*
 *      Setup loopback of outgoing multicasts on a sync socket
 */
static void set_mcast_loop(struct sock *sk, u_char loop)
{
	struct inet_sock *inet = inet_sk(sk);

	/* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
	lock_sock(sk);
	inet->mc_loop = loop ? 1 : 0;
#ifdef CONFIG_IP_VS_IPV6
	if (sk->sk_family == AF_INET6) {
		struct ipv6_pinfo *np = inet6_sk(sk);

		/* IPV6_MULTICAST_LOOP */
		np->mc_loop = loop ? 1 : 0;
	}
#endif
	release_sock(sk);
}

/*
 *      Specify TTL for outgoing multicasts on a sync socket
 */
static void set_mcast_ttl(struct sock *sk, u_char ttl)
{
	struct inet_sock *inet = inet_sk(sk);

	/* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
	lock_sock(sk);
	inet->mc_ttl = ttl;
#ifdef CONFIG_IP_VS_IPV6
	if (sk->sk_family == AF_INET6) {
		struct ipv6_pinfo *np = inet6_sk(sk);

		/* IPV6_MULTICAST_HOPS */
		np->mcast_hops = ttl;
	}
#endif
	release_sock(sk);
}

/* Control fragmentation of messages */
static void set_mcast_pmtudisc(struct sock *sk, int val)
{
	struct inet_sock *inet = inet_sk(sk);

	/* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */
	lock_sock(sk);
	inet->pmtudisc = val;
#ifdef CONFIG_IP_VS_IPV6
	if (sk->sk_family == AF_INET6) {
		struct ipv6_pinfo *np = inet6_sk(sk);

		/* IPV6_MTU_DISCOVER */
		np->pmtudisc = val;
	}
#endif
	release_sock(sk);
}

/*
 *      Specify default interface for outgoing multicasts
 */
static int set_mcast_if(struct sock *sk, struct net_device *dev)
{
	struct inet_sock *inet = inet_sk(sk);

	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
		return -EINVAL;

	lock_sock(sk);
	inet->mc_index = dev->ifindex;
	/*  inet->mc_addr  = 0; */
#ifdef CONFIG_IP_VS_IPV6
	if (sk->sk_family == AF_INET6) {
		struct ipv6_pinfo *np = inet6_sk(sk);

		/* IPV6_MULTICAST_IF */
		np->mcast_oif = dev->ifindex;
	}
#endif
	release_sock(sk);

	return 0;
}

/*
 *      Join a multicast group.
 *      The group is specified by a class D multicast address 224.0.0.0/8
 *      in the in_addr structure passed in as a parameter.
 */
static int
join_mcast_group(struct sock *sk, struct in_addr *addr, struct net_device *dev)
{
	struct ip_mreqn mreq;
	int ret;

	memset(&mreq, 0, sizeof(mreq));
	memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));

	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
		return -EINVAL;

	mreq.imr_ifindex = dev->ifindex;

	lock_sock(sk);
	ret = ip_mc_join_group(sk, &mreq);
	release_sock(sk);

	return ret;
}

#ifdef CONFIG_IP_VS_IPV6
static int join_mcast_group6(struct sock *sk, struct in6_addr *addr,
			     struct net_device *dev)
{
	int ret;

	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
		return -EINVAL;

	lock_sock(sk);
	ret = ipv6_sock_mc_join(sk, dev->ifindex, addr);
	release_sock(sk);

	return ret;
}
#endif

static int bind_mcastif_addr(struct socket *sock, struct net_device *dev)
{
	__be32 addr;
	struct sockaddr_in sin;

	addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	if (!addr)
		pr_err("You probably need to specify IP address on "
		       "multicast interface.\n");

	IP_VS_DBG(7, "binding socket with (%s) %pI4\n",
		  dev->name, &addr);

	/* Now bind the socket with the address of multicast interface */
	sin.sin_family	    = AF_INET;
	sin.sin_addr.s_addr = addr;
	sin.sin_port        = 0;

	return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
}

static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen,
			       struct ipvs_sync_daemon_cfg *c, int id)
{
	if (AF_INET6 == c->mcast_af) {
		sa->in6 = (struct sockaddr_in6) {
			.sin6_family = AF_INET6,
			.sin6_port = htons(c->mcast_port + id),
		};
		sa->in6.sin6_addr = c->mcast_group.in6;
		*salen = sizeof(sa->in6);
	} else {
		sa->in = (struct sockaddr_in) {
			.sin_family = AF_INET,
			.sin_port = htons(c->mcast_port + id),
		};
		sa->in.sin_addr = c->mcast_group.in;
		*salen = sizeof(sa->in);
	}
}
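
/*
 *      Set up sending multicast socket over UDP
 */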
static int make_send_sock(struct netns_ipvs *ipvs, int id,
			  struct net_device *dev, struct socket **sock_ret)
{
	/* multicast addr */
	union ipvs_sockaddr mcast_addr;
	struct socket *sock;
	int result, salen;

	/* First create a socket */
	result = sock_create_kern(ipvs->net, ipvs->mcfg.mcast_af, SOCK_DGRAM,
				  IPPROTO_UDP, &sock);
	if (result < 0) {
		pr_err("Error during creation of socket; terminating\n");
		goto error;
	}
	*sock_ret = sock;
	result = set_mcast_if(sock->sk, dev);
	if (result < 0) {
		pr_err("Error setting outbound mcast interface\n");
		goto error;
	}

	set_mcast_loop(sock->sk, 0);
	set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl);
	/* Allow fragmentation if MTU changes */
	set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT);
	result = sysctl_sync_sock_size(ipvs);
	if (result > 0)
		set_sock_size(sock->sk, 1, result);

	if (AF_INET == ipvs->mcfg.mcast_af)
		result = bind_mcastif_addr(sock, dev);
	else
		result = 0;
	if (result < 0) {
		pr_err("Error binding address of the mcast interface\n");
		goto error;
	}

	get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id);
	result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
				    salen, 0);
	if (result < 0) {
		pr_err("Error connecting to the multicast addr\n");
		goto error;
	}

	return 0;

error:
	return result;
}
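
/*
 *      Set up receiving multicast socket over UDP
 */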
static int make_receive_sock(struct netns_ipvs *ipvs, int id,
			     struct net_device *dev, struct socket **sock_ret)
{
	/* multicast addr */
	union ipvs_sockaddr mcast_addr;
	struct socket *sock;
	int result, salen;

	/* First create a socket */
	result = sock_create_kern(ipvs->net, ipvs->bcfg.mcast_af, SOCK_DGRAM,
				  IPPROTO_UDP, &sock);
	if (result < 0) {
		pr_err("Error during creation of socket; terminating\n");
		goto error;
	}
	*sock_ret = sock;
	/* it is equivalent to the REUSEADDR option in user-space */
	sock->sk->sk_reuse = SK_CAN_REUSE;
	result = sysctl_sync_sock_size(ipvs);
	if (result > 0)
		set_sock_size(sock->sk, 0, result);

	get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id);
	sock->sk->sk_bound_dev_if = dev->ifindex;
	result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen);
	if (result < 0) {
		pr_err("Error binding to the multicast addr\n");
		goto error;
	}

	/* join the multicast group */
#ifdef CONFIG_IP_VS_IPV6
	if (ipvs->bcfg.mcast_af == AF_INET6)
		result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr,
					   dev);
	else
#endif
		result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr,
					  dev);
	if (result < 0) {
		pr_err("Error joining to the multicast group\n");
		goto error;
	}

	return 0;

error:
	return result;
}

static int
ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
{
	struct msghdr	msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
	struct kvec	iov;
	int		len;

	EnterFunction(7);
	iov.iov_base = (void *)buffer;
	iov.iov_len  = length;

	len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));

	LeaveFunction(7);
	return len;
}

static int
ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
{
	int msize;
	int ret;

	msize = ntohs(msg->size);

	ret = ip_vs_send_async(sock, (char *)msg, msize);
	if (ret >= 0 || ret == -EAGAIN)
		return ret;
	pr_err("ip_vs_send_async error %d\n", ret);
	return 0;
}

static int
ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
{
	struct msghdr		msg = {NULL,};
	struct kvec		iov = {buffer, buflen};
	int			len;

	EnterFunction(7);

	/* Receive a packet */
	iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, buflen);
	len = sock_recvmsg(sock, &msg, MSG_DONTWAIT);
	if (len < 0)
		return len;

	LeaveFunction(7);
	return len;
}

/* Wakeup the master thread for sending */
static void master_wakeup_work_handler(struct work_struct *work)
{
	struct ipvs_master_sync_state *ms =
		container_of(work, struct ipvs_master_sync_state,
			     master_wakeup_work.work);
	struct netns_ipvs *ipvs = ms->ipvs;

	spin_lock_bh(&ipvs->sync_lock);
	if (ms->sync_queue_len &&
	    ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) {
		ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE;
		wake_up_process(ms->master_thread);
	}
	spin_unlock_bh(&ipvs->sync_lock);
}

/* Get next buffer to send */
static inline struct ip_vs_sync_buff *
next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
{
	struct ip_vs_sync_buff *sb;

	sb = sb_dequeue(ipvs, ms);
	if (sb)
		return sb;
	/* Do not delay entries in buffer for more than 2 seconds */
	return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME);
}

static int sync_thread_master(void *data)
{
	struct ip_vs_sync_thread_data *tinfo = data;
	struct netns_ipvs *ipvs = tinfo->ipvs;
	struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id];
	struct sock *sk = tinfo->sock->sk;
	struct ip_vs_sync_buff *sb;

	pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
		"syncid = %d, id = %d\n",
		ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id);

	for (;;) {
		sb = next_sync_buff(ipvs, ms);
		if (unlikely(kthread_should_stop()))
			break;
		if (!sb) {
			schedule_timeout(IPVS_SYNC_CHECK_PERIOD);
			continue;
		}
		while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) {
			/* (Ab)use interruptible sleep to avoid increasing
			 * the load avg.
			 */
			__wait_event_interruptible(*sk_sleep(sk),
						   sock_writeable(sk) ||
						   kthread_should_stop());
			if (unlikely(kthread_should_stop()))
				goto done;
		}
		ip_vs_sync_buff_release(sb);
	}

done:
	__set_current_state(TASK_RUNNING);
	if (sb)
		ip_vs_sync_buff_release(sb);

	/* clean up the sync_buff queue */
	while ((sb = sb_dequeue(ipvs, ms)))
		ip_vs_sync_buff_release(sb);
	__set_current_state(TASK_RUNNING);

	/* clean up the current sync_buff */
	sb = get_curr_sync_buff(ipvs, ms, 0);
	if (sb)
		ip_vs_sync_buff_release(sb);

	/* release the sending multicast socket */
	sock_release(tinfo->sock);
	kfree(tinfo);

	return 0;
}

static int sync_thread_backup(void *data)
{
	struct ip_vs_sync_thread_data *tinfo = data;
	struct netns_ipvs *ipvs = tinfo->ipvs;
	int len;

	pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
		"syncid = %d, id = %d\n",
		ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id);

	while (!kthread_should_stop()) {
		wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
			 !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
			 || kthread_should_stop());

		/* do we have data now? */
		while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
			len = ip_vs_receive(tinfo->sock, tinfo->buf,
					    ipvs->bcfg.sync_maxlen);
			if (len <= 0) {
				if (len != -EAGAIN)
					pr_err("receiving message error\n");
				break;
			}

			ip_vs_process_message(ipvs, tinfo->buf, len);
		}
	}

	/* release the receiving multicast socket */
	sock_release(tinfo->sock);
	kfree(tinfo->buf);
	kfree(tinfo);

	return 0;
}
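
/*
 *      Start the sync daemon threads for the given state
 *      (IP_VS_STATE_MASTER or IP_VS_STATE_BACKUP).
 */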
int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
		      int state)
{
	struct ip_vs_sync_thread_data *tinfo = NULL;
	struct task_struct **array = NULL, *task;
	struct net_device *dev;
	char *name;
	int (*threadfn)(void *data);
	int id = 0, count, hlen;
	int result = -ENOMEM;
	u16 mtu, min_mtu;

	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n",
		  sizeof(struct ip_vs_sync_conn_v0));

	/* Do not hold one mutex and then to block on another */
	for (;;) {
		rtnl_lock();
		if (mutex_trylock(&ipvs->sync_mutex))
			break;
		rtnl_unlock();
		mutex_lock(&ipvs->sync_mutex);
		if (rtnl_trylock())
			break;
		mutex_unlock(&ipvs->sync_mutex);
	}

	if (!ipvs->sync_state) {
		count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX);
		ipvs->threads_mask = count - 1;
	} else
		count = ipvs->threads_mask + 1;

	if (c->mcast_af == AF_UNSPEC) {
		c->mcast_af = AF_INET;
		c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP);
	}
	if (!c->mcast_port)
		c->mcast_port = IP_VS_SYNC_PORT;
	if (!c->mcast_ttl)
		c->mcast_ttl = 1;

	dev = __dev_get_by_name(ipvs->net, c->mcast_ifn);
	if (!dev) {
		pr_err("Unknown mcast interface: %s\n", c->mcast_ifn);
		result = -ENODEV;
		goto out_early;
	}
	hlen = (AF_INET6 == c->mcast_af) ?
	       sizeof(struct ipv6hdr) + sizeof(struct udphdr) :
	       sizeof(struct iphdr) + sizeof(struct udphdr);
	mtu = (state == IP_VS_STATE_BACKUP) ?
		  clamp(dev->mtu, 1500U, 65535U) : 1500U;
	min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1;

	if (c->sync_maxlen)
		c->sync_maxlen = clamp_t(unsigned int,
					 c->sync_maxlen, min_mtu,
					 65535 - hlen);
	else
		c->sync_maxlen = mtu - hlen;

	if (state == IP_VS_STATE_MASTER) {
		result = -EEXIST;
		if (ipvs->ms)
			goto out_early;

		ipvs->mcfg = *c;
		name = "ipvs-m:%d:%d";
		threadfn = sync_thread_master;
	} else if (state == IP_VS_STATE_BACKUP) {
		result = -EEXIST;
		if (ipvs->backup_threads)
			goto out_early;

		ipvs->bcfg = *c;
		name = "ipvs-b:%d:%d";
		threadfn = sync_thread_backup;
	} else {
		result = -EINVAL;
		goto out_early;
	}

	if (state == IP_VS_STATE_MASTER) {
		struct ipvs_master_sync_state *ms;

		result = -ENOMEM;
		ipvs->ms = kcalloc(count, sizeof(ipvs->ms[0]), GFP_KERNEL);
		if (!ipvs->ms)
			goto out;
		ms = ipvs->ms;
		for (id = 0; id < count; id++, ms++) {
			INIT_LIST_HEAD(&ms->sync_queue);
			ms->sync_queue_len = 0;
			ms->sync_queue_delay = 0;
			INIT_DELAYED_WORK(&ms->master_wakeup_work,
					  master_wakeup_work_handler);
			ms->ipvs = ipvs;
		}
	} else {
		array = kcalloc(count, sizeof(struct task_struct *),
				GFP_KERNEL);
		result = -ENOMEM;
		if (!array)
			goto out;
	}

	for (id = 0; id < count; id++) {
		result = -ENOMEM;
		tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
		if (!tinfo)
			goto out;
		tinfo->ipvs = ipvs;
		tinfo->sock = NULL;
		if (state == IP_VS_STATE_BACKUP) {
			tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen,
					     GFP_KERNEL);
			if (!tinfo->buf)
				goto out;
		} else {
			tinfo->buf = NULL;
		}
		tinfo->id = id;
		if (state == IP_VS_STATE_MASTER)
			result = make_send_sock(ipvs, id, dev, &tinfo->sock);
		else
			result = make_receive_sock(ipvs, id, dev, &tinfo->sock);
		if (result < 0)
			goto out;

		task = kthread_run(threadfn, tinfo, name, ipvs->gen, id);
		if (IS_ERR(task)) {
			result = PTR_ERR(task);
			goto out;
		}
		tinfo = NULL;
		if (state == IP_VS_STATE_MASTER)
			ipvs->ms[id].master_thread = task;
		else
			array[id] = task;
	}

	/* mark as active */

	if (state == IP_VS_STATE_BACKUP)
		ipvs->backup_threads = array;
	spin_lock_bh(&ipvs->sync_buff_lock);
	ipvs->sync_state |= state;
	spin_unlock_bh(&ipvs->sync_buff_lock);

	mutex_unlock(&ipvs->sync_mutex);
	rtnl_unlock();

	/* increase the module use count */
	ip_vs_use_count_inc();

	return 0;

out:
	/* We do not need RTNL lock anymore, release it here so that
	 * sock_release below and in the kthreads can use rtnl_lock
	 * to leave the mcast group.
	 */
	rtnl_unlock();
	count = id;
	while (count-- > 0) {
		if (state == IP_VS_STATE_MASTER)
			kthread_stop(ipvs->ms[count].master_thread);
		else
			kthread_stop(array[count]);
	}
	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
		kfree(ipvs->ms);
		ipvs->ms = NULL;
	}
	mutex_unlock(&ipvs->sync_mutex);
	if (tinfo) {
		if (tinfo->sock)
			sock_release(tinfo->sock);
		kfree(tinfo->buf);
		kfree(tinfo);
	}
	kfree(array);
	return result;

out_early:
	mutex_unlock(&ipvs->sync_mutex);
	rtnl_unlock();
	return result;
}
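
/*
 *      Stop the sync threads for the given state and wait for them
 *      to complete.
 */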
int stop_sync_thread(struct netns_ipvs *ipvs, int state)
{
	struct task_struct **array;
	int id;
	int retc = -EINVAL;

	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));

	if (state == IP_VS_STATE_MASTER) {
		if (!ipvs->ms)
			return -ESRCH;

		/*
		 * The lock synchronizes with sb_queue_tail(), so that we don't
		 * add sync buffers to the queue, when we are already in
		 * progress of stopping the master sync daemon.
		 */
		spin_lock_bh(&ipvs->sync_buff_lock);
		spin_lock(&ipvs->sync_lock);
		ipvs->sync_state &= ~IP_VS_STATE_MASTER;
		spin_unlock(&ipvs->sync_lock);
		spin_unlock_bh(&ipvs->sync_buff_lock);

		retc = 0;
		for (id = ipvs->threads_mask; id >= 0; id--) {
			struct ipvs_master_sync_state *ms = &ipvs->ms[id];
			int ret;

			pr_info("stopping master sync thread %d ...\n",
				task_pid_nr(ms->master_thread));
			cancel_delayed_work_sync(&ms->master_wakeup_work);
			ret = kthread_stop(ms->master_thread);
			if (retc >= 0)
				retc = ret;
		}
		kfree(ipvs->ms);
		ipvs->ms = NULL;
	} else if (state == IP_VS_STATE_BACKUP) {
		if (!ipvs->backup_threads)
			return -ESRCH;

		ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
		array = ipvs->backup_threads;
		retc = 0;
		for (id = ipvs->threads_mask; id >= 0; id--) {
			int ret;

			pr_info("stopping backup sync thread %d ...\n",
				task_pid_nr(array[id]));
			ret = kthread_stop(array[id]);
			if (retc >= 0)
				retc = ret;
		}
		kfree(array);
		ipvs->backup_threads = NULL;
	}

	/* decrease the module use count */
	ip_vs_use_count_dec();

	return retc;
}

/*
 * Initialize data struct for each netns
 */
int __net_init ip_vs_sync_net_init(struct netns_ipvs *ipvs)
{
	__mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key);
	spin_lock_init(&ipvs->sync_lock);
	spin_lock_init(&ipvs->sync_buff_lock);
	return 0;
}

void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs)
{
	int retc;

	mutex_lock(&ipvs->sync_mutex);
	retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER);
	if (retc && retc != -ESRCH)
		pr_err("Failed to stop Master Daemon\n");

	retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP);
	if (retc && retc != -ESRCH)
		pr_err("Failed to stop Backup Daemon\n");
	mutex_unlock(&ipvs->sync_mutex);
}