1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/*
 * Log prefix for this file: pr_fmt() prepends KMSG_COMPONENT and ": "
 * to the format string, so messages start with "IPVS: ".
 */
#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
37
38#include <linux/module.h>
39#include <linux/slab.h>
40#include <linux/inetdevice.h>
41#include <linux/net.h>
42#include <linux/completion.h>
43#include <linux/delay.h>
44#include <linux/skbuff.h>
45#include <linux/in.h>
46#include <linux/igmp.h>
47#include <linux/udp.h>
48#include <linux/err.h>
49#include <linux/kthread.h>
50#include <linux/wait.h>
51#include <linux/kernel.h>
52#include <linux/sched/signal.h>
53
54#include <asm/unaligned.h>
55
56#include <net/ip.h>
57#include <net/sock.h>
58
59#include <net/ip_vs.h>
60
/*
 * 0xe0000051 is 224.0.0.81 in dotted-quad form.
 * NOTE(review): presumably the default multicast group and UDP port for
 * sync traffic; the users of these two constants are not in this part of
 * the file - confirm.
 */
#define IP_VS_SYNC_GROUP 0xe0000051
#define IP_VS_SYNC_PORT 8848

/* Value written to ip_vs_sync_mesg.version and expected on receive. */
#define SYNC_PROTO_VER 1

/*
 * NOTE(review): looks like a lockdep class key for the sync sockets; it is
 * not referenced in this part of the file - confirm.
 */
static struct lock_class_key __ipvs_sync_key;
67
68
69
70
/*
 * One connection entry in a version 0 sync message (IPv4 only).
 * Ports and addresses are copied verbatim from the connection; flags and
 * state are stored in network byte order.
 */
struct ip_vs_sync_conn_v0 {
	__u8 reserved;		/* written as 0 by the sender */

	/* Protocol, addresses and port numbers */
	__u8 protocol;
	__be16 cport;
	__be16 vport;
	__be16 dport;
	__be32 caddr;		/* client address */
	__be32 vaddr;		/* virtual address */
	__be32 daddr;		/* destination address */

	/* Flags and state transition */
	__be16 flags;		/* connection flags, IP_VS_CONN_F_HASHED cleared */
	__be16 state;		/* connection state */

	/*
	 * A struct ip_vs_sync_conn_options follows immediately when
	 * IP_VS_CONN_F_SEQ_MASK is set in flags.
	 */
};
89
/*
 * Sequence-number adjustment data carried with a connection entry when
 * IP_VS_CONN_F_SEQ_MASK is set: the connection's in_seq and out_seq.
 */
struct ip_vs_sync_conn_options {
	struct ip_vs_seq in_seq;
	struct ip_vs_seq out_seq;
};
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
/*
 * One IPv4 connection entry in a version 1 sync message.
 * Multi-byte fields are in network byte order.
 */
struct ip_vs_sync_v4 {
	__u8 type;		/* 0 for IPv4; STYPE_F_INET6 marks an IPv6 entry */
	__u8 protocol;
	__be16 ver_size;	/* low 12 bits: entry size, top 4 bits: version */

	__be32 flags;		/* connection flags, IP_VS_CONN_F_HASHED cleared */
	__be16 state;		/* connection state */

	__be16 cport;
	__be16 vport;
	__be16 dport;
	__be32 fwmark;		/* connection firewall mark */
	__be32 timeout;		/* connection timeout in seconds */
	__be32 caddr;
	__be32 vaddr;
	__be32 daddr;

	/* Optional type/length/value parameters follow this header. */
};
152
153
154
/*
 * One IPv6 connection entry in a version 1 sync message. Identical to
 * struct ip_vs_sync_v4 up to and including timeout; only the address
 * fields differ in size.
 */
struct ip_vs_sync_v6 {
	__u8 type;		/* has STYPE_F_INET6 set */
	__u8 protocol;
	__be16 ver_size;	/* low 12 bits: entry size, top 4 bits: version */

	__be32 flags;		/* connection flags, IP_VS_CONN_F_HASHED cleared */
	__be16 state;		/* connection state */

	__be16 cport;
	__be16 vport;
	__be16 dport;
	__be32 fwmark;		/* connection firewall mark */
	__be32 timeout;		/* connection timeout in seconds */
	struct in6_addr caddr;
	struct in6_addr vaddr;
	struct in6_addr daddr;

	/* Optional type/length/value parameters follow this header. */
};
174
/*
 * Overlay of the two version 1 entry layouts. Every field before the
 * addresses sits at the same offset in both members, so the common part
 * can be accessed through either one.
 */
union ip_vs_sync_conn {
	struct ip_vs_sync_v4 v4;
	struct ip_vs_sync_v6 v6;
};
179
180
/* Bit number and flag in the entry 'type' byte that mark an IPv6 entry */
#define STYPE_INET6 0
#define STYPE_F_INET6 (1 << STYPE_INET6)

/* ver_size layout: version in the bits above SVER_SHIFT, size in SVER_MASK */
#define SVER_SHIFT 12
#define SVER_MASK 0x0fff

/* Option type codes used in the per-entry type/length/value parameters */
#define IPVS_OPT_SEQ_DATA 1
#define IPVS_OPT_PE_DATA 2
#define IPVS_OPT_PE_NAME 3
#define IPVS_OPT_PARAM 7

/*
 * Bit flags derived from the option codes. The first three record which
 * options were already seen in an entry. IPVS_OPT_F_PARAM (0x40) is tested
 * on the received type byte: an unknown option type without this bit is
 * treated as mandatory and the entry is dropped.
 */
#define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1))
#define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1))
#define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1))
#define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1))
196
/*
 * Per-thread bookkeeping for a sync thread.
 * NOTE(review): only 'task' is used in this part of the file (it is woken
 * from sb_queue_tail()); the remaining field descriptions are inferred
 * from their names and types - confirm against the thread functions.
 */
struct ip_vs_sync_thread_data {
	struct task_struct *task;	/* the thread, target of wake_up_process() */
	struct netns_ipvs *ipvs;	/* presumably the owning netns state */
	struct socket *sock;		/* presumably the thread's sync socket */
	char *buf;			/* presumably a receive buffer */
	int id;				/* presumably the thread index */
};
204
205
/*
 * Size of a version 0 connection entry: without (SIMPLE) and with (FULL)
 * the trailing struct ip_vs_sync_conn_options.
 */
#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0))
#define FULL_CONN_SIZE \
(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
/* Header of a version 0 sync message. */
struct ip_vs_sync_mesg_v0 {
	__u8 nr_conns;		/* number of connection entries that follow */
	__u8 syncid;		/* sender's sync ID */
	__be16 size;		/* total message size incl. this header */

	/* struct ip_vs_sync_conn_v0 entries start here */
};
253
254
/*
 * Header of a version 1 sync message. The first four bytes overlay
 * struct ip_vs_sync_mesg_v0; 'reserved' occupies the v0 nr_conns byte and
 * is kept zero, which is one of the checks used to tell v1 from v0.
 */
struct ip_vs_sync_mesg {
	__u8 reserved;		/* must be zero */
	__u8 syncid;		/* sender's sync ID */
	__be16 size;		/* total message size incl. this header */
	__u8 nr_conns;		/* number of connection entries that follow */
	__s8 version;		/* SYNC_PROTO_VER */
	__u16 spare;		/* must be zero */

	/* union ip_vs_sync_conn entries start here */
};
264
/* Socket address storage large enough for either an IPv4 or IPv6 address. */
union ipvs_sockaddr {
	struct sockaddr_in in;
	struct sockaddr_in6 in6;
};
269
/* A sync message under construction or queued for sending. */
struct ip_vs_sync_buff {
	struct list_head list;		/* link in the master's sync_queue */
	unsigned long firstuse;		/* jiffies when the buffer was created */

	/* pointers for the message data */
	struct ip_vs_sync_mesg *mesg;	/* start of the allocated message */
	unsigned char *head;		/* where the next entry is written */
	unsigned char *end;		/* one past the end of the allocation */
};
279
280
281
282
283
284static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho)
285{
286 memset(ho, 0, sizeof(*ho));
287 ho->init_seq = get_unaligned_be32(&no->init_seq);
288 ho->delta = get_unaligned_be32(&no->delta);
289 ho->previous_delta = get_unaligned_be32(&no->previous_delta);
290}
291
292
293
294
295
296static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
297{
298 put_unaligned_be32(ho->init_seq, &no->init_seq);
299 put_unaligned_be32(ho->delta, &no->delta);
300 put_unaligned_be32(ho->previous_delta, &no->previous_delta);
301}
302
303static inline struct ip_vs_sync_buff *
304sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
305{
306 struct ip_vs_sync_buff *sb;
307
308 spin_lock_bh(&ipvs->sync_lock);
309 if (list_empty(&ms->sync_queue)) {
310 sb = NULL;
311 __set_current_state(TASK_INTERRUPTIBLE);
312 } else {
313 sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff,
314 list);
315 list_del(&sb->list);
316 ms->sync_queue_len--;
317 if (!ms->sync_queue_len)
318 ms->sync_queue_delay = 0;
319 }
320 spin_unlock_bh(&ipvs->sync_lock);
321
322 return sb;
323}
324
325
326
327
328static inline struct ip_vs_sync_buff *
329ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len)
330{
331 struct ip_vs_sync_buff *sb;
332
333 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
334 return NULL;
335
336 len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg),
337 ipvs->mcfg.sync_maxlen);
338 sb->mesg = kmalloc(len, GFP_ATOMIC);
339 if (!sb->mesg) {
340 kfree(sb);
341 return NULL;
342 }
343 sb->mesg->reserved = 0;
344 sb->mesg->version = SYNC_PROTO_VER;
345 sb->mesg->syncid = ipvs->mcfg.syncid;
346 sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg));
347 sb->mesg->nr_conns = 0;
348 sb->mesg->spare = 0;
349 sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
350 sb->end = (unsigned char *)sb->mesg + len;
351
352 sb->firstuse = jiffies;
353 return sb;
354}
355
356static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
357{
358 kfree(sb->mesg);
359 kfree(sb);
360}
361
362static inline void sb_queue_tail(struct netns_ipvs *ipvs,
363 struct ipvs_master_sync_state *ms)
364{
365 struct ip_vs_sync_buff *sb = ms->sync_buff;
366
367 spin_lock(&ipvs->sync_lock);
368 if (ipvs->sync_state & IP_VS_STATE_MASTER &&
369 ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) {
370 if (!ms->sync_queue_len)
371 schedule_delayed_work(&ms->master_wakeup_work,
372 max(IPVS_SYNC_SEND_DELAY, 1));
373 ms->sync_queue_len++;
374 list_add_tail(&sb->list, &ms->sync_queue);
375 if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) {
376 int id = (int)(ms - ipvs->ms);
377
378 wake_up_process(ipvs->master_tinfo[id].task);
379 }
380 } else
381 ip_vs_sync_buff_release(sb);
382 spin_unlock(&ipvs->sync_lock);
383}
384
385
386
387
388
389static inline struct ip_vs_sync_buff *
390get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms,
391 unsigned long time)
392{
393 struct ip_vs_sync_buff *sb;
394
395 spin_lock_bh(&ipvs->sync_buff_lock);
396 sb = ms->sync_buff;
397 if (sb && time_after_eq(jiffies - sb->firstuse, time)) {
398 ms->sync_buff = NULL;
399 __set_current_state(TASK_RUNNING);
400 } else
401 sb = NULL;
402 spin_unlock_bh(&ipvs->sync_buff_lock);
403 return sb;
404}
405
406static inline int
407select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp)
408{
409 return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask;
410}
411
412
413
414
415static inline struct ip_vs_sync_buff *
416ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len)
417{
418 struct ip_vs_sync_buff *sb;
419 struct ip_vs_sync_mesg_v0 *mesg;
420
421 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
422 return NULL;
423
424 len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0),
425 ipvs->mcfg.sync_maxlen);
426 sb->mesg = kmalloc(len, GFP_ATOMIC);
427 if (!sb->mesg) {
428 kfree(sb);
429 return NULL;
430 }
431 mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
432 mesg->nr_conns = 0;
433 mesg->syncid = ipvs->mcfg.syncid;
434 mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0));
435 sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0);
436 sb->end = (unsigned char *)mesg + len;
437 sb->firstuse = jiffies;
438 return sb;
439}
440
441
442static inline bool in_persistence(struct ip_vs_conn *cp)
443{
444 for (cp = cp->control; cp; cp = cp->control) {
445 if (cp->flags & IP_VS_CONN_F_TEMPLATE)
446 return true;
447 }
448 return false;
449}
450
451
452
453
454
455
456
457
458
459
/*
 * Decide whether connection @cp should be synced now.
 *
 * @pkts is the packet count the caller associates with the connection;
 * it is compared against the sync threshold/period sysctls.
 *
 * cp->sync_endtime holds an expiry estimate with the low two bits used as
 * a retry counter. When syncing is due, the new value is installed with
 * cmpxchg() so that concurrent callers agree on a single winner.
 *
 * Returns non-zero when the connection should be synced, 0 otherwise.
 * Also updates cp->old_state whenever the 'set' label is reached.
 */
static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs,
				  struct ip_vs_conn *cp, int pkts)
{
	unsigned long orig = READ_ONCE(cp->sync_endtime);
	unsigned long now = jiffies;
	/* Candidate end time; low two bits are cleared for the retry count */
	unsigned long n = (now + cp->timeout) & ~3UL;
	unsigned int sync_refresh_period;
	int sync_period;
	int force;

	/* Check if we sync in current state */
	if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE))
		force = 0;
	else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp)))
		return 0;
	else if (likely(cp->protocol == IPPROTO_TCP)) {
		/* Only these TCP states are synced at all */
		if (!((1 << cp->state) &
		      ((1 << IP_VS_TCP_S_ESTABLISHED) |
		       (1 << IP_VS_TCP_S_FIN_WAIT) |
		       (1 << IP_VS_TCP_S_CLOSE) |
		       (1 << IP_VS_TCP_S_CLOSE_WAIT) |
		       (1 << IP_VS_TCP_S_TIME_WAIT))))
			return 0;
		force = cp->state != cp->old_state;
		/* A change into a non-ESTABLISHED state skips the rate checks */
		if (force && cp->state != IP_VS_TCP_S_ESTABLISHED)
			goto set;
	} else if (unlikely(cp->protocol == IPPROTO_SCTP)) {
		/* Only these SCTP states are synced at all */
		if (!((1 << cp->state) &
		      ((1 << IP_VS_SCTP_S_ESTABLISHED) |
		       (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) |
		       (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) |
		       (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) |
		       (1 << IP_VS_SCTP_S_CLOSED))))
			return 0;
		force = cp->state != cp->old_state;
		if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED)
			goto set;
	} else {
		/* Any other protocol: no state-change based forcing */
		force = 0;
	}

	sync_refresh_period = sysctl_sync_refresh_period(ipvs);
	if (sync_refresh_period > 0) {
		long diff = n - orig;
		long min_diff = max(cp->timeout >> 1, 10UL * HZ);

		/* Avoid sync if difference is below sync_refresh_period
		 * and below the half timeout (but at least 10 seconds).
		 */
		if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) {
			/* Retry count lives in the low two bits of orig */
			int retries = orig & 3;

			if (retries >= sysctl_sync_retries(ipvs))
				return 0;
			if (time_before(now, orig - cp->timeout +
					(sync_refresh_period >> 3)))
				return 0;
			n |= retries + 1;
		}
	}
	sync_period = sysctl_sync_period(ipvs);
	if (sync_period > 0) {
		/* Non-templates sync only when pkts hits the threshold
		 * position within each period.
		 */
		if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) &&
		    pkts % sync_period != sysctl_sync_threshold(ipvs))
			return 0;
	} else if (!sync_refresh_period &&
		   pkts != sysctl_sync_threshold(ipvs))
		return 0;

set:
	cp->old_state = cp->state;
	/* Only the caller whose cmpxchg succeeds sees n == orig */
	n = cmpxchg(&cp->sync_endtime, orig, n);
	return n == orig || force;
}
535
536
537
538
539
/*
 * Append connection @cp to the current version 0 sync message.
 *
 * Only AF_INET connections without IP_VS_CONN_F_ONE_PACKET are handled,
 * and only when ip_vs_sync_conn_needed() agrees. The entry is written
 * under sync_buff_lock; a full buffer is queued for sending and a new one
 * is created. Afterwards the controlling connection, if any, is synced
 * through ip_vs_sync_conn().
 */
static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp,
			       int pkts)
{
	struct ip_vs_sync_mesg_v0 *m;
	struct ip_vs_sync_conn_v0 *s;
	struct ip_vs_sync_buff *buff;
	struct ipvs_master_sync_state *ms;
	int id;
	unsigned int len;

	/* The v0 entry layout only has room for IPv4 addresses */
	if (unlikely(cp->af != AF_INET))
		return;
	/* Do not sync ONE PACKET */
	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
		return;

	if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
		return;

	spin_lock_bh(&ipvs->sync_buff_lock);
	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
		spin_unlock_bh(&ipvs->sync_buff_lock);
		return;
	}

	id = select_master_thread_id(ipvs, cp);
	ms = &ipvs->ms[id];
	buff = ms->sync_buff;
	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
						    SIMPLE_CONN_SIZE;
	if (buff) {
		m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
		/* Queue the buffer if the entry does not fit, or if its
		 * first byte is zero: a v1 header keeps that byte zero,
		 * so such a buffer was built in the v1 layout.
		 */
		if (buff->head + len > buff->end || !m->nr_conns) {
			sb_queue_tail(ipvs, ms);
			ms->sync_buff = NULL;
			buff = NULL;
		}
	}
	if (!buff) {
		buff = ip_vs_sync_buff_create_v0(ipvs, len);
		if (!buff) {
			spin_unlock_bh(&ipvs->sync_buff_lock);
			pr_err("ip_vs_sync_buff_create failed.\n");
			return;
		}
		ms->sync_buff = buff;
	}

	m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
	s = (struct ip_vs_sync_conn_v0 *) buff->head;

	/* copy members */
	s->reserved = 0;
	s->protocol = cp->protocol;
	s->cport = cp->cport;
	s->vport = cp->vport;
	s->dport = cp->dport;
	s->caddr = cp->caddr.ip;
	s->vaddr = cp->vaddr.ip;
	s->daddr = cp->daddr.ip;
	s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
	s->state = htons(cp->state);
	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
		/* Options go right after the entry; the copy starts at
		 * cp->in_seq and spans sizeof(*opt) bytes.
		 */
		struct ip_vs_sync_conn_options *opt =
			(struct ip_vs_sync_conn_options *)&s[1];
		memcpy(opt, &cp->in_seq, sizeof(*opt));
	}

	m->nr_conns++;
	m->size = htons(ntohs(m->size) + len);
	buff->head += len;
	spin_unlock_bh(&ipvs->sync_buff_lock);

	/* synchronize its controller if it has */
	cp = cp->control;
	if (cp) {
		if (cp->flags & IP_VS_CONN_F_TEMPLATE)
			pkts = atomic_inc_return(&cp->in_pkts);
		else
			pkts = sysctl_sync_threshold(ipvs);
		ip_vs_sync_conn(ipvs, cp, pkts);
	}
}
624
625
626
627
628
629
630void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts)
631{
632 struct ip_vs_sync_mesg *m;
633 union ip_vs_sync_conn *s;
634 struct ip_vs_sync_buff *buff;
635 struct ipvs_master_sync_state *ms;
636 int id;
637 __u8 *p;
638 unsigned int len, pe_name_len, pad;
639
640
641 if (sysctl_sync_ver(ipvs) == 0) {
642 ip_vs_sync_conn_v0(ipvs, cp, pkts);
643 return;
644 }
645
646 if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
647 goto control;
648sloop:
649 if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
650 goto control;
651
652
653 pe_name_len = 0;
654 if (cp->pe_data_len) {
655 if (!cp->pe_data || !cp->dest) {
656 IP_VS_ERR_RL("SYNC, connection pe_data invalid\n");
657 return;
658 }
659 pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
660 }
661
662 spin_lock_bh(&ipvs->sync_buff_lock);
663 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
664 spin_unlock_bh(&ipvs->sync_buff_lock);
665 return;
666 }
667
668 id = select_master_thread_id(ipvs, cp);
669 ms = &ipvs->ms[id];
670
671#ifdef CONFIG_IP_VS_IPV6
672 if (cp->af == AF_INET6)
673 len = sizeof(struct ip_vs_sync_v6);
674 else
675#endif
676 len = sizeof(struct ip_vs_sync_v4);
677
678 if (cp->flags & IP_VS_CONN_F_SEQ_MASK)
679 len += sizeof(struct ip_vs_sync_conn_options) + 2;
680
681 if (cp->pe_data_len)
682 len += cp->pe_data_len + 2;
683 if (pe_name_len)
684 len += pe_name_len + 2;
685
686
687 pad = 0;
688 buff = ms->sync_buff;
689 if (buff) {
690 m = buff->mesg;
691 pad = (4 - (size_t) buff->head) & 3;
692
693 if (buff->head + len + pad > buff->end || m->reserved) {
694 sb_queue_tail(ipvs, ms);
695 ms->sync_buff = NULL;
696 buff = NULL;
697 pad = 0;
698 }
699 }
700
701 if (!buff) {
702 buff = ip_vs_sync_buff_create(ipvs, len);
703 if (!buff) {
704 spin_unlock_bh(&ipvs->sync_buff_lock);
705 pr_err("ip_vs_sync_buff_create failed.\n");
706 return;
707 }
708 ms->sync_buff = buff;
709 m = buff->mesg;
710 }
711
712 p = buff->head;
713 buff->head += pad + len;
714 m->size = htons(ntohs(m->size) + pad + len);
715
716 while (pad--)
717 *(p++) = 0;
718
719 s = (union ip_vs_sync_conn *)p;
720
721
722 s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0);
723 s->v4.ver_size = htons(len & SVER_MASK);
724 s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED);
725 s->v4.state = htons(cp->state);
726 s->v4.protocol = cp->protocol;
727 s->v4.cport = cp->cport;
728 s->v4.vport = cp->vport;
729 s->v4.dport = cp->dport;
730 s->v4.fwmark = htonl(cp->fwmark);
731 s->v4.timeout = htonl(cp->timeout / HZ);
732 m->nr_conns++;
733
734#ifdef CONFIG_IP_VS_IPV6
735 if (cp->af == AF_INET6) {
736 p += sizeof(struct ip_vs_sync_v6);
737 s->v6.caddr = cp->caddr.in6;
738 s->v6.vaddr = cp->vaddr.in6;
739 s->v6.daddr = cp->daddr.in6;
740 } else
741#endif
742 {
743 p += sizeof(struct ip_vs_sync_v4);
744 s->v4.caddr = cp->caddr.ip;
745 s->v4.vaddr = cp->vaddr.ip;
746 s->v4.daddr = cp->daddr.ip;
747 }
748 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
749 *(p++) = IPVS_OPT_SEQ_DATA;
750 *(p++) = sizeof(struct ip_vs_sync_conn_options);
751 hton_seq((struct ip_vs_seq *)p, &cp->in_seq);
752 p += sizeof(struct ip_vs_seq);
753 hton_seq((struct ip_vs_seq *)p, &cp->out_seq);
754 p += sizeof(struct ip_vs_seq);
755 }
756
757 if (cp->pe_data_len && cp->pe_data) {
758 *(p++) = IPVS_OPT_PE_DATA;
759 *(p++) = cp->pe_data_len;
760 memcpy(p, cp->pe_data, cp->pe_data_len);
761 p += cp->pe_data_len;
762 if (pe_name_len) {
763
764 *(p++) = IPVS_OPT_PE_NAME;
765 *(p++) = pe_name_len;
766 memcpy(p, cp->pe->name, pe_name_len);
767 p += pe_name_len;
768 }
769 }
770
771 spin_unlock_bh(&ipvs->sync_buff_lock);
772
773control:
774
775 cp = cp->control;
776 if (!cp)
777 return;
778 if (cp->flags & IP_VS_CONN_F_TEMPLATE)
779 pkts = atomic_inc_return(&cp->in_pkts);
780 else
781 pkts = sysctl_sync_threshold(ipvs);
782 goto sloop;
783}
784
785
786
787
788static inline int
789ip_vs_conn_fill_param_sync(struct netns_ipvs *ipvs, int af, union ip_vs_sync_conn *sc,
790 struct ip_vs_conn_param *p,
791 __u8 *pe_data, unsigned int pe_data_len,
792 __u8 *pe_name, unsigned int pe_name_len)
793{
794#ifdef CONFIG_IP_VS_IPV6
795 if (af == AF_INET6)
796 ip_vs_conn_fill_param(ipvs, af, sc->v6.protocol,
797 (const union nf_inet_addr *)&sc->v6.caddr,
798 sc->v6.cport,
799 (const union nf_inet_addr *)&sc->v6.vaddr,
800 sc->v6.vport, p);
801 else
802#endif
803 ip_vs_conn_fill_param(ipvs, af, sc->v4.protocol,
804 (const union nf_inet_addr *)&sc->v4.caddr,
805 sc->v4.cport,
806 (const union nf_inet_addr *)&sc->v4.vaddr,
807 sc->v4.vport, p);
808
809 if (pe_data_len) {
810 if (pe_name_len) {
811 char buff[IP_VS_PENAME_MAXLEN+1];
812
813 memcpy(buff, pe_name, pe_name_len);
814 buff[pe_name_len]=0;
815 p->pe = __ip_vs_pe_getbyname(buff);
816 if (!p->pe) {
817 IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n",
818 buff);
819 return 1;
820 }
821 } else {
822 IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n");
823 return 1;
824 }
825
826 p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC);
827 if (!p->pe_data) {
828 module_put(p->pe->module);
829 return -ENOMEM;
830 }
831 p->pe_data_len = pe_data_len;
832 }
833 return 0;
834}
835
836
837
838
839
840
841
/*
 * Apply one received connection entry on the backup: update the matching
 * connection or create a new one.
 *
 * @param:   lookup parameters; param->pe_data is freed here on every path
 *           except when a new template connection is created
 * @flags:   connection flags from the message
 * @state:   connection state from the message
 * @type:    passed as the address family arguments of ip_vs_find_dest()
 *           and ip_vs_conn_new(); callers pass AF_INET/AF_INET6
 * @timeout: timeout in seconds, 0 to derive it from the protocol table
 * @opt:     sequence options to copy into the connection, or NULL
 */
static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *param,
			    unsigned int flags, unsigned int state,
			    unsigned int protocol, unsigned int type,
			    const union nf_inet_addr *daddr, __be16 dport,
			    unsigned long timeout, __u32 fwmark,
			    struct ip_vs_sync_conn_options *opt)
{
	struct ip_vs_dest *dest;
	struct ip_vs_conn *cp;

	if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
		cp = ip_vs_conn_in_get(param);
		/* Found a connection that points to a different destination */
		if (cp && ((cp->dport != dport) ||
			   !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) {
			if (!(flags & IP_VS_CONN_F_INACTIVE)) {
				/* Active update for another destination:
				 * expire the old connection and create a
				 * new one below.
				 */
				ip_vs_conn_expire_now(cp);
				__ip_vs_conn_put(cp);
				cp = NULL;
			} else {
				/* Inactive update for a connection that
				 * now has another destination: drop the
				 * reference and ignore the message.
				 */
				__ip_vs_conn_put(cp);
				kfree(param->pe_data);
				return;
			}
		}
	} else {
		cp = ip_vs_ct_in_get(param);
	}

	if (cp) {
		/* Free pe_data, the existing connection does not take it */
		kfree(param->pe_data);

		dest = cp->dest;
		spin_lock_bh(&cp->lock);
		/* Move the connection between the destination's active and
		 * inactive counters when the INACTIVE flag changes.
		 */
		if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE &&
		    !(flags & IP_VS_CONN_F_TEMPLATE) && dest) {
			if (flags & IP_VS_CONN_F_INACTIVE) {
				atomic_dec(&dest->activeconns);
				atomic_inc(&dest->inactconns);
			} else {
				atomic_inc(&dest->activeconns);
				atomic_dec(&dest->inactconns);
			}
		}
		/* Take only the updatable flags from the message and keep
		 * the rest from the connection.
		 */
		flags &= IP_VS_CONN_F_BACKUP_UPD_MASK;
		flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK;
		cp->flags = flags;
		spin_unlock_bh(&cp->lock);
		if (!dest)
			ip_vs_try_bind_dest(cp);
	} else {
		/* No existing connection: look up the destination and
		 * create a new connection, both under rcu_read_lock().
		 */
		rcu_read_lock();
		/* dest may be NULL here; it is passed on to
		 * ip_vs_conn_new() without a check.
		 */
		dest = ip_vs_find_dest(ipvs, type, type, daddr, dport,
				       param->vaddr, param->vport, protocol,
				       fwmark, flags);

		cp = ip_vs_conn_new(param, type, daddr, dport, flags, dest,
				    fwmark);
		rcu_read_unlock();
		if (!cp) {
			kfree(param->pe_data);
			IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
			return;
		}
		/* pe_data is kept only for new template connections */
		if (!(flags & IP_VS_CONN_F_TEMPLATE))
			kfree(param->pe_data);
	}

	if (opt) {
		cp->in_seq = opt->in_seq;
		cp->out_seq = opt->out_seq;
	}
	atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs));
	cp->state = state;
	cp->old_state = cp->state;

	/* Use the timeout from the message when given (clamped so the
	 * conversion to jiffies stays within MAX_SCHEDULE_TIMEOUT),
	 * otherwise the protocol's per-state timeout, otherwise 3 minutes.
	 */
	if (timeout) {
		if (timeout > MAX_SCHEDULE_TIMEOUT / HZ)
			timeout = MAX_SCHEDULE_TIMEOUT / HZ;
		cp->timeout = timeout*HZ;
	} else {
		struct ip_vs_proto_data *pd;

		pd = ip_vs_proto_data_get(ipvs, protocol);
		if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table)
			cp->timeout = pd->timeout_table[state];
		else
			cp->timeout = (3*60*HZ);
	}
	ip_vs_conn_put(cp);
}
955
956
957
958
959static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer,
960 const size_t buflen)
961{
962 struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer;
963 struct ip_vs_sync_conn_v0 *s;
964 struct ip_vs_sync_conn_options *opt;
965 struct ip_vs_protocol *pp;
966 struct ip_vs_conn_param param;
967 char *p;
968 int i;
969
970 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0);
971 for (i=0; i<m->nr_conns; i++) {
972 unsigned int flags, state;
973
974 if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
975 IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
976 return;
977 }
978 s = (struct ip_vs_sync_conn_v0 *) p;
979 flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
980 flags &= ~IP_VS_CONN_F_HASHED;
981 if (flags & IP_VS_CONN_F_SEQ_MASK) {
982 opt = (struct ip_vs_sync_conn_options *)&s[1];
983 p += FULL_CONN_SIZE;
984 if (p > buffer+buflen) {
985 IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n");
986 return;
987 }
988 } else {
989 opt = NULL;
990 p += SIMPLE_CONN_SIZE;
991 }
992
993 state = ntohs(s->state);
994 if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
995 pp = ip_vs_proto_get(s->protocol);
996 if (!pp) {
997 IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n",
998 s->protocol);
999 continue;
1000 }
1001 if (state >= pp->num_states) {
1002 IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n",
1003 pp->name, state);
1004 continue;
1005 }
1006 } else {
1007 if (state >= IP_VS_CTPL_S_LAST)
1008 IP_VS_DBG(7, "BACKUP v0, Invalid tpl state %u\n",
1009 state);
1010 }
1011
1012 ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol,
1013 (const union nf_inet_addr *)&s->caddr,
1014 s->cport,
1015 (const union nf_inet_addr *)&s->vaddr,
1016 s->vport, ¶m);
1017
1018
1019 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->protocol, AF_INET,
1020 (union nf_inet_addr *)&s->daddr, s->dport,
1021 0, 0, opt);
1022 }
1023}
1024
1025
1026
1027
1028static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen,
1029 __u32 *opt_flags,
1030 struct ip_vs_sync_conn_options *opt)
1031{
1032 struct ip_vs_sync_conn_options *topt;
1033
1034 topt = (struct ip_vs_sync_conn_options *)p;
1035
1036 if (plen != sizeof(struct ip_vs_sync_conn_options)) {
1037 IP_VS_DBG(2, "BACKUP, bogus conn options length\n");
1038 return -EINVAL;
1039 }
1040 if (*opt_flags & IPVS_OPT_F_SEQ_DATA) {
1041 IP_VS_DBG(2, "BACKUP, conn options found twice\n");
1042 return -EINVAL;
1043 }
1044 ntoh_seq(&topt->in_seq, &opt->in_seq);
1045 ntoh_seq(&topt->out_seq, &opt->out_seq);
1046 *opt_flags |= IPVS_OPT_F_SEQ_DATA;
1047 return 0;
1048}
1049
1050static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len,
1051 __u8 **data, unsigned int maxlen,
1052 __u32 *opt_flags, __u32 flag)
1053{
1054 if (plen > maxlen) {
1055 IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen);
1056 return -EINVAL;
1057 }
1058 if (*opt_flags & flag) {
1059 IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag);
1060 return -EINVAL;
1061 }
1062 *data_len = plen;
1063 *data = p;
1064 *opt_flags |= flag;
1065 return 0;
1066}
1067
1068
1069
1070static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *msg_end)
1071{
1072 struct ip_vs_sync_conn_options opt;
1073 union ip_vs_sync_conn *s;
1074 struct ip_vs_protocol *pp;
1075 struct ip_vs_conn_param param;
1076 __u32 flags;
1077 unsigned int af, state, pe_data_len=0, pe_name_len=0;
1078 __u8 *pe_data=NULL, *pe_name=NULL;
1079 __u32 opt_flags=0;
1080 int retc=0;
1081
1082 s = (union ip_vs_sync_conn *) p;
1083
1084 if (s->v6.type & STYPE_F_INET6) {
1085#ifdef CONFIG_IP_VS_IPV6
1086 af = AF_INET6;
1087 p += sizeof(struct ip_vs_sync_v6);
1088#else
1089 IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n");
1090 retc = 10;
1091 goto out;
1092#endif
1093 } else if (!s->v4.type) {
1094 af = AF_INET;
1095 p += sizeof(struct ip_vs_sync_v4);
1096 } else {
1097 return -10;
1098 }
1099 if (p > msg_end)
1100 return -20;
1101
1102
1103 while (p < msg_end) {
1104 int ptype;
1105 int plen;
1106
1107 if (p+2 > msg_end)
1108 return -30;
1109 ptype = *(p++);
1110 plen = *(p++);
1111
1112 if (!plen || ((p + plen) > msg_end))
1113 return -40;
1114
1115 switch (ptype & ~IPVS_OPT_F_PARAM) {
1116 case IPVS_OPT_SEQ_DATA:
1117 if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt))
1118 return -50;
1119 break;
1120
1121 case IPVS_OPT_PE_DATA:
1122 if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data,
1123 IP_VS_PEDATA_MAXLEN, &opt_flags,
1124 IPVS_OPT_F_PE_DATA))
1125 return -60;
1126 break;
1127
1128 case IPVS_OPT_PE_NAME:
1129 if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name,
1130 IP_VS_PENAME_MAXLEN, &opt_flags,
1131 IPVS_OPT_F_PE_NAME))
1132 return -70;
1133 break;
1134
1135 default:
1136
1137 if (!(ptype & IPVS_OPT_F_PARAM)) {
1138 IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n",
1139 ptype & ~IPVS_OPT_F_PARAM);
1140 retc = 20;
1141 goto out;
1142 }
1143 }
1144 p += plen;
1145 }
1146
1147
1148 flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK;
1149 flags |= IP_VS_CONN_F_SYNC;
1150 state = ntohs(s->v4.state);
1151
1152 if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
1153 pp = ip_vs_proto_get(s->v4.protocol);
1154 if (!pp) {
1155 IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n",
1156 s->v4.protocol);
1157 retc = 30;
1158 goto out;
1159 }
1160 if (state >= pp->num_states) {
1161 IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n",
1162 pp->name, state);
1163 retc = 40;
1164 goto out;
1165 }
1166 } else {
1167 if (state >= IP_VS_CTPL_S_LAST)
1168 IP_VS_DBG(7, "BACKUP, Invalid tpl state %u\n",
1169 state);
1170 }
1171 if (ip_vs_conn_fill_param_sync(ipvs, af, s, ¶m, pe_data,
1172 pe_data_len, pe_name, pe_name_len)) {
1173 retc = 50;
1174 goto out;
1175 }
1176
1177 if (af == AF_INET)
1178 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v4.protocol, af,
1179 (union nf_inet_addr *)&s->v4.daddr, s->v4.dport,
1180 ntohl(s->v4.timeout), ntohl(s->v4.fwmark),
1181 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
1182 );
1183#ifdef CONFIG_IP_VS_IPV6
1184 else
1185 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v6.protocol, af,
1186 (union nf_inet_addr *)&s->v6.daddr, s->v6.dport,
1187 ntohl(s->v6.timeout), ntohl(s->v6.fwmark),
1188 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
1189 );
1190#endif
1191 ip_vs_pe_put(param.pe);
1192 return 0;
1193
1194out:
1195 IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc);
1196 return retc;
1197
1198}
1199
1200
1201
1202
1203
1204static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer,
1205 const size_t buflen)
1206{
1207 struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
1208 __u8 *p, *msg_end;
1209 int i, nr_conns;
1210
1211 if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) {
1212 IP_VS_DBG(2, "BACKUP, message header too short\n");
1213 return;
1214 }
1215
1216 if (buflen != ntohs(m2->size)) {
1217 IP_VS_DBG(2, "BACKUP, bogus message size\n");
1218 return;
1219 }
1220
1221 if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) {
1222 IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
1223 return;
1224 }
1225
1226 if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0)
1227 && (m2->spare == 0)) {
1228
1229 msg_end = buffer + sizeof(struct ip_vs_sync_mesg);
1230 nr_conns = m2->nr_conns;
1231
1232 for (i=0; i<nr_conns; i++) {
1233 union ip_vs_sync_conn *s;
1234 unsigned int size;
1235 int retc;
1236
1237 p = msg_end;
1238 if (p + sizeof(s->v4) > buffer+buflen) {
1239 IP_VS_ERR_RL("BACKUP, Dropping buffer, too small\n");
1240 return;
1241 }
1242 s = (union ip_vs_sync_conn *)p;
1243 size = ntohs(s->v4.ver_size) & SVER_MASK;
1244 msg_end = p + size;
1245
1246 if (msg_end > buffer+buflen) {
1247 IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");
1248 return;
1249 }
1250 if (ntohs(s->v4.ver_size) >> SVER_SHIFT) {
1251 IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n",
1252 ntohs(s->v4.ver_size) >> SVER_SHIFT);
1253 return;
1254 }
1255
1256 retc = ip_vs_proc_sync_conn(ipvs, p, msg_end);
1257 if (retc < 0) {
1258 IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
1259 retc);
1260 return;
1261 }
1262
1263 msg_end = p + ((size + 3) & ~3);
1264 }
1265 } else {
1266
1267 ip_vs_process_message_v0(ipvs, buffer, buflen);
1268 return;
1269 }
1270}
1271
1272
1273
1274
1275
1276static void set_sock_size(struct sock *sk, int mode, int val)
1277{
1278
1279
1280 lock_sock(sk);
1281 if (mode) {
1282 val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2,
1283 sysctl_wmem_max);
1284 sk->sk_sndbuf = val * 2;
1285 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1286 } else {
1287 val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2,
1288 sysctl_rmem_max);
1289 sk->sk_rcvbuf = val * 2;
1290 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
1291 }
1292 release_sock(sk);
1293}
1294
1295
1296
1297
1298static void set_mcast_loop(struct sock *sk, u_char loop)
1299{
1300 struct inet_sock *inet = inet_sk(sk);
1301
1302
1303 lock_sock(sk);
1304 inet->mc_loop = loop ? 1 : 0;
1305#ifdef CONFIG_IP_VS_IPV6
1306 if (sk->sk_family == AF_INET6) {
1307 struct ipv6_pinfo *np = inet6_sk(sk);
1308
1309
1310 np->mc_loop = loop ? 1 : 0;
1311 }
1312#endif
1313 release_sock(sk);
1314}
1315
1316
1317
1318
1319static void set_mcast_ttl(struct sock *sk, u_char ttl)
1320{
1321 struct inet_sock *inet = inet_sk(sk);
1322
1323
1324 lock_sock(sk);
1325 inet->mc_ttl = ttl;
1326#ifdef CONFIG_IP_VS_IPV6
1327 if (sk->sk_family == AF_INET6) {
1328 struct ipv6_pinfo *np = inet6_sk(sk);
1329
1330
1331 np->mcast_hops = ttl;
1332 }
1333#endif
1334 release_sock(sk);
1335}
1336
1337
1338static void set_mcast_pmtudisc(struct sock *sk, int val)
1339{
1340 struct inet_sock *inet = inet_sk(sk);
1341
1342
1343 lock_sock(sk);
1344 inet->pmtudisc = val;
1345#ifdef CONFIG_IP_VS_IPV6
1346 if (sk->sk_family == AF_INET6) {
1347 struct ipv6_pinfo *np = inet6_sk(sk);
1348
1349
1350 np->pmtudisc = val;
1351 }
1352#endif
1353 release_sock(sk);
1354}
1355
1356
1357
1358
1359static int set_mcast_if(struct sock *sk, struct net_device *dev)
1360{
1361 struct inet_sock *inet = inet_sk(sk);
1362
1363 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
1364 return -EINVAL;
1365
1366 lock_sock(sk);
1367 inet->mc_index = dev->ifindex;
1368
1369#ifdef CONFIG_IP_VS_IPV6
1370 if (sk->sk_family == AF_INET6) {
1371 struct ipv6_pinfo *np = inet6_sk(sk);
1372
1373
1374 np->mcast_oif = dev->ifindex;
1375 }
1376#endif
1377 release_sock(sk);
1378
1379 return 0;
1380}
1381
1382
1383
1384
1385
1386
1387
1388static int
1389join_mcast_group(struct sock *sk, struct in_addr *addr, struct net_device *dev)
1390{
1391 struct ip_mreqn mreq;
1392 int ret;
1393
1394 memset(&mreq, 0, sizeof(mreq));
1395 memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
1396
1397 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
1398 return -EINVAL;
1399
1400 mreq.imr_ifindex = dev->ifindex;
1401
1402 lock_sock(sk);
1403 ret = ip_mc_join_group(sk, &mreq);
1404 release_sock(sk);
1405
1406 return ret;
1407}
1408
1409#ifdef CONFIG_IP_VS_IPV6
1410static int join_mcast_group6(struct sock *sk, struct in6_addr *addr,
1411 struct net_device *dev)
1412{
1413 int ret;
1414
1415 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
1416 return -EINVAL;
1417
1418 lock_sock(sk);
1419 ret = ipv6_sock_mc_join(sk, dev->ifindex, addr);
1420 release_sock(sk);
1421
1422 return ret;
1423}
1424#endif
1425
1426static int bind_mcastif_addr(struct socket *sock, struct net_device *dev)
1427{
1428 __be32 addr;
1429 struct sockaddr_in sin;
1430
1431 addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1432 if (!addr)
1433 pr_err("You probably need to specify IP address on "
1434 "multicast interface.\n");
1435
1436 IP_VS_DBG(7, "binding socket with (%s) %pI4\n",
1437 dev->name, &addr);
1438
1439
1440 sin.sin_family = AF_INET;
1441 sin.sin_addr.s_addr = addr;
1442 sin.sin_port = 0;
1443
1444 return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
1445}
1446
1447static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen,
1448 struct ipvs_sync_daemon_cfg *c, int id)
1449{
1450 if (AF_INET6 == c->mcast_af) {
1451 sa->in6 = (struct sockaddr_in6) {
1452 .sin6_family = AF_INET6,
1453 .sin6_port = htons(c->mcast_port + id),
1454 };
1455 sa->in6.sin6_addr = c->mcast_group.in6;
1456 *salen = sizeof(sa->in6);
1457 } else {
1458 sa->in = (struct sockaddr_in) {
1459 .sin_family = AF_INET,
1460 .sin_port = htons(c->mcast_port + id),
1461 };
1462 sa->in.sin_addr = c->mcast_group.in;
1463 *salen = sizeof(sa->in);
1464 }
1465}
1466
1467
1468
1469
1470static int make_send_sock(struct netns_ipvs *ipvs, int id,
1471 struct net_device *dev, struct socket **sock_ret)
1472{
1473
1474 union ipvs_sockaddr mcast_addr;
1475 struct socket *sock;
1476 int result, salen;
1477
1478
1479 result = sock_create_kern(ipvs->net, ipvs->mcfg.mcast_af, SOCK_DGRAM,
1480 IPPROTO_UDP, &sock);
1481 if (result < 0) {
1482 pr_err("Error during creation of socket; terminating\n");
1483 goto error;
1484 }
1485 *sock_ret = sock;
1486 result = set_mcast_if(sock->sk, dev);
1487 if (result < 0) {
1488 pr_err("Error setting outbound mcast interface\n");
1489 goto error;
1490 }
1491
1492 set_mcast_loop(sock->sk, 0);
1493 set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl);
1494
1495 set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT);
1496 result = sysctl_sync_sock_size(ipvs);
1497 if (result > 0)
1498 set_sock_size(sock->sk, 1, result);
1499
1500 if (AF_INET == ipvs->mcfg.mcast_af)
1501 result = bind_mcastif_addr(sock, dev);
1502 else
1503 result = 0;
1504 if (result < 0) {
1505 pr_err("Error binding address of the mcast interface\n");
1506 goto error;
1507 }
1508
1509 get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id);
1510 result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
1511 salen, 0);
1512 if (result < 0) {
1513 pr_err("Error connecting to the multicast addr\n");
1514 goto error;
1515 }
1516
1517 return 0;
1518
1519error:
1520 return result;
1521}
1522
1523
1524
1525
1526
1527static int make_receive_sock(struct netns_ipvs *ipvs, int id,
1528 struct net_device *dev, struct socket **sock_ret)
1529{
1530
1531 union ipvs_sockaddr mcast_addr;
1532 struct socket *sock;
1533 int result, salen;
1534
1535
1536 result = sock_create_kern(ipvs->net, ipvs->bcfg.mcast_af, SOCK_DGRAM,
1537 IPPROTO_UDP, &sock);
1538 if (result < 0) {
1539 pr_err("Error during creation of socket; terminating\n");
1540 goto error;
1541 }
1542 *sock_ret = sock;
1543
1544 sock->sk->sk_reuse = SK_CAN_REUSE;
1545 result = sysctl_sync_sock_size(ipvs);
1546 if (result > 0)
1547 set_sock_size(sock->sk, 0, result);
1548
1549 get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id);
1550 sock->sk->sk_bound_dev_if = dev->ifindex;
1551 result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen);
1552 if (result < 0) {
1553 pr_err("Error binding to the multicast addr\n");
1554 goto error;
1555 }
1556
1557
1558#ifdef CONFIG_IP_VS_IPV6
1559 if (ipvs->bcfg.mcast_af == AF_INET6)
1560 result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr,
1561 dev);
1562 else
1563#endif
1564 result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr,
1565 dev);
1566 if (result < 0) {
1567 pr_err("Error joining to the multicast group\n");
1568 goto error;
1569 }
1570
1571 return 0;
1572
1573error:
1574 return result;
1575}
1576
1577
1578static int
1579ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
1580{
1581 struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
1582 struct kvec iov;
1583 int len;
1584
1585 EnterFunction(7);
1586 iov.iov_base = (void *)buffer;
1587 iov.iov_len = length;
1588
1589 len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
1590
1591 LeaveFunction(7);
1592 return len;
1593}
1594
1595static int
1596ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
1597{
1598 int msize;
1599 int ret;
1600
1601 msize = ntohs(msg->size);
1602
1603 ret = ip_vs_send_async(sock, (char *)msg, msize);
1604 if (ret >= 0 || ret == -EAGAIN)
1605 return ret;
1606 pr_err("ip_vs_send_async error %d\n", ret);
1607 return 0;
1608}
1609
1610static int
1611ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
1612{
1613 struct msghdr msg = {NULL,};
1614 struct kvec iov = {buffer, buflen};
1615 int len;
1616
1617 EnterFunction(7);
1618
1619
1620 iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, buflen);
1621 len = sock_recvmsg(sock, &msg, MSG_DONTWAIT);
1622 if (len < 0)
1623 return len;
1624
1625 LeaveFunction(7);
1626 return len;
1627}
1628
1629
1630static void master_wakeup_work_handler(struct work_struct *work)
1631{
1632 struct ipvs_master_sync_state *ms =
1633 container_of(work, struct ipvs_master_sync_state,
1634 master_wakeup_work.work);
1635 struct netns_ipvs *ipvs = ms->ipvs;
1636
1637 spin_lock_bh(&ipvs->sync_lock);
1638 if (ms->sync_queue_len &&
1639 ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) {
1640 int id = (int)(ms - ipvs->ms);
1641
1642 ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE;
1643 wake_up_process(ipvs->master_tinfo[id].task);
1644 }
1645 spin_unlock_bh(&ipvs->sync_lock);
1646}
1647
1648
1649static inline struct ip_vs_sync_buff *
1650next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
1651{
1652 struct ip_vs_sync_buff *sb;
1653
1654 sb = sb_dequeue(ipvs, ms);
1655 if (sb)
1656 return sb;
1657
1658 return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME);
1659}
1660
/*
 * Thread function of one master sync thread.
 *
 * @data is the thread's struct ip_vs_sync_thread_data. The thread
 * repeatedly takes a sync buffer from next_sync_buff() and sends it on
 * tinfo->sock until kthread_should_stop() becomes true, then releases
 * every buffer it can still reach. Always returns 0.
 */
static int sync_thread_master(void *data)
{
	struct ip_vs_sync_thread_data *tinfo = data;
	struct netns_ipvs *ipvs = tinfo->ipvs;
	/* Per-thread master state: slot tinfo->id of ipvs->ms. */
	struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id];
	struct sock *sk = tinfo->sock->sk;
	struct ip_vs_sync_buff *sb;

	pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
		"syncid = %d, id = %d\n",
		ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id);

	for (;;) {
		sb = next_sync_buff(ipvs, ms);
		/* sb may be non-NULL here; it is released after "done". */
		if (unlikely(kthread_should_stop()))
			break;
		if (!sb) {
			/*
			 * Nothing to send: sleep for the check period.
			 * NOTE(review): no task state is set in this
			 * function before schedule_timeout(); presumably
			 * sb_dequeue() does that when the queue is empty -
			 * confirm.
			 */
			schedule_timeout(IPVS_SYNC_CHECK_PERIOD);
			continue;
		}
		/*
		 * ip_vs_send_sync_msg() returns a negative value only for
		 * -EAGAIN, so this loop retries the same buffer once the
		 * socket is writeable again or the thread is asked to stop.
		 */
		while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) {
			__wait_event_interruptible(*sk_sleep(sk),
						   sock_writeable(sk) ||
						   kthread_should_stop());
			if (unlikely(kthread_should_stop()))
				goto done;
		}
		ip_vs_sync_buff_release(sb);
	}

done:
	__set_current_state(TASK_RUNNING);
	/* Buffer that was fetched or in flight when the stop was noticed. */
	if (sb)
		ip_vs_sync_buff_release(sb);

	/* Drain and release whatever is still queued for this thread. */
	while ((sb = sb_dequeue(ipvs, ms)))
		ip_vs_sync_buff_release(sb);
	/*
	 * NOTE(review): the state is reset a second time after the drain
	 * loop, presumably because sb_dequeue() may have changed it -
	 * confirm.
	 */
	__set_current_state(TASK_RUNNING);

	/* Finally release the current buffer, if get_curr_sync_buff() has one. */
	sb = get_curr_sync_buff(ipvs, ms, 0);
	if (sb)
		ip_vs_sync_buff_release(sb);

	return 0;
}
1711
1712
1713static int sync_thread_backup(void *data)
1714{
1715 struct ip_vs_sync_thread_data *tinfo = data;
1716 struct netns_ipvs *ipvs = tinfo->ipvs;
1717 struct sock *sk = tinfo->sock->sk;
1718 struct udp_sock *up = udp_sk(sk);
1719 int len;
1720
1721 pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
1722 "syncid = %d, id = %d\n",
1723 ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id);
1724
1725 while (!kthread_should_stop()) {
1726 wait_event_interruptible(*sk_sleep(sk),
1727 !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
1728 !skb_queue_empty_lockless(&up->reader_queue) ||
1729 kthread_should_stop());
1730
1731
1732 while (!skb_queue_empty_lockless(&sk->sk_receive_queue) ||
1733 !skb_queue_empty_lockless(&up->reader_queue)) {
1734 len = ip_vs_receive(tinfo->sock, tinfo->buf,
1735 ipvs->bcfg.sync_maxlen);
1736 if (len <= 0) {
1737 if (len != -EAGAIN)
1738 pr_err("receiving message error\n");
1739 break;
1740 }
1741
1742 ip_vs_process_message(ipvs, tinfo->buf, len);
1743 }
1744 }
1745
1746 return 0;
1747}
1748
1749
/*
 * Start the sync daemon threads for @state.
 *
 * @ipvs:  per-netns IPVS state
 * @c:     daemon configuration; unset fields (address family, port,
 *         TTL, sync_maxlen) are filled in or clamped IN PLACE before
 *         the config is copied into ipvs->mcfg or ipvs->bcfg
 * @state: IP_VS_STATE_MASTER or IP_VS_STATE_BACKUP
 *
 * Returns 0 on success. Errors set in this function:
 *   -ENOPROTOOPT  ip_vs_use_count_inc() failed
 *   -ENODEV       c->mcast_ifn names no device in this netns
 *   -EEXIST       a daemon for @state is already set up
 *   -EINVAL       @state is neither MASTER nor BACKUP
 *   -ENOMEM       an allocation failed
 * plus any error from make_send_sock()/make_receive_sock() or
 * kthread_run(). On every failure the use count taken at entry is
 * dropped again.
 */
int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
		      int state)
{
	struct ip_vs_sync_thread_data *ti = NULL, *tinfo;
	struct task_struct *task;
	struct net_device *dev;
	char *name;
	int (*threadfn)(void *data);
	int id = 0, count, hlen;
	int result = -ENOMEM;
	u16 mtu, min_mtu;

	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n",
		  sizeof(struct ip_vs_sync_conn_v0));

	/* Take a use count first; it is dropped on every error path. */
	if (!ip_vs_use_count_inc())
		return -ENOPROTOOPT;

	/*
	 * Acquire both rtnl and sync_mutex without ever blocking on one
	 * while holding the other: block on one, try-lock the other, and
	 * on failure drop the first and retry in the opposite order.
	 */
	for (;;) {
		rtnl_lock();
		if (mutex_trylock(&ipvs->sync_mutex))
			break;
		rtnl_unlock();
		mutex_lock(&ipvs->sync_mutex);
		if (rtnl_trylock())
			break;
		mutex_unlock(&ipvs->sync_mutex);
	}

	/*
	 * The thread count is chosen from the sysctl only when no daemon
	 * is active (sync_state == 0); otherwise the count recorded in
	 * threads_mask is reused.
	 */
	if (!ipvs->sync_state) {
		count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX);
		ipvs->threads_mask = count - 1;
	} else
		count = ipvs->threads_mask + 1;

	/* Fill in defaults for fields the caller left unset. */
	if (c->mcast_af == AF_UNSPEC) {
		c->mcast_af = AF_INET;
		c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP);
	}
	if (!c->mcast_port)
		c->mcast_port = IP_VS_SYNC_PORT;
	if (!c->mcast_ttl)
		c->mcast_ttl = 1;

	dev = __dev_get_by_name(ipvs->net, c->mcast_ifn);
	if (!dev) {
		pr_err("Unknown mcast interface: %s\n", c->mcast_ifn);
		result = -ENODEV;
		goto out_early;
	}
	/* Network header plus UDP header for the chosen address family. */
	hlen = (AF_INET6 == c->mcast_af) ?
	       sizeof(struct ipv6hdr) + sizeof(struct udphdr) :
	       sizeof(struct iphdr) + sizeof(struct udphdr);
	/*
	 * Backup: device MTU clamped to [1500, 65535]; master: fixed 1500.
	 * The lower bound for a user-supplied sync_maxlen is 1024 for the
	 * backup and 1 for the master.
	 */
	mtu = (state == IP_VS_STATE_BACKUP) ?
		  clamp(dev->mtu, 1500U, 65535U) : 1500U;
	min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1;

	/* A configured sync_maxlen is clamped; otherwise derive it from mtu. */
	if (c->sync_maxlen)
		c->sync_maxlen = clamp_t(unsigned int,
					 c->sync_maxlen, min_mtu,
					 65535 - hlen);
	else
		c->sync_maxlen = mtu - hlen;

	/*
	 * Refuse to start a daemon that is already set up, then store the
	 * config and select the thread name format and thread function.
	 */
	if (state == IP_VS_STATE_MASTER) {
		result = -EEXIST;
		if (ipvs->ms)
			goto out_early;

		ipvs->mcfg = *c;
		name = "ipvs-m:%d:%d";
		threadfn = sync_thread_master;
	} else if (state == IP_VS_STATE_BACKUP) {
		result = -EEXIST;
		if (ipvs->backup_tinfo)
			goto out_early;

		ipvs->bcfg = *c;
		name = "ipvs-b:%d:%d";
		threadfn = sync_thread_backup;
	} else {
		result = -EINVAL;
		goto out_early;
	}

	/* The master needs one sync state (queue + wakeup work) per thread. */
	if (state == IP_VS_STATE_MASTER) {
		struct ipvs_master_sync_state *ms;

		result = -ENOMEM;
		ipvs->ms = kcalloc(count, sizeof(ipvs->ms[0]), GFP_KERNEL);
		if (!ipvs->ms)
			goto out;
		ms = ipvs->ms;
		for (id = 0; id < count; id++, ms++) {
			INIT_LIST_HEAD(&ms->sync_queue);
			ms->sync_queue_len = 0;
			ms->sync_queue_delay = 0;
			INIT_DELAYED_WORK(&ms->master_wakeup_work,
					  master_wakeup_work_handler);
			ms->ipvs = ipvs;
		}
	}
	result = -ENOMEM;
	/* Zeroed array, so unused task/sock/buf slots are NULL on cleanup. */
	ti = kcalloc(count, sizeof(struct ip_vs_sync_thread_data),
		     GFP_KERNEL);
	if (!ti)
		goto out;

	/* Per thread: receive buffer (backup only), socket, then kthread. */
	for (id = 0; id < count; id++) {
		tinfo = &ti[id];
		tinfo->ipvs = ipvs;
		if (state == IP_VS_STATE_BACKUP) {
			result = -ENOMEM;
			tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen,
					     GFP_KERNEL);
			if (!tinfo->buf)
				goto out;
		}
		tinfo->id = id;
		if (state == IP_VS_STATE_MASTER)
			result = make_send_sock(ipvs, id, dev, &tinfo->sock);
		else
			result = make_receive_sock(ipvs, id, dev, &tinfo->sock);
		if (result < 0)
			goto out;

		task = kthread_run(threadfn, tinfo, name, ipvs->gen, id);
		if (IS_ERR(task)) {
			result = PTR_ERR(task);
			goto out;
		}
		tinfo->task = task;
	}

	/* All threads are running: publish the array and mark as active. */

	if (state == IP_VS_STATE_MASTER)
		ipvs->master_tinfo = ti;
	else
		ipvs->backup_tinfo = ti;
	spin_lock_bh(&ipvs->sync_buff_lock);
	ipvs->sync_state |= state;
	spin_unlock_bh(&ipvs->sync_buff_lock);

	mutex_unlock(&ipvs->sync_mutex);
	rtnl_unlock();

	return 0;

out:
	/*
	 * Failure after the setup checks passed.
	 * NOTE(review): rtnl is dropped before the sockets are released
	 * further down, presumably because sock_release() may itself need
	 * rtnl - confirm.
	 */
	rtnl_unlock();
	/* id can equal count here (after the ms init loop); clamp it. */
	id = min(id, count - 1);
	/* Stop every thread that was started, newest first. */
	if (ti) {
		for (tinfo = ti + id; tinfo >= ti; tinfo--) {
			if (tinfo->task)
				kthread_stop(tinfo->task);
		}
	}
	/* Free the master states unless the master is marked active. */
	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
		kfree(ipvs->ms);
		ipvs->ms = NULL;
	}
	mutex_unlock(&ipvs->sync_mutex);

	/* No locks held any more: release sockets and buffers. */
	if (ti) {
		for (tinfo = ti + id; tinfo >= ti; tinfo--) {
			if (tinfo->sock)
				sock_release(tinfo->sock);
			kfree(tinfo->buf);
		}
		kfree(ti);
	}

	/* Drop the use count taken at entry. */
	ip_vs_use_count_dec();
	return result;

out_early:
	/* Failure before anything was allocated: unlock and drop the count. */
	mutex_unlock(&ipvs->sync_mutex);
	rtnl_unlock();

	/* Drop the use count taken at entry. */
	ip_vs_use_count_dec();
	return result;
}
1942
1943
1944int stop_sync_thread(struct netns_ipvs *ipvs, int state)
1945{
1946 struct ip_vs_sync_thread_data *ti, *tinfo;
1947 int id;
1948 int retc = -EINVAL;
1949
1950 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
1951
1952 mutex_lock(&ipvs->sync_mutex);
1953 if (state == IP_VS_STATE_MASTER) {
1954 retc = -ESRCH;
1955 if (!ipvs->ms)
1956 goto err;
1957 ti = ipvs->master_tinfo;
1958
1959
1960
1961
1962
1963
1964
1965 spin_lock_bh(&ipvs->sync_buff_lock);
1966 spin_lock(&ipvs->sync_lock);
1967 ipvs->sync_state &= ~IP_VS_STATE_MASTER;
1968 spin_unlock(&ipvs->sync_lock);
1969 spin_unlock_bh(&ipvs->sync_buff_lock);
1970
1971 retc = 0;
1972 for (id = ipvs->threads_mask; id >= 0; id--) {
1973 struct ipvs_master_sync_state *ms = &ipvs->ms[id];
1974 int ret;
1975
1976 tinfo = &ti[id];
1977 pr_info("stopping master sync thread %d ...\n",
1978 task_pid_nr(tinfo->task));
1979 cancel_delayed_work_sync(&ms->master_wakeup_work);
1980 ret = kthread_stop(tinfo->task);
1981 if (retc >= 0)
1982 retc = ret;
1983 }
1984 kfree(ipvs->ms);
1985 ipvs->ms = NULL;
1986 ipvs->master_tinfo = NULL;
1987 } else if (state == IP_VS_STATE_BACKUP) {
1988 retc = -ESRCH;
1989 if (!ipvs->backup_tinfo)
1990 goto err;
1991 ti = ipvs->backup_tinfo;
1992
1993 ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
1994 retc = 0;
1995 for (id = ipvs->threads_mask; id >= 0; id--) {
1996 int ret;
1997
1998 tinfo = &ti[id];
1999 pr_info("stopping backup sync thread %d ...\n",
2000 task_pid_nr(tinfo->task));
2001 ret = kthread_stop(tinfo->task);
2002 if (retc >= 0)
2003 retc = ret;
2004 }
2005 ipvs->backup_tinfo = NULL;
2006 } else {
2007 goto err;
2008 }
2009 id = ipvs->threads_mask;
2010 mutex_unlock(&ipvs->sync_mutex);
2011
2012
2013 for (tinfo = ti + id; tinfo >= ti; tinfo--) {
2014 if (tinfo->sock)
2015 sock_release(tinfo->sock);
2016 kfree(tinfo->buf);
2017 }
2018 kfree(ti);
2019
2020
2021 ip_vs_use_count_dec();
2022 return retc;
2023
2024err:
2025 mutex_unlock(&ipvs->sync_mutex);
2026 return retc;
2027}
2028
2029
2030
2031
2032int __net_init ip_vs_sync_net_init(struct netns_ipvs *ipvs)
2033{
2034 __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key);
2035 spin_lock_init(&ipvs->sync_lock);
2036 spin_lock_init(&ipvs->sync_buff_lock);
2037 return 0;
2038}
2039
2040void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs)
2041{
2042 int retc;
2043
2044 retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER);
2045 if (retc && retc != -ESRCH)
2046 pr_err("Failed to stop Master Daemon\n");
2047
2048 retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP);
2049 if (retc && retc != -ESRCH)
2050 pr_err("Failed to stop Backup Daemon\n");
2051}
2052