1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/*
 * Log prefix for this file: pr_fmt() glues "IPVS: " in front of a format
 * string.  The pr_*() helpers are expected to expand through pr_fmt(), which
 * would explain why both macros sit ahead of the #include lines.
 */
#define KMSG_COMPONENT	"IPVS"
#define pr_fmt(fmt)	KMSG_COMPONENT ": " fmt
37
38#include <linux/module.h>
39#include <linux/slab.h>
40#include <linux/inetdevice.h>
41#include <linux/net.h>
42#include <linux/completion.h>
43#include <linux/delay.h>
44#include <linux/skbuff.h>
45#include <linux/in.h>
46#include <linux/igmp.h>
47#include <linux/udp.h>
48#include <linux/err.h>
49#include <linux/kthread.h>
50#include <linux/wait.h>
51#include <linux/kernel.h>
52#include <linux/sched/signal.h>
53
54#include <asm/unaligned.h>
55
56#include <net/ip.h>
57#include <net/sock.h>
58
59#include <net/ip_vs.h>
60
/*
 * Sync defaults.  0xe0000051 is the dotted quad 224.0.0.81; group and port
 * are presumably the default multicast endpoint (they are not referenced in
 * the code visible here).
 */
#define IP_VS_SYNC_GROUP	0xe0000051
#define IP_VS_SYNC_PORT		8848

/* Value stored in ip_vs_sync_mesg.version; anything else is parsed as v0. */
#define SYNC_PROTO_VER		1
65
66static struct lock_class_key __ipvs_sync_key;
67
68
69
70
71struct ip_vs_sync_conn_v0 {
72 __u8 reserved;
73
74
75 __u8 protocol;
76 __be16 cport;
77 __be16 vport;
78 __be16 dport;
79 __be32 caddr;
80 __be32 vaddr;
81 __be32 daddr;
82
83
84 __be16 flags;
85 __be16 state;
86
87
88};
89
90struct ip_vs_sync_conn_options {
91 struct ip_vs_seq in_seq;
92 struct ip_vs_seq out_seq;
93};
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133struct ip_vs_sync_v4 {
134 __u8 type;
135 __u8 protocol;
136 __be16 ver_size;
137
138 __be32 flags;
139 __be16 state;
140
141 __be16 cport;
142 __be16 vport;
143 __be16 dport;
144 __be32 fwmark;
145 __be32 timeout;
146 __be32 caddr;
147 __be32 vaddr;
148 __be32 daddr;
149
150
151};
152
153
154
155struct ip_vs_sync_v6 {
156 __u8 type;
157 __u8 protocol;
158 __be16 ver_size;
159
160 __be32 flags;
161 __be16 state;
162
163 __be16 cport;
164 __be16 vport;
165 __be16 dport;
166 __be32 fwmark;
167 __be32 timeout;
168 struct in6_addr caddr;
169 struct in6_addr vaddr;
170 struct in6_addr daddr;
171
172
173};
174
175union ip_vs_sync_conn {
176 struct ip_vs_sync_v4 v4;
177 struct ip_vs_sync_v6 v6;
178};
179
180
/* Bits of the record 'type' byte: bit 0 marks an IPv6 record. */
#define STYPE_INET6		0
#define STYPE_F_INET6		(1 << STYPE_INET6)

/* Split of ver_size: version above bit 12, record size in the low 12 bits. */
#define SVER_SHIFT		12
#define SVER_MASK		0x0fff

/* Option types that may follow the fixed part of a v1 record. */
#define IPVS_OPT_SEQ_DATA	1
#define IPVS_OPT_PE_DATA	2
#define IPVS_OPT_PE_NAME	3
#define IPVS_OPT_PARAM		7

/*
 * Bit for each option type.  The SEQ/PE bits track which options were
 * already seen in a record.  IPVS_OPT_F_PARAM is masked out of the received
 * type byte; an unknown option type is skipped when the byte has this bit
 * set and makes the record be dropped when it does not.
 */
#define IPVS_OPT_F_SEQ_DATA	(1 << (IPVS_OPT_SEQ_DATA-1))
#define IPVS_OPT_F_PE_DATA	(1 << (IPVS_OPT_PE_DATA-1))
#define IPVS_OPT_F_PE_NAME	(1 << (IPVS_OPT_PE_NAME-1))
#define IPVS_OPT_F_PARAM	(1 << (IPVS_OPT_PARAM-1))
196
/*
 * Per-thread bookkeeping for a sync thread.  Only .task is used in the code
 * visible here (sb_queue_tail() wakes ipvs->master_tinfo[id].task); the
 * remaining members are presumably the thread's namespace, socket, receive
 * buffer and index -- confirm against the thread setup code.
 */
struct ip_vs_sync_thread_data {
	struct task_struct	*task;
	struct netns_ipvs	*ipvs;
	struct socket		*sock;
	char			*buf;
	int			id;
};
204
205
/* Size of a v0 record without, and with, the trailing sequence options. */
#define SIMPLE_CONN_SIZE	(sizeof(struct ip_vs_sync_conn_v0))
#define FULL_CONN_SIZE \
	(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
/*
 * 4 matches the size of the v0 message header (two bytes plus a 16-bit
 * size); 255 is the largest value the 8-bit nr_conns field can hold.
 * Neither macro is referenced in the code visible here.
 */
#define SYNC_MESG_HEADER_LEN	4
#define MAX_CONNS_PER_SYNCBUFF	255
247
248
249struct ip_vs_sync_mesg_v0 {
250 __u8 nr_conns;
251 __u8 syncid;
252 __be16 size;
253
254
255};
256
257
258struct ip_vs_sync_mesg {
259 __u8 reserved;
260 __u8 syncid;
261 __be16 size;
262 __u8 nr_conns;
263 __s8 version;
264 __u16 spare;
265
266};
267
268union ipvs_sockaddr {
269 struct sockaddr_in in;
270 struct sockaddr_in6 in6;
271};
272
273struct ip_vs_sync_buff {
274 struct list_head list;
275 unsigned long firstuse;
276
277
278 struct ip_vs_sync_mesg *mesg;
279 unsigned char *head;
280 unsigned char *end;
281};
282
283
284
285
286
287static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho)
288{
289 memset(ho, 0, sizeof(*ho));
290 ho->init_seq = get_unaligned_be32(&no->init_seq);
291 ho->delta = get_unaligned_be32(&no->delta);
292 ho->previous_delta = get_unaligned_be32(&no->previous_delta);
293}
294
295
296
297
298
299static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
300{
301 put_unaligned_be32(ho->init_seq, &no->init_seq);
302 put_unaligned_be32(ho->delta, &no->delta);
303 put_unaligned_be32(ho->previous_delta, &no->previous_delta);
304}
305
306static inline struct ip_vs_sync_buff *
307sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
308{
309 struct ip_vs_sync_buff *sb;
310
311 spin_lock_bh(&ipvs->sync_lock);
312 if (list_empty(&ms->sync_queue)) {
313 sb = NULL;
314 __set_current_state(TASK_INTERRUPTIBLE);
315 } else {
316 sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff,
317 list);
318 list_del(&sb->list);
319 ms->sync_queue_len--;
320 if (!ms->sync_queue_len)
321 ms->sync_queue_delay = 0;
322 }
323 spin_unlock_bh(&ipvs->sync_lock);
324
325 return sb;
326}
327
328
329
330
331static inline struct ip_vs_sync_buff *
332ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len)
333{
334 struct ip_vs_sync_buff *sb;
335
336 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
337 return NULL;
338
339 len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg),
340 ipvs->mcfg.sync_maxlen);
341 sb->mesg = kmalloc(len, GFP_ATOMIC);
342 if (!sb->mesg) {
343 kfree(sb);
344 return NULL;
345 }
346 sb->mesg->reserved = 0;
347 sb->mesg->version = SYNC_PROTO_VER;
348 sb->mesg->syncid = ipvs->mcfg.syncid;
349 sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg));
350 sb->mesg->nr_conns = 0;
351 sb->mesg->spare = 0;
352 sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
353 sb->end = (unsigned char *)sb->mesg + len;
354
355 sb->firstuse = jiffies;
356 return sb;
357}
358
359static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
360{
361 kfree(sb->mesg);
362 kfree(sb);
363}
364
365static inline void sb_queue_tail(struct netns_ipvs *ipvs,
366 struct ipvs_master_sync_state *ms)
367{
368 struct ip_vs_sync_buff *sb = ms->sync_buff;
369
370 spin_lock(&ipvs->sync_lock);
371 if (ipvs->sync_state & IP_VS_STATE_MASTER &&
372 ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) {
373 if (!ms->sync_queue_len)
374 schedule_delayed_work(&ms->master_wakeup_work,
375 max(IPVS_SYNC_SEND_DELAY, 1));
376 ms->sync_queue_len++;
377 list_add_tail(&sb->list, &ms->sync_queue);
378 if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) {
379 int id = (int)(ms - ipvs->ms);
380
381 wake_up_process(ipvs->master_tinfo[id].task);
382 }
383 } else
384 ip_vs_sync_buff_release(sb);
385 spin_unlock(&ipvs->sync_lock);
386}
387
388
389
390
391
392static inline struct ip_vs_sync_buff *
393get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms,
394 unsigned long time)
395{
396 struct ip_vs_sync_buff *sb;
397
398 spin_lock_bh(&ipvs->sync_buff_lock);
399 sb = ms->sync_buff;
400 if (sb && time_after_eq(jiffies - sb->firstuse, time)) {
401 ms->sync_buff = NULL;
402 __set_current_state(TASK_RUNNING);
403 } else
404 sb = NULL;
405 spin_unlock_bh(&ipvs->sync_buff_lock);
406 return sb;
407}
408
409static inline int
410select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp)
411{
412 return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask;
413}
414
415
416
417
418static inline struct ip_vs_sync_buff *
419ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len)
420{
421 struct ip_vs_sync_buff *sb;
422 struct ip_vs_sync_mesg_v0 *mesg;
423
424 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
425 return NULL;
426
427 len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0),
428 ipvs->mcfg.sync_maxlen);
429 sb->mesg = kmalloc(len, GFP_ATOMIC);
430 if (!sb->mesg) {
431 kfree(sb);
432 return NULL;
433 }
434 mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
435 mesg->nr_conns = 0;
436 mesg->syncid = ipvs->mcfg.syncid;
437 mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0));
438 sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0);
439 sb->end = (unsigned char *)mesg + len;
440 sb->firstuse = jiffies;
441 return sb;
442}
443
444
445static inline bool in_persistence(struct ip_vs_conn *cp)
446{
447 for (cp = cp->control; cp; cp = cp->control) {
448 if (cp->flags & IP_VS_CONN_F_TEMPLATE)
449 return true;
450 }
451 return false;
452}
453
454
455
456
457
458
459
460
461
462
463static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs,
464 struct ip_vs_conn *cp, int pkts)
465{
466 unsigned long orig = READ_ONCE(cp->sync_endtime);
467 unsigned long now = jiffies;
468 unsigned long n = (now + cp->timeout) & ~3UL;
469 unsigned int sync_refresh_period;
470 int sync_period;
471 int force;
472
473
474 if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE))
475 force = 0;
476 else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp)))
477 return 0;
478 else if (likely(cp->protocol == IPPROTO_TCP)) {
479 if (!((1 << cp->state) &
480 ((1 << IP_VS_TCP_S_ESTABLISHED) |
481 (1 << IP_VS_TCP_S_FIN_WAIT) |
482 (1 << IP_VS_TCP_S_CLOSE) |
483 (1 << IP_VS_TCP_S_CLOSE_WAIT) |
484 (1 << IP_VS_TCP_S_TIME_WAIT))))
485 return 0;
486 force = cp->state != cp->old_state;
487 if (force && cp->state != IP_VS_TCP_S_ESTABLISHED)
488 goto set;
489 } else if (unlikely(cp->protocol == IPPROTO_SCTP)) {
490 if (!((1 << cp->state) &
491 ((1 << IP_VS_SCTP_S_ESTABLISHED) |
492 (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) |
493 (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) |
494 (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) |
495 (1 << IP_VS_SCTP_S_CLOSED))))
496 return 0;
497 force = cp->state != cp->old_state;
498 if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED)
499 goto set;
500 } else {
501
502 force = 0;
503 }
504
505 sync_refresh_period = sysctl_sync_refresh_period(ipvs);
506 if (sync_refresh_period > 0) {
507 long diff = n - orig;
508 long min_diff = max(cp->timeout >> 1, 10UL * HZ);
509
510
511
512
513 if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) {
514 int retries = orig & 3;
515
516 if (retries >= sysctl_sync_retries(ipvs))
517 return 0;
518 if (time_before(now, orig - cp->timeout +
519 (sync_refresh_period >> 3)))
520 return 0;
521 n |= retries + 1;
522 }
523 }
524 sync_period = sysctl_sync_period(ipvs);
525 if (sync_period > 0) {
526 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) &&
527 pkts % sync_period != sysctl_sync_threshold(ipvs))
528 return 0;
529 } else if (!sync_refresh_period &&
530 pkts != sysctl_sync_threshold(ipvs))
531 return 0;
532
533set:
534 cp->old_state = cp->state;
535 n = cmpxchg(&cp->sync_endtime, orig, n);
536 return n == orig || force;
537}
538
539
540
541
542
543static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp,
544 int pkts)
545{
546 struct ip_vs_sync_mesg_v0 *m;
547 struct ip_vs_sync_conn_v0 *s;
548 struct ip_vs_sync_buff *buff;
549 struct ipvs_master_sync_state *ms;
550 int id;
551 unsigned int len;
552
553 if (unlikely(cp->af != AF_INET))
554 return;
555
556 if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
557 return;
558
559 if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
560 return;
561
562 spin_lock_bh(&ipvs->sync_buff_lock);
563 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
564 spin_unlock_bh(&ipvs->sync_buff_lock);
565 return;
566 }
567
568 id = select_master_thread_id(ipvs, cp);
569 ms = &ipvs->ms[id];
570 buff = ms->sync_buff;
571 len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
572 SIMPLE_CONN_SIZE;
573 if (buff) {
574 m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
575
576 if (buff->head + len > buff->end || !m->nr_conns) {
577 sb_queue_tail(ipvs, ms);
578 ms->sync_buff = NULL;
579 buff = NULL;
580 }
581 }
582 if (!buff) {
583 buff = ip_vs_sync_buff_create_v0(ipvs, len);
584 if (!buff) {
585 spin_unlock_bh(&ipvs->sync_buff_lock);
586 pr_err("ip_vs_sync_buff_create failed.\n");
587 return;
588 }
589 ms->sync_buff = buff;
590 }
591
592 m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
593 s = (struct ip_vs_sync_conn_v0 *) buff->head;
594
595
596 s->reserved = 0;
597 s->protocol = cp->protocol;
598 s->cport = cp->cport;
599 s->vport = cp->vport;
600 s->dport = cp->dport;
601 s->caddr = cp->caddr.ip;
602 s->vaddr = cp->vaddr.ip;
603 s->daddr = cp->daddr.ip;
604 s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
605 s->state = htons(cp->state);
606 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
607 struct ip_vs_sync_conn_options *opt =
608 (struct ip_vs_sync_conn_options *)&s[1];
609 memcpy(opt, &cp->in_seq, sizeof(*opt));
610 }
611
612 m->nr_conns++;
613 m->size = htons(ntohs(m->size) + len);
614 buff->head += len;
615 spin_unlock_bh(&ipvs->sync_buff_lock);
616
617
618 cp = cp->control;
619 if (cp) {
620 if (cp->flags & IP_VS_CONN_F_TEMPLATE)
621 pkts = atomic_add_return(1, &cp->in_pkts);
622 else
623 pkts = sysctl_sync_threshold(ipvs);
624 ip_vs_sync_conn(ipvs, cp, pkts);
625 }
626}
627
628
629
630
631
632
633void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts)
634{
635 struct ip_vs_sync_mesg *m;
636 union ip_vs_sync_conn *s;
637 struct ip_vs_sync_buff *buff;
638 struct ipvs_master_sync_state *ms;
639 int id;
640 __u8 *p;
641 unsigned int len, pe_name_len, pad;
642
643
644 if (sysctl_sync_ver(ipvs) == 0) {
645 ip_vs_sync_conn_v0(ipvs, cp, pkts);
646 return;
647 }
648
649 if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
650 goto control;
651sloop:
652 if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
653 goto control;
654
655
656 pe_name_len = 0;
657 if (cp->pe_data_len) {
658 if (!cp->pe_data || !cp->dest) {
659 IP_VS_ERR_RL("SYNC, connection pe_data invalid\n");
660 return;
661 }
662 pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
663 }
664
665 spin_lock_bh(&ipvs->sync_buff_lock);
666 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
667 spin_unlock_bh(&ipvs->sync_buff_lock);
668 return;
669 }
670
671 id = select_master_thread_id(ipvs, cp);
672 ms = &ipvs->ms[id];
673
674#ifdef CONFIG_IP_VS_IPV6
675 if (cp->af == AF_INET6)
676 len = sizeof(struct ip_vs_sync_v6);
677 else
678#endif
679 len = sizeof(struct ip_vs_sync_v4);
680
681 if (cp->flags & IP_VS_CONN_F_SEQ_MASK)
682 len += sizeof(struct ip_vs_sync_conn_options) + 2;
683
684 if (cp->pe_data_len)
685 len += cp->pe_data_len + 2;
686 if (pe_name_len)
687 len += pe_name_len + 2;
688
689
690 pad = 0;
691 buff = ms->sync_buff;
692 if (buff) {
693 m = buff->mesg;
694 pad = (4 - (size_t) buff->head) & 3;
695
696 if (buff->head + len + pad > buff->end || m->reserved) {
697 sb_queue_tail(ipvs, ms);
698 ms->sync_buff = NULL;
699 buff = NULL;
700 pad = 0;
701 }
702 }
703
704 if (!buff) {
705 buff = ip_vs_sync_buff_create(ipvs, len);
706 if (!buff) {
707 spin_unlock_bh(&ipvs->sync_buff_lock);
708 pr_err("ip_vs_sync_buff_create failed.\n");
709 return;
710 }
711 ms->sync_buff = buff;
712 m = buff->mesg;
713 }
714
715 p = buff->head;
716 buff->head += pad + len;
717 m->size = htons(ntohs(m->size) + pad + len);
718
719 while (pad--)
720 *(p++) = 0;
721
722 s = (union ip_vs_sync_conn *)p;
723
724
725 s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0);
726 s->v4.ver_size = htons(len & SVER_MASK);
727 s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED);
728 s->v4.state = htons(cp->state);
729 s->v4.protocol = cp->protocol;
730 s->v4.cport = cp->cport;
731 s->v4.vport = cp->vport;
732 s->v4.dport = cp->dport;
733 s->v4.fwmark = htonl(cp->fwmark);
734 s->v4.timeout = htonl(cp->timeout / HZ);
735 m->nr_conns++;
736
737#ifdef CONFIG_IP_VS_IPV6
738 if (cp->af == AF_INET6) {
739 p += sizeof(struct ip_vs_sync_v6);
740 s->v6.caddr = cp->caddr.in6;
741 s->v6.vaddr = cp->vaddr.in6;
742 s->v6.daddr = cp->daddr.in6;
743 } else
744#endif
745 {
746 p += sizeof(struct ip_vs_sync_v4);
747 s->v4.caddr = cp->caddr.ip;
748 s->v4.vaddr = cp->vaddr.ip;
749 s->v4.daddr = cp->daddr.ip;
750 }
751 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
752 *(p++) = IPVS_OPT_SEQ_DATA;
753 *(p++) = sizeof(struct ip_vs_sync_conn_options);
754 hton_seq((struct ip_vs_seq *)p, &cp->in_seq);
755 p += sizeof(struct ip_vs_seq);
756 hton_seq((struct ip_vs_seq *)p, &cp->out_seq);
757 p += sizeof(struct ip_vs_seq);
758 }
759
760 if (cp->pe_data_len && cp->pe_data) {
761 *(p++) = IPVS_OPT_PE_DATA;
762 *(p++) = cp->pe_data_len;
763 memcpy(p, cp->pe_data, cp->pe_data_len);
764 p += cp->pe_data_len;
765 if (pe_name_len) {
766
767 *(p++) = IPVS_OPT_PE_NAME;
768 *(p++) = pe_name_len;
769 memcpy(p, cp->pe->name, pe_name_len);
770 p += pe_name_len;
771 }
772 }
773
774 spin_unlock_bh(&ipvs->sync_buff_lock);
775
776control:
777
778 cp = cp->control;
779 if (!cp)
780 return;
781 if (cp->flags & IP_VS_CONN_F_TEMPLATE)
782 pkts = atomic_add_return(1, &cp->in_pkts);
783 else
784 pkts = sysctl_sync_threshold(ipvs);
785 goto sloop;
786}
787
788
789
790
791static inline int
792ip_vs_conn_fill_param_sync(struct netns_ipvs *ipvs, int af, union ip_vs_sync_conn *sc,
793 struct ip_vs_conn_param *p,
794 __u8 *pe_data, unsigned int pe_data_len,
795 __u8 *pe_name, unsigned int pe_name_len)
796{
797#ifdef CONFIG_IP_VS_IPV6
798 if (af == AF_INET6)
799 ip_vs_conn_fill_param(ipvs, af, sc->v6.protocol,
800 (const union nf_inet_addr *)&sc->v6.caddr,
801 sc->v6.cport,
802 (const union nf_inet_addr *)&sc->v6.vaddr,
803 sc->v6.vport, p);
804 else
805#endif
806 ip_vs_conn_fill_param(ipvs, af, sc->v4.protocol,
807 (const union nf_inet_addr *)&sc->v4.caddr,
808 sc->v4.cport,
809 (const union nf_inet_addr *)&sc->v4.vaddr,
810 sc->v4.vport, p);
811
812 if (pe_data_len) {
813 if (pe_name_len) {
814 char buff[IP_VS_PENAME_MAXLEN+1];
815
816 memcpy(buff, pe_name, pe_name_len);
817 buff[pe_name_len]=0;
818 p->pe = __ip_vs_pe_getbyname(buff);
819 if (!p->pe) {
820 IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n",
821 buff);
822 return 1;
823 }
824 } else {
825 IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n");
826 return 1;
827 }
828
829 p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC);
830 if (!p->pe_data) {
831 module_put(p->pe->module);
832 return -ENOMEM;
833 }
834 p->pe_data_len = pe_data_len;
835 }
836 return 0;
837}
838
839
840
841
842
843
844
845static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *param,
846 unsigned int flags, unsigned int state,
847 unsigned int protocol, unsigned int type,
848 const union nf_inet_addr *daddr, __be16 dport,
849 unsigned long timeout, __u32 fwmark,
850 struct ip_vs_sync_conn_options *opt)
851{
852 struct ip_vs_dest *dest;
853 struct ip_vs_conn *cp;
854
855 if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
856 cp = ip_vs_conn_in_get(param);
857 if (cp && ((cp->dport != dport) ||
858 !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) {
859 if (!(flags & IP_VS_CONN_F_INACTIVE)) {
860 ip_vs_conn_expire_now(cp);
861 __ip_vs_conn_put(cp);
862 cp = NULL;
863 } else {
864
865
866
867
868 __ip_vs_conn_put(cp);
869 kfree(param->pe_data);
870 return;
871 }
872 }
873 } else {
874 cp = ip_vs_ct_in_get(param);
875 }
876
877 if (cp) {
878
879 kfree(param->pe_data);
880
881 dest = cp->dest;
882 spin_lock_bh(&cp->lock);
883 if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE &&
884 !(flags & IP_VS_CONN_F_TEMPLATE) && dest) {
885 if (flags & IP_VS_CONN_F_INACTIVE) {
886 atomic_dec(&dest->activeconns);
887 atomic_inc(&dest->inactconns);
888 } else {
889 atomic_inc(&dest->activeconns);
890 atomic_dec(&dest->inactconns);
891 }
892 }
893 flags &= IP_VS_CONN_F_BACKUP_UPD_MASK;
894 flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK;
895 cp->flags = flags;
896 spin_unlock_bh(&cp->lock);
897 if (!dest)
898 ip_vs_try_bind_dest(cp);
899 } else {
900
901
902
903
904
905 rcu_read_lock();
906
907
908
909
910
911 dest = ip_vs_find_dest(ipvs, type, type, daddr, dport,
912 param->vaddr, param->vport, protocol,
913 fwmark, flags);
914
915 cp = ip_vs_conn_new(param, type, daddr, dport, flags, dest,
916 fwmark);
917 rcu_read_unlock();
918 if (!cp) {
919 kfree(param->pe_data);
920 IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
921 return;
922 }
923 if (!(flags & IP_VS_CONN_F_TEMPLATE))
924 kfree(param->pe_data);
925 }
926
927 if (opt) {
928 cp->in_seq = opt->in_seq;
929 cp->out_seq = opt->out_seq;
930 }
931 atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs));
932 cp->state = state;
933 cp->old_state = cp->state;
934
935
936
937
938
939
940
941
942
943 if (timeout) {
944 if (timeout > MAX_SCHEDULE_TIMEOUT / HZ)
945 timeout = MAX_SCHEDULE_TIMEOUT / HZ;
946 cp->timeout = timeout*HZ;
947 } else {
948 struct ip_vs_proto_data *pd;
949
950 pd = ip_vs_proto_data_get(ipvs, protocol);
951 if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table)
952 cp->timeout = pd->timeout_table[state];
953 else
954 cp->timeout = (3*60*HZ);
955 }
956 ip_vs_conn_put(cp);
957}
958
959
960
961
962static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer,
963 const size_t buflen)
964{
965 struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer;
966 struct ip_vs_sync_conn_v0 *s;
967 struct ip_vs_sync_conn_options *opt;
968 struct ip_vs_protocol *pp;
969 struct ip_vs_conn_param param;
970 char *p;
971 int i;
972
973 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0);
974 for (i=0; i<m->nr_conns; i++) {
975 unsigned int flags, state;
976
977 if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
978 IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
979 return;
980 }
981 s = (struct ip_vs_sync_conn_v0 *) p;
982 flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
983 flags &= ~IP_VS_CONN_F_HASHED;
984 if (flags & IP_VS_CONN_F_SEQ_MASK) {
985 opt = (struct ip_vs_sync_conn_options *)&s[1];
986 p += FULL_CONN_SIZE;
987 if (p > buffer+buflen) {
988 IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n");
989 return;
990 }
991 } else {
992 opt = NULL;
993 p += SIMPLE_CONN_SIZE;
994 }
995
996 state = ntohs(s->state);
997 if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
998 pp = ip_vs_proto_get(s->protocol);
999 if (!pp) {
1000 IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n",
1001 s->protocol);
1002 continue;
1003 }
1004 if (state >= pp->num_states) {
1005 IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n",
1006 pp->name, state);
1007 continue;
1008 }
1009 } else {
1010 if (state >= IP_VS_CTPL_S_LAST)
1011 IP_VS_DBG(7, "BACKUP v0, Invalid tpl state %u\n",
1012 state);
1013 }
1014
1015 ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol,
1016 (const union nf_inet_addr *)&s->caddr,
1017 s->cport,
1018 (const union nf_inet_addr *)&s->vaddr,
1019 s->vport, ¶m);
1020
1021
1022 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->protocol, AF_INET,
1023 (union nf_inet_addr *)&s->daddr, s->dport,
1024 0, 0, opt);
1025 }
1026}
1027
1028
1029
1030
1031static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen,
1032 __u32 *opt_flags,
1033 struct ip_vs_sync_conn_options *opt)
1034{
1035 struct ip_vs_sync_conn_options *topt;
1036
1037 topt = (struct ip_vs_sync_conn_options *)p;
1038
1039 if (plen != sizeof(struct ip_vs_sync_conn_options)) {
1040 IP_VS_DBG(2, "BACKUP, bogus conn options length\n");
1041 return -EINVAL;
1042 }
1043 if (*opt_flags & IPVS_OPT_F_SEQ_DATA) {
1044 IP_VS_DBG(2, "BACKUP, conn options found twice\n");
1045 return -EINVAL;
1046 }
1047 ntoh_seq(&topt->in_seq, &opt->in_seq);
1048 ntoh_seq(&topt->out_seq, &opt->out_seq);
1049 *opt_flags |= IPVS_OPT_F_SEQ_DATA;
1050 return 0;
1051}
1052
1053static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len,
1054 __u8 **data, unsigned int maxlen,
1055 __u32 *opt_flags, __u32 flag)
1056{
1057 if (plen > maxlen) {
1058 IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen);
1059 return -EINVAL;
1060 }
1061 if (*opt_flags & flag) {
1062 IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag);
1063 return -EINVAL;
1064 }
1065 *data_len = plen;
1066 *data = p;
1067 *opt_flags |= flag;
1068 return 0;
1069}
1070
1071
1072
1073static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *msg_end)
1074{
1075 struct ip_vs_sync_conn_options opt;
1076 union ip_vs_sync_conn *s;
1077 struct ip_vs_protocol *pp;
1078 struct ip_vs_conn_param param;
1079 __u32 flags;
1080 unsigned int af, state, pe_data_len=0, pe_name_len=0;
1081 __u8 *pe_data=NULL, *pe_name=NULL;
1082 __u32 opt_flags=0;
1083 int retc=0;
1084
1085 s = (union ip_vs_sync_conn *) p;
1086
1087 if (s->v6.type & STYPE_F_INET6) {
1088#ifdef CONFIG_IP_VS_IPV6
1089 af = AF_INET6;
1090 p += sizeof(struct ip_vs_sync_v6);
1091#else
1092 IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n");
1093 retc = 10;
1094 goto out;
1095#endif
1096 } else if (!s->v4.type) {
1097 af = AF_INET;
1098 p += sizeof(struct ip_vs_sync_v4);
1099 } else {
1100 return -10;
1101 }
1102 if (p > msg_end)
1103 return -20;
1104
1105
1106 while (p < msg_end) {
1107 int ptype;
1108 int plen;
1109
1110 if (p+2 > msg_end)
1111 return -30;
1112 ptype = *(p++);
1113 plen = *(p++);
1114
1115 if (!plen || ((p + plen) > msg_end))
1116 return -40;
1117
1118 switch (ptype & ~IPVS_OPT_F_PARAM) {
1119 case IPVS_OPT_SEQ_DATA:
1120 if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt))
1121 return -50;
1122 break;
1123
1124 case IPVS_OPT_PE_DATA:
1125 if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data,
1126 IP_VS_PEDATA_MAXLEN, &opt_flags,
1127 IPVS_OPT_F_PE_DATA))
1128 return -60;
1129 break;
1130
1131 case IPVS_OPT_PE_NAME:
1132 if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name,
1133 IP_VS_PENAME_MAXLEN, &opt_flags,
1134 IPVS_OPT_F_PE_NAME))
1135 return -70;
1136 break;
1137
1138 default:
1139
1140 if (!(ptype & IPVS_OPT_F_PARAM)) {
1141 IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n",
1142 ptype & ~IPVS_OPT_F_PARAM);
1143 retc = 20;
1144 goto out;
1145 }
1146 }
1147 p += plen;
1148 }
1149
1150
1151 flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK;
1152 flags |= IP_VS_CONN_F_SYNC;
1153 state = ntohs(s->v4.state);
1154
1155 if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
1156 pp = ip_vs_proto_get(s->v4.protocol);
1157 if (!pp) {
1158 IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n",
1159 s->v4.protocol);
1160 retc = 30;
1161 goto out;
1162 }
1163 if (state >= pp->num_states) {
1164 IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n",
1165 pp->name, state);
1166 retc = 40;
1167 goto out;
1168 }
1169 } else {
1170 if (state >= IP_VS_CTPL_S_LAST)
1171 IP_VS_DBG(7, "BACKUP, Invalid tpl state %u\n",
1172 state);
1173 }
1174 if (ip_vs_conn_fill_param_sync(ipvs, af, s, ¶m, pe_data,
1175 pe_data_len, pe_name, pe_name_len)) {
1176 retc = 50;
1177 goto out;
1178 }
1179
1180 if (af == AF_INET)
1181 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v4.protocol, af,
1182 (union nf_inet_addr *)&s->v4.daddr, s->v4.dport,
1183 ntohl(s->v4.timeout), ntohl(s->v4.fwmark),
1184 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
1185 );
1186#ifdef CONFIG_IP_VS_IPV6
1187 else
1188 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v6.protocol, af,
1189 (union nf_inet_addr *)&s->v6.daddr, s->v6.dport,
1190 ntohl(s->v6.timeout), ntohl(s->v6.fwmark),
1191 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
1192 );
1193#endif
1194 ip_vs_pe_put(param.pe);
1195 return 0;
1196
1197out:
1198 IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc);
1199 return retc;
1200
1201}
1202
1203
1204
1205
1206
1207static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer,
1208 const size_t buflen)
1209{
1210 struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
1211 __u8 *p, *msg_end;
1212 int i, nr_conns;
1213
1214 if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) {
1215 IP_VS_DBG(2, "BACKUP, message header too short\n");
1216 return;
1217 }
1218
1219 if (buflen != ntohs(m2->size)) {
1220 IP_VS_DBG(2, "BACKUP, bogus message size\n");
1221 return;
1222 }
1223
1224 if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) {
1225 IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
1226 return;
1227 }
1228
1229 if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0)
1230 && (m2->spare == 0)) {
1231
1232 msg_end = buffer + sizeof(struct ip_vs_sync_mesg);
1233 nr_conns = m2->nr_conns;
1234
1235 for (i=0; i<nr_conns; i++) {
1236 union ip_vs_sync_conn *s;
1237 unsigned int size;
1238 int retc;
1239
1240 p = msg_end;
1241 if (p + sizeof(s->v4) > buffer+buflen) {
1242 IP_VS_ERR_RL("BACKUP, Dropping buffer, too small\n");
1243 return;
1244 }
1245 s = (union ip_vs_sync_conn *)p;
1246 size = ntohs(s->v4.ver_size) & SVER_MASK;
1247 msg_end = p + size;
1248
1249 if (msg_end > buffer+buflen) {
1250 IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");
1251 return;
1252 }
1253 if (ntohs(s->v4.ver_size) >> SVER_SHIFT) {
1254 IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n",
1255 ntohs(s->v4.ver_size) >> SVER_SHIFT);
1256 return;
1257 }
1258
1259 retc = ip_vs_proc_sync_conn(ipvs, p, msg_end);
1260 if (retc < 0) {
1261 IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
1262 retc);
1263 return;
1264 }
1265
1266 msg_end = p + ((size + 3) & ~3);
1267 }
1268 } else {
1269
1270 ip_vs_process_message_v0(ipvs, buffer, buflen);
1271 return;
1272 }
1273}
1274
1275
1276
1277
1278
1279static void set_sock_size(struct sock *sk, int mode, int val)
1280{
1281
1282
1283 lock_sock(sk);
1284 if (mode) {
1285 val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2,
1286 sysctl_wmem_max);
1287 sk->sk_sndbuf = val * 2;
1288 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1289 } else {
1290 val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2,
1291 sysctl_rmem_max);
1292 sk->sk_rcvbuf = val * 2;
1293 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
1294 }
1295 release_sock(sk);
1296}
1297
1298
1299
1300
1301static void set_mcast_loop(struct sock *sk, u_char loop)
1302{
1303 struct inet_sock *inet = inet_sk(sk);
1304
1305
1306 lock_sock(sk);
1307 inet->mc_loop = loop ? 1 : 0;
1308#ifdef CONFIG_IP_VS_IPV6
1309 if (sk->sk_family == AF_INET6) {
1310 struct ipv6_pinfo *np = inet6_sk(sk);
1311
1312
1313 np->mc_loop = loop ? 1 : 0;
1314 }
1315#endif
1316 release_sock(sk);
1317}
1318
1319
1320
1321
1322static void set_mcast_ttl(struct sock *sk, u_char ttl)
1323{
1324 struct inet_sock *inet = inet_sk(sk);
1325
1326
1327 lock_sock(sk);
1328 inet->mc_ttl = ttl;
1329#ifdef CONFIG_IP_VS_IPV6
1330 if (sk->sk_family == AF_INET6) {
1331 struct ipv6_pinfo *np = inet6_sk(sk);
1332
1333
1334 np->mcast_hops = ttl;
1335 }
1336#endif
1337 release_sock(sk);
1338}
1339
1340
1341static void set_mcast_pmtudisc(struct sock *sk, int val)
1342{
1343 struct inet_sock *inet = inet_sk(sk);
1344
1345
1346 lock_sock(sk);
1347 inet->pmtudisc = val;
1348#ifdef CONFIG_IP_VS_IPV6
1349 if (sk->sk_family == AF_INET6) {
1350 struct ipv6_pinfo *np = inet6_sk(sk);
1351
1352
1353 np->pmtudisc = val;
1354 }
1355#endif
1356 release_sock(sk);
1357}
1358
1359
1360
1361
1362static int set_mcast_if(struct sock *sk, struct net_device *dev)
1363{
1364 struct inet_sock *inet = inet_sk(sk);
1365
1366 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
1367 return -EINVAL;
1368
1369 lock_sock(sk);
1370 inet->mc_index = dev->ifindex;
1371
1372#ifdef CONFIG_IP_VS_IPV6
1373 if (sk->sk_family == AF_INET6) {
1374 struct ipv6_pinfo *np = inet6_sk(sk);
1375
1376
1377 np->mcast_oif = dev->ifindex;
1378 }
1379#endif
1380 release_sock(sk);
1381
1382 return 0;
1383}
1384
1385
1386
1387
1388
1389
1390
1391static int
1392join_mcast_group(struct sock *sk, struct in_addr *addr, struct net_device *dev)
1393{
1394 struct ip_mreqn mreq;
1395 int ret;
1396
1397 memset(&mreq, 0, sizeof(mreq));
1398 memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
1399
1400 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
1401 return -EINVAL;
1402
1403 mreq.imr_ifindex = dev->ifindex;
1404
1405 lock_sock(sk);
1406 ret = ip_mc_join_group(sk, &mreq);
1407 release_sock(sk);
1408
1409 return ret;
1410}
1411
1412#ifdef CONFIG_IP_VS_IPV6
1413static int join_mcast_group6(struct sock *sk, struct in6_addr *addr,
1414 struct net_device *dev)
1415{
1416 int ret;
1417
1418 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
1419 return -EINVAL;
1420
1421 lock_sock(sk);
1422 ret = ipv6_sock_mc_join(sk, dev->ifindex, addr);
1423 release_sock(sk);
1424
1425 return ret;
1426}
1427#endif
1428
1429static int bind_mcastif_addr(struct socket *sock, struct net_device *dev)
1430{
1431 __be32 addr;
1432 struct sockaddr_in sin;
1433
1434 addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1435 if (!addr)
1436 pr_err("You probably need to specify IP address on "
1437 "multicast interface.\n");
1438
1439 IP_VS_DBG(7, "binding socket with (%s) %pI4\n",
1440 dev->name, &addr);
1441
1442
1443 sin.sin_family = AF_INET;
1444 sin.sin_addr.s_addr = addr;
1445 sin.sin_port = 0;
1446
1447 return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
1448}
1449
1450static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen,
1451 struct ipvs_sync_daemon_cfg *c, int id)
1452{
1453 if (AF_INET6 == c->mcast_af) {
1454 sa->in6 = (struct sockaddr_in6) {
1455 .sin6_family = AF_INET6,
1456 .sin6_port = htons(c->mcast_port + id),
1457 };
1458 sa->in6.sin6_addr = c->mcast_group.in6;
1459 *salen = sizeof(sa->in6);
1460 } else {
1461 sa->in = (struct sockaddr_in) {
1462 .sin_family = AF_INET,
1463 .sin_port = htons(c->mcast_port + id),
1464 };
1465 sa->in.sin_addr = c->mcast_group.in;
1466 *salen = sizeof(sa->in);
1467 }
1468}
1469
1470
1471
1472
1473static int make_send_sock(struct netns_ipvs *ipvs, int id,
1474 struct net_device *dev, struct socket **sock_ret)
1475{
1476
1477 union ipvs_sockaddr mcast_addr;
1478 struct socket *sock;
1479 int result, salen;
1480
1481
1482 result = sock_create_kern(ipvs->net, ipvs->mcfg.mcast_af, SOCK_DGRAM,
1483 IPPROTO_UDP, &sock);
1484 if (result < 0) {
1485 pr_err("Error during creation of socket; terminating\n");
1486 goto error;
1487 }
1488 *sock_ret = sock;
1489 result = set_mcast_if(sock->sk, dev);
1490 if (result < 0) {
1491 pr_err("Error setting outbound mcast interface\n");
1492 goto error;
1493 }
1494
1495 set_mcast_loop(sock->sk, 0);
1496 set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl);
1497
1498 set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT);
1499 result = sysctl_sync_sock_size(ipvs);
1500 if (result > 0)
1501 set_sock_size(sock->sk, 1, result);
1502
1503 if (AF_INET == ipvs->mcfg.mcast_af)
1504 result = bind_mcastif_addr(sock, dev);
1505 else
1506 result = 0;
1507 if (result < 0) {
1508 pr_err("Error binding address of the mcast interface\n");
1509 goto error;
1510 }
1511
1512 get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id);
1513 result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
1514 salen, 0);
1515 if (result < 0) {
1516 pr_err("Error connecting to the multicast addr\n");
1517 goto error;
1518 }
1519
1520 return 0;
1521
1522error:
1523 return result;
1524}
1525
1526
1527
1528
1529
1530static int make_receive_sock(struct netns_ipvs *ipvs, int id,
1531 struct net_device *dev, struct socket **sock_ret)
1532{
1533
1534 union ipvs_sockaddr mcast_addr;
1535 struct socket *sock;
1536 int result, salen;
1537
1538
1539 result = sock_create_kern(ipvs->net, ipvs->bcfg.mcast_af, SOCK_DGRAM,
1540 IPPROTO_UDP, &sock);
1541 if (result < 0) {
1542 pr_err("Error during creation of socket; terminating\n");
1543 goto error;
1544 }
1545 *sock_ret = sock;
1546
1547 sock->sk->sk_reuse = SK_CAN_REUSE;
1548 result = sysctl_sync_sock_size(ipvs);
1549 if (result > 0)
1550 set_sock_size(sock->sk, 0, result);
1551
1552 get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id);
1553 sock->sk->sk_bound_dev_if = dev->ifindex;
1554 result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen);
1555 if (result < 0) {
1556 pr_err("Error binding to the multicast addr\n");
1557 goto error;
1558 }
1559
1560
1561#ifdef CONFIG_IP_VS_IPV6
1562 if (ipvs->bcfg.mcast_af == AF_INET6)
1563 result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr,
1564 dev);
1565 else
1566#endif
1567 result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr,
1568 dev);
1569 if (result < 0) {
1570 pr_err("Error joining to the multicast group\n");
1571 goto error;
1572 }
1573
1574 return 0;
1575
1576error:
1577 return result;
1578}
1579
1580
1581static int
1582ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
1583{
1584 struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
1585 struct kvec iov;
1586 int len;
1587
1588 EnterFunction(7);
1589 iov.iov_base = (void *)buffer;
1590 iov.iov_len = length;
1591
1592 len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
1593
1594 LeaveFunction(7);
1595 return len;
1596}
1597
1598static int
1599ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
1600{
1601 int msize;
1602 int ret;
1603
1604 msize = ntohs(msg->size);
1605
1606 ret = ip_vs_send_async(sock, (char *)msg, msize);
1607 if (ret >= 0 || ret == -EAGAIN)
1608 return ret;
1609 pr_err("ip_vs_send_async error %d\n", ret);
1610 return 0;
1611}
1612
1613static int
1614ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
1615{
1616 struct msghdr msg = {NULL,};
1617 struct kvec iov = {buffer, buflen};
1618 int len;
1619
1620 EnterFunction(7);
1621
1622
1623 iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, buflen);
1624 len = sock_recvmsg(sock, &msg, MSG_DONTWAIT);
1625 if (len < 0)
1626 return len;
1627
1628 LeaveFunction(7);
1629 return len;
1630}
1631
1632
1633static void master_wakeup_work_handler(struct work_struct *work)
1634{
1635 struct ipvs_master_sync_state *ms =
1636 container_of(work, struct ipvs_master_sync_state,
1637 master_wakeup_work.work);
1638 struct netns_ipvs *ipvs = ms->ipvs;
1639
1640 spin_lock_bh(&ipvs->sync_lock);
1641 if (ms->sync_queue_len &&
1642 ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) {
1643 int id = (int)(ms - ipvs->ms);
1644
1645 ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE;
1646 wake_up_process(ipvs->master_tinfo[id].task);
1647 }
1648 spin_unlock_bh(&ipvs->sync_lock);
1649}
1650
1651
1652static inline struct ip_vs_sync_buff *
1653next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
1654{
1655 struct ip_vs_sync_buff *sb;
1656
1657 sb = sb_dequeue(ipvs, ms);
1658 if (sb)
1659 return sb;
1660
1661 return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME);
1662}
1663
/*
 * Kernel thread body for one master sync thread.
 *
 * @data is the struct ip_vs_sync_thread_data of this thread; its id
 * selects the master state in ipvs->ms and its sock is used for sending.
 *
 * The thread repeatedly fetches the next sync buffer, sends it and
 * releases it, until kthread_should_stop() becomes true. Before exiting
 * it releases the buffer in flight, everything left in the queue and the
 * current buffer. Always returns 0.
 */
static int sync_thread_master(void *data)
{
	struct ip_vs_sync_thread_data *tinfo = data;
	struct netns_ipvs *ipvs = tinfo->ipvs;
	struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id];
	struct sock *sk = tinfo->sock->sk;
	struct ip_vs_sync_buff *sb;

	pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
		"syncid = %d, id = %d\n",
		ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id);

	for (;;) {
		sb = next_sync_buff(ipvs, ms);
		/* sb may be non-NULL here; it is released after "done" */
		if (unlikely(kthread_should_stop()))
			break;
		if (!sb) {
			/*
			 * Nothing to send: sleep up to IPVS_SYNC_CHECK_PERIOD.
			 * NOTE(review): schedule_timeout() only sleeps if the
			 * task state was changed beforehand; presumably
			 * sb_dequeue()/get_curr_sync_buff() do that - confirm.
			 */
			schedule_timeout(IPVS_SYNC_CHECK_PERIOD);
			continue;
		}
		/*
		 * ip_vs_send_sync_msg() returns a negative value only for
		 * -EAGAIN, so this loop retries the same buffer until it is
		 * sent (or dropped on another error) or the thread is stopped.
		 */
		while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) {
			/*
			 * Sleep interruptibly until the socket is writeable
			 * again or we are asked to stop.
			 */
			__wait_event_interruptible(*sk_sleep(sk),
						   sock_writeable(sk) ||
						   kthread_should_stop());
			if (unlikely(kthread_should_stop()))
				goto done;
		}
		ip_vs_sync_buff_release(sb);
	}

done:
	__set_current_state(TASK_RUNNING);
	/* Release the buffer we were holding when asked to stop, if any */
	if (sb)
		ip_vs_sync_buff_release(sb);

	/* Drain and release every buffer still in the sync queue */
	while ((sb = sb_dequeue(ipvs, ms)))
		ip_vs_sync_buff_release(sb);
	/*
	 * NOTE(review): the state is reset a second time, presumably because
	 * sb_dequeue() may change it when the queue is empty - confirm.
	 */
	__set_current_state(TASK_RUNNING);

	/* Release the current (partially filled) sync buffer, if any */
	sb = get_curr_sync_buff(ipvs, ms, 0);
	if (sb)
		ip_vs_sync_buff_release(sb);

	return 0;
}
1714
1715
1716static int sync_thread_backup(void *data)
1717{
1718 struct ip_vs_sync_thread_data *tinfo = data;
1719 struct netns_ipvs *ipvs = tinfo->ipvs;
1720 int len;
1721
1722 pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
1723 "syncid = %d, id = %d\n",
1724 ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id);
1725
1726 while (!kthread_should_stop()) {
1727 wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
1728 !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
1729 || kthread_should_stop());
1730
1731
1732 while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
1733 len = ip_vs_receive(tinfo->sock, tinfo->buf,
1734 ipvs->bcfg.sync_maxlen);
1735 if (len <= 0) {
1736 if (len != -EAGAIN)
1737 pr_err("receiving message error\n");
1738 break;
1739 }
1740
1741 ip_vs_process_message(ipvs, tinfo->buf, len);
1742 }
1743 }
1744
1745 return 0;
1746}
1747
1748
/*
 * Start the sync daemon threads for @state (IP_VS_STATE_MASTER or
 * IP_VS_STATE_BACKUP) in @ipvs using configuration @c.
 *
 * Unset fields of @c are filled in with defaults in place, and the result
 * is copied to ipvs->mcfg (master) or ipvs->bcfg (backup). One socket and
 * one kthread are created per sync port.
 *
 * Returns 0 on success. On failure returns a negative errno:
 * -ENOPROTOOPT if ip_vs_use_count_inc() fails, -ENODEV for an unknown
 * interface, -EEXIST if this kind of daemon already exists, -EINVAL for
 * any other @state, -ENOMEM on allocation failure, or the error from
 * socket setup / kthread_run(). Every failure path undoes what was set
 * up and calls ip_vs_use_count_dec().
 */
int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
		      int state)
{
	struct ip_vs_sync_thread_data *ti = NULL, *tinfo;
	struct task_struct *task;
	struct net_device *dev;
	char *name;
	int (*threadfn)(void *data);
	int id = 0, count, hlen;
	int result = -ENOMEM;
	u16 mtu, min_mtu;

	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n",
		  sizeof(struct ip_vs_sync_conn_v0));

	/* Take a use-count reference; dropped again on every error path */
	if (!ip_vs_use_count_inc())
		return -ENOPROTOOPT;

	/*
	 * Acquire both RTNL and sync_mutex without ever blocking on one
	 * while holding the other: block on one, trylock the other, and on
	 * failure drop the first and retry in the opposite order.
	 */
	for (;;) {
		rtnl_lock();
		if (mutex_trylock(&ipvs->sync_mutex))
			break;
		rtnl_unlock();
		mutex_lock(&ipvs->sync_mutex);
		if (rtnl_trylock())
			break;
		mutex_unlock(&ipvs->sync_mutex);
	}

	/*
	 * The first daemon to start fixes the thread count from the
	 * sync_ports sysctl (clamped to 1..IPVS_SYNC_PORTS_MAX) and stores
	 * it as a mask; a daemon started later reuses that count.
	 */
	if (!ipvs->sync_state) {
		count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX);
		ipvs->threads_mask = count - 1;
	} else
		count = ipvs->threads_mask + 1;

	/* Fill in defaults for anything the caller left unset in @c */
	if (c->mcast_af == AF_UNSPEC) {
		c->mcast_af = AF_INET;
		c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP);
	}
	if (!c->mcast_port)
		c->mcast_port = IP_VS_SYNC_PORT;
	if (!c->mcast_ttl)
		c->mcast_ttl = 1;

	dev = __dev_get_by_name(ipvs->net, c->mcast_ifn);
	if (!dev) {
		pr_err("Unknown mcast interface: %s\n", c->mcast_ifn);
		result = -ENODEV;
		goto out_early;
	}
	/* IP (v4 or v6) plus UDP header size for the chosen family */
	hlen = (AF_INET6 == c->mcast_af) ?
	       sizeof(struct ipv6hdr) + sizeof(struct udphdr) :
	       sizeof(struct iphdr) + sizeof(struct udphdr);
	/*
	 * Backup: device MTU clamped to 1500..65535, minimum length 1024.
	 * Master: fixed 1500, minimum length 1.
	 */
	mtu = (state == IP_VS_STATE_BACKUP) ?
		  clamp(dev->mtu, 1500U, 65535U) : 1500U;
	min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1;

	/*
	 * A caller-supplied sync_maxlen is clamped to
	 * min_mtu..(65535 - hlen); otherwise it defaults to mtu - hlen.
	 */
	if (c->sync_maxlen)
		c->sync_maxlen = clamp_t(unsigned int,
					 c->sync_maxlen, min_mtu,
					 65535 - hlen);
	else
		c->sync_maxlen = mtu - hlen;

	/* Refuse a second daemon of the same kind, then record the config */
	if (state == IP_VS_STATE_MASTER) {
		result = -EEXIST;
		if (ipvs->ms)
			goto out_early;

		ipvs->mcfg = *c;
		name = "ipvs-m:%d:%d";
		threadfn = sync_thread_master;
	} else if (state == IP_VS_STATE_BACKUP) {
		result = -EEXIST;
		if (ipvs->backup_tinfo)
			goto out_early;

		ipvs->bcfg = *c;
		name = "ipvs-b:%d:%d";
		threadfn = sync_thread_backup;
	} else {
		result = -EINVAL;
		goto out_early;
	}

	/* Master only: allocate and initialise one sync state per thread */
	if (state == IP_VS_STATE_MASTER) {
		struct ipvs_master_sync_state *ms;

		result = -ENOMEM;
		ipvs->ms = kcalloc(count, sizeof(ipvs->ms[0]), GFP_KERNEL);
		if (!ipvs->ms)
			goto out;
		ms = ipvs->ms;
		for (id = 0; id < count; id++, ms++) {
			INIT_LIST_HEAD(&ms->sync_queue);
			ms->sync_queue_len = 0;
			ms->sync_queue_delay = 0;
			INIT_DELAYED_WORK(&ms->master_wakeup_work,
					  master_wakeup_work_handler);
			ms->ipvs = ipvs;
		}
	}
	result = -ENOMEM;
	/* Zeroed array, so unused task/sock/buf slots are NULL at cleanup */
	ti = kcalloc(count, sizeof(struct ip_vs_sync_thread_data),
		     GFP_KERNEL);
	if (!ti)
		goto out;

	/* Per thread: receive buffer (backup only), socket, then kthread */
	for (id = 0; id < count; id++) {
		tinfo = &ti[id];
		tinfo->ipvs = ipvs;
		if (state == IP_VS_STATE_BACKUP) {
			result = -ENOMEM;
			tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen,
					     GFP_KERNEL);
			if (!tinfo->buf)
				goto out;
		}
		tinfo->id = id;
		if (state == IP_VS_STATE_MASTER)
			result = make_send_sock(ipvs, id, dev, &tinfo->sock);
		else
			result = make_receive_sock(ipvs, id, dev, &tinfo->sock);
		if (result < 0)
			goto out;

		task = kthread_run(threadfn, tinfo, name, ipvs->gen, id);
		if (IS_ERR(task)) {
			result = PTR_ERR(task);
			goto out;
		}
		tinfo->task = task;
	}

	/* All threads are running: publish the array and mark as active */

	if (state == IP_VS_STATE_MASTER)
		ipvs->master_tinfo = ti;
	else
		ipvs->backup_tinfo = ti;
	spin_lock_bh(&ipvs->sync_buff_lock);
	ipvs->sync_state |= state;
	spin_unlock_bh(&ipvs->sync_buff_lock);

	mutex_unlock(&ipvs->sync_mutex);
	rtnl_unlock();

	return 0;

out:
	/*
	 * RTNL is dropped first, before any socket is released below.
	 * NOTE(review): presumably sock_release() may need RTNL itself to
	 * leave the multicast group - confirm.
	 */
	rtnl_unlock();
	/*
	 * id can equal count here (e.g. after the master state init loop
	 * when allocating ti fails); clamp it so ti + id stays in the array.
	 */
	id = min(id, count - 1);
	/* Stop the threads that were started, newest first */
	if (ti) {
		for (tinfo = ti + id; tinfo >= ti; tinfo--) {
			if (tinfo->task)
				kthread_stop(tinfo->task);
		}
	}
	/* Free the master states unless a master daemon is marked active */
	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
		kfree(ipvs->ms);
		ipvs->ms = NULL;
	}
	mutex_unlock(&ipvs->sync_mutex);

	/* Both locks are released: free sockets, buffers and the array */
	if (ti) {
		for (tinfo = ti + id; tinfo >= ti; tinfo--) {
			if (tinfo->sock)
				sock_release(tinfo->sock);
			kfree(tinfo->buf);
		}
		kfree(ti);
	}

	/* Drop the use-count reference taken at the top */
	ip_vs_use_count_dec();
	return result;

out_early:
	/* Nothing was allocated yet: just drop both locks */
	mutex_unlock(&ipvs->sync_mutex);
	rtnl_unlock();

	/* Drop the use-count reference taken at the top */
	ip_vs_use_count_dec();
	return result;
}
1941
1942
/*
 * Stop the sync daemon threads of kind @state (IP_VS_STATE_MASTER or
 * IP_VS_STATE_BACKUP) in @ipvs and free their resources.
 *
 * Returns -EINVAL for any other @state and -ESRCH when no such daemon
 * exists (master: ipvs->ms is NULL; backup: ipvs->backup_tinfo is NULL).
 * Otherwise returns 0, or the first negative value returned by
 * kthread_stop() for one of the threads. ip_vs_use_count_dec() is
 * called only when a daemon was actually stopped.
 */
int stop_sync_thread(struct netns_ipvs *ipvs, int state)
{
	struct ip_vs_sync_thread_data *ti, *tinfo;
	int id;
	int retc = -EINVAL;

	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));

	mutex_lock(&ipvs->sync_mutex);
	if (state == IP_VS_STATE_MASTER) {
		retc = -ESRCH;
		if (!ipvs->ms)
			goto err;
		ti = ipvs->master_tinfo;

		/*
		 * Clear the MASTER bit while holding both sync_buff_lock
		 * and sync_lock.
		 * NOTE(review): presumably this synchronizes with the code
		 * that queues sync buffers, so nothing is queued once the
		 * master daemon is being stopped - confirm.
		 */
		spin_lock_bh(&ipvs->sync_buff_lock);
		spin_lock(&ipvs->sync_lock);
		ipvs->sync_state &= ~IP_VS_STATE_MASTER;
		spin_unlock(&ipvs->sync_lock);
		spin_unlock_bh(&ipvs->sync_buff_lock);

		retc = 0;
		/* Stop threads from the highest id down to 0 */
		for (id = ipvs->threads_mask; id >= 0; id--) {
			struct ipvs_master_sync_state *ms = &ipvs->ms[id];
			int ret;

			tinfo = &ti[id];
			pr_info("stopping master sync thread %d ...\n",
				task_pid_nr(tinfo->task));
			/* Cancel the wakeup work before stopping its thread */
			cancel_delayed_work_sync(&ms->master_wakeup_work);
			ret = kthread_stop(tinfo->task);
			/* Keep the first negative result; later ones are lost */
			if (retc >= 0)
				retc = ret;
		}
		kfree(ipvs->ms);
		ipvs->ms = NULL;
		ipvs->master_tinfo = NULL;
	} else if (state == IP_VS_STATE_BACKUP) {
		retc = -ESRCH;
		if (!ipvs->backup_tinfo)
			goto err;
		ti = ipvs->backup_tinfo;

		ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
		retc = 0;
		/* Stop threads from the highest id down to 0 */
		for (id = ipvs->threads_mask; id >= 0; id--) {
			int ret;

			tinfo = &ti[id];
			pr_info("stopping backup sync thread %d ...\n",
				task_pid_nr(tinfo->task));
			ret = kthread_stop(tinfo->task);
			/* Keep the first negative result; later ones are lost */
			if (retc >= 0)
				retc = ret;
		}
		ipvs->backup_tinfo = NULL;
	} else {
		goto err;
	}
	/* Read the thread count while sync_mutex is still held */
	id = ipvs->threads_mask;
	mutex_unlock(&ipvs->sync_mutex);

	/* The mutex is released: free sockets, buffers and the array */
	for (tinfo = ti + id; tinfo >= ti; tinfo--) {
		if (tinfo->sock)
			sock_release(tinfo->sock);
		kfree(tinfo->buf);
	}
	kfree(ti);

	/* Drop the use-count reference held for the running daemon */
	ip_vs_use_count_dec();
	return retc;

err:
	mutex_unlock(&ipvs->sync_mutex);
	return retc;
}
2027
2028
2029
2030
2031int __net_init ip_vs_sync_net_init(struct netns_ipvs *ipvs)
2032{
2033 __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key);
2034 spin_lock_init(&ipvs->sync_lock);
2035 spin_lock_init(&ipvs->sync_buff_lock);
2036 return 0;
2037}
2038
2039void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs)
2040{
2041 int retc;
2042
2043 retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER);
2044 if (retc && retc != -ESRCH)
2045 pr_err("Failed to stop Master Daemon\n");
2046
2047 retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP);
2048 if (retc && retc != -ESRCH)
2049 pr_err("Failed to stop Backup Daemon\n");
2050}
2051