1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
/*
 * Log prefix for this file: pr_fmt() prepends KMSG_COMPONENT ": " to the
 * format string it is given.  Defined ahead of the #include lines.
 */
#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
36
37#include <linux/module.h>
38#include <linux/slab.h>
39#include <linux/inetdevice.h>
40#include <linux/net.h>
41#include <linux/completion.h>
42#include <linux/delay.h>
43#include <linux/skbuff.h>
44#include <linux/in.h>
45#include <linux/igmp.h>
46#include <linux/udp.h>
47#include <linux/err.h>
48#include <linux/kthread.h>
49#include <linux/wait.h>
50#include <linux/kernel.h>
51
52#include <asm/unaligned.h>
53
54#include <net/ip.h>
55#include <net/sock.h>
56
57#include <net/ip_vs.h>
58
/* Multicast group as a 32-bit value: 0xe0000051 is 224.0.0.81 */
#define IP_VS_SYNC_GROUP 0xe0000051
/* UDP port number for sync traffic */
#define IP_VS_SYNC_PORT 8848

/* Value stored in ip_vs_sync_mesg.version for the current message format */
#define SYNC_PROTO_VER 1

/* Lock class key; not referenced in this part of the file */
static struct lock_class_key __ipvs_sync_key;
65
66
67
68
/*
 * Version 0 wire record describing one connection.
 * Only IPv4 addresses fit (32-bit address fields); ip_vs_sync_conn_v0()
 * refuses connections whose af is not AF_INET.
 */
struct ip_vs_sync_conn_v0 {
	__u8 reserved;		/* written as 0 by the sender */

	/* connection identity, copied verbatim from struct ip_vs_conn */
	__u8 protocol;
	__be16 cport;
	__be16 vport;
	__be16 dport;
	__be32 caddr;
	__be32 vaddr;
	__be32 daddr;

	/* 16-bit, network byte order (htons() on send, ntohs() on receive) */
	__be16 flags;		/* cp->flags without IP_VS_CONN_F_HASHED */
	__be16 state;		/* cp->state */

	/* a struct ip_vs_sync_conn_options follows when the connection
	 * has any IP_VS_CONN_F_SEQ_MASK flag set
	 */
};
87
/*
 * Sequence-adjustment data carried with a connection that has an
 * IP_VS_CONN_F_SEQ_MASK flag set: the in_seq/out_seq pair of the connection.
 */
struct ip_vs_sync_conn_options {
	struct ip_vs_seq in_seq;
	struct ip_vs_seq out_seq;
};
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
/*
 * Version 1 wire record for an IPv4 connection.  Multi-byte fields are
 * in network byte order.  Optional type/length/value parameters may
 * follow the fixed part (see ip_vs_proc_sync_conn()).
 */
struct ip_vs_sync_v4 {
	__u8 type;		/* 0 for IPv4; STYPE_F_INET6 marks a v6 record */
	__u8 protocol;
	__be16 ver_size;	/* low 12 bits (SVER_MASK): record size;
				 * bits above SVER_SHIFT: record version */

	__be32 flags;		/* cp->flags without IP_VS_CONN_F_HASHED */
	__be16 state;

	__be16 cport;
	__be16 vport;
	__be16 dport;
	__be32 fwmark;
	__be32 timeout;		/* sender stores cp->timeout / HZ */
	__be32 caddr;
	__be32 vaddr;
	__be32 daddr;

	/* optional parameters follow */
};
150
151
152
/*
 * Version 1 wire record for an IPv6 connection.  Identical to
 * struct ip_vs_sync_v4 up to and including @timeout, so the shared
 * leading fields can be accessed through either union member.
 */
struct ip_vs_sync_v6 {
	__u8 type;		/* has STYPE_F_INET6 set */
	__u8 protocol;
	__be16 ver_size;	/* low 12 bits: record size; upper bits: version */

	__be32 flags;
	__be16 state;

	__be16 cport;
	__be16 vport;
	__be16 dport;
	__be32 fwmark;
	__be32 timeout;		/* sender stores cp->timeout / HZ */
	struct in6_addr caddr;
	struct in6_addr vaddr;
	struct in6_addr daddr;

	/* optional parameters follow */
};
172
/* One version 1 connection record, viewed as either address family */
union ip_vs_sync_conn {
	struct ip_vs_sync_v4 v4;
	struct ip_vs_sync_v6 v6;
};
177
178
/* Bits in the 'type' field of a version 1 record */
#define STYPE_INET6 0
#define STYPE_F_INET6 (1 << STYPE_INET6)

/* ver_size layout: size in the low 12 bits, version in the bits above */
#define SVER_SHIFT 12
#define SVER_MASK 0x0fff

/* Option type codes used in the type/length/value list after a record */
#define IPVS_OPT_SEQ_DATA 1
#define IPVS_OPT_PE_DATA 2
#define IPVS_OPT_PE_NAME 3
#define IPVS_OPT_PARAM 7

/*
 * Bit form of the codes above; used to remember which options were
 * already seen while parsing.  IPVS_OPT_F_PARAM is also tested on the
 * option type byte: an unknown option without that bit is treated as
 * mandatory and the record is dropped (see ip_vs_proc_sync_conn()).
 */
#define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1))
#define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1))
#define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1))
#define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1))
194
/*
 * Per-thread context for a sync thread.
 * NOTE(review): not used in this part of the file; field meanings below
 * are inferred from names and types -- confirm against the thread code.
 */
struct ip_vs_sync_thread_data {
	struct netns_ipvs *ipvs;	/* owning IPVS instance */
	struct socket *sock;		/* presumably the thread's socket */
	char *buf;			/* presumably a receive buffer */
	int id;				/* presumably the thread index */
};
201
202
/* Size of a version 0 record without / with the trailing seq options */
#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0))
#define FULL_CONN_SIZE \
(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
/*
 * NOTE(review): neither macro is referenced in this part of the file.
 * 4 equals the packed size of the fields of struct ip_vs_sync_mesg_v0;
 * 255 is the largest value a __u8 nr_conns counter can hold.
 */
#define SYNC_MESG_HEADER_LEN 4
#define MAX_CONNS_PER_SYNCBUFF 255
244
245
/* Version 0 message header; ip_vs_sync_conn_v0 records follow it */
struct ip_vs_sync_mesg_v0 {
	__u8 nr_conns;		/* number of records in the message */
	__u8 syncid;		/* sender's sync id */
	__be16 size;		/* total message size, network byte order */

	/* ip_vs_sync_conn_v0 entries start here */
};
253
254
/*
 * Version 1 message header.  The first byte occupies the position of
 * nr_conns in the version 0 header; the receiver treats a message as
 * version 1 only when reserved == 0, version == SYNC_PROTO_VER and
 * spare == 0, and otherwise parses it as version 0.
 */
struct ip_vs_sync_mesg {
	__u8 reserved;		/* must be zero */
	__u8 syncid;		/* sender's sync id */
	__be16 size;		/* total message size, network byte order */
	__u8 nr_conns;		/* number of records in the message */
	__s8 version;		/* SYNC_PROTO_VER */
	__u16 spare;		/* must be zero */
	/* ip_vs_sync_conn entries start here */
};
264
/* Socket address storage large enough for either IPv4 or IPv6 */
union ipvs_sockaddr {
	struct sockaddr_in in;
	struct sockaddr_in6 in6;
};
269
/* A sync message under construction or queued for sending */
struct ip_vs_sync_buff {
	struct list_head list;		/* link in ms->sync_queue */
	unsigned long firstuse;		/* jiffies when the buffer was created */

	/* pointers into the kmalloc()ed message area */
	struct ip_vs_sync_mesg *mesg;	/* start of the message (header) */
	unsigned char *head;		/* where the next record is written */
	unsigned char *end;		/* one past the end of the allocation */
};
279
280
281
282
283
284static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho)
285{
286 ho->init_seq = get_unaligned_be32(&no->init_seq);
287 ho->delta = get_unaligned_be32(&no->delta);
288 ho->previous_delta = get_unaligned_be32(&no->previous_delta);
289}
290
291
292
293
294
295static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
296{
297 put_unaligned_be32(ho->init_seq, &no->init_seq);
298 put_unaligned_be32(ho->delta, &no->delta);
299 put_unaligned_be32(ho->previous_delta, &no->previous_delta);
300}
301
302static inline struct ip_vs_sync_buff *
303sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
304{
305 struct ip_vs_sync_buff *sb;
306
307 spin_lock_bh(&ipvs->sync_lock);
308 if (list_empty(&ms->sync_queue)) {
309 sb = NULL;
310 __set_current_state(TASK_INTERRUPTIBLE);
311 } else {
312 sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff,
313 list);
314 list_del(&sb->list);
315 ms->sync_queue_len--;
316 if (!ms->sync_queue_len)
317 ms->sync_queue_delay = 0;
318 }
319 spin_unlock_bh(&ipvs->sync_lock);
320
321 return sb;
322}
323
324
325
326
327static inline struct ip_vs_sync_buff *
328ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len)
329{
330 struct ip_vs_sync_buff *sb;
331
332 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
333 return NULL;
334
335 len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg),
336 ipvs->mcfg.sync_maxlen);
337 sb->mesg = kmalloc(len, GFP_ATOMIC);
338 if (!sb->mesg) {
339 kfree(sb);
340 return NULL;
341 }
342 sb->mesg->reserved = 0;
343 sb->mesg->version = SYNC_PROTO_VER;
344 sb->mesg->syncid = ipvs->mcfg.syncid;
345 sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg));
346 sb->mesg->nr_conns = 0;
347 sb->mesg->spare = 0;
348 sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
349 sb->end = (unsigned char *)sb->mesg + len;
350
351 sb->firstuse = jiffies;
352 return sb;
353}
354
355static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
356{
357 kfree(sb->mesg);
358 kfree(sb);
359}
360
361static inline void sb_queue_tail(struct netns_ipvs *ipvs,
362 struct ipvs_master_sync_state *ms)
363{
364 struct ip_vs_sync_buff *sb = ms->sync_buff;
365
366 spin_lock(&ipvs->sync_lock);
367 if (ipvs->sync_state & IP_VS_STATE_MASTER &&
368 ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) {
369 if (!ms->sync_queue_len)
370 schedule_delayed_work(&ms->master_wakeup_work,
371 max(IPVS_SYNC_SEND_DELAY, 1));
372 ms->sync_queue_len++;
373 list_add_tail(&sb->list, &ms->sync_queue);
374 if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE)
375 wake_up_process(ms->master_thread);
376 } else
377 ip_vs_sync_buff_release(sb);
378 spin_unlock(&ipvs->sync_lock);
379}
380
381
382
383
384
385static inline struct ip_vs_sync_buff *
386get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms,
387 unsigned long time)
388{
389 struct ip_vs_sync_buff *sb;
390
391 spin_lock_bh(&ipvs->sync_buff_lock);
392 sb = ms->sync_buff;
393 if (sb && time_after_eq(jiffies - sb->firstuse, time)) {
394 ms->sync_buff = NULL;
395 __set_current_state(TASK_RUNNING);
396 } else
397 sb = NULL;
398 spin_unlock_bh(&ipvs->sync_buff_lock);
399 return sb;
400}
401
402static inline int
403select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp)
404{
405 return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask;
406}
407
408
409
410
411static inline struct ip_vs_sync_buff *
412ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len)
413{
414 struct ip_vs_sync_buff *sb;
415 struct ip_vs_sync_mesg_v0 *mesg;
416
417 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
418 return NULL;
419
420 len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0),
421 ipvs->mcfg.sync_maxlen);
422 sb->mesg = kmalloc(len, GFP_ATOMIC);
423 if (!sb->mesg) {
424 kfree(sb);
425 return NULL;
426 }
427 mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
428 mesg->nr_conns = 0;
429 mesg->syncid = ipvs->mcfg.syncid;
430 mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0));
431 sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0);
432 sb->end = (unsigned char *)mesg + len;
433 sb->firstuse = jiffies;
434 return sb;
435}
436
437
438static inline bool in_persistence(struct ip_vs_conn *cp)
439{
440 for (cp = cp->control; cp; cp = cp->control) {
441 if (cp->flags & IP_VS_CONN_F_TEMPLATE)
442 return true;
443 }
444 return false;
445}
446
447
448
449
450
451
452
453
454
455
456static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs,
457 struct ip_vs_conn *cp, int pkts)
458{
459 unsigned long orig = ACCESS_ONCE(cp->sync_endtime);
460 unsigned long now = jiffies;
461 unsigned long n = (now + cp->timeout) & ~3UL;
462 unsigned int sync_refresh_period;
463 int sync_period;
464 int force;
465
466
467 if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE))
468 force = 0;
469 else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp)))
470 return 0;
471 else if (likely(cp->protocol == IPPROTO_TCP)) {
472 if (!((1 << cp->state) &
473 ((1 << IP_VS_TCP_S_ESTABLISHED) |
474 (1 << IP_VS_TCP_S_FIN_WAIT) |
475 (1 << IP_VS_TCP_S_CLOSE) |
476 (1 << IP_VS_TCP_S_CLOSE_WAIT) |
477 (1 << IP_VS_TCP_S_TIME_WAIT))))
478 return 0;
479 force = cp->state != cp->old_state;
480 if (force && cp->state != IP_VS_TCP_S_ESTABLISHED)
481 goto set;
482 } else if (unlikely(cp->protocol == IPPROTO_SCTP)) {
483 if (!((1 << cp->state) &
484 ((1 << IP_VS_SCTP_S_ESTABLISHED) |
485 (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) |
486 (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) |
487 (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) |
488 (1 << IP_VS_SCTP_S_CLOSED))))
489 return 0;
490 force = cp->state != cp->old_state;
491 if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED)
492 goto set;
493 } else {
494
495 force = 0;
496 }
497
498 sync_refresh_period = sysctl_sync_refresh_period(ipvs);
499 if (sync_refresh_period > 0) {
500 long diff = n - orig;
501 long min_diff = max(cp->timeout >> 1, 10UL * HZ);
502
503
504
505
506 if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) {
507 int retries = orig & 3;
508
509 if (retries >= sysctl_sync_retries(ipvs))
510 return 0;
511 if (time_before(now, orig - cp->timeout +
512 (sync_refresh_period >> 3)))
513 return 0;
514 n |= retries + 1;
515 }
516 }
517 sync_period = sysctl_sync_period(ipvs);
518 if (sync_period > 0) {
519 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) &&
520 pkts % sync_period != sysctl_sync_threshold(ipvs))
521 return 0;
522 } else if (sync_refresh_period <= 0 &&
523 pkts != sysctl_sync_threshold(ipvs))
524 return 0;
525
526set:
527 cp->old_state = cp->state;
528 n = cmpxchg(&cp->sync_endtime, orig, n);
529 return n == orig || force;
530}
531
532
533
534
535
536static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp,
537 int pkts)
538{
539 struct ip_vs_sync_mesg_v0 *m;
540 struct ip_vs_sync_conn_v0 *s;
541 struct ip_vs_sync_buff *buff;
542 struct ipvs_master_sync_state *ms;
543 int id;
544 unsigned int len;
545
546 if (unlikely(cp->af != AF_INET))
547 return;
548
549 if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
550 return;
551
552 if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
553 return;
554
555 spin_lock_bh(&ipvs->sync_buff_lock);
556 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
557 spin_unlock_bh(&ipvs->sync_buff_lock);
558 return;
559 }
560
561 id = select_master_thread_id(ipvs, cp);
562 ms = &ipvs->ms[id];
563 buff = ms->sync_buff;
564 len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
565 SIMPLE_CONN_SIZE;
566 if (buff) {
567 m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
568
569 if (buff->head + len > buff->end || !m->nr_conns) {
570 sb_queue_tail(ipvs, ms);
571 ms->sync_buff = NULL;
572 buff = NULL;
573 }
574 }
575 if (!buff) {
576 buff = ip_vs_sync_buff_create_v0(ipvs, len);
577 if (!buff) {
578 spin_unlock_bh(&ipvs->sync_buff_lock);
579 pr_err("ip_vs_sync_buff_create failed.\n");
580 return;
581 }
582 ms->sync_buff = buff;
583 }
584
585 m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
586 s = (struct ip_vs_sync_conn_v0 *) buff->head;
587
588
589 s->reserved = 0;
590 s->protocol = cp->protocol;
591 s->cport = cp->cport;
592 s->vport = cp->vport;
593 s->dport = cp->dport;
594 s->caddr = cp->caddr.ip;
595 s->vaddr = cp->vaddr.ip;
596 s->daddr = cp->daddr.ip;
597 s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
598 s->state = htons(cp->state);
599 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
600 struct ip_vs_sync_conn_options *opt =
601 (struct ip_vs_sync_conn_options *)&s[1];
602 memcpy(opt, &cp->in_seq, sizeof(*opt));
603 }
604
605 m->nr_conns++;
606 m->size = htons(ntohs(m->size) + len);
607 buff->head += len;
608 spin_unlock_bh(&ipvs->sync_buff_lock);
609
610
611 cp = cp->control;
612 if (cp) {
613 if (cp->flags & IP_VS_CONN_F_TEMPLATE)
614 pkts = atomic_add_return(1, &cp->in_pkts);
615 else
616 pkts = sysctl_sync_threshold(ipvs);
617 ip_vs_sync_conn(ipvs, cp, pkts);
618 }
619}
620
621
622
623
624
625
626void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts)
627{
628 struct ip_vs_sync_mesg *m;
629 union ip_vs_sync_conn *s;
630 struct ip_vs_sync_buff *buff;
631 struct ipvs_master_sync_state *ms;
632 int id;
633 __u8 *p;
634 unsigned int len, pe_name_len, pad;
635
636
637 if (sysctl_sync_ver(ipvs) == 0) {
638 ip_vs_sync_conn_v0(ipvs, cp, pkts);
639 return;
640 }
641
642 if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
643 goto control;
644sloop:
645 if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
646 goto control;
647
648
649 pe_name_len = 0;
650 if (cp->pe_data_len) {
651 if (!cp->pe_data || !cp->dest) {
652 IP_VS_ERR_RL("SYNC, connection pe_data invalid\n");
653 return;
654 }
655 pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
656 }
657
658 spin_lock_bh(&ipvs->sync_buff_lock);
659 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
660 spin_unlock_bh(&ipvs->sync_buff_lock);
661 return;
662 }
663
664 id = select_master_thread_id(ipvs, cp);
665 ms = &ipvs->ms[id];
666
667#ifdef CONFIG_IP_VS_IPV6
668 if (cp->af == AF_INET6)
669 len = sizeof(struct ip_vs_sync_v6);
670 else
671#endif
672 len = sizeof(struct ip_vs_sync_v4);
673
674 if (cp->flags & IP_VS_CONN_F_SEQ_MASK)
675 len += sizeof(struct ip_vs_sync_conn_options) + 2;
676
677 if (cp->pe_data_len)
678 len += cp->pe_data_len + 2;
679 if (pe_name_len)
680 len += pe_name_len + 2;
681
682
683 pad = 0;
684 buff = ms->sync_buff;
685 if (buff) {
686 m = buff->mesg;
687 pad = (4 - (size_t) buff->head) & 3;
688
689 if (buff->head + len + pad > buff->end || m->reserved) {
690 sb_queue_tail(ipvs, ms);
691 ms->sync_buff = NULL;
692 buff = NULL;
693 pad = 0;
694 }
695 }
696
697 if (!buff) {
698 buff = ip_vs_sync_buff_create(ipvs, len);
699 if (!buff) {
700 spin_unlock_bh(&ipvs->sync_buff_lock);
701 pr_err("ip_vs_sync_buff_create failed.\n");
702 return;
703 }
704 ms->sync_buff = buff;
705 m = buff->mesg;
706 }
707
708 p = buff->head;
709 buff->head += pad + len;
710 m->size = htons(ntohs(m->size) + pad + len);
711
712 while (pad--)
713 *(p++) = 0;
714
715 s = (union ip_vs_sync_conn *)p;
716
717
718 s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0);
719 s->v4.ver_size = htons(len & SVER_MASK);
720 s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED);
721 s->v4.state = htons(cp->state);
722 s->v4.protocol = cp->protocol;
723 s->v4.cport = cp->cport;
724 s->v4.vport = cp->vport;
725 s->v4.dport = cp->dport;
726 s->v4.fwmark = htonl(cp->fwmark);
727 s->v4.timeout = htonl(cp->timeout / HZ);
728 m->nr_conns++;
729
730#ifdef CONFIG_IP_VS_IPV6
731 if (cp->af == AF_INET6) {
732 p += sizeof(struct ip_vs_sync_v6);
733 s->v6.caddr = cp->caddr.in6;
734 s->v6.vaddr = cp->vaddr.in6;
735 s->v6.daddr = cp->daddr.in6;
736 } else
737#endif
738 {
739 p += sizeof(struct ip_vs_sync_v4);
740 s->v4.caddr = cp->caddr.ip;
741 s->v4.vaddr = cp->vaddr.ip;
742 s->v4.daddr = cp->daddr.ip;
743 }
744 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
745 *(p++) = IPVS_OPT_SEQ_DATA;
746 *(p++) = sizeof(struct ip_vs_sync_conn_options);
747 hton_seq((struct ip_vs_seq *)p, &cp->in_seq);
748 p += sizeof(struct ip_vs_seq);
749 hton_seq((struct ip_vs_seq *)p, &cp->out_seq);
750 p += sizeof(struct ip_vs_seq);
751 }
752
753 if (cp->pe_data_len && cp->pe_data) {
754 *(p++) = IPVS_OPT_PE_DATA;
755 *(p++) = cp->pe_data_len;
756 memcpy(p, cp->pe_data, cp->pe_data_len);
757 p += cp->pe_data_len;
758 if (pe_name_len) {
759
760 *(p++) = IPVS_OPT_PE_NAME;
761 *(p++) = pe_name_len;
762 memcpy(p, cp->pe->name, pe_name_len);
763 p += pe_name_len;
764 }
765 }
766
767 spin_unlock_bh(&ipvs->sync_buff_lock);
768
769control:
770
771 cp = cp->control;
772 if (!cp)
773 return;
774 if (cp->flags & IP_VS_CONN_F_TEMPLATE)
775 pkts = atomic_add_return(1, &cp->in_pkts);
776 else
777 pkts = sysctl_sync_threshold(ipvs);
778 goto sloop;
779}
780
781
782
783
784static inline int
785ip_vs_conn_fill_param_sync(struct netns_ipvs *ipvs, int af, union ip_vs_sync_conn *sc,
786 struct ip_vs_conn_param *p,
787 __u8 *pe_data, unsigned int pe_data_len,
788 __u8 *pe_name, unsigned int pe_name_len)
789{
790#ifdef CONFIG_IP_VS_IPV6
791 if (af == AF_INET6)
792 ip_vs_conn_fill_param(ipvs, af, sc->v6.protocol,
793 (const union nf_inet_addr *)&sc->v6.caddr,
794 sc->v6.cport,
795 (const union nf_inet_addr *)&sc->v6.vaddr,
796 sc->v6.vport, p);
797 else
798#endif
799 ip_vs_conn_fill_param(ipvs, af, sc->v4.protocol,
800 (const union nf_inet_addr *)&sc->v4.caddr,
801 sc->v4.cport,
802 (const union nf_inet_addr *)&sc->v4.vaddr,
803 sc->v4.vport, p);
804
805 if (pe_data_len) {
806 if (pe_name_len) {
807 char buff[IP_VS_PENAME_MAXLEN+1];
808
809 memcpy(buff, pe_name, pe_name_len);
810 buff[pe_name_len]=0;
811 p->pe = __ip_vs_pe_getbyname(buff);
812 if (!p->pe) {
813 IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n",
814 buff);
815 return 1;
816 }
817 } else {
818 IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n");
819 return 1;
820 }
821
822 p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC);
823 if (!p->pe_data) {
824 module_put(p->pe->module);
825 return -ENOMEM;
826 }
827 p->pe_data_len = pe_data_len;
828 }
829 return 0;
830}
831
832
833
834
835
836
837
838static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *param,
839 unsigned int flags, unsigned int state,
840 unsigned int protocol, unsigned int type,
841 const union nf_inet_addr *daddr, __be16 dport,
842 unsigned long timeout, __u32 fwmark,
843 struct ip_vs_sync_conn_options *opt)
844{
845 struct ip_vs_dest *dest;
846 struct ip_vs_conn *cp;
847
848 if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
849 cp = ip_vs_conn_in_get(param);
850 if (cp && ((cp->dport != dport) ||
851 !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) {
852 if (!(flags & IP_VS_CONN_F_INACTIVE)) {
853 ip_vs_conn_expire_now(cp);
854 __ip_vs_conn_put(cp);
855 cp = NULL;
856 } else {
857
858
859
860
861 __ip_vs_conn_put(cp);
862 kfree(param->pe_data);
863 return;
864 }
865 }
866 } else {
867 cp = ip_vs_ct_in_get(param);
868 }
869
870 if (cp) {
871
872 kfree(param->pe_data);
873
874 dest = cp->dest;
875 spin_lock_bh(&cp->lock);
876 if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE &&
877 !(flags & IP_VS_CONN_F_TEMPLATE) && dest) {
878 if (flags & IP_VS_CONN_F_INACTIVE) {
879 atomic_dec(&dest->activeconns);
880 atomic_inc(&dest->inactconns);
881 } else {
882 atomic_inc(&dest->activeconns);
883 atomic_dec(&dest->inactconns);
884 }
885 }
886 flags &= IP_VS_CONN_F_BACKUP_UPD_MASK;
887 flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK;
888 cp->flags = flags;
889 spin_unlock_bh(&cp->lock);
890 if (!dest)
891 ip_vs_try_bind_dest(cp);
892 } else {
893
894
895
896
897
898 rcu_read_lock();
899
900
901
902
903
904 dest = ip_vs_find_dest(ipvs, type, type, daddr, dport,
905 param->vaddr, param->vport, protocol,
906 fwmark, flags);
907
908 cp = ip_vs_conn_new(param, type, daddr, dport, flags, dest,
909 fwmark);
910 rcu_read_unlock();
911 if (!cp) {
912 kfree(param->pe_data);
913 IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
914 return;
915 }
916 if (!(flags & IP_VS_CONN_F_TEMPLATE))
917 kfree(param->pe_data);
918 }
919
920 if (opt)
921 memcpy(&cp->in_seq, opt, sizeof(*opt));
922 atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs));
923 cp->state = state;
924 cp->old_state = cp->state;
925
926
927
928
929
930
931
932
933
934 if (timeout) {
935 if (timeout > MAX_SCHEDULE_TIMEOUT / HZ)
936 timeout = MAX_SCHEDULE_TIMEOUT / HZ;
937 cp->timeout = timeout*HZ;
938 } else {
939 struct ip_vs_proto_data *pd;
940
941 pd = ip_vs_proto_data_get(ipvs, protocol);
942 if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table)
943 cp->timeout = pd->timeout_table[state];
944 else
945 cp->timeout = (3*60*HZ);
946 }
947 ip_vs_conn_put(cp);
948}
949
950
951
952
953static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer,
954 const size_t buflen)
955{
956 struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer;
957 struct ip_vs_sync_conn_v0 *s;
958 struct ip_vs_sync_conn_options *opt;
959 struct ip_vs_protocol *pp;
960 struct ip_vs_conn_param param;
961 char *p;
962 int i;
963
964 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0);
965 for (i=0; i<m->nr_conns; i++) {
966 unsigned int flags, state;
967
968 if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
969 IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
970 return;
971 }
972 s = (struct ip_vs_sync_conn_v0 *) p;
973 flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
974 flags &= ~IP_VS_CONN_F_HASHED;
975 if (flags & IP_VS_CONN_F_SEQ_MASK) {
976 opt = (struct ip_vs_sync_conn_options *)&s[1];
977 p += FULL_CONN_SIZE;
978 if (p > buffer+buflen) {
979 IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n");
980 return;
981 }
982 } else {
983 opt = NULL;
984 p += SIMPLE_CONN_SIZE;
985 }
986
987 state = ntohs(s->state);
988 if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
989 pp = ip_vs_proto_get(s->protocol);
990 if (!pp) {
991 IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n",
992 s->protocol);
993 continue;
994 }
995 if (state >= pp->num_states) {
996 IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n",
997 pp->name, state);
998 continue;
999 }
1000 } else {
1001
1002 if (state > 0) {
1003 IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",
1004 state);
1005 state = 0;
1006 }
1007 }
1008
1009 ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol,
1010 (const union nf_inet_addr *)&s->caddr,
1011 s->cport,
1012 (const union nf_inet_addr *)&s->vaddr,
1013 s->vport, ¶m);
1014
1015
1016 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->protocol, AF_INET,
1017 (union nf_inet_addr *)&s->daddr, s->dport,
1018 0, 0, opt);
1019 }
1020}
1021
1022
1023
1024
1025static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen,
1026 __u32 *opt_flags,
1027 struct ip_vs_sync_conn_options *opt)
1028{
1029 struct ip_vs_sync_conn_options *topt;
1030
1031 topt = (struct ip_vs_sync_conn_options *)p;
1032
1033 if (plen != sizeof(struct ip_vs_sync_conn_options)) {
1034 IP_VS_DBG(2, "BACKUP, bogus conn options length\n");
1035 return -EINVAL;
1036 }
1037 if (*opt_flags & IPVS_OPT_F_SEQ_DATA) {
1038 IP_VS_DBG(2, "BACKUP, conn options found twice\n");
1039 return -EINVAL;
1040 }
1041 ntoh_seq(&topt->in_seq, &opt->in_seq);
1042 ntoh_seq(&topt->out_seq, &opt->out_seq);
1043 *opt_flags |= IPVS_OPT_F_SEQ_DATA;
1044 return 0;
1045}
1046
1047static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len,
1048 __u8 **data, unsigned int maxlen,
1049 __u32 *opt_flags, __u32 flag)
1050{
1051 if (plen > maxlen) {
1052 IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen);
1053 return -EINVAL;
1054 }
1055 if (*opt_flags & flag) {
1056 IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag);
1057 return -EINVAL;
1058 }
1059 *data_len = plen;
1060 *data = p;
1061 *opt_flags |= flag;
1062 return 0;
1063}
1064
1065
1066
1067static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *msg_end)
1068{
1069 struct ip_vs_sync_conn_options opt;
1070 union ip_vs_sync_conn *s;
1071 struct ip_vs_protocol *pp;
1072 struct ip_vs_conn_param param;
1073 __u32 flags;
1074 unsigned int af, state, pe_data_len=0, pe_name_len=0;
1075 __u8 *pe_data=NULL, *pe_name=NULL;
1076 __u32 opt_flags=0;
1077 int retc=0;
1078
1079 s = (union ip_vs_sync_conn *) p;
1080
1081 if (s->v6.type & STYPE_F_INET6) {
1082#ifdef CONFIG_IP_VS_IPV6
1083 af = AF_INET6;
1084 p += sizeof(struct ip_vs_sync_v6);
1085#else
1086 IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n");
1087 retc = 10;
1088 goto out;
1089#endif
1090 } else if (!s->v4.type) {
1091 af = AF_INET;
1092 p += sizeof(struct ip_vs_sync_v4);
1093 } else {
1094 return -10;
1095 }
1096 if (p > msg_end)
1097 return -20;
1098
1099
1100 while (p < msg_end) {
1101 int ptype;
1102 int plen;
1103
1104 if (p+2 > msg_end)
1105 return -30;
1106 ptype = *(p++);
1107 plen = *(p++);
1108
1109 if (!plen || ((p + plen) > msg_end))
1110 return -40;
1111
1112 switch (ptype & ~IPVS_OPT_F_PARAM) {
1113 case IPVS_OPT_SEQ_DATA:
1114 if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt))
1115 return -50;
1116 break;
1117
1118 case IPVS_OPT_PE_DATA:
1119 if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data,
1120 IP_VS_PEDATA_MAXLEN, &opt_flags,
1121 IPVS_OPT_F_PE_DATA))
1122 return -60;
1123 break;
1124
1125 case IPVS_OPT_PE_NAME:
1126 if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name,
1127 IP_VS_PENAME_MAXLEN, &opt_flags,
1128 IPVS_OPT_F_PE_NAME))
1129 return -70;
1130 break;
1131
1132 default:
1133
1134 if (!(ptype & IPVS_OPT_F_PARAM)) {
1135 IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n",
1136 ptype & ~IPVS_OPT_F_PARAM);
1137 retc = 20;
1138 goto out;
1139 }
1140 }
1141 p += plen;
1142 }
1143
1144
1145 flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK;
1146 flags |= IP_VS_CONN_F_SYNC;
1147 state = ntohs(s->v4.state);
1148
1149 if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
1150 pp = ip_vs_proto_get(s->v4.protocol);
1151 if (!pp) {
1152 IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n",
1153 s->v4.protocol);
1154 retc = 30;
1155 goto out;
1156 }
1157 if (state >= pp->num_states) {
1158 IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n",
1159 pp->name, state);
1160 retc = 40;
1161 goto out;
1162 }
1163 } else {
1164
1165 if (state > 0) {
1166 IP_VS_DBG(3, "BACKUP, Invalid template state %u\n",
1167 state);
1168 state = 0;
1169 }
1170 }
1171 if (ip_vs_conn_fill_param_sync(ipvs, af, s, ¶m, pe_data,
1172 pe_data_len, pe_name, pe_name_len)) {
1173 retc = 50;
1174 goto out;
1175 }
1176
1177 if (af == AF_INET)
1178 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v4.protocol, af,
1179 (union nf_inet_addr *)&s->v4.daddr, s->v4.dport,
1180 ntohl(s->v4.timeout), ntohl(s->v4.fwmark),
1181 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
1182 );
1183#ifdef CONFIG_IP_VS_IPV6
1184 else
1185 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v6.protocol, af,
1186 (union nf_inet_addr *)&s->v6.daddr, s->v6.dport,
1187 ntohl(s->v6.timeout), ntohl(s->v6.fwmark),
1188 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
1189 );
1190#endif
1191 ip_vs_pe_put(param.pe);
1192 return 0;
1193
1194out:
1195 IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc);
1196 return retc;
1197
1198}
1199
1200
1201
1202
1203
1204static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer,
1205 const size_t buflen)
1206{
1207 struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
1208 __u8 *p, *msg_end;
1209 int i, nr_conns;
1210
1211 if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) {
1212 IP_VS_DBG(2, "BACKUP, message header too short\n");
1213 return;
1214 }
1215
1216 if (buflen != ntohs(m2->size)) {
1217 IP_VS_DBG(2, "BACKUP, bogus message size\n");
1218 return;
1219 }
1220
1221 if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) {
1222 IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
1223 return;
1224 }
1225
1226 if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0)
1227 && (m2->spare == 0)) {
1228
1229 msg_end = buffer + sizeof(struct ip_vs_sync_mesg);
1230 nr_conns = m2->nr_conns;
1231
1232 for (i=0; i<nr_conns; i++) {
1233 union ip_vs_sync_conn *s;
1234 unsigned int size;
1235 int retc;
1236
1237 p = msg_end;
1238 if (p + sizeof(s->v4) > buffer+buflen) {
1239 IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n");
1240 return;
1241 }
1242 s = (union ip_vs_sync_conn *)p;
1243 size = ntohs(s->v4.ver_size) & SVER_MASK;
1244 msg_end = p + size;
1245
1246 if (msg_end > buffer+buflen) {
1247 IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");
1248 return;
1249 }
1250 if (ntohs(s->v4.ver_size) >> SVER_SHIFT) {
1251 IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n",
1252 ntohs(s->v4.ver_size) >> SVER_SHIFT);
1253 return;
1254 }
1255
1256 retc = ip_vs_proc_sync_conn(ipvs, p, msg_end);
1257 if (retc < 0) {
1258 IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
1259 retc);
1260 return;
1261 }
1262
1263 msg_end = p + ((size + 3) & ~3);
1264 }
1265 } else {
1266
1267 ip_vs_process_message_v0(ipvs, buffer, buflen);
1268 return;
1269 }
1270}
1271
1272
1273
1274
1275
1276static void set_sock_size(struct sock *sk, int mode, int val)
1277{
1278
1279
1280 lock_sock(sk);
1281 if (mode) {
1282 val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2,
1283 sysctl_wmem_max);
1284 sk->sk_sndbuf = val * 2;
1285 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1286 } else {
1287 val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2,
1288 sysctl_rmem_max);
1289 sk->sk_rcvbuf = val * 2;
1290 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
1291 }
1292 release_sock(sk);
1293}
1294
1295
1296
1297
1298static void set_mcast_loop(struct sock *sk, u_char loop)
1299{
1300 struct inet_sock *inet = inet_sk(sk);
1301
1302
1303 lock_sock(sk);
1304 inet->mc_loop = loop ? 1 : 0;
1305#ifdef CONFIG_IP_VS_IPV6
1306 if (sk->sk_family == AF_INET6) {
1307 struct ipv6_pinfo *np = inet6_sk(sk);
1308
1309
1310 np->mc_loop = loop ? 1 : 0;
1311 }
1312#endif
1313 release_sock(sk);
1314}
1315
1316
1317
1318
1319static void set_mcast_ttl(struct sock *sk, u_char ttl)
1320{
1321 struct inet_sock *inet = inet_sk(sk);
1322
1323
1324 lock_sock(sk);
1325 inet->mc_ttl = ttl;
1326#ifdef CONFIG_IP_VS_IPV6
1327 if (sk->sk_family == AF_INET6) {
1328 struct ipv6_pinfo *np = inet6_sk(sk);
1329
1330
1331 np->mcast_hops = ttl;
1332 }
1333#endif
1334 release_sock(sk);
1335}
1336
1337
1338static void set_mcast_pmtudisc(struct sock *sk, int val)
1339{
1340 struct inet_sock *inet = inet_sk(sk);
1341
1342
1343 lock_sock(sk);
1344 inet->pmtudisc = val;
1345#ifdef CONFIG_IP_VS_IPV6
1346 if (sk->sk_family == AF_INET6) {
1347 struct ipv6_pinfo *np = inet6_sk(sk);
1348
1349
1350 np->pmtudisc = val;
1351 }
1352#endif
1353 release_sock(sk);
1354}
1355
1356
1357
1358
1359static int set_mcast_if(struct sock *sk, char *ifname)
1360{
1361 struct net_device *dev;
1362 struct inet_sock *inet = inet_sk(sk);
1363 struct net *net = sock_net(sk);
1364
1365 dev = __dev_get_by_name(net, ifname);
1366 if (!dev)
1367 return -ENODEV;
1368
1369 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
1370 return -EINVAL;
1371
1372 lock_sock(sk);
1373 inet->mc_index = dev->ifindex;
1374
1375#ifdef CONFIG_IP_VS_IPV6
1376 if (sk->sk_family == AF_INET6) {
1377 struct ipv6_pinfo *np = inet6_sk(sk);
1378
1379
1380 np->mcast_oif = dev->ifindex;
1381 }
1382#endif
1383 release_sock(sk);
1384
1385 return 0;
1386}
1387
1388
1389
1390
1391
1392
1393
1394static int
1395join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
1396{
1397 struct net *net = sock_net(sk);
1398 struct ip_mreqn mreq;
1399 struct net_device *dev;
1400 int ret;
1401
1402 memset(&mreq, 0, sizeof(mreq));
1403 memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
1404
1405 dev = __dev_get_by_name(net, ifname);
1406 if (!dev)
1407 return -ENODEV;
1408 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
1409 return -EINVAL;
1410
1411 mreq.imr_ifindex = dev->ifindex;
1412
1413 lock_sock(sk);
1414 ret = ip_mc_join_group(sk, &mreq);
1415 release_sock(sk);
1416
1417 return ret;
1418}
1419
#ifdef CONFIG_IP_VS_IPV6
/*
 * Join IPv6 multicast group @addr on the interface named @ifname.
 *
 * Returns the result of ipv6_sock_mc_join(), -ENODEV if the interface
 * does not exist, or -EINVAL if the socket is bound to another device.
 */
static int join_mcast_group6(struct sock *sk, struct in6_addr *addr,
			     char *ifname)
{
	struct net_device *dev = __dev_get_by_name(sock_net(sk), ifname);
	int err;

	if (!dev)
		return -ENODEV;
	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
		return -EINVAL;

	lock_sock(sk);
	err = ipv6_sock_mc_join(sk, dev->ifindex, addr);
	release_sock(sk);

	return err;
}
#endif
1441
1442static int bind_mcastif_addr(struct socket *sock, char *ifname)
1443{
1444 struct net *net = sock_net(sock->sk);
1445 struct net_device *dev;
1446 __be32 addr;
1447 struct sockaddr_in sin;
1448
1449 dev = __dev_get_by_name(net, ifname);
1450 if (!dev)
1451 return -ENODEV;
1452
1453 addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1454 if (!addr)
1455 pr_err("You probably need to specify IP address on "
1456 "multicast interface.\n");
1457
1458 IP_VS_DBG(7, "binding socket with (%s) %pI4\n",
1459 ifname, &addr);
1460
1461
1462 sin.sin_family = AF_INET;
1463 sin.sin_addr.s_addr = addr;
1464 sin.sin_port = 0;
1465
1466 return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
1467}
1468
1469static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen,
1470 struct ipvs_sync_daemon_cfg *c, int id)
1471{
1472 if (AF_INET6 == c->mcast_af) {
1473 sa->in6 = (struct sockaddr_in6) {
1474 .sin6_family = AF_INET6,
1475 .sin6_port = htons(c->mcast_port + id),
1476 };
1477 sa->in6.sin6_addr = c->mcast_group.in6;
1478 *salen = sizeof(sa->in6);
1479 } else {
1480 sa->in = (struct sockaddr_in) {
1481 .sin_family = AF_INET,
1482 .sin_port = htons(c->mcast_port + id),
1483 };
1484 sa->in.sin_addr = c->mcast_group.in;
1485 *salen = sizeof(sa->in);
1486 }
1487}
1488
1489
1490
1491
1492static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id)
1493{
1494
1495 union ipvs_sockaddr mcast_addr;
1496 struct socket *sock;
1497 int result, salen;
1498
1499
1500 result = sock_create_kern(ipvs->net, ipvs->mcfg.mcast_af, SOCK_DGRAM,
1501 IPPROTO_UDP, &sock);
1502 if (result < 0) {
1503 pr_err("Error during creation of socket; terminating\n");
1504 return ERR_PTR(result);
1505 }
1506 result = set_mcast_if(sock->sk, ipvs->mcfg.mcast_ifn);
1507 if (result < 0) {
1508 pr_err("Error setting outbound mcast interface\n");
1509 goto error;
1510 }
1511
1512 set_mcast_loop(sock->sk, 0);
1513 set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl);
1514
1515 set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT);
1516 result = sysctl_sync_sock_size(ipvs);
1517 if (result > 0)
1518 set_sock_size(sock->sk, 1, result);
1519
1520 if (AF_INET == ipvs->mcfg.mcast_af)
1521 result = bind_mcastif_addr(sock, ipvs->mcfg.mcast_ifn);
1522 else
1523 result = 0;
1524 if (result < 0) {
1525 pr_err("Error binding address of the mcast interface\n");
1526 goto error;
1527 }
1528
1529 get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id);
1530 result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
1531 salen, 0);
1532 if (result < 0) {
1533 pr_err("Error connecting to the multicast addr\n");
1534 goto error;
1535 }
1536
1537 return sock;
1538
1539error:
1540 sock_release(sock);
1541 return ERR_PTR(result);
1542}
1543
1544
1545
1546
1547
1548static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id)
1549{
1550
1551 union ipvs_sockaddr mcast_addr;
1552 struct socket *sock;
1553 int result, salen;
1554
1555
1556 result = sock_create_kern(ipvs->net, ipvs->bcfg.mcast_af, SOCK_DGRAM,
1557 IPPROTO_UDP, &sock);
1558 if (result < 0) {
1559 pr_err("Error during creation of socket; terminating\n");
1560 return ERR_PTR(result);
1561 }
1562
1563 sock->sk->sk_reuse = SK_CAN_REUSE;
1564 result = sysctl_sync_sock_size(ipvs);
1565 if (result > 0)
1566 set_sock_size(sock->sk, 0, result);
1567
1568 get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id);
1569 result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen);
1570 if (result < 0) {
1571 pr_err("Error binding to the multicast addr\n");
1572 goto error;
1573 }
1574
1575
1576#ifdef CONFIG_IP_VS_IPV6
1577 if (ipvs->bcfg.mcast_af == AF_INET6)
1578 result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr,
1579 ipvs->bcfg.mcast_ifn);
1580 else
1581#endif
1582 result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr,
1583 ipvs->bcfg.mcast_ifn);
1584 if (result < 0) {
1585 pr_err("Error joining to the multicast group\n");
1586 goto error;
1587 }
1588
1589 return sock;
1590
1591error:
1592 sock_release(sock);
1593 return ERR_PTR(result);
1594}
1595
1596
1597static int
1598ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
1599{
1600 struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
1601 struct kvec iov;
1602 int len;
1603
1604 EnterFunction(7);
1605 iov.iov_base = (void *)buffer;
1606 iov.iov_len = length;
1607
1608 len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
1609
1610 LeaveFunction(7);
1611 return len;
1612}
1613
1614static int
1615ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
1616{
1617 int msize;
1618 int ret;
1619
1620 msize = ntohs(msg->size);
1621
1622 ret = ip_vs_send_async(sock, (char *)msg, msize);
1623 if (ret >= 0 || ret == -EAGAIN)
1624 return ret;
1625 pr_err("ip_vs_send_async error %d\n", ret);
1626 return 0;
1627}
1628
1629static int
1630ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
1631{
1632 struct msghdr msg = {NULL,};
1633 struct kvec iov;
1634 int len;
1635
1636 EnterFunction(7);
1637
1638
1639 iov.iov_base = buffer;
1640 iov.iov_len = (size_t)buflen;
1641
1642 len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, MSG_DONTWAIT);
1643
1644 if (len < 0)
1645 return len;
1646
1647 LeaveFunction(7);
1648 return len;
1649}
1650
1651
1652static void master_wakeup_work_handler(struct work_struct *work)
1653{
1654 struct ipvs_master_sync_state *ms =
1655 container_of(work, struct ipvs_master_sync_state,
1656 master_wakeup_work.work);
1657 struct netns_ipvs *ipvs = ms->ipvs;
1658
1659 spin_lock_bh(&ipvs->sync_lock);
1660 if (ms->sync_queue_len &&
1661 ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) {
1662 ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE;
1663 wake_up_process(ms->master_thread);
1664 }
1665 spin_unlock_bh(&ipvs->sync_lock);
1666}
1667
1668
1669static inline struct ip_vs_sync_buff *
1670next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
1671{
1672 struct ip_vs_sync_buff *sb;
1673
1674 sb = sb_dequeue(ipvs, ms);
1675 if (sb)
1676 return sb;
1677
1678 return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME);
1679}
1680
/*
 *	Thread function of a master sync daemon.
 *
 *	@data is the struct ip_vs_sync_thread_data set up by
 *	start_sync_thread(); the thread takes ownership of it and of
 *	tinfo->sock and releases both before returning.
 *
 *	The loop fetches sync buffers with next_sync_buff() and sends them
 *	on the thread's socket until kthread_should_stop() becomes true,
 *	then drops every buffer that is still pending.  Always returns 0.
 */
static int sync_thread_master(void *data)
{
	struct ip_vs_sync_thread_data *tinfo = data;
	struct netns_ipvs *ipvs = tinfo->ipvs;
	struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id];
	struct sock *sk = tinfo->sock->sk;
	struct ip_vs_sync_buff *sb;

	pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
		"syncid = %d, id = %d\n",
		ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id);

	for (;;) {
		sb = next_sync_buff(ipvs, ms);
		if (unlikely(kthread_should_stop()))
			break;
		if (!sb) {
			/* Nothing to send yet.
			 * NOTE(review): the task state is not set here before
			 * schedule_timeout(); presumably a helper reached from
			 * next_sync_buff() sets it - confirm.
			 */
			schedule_timeout(IPVS_SYNC_CHECK_PERIOD);
			continue;
		}
		while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) {
			/* A negative result means the non-blocking send
			 * returned -EAGAIN: sleep interruptibly until the
			 * socket is writeable or a stop is requested, then
			 * retry the same buffer.
			 */
			__wait_event_interruptible(*sk_sleep(sk),
						   sock_writeable(sk) ||
						   kthread_should_stop());
			if (unlikely(kthread_should_stop()))
				goto done;
		}
		ip_vs_sync_buff_release(sb);
	}

done:
	__set_current_state(TASK_RUNNING);
	/* Drop the buffer we were holding when asked to stop, if any */
	if (sb)
		ip_vs_sync_buff_release(sb);

	/* Clean up the sync buffer queue */
	while ((sb = sb_dequeue(ipvs, ms)))
		ip_vs_sync_buff_release(sb);
	__set_current_state(TASK_RUNNING);

	/* Clean up the current sync buffer */
	sb = get_curr_sync_buff(ipvs, ms, 0);
	if (sb)
		ip_vs_sync_buff_release(sb);

	/* Release the sending multicast socket and the thread data */
	sock_release(tinfo->sock);
	kfree(tinfo);

	return 0;
}
1735
1736
1737static int sync_thread_backup(void *data)
1738{
1739 struct ip_vs_sync_thread_data *tinfo = data;
1740 struct netns_ipvs *ipvs = tinfo->ipvs;
1741 int len;
1742
1743 pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
1744 "syncid = %d, id = %d\n",
1745 ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id);
1746
1747 while (!kthread_should_stop()) {
1748 wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
1749 !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
1750 || kthread_should_stop());
1751
1752
1753 while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
1754 len = ip_vs_receive(tinfo->sock, tinfo->buf,
1755 ipvs->bcfg.sync_maxlen);
1756 if (len <= 0) {
1757 if (len != -EAGAIN)
1758 pr_err("receiving message error\n");
1759 break;
1760 }
1761
1762 ip_vs_process_message(ipvs, tinfo->buf, len);
1763 }
1764 }
1765
1766
1767 sock_release(tinfo->sock);
1768 kfree(tinfo->buf);
1769 kfree(tinfo);
1770
1771 return 0;
1772}
1773
1774
1775int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
1776 int state)
1777{
1778 struct ip_vs_sync_thread_data *tinfo;
1779 struct task_struct **array = NULL, *task;
1780 struct socket *sock;
1781 struct net_device *dev;
1782 char *name;
1783 int (*threadfn)(void *data);
1784 int id, count, hlen;
1785 int result = -ENOMEM;
1786 u16 mtu, min_mtu;
1787
1788 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
1789 IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
1790 sizeof(struct ip_vs_sync_conn_v0));
1791
1792 if (!ipvs->sync_state) {
1793 count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX);
1794 ipvs->threads_mask = count - 1;
1795 } else
1796 count = ipvs->threads_mask + 1;
1797
1798 if (c->mcast_af == AF_UNSPEC) {
1799 c->mcast_af = AF_INET;
1800 c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP);
1801 }
1802 if (!c->mcast_port)
1803 c->mcast_port = IP_VS_SYNC_PORT;
1804 if (!c->mcast_ttl)
1805 c->mcast_ttl = 1;
1806
1807 dev = __dev_get_by_name(ipvs->net, c->mcast_ifn);
1808 if (!dev) {
1809 pr_err("Unknown mcast interface: %s\n", c->mcast_ifn);
1810 return -ENODEV;
1811 }
1812 hlen = (AF_INET6 == c->mcast_af) ?
1813 sizeof(struct ipv6hdr) + sizeof(struct udphdr) :
1814 sizeof(struct iphdr) + sizeof(struct udphdr);
1815 mtu = (state == IP_VS_STATE_BACKUP) ?
1816 clamp(dev->mtu, 1500U, 65535U) : 1500U;
1817 min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1;
1818
1819 if (c->sync_maxlen)
1820 c->sync_maxlen = clamp_t(unsigned int,
1821 c->sync_maxlen, min_mtu,
1822 65535 - hlen);
1823 else
1824 c->sync_maxlen = mtu - hlen;
1825
1826 if (state == IP_VS_STATE_MASTER) {
1827 if (ipvs->ms)
1828 return -EEXIST;
1829
1830 ipvs->mcfg = *c;
1831 name = "ipvs-m:%d:%d";
1832 threadfn = sync_thread_master;
1833 } else if (state == IP_VS_STATE_BACKUP) {
1834 if (ipvs->backup_threads)
1835 return -EEXIST;
1836
1837 ipvs->bcfg = *c;
1838 name = "ipvs-b:%d:%d";
1839 threadfn = sync_thread_backup;
1840 } else {
1841 return -EINVAL;
1842 }
1843
1844 if (state == IP_VS_STATE_MASTER) {
1845 struct ipvs_master_sync_state *ms;
1846
1847 ipvs->ms = kzalloc(count * sizeof(ipvs->ms[0]), GFP_KERNEL);
1848 if (!ipvs->ms)
1849 goto out;
1850 ms = ipvs->ms;
1851 for (id = 0; id < count; id++, ms++) {
1852 INIT_LIST_HEAD(&ms->sync_queue);
1853 ms->sync_queue_len = 0;
1854 ms->sync_queue_delay = 0;
1855 INIT_DELAYED_WORK(&ms->master_wakeup_work,
1856 master_wakeup_work_handler);
1857 ms->ipvs = ipvs;
1858 }
1859 } else {
1860 array = kzalloc(count * sizeof(struct task_struct *),
1861 GFP_KERNEL);
1862 if (!array)
1863 goto out;
1864 }
1865
1866 tinfo = NULL;
1867 for (id = 0; id < count; id++) {
1868 if (state == IP_VS_STATE_MASTER)
1869 sock = make_send_sock(ipvs, id);
1870 else
1871 sock = make_receive_sock(ipvs, id);
1872 if (IS_ERR(sock)) {
1873 result = PTR_ERR(sock);
1874 goto outtinfo;
1875 }
1876 tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
1877 if (!tinfo)
1878 goto outsocket;
1879 tinfo->ipvs = ipvs;
1880 tinfo->sock = sock;
1881 if (state == IP_VS_STATE_BACKUP) {
1882 tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen,
1883 GFP_KERNEL);
1884 if (!tinfo->buf)
1885 goto outtinfo;
1886 } else {
1887 tinfo->buf = NULL;
1888 }
1889 tinfo->id = id;
1890
1891 task = kthread_run(threadfn, tinfo, name, ipvs->gen, id);
1892 if (IS_ERR(task)) {
1893 result = PTR_ERR(task);
1894 goto outtinfo;
1895 }
1896 tinfo = NULL;
1897 if (state == IP_VS_STATE_MASTER)
1898 ipvs->ms[id].master_thread = task;
1899 else
1900 array[id] = task;
1901 }
1902
1903
1904
1905 if (state == IP_VS_STATE_BACKUP)
1906 ipvs->backup_threads = array;
1907 spin_lock_bh(&ipvs->sync_buff_lock);
1908 ipvs->sync_state |= state;
1909 spin_unlock_bh(&ipvs->sync_buff_lock);
1910
1911
1912 ip_vs_use_count_inc();
1913
1914 return 0;
1915
1916outsocket:
1917 sock_release(sock);
1918
1919outtinfo:
1920 if (tinfo) {
1921 sock_release(tinfo->sock);
1922 kfree(tinfo->buf);
1923 kfree(tinfo);
1924 }
1925 count = id;
1926 while (count-- > 0) {
1927 if (state == IP_VS_STATE_MASTER)
1928 kthread_stop(ipvs->ms[count].master_thread);
1929 else
1930 kthread_stop(array[count]);
1931 }
1932 kfree(array);
1933
1934out:
1935 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
1936 kfree(ipvs->ms);
1937 ipvs->ms = NULL;
1938 }
1939 return result;
1940}
1941
1942
1943int stop_sync_thread(struct netns_ipvs *ipvs, int state)
1944{
1945 struct task_struct **array;
1946 int id;
1947 int retc = -EINVAL;
1948
1949 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
1950
1951 if (state == IP_VS_STATE_MASTER) {
1952 if (!ipvs->ms)
1953 return -ESRCH;
1954
1955
1956
1957
1958
1959
1960
1961 spin_lock_bh(&ipvs->sync_buff_lock);
1962 spin_lock(&ipvs->sync_lock);
1963 ipvs->sync_state &= ~IP_VS_STATE_MASTER;
1964 spin_unlock(&ipvs->sync_lock);
1965 spin_unlock_bh(&ipvs->sync_buff_lock);
1966
1967 retc = 0;
1968 for (id = ipvs->threads_mask; id >= 0; id--) {
1969 struct ipvs_master_sync_state *ms = &ipvs->ms[id];
1970 int ret;
1971
1972 pr_info("stopping master sync thread %d ...\n",
1973 task_pid_nr(ms->master_thread));
1974 cancel_delayed_work_sync(&ms->master_wakeup_work);
1975 ret = kthread_stop(ms->master_thread);
1976 if (retc >= 0)
1977 retc = ret;
1978 }
1979 kfree(ipvs->ms);
1980 ipvs->ms = NULL;
1981 } else if (state == IP_VS_STATE_BACKUP) {
1982 if (!ipvs->backup_threads)
1983 return -ESRCH;
1984
1985 ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
1986 array = ipvs->backup_threads;
1987 retc = 0;
1988 for (id = ipvs->threads_mask; id >= 0; id--) {
1989 int ret;
1990
1991 pr_info("stopping backup sync thread %d ...\n",
1992 task_pid_nr(array[id]));
1993 ret = kthread_stop(array[id]);
1994 if (retc >= 0)
1995 retc = ret;
1996 }
1997 kfree(array);
1998 ipvs->backup_threads = NULL;
1999 }
2000
2001
2002 ip_vs_use_count_dec();
2003
2004 return retc;
2005}
2006
2007
2008
2009
2010int __net_init ip_vs_sync_net_init(struct netns_ipvs *ipvs)
2011{
2012 __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key);
2013 spin_lock_init(&ipvs->sync_lock);
2014 spin_lock_init(&ipvs->sync_buff_lock);
2015 return 0;
2016}
2017
2018void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs)
2019{
2020 int retc;
2021
2022 mutex_lock(&ipvs->sync_mutex);
2023 retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER);
2024 if (retc && retc != -ESRCH)
2025 pr_err("Failed to stop Master Daemon\n");
2026
2027 retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP);
2028 if (retc && retc != -ESRCH)
2029 pr_err("Failed to stop Backup Daemon\n");
2030 mutex_unlock(&ipvs->sync_mutex);
2031}
2032