1
2
3
4
5#define _LARGEFILE64_SOURCE
6#define _GNU_SOURCE
7#include <stdio.h>
8#include <string.h>
9#include <unistd.h>
10#include <err.h>
11#include <stdint.h>
12#include <stdlib.h>
13#include <elf.h>
14#include <sys/mman.h>
15#include <sys/param.h>
16#include <sys/types.h>
17#include <sys/stat.h>
18#include <sys/wait.h>
19#include <fcntl.h>
20#include <stdbool.h>
21#include <errno.h>
22#include <ctype.h>
23#include <sys/socket.h>
24#include <sys/ioctl.h>
25#include <sys/time.h>
26#include <time.h>
27#include <netinet/in.h>
28#include <net/if.h>
29#include <linux/sockios.h>
30#include <linux/if_tun.h>
31#include <sys/uio.h>
32#include <termios.h>
33#include <getopt.h>
34#include <zlib.h>
35#include <assert.h>
36#include <sched.h>
37#include "linux/lguest_launcher.h"
38#include "linux/virtio_config.h"
39#include "linux/virtio_net.h"
40#include "linux/virtio_blk.h"
41#include "linux/virtio_console.h"
42#include "linux/virtio_ring.h"
43#include "asm-x86/bootparam.h"
44
45
46
47
48
49
50
/* Kernel-style shorthands for fixed-width types. */
typedef unsigned long long u64;
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;

/* Flag bits OR'd into every page-table entry we build (0x7). */
#define PAGE_PRESENT 0x7
/* Not referenced elsewhere in this file — presumably historical; TODO confirm. */
#define NET_PEERNUM 1
/* --tunnet argument prefix selecting "attach the tap to this bridge". */
#define BRIDGE_PFX "bridge:"
/* Fall back to the known ioctl number if the headers are too old. */
#ifndef SIOCBRADDIF
#define SIOCBRADDIF	0x89a2
#endif

/* Pages reserved above guest RAM for device descriptors and virtqueues. */
#define DEVICE_PAGES 256

/* Number of descriptors in each virtqueue we create. */
#define VIRTQUEUE_NUM 128
67
68
69
/* Set by --verbose; the macro below deliberately shadows the variable name. */
static bool verbose;
#define verbose(args...) \
	do { if (verbose) printf(args); } while(0)

/* Write end of the pipe to the Waker child (see setup_waker). */
static int waker_fd;

/* Host virtual address where guest-physical address 0 is mapped. */
static void *guest_base;

/* Top of guest memory handed out so far, and the absolute ceiling. */
static unsigned long guest_limit, guest_max;
81
82
/*
 * All the devices we present to the guest, plus the bookkeeping the
 * Launcher needs to service them.
 */
struct device_list
{
	/* Every fd we select() on for input, and the highest of them. */
	fd_set infds;
	int max_infd;

	/* Interrupt number to assign to the next virtqueue created. */
	unsigned int next_irq;

	/* Count of devices so far (used only in verbose messages). */
	unsigned int device_num;

	/* The single page of device descriptors shown to the guest. */
	u8 *descpage;

	/* Bytes of descpage consumed so far. */
	unsigned int desc_used;

	/* Singly-linked device list; lastdev allows O(1) append, which
	 * keeps devices in registration order. */
	struct device *dev;

	struct device **lastdev;
};

/* The one and only device list. */
static struct device_list devices;
110
111
/* One emulated device. */
struct device
{
	/* Next device in the list. */
	struct device *next;

	/* This device's entry in the guest-visible descriptor page. */
	struct lguest_device_desc *desc;

	/* Name used in verbose messages. */
	const char *name;

	/* The fd backing this device, and the callback invoked when it
	 * becomes readable (NULL if the device takes no input). */
	int fd;
	bool (*handle_input)(int fd, struct device *me);

	/* The device's virtqueues, if any. */
	struct virtqueue *vq;

	/* Device-specific private state. */
	void *priv;
};
134
135
/* One ring of buffers shared with the guest. */
struct virtqueue
{
	struct virtqueue *next;

	/* The device owning this queue. */
	struct device *dev;

	/* Configuration (size, irq, page number) shown to the guest. */
	struct lguest_vqconfig config;

	/* Our host-side view of the ring layout. */
	struct vring vring;

	/* Last avail-ring index we have consumed. */
	u16 last_avail_idx;

	/* Called when the guest notifies us on this queue's page (NULL
	 * means we never want notifications). */
	void (*handle_output)(int fd, struct virtqueue *me);
};

/* Empty write barrier: the Launcher and the guest do not run at the
 * same time, so ordinary program order suffices here. */
#define wmb()
159
160
161
162
163
164
165
166
167
/*
 * convert() checks that an iovec element has exactly the size and
 * alignment of "type" and casts it to that type; dies on mismatch.
 */
#define convert(iov, type) \
	((type *)_convert((iov), sizeof(type), __alignof__(type), #type))

static void *_convert(struct iovec *iov, size_t size, size_t align,
		      const char *name)
{
	/* The guest must supply a buffer of exactly the right size... */
	if (iov->iov_len != size)
		errx(1, "Bad iovec size %zu for %s", iov->iov_len, name);
	/* ...and suitably aligned for the type. */
	if ((unsigned long)iov->iov_base % align != 0)
		errx(1, "Bad alignment %p for %s", iov->iov_base, name);
	return iov->iov_base;
}
180
181
182
/*
 * Both the guest and this launcher are little-endian x86, so all the
 * byte-order conversions are the identity.
 *
 * Fix: le64_to_cpu used to declare its parameter as "v32" while
 * expanding to "(v64)", so any use of it failed to compile.
 */
#define cpu_to_le16(v16) (v16)
#define cpu_to_le32(v32) (v32)
#define cpu_to_le64(v64) (v64)
#define le16_to_cpu(v16) (v16)
#define le32_to_cpu(v32) (v32)
#define le64_to_cpu(v64) (v64)
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203static void *from_guest_phys(unsigned long addr)
204{
205 return guest_base + addr;
206}
207
208static unsigned long to_guest_phys(const void *addr)
209{
210 return (addr - guest_base);
211}
212
213
214
215
216
217
/* Open a file with "flags", or die with a helpful message. */
static int open_or_die(const char *name, int flags)
{
	int fd;

	fd = open(name, flags);
	if (fd < 0)
		err(1, "Failed to open %s", name);
	return fd;
}
225
226
227static void *map_zeroed_pages(unsigned int num)
228{
229 int fd = open_or_die("/dev/zero", O_RDONLY);
230 void *addr;
231
232
233
234 addr = mmap(NULL, getpagesize() * num,
235 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0);
236 if (addr == MAP_FAILED)
237 err(1, "Mmaping %u pages of /dev/zero", num);
238
239 return addr;
240}
241
242
243static void *get_pages(unsigned int num)
244{
245 void *addr = from_guest_phys(guest_limit);
246
247 guest_limit += num * getpagesize();
248 if (guest_limit > guest_max)
249 errx(1, "Not enough memory for devices");
250 return addr;
251}
252
253
254
255
/*
 * Place "len" bytes of "fd", starting at "offset", at the fixed
 * address "addr".  Try a MAP_FIXED mmap first; if the kernel refuses
 * (e.g. the offset is not page-aligned), fall back to pread()ing the
 * data in.  Dies on failure.
 */
static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
{
	ssize_t r;

	/* MAP_FIXED: we insist on this exact address.  MAP_PRIVATE so
	 * writes by the guest do not touch the underlying file. */
	if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC,
		 MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED)
		return;

	/* mmap failed: copy the bytes by hand instead. */
	r = pread(fd, addr, len, offset);
	if (r != len)
		err(1, "Reading offset %lu len %lu gave %zi", offset, len, r);
}
276
277
278
279
280
281
282
283
284
285
/*
 * Map a 32-bit x86 ELF executable into guest memory: sanity-check the
 * header, then copy every PT_LOAD segment to its physical address.
 * Returns the ELF entry point.
 */
static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
{
	Elf32_Phdr phdr[ehdr->e_phnum];
	unsigned int i;

	/* Must be an i386 executable with a sane program header table. */
	if (ehdr->e_type != ET_EXEC
	    || ehdr->e_machine != EM_386
	    || ehdr->e_phentsize != sizeof(Elf32_Phdr)
	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
		errx(1, "Malformed elf header");

	/* Read the whole program header table in one go. */
	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
		err(1, "Seeking to program headers");
	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
		err(1, "Reading program headers");

	/* Map each loadable segment at its physical address.  Bytes of
	 * p_memsz beyond p_filesz (BSS) stay as the pre-zeroed guest
	 * memory. */
	for (i = 0; i < ehdr->e_phnum; i++) {
		if (phdr[i].p_type != PT_LOAD)
			continue;

		verbose("Section %i: size %i addr %p\n",
			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);

		map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
		       phdr[i].p_offset, phdr[i].p_filesz);
	}

	/* Where the guest starts executing. */
	return ehdr->e_entry;
}
327
328
329
330
331
332
333
334
335static unsigned long load_bzimage(int fd)
336{
337 struct boot_params boot;
338 int r;
339
340 void *p = from_guest_phys(0x100000);
341
342
343
344 lseek(fd, 0, SEEK_SET);
345 read(fd, &boot, sizeof(boot));
346
347
348 if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
349 errx(1, "This doesn't look like a bzImage to me");
350
351
352 lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);
353
354
355 while ((r = read(fd, p, 65536)) > 0)
356 p += r;
357
358
359 return boot.hdr.code32_start;
360}
361
362
363
364
/*
 * Work out what kind of kernel image "fd" is and load it: ELF magic
 * means a plain ELF, anything else is assumed to be a bzImage.
 * Returns the guest entry address.
 */
static unsigned long load_kernel(int fd)
{
	Elf32_Ehdr hdr;

	/* The first bytes tell us which flavour we have. */
	if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
		err(1, "Reading kernel");

	if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) != 0)
		return load_bzimage(fd);

	return map_elf(fd, &hdr);
}
380
381
382
383
384
385
/* Round "addr" up to the next page boundary. */
static inline unsigned long page_align(unsigned long addr)
{
	unsigned long mask = getpagesize() - 1;

	return (addr + mask) & ~mask;
}
391
392
393
394
395
396
397
398
/*
 * Load an initrd so it ends exactly at the top of guest memory
 * ("mem").  Returns the page-aligned number of bytes it occupies.
 */
static unsigned long load_initrd(const char *name, unsigned long mem)
{
	int ifd;
	struct stat st;
	unsigned long len;

	ifd = open_or_die(name, O_RDONLY);
	/* We need its size to know where to put it. */
	if (fstat(ifd, &st) < 0)
		err(1, "fstat() on initrd '%s'", name);

	/* Page-align the length and map it just below the top of RAM. */
	len = page_align(st.st_size);
	map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);

	/* Once mapped (or copied), the fd is no longer needed. */
	close(ifd);
	verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len);

	return len;
}
422
423
424
425
426
427
428
/*
 * Build the guest's initial page tables: an identity ("linear") map of
 * all guest memory, placed at the top of RAM just below the initrd.
 * Returns the guest-physical address of the page directory.
 */
static unsigned long setup_pagetables(unsigned long mem,
				      unsigned long initrd_size)
{
	unsigned long *pgdir, *linear;
	unsigned int mapped_pages, i, linear_pages;
	unsigned int ptes_per_page = getpagesize()/sizeof(void *);

	mapped_pages = mem/getpagesize();

	/* How many PTE pages are needed to map every guest page? */
	linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;

	/* The page directory goes at the very top, below the initrd... */
	pgdir = from_guest_phys(mem) - initrd_size - getpagesize();

	/* ...and the PTE pages sit immediately beneath it. */
	linear = (void *)pgdir - linear_pages*getpagesize();

	/* PTE i maps guest-physical page i (identity mapping). */
	for (i = 0; i < mapped_pages; i++)
		linear[i] = ((i * getpagesize()) | PAGE_PRESENT);

	/* Each directory entry points at the PTE page covering its
	 * range. */
	for (i = 0; i < mapped_pages; i += ptes_per_page) {
		pgdir[i/ptes_per_page]
			= ((to_guest_phys(linear) + i*sizeof(void *))
			   | PAGE_PRESENT);
	}

	verbose("Linear mapping of %u pages in %u pte pages at %#lx\n",
		mapped_pages, linear_pages, to_guest_phys(linear));

	return to_guest_phys(pgdir);
}
467
468
469
470
/*
 * Join the NULL-terminated "args" array into "dst", one space after
 * each element (so a trailing space remains before the terminator).
 * The caller guarantees dst is large enough.
 */
static void concat(char *dst, char *args[])
{
	char *p = dst;
	unsigned int i;

	for (i = 0; args[i]; i++) {
		size_t n = strlen(args[i]);

		memcpy(p, args[i], n);
		p[n] = ' ';
		p += n + 1;
	}
	*p = '\0';
}
483
484
485
486
487
/*
 * Tell the lguest kernel module about this guest: our base address,
 * size in pages, initial page directory and entry point.  Returns the
 * /dev/lguest fd used to drive the guest from here on.
 */
static int tell_kernel(unsigned long pgdir, unsigned long start)
{
	unsigned long args[] = { LHREQ_INITIALIZE,
				 (unsigned long)guest_base,
				 guest_limit / getpagesize(), pgdir, start };
	int fd;

	verbose("Guest: %p - %p (%#lx)\n",
		guest_base, guest_base + guest_limit, guest_limit);
	fd = open_or_die("/dev/lguest", O_RDWR);
	if (write(fd, args, sizeof(args)) < 0)
		err(1, "Writing to /dev/lguest");

	return fd;
}
504
505
506static void add_device_fd(int fd)
507{
508 FD_SET(fd, &devices.infds);
509 if (fd > devices.max_infd)
510 devices.max_infd = fd;
511}
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
/*
 * The Waker child: select() on every device fd and, when one becomes
 * readable, use LHREQ_BREAK to kick the parent Launcher out of the
 * kernel so it can service the device.  Messages on "pipefd" adjust
 * the watch set: fd >= 0 adds that fd, a negative value removes
 * (-fd - 1), and EOF means the Launcher is gone, so exit.
 */
static void wake_parent(int pipefd, int lguest_fd)
{
	/* We also watch the pipe the Launcher talks to us through. */
	add_device_fd(pipefd);

	for (;;) {
		fd_set rfds = devices.infds;
		unsigned long args[] = { LHREQ_BREAK, 1 };

		/* Sleep until any watched fd is readable. */
		select(devices.max_infd+1, &rfds, NULL, NULL, NULL);

		if (FD_ISSET(pipefd, &rfds)) {
			int fd;

			/* EOF: the Launcher has exited. */
			if (read(pipefd, &fd, sizeof(fd)) == 0)
				exit(0);

			/* Add or remove an fd; removal is encoded as
			 * -fd - 1 so fd 0 remains representable. */
			if (fd >= 0)
				FD_SET(fd, &devices.infds);
			else
				FD_CLR(-fd - 1, &devices.infds);
		} else
			/* A device needs attention: break the guest out. */
			write(lguest_fd, args, sizeof(args));
	}
}
560
561
/*
 * Fork off the Waker child, which breaks the Launcher out of the
 * kernel whenever a device fd has input.  Returns the write end of the
 * pipe used to tell the Waker about new (or dropped) fds.
 *
 * Fix: pipe() failure used to go unnoticed, leaving pipefd
 * uninitialized.
 */
static int setup_waker(int lguest_fd)
{
	int pipefd[2], child;

	/* The pipe carries fd numbers (or -fd-1 for removals). */
	if (pipe(pipefd) != 0)
		err(1, "Creating pipe for waker");
	child = fork();
	if (child == -1)
		err(1, "forking");

	if (child == 0) {
		/* Child: keep only the read end and serve forever
		 * (wake_parent never returns). */
		close(pipefd[1]);
		wake_parent(pipefd[0], lguest_fd);
	}
	/* Parent: keep only the write end. */
	close(pipefd[0]);

	return pipefd[1];
}
585
586
587
588
589
590
591
592
593
/*
 * Validate a guest-supplied buffer [addr, addr+size) and convert it to
 * a host pointer.  Dies (reporting the caller's line number) if the
 * range is outside guest memory.
 *
 * NOTE(review): "addr + size >= guest_limit" also rejects a buffer
 * ending exactly at guest_limit, which looks like an off-by-one; and a
 * huge "size" could wrap addr+size around zero.  Confirm the intended
 * semantics before changing either.
 */
static void *_check_pointer(unsigned long addr, unsigned int size,
			    unsigned int line)
{
	if (addr >= guest_limit || addr + size >= guest_limit)
		errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr);

	return from_guest_phys(addr);
}

/* The macro version records the calling line for the error message. */
#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
607
608
609
610
/*
 * Follow a descriptor chain: return the index of the next descriptor,
 * or vq->vring.num if this was the last one.  Dies if the guest hands
 * us an out-of-range index.
 */
static unsigned next_desc(struct virtqueue *vq, unsigned int i)
{
	unsigned int next;

	/* No NEXT flag means end of chain. */
	if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT))
		return vq->vring.num;

	next = vq->vring.desc[i].next;
	/* Barrier placeholder (no-op here; guest and host alternate). */
	wmb();

	/* Never trust a guest-supplied index. */
	if (next >= vq->vring.num)
		errx(1, "Desc next is %u", next);

	return next;
}
629
630
631
632
633
634
635
636
/*
 * Pull the next available descriptor chain from a virtqueue.  Fills
 * "iov" with the chain's buffers — readable "out" buffers first, then
 * writable "in" buffers — and sets *out_num/*in_num accordingly.
 * Returns the head index, or vq->vring.num if nothing is pending.
 * Dies on any malformed ring state from the guest.
 */
static unsigned get_vq_desc(struct virtqueue *vq,
			    struct iovec iov[],
			    unsigned int *out_num, unsigned int *in_num)
{
	unsigned int i, head;

	/* The guest cannot legitimately be more than a ring-size ahead
	 * of where we last looked. */
	if ((u16)(vq->vring.avail->idx - vq->last_avail_idx) > vq->vring.num)
		errx(1, "Guest moved used index from %u to %u",
		     vq->last_avail_idx, vq->vring.avail->idx);

	/* Nothing new? */
	if (vq->vring.avail->idx == vq->last_avail_idx)
		return vq->vring.num;

	/* Take the next entry from the avail ring. */
	head = vq->vring.avail->ring[vq->last_avail_idx++ % vq->vring.num];

	/* The head must be a valid descriptor index. */
	if (head >= vq->vring.num)
		errx(1, "Guest says index %u is available", head);

	*out_num = *in_num = 0;

	i = head;
	do {
		/* Validate each buffer and record it in iov[]. */
		iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len;
		iov[*out_num + *in_num].iov_base
			= check_pointer(vq->vring.desc[i].addr,
					vq->vring.desc[i].len);
		/* WRITE flag: we write it (an "in" buffer). */
		if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE)
			(*in_num)++;
		else {
			/* All out buffers must come before in buffers. */
			if (*in_num)
				errx(1, "Descriptor has out after in");
			(*out_num)++;
		}

		/* More buffers than ring entries means a cycle. */
		if (*out_num + *in_num > vq->vring.num)
			errx(1, "Looped descriptor");
	} while ((i = next_desc(vq, i)) != vq->vring.num);

	return head;
}
688
689
690
/*
 * Return a finished chain to the guest: record its head index and the
 * number of bytes we wrote in the used ring.
 */
static void add_used(struct virtqueue *vq, unsigned int head, int len)
{
	struct vring_used_elem *used;

	/* Fill in the next used-ring slot... */
	used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
	used->id = head;
	used->len = len;
	/* ...then publish it by bumping the index (barrier is a no-op
	 * here since guest and host alternate). */
	wmb();
	vq->vring.used->idx++;
}
704
705
/*
 * Interrupt the guest on behalf of virtqueue "vq", unless it has asked
 * not to be bothered (VRING_AVAIL_F_NO_INTERRUPT).
 */
static void trigger_irq(int fd, struct virtqueue *vq)
{
	unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };

	if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
		return;

	/* The "!= 0" check (rather than a byte count) matches
	 * /dev/lguest, which apparently returns 0 on success here —
	 * confirm against the lguest host code if changing this. */
	if (write(fd, buf, sizeof(buf)) != 0)
		err(1, "Triggering irq %i", vq->config.irq);
}
718
719
/* Convenience: hand a chain back to the guest and kick it. */
static void add_used_and_trigger(int fd, struct virtqueue *vq,
				 unsigned int head, int len)
{
	add_used(vq, head, len);
	trigger_irq(fd, vq);
}
726
727
728
729
730
731
/* Terminal settings saved at startup, restored at exit so we leave the
 * user's console in a sane state. */
static struct termios orig_term;
static void restore_term(void)
{
	tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
}
737
738
/* Tracks ^C presses: three within a second abort the guest. */
struct console_abort
{
	/* How many ^C's in the current run. */
	int count;
	/* When the first of the run arrived. */
	struct timeval start;
};
746
747
/*
 * Console input: fill a guest-provided buffer from our stdin fd.
 * Returns false when nothing could be delivered (no guest buffer
 * ready, or stdin hit EOF/error).  Three ^C presses within one second
 * abort the guest with exit status 2.
 */
static bool handle_console_input(int fd, struct device *dev)
{
	int len;
	unsigned int head, in_num, out_num;
	struct iovec iov[dev->vq->vring.num];
	struct console_abort *abort = dev->priv;

	/* Find a buffer the guest wants filled. */
	head = get_vq_desc(dev->vq, iov, &out_num, &in_num);

	/* No buffer: we cannot deliver input right now. */
	if (head == dev->vq->vring.num)
		return false;

	if (out_num)
		errx(1, "Output buffers in console in queue?");

	/* Read straight into the guest's buffers. */
	len = readv(dev->fd, iov, in_num);
	if (len <= 0) {
		/* EOF or error: give up on console input for good. */
		warnx("Failed to get console input, ignoring console.");
		restore_term();
		/* Stop servicing console notifications too. */
		dev->vq->handle_output = NULL;
		return false;
	}

	/* Hand the filled buffer back and interrupt the guest. */
	add_used_and_trigger(fd, dev->vq, head, len);

	/* Emergency exit: three single-character ^C (ASCII 3) inputs,
	 * with the third within a second of the first, kill the guest. */
	if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) {
		if (!abort->count++)
			gettimeofday(&abort->start, NULL);
		else if (abort->count == 3) {
			struct timeval now;
			gettimeofday(&now, NULL);
			if (now.tv_sec <= abort->start.tv_sec+1) {
				unsigned long args[] = { LHREQ_BREAK, 0 };

				/* Stop the Waker... */
				close(waker_fd);

				/* ...release the guest and bail out. */
				write(fd, args, sizeof(args));
				exit(2);
			}
			abort->count = 0;
		}
	} else
		/* Anything else resets the ^C run. */
		abort->count = 0;

	return true;
}
814
815
816
/*
 * Console output: write every pending guest buffer to stdout and hand
 * the buffers back with an interrupt.
 */
static void handle_console_output(int fd, struct virtqueue *vq)
{
	unsigned int head, out, in;
	int len;
	struct iovec iov[vq->vring.num];

	/* Drain the queue completely. */
	while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
		if (in)
			errx(1, "Input buffers in output queue?");
		len = writev(STDOUT_FILENO, iov, out);
		add_used_and_trigger(fd, vq, head, len);
	}
}
831
832
833
834
835
836
837
/*
 * Network output: for each pending chain, skip the leading
 * virtio_net_hdr and write the raw packet to the tap device.
 */
static void handle_net_output(int fd, struct virtqueue *vq)
{
	unsigned int head, out, in;
	int len;
	struct iovec iov[vq->vring.num];

	while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
		if (in)
			errx(1, "Input buffers in output queue?");

		/* Validate the header's size/alignment, but ignore its
		 * contents: we use none of the offload features. */
		(void)convert(&iov[0], struct virtio_net_hdr);
		len = writev(vq->dev->fd, iov+1, out-1);
		add_used_and_trigger(fd, vq, head, len);
	}
}
856
857
858
/*
 * Network input: read one packet from the tap device into a guest
 * receive buffer, after its virtio_net_hdr.  Returns false when the
 * guest has no buffer ready (the caller then stops watching the fd
 * until the guest refills the queue and enable_fd re-arms it).
 */
static bool handle_tun_input(int fd, struct device *dev)
{
	unsigned int head, in_num, out_num;
	int len;
	struct iovec iov[dev->vq->vring.num];
	struct virtio_net_hdr *hdr;

	head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
	if (head == dev->vq->vring.num) {
		/* Only complain once the driver is actually running;
		 * early boot legitimately has no buffers yet. */
		if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK)
			warn("network: no dma buffer!");

		return false;
	} else if (out_num)
		errx(1, "Output buffers in network recv queue?");

	/* First buffer is the header: a plain packet, no offloads. */
	hdr = convert(&iov[0], struct virtio_net_hdr);
	hdr->flags = 0;
	hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;

	/* The packet payload goes in the remaining buffers. */
	len = readv(dev->fd, iov+1, in_num-1);
	if (len <= 0)
		err(1, "reading network");

	/* Tell the guest how much we wrote (header plus payload). */
	add_used_and_trigger(fd, dev->vq, head, sizeof(*hdr) + len);

	verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
		((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1],
		head != dev->vq->vring.num ? "sent" : "discarded");

	return true;
}
900
901
902
903
/*
 * Virtqueue callback used to resume watching a device fd after
 * handle_input() stopped watching it (e.g. for lack of guest buffers).
 */
static void enable_fd(int fd, struct virtqueue *vq)
{
	add_device_fd(vq->dev->fd);
	/* Nudge the Waker so its select() set picks up the fd too. */
	write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd));
}
910
911
/*
 * Dispatch a guest NOTIFY on guest-physical address "addr": if some
 * virtqueue's ring lives on that page, run its handler.  Otherwise a
 * notify on any valid address is early boot-console output: print the
 * string found there.
 */
static void handle_output(int fd, unsigned long addr)
{
	struct device *i;
	struct virtqueue *vq;

	/* Look for a virtqueue whose ring page matches. */
	for (i = devices.dev; i; i = i->next) {
		for (vq = i->vq; vq; vq = vq->next) {
			if (vq->config.pfn == addr/getpagesize()
			    && vq->handle_output) {
				verbose("Output to %s\n", vq->dev->name);
				vq->handle_output(fd, vq);
				return;
			}
		}
	}

	/* Not a virtqueue: must at least be inside guest memory. */
	if (addr >= guest_limit)
		errx(1, "Bad NOTIFY %#lx", addr);

	/* Early console: print the NUL-terminated string at addr,
	 * bounded by the end of guest memory. */
	write(STDOUT_FILENO, from_guest_phys(addr),
	      strnlen(from_guest_phys(addr), guest_limit - addr));
}
937
938
939
/*
 * Service every device fd that has pending input, polling (zero
 * timeout) until none remain ready.  A handler returning false means
 * "stop watching this fd": we clear it locally and tell the Waker.
 */
static void handle_input(int fd)
{
	/* Zero timeout: just poll, never block. */
	struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };

	for (;;) {
		struct device *i;
		fd_set fds = devices.infds;

		/* Nothing ready?  We're done. */
		if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0)
			break;

		for (i = devices.dev; i; i = i->next) {
			if (i->handle_input && FD_ISSET(i->fd, &fds)) {
				int dev_fd;
				if (i->handle_input(fd, i))
					continue;

				/* The handler gave up (e.g. no guest
				 * buffer): stop select()ing on this fd
				 * until the device re-enables it,
				 * otherwise we would spin. */
				FD_CLR(i->fd, &devices.infds);

				/* Tell the Waker too; removal is
				 * encoded as -fd - 1. */
				dev_fd = -i->fd - 1;
				write(waker_fd, &dev_fd, sizeof(dev_fd));
			}
		}
	}
}
977
978
979
980
981
982
983
984
985
986
987
/*
 * Carve the next device descriptor out of the guest-visible descriptor
 * page and set its type.  Dies if the page is full.
 */
static struct lguest_device_desc *new_dev_desc(u16 type)
{
	struct lguest_device_desc *d;

	/* Descriptors are packed end to end in the single page. */
	if (devices.desc_used + sizeof(*d) > getpagesize())
		errx(1, "Too many devices");

	d = (void *)devices.descpage + devices.desc_used;
	d->type = type;
	devices.desc_used += sizeof(*d);

	return d;
}
1003
1004
1005
1006
1007
1008
/*
 * Append a (type, len, data) config field to "dev"'s descriptor.  Only
 * valid for the most recently created device: its config space must
 * still be the last thing in the descriptor page.
 */
static void add_desc_field(struct device *dev, u8 type, u8 len, const void *c)
{
	/* Assert we really are appending to this device's config. */
	assert(devices.descpage + devices.desc_used
	       == (u8 *)(dev->desc + 1) + dev->desc->config_len);

	/* 2 bytes of type/len header plus the payload must fit. */
	if (devices.desc_used + 2 + len > getpagesize())
		errx(1, "Too many devices");

	/* Field layout: type byte, length byte, then the payload. */
	devices.descpage[devices.desc_used++] = type;
	devices.descpage[devices.desc_used++] = len;
	memcpy(devices.descpage + devices.desc_used, c, len);
	devices.desc_used += len;

	/* The guest finds fields via the descriptor's config_len. */
	dev->desc->config_len += 2 + len;
}
1028
1029
1030
1031static void add_virtqueue(struct device *dev, unsigned int num_descs,
1032 void (*handle_output)(int fd, struct virtqueue *me))
1033{
1034 unsigned int pages;
1035 struct virtqueue **i, *vq = malloc(sizeof(*vq));
1036 void *p;
1037
1038
1039 pages = (vring_size(num_descs, getpagesize()) + getpagesize() - 1)
1040 / getpagesize();
1041 p = get_pages(pages);
1042
1043
1044 vq->next = NULL;
1045 vq->last_avail_idx = 0;
1046 vq->dev = dev;
1047
1048
1049 vq->config.num = num_descs;
1050 vq->config.irq = devices.next_irq++;
1051 vq->config.pfn = to_guest_phys(p) / getpagesize();
1052
1053
1054 vring_init(&vq->vring, num_descs, p, getpagesize());
1055
1056
1057 add_desc_field(dev, VIRTIO_CONFIG_F_VIRTQUEUE,
1058 sizeof(vq->config), &vq->config);
1059
1060
1061
1062 for (i = &dev->vq; *i; i = &(*i)->next);
1063 *i = vq;
1064
1065
1066
1067 vq->handle_output = handle_output;
1068
1069
1070 if (!handle_output)
1071 vq->vring.used->flags = VRING_USED_F_NO_NOTIFY;
1072}
1073
1074
1075
1076static struct device *new_device(const char *name, u16 type, int fd,
1077 bool (*handle_input)(int, struct device *))
1078{
1079 struct device *dev = malloc(sizeof(*dev));
1080
1081
1082
1083
1084
1085 *devices.lastdev = dev;
1086 dev->next = NULL;
1087 devices.lastdev = &dev->next;
1088
1089
1090 dev->fd = fd;
1091
1092
1093 if (handle_input)
1094 add_device_fd(dev->fd);
1095 dev->desc = new_dev_desc(type);
1096 dev->handle_input = handle_input;
1097 dev->name = name;
1098 dev->vq = NULL;
1099 return dev;
1100}
1101
1102
1103
/*
 * Create the console device: stdin for input, stdout for output.  Puts
 * the terminal into raw-ish mode (no signals, line editing or echo)
 * and arranges to restore it at exit.
 */
static void setup_console(void)
{
	struct device *dev;

	/* Only fiddle with the terminal if stdin actually is one. */
	if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
		struct termios term = orig_term;

		/* The guest gets keystrokes completely raw. */
		term.c_lflag &= ~(ISIG|ICANON|ECHO);
		tcsetattr(STDIN_FILENO, TCSANOW, &term);

		/* Put the terminal back however we exit. */
		atexit(restore_term);
	}

	dev = new_device("console", VIRTIO_ID_CONSOLE,
			 STDIN_FILENO, handle_console_input);
	/* ^C-abort state.  NOTE(review): malloc() result unchecked. */
	dev->priv = malloc(sizeof(struct console_abort));
	((struct console_abort *)dev->priv)->count = 0;

	/* Queue 0 is input — enable_fd re-arms stdin when the guest
	 * provides buffers — and queue 1 is output. */
	add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
	add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output);

	verbose("device %u: console\n", devices.device_num++);
}
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153static u32 str2ip(const char *ipaddr)
1154{
1155 unsigned int byte[4];
1156
1157 sscanf(ipaddr, "%u.%u.%u.%u", &byte[0], &byte[1], &byte[2], &byte[3]);
1158 return (byte[0] << 24) | (byte[1] << 16) | (byte[2] << 8) | byte[3];
1159}
1160
1161
1162
1163
1164
1165
1166static void add_to_bridge(int fd, const char *if_name, const char *br_name)
1167{
1168 int ifidx;
1169 struct ifreq ifr;
1170
1171 if (!*br_name)
1172 errx(1, "must specify bridge name");
1173
1174 ifidx = if_nametoindex(if_name);
1175 if (!ifidx)
1176 errx(1, "interface %s does not exist!", if_name);
1177
1178 strncpy(ifr.ifr_name, br_name, IFNAMSIZ);
1179 ifr.ifr_ifindex = ifidx;
1180 if (ioctl(fd, SIOCBRADDIF, &ifr) < 0)
1181 err(1, "can't add %s to bridge %s", if_name, br_name);
1182}
1183
1184
1185
1186
/*
 * Give interface "devname" the address "ipaddr", bring it up, and
 * return its MAC address through "hwaddr".
 */
static void configure_device(int fd, const char *devname, u32 ipaddr,
			     unsigned char hwaddr[6])
{
	struct ifreq ifr;
	struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;

	/* NOTE(review): strcpy assumes devname fits in ifr_name
	 * (IFNAMSIZ) — true for the kernel-generated "tapN" names this
	 * file passes in; confirm if other callers appear. */
	memset(&ifr, 0, sizeof(ifr));
	strcpy(ifr.ifr_name, devname);
	sin->sin_family = AF_INET;
	sin->sin_addr.s_addr = htonl(ipaddr);
	if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
		err(1, "Setting %s interface address", devname);
	ifr.ifr_flags = IFF_UP;
	if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
		err(1, "Bringing interface %s up", devname);

	/* The caller wants the MAC to put in the guest's config space. */
	if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0)
		err(1, "getting hw address for %s", devname);
	memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
}
1211
1212
1213
1214
1215
/*
 * Create the network device: a host tap interface wired to the guest's
 * virtio-net device.  "arg" is either an IP address to assign to the
 * tap, or "bridge:<name>" to attach it to an existing bridge.
 */
static void setup_tun_net(const char *arg)
{
	struct device *dev;
	struct ifreq ifr;
	int netfd, ipfd;
	u32 ip;
	const char *br_name = NULL;
	u8 hwaddr[6];

	/* Ask the kernel for a tap device: raw ethernet frames,
	 * IFF_NO_PI so there's no extra packet-info header. */
	netfd = open_or_die("/dev/net/tun", O_RDWR);
	memset(&ifr, 0, sizeof(ifr));
	ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
	strcpy(ifr.ifr_name, "tap%d");
	if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
		err(1, "configuring /dev/net/tun");

	/* No checksums needed on this purely local link. */
	ioctl(netfd, TUNSETNOCSUM, 1);

	dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input);

	/* Queue 0 receives (enable_fd re-arms the tap fd when the guest
	 * adds buffers); queue 1 transmits. */
	add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
	add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output);

	/* An IP socket just for the configuration ioctls. */
	ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
	if (ipfd < 0)
		err(1, "opening IP socket");

	/* "bridge:<name>": join the bridge instead of assigning an IP. */
	if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
		ip = INADDR_ANY;
		br_name = arg + strlen(BRIDGE_PFX);
		add_to_bridge(ipfd, ifr.ifr_name, br_name);
	} else
		ip = str2ip(arg);

	/* Assign the address (INADDR_ANY for the bridge case) and fetch
	 * the tap's MAC address. */
	configure_device(ipfd, ifr.ifr_name, ip, hwaddr);

	/* Tell the guest its MAC via the config space. */
	add_desc_field(dev, VIRTIO_CONFIG_NET_MAC_F, sizeof(hwaddr), hwaddr);

	/* The configuration socket is no longer needed. */
	close(ipfd);

	verbose("device %u: tun net %u.%u.%u.%u\n",
		devices.device_num++,
		(u8)(ip>>24),(u8)(ip>>16),(u8)(ip>>8),(u8)ip);
	if (br_name)
		verbose("attached to bridge: %s\n", br_name);
}
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
/* Private state for the virtio block device. */
struct vblk_info
{
	/* Length of the backing file, in bytes. */
	off64_t len;

	/* Fd of the backing file. */
	int fd;

	/* Pipe poked to tell the I/O child there is work pending. */
	int workpipe[2];

	/* Pipe the I/O child pokes as each request completes; the
	 * Launcher watches its read end as the device fd. */
	int done_fd;
};
1303
1304
1305
1306
1307
1308
1309
1310
/*
 * Perform one pending block request, if any.  A request chain is a
 * virtio_blk_outhdr command, then the data buffers, then a
 * virtio_blk_inhdr whose status byte we fill in.  Returns true when a
 * request was serviced.  Runs in the cloned I/O child.
 */
static bool service_io(struct device *dev)
{
	struct vblk_info *vblk = dev->priv;
	unsigned int head, out_num, in_num, wlen;
	int ret;
	struct virtio_blk_inhdr *in;
	struct virtio_blk_outhdr *out;
	struct iovec iov[dev->vq->vring.num];
	off64_t off;

	head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
	if (head == dev->vq->vring.num)
		return false;

	/* Every request has at least the out header and in status. */
	if (out_num == 0 || in_num == 0)
		errx(1, "Bad virtblk cmd %u out=%u in=%u",
		     head, out_num, in_num);

	/* Command first, status byte last. */
	out = convert(&iov[0], struct virtio_blk_outhdr);
	in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr);
	off = out->sector * 512;

	/* Barrier request: flush everything written so far first. */
	if (out->type & VIRTIO_BLK_T_BARRIER)
		fdatasync(vblk->fd);

	if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
		/* No SCSI passthrough here. */
		fprintf(stderr, "Scsi commands unsupported\n");
		in->status = VIRTIO_BLK_S_UNSUPP;
		wlen = sizeof(*in);
	} else if (out->type & VIRTIO_BLK_T_OUT) {
		/* Write: the data sits in the remaining out buffers. */
		if (lseek64(vblk->fd, off, SEEK_SET) != off)
			err(1, "Bad seek to sector %llu", out->sector);

		ret = writev(vblk->fd, iov+1, out_num-1);
		verbose("WRITE to sector %llu: %i\n", out->sector, ret);

		/* Never let the guest grow the backing file: trim any
		 * overrun back and die. */
		if (ret > 0 && off + ret > vblk->len) {
			ftruncate64(vblk->fd, vblk->len);
			errx(1, "Write past end %llu+%u", off, ret);
		}
		wlen = sizeof(*in);
		in->status = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
	} else {
		/* Read: data goes into the in buffers before the status. */
		if (lseek64(vblk->fd, off, SEEK_SET) != off)
			err(1, "Bad seek to sector %llu", out->sector);

		ret = readv(vblk->fd, iov+1, in_num-1);
		verbose("READ from sector %llu: %i\n", out->sector, ret);
		if (ret >= 0) {
			wlen = sizeof(*in) + ret;
			in->status = VIRTIO_BLK_S_OK;
		} else {
			wlen = sizeof(*in);
			in->status = VIRTIO_BLK_S_IOERR;
		}
	}

	/* Mark the request done; the Launcher raises the interrupt when
	 * the child pokes the done pipe. */
	add_used(dev->vq, head, wlen);
	return true;
}
1396
1397
/*
 * Main loop of the block-I/O child (started with clone(CLONE_VM), so
 * it shares our memory and can touch the virtqueues directly).  Each
 * byte on the work pipe means "service the queue"; each completed
 * request is signalled on the done pipe.
 */
static int io_thread(void *_dev)
{
	struct device *dev = _dev;
	struct vblk_info *vblk = dev->priv;
	char c;

	/* We only read the work pipe... */
	close(vblk->workpipe[1]);
	/* ...and only write the done pipe (dev->fd is its read end). */
	close(dev->fd);

	/* EOF on the work pipe (Launcher gone) ends the loop. */
	while (read(vblk->workpipe[0], &c, 1) == 1) {
		/* Drain the whole queue, announcing each completion. */
		while (service_io(dev))
			write(vblk->done_fd, &c, 1);
	}
	return 0;
}
1419
1420
1421
/*
 * Runs in the Launcher when the I/O child pokes the done pipe:
 * interrupt the guest to announce the completed request.
 */
static bool handle_io_finish(int fd, struct device *dev)
{
	char c;

	/* Anything but one byte means the child died; nothing sensible
	 * left to do but exit. */
	if (read(dev->fd, &c, 1) != 1)
		exit(1);

	trigger_irq(fd, dev->vq);
	return true;
}
1435
1436
/* Guest notified the block queue: pass the word to the I/O child. */
static void handle_virtblk_output(int fd, struct virtqueue *vq)
{
	struct vblk_info *vblk = vq->dev->priv;
	char c = 0;

	/* A failed write means the child is gone: give up. */
	if (write(vblk->workpipe[1], &c, 1) != 1)
		exit(1);
}
1447
1448
/*
 * Create a virtio block device backed by "filename".  The actual disk
 * I/O is done by a cloned child (io_thread) fed through a work pipe,
 * so slow operations do not stall the guest.
 */
static void setup_block_file(const char *filename)
{
	int p[2];
	struct device *dev;
	struct vblk_info *vblk;
	void *stack;
	u64 cap;
	unsigned int val;

	/* Completion pipe: the child pokes it per finished request.
	 * NOTE(review): pipe() and malloc() results below are never
	 * checked. */
	pipe(p);

	/* The device's watched fd is the done pipe's read end. */
	dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish);

	/* One queue; guest notifications are relayed via the work pipe. */
	add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output);

	vblk = dev->priv = malloc(sizeof(*vblk));

	/* The backing file and its length. */
	vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
	vblk->len = lseek64(vblk->fd, 0, SEEK_END);

	/* Tell the guest the capacity in 512-byte sectors... */
	cap = cpu_to_le64(vblk->len / 512);
	add_desc_field(dev, VIRTIO_CONFIG_BLK_F_CAPACITY, sizeof(cap), &cap);

	/* ...and the maximum data segments per request (the ring size
	 * minus the two header descriptors). */
	val = cpu_to_le32(VIRTQUEUE_NUM - 2);
	add_desc_field(dev, VIRTIO_CONFIG_BLK_F_SEG_MAX, sizeof(val), &val);

	vblk->done_fd = p[1];

	/* Work pipe: Launcher writes, child reads. */
	pipe(vblk->workpipe);

	/* CLONE_VM: the child shares our memory, so it can service the
	 * virtqueues directly.  Stack grows down on x86, hence +32768. */
	stack = malloc(32768);
	if (clone(io_thread, stack + 32768, CLONE_VM, dev) == -1)
		err(1, "Creating clone");

	/* Close our copies of the child's pipe ends. */
	close(vblk->done_fd);
	close(vblk->workpipe[0]);

	/* NOTE(review): unlike the other devices, device_num is not
	 * incremented here — later devices reuse this number in their
	 * verbose output. */
	verbose("device %u: virtblock %llu sectors\n",
		devices.device_num, cap);
}
1502
1503
1504
1505
1506static void __attribute__((noreturn)) run_guest(int lguest_fd)
1507{
1508 for (;;) {
1509 unsigned long args[] = { LHREQ_BREAK, 0 };
1510 unsigned long notify_addr;
1511 int readval;
1512
1513
1514 readval = read(lguest_fd, ¬ify_addr, sizeof(notify_addr));
1515
1516
1517 if (readval == sizeof(notify_addr)) {
1518 verbose("Notify on address %#lx\n", notify_addr);
1519 handle_output(lguest_fd, notify_addr);
1520 continue;
1521
1522 } else if (errno == ENOENT) {
1523 char reason[1024] = { 0 };
1524 read(lguest_fd, reason, sizeof(reason)-1);
1525 errx(1, "%s", reason);
1526
1527
1528 } else if (errno != EAGAIN)
1529 err(1, "Running guest failed");
1530
1531
1532 handle_input(lguest_fd);
1533 if (write(lguest_fd, args, sizeof(args)) < 0)
1534 err(1, "Resetting break");
1535 }
1536}
1537
1538
1539
1540
1541
1542
1543
1544
1545
/* Long options; "v" is the only short option (see getopt_long below). */
static struct option opts[] = {
	{ "verbose", 0, NULL, 'v' },
	{ "tunnet", 1, NULL, 't' },
	{ "block", 1, NULL, 'b' },
	{ "initrd", 1, NULL, 'i' },
	{ NULL },
};
/* Print usage and exit with status 1. */
static void usage(void)
{
	errx(1, "Usage: lguest [--verbose] "
	     "[--tunnet=(<ipaddr>|bridge:<bridgename>)\n"
	     "|--block=<filename>|--initrd=<filename>]...\n"
	     "<mem-in-mb> vmlinux [args...]");
}
1560
1561
/*
 * Entry point: parse arguments, map guest memory, create devices, load
 * the kernel (and optional initrd), fill in the boot parameters, hand
 * the guest to /dev/lguest and run it forever.
 */
int main(int argc, char *argv[])
{
	/* Guest memory size, initial page directory, entry point and
	 * initrd size. */
	unsigned long mem = 0, pgdir, start, initrd_size = 0;
	/* Loop counter, option char and the /dev/lguest fd. */
	int i, c, lguest_fd;
	/* The guest's boot information, which lives at guest-physical 0. */
	struct boot_params *boot;
	/* --initrd filename, if given. */
	const char *initrd_name = NULL;

	/* Device-list bookkeeping: nothing watched yet, appends go to
	 * the list head, virtqueue interrupts start at 1. */
	FD_ZERO(&devices.infds);
	devices.max_infd = -1;
	devices.lastdev = &devices.dev;
	devices.next_irq = 1;

	/* The memory size is the first non-option argument.  Devices
	 * created while parsing options need guest memory already
	 * mapped, so scan ahead for it first. */
	for (i = 1; i < argc; i++) {
		if (argv[i][0] != '-') {
			mem = atoi(argv[i]) * 1024 * 1024;
			/* Map guest RAM plus the device region; the
			 * first device page holds the descriptors. */
			guest_base = map_zeroed_pages(mem / getpagesize()
						      + DEVICE_PAGES);
			guest_limit = mem;
			guest_max = mem + DEVICE_PAGES*getpagesize();
			devices.descpage = get_pages(1);
			break;
		}
	}

	/* Now parse the options proper. */
	while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) {
		switch (c) {
		case 'v':
			verbose = true;
			break;
		case 't':
			setup_tun_net(optarg);
			break;
		case 'b':
			setup_block_file(optarg);
			break;
		case 'i':
			initrd_name = optarg;
			break;
		default:
			warnx("Unknown argument %s", argv[optind]);
			usage();
		}
	}
	/* We need at least the memory size and a kernel image. */
	if (optind + 2 > argc)
		usage();

	verbose("Guest base is at %p\n", guest_base);

	/* The console device is always present. */
	setup_console();

	/* Load the kernel image; "start" is its entry point. */
	start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));

	/* Boot information lives at guest-physical address 0. */
	boot = from_guest_phys(0);

	/* Load the initrd at the top of memory, if requested, and tell
	 * the kernel where it is. */
	if (initrd_name) {
		initrd_size = load_initrd(initrd_name, mem);
		boot->hdr.ramdisk_image = mem - initrd_size;
		boot->hdr.ramdisk_size = initrd_size;
		/* Boot protocol: 0xFF means "unknown bootloader". */
		boot->hdr.type_of_loader = 0xFF;
	}

	/* Initial page tables go just below the initrd. */
	pgdir = setup_pagetables(mem, initrd_size);

	/* E820 memory map: a single region of plain RAM. */
	boot->e820_entries = 1;
	boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM });

	/* The kernel command line sits right after the boot header;
	 * everything past the kernel filename goes into it. */
	boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
	concat((char *)(boot + 1), argv+optind+2);

	/* Claim boot protocol 2.07, which has the fields we use. */
	boot->hdr.version = 0x207;

	/* hardware_subarch 1 identifies an lguest guest. */
	boot->hdr.hardware_subarch = 1;

	/* Tell the kernel not to reload segment registers at boot. */
	boot->hdr.loadflags |= KEEP_SEGMENTS;

	/* Hand everything to the lguest module; it returns the control
	 * fd we use to run the guest. */
	lguest_fd = tell_kernel(pgdir, start);

	/* Start the Waker before the guest runs. */
	waker_fd = setup_waker(lguest_fd);

	/* Run the guest; never returns. */
	run_guest(lguest_fd);
}
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698