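/*
 * The Launcher: the userspace half of lguest.  It loads the Guest kernel,
 * maps its memory, supplies virtio-over-PCI devices (console, net, block,
 * rng), and services them by talking to the lguest kernel module through
 * /dev/lguest.
 */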
#define _LARGEFILE64_SOURCE
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <err.h>
#include <stdint.h>
#include <stdlib.h>
#include <elf.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <sys/eventfd.h>
#include <fcntl.h>
#include <stdbool.h>
#include <errno.h>
#include <ctype.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <time.h>
#include <netinet/in.h>
#include <net/if.h>
#include <linux/sockios.h>
#include <linux/if_tun.h>
#include <sys/uio.h>
#include <termios.h>
#include <getopt.h>
#include <assert.h>
#include <sched.h>
#include <limits.h>
#include <stddef.h>
#include <signal.h>
#include <pwd.h>
#include <grp.h>
#include <sys/user.h>
#include <linux/pci_regs.h>

#ifndef VIRTIO_F_ANY_LAYOUT
#define VIRTIO_F_ANY_LAYOUT 27
#endif

typedef unsigned long long u64;
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;

#define VIRTIO_CONFIG_NO_LEGACY
#define VIRTIO_PCI_NO_LEGACY
#define VIRTIO_BLK_NO_LEGACY
#define VIRTIO_NET_NO_LEGACY

#include "../../include/uapi/linux/virtio_config.h"
#include "../../include/uapi/linux/virtio_net.h"
#include "../../include/uapi/linux/virtio_blk.h"
#include "../../include/uapi/linux/virtio_console.h"
#include "../../include/uapi/linux/virtio_rng.h"
#include <linux/virtio_ring.h>
#include "../../include/uapi/linux/virtio_pci.h"
#include <asm/bootparam.h>
#include "../../include/linux/lguest_launcher.h"

#define BRIDGE_PFX "bridge:"
#ifndef SIOCBRADDIF
#define SIOCBRADDIF 0x89a2
#endif

#define DEVICE_PAGES 256

#define VIRTQUEUE_NUM 256

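/*
 * "verbose" is both a flag and a macro: the macro only expands its printf()
 * when the flag is set.  Typical use, as seen throughout this file:
 *
 *	verbose("device %u: console\n", devices.device_num);
 */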
static bool verbose;
#define verbose(args...) \
	do { if (verbose) printf(args); } while(0)

static void *guest_base;

static unsigned long guest_limit, guest_max, guest_mmio;

static int lguest_fd;

static unsigned int __thread cpu_id;

#define MAX_PCI_DEVICES 32

struct device_list {
	unsigned int next_irq;

	unsigned int device_num;

	struct device *pci[MAX_PCI_DEVICES];
};

static struct device_list devices;

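/*
 * Like the standard virtio_pci_cfg_cap, but with the 4-byte data window
 * declared as a u32 so the config-window code below can treat it as a
 * single word.
 */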
struct virtio_pci_cfg_cap_u32 {
	struct virtio_pci_cap cap;
	u32 pci_cfg_data;
};

struct virtio_pci_mmio {
	struct virtio_pci_common_cfg cfg;
	u16 notify;
	u8 isr;
	u8 padding;
};

struct pci_config {
	u16 vendor_id, device_id;
	u16 command, status;
	u8 revid, prog_if, subclass, class;
	u8 cacheline_size, lat_timer, header_type, bist;
	u32 bar[6];
	u32 cardbus_cis_ptr;
	u16 subsystem_vendor_id, subsystem_device_id;
	u32 expansion_rom_addr;
	u8 capabilities, reserved1[3];
	u32 reserved2;
	u8 irq_line, irq_pin, min_grant, max_latency;

	struct virtio_pci_cap common;
	struct virtio_pci_notify_cap notify;
	struct virtio_pci_cap isr;
	struct virtio_pci_cap device;
	struct virtio_pci_cfg_cap_u32 cfg_access;
};

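/*
 * Everything we know about a device: its PCI config space, the MMIO region
 * backing BAR0, the feature bits it offers and the driver accepted, and
 * the list of virtqueues we service for it.
 */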
struct device {
	const char *name;

	struct virtqueue *vq;

	bool running;

	bool wrote_features_ok;

	union {
		struct pci_config config;
		u32 config_words[sizeof(struct pci_config) / sizeof(u32)];
	};

	u64 features, features_accepted;

	struct virtio_pci_mmio *mmio;

	size_t mmio_size;
	u32 mmio_addr;

	void *priv;
};

struct virtqueue {
	struct virtqueue *next;

	struct device *dev;

	const char *name;

	struct vring vring;

	struct virtio_pci_common_cfg pci_config;

	u16 last_avail_idx;

	unsigned int pending_used;

	int eventfd;

	void (*service)(struct virtqueue *vq);
	pid_t thread;
};

static char **main_args;

static struct termios orig_term;

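/*
 * Memory barriers, matching what the Guest's virtio ring code expects.
 * On x86 stores are already ordered, so wmb() only has to stop the
 * compiler from reordering; rmb() and mb() use a locked operation as a
 * full barrier.
 */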
#define wmb() __asm__ __volatile__("" : : : "memory")
#define rmb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory")
#define mb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory")

#define lg_last_avail(vq) ((vq)->last_avail_idx)

#define cpu_to_le16(v16) (v16)
#define cpu_to_le32(v32) (v32)
#define cpu_to_le64(v64) (v64)
#define le16_to_cpu(v16) (v16)
#define le32_to_cpu(v32) (v32)
#define le64_to_cpu(v64) (v64)

#define bad_driver(d, fmt, ...) \
	errx(1, "%s: bad driver: " fmt, (d)->name, ## __VA_ARGS__)
#define bad_driver_vq(vq, fmt, ...) \
	errx(1, "%s vq %s: bad driver: " fmt, (vq)->dev->name, \
	     vq->name, ## __VA_ARGS__)

static bool iov_empty(const struct iovec iov[], unsigned int num_iov)
{
	unsigned int i;

	for (i = 0; i < num_iov; i++)
		if (iov[i].iov_len)
			return false;
	return true;
}

static void iov_consume(struct device *d,
			struct iovec iov[], unsigned num_iov,
			void *dest, unsigned len)
{
	unsigned int i;

	for (i = 0; i < num_iov; i++) {
		unsigned int used;

		used = iov[i].iov_len < len ? iov[i].iov_len : len;
		if (dest) {
			memcpy(dest, iov[i].iov_base, used);
			dest += used;
		}
		iov[i].iov_base += used;
		iov[i].iov_len -= used;
		len -= used;
	}
	if (len != 0)
		bad_driver(d, "iovec too short!");
}

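/*
 * The Guest's physical memory is mapped contiguously at guest_base in our
 * address space, so converting between Guest-physical addresses and
 * Launcher pointers is just an offset:
 *
 *	void *p = from_guest_phys(0x100000);	   the Guest's 1M mark
 *	unsigned long gpa = to_guest_phys(p);	   and back again
 */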
static void *from_guest_phys(unsigned long addr)
{
	return guest_base + addr;
}

static unsigned long to_guest_phys(const void *addr)
{
	return (addr - guest_base);
}

static int open_or_die(const char *name, int flags)
{
	int fd = open(name, flags);
	if (fd < 0)
		err(1, "Failed to open %s", name);
	return fd;
}

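/*
 * Allocate "num" zeroed pages, with an extra PROT_NONE page on either side
 * so a stray access just off the end faults instead of silently corrupting
 * the Launcher.
 */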
static void *map_zeroed_pages(unsigned int num)
{
	int fd = open_or_die("/dev/zero", O_RDONLY);
	void *addr;

	addr = mmap(NULL, getpagesize() * (num+2),
		    PROT_NONE, MAP_PRIVATE, fd, 0);

	if (addr == MAP_FAILED)
		err(1, "Mmapping %u pages of /dev/zero", num);

	if (mprotect(addr + getpagesize(), getpagesize() * num,
		     PROT_READ|PROT_WRITE) == -1)
		err(1, "mprotect rw %u pages failed", num);

	close(fd);

	return addr + getpagesize();
}

static unsigned long get_mmio_region(size_t size)
{
	unsigned long addr = guest_mmio;
	size_t i;

	if (!size)
		return addr;

	for (i = 1; i < size; i <<= 1);

	guest_mmio += i;

	return addr;
}

static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
{
	ssize_t r;

	if (mmap(addr, len, PROT_READ|PROT_WRITE,
		 MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED)
		return;

	r = pread(fd, addr, len, offset);
	if (r != len)
		err(1, "Reading offset %lu len %lu gave %zi", offset, len, r);
}

static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
{
	Elf32_Phdr phdr[ehdr->e_phnum];
	unsigned int i;

	if (ehdr->e_type != ET_EXEC
	    || ehdr->e_machine != EM_386
	    || ehdr->e_phentsize != sizeof(Elf32_Phdr)
	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
		errx(1, "Malformed elf header");

	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
		err(1, "Seeking to program headers");
	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
		err(1, "Reading program headers");

	for (i = 0; i < ehdr->e_phnum; i++) {
		if (phdr[i].p_type != PT_LOAD)
			continue;

		verbose("Section %i: size %i addr %p\n",
			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);

		map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
		       phdr[i].p_offset, phdr[i].p_filesz);
	}

	return ehdr->e_entry;
}

static unsigned long load_bzimage(int fd)
{
	struct boot_params boot;
	int r;

	void *p = from_guest_phys(0x100000);

	lseek(fd, 0, SEEK_SET);
	read(fd, &boot, sizeof(boot));

	if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
		errx(1, "This doesn't look like a bzImage to me");

	lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);

	while ((r = read(fd, p, 65536)) > 0)
		p += r;

	return boot.hdr.code32_start;
}

static unsigned long load_kernel(int fd)
{
	Elf32_Ehdr hdr;

	if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
		err(1, "Reading kernel");

	if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
		return map_elf(fd, &hdr);

	return load_bzimage(fd);
}

static inline unsigned long page_align(unsigned long addr)
{
	return ((addr + getpagesize()-1) & ~(getpagesize()-1));
}

static unsigned long load_initrd(const char *name, unsigned long mem)
{
	int ifd;
	struct stat st;
	unsigned long len;

	ifd = open_or_die(name, O_RDONLY);

	if (fstat(ifd, &st) < 0)
		err(1, "fstat() on initrd '%s'", name);

	len = page_align(st.st_size);
	map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);

	close(ifd);
	verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void *)mem-len);

	return len;
}

static void concat(char *dst, char *args[])
{
	unsigned int i, len = 0;

	for (i = 0; args[i]; i++) {
		if (i) {
			strcat(dst+len, " ");
			len++;
		}
		strcpy(dst+len, args[i]);
		len += strlen(args[i]);
	}

	dst[len] = '\0';
}

static void tell_kernel(unsigned long start)
{
	unsigned long args[] = { LHREQ_INITIALIZE,
				 (unsigned long)guest_base,
				 guest_limit / getpagesize(), start,
				 (guest_mmio+getpagesize()-1) / getpagesize() };
	verbose("Guest: %p - %p (%#lx, MMIO %#lx)\n",
		guest_base, guest_base + guest_limit,
		guest_limit, guest_mmio);
	lguest_fd = open_or_die("/dev/lguest", O_RDWR);
	if (write(lguest_fd, args, sizeof(args)) < 0)
		err(1, "Writing to /dev/lguest");
}

static void *_check_pointer(struct device *d,
			    unsigned long addr, unsigned int size,
			    unsigned int line)
{
	if ((addr + size) > guest_limit || (addr + size) < addr)
		bad_driver(d, "%s:%i: Invalid address %#lx",
			   __FILE__, line, addr);

	return from_guest_phys(addr);
}

#define check_pointer(d,addr,size) _check_pointer(d, addr, size, __LINE__)

static unsigned next_desc(struct device *d, struct vring_desc *desc,
			  unsigned int i, unsigned int max)
{
	unsigned int next;

	if (!(desc[i].flags & VRING_DESC_F_NEXT))
		return max;

	next = desc[i].next;

	wmb();

	if (next >= max)
		bad_driver(d, "Desc next is %u", next);

	return next;
}

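/*
 * Tell the Guest a virtqueue has pending used buffers: set the ISR bit and
 * ask the kernel module (LHREQ_IRQ) to inject the device's interrupt,
 * unless the driver suppressed it with VRING_AVAIL_F_NO_INTERRUPT.
 */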
static void trigger_irq(struct virtqueue *vq)
{
	unsigned long buf[] = { LHREQ_IRQ, vq->dev->config.irq_line };

	if (!vq->pending_used)
		return;
	vq->pending_used = 0;

	if (vq->vring.avail->flags > 1)
		bad_driver_vq(vq, "avail->flags = %u\n",
			      vq->vring.avail->flags);

	if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) {
		return;
	}

	vq->dev->mmio->isr = 0x1;

	if (write(lguest_fd, buf, sizeof(buf)) != 0)
		err(1, "Triggering irq %i", vq->dev->config.irq_line);
}

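/*
 * The heart of a service thread: sleep until the driver publishes a new
 * entry in the available ring, then walk the descriptor chain it names,
 * collecting readable (out) descriptors before writable (in) ones:
 *
 *	avail->ring[last_avail % num]  ->  head descriptor index
 *	desc[head] -> desc[desc[head].next] -> ...  (or an indirect table)
 */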
static unsigned wait_for_vq_desc(struct virtqueue *vq,
				 struct iovec iov[],
				 unsigned int *out_num, unsigned int *in_num)
{
	unsigned int i, head, max;
	struct vring_desc *desc;
	u16 last_avail = lg_last_avail(vq);

	while (last_avail == vq->vring.avail->idx) {
		u64 event;

		trigger_irq(vq);

		vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;

		mb();
		if (last_avail != vq->vring.avail->idx) {
			vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
			break;
		}

		if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
			errx(1, "Event read failed?");

		vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
	}

	if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
		bad_driver_vq(vq, "Guest moved used index from %u to %u",
			      last_avail, vq->vring.avail->idx);

	rmb();

	head = vq->vring.avail->ring[last_avail % vq->vring.num];
	lg_last_avail(vq)++;

	if (head >= vq->vring.num)
		bad_driver_vq(vq, "Guest says index %u is available", head);

	*out_num = *in_num = 0;

	max = vq->vring.num;
	desc = vq->vring.desc;
	i = head;

	do {
		if (desc[i].flags & VRING_DESC_F_INDIRECT) {
			if (!(vq->dev->features_accepted &
			      (1<<VIRTIO_RING_F_INDIRECT_DESC)))
				bad_driver_vq(vq, "vq indirect not negotiated");

			if (desc != vq->vring.desc)
				bad_driver_vq(vq, "Indirect within indirect");

			if (desc[i].flags & VRING_DESC_F_NEXT)
				bad_driver_vq(vq, "indirect and next together");

			if (desc[i].len % sizeof(struct vring_desc))
				bad_driver_vq(vq,
					      "Invalid size for indirect table");

			max = desc[i].len / sizeof(struct vring_desc);
			desc = check_pointer(vq->dev, desc[i].addr, desc[i].len);
			i = 0;

			if (max > vq->pci_config.queue_size)
				bad_driver_vq(vq,
					      "indirect has too many entries");
		}

		iov[*out_num + *in_num].iov_len = desc[i].len;
		iov[*out_num + *in_num].iov_base
			= check_pointer(vq->dev, desc[i].addr, desc[i].len);

		if (desc[i].flags & VRING_DESC_F_WRITE)
			(*in_num)++;
		else {
			if (*in_num)
				bad_driver_vq(vq,
					      "Descriptor has out after in");
			(*out_num)++;
		}

		if (*out_num + *in_num > max)
			bad_driver_vq(vq, "Looped descriptor");
	} while ((i = next_desc(vq->dev, desc, i, max)) != max);

	return head;
}

static void add_used(struct virtqueue *vq, unsigned int head, int len)
{
	struct vring_used_elem *used;

	used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
	used->id = head;
	used->len = len;

	wmb();
	vq->vring.used->idx++;
	vq->pending_used++;
}

static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
{
	add_used(vq, head, len);
	trigger_irq(vq);
}

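/*
 * Console input also watches for the "three Ctrl-C's within a second"
 * escape: console_input() counts them here and SIGINTs the whole process
 * group when it sees the third.
 */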
struct console_abort {
	int count;

	struct timeval start;
};

static void console_input(struct virtqueue *vq)
{
	int len;
	unsigned int head, in_num, out_num;
	struct console_abort *abort = vq->dev->priv;
	struct iovec iov[vq->vring.num];

	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
	if (out_num)
		bad_driver_vq(vq, "Output buffers in console in queue?");

	len = readv(STDIN_FILENO, iov, in_num);
	if (len <= 0) {
		warnx("Failed to get console input, ignoring console.");

		for (;;)
			pause();
	}

	add_used_and_trigger(vq, head, len);

	if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) {
		abort->count = 0;
		return;
	}

	abort->count++;
	if (abort->count == 1)
		gettimeofday(&abort->start, NULL);
	else if (abort->count == 3) {
		struct timeval now;
		gettimeofday(&now, NULL);

		if (now.tv_sec <= abort->start.tv_sec+1)
			kill(0, SIGINT);
		abort->count = 0;
	}
}

static void console_output(struct virtqueue *vq)
{
	unsigned int head, out, in;
	struct iovec iov[vq->vring.num];

	head = wait_for_vq_desc(vq, iov, &out, &in);
	if (in)
		bad_driver_vq(vq, "Input buffers in console output queue?");

	while (!iov_empty(iov, out)) {
		int len = writev(STDOUT_FILENO, iov, out);
		if (len <= 0) {
			warn("Write to stdout gave %i (%d)", len, errno);
			break;
		}
		iov_consume(vq->dev, iov, out, NULL, len);
	}

	add_used(vq, head, 0);
}

struct net_info {
	int tunfd;
};

static void net_output(struct virtqueue *vq)
{
	struct net_info *net_info = vq->dev->priv;
	unsigned int head, out, in;
	struct iovec iov[vq->vring.num];

	head = wait_for_vq_desc(vq, iov, &out, &in);
	if (in)
		bad_driver_vq(vq, "Input buffers in net output queue?");

	if (writev(net_info->tunfd, iov, out) < 0)
		warnx("Write to tun failed (%d)?", errno);

	add_used(vq, head, 0);
}

static bool will_block(int fd)
{
	fd_set fdset;
	struct timeval zero = { 0, 0 };
	FD_ZERO(&fdset);
	FD_SET(fd, &fdset);
	return select(fd+1, &fdset, NULL, NULL, &zero) != 1;
}

static void net_input(struct virtqueue *vq)
{
	int len;
	unsigned int head, out, in;
	struct iovec iov[vq->vring.num];
	struct net_info *net_info = vq->dev->priv;

	head = wait_for_vq_desc(vq, iov, &out, &in);
	if (out)
		bad_driver_vq(vq, "Output buffers in net input queue?");

	if (vq->pending_used && will_block(net_info->tunfd))
		trigger_irq(vq);

	len = readv(net_info->tunfd, iov, in);
	if (len <= 0)
		warn("Failed to read from tun (%d).", errno);

	add_used(vq, head, len);
}

static int do_thread(void *_vq)
{
	struct virtqueue *vq = _vq;

	for (;;)
		vq->service(vq);
	return 0;
}

static void kill_launcher(int signal)
{
	kill(0, SIGTERM);
}

static void reset_vq_pci_config(struct virtqueue *vq)
{
	vq->pci_config.queue_size = VIRTQUEUE_NUM;
	vq->pci_config.queue_enable = 0;
}

static void reset_device(struct device *dev)
{
	struct virtqueue *vq;

	verbose("Resetting device %s\n", dev->name);

	dev->features_accepted = 0;

	signal(SIGCHLD, SIG_IGN);

	dev->mmio->cfg.queue_enable = 0;

	for (vq = dev->vq; vq; vq = vq->next) {
		vq->last_avail_idx = 0;
		reset_vq_pci_config(vq);
		if (vq->thread != (pid_t)-1) {
			kill(vq->thread, SIGTERM);
			waitpid(vq->thread, NULL, 0);
			vq->thread = (pid_t)-1;
		}
	}
	dev->running = false;
	dev->wrote_features_ok = false;

	signal(SIGCHLD, (void *)kill_launcher);
}

static void cleanup_devices(void)
{
	unsigned int i;

	for (i = 1; i < MAX_PCI_DEVICES; i++) {
		struct device *d = devices.pci[i];
		if (!d)
			continue;
		reset_device(d);
	}

	if (orig_term.c_lflag & (ISIG|ICANON|ECHO))
		tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
}

static struct device pci_host_bridge;

static void init_pci_host_bridge(void)
{
	pci_host_bridge.name = "PCI Host Bridge";
	pci_host_bridge.config.class = 0x06;
	pci_host_bridge.config.subclass = 0;
	devices.pci[0] = &pci_host_bridge;
}

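/*
 * Legacy PCI configuration access: the Guest writes an address to port
 * 0xCF8, then reads or writes the selected config dword through port
 * 0xCFC.  For example, reading the vendor/device IDs of device 2 on
 * bus 0 looks like this on the Guest side:
 *
 *	outl(0x80000000 | (2 << 11) | (0 << 2), 0xCF8);
 *	u32 ids = inl(0xCFC);
 */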
#define PCI_CONFIG_ADDR 0xCF8
#define PCI_CONFIG_DATA 0xCFC

union pci_config_addr {
	struct {
		unsigned mbz: 2;
		unsigned offset: 6;
		unsigned funcnum: 3;
		unsigned devnum: 5;
		unsigned busnum: 8;
		unsigned reserved: 7;
		unsigned enabled : 1;
	} bits;
	u32 val;
};

static union pci_config_addr pci_config_addr;

static struct device *find_pci_device(unsigned int index)
{
	return devices.pci[index];
}

static void ioread(u16 off, u32 v, u32 mask, u32 *val)
{
	assert(off < 4);
	assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF);
	*val = (v >> (off * 8)) & mask;
}

static void iowrite(u16 off, u32 v, u32 mask, u32 *dst)
{
	assert(off < 4);
	assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF);
	*dst &= ~(mask << (off * 8));
	*dst |= (v & mask) << (off * 8);
}

static struct device *dev_and_reg(u32 *reg)
{
	if (!pci_config_addr.bits.enabled)
		return NULL;

	if (pci_config_addr.bits.funcnum != 0)
		return NULL;

	if (pci_config_addr.bits.busnum != 0)
		return NULL;

	if (pci_config_addr.bits.offset * 4 >= sizeof(struct pci_config))
		return NULL;

	*reg = pci_config_addr.bits.offset;
	return find_pci_device(pci_config_addr.bits.devnum);
}

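/*
 * The VIRTIO_PCI_CAP_PCI_CFG capability is a window: the driver sets
 * bar/offset/length in the capability, then reads or writes the data
 * field to access that slice of BAR0.  We only allow naturally-aligned
 * 1, 2 or 4 byte accesses inside our MMIO region, as checked here.
 */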
static bool valid_bar_access(struct device *d,
			     struct virtio_pci_cfg_cap_u32 *cfg_access)
{
	if (cfg_access->cap.bar != 0)
		return false;

	if (cfg_access->cap.offset >= d->mmio_size
	    || cfg_access->cap.offset + cfg_access->cap.length > d->mmio_size)
		return false;

	if (cfg_access->cap.length != 1
	    && cfg_access->cap.length != 2
	    && cfg_access->cap.length != 4)
		return false;

	if (cfg_access->cap.offset % cfg_access->cap.length != 0)
		return false;

	return true;
}

static bool is_pci_addr_port(u16 port)
{
	return port >= PCI_CONFIG_ADDR && port < PCI_CONFIG_ADDR + 4;
}

static bool pci_addr_iowrite(u16 port, u32 mask, u32 val)
{
	iowrite(port - PCI_CONFIG_ADDR, val, mask,
		&pci_config_addr.val);
	verbose("PCI%s: %#x/%x: bus %u dev %u func %u reg %u\n",
		pci_config_addr.bits.enabled ? "" : " DISABLED",
		val, mask,
		pci_config_addr.bits.busnum,
		pci_config_addr.bits.devnum,
		pci_config_addr.bits.funcnum,
		pci_config_addr.bits.offset);
	return true;
}

static void pci_addr_ioread(u16 port, u32 mask, u32 *val)
{
	ioread(port - PCI_CONFIG_ADDR, pci_config_addr.val, mask, val);
}

static bool is_pci_data_port(u16 port)
{
	return port >= PCI_CONFIG_DATA && port < PCI_CONFIG_DATA + 4;
}

static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask);

static bool pci_data_iowrite(u16 port, u32 mask, u32 val)
{
	u32 reg, portoff;
	struct device *d = dev_and_reg(&reg);

	if (!d)
		return false;

	portoff = port - PCI_CONFIG_DATA;

	if (&d->config_words[reg] == &d->config.bar[0]) {
		int i;

		iowrite(portoff, val, mask, &d->config.bar[0]);
		for (i = 0; (1 << i) < d->mmio_size; i++)
			d->config.bar[0] &= ~(1 << i);
		return true;
	} else if ((&d->config_words[reg] > &d->config.bar[0]
		    && &d->config_words[reg] <= &d->config.bar[6])
		   || &d->config_words[reg] == &d->config.expansion_rom_addr) {
		iowrite(portoff, val, mask, &d->config_words[reg]);
		return true;
	} else if (&d->config_words[reg] == (void *)&d->config.cacheline_size) {
		if (mask == 0xFFFFFFFF)
			mask = 0xFFFF;
		iowrite(portoff, val, mask, &d->config_words[reg]);
		return true;
	} else if (&d->config_words[reg] == (void *)&d->config.command
		   && mask == 0xFFFF) {
		return true;
	} else if (&d->config_words[reg]
		   == (void *)&d->config.cfg_access.cap.bar
		   || &d->config_words[reg]
		   == &d->config.cfg_access.cap.length
		   || &d->config_words[reg]
		   == &d->config.cfg_access.cap.offset) {
		iowrite(portoff, val, mask, &d->config_words[reg]);
		return true;
	} else if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) {
		u32 write_mask;

		if (!valid_bar_access(d, &d->config.cfg_access))
			return false;

		iowrite(portoff, val, mask, &d->config.cfg_access.pci_cfg_data);

		write_mask = (1ULL<<(8*d->config.cfg_access.cap.length)) - 1;
		verbose("Window writing %#x/%#x to bar %u, offset %u len %u\n",
			d->config.cfg_access.pci_cfg_data, write_mask,
			d->config.cfg_access.cap.bar,
			d->config.cfg_access.cap.offset,
			d->config.cfg_access.cap.length);

		emulate_mmio_write(d, d->config.cfg_access.cap.offset,
				   d->config.cfg_access.pci_cfg_data,
				   write_mask);
		return true;
	}

	return false;
}

static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask);

static void pci_data_ioread(u16 port, u32 mask, u32 *val)
{
	u32 reg;
	struct device *d = dev_and_reg(&reg);

	if (!d)
		return;

	if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) {
		u32 read_mask;

		if (!valid_bar_access(d, &d->config.cfg_access))
			bad_driver(d,
				   "Invalid cfg_access to bar%u, offset %u len %u",
				   d->config.cfg_access.cap.bar,
				   d->config.cfg_access.cap.offset,
				   d->config.cfg_access.cap.length);

		read_mask = (1ULL<<(8*d->config.cfg_access.cap.length))-1;
		d->config.cfg_access.pci_cfg_data
			= emulate_mmio_read(d,
					    d->config.cfg_access.cap.offset,
					    read_mask);
		verbose("Window read %#x/%#x from bar %u, offset %u len %u\n",
			d->config.cfg_access.pci_cfg_data, read_mask,
			d->config.cfg_access.cap.bar,
			d->config.cfg_access.cap.offset,
			d->config.cfg_access.cap.length);
	}
	ioread(port - PCI_CONFIG_DATA, d->config_words[reg], mask, val);
}

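/*
 * The lguest module exposes the Guest's registers: LHREQ_GETREG and
 * LHREQ_SETREG name an offset into user_regs_struct, and pread()/pwrite()
 * of /dev/lguest at the vcpu's offset (cpu_id) do the transfer.
 */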
#define getreg(name) getreg_off(offsetof(struct user_regs_struct, name))
#define setreg(name, val) \
	setreg_off(offsetof(struct user_regs_struct, name), (val))

static u32 getreg_off(size_t offset)
{
	u32 r;
	unsigned long args[] = { LHREQ_GETREG, offset };

	if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
		err(1, "Getting register %u", offset);
	if (pread(lguest_fd, &r, sizeof(r), cpu_id) != sizeof(r))
		err(1, "Reading register %u", offset);

	return r;
}

static void setreg_off(size_t offset, u32 val)
{
	unsigned long args[] = { LHREQ_SETREG, offset, val };

	if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
		err(1, "Setting register %u", offset);
}

static u32 getreg_num(unsigned regnum, u32 mask)
{
	if (mask == 0xFF && (regnum & 0x4))
		return getreg_num(regnum & 0x3, 0xFFFF) >> 8;

	switch (regnum) {
	case 0: return getreg(eax) & mask;
	case 1: return getreg(ecx) & mask;
	case 2: return getreg(edx) & mask;
	case 3: return getreg(ebx) & mask;
	case 4: return getreg(esp) & mask;
	case 5: return getreg(ebp) & mask;
	case 6: return getreg(esi) & mask;
	case 7: return getreg(edi) & mask;
	}
	abort();
}

static void setreg_num(unsigned regnum, u32 val, u32 mask)
{
	assert(~(val & ~mask));

	if (mask == 0xFF && (regnum & 0x4)) {
		val = (val << 8) | getreg_num(regnum & 0x3, 0xFF);
		setreg_num(regnum & 0x3, val, 0xFFFF);
		return;
	}

	switch (regnum) {
	case 0: setreg(eax, val | (getreg(eax) & ~mask)); return;
	case 1: setreg(ecx, val | (getreg(ecx) & ~mask)); return;
	case 2: setreg(edx, val | (getreg(edx) & ~mask)); return;
	case 3: setreg(ebx, val | (getreg(ebx) & ~mask)); return;
	case 4: setreg(esp, val | (getreg(esp) & ~mask)); return;
	case 5: setreg(ebp, val | (getreg(ebp) & ~mask)); return;
	case 6: setreg(esi, val | (getreg(esi) & ~mask)); return;
	case 7: setreg(edi, val | (getreg(edi) & ~mask)); return;
	}
	abort();
}

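/*
 * Size of the displacement that follows a mod/reg/rm byte: none for
 * register-direct (mod 3), one byte for mod 1, four for mod 2, and for
 * mod 0 none except r/m == 5, which this code handles as a two-byte
 * displacement.
 */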
static u32 insn_displacement_len(u8 mod_reg_rm)
{
	switch (mod_reg_rm >> 6) {
	case 0:
		if ((mod_reg_rm & 0x7) == 0x5)
			return 2;

		return 0;
	case 1:
		return 1;
	case 2:
		return 4;
	case 3:
		return 0;
	}
	abort();
}

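/*
 * The Guest traps to us on I/O instructions.  We decode just enough of
 * the instruction stream to handle IN/OUT (0xE4/0xE6 with an immediate
 * port, 0xEC/0xEE using %dx), the 0x66 operand-size prefix, and the CLI
 * the kernel executes early in boot.
 */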
static void emulate_insn(const u8 insn[])
{
	unsigned long args[] = { LHREQ_TRAP, 13 };
	unsigned int insnlen = 0, in = 0, small_operand = 0, byte_access;
	unsigned int eax, port, mask;

	u32 val = 0xFFFFFFFF;

	if ((getreg(xcs) & 3) != 0x1)
		goto no_emulate;

	if (insn[insnlen] == 0xfa) {
		insnlen = 1;
		goto skip_insn;
	}

	if (insn[insnlen] == 0x66) {
		small_operand = 1;

		insnlen = 1;
	}

	byte_access = !(insn[insnlen] & 1);

	switch (insn[insnlen] & 0xFE) {
	case 0xE4:
		port = insn[insnlen+1];
		insnlen += 2;
		in = 1;
		break;
	case 0xEC:
		port = getreg(edx) & 0xFFFF;
		insnlen += 1;
		in = 1;
		break;
	case 0xE6:
		port = insn[insnlen+1];
		insnlen += 2;
		break;
	case 0xEE:
		port = getreg(edx) & 0xFFFF;
		insnlen += 1;
		break;
	default:
		goto no_emulate;
	}

	if (byte_access)
		mask = 0xFF;
	else if (small_operand)
		mask = 0xFFFF;
	else
		mask = 0xFFFFFFFF;

	eax = getreg(eax);

	if (in) {
		if (port == 0x64)
			val = 1;
		else if (is_pci_addr_port(port))
			pci_addr_ioread(port, mask, &val);
		else if (is_pci_data_port(port))
			pci_data_ioread(port, mask, &val);

		eax &= ~mask;

		eax |= val & mask;

		setreg(eax, eax);
	} else {
		if (is_pci_addr_port(port)) {
			if (!pci_addr_iowrite(port, mask, eax))
				goto bad_io;
		} else if (is_pci_data_port(port)) {
			if (!pci_data_iowrite(port, mask, eax))
				goto bad_io;
		}
	}

	verbose("IO %s of %x to %u: %#08x\n",
		in ? "IN" : "OUT", mask, port, eax);
skip_insn:
	setreg(eip, getreg(eip) + insnlen);
	return;

bad_io:
	warnx("Attempt to %s port %u (%#x mask)",
	      in ? "read from" : "write to", port, mask);

no_emulate:
	if (write(lguest_fd, args, sizeof(args)) < 0)
		err(1, "Reinjecting trap 13 for fault at %#x", getreg(eip));
}

static struct device *find_mmio_region(unsigned long paddr, u32 *off)
{
	unsigned int i;

	for (i = 1; i < MAX_PCI_DEVICES; i++) {
		struct device *d = devices.pci[i];

		if (!d)
			continue;
		if (paddr < d->mmio_addr)
			continue;
		if (paddr >= d->mmio_addr + d->mmio_size)
			continue;
		*off = paddr - d->mmio_addr;
		return d;
	}
	return NULL;
}

static struct virtqueue *vq_by_num(struct device *d, u32 num)
{
	struct virtqueue *vq = d->vq;

	while (num-- && vq)
		vq = vq->next;

	return vq;
}

static void save_vq_config(const struct virtio_pci_common_cfg *cfg,
			   struct virtqueue *vq)
{
	vq->pci_config = *cfg;
}

static void restore_vq_config(struct virtio_pci_common_cfg *cfg,
			      struct virtqueue *vq)
{
	size_t off = offsetof(struct virtio_pci_common_cfg, queue_size);

	memcpy((void *)cfg + off, (void *)&vq->pci_config + off,
	       sizeof(*cfg) - off);
}

static void check_virtqueue(struct device *d, struct virtqueue *vq)
{
	if (vq->pci_config.queue_desc_hi
	    || vq->pci_config.queue_avail_hi
	    || vq->pci_config.queue_used_hi)
		bad_driver_vq(vq, "invalid 64-bit queue address");

	if (vq->pci_config.queue_desc_lo % 16
	    || vq->pci_config.queue_avail_lo % 2
	    || vq->pci_config.queue_used_lo % 4)
		bad_driver_vq(vq, "invalid alignment in queue addresses");

	vq->vring.num = vq->pci_config.queue_size;
	vq->vring.desc = check_pointer(vq->dev,
				       vq->pci_config.queue_desc_lo,
				       sizeof(*vq->vring.desc) * vq->vring.num);
	vq->vring.avail = check_pointer(vq->dev,
					vq->pci_config.queue_avail_lo,
					sizeof(*vq->vring.avail)
					+ (sizeof(vq->vring.avail->ring[0])
					   * vq->vring.num));
	vq->vring.used = check_pointer(vq->dev,
				       vq->pci_config.queue_used_lo,
				       sizeof(*vq->vring.used)
				       + (sizeof(vq->vring.used->ring[0])
					  * vq->vring.num));

	if (vq->vring.used->flags != 0)
		bad_driver_vq(vq, "invalid initial used.flags %#x",
			      vq->vring.used->flags);
}

static void start_virtqueue(struct virtqueue *vq)
{
	char *stack = malloc(32768);

	vq->eventfd = eventfd(0, 0);
	if (vq->eventfd < 0)
		err(1, "Creating eventfd");

	vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
	if (vq->thread == (pid_t)-1)
		err(1, "Creating clone");
}

static void start_virtqueues(struct device *d)
{
	struct virtqueue *vq;

	for (vq = d->vq; vq; vq = vq->next) {
		if (vq->pci_config.queue_enable)
			start_virtqueue(vq);
	}
}

static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask)
{
	struct virtqueue *vq;

	switch (off) {
	case offsetof(struct virtio_pci_mmio, cfg.device_feature_select):
		if (val == 0)
			d->mmio->cfg.device_feature = d->features;
		else if (val == 1)
			d->mmio->cfg.device_feature = (d->features >> 32);
		else
			d->mmio->cfg.device_feature = 0;
		goto feature_write_through32;
	case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select):
		if (val > 1)
			bad_driver(d, "Unexpected driver select %u", val);
		goto feature_write_through32;
	case offsetof(struct virtio_pci_mmio, cfg.guest_feature):
		if (d->mmio->cfg.guest_feature_select == 0) {
			d->features_accepted &= ~((u64)0xFFFFFFFF);
			d->features_accepted |= val;
		} else {
			assert(d->mmio->cfg.guest_feature_select == 1);
			d->features_accepted &= 0xFFFFFFFF;
			d->features_accepted |= ((u64)val) << 32;
		}

		if (d->features_accepted & ~d->features)
			bad_driver(d, "over-accepted features %#llx of %#llx",
				   d->features_accepted, d->features);
		goto feature_write_through32;
	case offsetof(struct virtio_pci_mmio, cfg.device_status): {
		u8 prev;

		verbose("%s: device status -> %#x\n", d->name, val);

		if (val == 0) {
			reset_device(d);
			goto write_through8;
		}

		if (d->mmio->cfg.device_status & ~val)
			bad_driver(d, "unset of device status bit %#x -> %#x",
				   d->mmio->cfg.device_status, val);

		if (val & VIRTIO_CONFIG_S_DRIVER_OK
		    && !(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK))
			start_virtqueues(d);

		prev = 0;
		switch (val & ~d->mmio->cfg.device_status) {
		case VIRTIO_CONFIG_S_DRIVER_OK:
			prev |= VIRTIO_CONFIG_S_FEATURES_OK;
		case VIRTIO_CONFIG_S_FEATURES_OK:
			prev |= VIRTIO_CONFIG_S_DRIVER;
		case VIRTIO_CONFIG_S_DRIVER:
			prev |= VIRTIO_CONFIG_S_ACKNOWLEDGE;
		case VIRTIO_CONFIG_S_ACKNOWLEDGE:
			break;
		default:
			bad_driver(d, "unknown device status bit %#x -> %#x",
				   d->mmio->cfg.device_status, val);
		}
		if (d->mmio->cfg.device_status != prev)
			bad_driver(d, "unexpected status transition %#x -> %#x",
				   d->mmio->cfg.device_status, val);

		switch (val & ~d->mmio->cfg.device_status) {
		case VIRTIO_CONFIG_S_FEATURES_OK:
			d->wrote_features_ok = true;
			break;
		case VIRTIO_CONFIG_S_DRIVER_OK:
			if (d->wrote_features_ok)
				bad_driver(d, "did not re-read FEATURES_OK");
			break;
		}
		goto write_through8;
	}
	case offsetof(struct virtio_pci_mmio, cfg.queue_select):
		vq = vq_by_num(d, val);

		if (!vq) {
			d->mmio->cfg.queue_size = 0;
			goto write_through16;
		}

		if (d->mmio->cfg.queue_size)
			save_vq_config(&d->mmio->cfg,
				       vq_by_num(d, d->mmio->cfg.queue_select));

		restore_vq_config(&d->mmio->cfg, vq);
		goto write_through16;
	case offsetof(struct virtio_pci_mmio, cfg.queue_size):
		if (val & (val-1))
			bad_driver(d, "invalid queue size %u", val);
		if (d->mmio->cfg.queue_enable)
			bad_driver(d, "changing queue size on live device");
		goto write_through16;
	case offsetof(struct virtio_pci_mmio, cfg.queue_msix_vector):
		bad_driver(d, "attempt to set MSIX vector to %u", val);
	case offsetof(struct virtio_pci_mmio, cfg.queue_enable): {
		struct virtqueue *vq = vq_by_num(d, d->mmio->cfg.queue_select);

		if (val != 1)
			bad_driver(d, "setting queue_enable to %u", val);

		if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK)
			bad_driver(d, "enabling vq after DRIVER_OK");

		d->mmio->cfg.queue_enable = val;
		save_vq_config(&d->mmio->cfg, vq);
		check_virtqueue(d, vq);
		goto write_through16;
	}
	case offsetof(struct virtio_pci_mmio, cfg.queue_notify_off):
		bad_driver(d, "attempt to write to queue_notify_off");
	case offsetof(struct virtio_pci_mmio, cfg.queue_desc_lo):
	case offsetof(struct virtio_pci_mmio, cfg.queue_desc_hi):
	case offsetof(struct virtio_pci_mmio, cfg.queue_avail_lo):
	case offsetof(struct virtio_pci_mmio, cfg.queue_avail_hi):
	case offsetof(struct virtio_pci_mmio, cfg.queue_used_lo):
	case offsetof(struct virtio_pci_mmio, cfg.queue_used_hi):
		if (d->mmio->cfg.queue_enable)
			bad_driver(d, "changing queue on live device");

		if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK))
			bad_driver(d, "setting up vq before FEATURES_OK");

		if (d->wrote_features_ok)
			bad_driver(d, "didn't re-read FEATURES_OK before setup");

		goto write_through32;
	case offsetof(struct virtio_pci_mmio, notify):
		vq = vq_by_num(d, val);
		if (!vq)
			bad_driver(d, "Invalid vq notification on %u", val);

		write(vq->eventfd, "\1\0\0\0\0\0\0\0", 8);
		goto write_through16;
	case offsetof(struct virtio_pci_mmio, isr):
		bad_driver(d, "Unexpected write to isr");

	case sizeof(struct virtio_pci_mmio)
	     + offsetof(struct virtio_console_config, emerg_wr):
		if (strcmp(d->name, "console") == 0) {
			char c = val;
			write(STDOUT_FILENO, &c, 1);
			goto write_through32;
		}

	default:
		bad_driver(d, "Unexpected write to offset %u", off);
	}

feature_write_through32:
	if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
		bad_driver(d, "feature write before VIRTIO_CONFIG_S_DRIVER");
	if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK)
		bad_driver(d, "feature write after VIRTIO_CONFIG_S_FEATURES_OK");

write_through32:
	if (mask != 0xFFFFFFFF) {
		bad_driver(d, "non-32-bit write to offset %u (%#x)",
			   off, getreg(eip));
		return;
	}
	memcpy((char *)d->mmio + off, &val, 4);
	return;

write_through16:
	if (mask != 0xFFFF)
		bad_driver(d, "non-16-bit write to offset %u (%#x)",
			   off, getreg(eip));
	memcpy((char *)d->mmio + off, &val, 2);
	return;

write_through8:
	if (mask != 0xFF)
		bad_driver(d, "non-8-bit write to offset %u (%#x)",
			   off, getreg(eip));
	memcpy((char *)d->mmio + off, &val, 1);
	return;
}

static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask)
{
	u8 isr;
	u32 val = 0;

	switch (off) {
	case offsetof(struct virtio_pci_mmio, cfg.device_feature_select):
	case offsetof(struct virtio_pci_mmio, cfg.device_feature):
	case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select):
	case offsetof(struct virtio_pci_mmio, cfg.guest_feature):
		if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
			bad_driver(d,
				   "feature read before VIRTIO_CONFIG_S_DRIVER");
		goto read_through32;
	case offsetof(struct virtio_pci_mmio, cfg.msix_config):
		bad_driver(d, "read of msix_config");
	case offsetof(struct virtio_pci_mmio, cfg.num_queues):
		goto read_through16;
	case offsetof(struct virtio_pci_mmio, cfg.device_status):
		d->wrote_features_ok = false;
		goto read_through8;
	case offsetof(struct virtio_pci_mmio, cfg.config_generation):
		goto read_through8;
	case offsetof(struct virtio_pci_mmio, notify):
		if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK))
			bad_driver(d, "notify before VIRTIO_CONFIG_S_DRIVER_OK");
		goto read_through16;
	case offsetof(struct virtio_pci_mmio, isr):
		if (mask != 0xFF)
			bad_driver(d, "non-8-bit read from offset %u (%#x)",
				   off, getreg(eip));
		isr = d->mmio->isr;

		d->mmio->isr = 0;
		return isr;
	case offsetof(struct virtio_pci_mmio, padding):
		bad_driver(d, "read from padding (%#x)", getreg(eip));
	default:
		if (off > d->mmio_size - 4)
			bad_driver(d, "read past end (%#x)", getreg(eip));

		if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
			bad_driver(d,
				   "config read before VIRTIO_CONFIG_S_DRIVER");

		if (mask == 0xFFFFFFFF)
			goto read_through32;
		else if (mask == 0xFFFF)
			goto read_through16;
		else
			goto read_through8;
	}

read_through32:
	if (mask != 0xFFFFFFFF)
		bad_driver(d, "non-32-bit read to offset %u (%#x)",
			   off, getreg(eip));
	memcpy(&val, (char *)d->mmio + off, 4);
	return val;

read_through16:
	if (mask != 0xFFFF)
		bad_driver(d, "non-16-bit read to offset %u (%#x)",
			   off, getreg(eip));
	memcpy(&val, (char *)d->mmio + off, 2);
	return val;

read_through8:
	if (mask != 0xFF)
		bad_driver(d, "non-8-bit read to offset %u (%#x)",
			   off, getreg(eip));
	memcpy(&val, (char *)d->mmio + off, 1);
	return val;
}

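/*
 * Page faults on a device's BAR land here.  We decode the four mov forms
 * we expect the Guest to use (0x88/0x89 stores, 0x8a/0x8b loads, with an
 * optional 0x66 prefix for 16-bit operands) and forward them to
 * emulate_mmio_read()/emulate_mmio_write().
 */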
static void emulate_mmio(unsigned long paddr, const u8 *insn)
{
	u32 val, off, mask = 0xFFFFFFFF, insnlen = 0;
	struct device *d = find_mmio_region(paddr, &off);
	unsigned long args[] = { LHREQ_TRAP, 14 };

	if (!d) {
		warnx("MMIO touching %#08lx (not a device)", paddr);
		goto reinject;
	}

	if (insn[0] == 0x66) {
		mask = 0xFFFF;
		insnlen++;
	}

	if (insn[insnlen] == 0x89) {
		val = getreg_num((insn[insnlen+1] >> 3) & 0x7, mask);
		emulate_mmio_write(d, off, val, mask);
		insnlen += 2 + insn_displacement_len(insn[insnlen+1]);
	} else if (insn[insnlen] == 0x8b) {
		val = emulate_mmio_read(d, off, mask);
		setreg_num((insn[insnlen+1] >> 3) & 0x7, val, mask);
		insnlen += 2 + insn_displacement_len(insn[insnlen+1]);
	} else if (insn[0] == 0x88) {
		mask = 0xff;
		val = getreg_num((insn[1] >> 3) & 0x7, mask);
		emulate_mmio_write(d, off, val, mask);
		insnlen = 2 + insn_displacement_len(insn[1]);
	} else if (insn[0] == 0x8a) {
		mask = 0xff;
		val = emulate_mmio_read(d, off, mask);
		setreg_num((insn[1] >> 3) & 0x7, val, mask);
		insnlen = 2 + insn_displacement_len(insn[1]);
	} else {
		warnx("Unknown MMIO instruction touching %#08lx:"
		      " %02x %02x %02x %02x at %u",
		      paddr, insn[0], insn[1], insn[2], insn[3], getreg(eip));
	reinject:
		if (write(lguest_fd, args, sizeof(args)) < 0)
			err(1, "Reinjecting trap 14 for fault at %#x",
			    getreg(eip));
		return;
	}

	setreg(eip, getreg(eip) + insnlen);
}

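/*
 * Device setup helpers.  A typical device is assembled like this (see
 * setup_console() below):
 *
 *	dev = new_pci_device("console", VIRTIO_ID_CONSOLE, 0x07, 0x00);
 *	add_pci_virtqueue(dev, console_input, "input");
 *	add_pci_feature(dev, VIRTIO_CONSOLE_F_EMERG_WRITE);
 *	set_device_config(dev, &conf, sizeof(conf));
 */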
static void add_pci_virtqueue(struct device *dev,
			      void (*service)(struct virtqueue *),
			      const char *name)
{
	struct virtqueue **i, *vq = malloc(sizeof(*vq));

	vq->next = NULL;
	vq->last_avail_idx = 0;
	vq->dev = dev;
	vq->name = name;

	vq->service = service;
	vq->thread = (pid_t)-1;

	reset_vq_pci_config(vq);
	vq->pci_config.queue_notify_off = 0;

	vq->dev->mmio->cfg.num_queues++;

	for (i = &dev->vq; *i; i = &(*i)->next);
	*i = vq;
}

static void add_pci_feature(struct device *dev, unsigned bit)
{
	dev->features |= (1ULL << bit);
}

static void no_device_config(struct device *dev)
{
	dev->mmio_addr = get_mmio_region(dev->mmio_size);

	dev->config.bar[0] = dev->mmio_addr;

	assert(~(dev->config.bar[0] & 0xF));
}

static void set_device_config(struct device *dev, const void *conf, size_t len)
{
	dev->mmio_size += len;
	dev->mmio = realloc(dev->mmio, dev->mmio_size);
	memcpy(dev->mmio + 1, conf, len);

	dev->config.cfg_access.cap.cap_next
		= offsetof(struct pci_config, device);

	assert(dev->config.cfg_access.cap.cap_next % 4 == 0);

	dev->config.device.length = len;

	no_device_config(dev);
}

static void init_cap(struct virtio_pci_cap *cap, size_t caplen, int type,
		     size_t bar_offset, size_t bar_bytes, u8 next)
{
	cap->cap_vndr = PCI_CAP_ID_VNDR;
	cap->cap_next = next;
	cap->cap_len = caplen;
	cap->cfg_type = type;
	cap->bar = 0;
	memset(cap->padding, 0, sizeof(cap->padding));
	cap->offset = bar_offset;
	cap->length = bar_bytes;
}

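/*
 * Lay out a virtio-1.0 PCI config space: vendor 0x1AF4 (Red Hat), device
 * ID 0x1040 + virtio type, and a chain of vendor capabilities (common,
 * notify, isr, cfg_access, device) whose offsets into BAR0 mirror the
 * layout of struct virtio_pci_mmio.
 */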
static void init_pci_config(struct pci_config *pci, u16 type,
			    u8 class, u8 subclass)
{
	size_t bar_offset, bar_len;

	memset(pci, 0, sizeof(*pci));

	pci->vendor_id = 0x1AF4;

	pci->device_id = 0x1040 + type;

	pci->class = class;
	pci->subclass = subclass;

	pci->revid = 1;

	pci->subsystem_device_id = 0x40;

	pci->irq_line = devices.next_irq++;
	pci->irq_pin = 0;

	pci->status = (1 << 4);

	pci->capabilities = offsetof(struct pci_config, common);

	assert(pci->capabilities % 4 == 0);

	bar_offset = offsetof(struct virtio_pci_mmio, cfg);
	bar_len = sizeof(((struct virtio_pci_mmio *)0)->cfg);
	init_cap(&pci->common, sizeof(pci->common), VIRTIO_PCI_CAP_COMMON_CFG,
		 bar_offset, bar_len,
		 offsetof(struct pci_config, notify));

	bar_offset += bar_len;
	bar_len = sizeof(((struct virtio_pci_mmio *)0)->notify);

	assert(pci->common.cap_next % 2 == 0);

	assert(bar_len >= 2);

	init_cap(&pci->notify.cap, sizeof(pci->notify),
		 VIRTIO_PCI_CAP_NOTIFY_CFG,
		 bar_offset, bar_len,
		 offsetof(struct pci_config, isr));

	bar_offset += bar_len;
	bar_len = sizeof(((struct virtio_pci_mmio *)0)->isr);

	init_cap(&pci->isr, sizeof(pci->isr),
		 VIRTIO_PCI_CAP_ISR_CFG,
		 bar_offset, bar_len,
		 offsetof(struct pci_config, cfg_access));

	init_cap(&pci->cfg_access.cap, sizeof(pci->cfg_access),
		 VIRTIO_PCI_CAP_PCI_CFG,
		 0, 0, 0);

	bar_offset += bar_len + sizeof(((struct virtio_pci_mmio *)0)->padding);
	assert(bar_offset == sizeof(struct virtio_pci_mmio));

	init_cap(&pci->device, sizeof(pci->device), VIRTIO_PCI_CAP_DEVICE_CFG,
		 bar_offset, 0, 0);
}

static struct device *new_pci_device(const char *name, u16 type,
				     u8 class, u8 subclass)
{
	struct device *dev = malloc(sizeof(*dev));

	dev->name = name;
	dev->vq = NULL;
	dev->running = false;
	dev->wrote_features_ok = false;
	dev->mmio_size = sizeof(struct virtio_pci_mmio);
	dev->mmio = calloc(1, dev->mmio_size);
	dev->features = (u64)1 << VIRTIO_F_VERSION_1;
	dev->features_accepted = 0;

	if (devices.device_num + 1 >= MAX_PCI_DEVICES)
		errx(1, "Can only handle 31 PCI devices");

	init_pci_config(&dev->config, type, class, subclass);
	assert(!devices.pci[devices.device_num+1]);
	devices.pci[++devices.device_num] = dev;

	return dev;
}

static void setup_console(void)
{
	struct device *dev;
	struct virtio_console_config conf;

	if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
		struct termios term = orig_term;

		term.c_lflag &= ~(ISIG|ICANON|ECHO);
		tcsetattr(STDIN_FILENO, TCSANOW, &term);
	}

	dev = new_pci_device("console", VIRTIO_ID_CONSOLE, 0x07, 0x00);

	dev->priv = malloc(sizeof(struct console_abort));
	((struct console_abort *)dev->priv)->count = 0;

	add_pci_virtqueue(dev, console_input, "input");
	add_pci_virtqueue(dev, console_output, "output");

	add_pci_feature(dev, VIRTIO_CONSOLE_F_EMERG_WRITE);
	set_device_config(dev, &conf, sizeof(conf));

	verbose("device %u: console\n", devices.device_num);
}

static u32 str2ip(const char *ipaddr)
{
	unsigned int b[4];

	if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4)
		errx(1, "Failed to parse IP address '%s'", ipaddr);
	return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
}

static void str2mac(const char *macaddr, unsigned char mac[6])
{
	unsigned int m[6];
	if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x",
		   &m[0], &m[1], &m[2], &m[3], &m[4], &m[5]) != 6)
		errx(1, "Failed to parse mac address '%s'", macaddr);
	mac[0] = m[0];
	mac[1] = m[1];
	mac[2] = m[2];
	mac[3] = m[3];
	mac[4] = m[4];
	mac[5] = m[5];
}

static void add_to_bridge(int fd, const char *if_name, const char *br_name)
{
	int ifidx;
	struct ifreq ifr;

	if (!*br_name)
		errx(1, "must specify bridge name");

	ifidx = if_nametoindex(if_name);
	if (!ifidx)
		errx(1, "interface %s does not exist!", if_name);

	strncpy(ifr.ifr_name, br_name, IFNAMSIZ);
	ifr.ifr_name[IFNAMSIZ-1] = '\0';
	ifr.ifr_ifindex = ifidx;
	if (ioctl(fd, SIOCBRADDIF, &ifr) < 0)
		err(1, "can't add %s to bridge %s", if_name, br_name);
}

static void configure_device(int fd, const char *tapif, u32 ipaddr)
{
	struct ifreq ifr;
	struct sockaddr_in sin;

	memset(&ifr, 0, sizeof(ifr));
	strcpy(ifr.ifr_name, tapif);

	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = htonl(ipaddr);
	memcpy(&ifr.ifr_addr, &sin, sizeof(sin));
	if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
		err(1, "Setting %s interface address", tapif);
	ifr.ifr_flags = IFF_UP;
	if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
		err(1, "Bringing interface %s up", tapif);
}

static int get_tun_device(char tapif[IFNAMSIZ])
{
	struct ifreq ifr;
	int vnet_hdr_sz;
	int netfd;

	memset(&ifr, 0, sizeof(ifr));

	netfd = open_or_die("/dev/net/tun", O_RDWR);
	ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
	strcpy(ifr.ifr_name, "tap%d");
	if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
		err(1, "configuring /dev/net/tun");

	if (ioctl(netfd, TUNSETOFFLOAD,
		  TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0)
		err(1, "Could not set features for tun device");

	ioctl(netfd, TUNSETNOCSUM, 1);

	vnet_hdr_sz = sizeof(struct virtio_net_hdr_v1);
	if (ioctl(netfd, TUNSETVNETHDRSZ, &vnet_hdr_sz) != 0)
		err(1, "Setting tun header size to %u", vnet_hdr_sz);

	memcpy(tapif, ifr.ifr_name, IFNAMSIZ);
	return netfd;
}

static void setup_tun_net(char *arg)
{
	struct device *dev;
	struct net_info *net_info = malloc(sizeof(*net_info));
	int ipfd;
	u32 ip = INADDR_ANY;
	bool bridging = false;
	char tapif[IFNAMSIZ], *p;
	struct virtio_net_config conf;

	net_info->tunfd = get_tun_device(tapif);

	dev = new_pci_device("net", VIRTIO_ID_NET, 0x02, 0x00);
	dev->priv = net_info;

	add_pci_virtqueue(dev, net_input, "rx");
	add_pci_virtqueue(dev, net_output, "tx");

	ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
	if (ipfd < 0)
		err(1, "opening IP socket");

	if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
		arg += strlen(BRIDGE_PFX);
		bridging = true;
	}

	p = strchr(arg, ':');
	if (p) {
		str2mac(p+1, conf.mac);
		add_pci_feature(dev, VIRTIO_NET_F_MAC);
		*p = '\0';
	}

	if (bridging)
		add_to_bridge(ipfd, tapif, arg);
	else
		ip = str2ip(arg);

	configure_device(ipfd, tapif, ip);

	add_pci_feature(dev, VIRTIO_NET_F_CSUM);
	add_pci_feature(dev, VIRTIO_NET_F_GUEST_CSUM);
	add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO4);
	add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO6);
	add_pci_feature(dev, VIRTIO_NET_F_GUEST_ECN);
	add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO4);
	add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO6);
	add_pci_feature(dev, VIRTIO_NET_F_HOST_ECN);

	add_pci_feature(dev, VIRTIO_RING_F_INDIRECT_DESC);
	set_device_config(dev, &conf, sizeof(conf));

	close(ipfd);

	if (bridging)
		verbose("device %u: tun %s attached to bridge: %s\n",
			devices.device_num, tapif, arg);
	else
		verbose("device %u: tun %s: %s\n",
			devices.device_num, tapif, arg);
}

struct vblk_info {
	off64_t len;

	int fd;
};

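/*
 * A block request is a chain of descriptors: a read-only
 * virtio_blk_outhdr (type and sector), then the data buffers, and
 * finally a single writable status byte, which we find by walking back
 * from the end of the chain.
 */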
2938static void blk_request(struct virtqueue *vq)
2939{
2940 struct vblk_info *vblk = vq->dev->priv;
2941 unsigned int head, out_num, in_num, wlen;
2942 int ret, i;
2943 u8 *in;
2944 struct virtio_blk_outhdr out;
2945 struct iovec iov[vq->vring.num];
2946 off64_t off;
2947
2948
2949
2950
2951
	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);

	/* Copy the output header from the front of the iov (adjusts iov). */
	iov_consume(vq->dev, iov, out_num, &out, sizeof(out));

	/* Find and trim end of iov input array, for our status byte. */
	in = NULL;
	for (i = out_num + in_num - 1; i >= out_num; i--) {
		if (iov[i].iov_len > 0) {
			in = iov[i].iov_base + iov[i].iov_len - 1;
			iov[i].iov_len--;
			break;
		}
	}
	if (!in)
		bad_driver_vq(vq, "Bad virtblk cmd with no room for status");

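	/*
	 * For historical reasons, block operations are expressed in 512
	 * byte "sectors".
	 */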
	off = out.sector * 512;

	if (out.type & VIRTIO_BLK_T_OUT) {
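		/*
		 * Write
		 *
		 * Move to the right location in the block file.
		 */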
		if (lseek64(vblk->fd, off, SEEK_SET) != off)
			err(1, "Bad seek to sector %llu", out.sector);

		ret = writev(vblk->fd, iov, out_num);
		verbose("WRITE to sector %llu: %i\n", out.sector, ret);

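		/*
		 * Now we know how long the descriptor they sent was, we
		 * make sure they didn't try to write over the end of the
		 * block file (possibly extending it).
		 */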
		if (ret > 0 && off + ret > vblk->len) {
			/* Trim it back to the correct length. */
			ftruncate64(vblk->fd, vblk->len);
			/* Die, bad Guest, die. */
			bad_driver_vq(vq, "Write past end %llu+%u", off, ret);
		}

		wlen = sizeof(*in);
		*in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
	} else if (out.type & VIRTIO_BLK_T_FLUSH) {
		/* Flush */
		ret = fdatasync(vblk->fd);
		verbose("FLUSH fdatasync: %i\n", ret);
		wlen = sizeof(*in);
		*in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
	} else {
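		/*
		 * Read
		 *
		 * Move to the right location in the block file.
		 */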
		if (lseek64(vblk->fd, off, SEEK_SET) != off)
			err(1, "Bad seek to sector %llu", out.sector);

		ret = readv(vblk->fd, iov + out_num, in_num);
		if (ret >= 0) {
			wlen = sizeof(*in) + ret;
			*in = VIRTIO_BLK_S_OK;
		} else {
			wlen = sizeof(*in);
			*in = VIRTIO_BLK_S_IOERR;
		}
	}

	/* Finished that request: tell the Guest how many bytes we used. */
	add_used(vq, head, wlen);
}

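/* This creates a virtual block device from a file on the command line. */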
static void setup_block_file(const char *filename)
{
	struct device *dev;
	struct vblk_info *vblk;
	struct virtio_blk_config conf;

	/* Don't leak uninitialized stack bytes into the config space. */
	memset(&conf, 0, sizeof(conf));

	/* Create the device. */
	dev = new_pci_device("block", VIRTIO_ID_BLOCK, 0x01, 0x80);

	/* The device has one virtqueue, where the Guest places requests. */
	add_pci_virtqueue(dev, blk_request, "request");

	/* Allocate the room for our own bookkeeping. */
	vblk = dev->priv = malloc(sizeof(*vblk));

	/* First we open the file and store the length. */
	vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
	vblk->len = lseek64(vblk->fd, 0, SEEK_END);

	/* Tell Guest how many sectors this device has. */
	conf.capacity = cpu_to_le64(vblk->len / 512);

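	/*
	 * Tell Guest how many segments a request may use: the ring has
	 * VIRTQUEUE_NUM descriptors, and each request needs one for the
	 * header and one for the status byte, leaving VIRTQUEUE_NUM - 2
	 * for data.
	 */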
	add_pci_feature(dev, VIRTIO_BLK_F_SEG_MAX);
	conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2);

	set_device_config(dev, &conf, sizeof(struct virtio_blk_config));

	verbose("device %u: virtblock %llu sectors\n",
		devices.device_num, le64_to_cpu(conf.capacity));
}

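/*
 * Our random number generator device reads from /dev/urandom into the
 * Guest's input buffers.  The usual case is that the Guest doesn't want
 * random numbers and so has no buffers, although /dev/urandom is still
 * readable, whereas console is the reverse.
 *
 * The same logic applies, however.
 */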
struct rng_info {
	int rfd;
};

static void rng_input(struct virtqueue *vq)
{
	int len;
	unsigned int head, in_num, out_num, totlen = 0;
	struct rng_info *rng_info = vq->dev->priv;
	struct iovec iov[vq->vring.num];

	/* First we need a buffer from the Guest's virtqueue. */
	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
	if (out_num)
		bad_driver_vq(vq, "Output buffers in rng?");

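	/*
	 * Just like the console write, we loop to cover the whole iovec.
	 * In this case, short reads actually happen quite a bit.
	 */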
	while (!iov_empty(iov, in_num)) {
		len = readv(rng_info->rfd, iov, in_num);
		if (len <= 0)
			err(1, "Read from /dev/urandom gave %i", len);
		iov_consume(vq->dev, iov, in_num, NULL, len);
		totlen += len;
	}

	/* Tell the Guest about the new input. */
	add_used(vq, head, totlen);
}

/* This creates a "hardware" random number device for the Guest. */
static void setup_rng(void)
{
	struct device *dev;
	struct rng_info *rng_info = malloc(sizeof(*rng_info));

	/* Our device's private info simply contains the /dev/urandom fd. */
	rng_info->rfd = open_or_die("/dev/urandom", O_RDONLY);

	/* Create the new device. */
	dev = new_pci_device("rng", VIRTIO_ID_RNG, 0xff, 0);
	dev->priv = rng_info;

	/* The device has one virtqueue, where the Guest places inbufs. */
	add_pci_virtqueue(dev, rng_input, "input");

	/* We don't have any configuration space. */
	no_device_config(dev);

	verbose("device %u: rng\n", devices.device_num);
}

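/* Reboot is pretty easy: clean up and exec() the Launcher afresh. */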
static void __attribute__((noreturn)) restart_guest(void)
{
	unsigned int i;

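	/*
	 * Since we don't track all open fds, we simply close everything
	 * beyond stderr.
	 */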
	for (i = 3; i < FD_SETSIZE; i++)
		close(i);

	/* Reset all the devices (kills all threads). */
	cleanup_devices();

	execv(main_args[0], main_args);
	err(1, "Could not exec %s", main_args[0]);
}

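/*
 * Finally we reach the core of the Launcher which runs the Guest, serves
 * its input and output, and finally, lays it to rest.
 */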
static void __attribute__((noreturn)) run_guest(void)
{
	for (;;) {
		struct lguest_pending notify;
		int readval;

		/* We read from the /dev/lguest device to run the Guest. */
		readval = pread(lguest_fd, &notify, sizeof(notify), cpu_id);
		if (readval == sizeof(notify)) {
			if (notify.trap == 13) {
				verbose("Emulating instruction at %#x\n",
					getreg(eip));
				emulate_insn(notify.insn);
			} else if (notify.trap == 14) {
				verbose("Emulating MMIO at %#x\n",
					getreg(eip));
				emulate_mmio(notify.addr, notify.insn);
			} else
				errx(1, "Unknown trap %i addr %#08x",
				     notify.trap, notify.addr);

		/* ENOENT means the Guest died.  Reading tells us why. */
		} else if (errno == ENOENT) {
			char reason[1024] = { 0 };
			pread(lguest_fd, reason, sizeof(reason)-1, cpu_id);
			errx(1, "%s", reason);

		/* ERESTART means that we need to reboot the guest. */
		} else if (errno == ERESTART) {
			restart_guest();

		} else
			err(1, "Running guest failed");
	}
}

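/* These are the command line options the Launcher understands. */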
static struct option opts[] = {
	{ "verbose", 0, NULL, 'v' },
	{ "tunnet", 1, NULL, 't' },
	{ "block", 1, NULL, 'b' },
	{ "rng", 0, NULL, 'r' },
	{ "initrd", 1, NULL, 'i' },
	{ "username", 1, NULL, 'u' },
	{ "chroot", 1, NULL, 'c' },
	{ NULL },
};
static void usage(void)
{
	errx(1, "Usage: lguest [--verbose] [--rng] "
	     "[--tunnet=(<ipaddr>:<macaddr>|bridge:<bridgename>:<macaddr>)\n"
	     "|--block=<filename>|--initrd=<filename>\n"
	     "|--username=<name>|--chroot=<path>]...\n"
	     "<mem-in-mb> vmlinux [args...]");
}

/* The main routine is where the real work begins. */
int main(int argc, char *argv[])
{
	/* Memory, code startpoint and size of the (optional) initrd. */
	unsigned long mem = 0, start, initrd_size = 0;
	/* Two temporaries. */
	int i, c;
	/* The boot information for the Guest. */
	struct boot_params *boot;
	/* If they specify an initrd file to load. */
	const char *initrd_name = NULL;

	/* Password structure for initgroups/setres[gu]id. */
	struct passwd *user_details = NULL;

	/* Directory to chroot to. */
	char *chroot_path = NULL;

	/* Save the args: we "reboot" by execing ourselves again. */
	main_args = argv;

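	/*
	 * First we initialize the device list.  We remember next interrupt
	 * number to use for devices (1: remember that 0 is used by the
	 * timer).
	 */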
	devices.next_irq = 1;

	/* We're CPU 0.  In fact, that's the only CPU possible right now. */
	cpu_id = 0;

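	/*
	 * We need to know how much memory so we can set up the device
	 * descriptor and memory pages for the devices as we parse the
	 * command line.  So we quickly look through the arguments to find
	 * the amount of memory now.
	 */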
	for (i = 1; i < argc; i++) {
		if (argv[i][0] != '-') {
			mem = atoi(argv[i]) * 1024 * 1024;
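			/*
			 * We start by mapping anonymous pages over all of
			 * guest-physical memory range.  This fills it with
			 * 0, and ensures that the Guest won't be killed
			 * when it tries to access it.
			 */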
			guest_base = map_zeroed_pages(mem / getpagesize()
						      + DEVICE_PAGES);
			guest_limit = mem;
			guest_max = guest_mmio = mem + DEVICE_PAGES*getpagesize();
			break;
		}
	}

	/* We always have a console device. */
	setup_console();

	/* Now we parse the command line arguments. */
	while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) {
		switch (c) {
		case 'v':
			verbose = true;
			break;
		case 't':
			setup_tun_net(optarg);
			break;
		case 'b':
			setup_block_file(optarg);
			break;
		case 'r':
			setup_rng();
			break;
		case 'i':
			initrd_name = optarg;
			break;
		case 'u':
			user_details = getpwnam(optarg);
			if (!user_details)
				err(1, "getpwnam failed, incorrect username?");
			break;
		case 'c':
			chroot_path = optarg;
			break;
		default:
			warnx("Unknown argument %s", argv[optind]);
			usage();
		}
	}

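	/*
	 * After the other arguments we expect memory and kernel image name,
	 * followed by command line arguments for the kernel.
	 */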
	if (optind + 2 > argc)
		usage();

	verbose("Guest base is at %p\n", guest_base);

	/* Initialize the (fake) PCI host bridge device. */
	init_pci_host_bridge();

	/* Now we load the kernel. */
	start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));

	/* Boot information is stashed at physical address 0. */
	boot = from_guest_phys(0);

	/* Map the initrd image if requested (at top of physical memory). */
	if (initrd_name) {
		initrd_size = load_initrd(initrd_name, mem);
		/*
		 * These are the location of the initrd and its size: the
		 * boot header tells the kernel where to find it.
		 */
		boot->hdr.ramdisk_image = mem - initrd_size;
		boot->hdr.ramdisk_size = initrd_size;
		/* The bootloader type 0xFF means "unknown"; that's OK. */
		boot->hdr.type_of_loader = 0xFF;
	}

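	/*
	 * The Linux boot header contains an "E820" memory map: ours is a
	 * simple, single region.
	 */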
	boot->e820_entries = 1;
	boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM });

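	/*
	 * The boot header contains a command line pointer: we put the
	 * command line after the boot header.
	 */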
	boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
	/* We use a simple helper to copy the arguments separated by spaces. */
	concat((char *)(boot + 1), argv+optind+2);

	/* Set kernel alignment to 16M (the default CONFIG_PHYSICAL_ALIGN). */
	boot->hdr.kernel_alignment = 0x1000000;

	/* Boot protocol version: 2.07 supports the fields for lguest. */
	boot->hdr.version = 0x207;

	/* X86 subarchitecture 1 means "lguest". */
	boot->hdr.hardware_subarch = 1;

	/* Tell the entry path not to try to reload segment registers. */
	boot->hdr.loadflags |= KEEP_SEGMENTS;

	/* We tell the kernel to initialize the Guest. */
	tell_kernel(start);

	/* Ensure that we terminate if a device-servicing child dies. */
	signal(SIGCHLD, kill_launcher);

	/* If we exit via err(), this kills all the threads, restores tty. */
	atexit(cleanup_devices);

	/* If requested, chroot to a directory. */
	if (chroot_path) {
		if (chroot(chroot_path) != 0)
			err(1, "chroot(\"%s\") failed", chroot_path);

		if (chdir("/") != 0)
			err(1, "chdir(\"/\") failed");

		verbose("chroot done\n");
	}

	/* If requested, drop privileges to the given user. */
	if (user_details) {
		uid_t u;
		gid_t g;

		u = user_details->pw_uid;
		g = user_details->pw_gid;

		if (initgroups(user_details->pw_name, g) != 0)
			err(1, "initgroups failed");

		if (setresgid(g, g, g) != 0)
			err(1, "setresgid failed");

		if (setresuid(u, u, u) != 0)
			err(1, "setresuid failed");

		verbose("Dropping privileges completed\n");
	}

	/* Finally, run the Guest.  This doesn't return. */
	run_guest();
}