commit    366f60834ff8ef709f132fe8976c96a5e2caace9
author    Peter Grehan <grehan@FreeBSD.org>  2011-05-13 04:54:01 +0000
committer Peter Grehan <grehan@FreeBSD.org>  2011-05-13 04:54:01 +0000
tree      4af898a91c7d67e7068687610ebc68f1cbdf3b2e
parent    9adee7d03f2e4c91e6330410f88fb5addaf2a24a
84 files changed, 19016 insertions, 0 deletions
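
The commit brings in libvmmapi, a thin userland wrapper around the /dev/vmm/<name> control device: nearly every entry point bzero()s an ioctl argument structure, fills it in, and issues the corresponding VM_* ioctl. Pieced together purely from the vmmapi.h prototypes added below, a minimal host loop would look roughly like this (a sketch, not part of the commit; error handling is abbreviated, and the guest is assumed to already have runnable code at 'rip'):

	#include <sys/types.h>
	#include <machine/vmm.h>
	#include <vmmapi.h>		/* installed by the new library */

	int
	run_guest(const char *name, uint64_t rip)
	{
		struct vmctx *ctx;
		struct vm_exit vmexit;
		char *guestmem;

		if (vm_create(name) != 0)		/* hw.vmm.create sysctl */
			return (-1);
		if ((ctx = vm_open(name)) == NULL)	/* opens /dev/vmm/<name> */
			return (-1);

		/* 64MB of guest physical memory at gpa 0, mmap'ed into our space */
		if (vm_setup_memory(ctx, 0, 64 * 1024 * 1024, &guestmem) != 0)
			return (-1);
		/* ... a guest image would be copied in through 'guestmem' ... */

		if (vcpu_reset(ctx, 0) != 0)		/* power-on register state */
			return (-1);

		for (;;) {
			if (vm_run(ctx, 0, rip, &vmexit) != 0)
				break;
			if (vmexit.exitcode == VM_EXITCODE_HLT)
				break;			/* guest executed HLT */
			rip = vmexit.rip + vmexit.inst_length;
		}
		vm_destroy(ctx);
		return (0);
	}

Whether a guest HLT causes an exit at all is governed by the hlt_exit capability (vm_set_capability(ctx, 0, VM_CAP_HALT_EXIT, 1)), one of the four optional capabilities named in vm_capability_name2type() below.
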
diff --git a/lib/Makefile b/lib/Makefile index e9f4ec36b823..8f33206f44a2 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -102,6 +102,7 @@ SUBDIR= ${SUBDIR_ORDERED} \ ${_libusbhid} \ ${_libusb} \ ${_libvgl} \ + ${_libvmmapi} \ libwrap \ liby \ libz \ @@ -177,6 +178,7 @@ _libncp= libncp .endif _libsmb= libsmb _libvgl= libvgl +_libvmmapi= libvmmapi .endif .if ${MACHINE_ARCH} == "powerpc" diff --git a/lib/libvmmapi/Makefile b/lib/libvmmapi/Makefile new file mode 100644 index 000000000000..492391f9f85c --- /dev/null +++ b/lib/libvmmapi/Makefile @@ -0,0 +1,9 @@ +# $FreeBSD$ + +LIB= vmmapi +SRCS= vmmapi.c vmmapi_freebsd.c mptable.c +INCS= vmmapi.h + +CFLAGS+= -I${.CURDIR} + +.include <bsd.lib.mk> diff --git a/lib/libvmmapi/mptable.c b/lib/libvmmapi/mptable.c new file mode 100644 index 000000000000..1aea61a2ad7c --- /dev/null +++ b/lib/libvmmapi/mptable.c @@ -0,0 +1,336 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/mman.h>
+
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <machine/vmm.h>
+#include <machine/vmm_dev.h>
+
+#include "vmmapi.h"
+#include "mptable.h"
+
+#define LAPIC_PADDR (0xFEE00000)
+#define LAPIC_VERSION (16)
+
+#define IOAPIC_PADDR (0xFEC00000)
+#define IOAPIC_VERSION (0x11)
+
+static uint8_t
+mp_compute_checksum(void *base, size_t len)
+{
+	uint8_t *bytes = base;
+	uint8_t sum = 0;
+
+	for (; len > 0; len--) {
+		sum += *bytes++;
+	}
+	return (256 - sum);
+}
+
+static void
+mp_build_mpfp(struct mp_floating_pointer *mpfp, vm_paddr_t mpfp_gpa)
+{
+	memset(mpfp, 0, sizeof(*mpfp));
+	memcpy(mpfp->signature, MPFP_SIGNATURE, MPFP_SIGNATURE_LEN);
+	mpfp->mptable_paddr = mpfp_gpa + sizeof(*mpfp);
+	mpfp->specrev = MP_SPECREV;
+	mpfp->feature2 = 0;
+	mpfp->checksum = mp_compute_checksum(mpfp, sizeof(*mpfp));
+}
+
+static void
+mp_build_mpch(struct mp_config_hdr *mpch)
+{
+	memset(mpch, 0, sizeof(*mpch));
+	mpch->specrev = MP_SPECREV;
+	memcpy(mpch->signature, MPCH_SIGNATURE, MPCH_SIGNATURE_LEN);
+	memcpy(mpch->oemid, MPCH_OEMID, MPCH_OEMID_LEN);
+	memcpy(mpch->prodid, MPCH_PRODID, MPCH_PRODID_LEN);
+	mpch->lapic_paddr = LAPIC_PADDR;
+}
+
+static void
+mp_build_proc_entries(struct mpe_proc *mpep, int num_proc)
+{
+	int i;
+
+	for (i = 0; i < num_proc; i++) {
+		memset(mpep, 0, sizeof(*mpep));
+		mpep->entry_type = MP_ENTRY_PROC;
+		mpep->lapic_id = i; // XXX
+		mpep->lapic_version = LAPIC_VERSION;
+		mpep->proc_flags = (i == 0) ? MPEP_FLAGS_BSP : 0;
+		mpep->proc_flags |= MPEP_FLAGS_EN;
+		mpep->proc_signature = MPEP_SIGNATURE;
+		mpep->feature_flags = MPEP_FEATURES;
+		mpep++;
+	}
+}
+
+static void
+mp_build_bus_entries(struct mpe_bus *mpeb)
+{
+	memset(mpeb, 0, sizeof(*mpeb));
+	mpeb->entry_type = MP_ENTRY_BUS;
+	mpeb->busid = MPE_BUSID_ISA;
+	memcpy(mpeb->busname, MPE_BUSNAME_ISA, MPE_BUSNAME_LEN);
+	mpeb++;
+
+	memset(mpeb, 0, sizeof(*mpeb));
+	mpeb->entry_type = MP_ENTRY_BUS;
+	mpeb->busid = MPE_BUSID_PCI;
+	memcpy(mpeb->busname, MPE_BUSNAME_PCI, MPE_BUSNAME_LEN);
+}
+
+static void
+mp_build_ioapic_entries(struct mpe_ioapic *mpei)
+{
+	memset(mpei, 0, sizeof(*mpei));
+	mpei->entry_type = MP_ENTRY_IOAPIC;
+	mpei->ioapic_id = MPE_IOAPIC_ID;
+	mpei->ioapic_version = IOAPIC_VERSION;
+	mpei->ioapic_flags = MPE_IOAPIC_FLAG_EN;
+	mpei->ioapic_paddr = IOAPIC_PADDR;
+}
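
Taken together, the helpers above emit the fixed-size records defined in mptable.h, and vm_build_mptable() below lays them out back-to-back in guest memory. A caller sizing the region up front could bound it like this (hypothetical helper, not part of the commit; it also counts the I/O APIC and I/O interrupt entries even though vm_build_mptable() currently keeps them under '#if 0'):

	/* Worst-case size of the table vm_build_mptable() emits for 'ncpu' vcpus. */
	static size_t
	mptable_size(int ncpu)
	{
		return (sizeof(struct mp_floating_pointer) +
		    sizeof(struct mp_config_hdr) +
		    ncpu * sizeof(struct mpe_proc) +
		    MPE_NUM_BUSES * sizeof(struct mpe_bus) +
		    sizeof(struct mpe_ioapic) +
		    MPEII_MAX_IRQ * sizeof(struct mpe_ioint));
	}
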
+static void
+mp_build_ioint_entries(struct mpe_ioint *mpeii, int num_pins)
+{
+	int pin;
+
+	/*
+	 * The following config is taken from kernel mptable.c
+	 * mptable_parse_default_config_ints(...), for now
+	 * just use the default config, tweak later if needed.
+	 */
+
+	/* Run through all 16 pins. */
+	for (pin = 0; pin < num_pins; pin++) {
+		memset(mpeii, 0, sizeof(*mpeii));
+		mpeii->entry_type = MP_ENTRY_IOINT;
+		mpeii->src_bus_id = MPE_BUSID_ISA;
+		mpeii->dst_apic_id = MPE_IOAPIC_ID;
+
+		/*
+		 * All default configs route IRQs from bus 0 to the first 16
+		 * pins of the first I/O APIC with an APIC ID of 2.
+		 */
+		mpeii->dst_apic_intin = pin;
+		switch (pin) {
+		case 0:
+			/* Pin 0 is an ExtINT pin. */
+			mpeii->intr_type = MPEII_INTR_EXTINT;
+			break;
+		case 2:
+			/* IRQ 0 is routed to pin 2. */
+			mpeii->intr_type = MPEII_INTR_INT;
+			mpeii->src_bus_irq = 0;
+			break;
+		case 5:
+		case 10:
+		case 11:
+			/*
+			 * PCI IRQs are set to level triggered.
+			 */
+			mpeii->intr_flags = MPEII_FLAGS_TRIGMODE_LEVEL;
+			mpeii->src_bus_id = MPE_BUSID_PCI;
+			/* FALLTHROUGH */
+		default:
+			/* All other pins are identity mapped. */
+			mpeii->intr_type = MPEII_INTR_INT;
+			mpeii->src_bus_irq = pin;
+			break;
+		}
+		mpeii++;
+	}
+}
+
+#define COPYSTR(dest, src, bytes) do {		\
+	memcpy((dest), (src), (bytes));		\
+	(dest)[(bytes)] = 0;			\
+} while (0)
+
+static void
+mptable_dump(struct mp_floating_pointer *mpfp, struct mp_config_hdr *mpch)
+{
+	static char str[16];
+	int i;
+	char *cur;
+
+	union mpe {
+		struct mpe_proc		*proc;
+		struct mpe_bus		*bus;
+		struct mpe_ioapic	*ioapic;
+		struct mpe_ioint	*ioint;
+		struct mpe_lint		*lint;
+		char			*p;
+	};
+
+	union mpe mpe;
+
+	printf(" MP Floating Pointer :\n");
+	COPYSTR(str, mpfp->signature, 4);
+	printf(" signature: %s\n", str);
+	printf(" mpch paddr: %x\n", mpfp->mptable_paddr);
+	printf(" length: %x\n", mpfp->length);
+	printf(" specrev: %x\n", mpfp->specrev);
+	printf(" checksum: %x\n", mpfp->checksum);
+	printf(" feature1: %x\n", mpfp->feature1);
+	printf(" feature2: %x\n", mpfp->feature2);
+	printf(" feature3: %x\n", mpfp->feature3);
+	printf(" feature4: %x\n", mpfp->feature4);
+
+	printf(" MP Configuration Header :\n");
+	COPYSTR(str, mpch->signature, 4);
+	printf(" signature: %s\n", str);
+	printf(" length: %x\n", mpch->length);
+	printf(" specrev: %x\n", mpch->specrev);
+	printf(" checksum: %x\n", mpch->checksum);
+	COPYSTR(str, mpch->oemid, MPCH_OEMID_LEN);
+	printf(" oemid: %s\n", str);
+	COPYSTR(str, mpch->prodid, MPCH_PRODID_LEN);
+	printf(" prodid: %s\n", str);
+	printf(" oem_ptr: %x\n", mpch->oem_ptr);
+	printf(" oem_sz: %x\n", mpch->oem_sz);
+	printf(" nr_entries: %x\n", mpch->nr_entries);
+	printf(" apic paddr: %x\n", mpch->lapic_paddr);
+	printf(" ext_length: %x\n", mpch->ext_length);
+	printf(" ext_checksum: %x\n", mpch->ext_checksum);
+
+	cur = (char *)mpch + sizeof(*mpch);
+	for (i = 0; i < mpch->nr_entries; i++) {
+		mpe.p = cur;
+		switch (*mpe.p) {
+		case MP_ENTRY_PROC:
+			printf(" MP Processor Entry :\n");
+			printf(" lapic_id: %x\n", mpe.proc->lapic_id);
+			printf(" lapic_version: %x\n", mpe.proc->lapic_version);
+			printf(" proc_flags: %x\n", mpe.proc->proc_flags);
+			printf(" proc_signature: %x\n", mpe.proc->proc_signature);
+			printf(" feature_flags: %x\n", mpe.proc->feature_flags);
+			cur += sizeof(struct mpe_proc);
+			break;
+		case MP_ENTRY_BUS:
+			printf(" MP Bus Entry :\n");
+			printf(" busid: %x\n", mpe.bus->busid);
+			COPYSTR(str, mpe.bus->busname, MPE_BUSNAME_LEN);
+			printf(" busname: %s\n", str);
+			cur += sizeof(struct mpe_bus);
+			break;
+		case MP_ENTRY_IOAPIC:
+			printf(" MP IOAPIC Entry :\n");
+			printf(" ioapic_id: %x\n", mpe.ioapic->ioapic_id);
+			printf(" ioapic_version: %x\n", mpe.ioapic->ioapic_version);
+			printf(" ioapic_flags: %x\n", mpe.ioapic->ioapic_flags);
+			printf(" ioapic_paddr: %x\n", mpe.ioapic->ioapic_paddr);
+			cur += sizeof(struct mpe_ioapic);
+			break;
+		case MP_ENTRY_IOINT:
+			printf(" MP IO Interrupt Entry :\n");
+			printf(" intr_type: %x\n", mpe.ioint->intr_type);
+			printf(" intr_flags: %x\n", mpe.ioint->intr_flags);
+			printf(" src_bus_id: %x\n", mpe.ioint->src_bus_id);
+			printf(" src_bus_irq: %x\n", mpe.ioint->src_bus_irq);
+			printf(" dst_apic_id: %x\n", mpe.ioint->dst_apic_id);
+			printf(" dst_apic_intin: %x\n", mpe.ioint->dst_apic_intin);
+			cur += sizeof(struct mpe_ioint);
+			break;
+		case MP_ENTRY_LINT:
+			printf(" MP Local Interrupt Entry :\n");
+			cur += sizeof(struct mpe_lint);
+			break;
+		default:
+			/* Unknown entry type: its size is unknown, so stop. */
+			printf(" Unknown MP Entry type: %x\n", *mpe.p);
+			return;
+		}
+	}
+}
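
mp_compute_checksum() stores 256 minus the byte sum of the record (the checksum field itself is still zero when the sum is taken), so a correctly built structure sums to zero modulo 256; that is the invariant guest firmware and kernels verify before trusting either structure. A minimal validation sketch (hypothetical, simply the inverse of mp_compute_checksum() above):

	/* Returns 1 if 'len' bytes starting at 'base' sum to 0 modulo 256. */
	static int
	mp_checksum_ok(const void *base, size_t len)
	{
		const uint8_t *bytes = base;
		uint8_t sum = 0;

		while (len-- > 0)
			sum += *bytes++;
		return (sum == 0);
	}

For example, after mp_build_mpfp() runs, mp_checksum_ok(mpfp, sizeof(*mpfp)) should hold.
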
printf("%s\n", strerror(errno)); + goto err; + } + startaddr = mapaddr; + + mp_build_mpfp((struct mp_floating_pointer*) mapaddr, gpa); + mapaddr += sizeof(struct mp_floating_pointer); + + mpch = (struct mp_config_hdr*)mapaddr; + mp_build_mpch(mpch); + mapaddr += sizeof(struct mp_config_hdr); + + mp_build_proc_entries((struct mpe_proc*) mapaddr, ncpu); + mapaddr += (sizeof(struct mpe_proc)*ncpu); + mpch->nr_entries += ncpu; + + mp_build_bus_entries((struct mpe_bus*)mapaddr); + mapaddr += (sizeof(struct mpe_bus)*MPE_NUM_BUSES); + mpch->nr_entries += MPE_NUM_BUSES; +#if 0 + mp_build_ioapic_entries((struct mpe_ioapic*)mapaddr); + mapaddr += sizeof(struct mpe_ioapic); + mpch->nr_entries++; + + mp_build_ioint_entries((struct mpe_ioint*)mapaddr, MPEII_MAX_IRQ); + mapaddr += sizeof(struct mpe_ioint)*MPEII_MAX_IRQ; + mpch->nr_entries += MPEII_MAX_IRQ; + +#endif + if (oemp) { + mpch->oem_ptr = mapaddr - startaddr + gpa; + mpch->oem_sz = oemsz; + memcpy(mapaddr, oemp, oemsz); + } + mpch->length = (mapaddr) - ((char*) mpch); + mpch->checksum = mp_compute_checksum(mpch, sizeof(*mpch)); + + + // mptable_dump((struct mp_floating_pointer*)startaddr, mpch); +err: + return (error); +} diff --git a/lib/libvmmapi/mptable.h b/lib/libvmmapi/mptable.h new file mode 100644 index 000000000000..cad8834a47b2 --- /dev/null +++ b/lib/libvmmapi/mptable.h @@ -0,0 +1,171 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _MPTABLE_h_ +#define _MPTABLE_h_ + +#define MP_SPECREV (4) // MP spec revision 1.1 + +/* + * MP Floating Pointer Structure + */ +#define MPFP_SIGNATURE "_MP_" +#define MPFP_SIGNATURE_LEN (4) +#define MPFP_FEATURE2 (0x80) // IMCR is present +struct mp_floating_pointer { + uint8_t signature[MPFP_SIGNATURE_LEN]; + uint32_t mptable_paddr; + uint8_t length; + uint8_t specrev; + uint8_t checksum; + uint8_t feature1; + uint8_t feature2; + uint8_t feature3; + uint8_t feature4; + uint8_t feature5; +}; + + +/* + * MP Configuration Table Header + */ +#define MPCH_SIGNATURE "PCMP" +#define MPCH_SIGNATURE_LEN (4) + +#define MPCH_OEMID "NETAPP " +#define MPCH_OEMID_LEN (8) +#define MPCH_PRODID "vFiler " +#define MPCH_PRODID_LEN (12) + +struct mp_config_hdr { + uint8_t signature[MPCH_SIGNATURE_LEN]; + uint16_t length; + uint8_t specrev; + uint8_t checksum; + uint8_t oemid[MPCH_OEMID_LEN]; + uint8_t prodid[MPCH_PRODID_LEN]; + uint32_t oem_ptr; + uint16_t oem_sz; + uint16_t nr_entries; + uint32_t lapic_paddr; + uint16_t ext_length; + uint8_t ext_checksum; + uint8_t reserved; +}; + +#define MP_ENTRY_PROC (0) +#define MP_ENTRY_BUS (1) +#define MP_ENTRY_IOAPIC (2) +#define MP_ENTRY_IOINT (3) +#define MP_ENTRY_LINT (4) + +/* + * MP Processor Entry + */ + +#define MPEP_FLAGS_EN (0x1) +#define MPEP_FLAGS_BSP (0x2) + +#define MPEP_SIG_FAMILY (6) +#define MPEP_SIG_MODEL (26) +#define MPEP_SIG_STEPPING (5) +#define MPEP_SIGNATURE ((MPEP_SIG_FAMILY << 8) | (MPEP_SIG_MODEL << 4) \ + | (MPEP_SIG_STEPPING)) + +#define MPEP_FEATURES (0xBFEBFBFF) // Value from Intel i7 CPUID + +struct mpe_proc { + uint8_t entry_type; + uint8_t lapic_id; + uint8_t lapic_version; + uint8_t proc_flags; + uint32_t proc_signature; + uint32_t feature_flags; + uint8_t reserved[8]; +}; + +/* + * MP Bus Entry + */ + +#define MPE_NUM_BUSES (2) +#define MPE_BUSNAME_LEN (6) +#define MPE_BUSID_ISA (0) +#define MPE_BUSID_PCI (1) +#define MPE_BUSNAME_ISA "ISA " +#define MPE_BUSNAME_PCI "PCI " +struct mpe_bus { + uint8_t entry_type; + uint8_t busid; + uint8_t busname[MPE_BUSNAME_LEN]; +}; + +/* + * MP IO APIC Entry + */ +#define MPE_IOAPIC_ID (2) +#define MPE_IOAPIC_FLAG_EN (1) +struct mpe_ioapic { + uint8_t entry_type; + uint8_t ioapic_id; + uint8_t ioapic_version; + uint8_t ioapic_flags; + uint32_t ioapic_paddr; + +}; + +/* + * MP IO Interrupt Assignment Entry + */ +#define MPEII_INTR_INT (0) +#define MPEII_INTR_NMI (1) +#define MPEII_INTR_SMI (2) +#define MPEII_INTR_EXTINT (3) +#define MPEII_PCI_IRQ_MASK (0x0c20U) /* IRQ 5,10,11 are PCI connected */ +#define MPEII_MAX_IRQ (16) +#define MPEII_FLAGS_TRIGMODE_LEVEL (0x3) +struct mpe_ioint { + uint8_t entry_type; + uint8_t intr_type; + uint16_t intr_flags; + uint8_t src_bus_id; + uint8_t src_bus_irq; + uint8_t dst_apic_id; + uint8_t dst_apic_intin; +}; + +/* + * MP Local Interrupt Assignment Entry + */ +struct mpe_lint { + uint8_t entry_type; +}; + +int vm_build_mptable(struct vmctx *ctxt, vm_paddr_t gpa, int len, + int ncpu, void *oemp, int oemsz); +#endif /* _MPTABLE_h_ */ diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c new file mode 100644 index 000000000000..7f95fea9b603 --- /dev/null +++ b/lib/libvmmapi/vmmapi.c @@ -0,0 +1,647 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/sysctl.h> +#include <sys/ioctl.h> +#include <sys/mman.h> + +#include <machine/specialreg.h> + +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <string.h> +#include <fcntl.h> +#include <unistd.h> + +#include <machine/vmm.h> +#include <machine/vmm_dev.h> + +#include "vmmapi.h" +#include "mptable.h" + +#ifndef CR4_VMXE +#define CR4_VMXE (1UL << 13) +#endif + +#define BIOS_ROM_BASE (0xf0000) +#define BIOS_ROM_SIZE (0x10000) + +struct vmctx { + int fd; + char *name; +}; + +#define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x))) +#define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x))) + +static int +vm_device_open(const char *name) +{ + int fd, len; + char *vmfile; + + len = strlen("/dev/vmm/") + strlen(name) + 1; + vmfile = malloc(len); + assert(vmfile != NULL); + snprintf(vmfile, len, "/dev/vmm/%s", name); + + /* Open the device file */ + fd = open(vmfile, O_RDWR, 0); + + free(vmfile); + return (fd); +} + +int +vm_create(const char *name) +{ + + return (CREATE((char *)name)); +} + +struct vmctx * +vm_open(const char *name) +{ + struct vmctx *vm; + + vm = malloc(sizeof(struct vmctx) + strlen(name) + 1); + assert(vm != NULL); + + vm->fd = -1; + vm->name = (char *)(vm + 1); + strcpy(vm->name, name); + + if ((vm->fd = vm_device_open(vm->name)) < 0) + goto err; + + return (vm); +err: + vm_destroy(vm); + return (NULL); +} + +void +vm_destroy(struct vmctx *vm) +{ + assert(vm != NULL); + + DESTROY(vm->name); + if (vm->fd >= 0) + close(vm->fd); + free(vm); +} + +int +vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, + vm_paddr_t *ret_hpa, size_t *ret_len) +{ + int error; + struct vm_memory_segment seg; + + bzero(&seg, sizeof(seg)); + seg.gpa = gpa; + error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg); + *ret_hpa = seg.hpa; + *ret_len = seg.len; + return (error); +} + +int +vm_setup_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char **mapaddr) +{ + int error; + struct vm_memory_segment seg; + + /* + * Create and optionally map 'len' bytes of memory at guest + * physical address 'gpa' + */ + bzero(&seg, sizeof(seg)); + seg.gpa = gpa; + seg.len = len; + error = ioctl(ctx->fd, VM_MAP_MEMORY, &seg); + if (error == 0 && mapaddr != NULL) { + *mapaddr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, + ctx->fd, gpa); + } + 
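
One caller-visible subtlety at this point in vm_setup_memory(): the return value reflects only the VM_MAP_MEMORY ioctl. If the subsequent mmap() fails, the function still returns 0 and signals the failure through *mapaddr == MAP_FAILED, exactly as the comment in vmmapi.h later in this commit documents. A caller fragment (hypothetical, assuming <err.h> and an opened 'ctx') therefore has to test both results:

	char *gpa_map;

	if (vm_setup_memory(ctx, 0, memsize, &gpa_map) != 0)
		err(1, "VM_MAP_MEMORY ioctl failed");	/* ioctl error */
	if (gpa_map == MAP_FAILED)
		err(1, "mmap of guest memory failed");	/* mmap error */
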
return (error); +} + +char * +vm_map_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len) +{ + + /* Map 'len' bytes of memory at guest physical address 'gpa' */ + return ((char *)mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, + ctx->fd, gpa)); +} + +int +vm_set_desc(struct vmctx *ctx, int vcpu, int reg, + uint64_t base, uint32_t limit, uint32_t access) +{ + int error; + struct vm_seg_desc vmsegdesc; + + bzero(&vmsegdesc, sizeof(vmsegdesc)); + vmsegdesc.cpuid = vcpu; + vmsegdesc.regnum = reg; + vmsegdesc.desc.base = base; + vmsegdesc.desc.limit = limit; + vmsegdesc.desc.access = access; + + error = ioctl(ctx->fd, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc); + return (error); +} + +int +vm_get_desc(struct vmctx *ctx, int vcpu, int reg, + uint64_t *base, uint32_t *limit, uint32_t *access) +{ + int error; + struct vm_seg_desc vmsegdesc; + + bzero(&vmsegdesc, sizeof(vmsegdesc)); + vmsegdesc.cpuid = vcpu; + vmsegdesc.regnum = reg; + + error = ioctl(ctx->fd, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc); + if (error == 0) { + *base = vmsegdesc.desc.base; + *limit = vmsegdesc.desc.limit; + *access = vmsegdesc.desc.access; + } + return (error); +} + +int +vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val) +{ + int error; + struct vm_register vmreg; + + bzero(&vmreg, sizeof(vmreg)); + vmreg.cpuid = vcpu; + vmreg.regnum = reg; + vmreg.regval = val; + + error = ioctl(ctx->fd, VM_SET_REGISTER, &vmreg); + return (error); +} + +int +vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val) +{ + int error; + struct vm_register vmreg; + + bzero(&vmreg, sizeof(vmreg)); + vmreg.cpuid = vcpu; + vmreg.regnum = reg; + + error = ioctl(ctx->fd, VM_GET_REGISTER, &vmreg); + *ret_val = vmreg.regval; + return (error); +} + +int +vm_get_pinning(struct vmctx *ctx, int vcpu, int *host_cpuid) +{ + int error; + struct vm_pin vmpin; + + bzero(&vmpin, sizeof(vmpin)); + vmpin.vm_cpuid = vcpu; + + error = ioctl(ctx->fd, VM_GET_PINNING, &vmpin); + *host_cpuid = vmpin.host_cpuid; + return (error); +} + +int +vm_set_pinning(struct vmctx *ctx, int vcpu, int host_cpuid) +{ + int error; + struct vm_pin vmpin; + + bzero(&vmpin, sizeof(vmpin)); + vmpin.vm_cpuid = vcpu; + vmpin.host_cpuid = host_cpuid; + + error = ioctl(ctx->fd, VM_SET_PINNING, &vmpin); + return (error); +} + +int +vm_run(struct vmctx *ctx, int vcpu, uint64_t rip, struct vm_exit *vmexit) +{ + int error; + struct vm_run vmrun; + + bzero(&vmrun, sizeof(vmrun)); + vmrun.cpuid = vcpu; + vmrun.rip = rip; + + error = ioctl(ctx->fd, VM_RUN, &vmrun); + bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit)); + return (error); +} + +static int +vm_inject_event_real(struct vmctx *ctx, int vcpu, enum vm_event_type type, + int vector, int error_code, int error_code_valid) +{ + struct vm_event ev; + + bzero(&ev, sizeof(ev)); + ev.cpuid = vcpu; + ev.type = type; + ev.vector = vector; + ev.error_code = error_code; + ev.error_code_valid = error_code_valid; + + return (ioctl(ctx->fd, VM_INJECT_EVENT, &ev)); +} + +int +vm_inject_event(struct vmctx *ctx, int vcpu, enum vm_event_type type, + int vector) +{ + + return (vm_inject_event_real(ctx, vcpu, type, vector, 0, 0)); +} + +int +vm_inject_event2(struct vmctx *ctx, int vcpu, enum vm_event_type type, + int vector, int error_code) +{ + + return (vm_inject_event_real(ctx, vcpu, type, vector, error_code, 1)); +} + +int +vm_build_tables(struct vmctx *ctxt, int ncpu, void *oemtbl, int oemtblsz) +{ + + return (vm_build_mptable(ctxt, BIOS_ROM_BASE, BIOS_ROM_SIZE, ncpu, + oemtbl, oemtblsz)); +} + +int 
+vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector) +{ + struct vm_lapic_irq vmirq; + + bzero(&vmirq, sizeof(vmirq)); + vmirq.cpuid = vcpu; + vmirq.vector = vector; + + return (ioctl(ctx->fd, VM_LAPIC_IRQ, &vmirq)); +} + +int +vm_inject_nmi(struct vmctx *ctx, int vcpu) +{ + struct vm_nmi vmnmi; + + bzero(&vmnmi, sizeof(vmnmi)); + vmnmi.cpuid = vcpu; + + return (ioctl(ctx->fd, VM_INJECT_NMI, &vmnmi)); +} + +int +vm_capability_name2type(const char *capname) +{ + int i; + + static struct { + const char *name; + int type; + } capstrmap[] = { + { "hlt_exit", VM_CAP_HALT_EXIT }, + { "mtrap_exit", VM_CAP_MTRAP_EXIT }, + { "pause_exit", VM_CAP_PAUSE_EXIT }, + { "unrestricted_guest", VM_CAP_UNRESTRICTED_GUEST }, + { 0 } + }; + + for (i = 0; capstrmap[i].name != NULL && capname != NULL; i++) { + if (strcmp(capstrmap[i].name, capname) == 0) + return (capstrmap[i].type); + } + + return (-1); +} + +int +vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, + int *retval) +{ + int error; + struct vm_capability vmcap; + + bzero(&vmcap, sizeof(vmcap)); + vmcap.cpuid = vcpu; + vmcap.captype = cap; + + error = ioctl(ctx->fd, VM_GET_CAPABILITY, &vmcap); + *retval = vmcap.capval; + return (error); +} + +int +vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val) +{ + struct vm_capability vmcap; + + bzero(&vmcap, sizeof(vmcap)); + vmcap.cpuid = vcpu; + vmcap.captype = cap; + vmcap.capval = val; + + return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap)); +} + +int +vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func) +{ + struct vm_pptdev pptdev; + + bzero(&pptdev, sizeof(pptdev)); + pptdev.bus = bus; + pptdev.slot = slot; + pptdev.func = func; + + return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev)); +} + +int +vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func) +{ + struct vm_pptdev pptdev; + + bzero(&pptdev, sizeof(pptdev)); + pptdev.bus = bus; + pptdev.slot = slot; + pptdev.func = func; + + return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev)); +} + +int +vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa) +{ + struct vm_pptdev_mmio pptmmio; + + bzero(&pptmmio, sizeof(pptmmio)); + pptmmio.bus = bus; + pptmmio.slot = slot; + pptmmio.func = func; + pptmmio.gpa = gpa; + pptmmio.len = len; + pptmmio.hpa = hpa; + + return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio)); +} + +int +vm_setup_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, + int destcpu, int vector, int numvec) +{ + struct vm_pptdev_msi pptmsi; + + bzero(&pptmsi, sizeof(pptmsi)); + pptmsi.vcpu = vcpu; + pptmsi.bus = bus; + pptmsi.slot = slot; + pptmsi.func = func; + pptmsi.destcpu = destcpu; + pptmsi.vector = vector; + pptmsi.numvec = numvec; + + return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi)); +} + +uint64_t * +vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, + int *ret_entries) +{ + int error; + + static struct vm_stats vmstats; + + vmstats.cpuid = vcpu; + + error = ioctl(ctx->fd, VM_STATS, &vmstats); + if (error == 0) { + if (ret_entries) + *ret_entries = vmstats.num_entries; + if (ret_tv) + *ret_tv = vmstats.tv; + return (vmstats.statbuf); + } else + return (NULL); +} + +const char * +vm_get_stat_desc(struct vmctx *ctx, int index) +{ + int error; + + static struct vm_stat_desc statdesc; + + statdesc.index = index; + if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0) + return (statdesc.desc); + else + return (NULL); +} + +/* + * From Intel Vol 3a: + * Table 9-1. 
IA-32 Processor States Following Power-up, Reset or INIT + */ +int +vcpu_reset(struct vmctx *vmctx, int vcpu) +{ + int error; + uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx; + uint32_t desc_access, desc_limit; + uint16_t sel; + + zero = 0; + + rflags = 0x2; + error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags); + if (error) + goto done; + + rip = 0xfff0; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0) + goto done; + + cr0 = CR0_NE; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, zero)) != 0) + goto done; + + cr4 = CR4_VMXE; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0) + goto done; + + /* + * CS: present, r/w, accessed, 16-bit, byte granularity, usable + */ + desc_base = 0xffff0000; + desc_limit = 0xffff; + desc_access = 0x0093; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + sel = 0xf000; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, sel)) != 0) + goto done; + + /* + * SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity + */ + desc_base = 0; + desc_limit = 0xffff; + desc_access = 0x0093; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + sel = 0; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, sel)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, sel)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, sel)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, sel)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, sel)) != 0) + goto done; + + /* General purpose registers */ + rdx = 0xf00; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RAX, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBX, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RCX, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDX, rdx)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSI, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDI, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBP, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, zero)) != 0) + goto done; + + /* GDTR, IDTR */ + desc_base = 0; + desc_limit = 0xffff; + desc_access = 0; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR, + desc_base, desc_limit, desc_access); + if (error != 0) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_IDTR, + desc_base, desc_limit, desc_access); + if (error != 0) + goto done; + + /* TR */ + desc_base = 0; + desc_limit = 0xffff; + desc_access = 0x0000008b; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, 
desc_access); + if (error) + goto done; + + sel = 0; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, sel)) != 0) + goto done; + + /* LDTR */ + desc_base = 0; + desc_limit = 0xffff; + desc_access = 0x00000082; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, desc_base, + desc_limit, desc_access); + if (error) + goto done; + + sel = 0; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0) + goto done; + + /* XXX cr2, debug registers */ + + error = 0; +done: + return (error); +} diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h new file mode 100644 index 000000000000..38533a894b17 --- /dev/null +++ b/lib/libvmmapi/vmmapi.h @@ -0,0 +1,98 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMMAPI_H_ +#define _VMMAPI_H_ + +struct vmctx; + +int vm_create(const char *name); +struct vmctx *vm_open(const char *name); +void vm_destroy(struct vmctx *ctx); +int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, + vm_paddr_t *ret_hpa, size_t *ret_len); +/* + * Create a memory segment of 'len' bytes in the guest physical address space + * at offset 'gpa'. + * + * If 'mapaddr' is not NULL then this region is mmap'ed into the address + * space of the calling process. If there is an mmap error then *mapaddr + * will be set to MAP_FAILED. 
+ */ + +int vm_setup_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len, + char **mapaddr); +char * vm_map_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len); +int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, + uint64_t base, uint32_t limit, uint32_t access); +int vm_get_desc(struct vmctx *ctx, int vcpu, int reg, + uint64_t *base, uint32_t *limit, uint32_t *access); +int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val); +int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval); +int vm_get_pinning(struct vmctx *ctx, int vcpu, int *host_cpuid); +int vm_set_pinning(struct vmctx *ctx, int vcpu, int host_cpuid); +int vm_run(struct vmctx *ctx, int vcpu, uint64_t rip, + struct vm_exit *ret_vmexit); +int vm_build_tables(struct vmctx *ctxt, int ncpus, void *oemtbl, + int oemtblsz); +int vm_inject_event(struct vmctx *ctx, int vcpu, enum vm_event_type type, + int vector); +int vm_inject_event2(struct vmctx *ctx, int vcpu, enum vm_event_type type, + int vector, int error_code); +int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector); +int vm_inject_nmi(struct vmctx *ctx, int vcpu); +int vm_capability_name2type(const char *capname); +int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, + int *retval); +int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, + int val); +int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func); +int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func); +int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +int vm_setup_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, + int dest, int vector, int numvec); + +/* + * Return a pointer to the statistics buffer. Note that this is not MT-safe. + */ +uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, + int *ret_entries); +const char *vm_get_stat_desc(struct vmctx *ctx, int index); + +/* Reset vcpu register state */ +int vcpu_reset(struct vmctx *ctx, int vcpu); + +/* + * FreeBSD specific APIs + */ +int vm_setup_freebsd_registers(struct vmctx *ctx, int vcpu, + uint64_t rip, uint64_t cr3, uint64_t gdtbase, + uint64_t rsp); +void vm_setup_freebsd_gdt(uint64_t *gdtr); +#endif /* _VMMAPI_H_ */ diff --git a/lib/libvmmapi/vmmapi_freebsd.c b/lib/libvmmapi/vmmapi_freebsd.c new file mode 100644 index 000000000000..c4ad9898ede1 --- /dev/null +++ b/lib/libvmmapi/vmmapi_freebsd.c @@ -0,0 +1,187 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include <machine/specialreg.h> +#include <machine/segments.h> +#include <machine/vmm.h> + +#include "vmmapi.h" + +#ifndef CR4_VMXE +#define CR4_VMXE (1UL << 13) +#endif + +#define DESC_UNUSABLE 0x00010000 + +#define GUEST_NULL_SEL 0 +#define GUEST_CODE_SEL 1 +#define GUEST_DATA_SEL 2 +#define GUEST_GDTR_LIMIT (3 * 8 - 1) + +void +vm_setup_freebsd_gdt(uint64_t *gdtr) +{ + gdtr[GUEST_NULL_SEL] = 0; + gdtr[GUEST_CODE_SEL] = 0x0020980000000000; + gdtr[GUEST_DATA_SEL] = 0x0000900000000000; +} + +/* + * Setup the 'vcpu' register set such that it will begin execution at + * 'rip' in long mode. + */ +int +vm_setup_freebsd_registers(struct vmctx *vmctx, int vcpu, + uint64_t rip, uint64_t cr3, uint64_t gdtbase, + uint64_t rsp) +{ + int error; + uint64_t cr0, cr4, efer, rflags, desc_base; + uint32_t desc_access, desc_limit; + uint16_t gsel; + + cr0 = CR0_PE | CR0_PG | CR0_NE; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0) + goto done; + + cr4 = CR4_PAE | CR4_VMXE; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0) + goto done; + + efer = EFER_LME | EFER_LMA; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_EFER, efer))) + goto done; + + rflags = 0x2; + error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags); + if (error) + goto done; + + desc_base = 0; + desc_limit = 0; + desc_access = 0x0000209B; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + desc_access = 0x00000093; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + /* + * XXX TR is pointing to null selector even though we set the + * TSS segment to be usable with a base address and limit of 0. 
+ */ + desc_access = 0x0000008b; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, 0, 0, + DESC_UNUSABLE); + if (error) + goto done; + + gsel = GSEL(GUEST_CODE_SEL, SEL_KPL); + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, gsel)) != 0) + goto done; + + gsel = GSEL(GUEST_DATA_SEL, SEL_KPL); + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, gsel)) != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, gsel)) != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, gsel)) != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, gsel)) != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, gsel)) != 0) + goto done; + + /* XXX TR is pointing to the null selector */ + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, 0)) != 0) + goto done; + + /* LDTR is pointing to the null selector */ + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0) + goto done; + + /* entry point */ + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0) + goto done; + + /* page table base */ + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, cr3)) != 0) + goto done; + + desc_base = gdtbase; + desc_limit = GUEST_GDTR_LIMIT; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR, + desc_base, desc_limit, 0); + if (error != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, rsp)) != 0) + goto done; + + error = 0; +done: + return (error); +} diff --git a/share/mk/bsd.libnames.mk b/share/mk/bsd.libnames.mk index ff1a11ae05b5..5fc1b3cbf40a 100644 --- a/share/mk/bsd.libnames.mk +++ b/share/mk/bsd.libnames.mk @@ -155,6 +155,7 @@ LIBUSB?= ${DESTDIR}${LIBDIR}/libusb.a LIBUTIL?= ${DESTDIR}${LIBDIR}/libutil.a LIBUUTIL?= ${DESTDIR}${LIBDIR}/libuutil.a LIBVGL?= ${DESTDIR}${LIBDIR}/libvgl.a +LIBVMMAPI?= ${DESTDIR}${LIBDIR}/libvmmapi.a LIBWRAP?= ${DESTDIR}${LIBDIR}/libwrap.a LIBXPG4?= ${DESTDIR}${LIBDIR}/libxpg4.a LIBY?= ${DESTDIR}${LIBDIR}/liby.a diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h index 895619cf6f7e..c95fee05f5eb 100644 --- a/sys/amd64/include/specialreg.h +++ b/sys/amd64/include/specialreg.h @@ -297,6 +297,7 @@ */ #define APICBASE_RESERVED 0x000006ff #define APICBASE_BSP 0x00000100 +#define APICBASE_X2APIC 0x00000400 #define APICBASE_ENABLED 0x00000800 #define APICBASE_ADDRESS 0xfffff000 diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h new file mode 100644 index 000000000000..0f4c356b6d5f --- /dev/null +++ b/sys/amd64/include/vmm.h @@ -0,0 +1,268 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: vmm.h 482 2011-05-09 21:22:43Z grehan $ + */ + +#ifndef _VMM_H_ +#define _VMM_H_ + +#ifdef _KERNEL + +#define VM_MAX_NAMELEN 32 + +struct vm; +struct vm_memory_segment; +struct seg_desc; +struct vm_exit; +struct vm_run; +struct vlapic; + +typedef int (*vmm_init_func_t)(void); +typedef int (*vmm_cleanup_func_t)(void); +typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */ +typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, + struct vm_exit *vmexit); +typedef void (*vmi_cleanup_func_t)(void *vmi); +typedef int (*vmi_mmap_func_t)(void *vmi, vm_paddr_t gpa, vm_paddr_t hpa, + size_t length, vm_memattr_t attr, + int prot, boolean_t superpages_ok); +typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, + uint64_t *retval); +typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, + uint64_t val); +typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num, + struct seg_desc *desc); +typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, + struct seg_desc *desc); +typedef int (*vmi_inject_event_t)(void *vmi, int vcpu, + int type, int vector, + uint32_t code, int code_valid); +typedef int (*vmi_inject_nmi_t)(void *vmi, int vcpu); +typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); +typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); + +struct vmm_ops { + vmm_init_func_t init; /* module wide initialization */ + vmm_cleanup_func_t cleanup; + + vmi_init_func_t vminit; /* vm-specific initialization */ + vmi_run_func_t vmrun; + vmi_cleanup_func_t vmcleanup; + vmi_mmap_func_t vmmmap; + vmi_get_register_t vmgetreg; + vmi_set_register_t vmsetreg; + vmi_get_desc_t vmgetdesc; + vmi_set_desc_t vmsetdesc; + vmi_inject_event_t vminject; + vmi_inject_nmi_t vmnmi; + vmi_get_cap_t vmgetcap; + vmi_set_cap_t vmsetcap; +}; + +extern struct vmm_ops vmm_ops_intel; +extern struct vmm_ops vmm_ops_amd; + +struct vm *vm_create(const char *name); +void vm_destroy(struct vm *vm); +const char *vm_name(struct vm *vm); +int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t *ret_hpa); +int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); +vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size); +int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, + struct vm_memory_segment *seg); +int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); +int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); +int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *ret_desc); +int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc); +int vm_get_pinning(struct vm *vm, int vcpu, int *cpuid); +int vm_set_pinning(struct vm *vm, int vcpu, int cpuid); +int vm_run(struct vm *vm, struct vm_run *vmrun); +int vm_inject_event(struct vm *vm, int vcpu, int type, + int vector, uint32_t error_code, int error_code_valid); +int 
vm_inject_nmi(struct vm *vm, int vcpu); +uint64_t *vm_guest_msrs(struct vm *vm, int cpu); +struct vlapic *vm_lapic(struct vm *vm, int cpu); +int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); +int vm_set_capability(struct vm *vm, int vcpu, int type, int val); +void vm_activate_cpu(struct vm *vm, int vcpu); +cpumask_t vm_active_cpus(struct vm *vm); + +/* + * Return 1 if device indicated by bus/slot/func is supposed to be a + * pci passthrough device. + * + * Return 0 otherwise. + */ +int vmm_is_pptdev(int bus, int slot, int func); + +void *vm_iommu_domain(struct vm *vm); + +#define VCPU_STOPPED 0 +#define VCPU_RUNNING 1 +void vm_set_run_state(struct vm *vm, int vcpu, int running); +int vm_get_run_state(struct vm *vm, int vcpu, int *hostcpu); + +void *vcpu_stats(struct vm *vm, int vcpu); + +static int __inline +vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) +{ + return (vm_get_run_state(vm, vcpu, hostcpu) == VCPU_RUNNING); +} + +static cpumask_t __inline +vcpu_mask(int vcpuid) +{ + return ((cpumask_t)1 << vcpuid); +} + +#endif /* KERNEL */ + +#define VM_MAXCPU 8 /* maximum virtual cpus */ + +/* + * Identifiers for events that can be injected into the VM + */ +enum vm_event_type { + VM_EVENT_NONE, + VM_HW_INTR, + VM_NMI, + VM_HW_EXCEPTION, + VM_SW_INTR, + VM_PRIV_SW_EXCEPTION, + VM_SW_EXCEPTION, + VM_EVENT_MAX +}; + +/* + * Identifiers for architecturally defined registers. + */ +enum vm_reg_name { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_RBP, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15, + VM_REG_GUEST_CR0, + VM_REG_GUEST_CR3, + VM_REG_GUEST_CR4, + VM_REG_GUEST_DR7, + VM_REG_GUEST_RSP, + VM_REG_GUEST_RIP, + VM_REG_GUEST_RFLAGS, + VM_REG_GUEST_ES, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS, + VM_REG_GUEST_LDTR, + VM_REG_GUEST_TR, + VM_REG_GUEST_IDTR, + VM_REG_GUEST_GDTR, + VM_REG_GUEST_EFER, + VM_REG_LAST +}; + +/* + * Identifiers for optional vmm capabilities + */ +enum vm_cap_type { + VM_CAP_HALT_EXIT, + VM_CAP_MTRAP_EXIT, + VM_CAP_PAUSE_EXIT, + VM_CAP_UNRESTRICTED_GUEST, + VM_CAP_MAX +}; + +/* + * The 'access' field has the format specified in Table 21-2 of the Intel + * Architecture Manual vol 3b. + * + * XXX The contents of the 'access' field are architecturally defined except + * bit 16 - Segment Unusable. + */ +struct seg_desc { + uint64_t base; + uint32_t limit; + uint32_t access; +}; + +enum vm_exitcode { + VM_EXITCODE_INOUT, + VM_EXITCODE_VMX, + VM_EXITCODE_BOGUS, + VM_EXITCODE_RDMSR, + VM_EXITCODE_WRMSR, + VM_EXITCODE_HLT, + VM_EXITCODE_MTRAP, + VM_EXITCODE_PAUSE, + VM_EXITCODE_MAX, +}; + +struct vm_exit { + enum vm_exitcode exitcode; + int inst_length; /* 0 means unknown */ + uint64_t rip; + union { + struct { + uint16_t bytes:3; /* 1 or 2 or 4 */ + uint16_t in:1; /* out is 0, in is 1 */ + uint16_t string:1; + uint16_t rep:1; + uint16_t port; + uint32_t eax; /* valid for out */ + } inout; + /* + * VMX specific payload. Used when there is no "better" + * exitcode to represent the VM-exit. 
+ */ + struct { + int error; /* vmx inst error */ + uint32_t exit_reason; + uint64_t exit_qualification; + } vmx; + struct { + uint32_t code; /* ecx value */ + uint64_t wval; + } msr; + } u; +}; + +#endif /* _VMM_H_ */ diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h new file mode 100644 index 000000000000..1b143b527e46 --- /dev/null +++ b/sys/amd64/include/vmm_dev.h @@ -0,0 +1,191 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: vmm_dev.h 482 2011-05-09 21:22:43Z grehan $ + */ + +#ifndef _VMM_DEV_H_ +#define _VMM_DEV_H_ + +#ifdef _KERNEL +void vmmdev_init(void); +void vmmdev_cleanup(void); +#endif + +struct vm_memory_segment { + vm_paddr_t hpa; /* out */ + vm_paddr_t gpa; /* in */ + size_t len; /* in */ +}; + +struct vm_register { + int cpuid; + int regnum; /* enum vm_reg_name */ + uint64_t regval; +}; + +struct vm_seg_desc { /* data or code segment */ + int cpuid; + int regnum; /* enum vm_reg_name */ + struct seg_desc desc; +}; + +struct vm_pin { + int vm_cpuid; + int host_cpuid; /* -1 to unpin */ +}; + +struct vm_run { + int cpuid; + uint64_t rip; /* start running here */ + struct vm_exit vm_exit; +}; + +struct vm_event { + int cpuid; + enum vm_event_type type; + int vector; + uint32_t error_code; + int error_code_valid; +}; + +struct vm_lapic_irq { + int cpuid; + int vector; +}; + +struct vm_capability { + int cpuid; + enum vm_cap_type captype; + int capval; + int allcpus; +}; + +struct vm_pptdev { + int bus; + int slot; + int func; +}; + +struct vm_pptdev_mmio { + int bus; + int slot; + int func; + vm_paddr_t gpa; + vm_paddr_t hpa; + size_t len; +}; + +struct vm_pptdev_msi { + int vcpu; + int bus; + int slot; + int func; + int numvec; /* 0 means disabled */ + int vector; + int destcpu; +}; + +struct vm_nmi { + int cpuid; +}; + +#define MAX_VM_STATS 64 +struct vm_stats { + int cpuid; /* in */ + int num_entries; /* out */ + struct timeval tv; + uint64_t statbuf[MAX_VM_STATS]; +}; + +struct vm_stat_desc { + int index; /* in */ + char desc[128]; /* out */ +}; + +enum { + IOCNUM_RUN, + IOCNUM_SET_PINNING, + IOCNUM_GET_PINNING, + IOCNUM_MAP_MEMORY, + IOCNUM_GET_MEMORY_SEG, + IOCNUM_SET_REGISTER, + IOCNUM_GET_REGISTER, + IOCNUM_SET_SEGMENT_DESCRIPTOR, + 
IOCNUM_GET_SEGMENT_DESCRIPTOR, + IOCNUM_INJECT_EVENT, + IOCNUM_LAPIC_IRQ, + IOCNUM_SET_CAPABILITY, + IOCNUM_GET_CAPABILITY, + IOCNUM_BIND_PPTDEV, + IOCNUM_UNBIND_PPTDEV, + IOCNUM_MAP_PPTDEV_MMIO, + IOCNUM_PPTDEV_MSI, + IOCNUM_INJECT_NMI, + IOCNUM_VM_STATS, + IOCNUM_VM_STAT_DESC, +}; + +#define VM_RUN \ + _IOWR('v', IOCNUM_RUN, struct vm_run) +#define VM_SET_PINNING \ + _IOW('v', IOCNUM_SET_PINNING, struct vm_pin) +#define VM_GET_PINNING \ + _IOWR('v', IOCNUM_GET_PINNING, struct vm_pin) +#define VM_MAP_MEMORY \ + _IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment) +#define VM_GET_MEMORY_SEG \ + _IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment) +#define VM_SET_REGISTER \ + _IOW('v', IOCNUM_SET_REGISTER, struct vm_register) +#define VM_GET_REGISTER \ + _IOWR('v', IOCNUM_GET_REGISTER, struct vm_register) +#define VM_SET_SEGMENT_DESCRIPTOR \ + _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) +#define VM_GET_SEGMENT_DESCRIPTOR \ + _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) +#define VM_INJECT_EVENT \ + _IOW('v', IOCNUM_INJECT_EVENT, struct vm_event) +#define VM_LAPIC_IRQ \ + _IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq) +#define VM_SET_CAPABILITY \ + _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability) +#define VM_GET_CAPABILITY \ + _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability) +#define VM_BIND_PPTDEV \ + _IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev) +#define VM_UNBIND_PPTDEV \ + _IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev) +#define VM_MAP_PPTDEV_MMIO \ + _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio) +#define VM_PPTDEV_MSI \ + _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi) +#define VM_INJECT_NMI \ + _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi) +#define VM_STATS \ + _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) +#define VM_STAT_DESC \ + _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc) +#endif diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c new file mode 100644 index 000000000000..41e937a20da9 --- /dev/null +++ b/sys/amd64/vmm/amd/amdv.c @@ -0,0 +1,247 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/errno.h> + +#include <machine/vmm.h> +#include "io/iommu.h" + +static int +amdv_init(void) +{ + + printf("amdv_init: not implemented\n"); + return (ENXIO); +} + +static int +amdv_cleanup(void) +{ + + printf("amdv_cleanup: not implemented\n"); + return (ENXIO); +} + +static void * +amdv_vminit(struct vm *vm) +{ + + printf("amdv_vminit: not implemented\n"); + return (NULL); +} + +static int +amdv_vmrun(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit) +{ + + printf("amdv_vmrun: not implemented\n"); + return (ENXIO); +} + +static void +amdv_vmcleanup(void *arg) +{ + + printf("amdv_vmcleanup: not implemented\n"); + return; +} + +static int +amdv_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, + vm_memattr_t attr, int prot, boolean_t spok) +{ + + printf("amdv_vmmmap: not implemented\n"); + return (EINVAL); +} + +static int +amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval) +{ + + printf("amdv_getreg: not implemented\n"); + return (EINVAL); +} + +static int +amdv_setreg(void *arg, int vcpu, int regnum, uint64_t val) +{ + + printf("amdv_setreg: not implemented\n"); + return (EINVAL); +} + +static int +amdv_getdesc(void *vmi, int vcpu, int num, struct seg_desc *desc) +{ + + printf("amdv_get_desc: not implemented\n"); + return (EINVAL); +} + +static int +amdv_setdesc(void *vmi, int vcpu, int num, struct seg_desc *desc) +{ + + printf("amdv_get_desc: not implemented\n"); + return (EINVAL); +} + +static int +amdv_inject_event(void *vmi, int vcpu, int type, int vector, + uint32_t error_code, int error_code_valid) +{ + + printf("amdv_inject_event: not implemented\n"); + return (EINVAL); +} + +static int +amdv_nmi(void *arg, int vcpu) +{ + + printf("amdv_nmi: not implemented\n"); + return (EINVAL); +} + +static int +amdv_getcap(void *arg, int vcpu, int type, int *retval) +{ + + printf("amdv_getcap: not implemented\n"); + return (EINVAL); +} + +static int +amdv_setcap(void *arg, int vcpu, int type, int val) +{ + + printf("amdv_setcap: not implemented\n"); + return (EINVAL); +} + +struct vmm_ops vmm_ops_amd = { + amdv_init, + amdv_cleanup, + amdv_vminit, + amdv_vmrun, + amdv_vmcleanup, + amdv_vmmmap, + amdv_getreg, + amdv_setreg, + amdv_getdesc, + amdv_setdesc, + amdv_inject_event, + amdv_nmi, + amdv_getcap, + amdv_setcap +}; + +static int +amd_iommu_init(void) +{ + + printf("amd_iommu_init: not implemented\n"); + return (ENXIO); +} + +static void +amd_iommu_cleanup(void) +{ + + printf("amd_iommu_cleanup: not implemented\n"); +} + +static void +amd_iommu_enable(void) +{ + + printf("amd_iommu_enable: not implemented\n"); +} + +static void +amd_iommu_disable(void) +{ + + printf("amd_iommu_disable: not implemented\n"); +} + +static void * +amd_iommu_create_domain(vm_paddr_t maxaddr) +{ + + printf("amd_iommu_create_domain: not implemented\n"); + return (NULL); +} + +static void +amd_iommu_destroy_domain(void *domain) +{ + + printf("amd_iommu_destroy_domain: not implemented\n"); +} + +static uint64_t +amd_iommu_create_mapping(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, + uint64_t len) +{ + + printf("amd_iommu_create_mapping: not implemented\n"); + return (0); +} + +static void +amd_iommu_add_device(void *domain, int bus, int slot, int func) +{ + + printf("amd_iommu_add_device: not implemented\n"); +} + +static void +amd_iommu_remove_device(void *domain, int bus, int slot, int func) +{ + + printf("amd_iommu_remove_device: not 
implemented\n"); +} + +struct iommu_ops iommu_ops_amd = { + amd_iommu_init, + amd_iommu_cleanup, + amd_iommu_enable, + amd_iommu_disable, + amd_iommu_create_domain, + amd_iommu_destroy_domain, + amd_iommu_create_mapping, + amd_iommu_add_device, + amd_iommu_remove_device, +}; diff --git a/sys/amd64/vmm/intel/ept.c b/sys/amd64/vmm/intel/ept.c new file mode 100644 index 000000000000..c9fca9d5abee --- /dev/null +++ b/sys/amd64/vmm/intel/ept.c @@ -0,0 +1,312 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/smp.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/param.h> +#include <machine/cpufunc.h> +#include <machine/pmap.h> +#include <machine/vmparam.h> + +#include <machine/vmm.h> +#include "vmx_cpufunc.h" +#include "vmx_msr.h" +#include "vmx.h" +#include "ept.h" + +#define EPT_PWL4(cap) ((cap) & (1UL << 6)) +#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14)) +#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */ +#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */ +#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32)) +#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20)) + +#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL +#define INVVPID_ALL_TYPES_SUPPORTED(cap) \ + (((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK) + +#define INVEPT_ALL_TYPES_MASK 0x6000000UL +#define INVEPT_ALL_TYPES_SUPPORTED(cap) \ + (((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK) + +#define EPT_PG_RD (1 << 0) +#define EPT_PG_WR (1 << 1) +#define EPT_PG_EX (1 << 2) +#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) +#define EPT_PG_IGNORE_PAT (1 << 6) +#define EPT_PG_SUPERPAGE (1 << 7) + +#define EPT_ADDR_MASK ((uint64_t)-1 << 12) + +MALLOC_DECLARE(M_VMX); + +static uint64_t page_sizes_mask; + +int +ept_init(void) +{ + int page_shift; + uint64_t cap; + + cap = rdmsr(MSR_VMX_EPT_VPID_CAP); + + /* + * Verify that: + * - page walk length is 4 steps + * - extended page tables can be laid out in write-back memory + * - invvpid instruction with all possible types is supported + * - invept instruction with all possible types is supported + */ + if (!EPT_PWL4(cap) || + !EPT_MEMORY_TYPE_WB(cap) || + !INVVPID_SUPPORTED(cap) || + !INVVPID_ALL_TYPES_SUPPORTED(cap) || + !INVEPT_SUPPORTED(cap) || + !INVEPT_ALL_TYPES_SUPPORTED(cap)) + return (EINVAL); + + /* Set bits in 'page_sizes_mask' for each valid page size */ + page_shift = PAGE_SHIFT; + page_sizes_mask = 1UL << page_shift; /* 4KB page */ + + page_shift += 9; + if (EPT_PDE_SUPERPAGE(cap)) + page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */ + + page_shift += 9; + if (EPT_PDPTE_SUPERPAGE(cap)) + page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */ + + return (0); +} + +static size_t +ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, + vm_memattr_t attr, vm_prot_t prot, boolean_t spok) +{ + int spshift, ptpshift, ptpindex, nlevels; + + /* + * Compute the size of the mapping that we can accomodate. 
+ * + * This is based on three factors: + * - super page sizes supported by the processor + * - alignment of the region starting at 'gpa' and 'hpa' + * - length of the region 'len' + */ + spshift = PAGE_SHIFT; + if (spok) + spshift += (EPT_PWLEVELS - 1) * 9; + while (spshift >= PAGE_SHIFT) { + uint64_t spsize = 1UL << spshift; + if ((page_sizes_mask & spsize) != 0 && + (gpa & (spsize - 1)) == 0 && + (hpa & (spsize - 1)) == 0 && + length >= spsize) { + break; + } + spshift -= 9; + } + + if (spshift < PAGE_SHIFT) { + panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, " + "length 0x%016lx, page_sizes_mask 0x%016lx", + gpa, hpa, length, page_sizes_mask); + } + + nlevels = EPT_PWLEVELS; + while (--nlevels >= 0) { + ptpshift = PAGE_SHIFT + nlevels * 9; + ptpindex = (gpa >> ptpshift) & 0x1FF; + + /* We have reached the leaf mapping */ + if (spshift >= ptpshift) + break; + + /* + * We are working on a non-leaf page table page. + * + * Create the next level page table page if necessary and point + * to it from the current page table. + */ + if (ptp[ptpindex] == 0) { + void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO); + ptp[ptpindex] = vtophys(nlp); + ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX; + } + + /* Work our way down to the next level page table page */ + ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK); + } + + if ((gpa & ((1UL << ptpshift) - 1)) != 0) { + panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d " + "mismatch\n", gpa, ptpshift); + } + + /* Do the mapping */ + ptp[ptpindex] = hpa; + + /* Apply the access controls */ + if (prot & VM_PROT_READ) + ptp[ptpindex] |= EPT_PG_RD; + if (prot & VM_PROT_WRITE) + ptp[ptpindex] |= EPT_PG_WR; + if (prot & VM_PROT_EXECUTE) + ptp[ptpindex] |= EPT_PG_EX; + + /* + * XXX should we enforce this memory type by setting the ignore PAT + * bit to 1. 
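To make the superpage selection above concrete: the loop walks down from the largest conceivable size (1 << (12 + 3*9), i.e. 512GB) in 9-bit steps and stops at the first size that is enabled in page_sizes_mask and evenly divides gpa, hpa and length. The same test, restated as a standalone userland program (spok assumed true):

#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define EPT_PWLEVELS	4

/* Mirror of the selection loop in ept_create_mapping() above. */
static int
pick_spshift(uint64_t mask, uint64_t gpa, uint64_t hpa, uint64_t len)
{
	int spshift = PAGE_SHIFT + (EPT_PWLEVELS - 1) * 9;

	while (spshift >= PAGE_SHIFT) {
		uint64_t spsize = (uint64_t)1 << spshift;
		if ((mask & spsize) != 0 && (gpa & (spsize - 1)) == 0 &&
		    (hpa & (spsize - 1)) == 0 && len >= spsize)
			return (spshift);
		spshift -= 9;
	}
	return (-1);	/* ept_create_mapping() panics in this case */
}

int
main(void)
{
	uint64_t mask = (1ULL << 12) | (1ULL << 21);	/* 4KB and 2MB enabled */

	/* 1GB-aligned, 1GB-long request maps with 2MB pages (no 1GB support) */
	assert(pick_spshift(mask, 0x40000000, 0x80000000, 1ULL << 30) == 21);
	/* misaligning hpa by 4KB forces 4KB mappings */
	assert(pick_spshift(mask, 0x40000000, 0x80001000, 1ULL << 30) == 12);
	return (0);
}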
+ */ + ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr); + + if (nlevels > 0) + ptp[ptpindex] |= EPT_PG_SUPERPAGE; + + return (1UL << ptpshift); +} + +static void +ept_free_pt_entry(pt_entry_t pte) +{ + if (pte == 0) + return; + + /* sanity check */ + if ((pte & EPT_PG_SUPERPAGE) != 0) + panic("ept_free_pt_entry: pte cannot have superpage bit"); + + return; +} + +static void +ept_free_pd_entry(pd_entry_t pde) +{ + pt_entry_t *pt; + int i; + + if (pde == 0) + return; + + if ((pde & EPT_PG_SUPERPAGE) == 0) { + pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK); + for (i = 0; i < NPTEPG; i++) + ept_free_pt_entry(pt[i]); + free(pt, M_VMX); /* free the page table page */ + } +} + +static void +ept_free_pdp_entry(pdp_entry_t pdpe) +{ + pd_entry_t *pd; + int i; + + if (pdpe == 0) + return; + + if ((pdpe & EPT_PG_SUPERPAGE) == 0) { + pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK); + for (i = 0; i < NPDEPG; i++) + ept_free_pd_entry(pd[i]); + free(pd, M_VMX); /* free the page directory page */ + } +} + +static void +ept_free_pml4_entry(pml4_entry_t pml4e) +{ + pdp_entry_t *pdp; + int i; + + if (pml4e == 0) + return; + + if ((pml4e & EPT_PG_SUPERPAGE) == 0) { + pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK); + for (i = 0; i < NPDPEPG; i++) + ept_free_pdp_entry(pdp[i]); + free(pdp, M_VMX); /* free the page directory ptr page */ + } +} + +void +ept_vmcleanup(struct vmx *vmx) +{ + int i; + + for (i = 0; i < NPML4EPG; i++) + ept_free_pml4_entry(vmx->pml4ept[i]); +} + +int +ept_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len, + vm_memattr_t attr, int prot, boolean_t spok) +{ + size_t n; + struct vmx *vmx = arg; + + while (len > 0) { + n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr, + prot, spok); + len -= n; + gpa += n; + hpa += n; + } + + return (0); +} + +static void +invept_single_context(void *arg) +{ + struct invept_desc desc = *(struct invept_desc *)arg; + + invept(INVEPT_TYPE_SINGLE_CONTEXT, desc); +} + +void +ept_invalidate_mappings(u_long pml4ept) +{ + struct invept_desc invept_desc = { 0 }; + + invept_desc.eptp = EPTP(pml4ept); + + smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc); +} diff --git a/sys/amd64/vmm/intel/ept.h b/sys/amd64/vmm/intel/ept.h new file mode 100644 index 000000000000..013c330ed41a --- /dev/null +++ b/sys/amd64/vmm/intel/ept.h @@ -0,0 +1,42 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _EPT_H_ +#define _EPT_H_ + +struct vmx; + +#define EPT_PWLEVELS 4 /* page walk levels */ +#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK) + +int ept_init(void); +int ept_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, + vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings); +void ept_invalidate_mappings(u_long ept_pml4); +void ept_vmcleanup(struct vmx *vmx); +#endif diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c new file mode 100644 index 000000000000..80d45ccebb25 --- /dev/null +++ b/sys/amd64/vmm/intel/vmcs.c @@ -0,0 +1,451 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
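The EPTP() macro in ept.h above packs three fields into the EPT pointer: the PML4 table's physical address, the page-walk length minus one in bits 5:3, and the paging-structure memory type in bits 2:0 (PAT_WRITE_BACK is 6 in machine/specialreg.h). A quick self-check of that encoding, with a hypothetical table address:

#include <assert.h>
#include <stdint.h>

#define PAT_WRITE_BACK	6	/* value from machine/specialreg.h */
#define EPT_PWLEVELS	4
#define EPTP(pml4)	((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK)

int
main(void)
{
	uint64_t pml4 = 0x12345000UL;	/* hypothetical page-aligned PML4 */

	/* walk-length field 3, memory type 6: the low bits become 0x1e */
	assert(EPTP(pml4) == (pml4 | 0x1e));
	return (0);
}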
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/pcpu.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/segments.h> +#include <machine/pmap.h> + +#include <machine/vmm.h> +#include "vmcs.h" +#include "vmx_cpufunc.h" +#include "ept.h" +#include "vmx.h" + +static uint64_t +vmcs_fix_regval(uint32_t encoding, uint64_t val) +{ + + switch (encoding) { + case VMCS_GUEST_CR0: + val = vmx_fix_cr0(val); + break; + case VMCS_GUEST_CR4: + val = vmx_fix_cr4(val); + break; + default: + break; + } + return (val); +} + +static uint32_t +vmcs_field_encoding(int ident) +{ + switch (ident) { + case VM_REG_GUEST_CR0: + return (VMCS_GUEST_CR0); + case VM_REG_GUEST_CR3: + return (VMCS_GUEST_CR3); + case VM_REG_GUEST_CR4: + return (VMCS_GUEST_CR4); + case VM_REG_GUEST_DR7: + return (VMCS_GUEST_DR7); + case VM_REG_GUEST_RSP: + return (VMCS_GUEST_RSP); + case VM_REG_GUEST_RIP: + return (VMCS_GUEST_RIP); + case VM_REG_GUEST_RFLAGS: + return (VMCS_GUEST_RFLAGS); + case VM_REG_GUEST_ES: + return (VMCS_GUEST_ES_SELECTOR); + case VM_REG_GUEST_CS: + return (VMCS_GUEST_CS_SELECTOR); + case VM_REG_GUEST_SS: + return (VMCS_GUEST_SS_SELECTOR); + case VM_REG_GUEST_DS: + return (VMCS_GUEST_DS_SELECTOR); + case VM_REG_GUEST_FS: + return (VMCS_GUEST_FS_SELECTOR); + case VM_REG_GUEST_GS: + return (VMCS_GUEST_GS_SELECTOR); + case VM_REG_GUEST_TR: + return (VMCS_GUEST_TR_SELECTOR); + case VM_REG_GUEST_LDTR: + return (VMCS_GUEST_LDTR_SELECTOR); + case VM_REG_GUEST_EFER: + return (VMCS_GUEST_IA32_EFER); + default: + return (-1); + } + +} + +static int +vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc) +{ + + switch (seg) { + case VM_REG_GUEST_ES: + *base = VMCS_GUEST_ES_BASE; + *lim = VMCS_GUEST_ES_LIMIT; + *acc = VMCS_GUEST_ES_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_CS: + *base = VMCS_GUEST_CS_BASE; + *lim = VMCS_GUEST_CS_LIMIT; + *acc = VMCS_GUEST_CS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_SS: + *base = VMCS_GUEST_SS_BASE; + *lim = VMCS_GUEST_SS_LIMIT; + *acc = VMCS_GUEST_SS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_DS: + *base = VMCS_GUEST_DS_BASE; + *lim = VMCS_GUEST_DS_LIMIT; + *acc = VMCS_GUEST_DS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_FS: + *base = VMCS_GUEST_FS_BASE; + *lim = VMCS_GUEST_FS_LIMIT; + *acc = VMCS_GUEST_FS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_GS: + *base = VMCS_GUEST_GS_BASE; + *lim = VMCS_GUEST_GS_LIMIT; + *acc = VMCS_GUEST_GS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_TR: + *base = VMCS_GUEST_TR_BASE; + *lim = VMCS_GUEST_TR_LIMIT; + *acc = VMCS_GUEST_TR_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_LDTR: + *base = VMCS_GUEST_LDTR_BASE; + *lim = VMCS_GUEST_LDTR_LIMIT; + *acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_IDTR: + *base = VMCS_GUEST_IDTR_BASE; + *lim = VMCS_GUEST_IDTR_LIMIT; + *acc = VMCS_INVALID_ENCODING; + break; + case VM_REG_GUEST_GDTR: + *base = VMCS_GUEST_GDTR_BASE; + *lim = VMCS_GUEST_GDTR_LIMIT; + *acc = VMCS_INVALID_ENCODING; + break; + default: + return (EINVAL); + } + + return (0); +} + +int +vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval) +{ + int error; + uint32_t encoding; + + /* + * If we need to get at vmx-specific state in the VMCS we can bypass + * the translation of 'ident' to 'encoding' by simply setting the + * sign bit. As it so happens the upper 16 bits are reserved (i.e + * set to 0) in the encodings for the VMCS so we are free to use the + * sign bit. 
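The sign-bit convention described above is the counterpart of the VMCS_IDENT() macro in vmcs.h, ((encoding) | 0x80000000): a negative 'ident' is treated as a raw VMCS field encoding rather than a VM_REG_* name, and the bit is stripped before the vmread or vmwrite. vmx_setup_cr0_shadow() later in this commit uses it to reach VMCS_CR0_MASK, which has no VM_REG_* alias. The round trip, as a standalone check:

#include <assert.h>

#define VMCS_IDENT(encoding)	((encoding) | 0x80000000)
#define VMCS_CR0_MASK		0x00006000

int
main(void)
{
	int ident = VMCS_IDENT(VMCS_CR0_MASK);

	assert(ident < 0);		/* takes the raw-encoding path */
	assert((ident & 0x7fffffff) == VMCS_CR0_MASK); /* encoding recovered */
	return (0);
}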
+ */ + if (ident < 0) + encoding = ident & 0x7fffffff; + else + encoding = vmcs_field_encoding(ident); + + if (encoding == (uint32_t)-1) + return (EINVAL); + + VMPTRLD(vmcs); + error = vmread(encoding, retval); + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val) +{ + int error; + uint32_t encoding; + + if (ident < 0) + encoding = ident & 0x7fffffff; + else + encoding = vmcs_field_encoding(ident); + + if (encoding == (uint32_t)-1) + return (EINVAL); + + val = vmcs_fix_regval(encoding, val); + + VMPTRLD(vmcs); + error = vmwrite(encoding, val); + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_setdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc) +{ + int error; + uint32_t base, limit, access; + + error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); + if (error != 0) + panic("vmcs_setdesc: invalid segment register %d", seg); + + VMPTRLD(vmcs); + if ((error = vmwrite(base, desc->base)) != 0) + goto done; + + if ((error = vmwrite(limit, desc->limit)) != 0) + goto done; + + if (access != VMCS_INVALID_ENCODING) { + if ((error = vmwrite(access, desc->access)) != 0) + goto done; + } +done: + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_getdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc) +{ + int error; + uint32_t base, limit, access; + uint64_t u64; + + error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); + if (error != 0) + panic("vmcs_getdesc: invalid segment register %d", seg); + + VMPTRLD(vmcs); + if ((error = vmread(base, &u64)) != 0) + goto done; + desc->base = u64; + + if ((error = vmread(limit, &u64)) != 0) + goto done; + desc->limit = u64; + + if (access != VMCS_INVALID_ENCODING) { + if ((error = vmread(access, &u64)) != 0) + goto done; + desc->access = u64; + } +done: + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count) +{ + int error; + + VMPTRLD(vmcs); + + /* + * Guest MSRs are saved in the VM-exit MSR-store area. + * Guest MSRs are loaded from the VM-entry MSR-load area. + * Both areas point to the same location in memory. + */ + if ((error = vmwrite(VMCS_EXIT_MSR_STORE, g_area)) != 0) + goto done; + if ((error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count)) != 0) + goto done; + + if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area)) != 0) + goto done; + if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count)) != 0) + goto done; + + error = 0; +done: + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_set_defaults(struct vmcs *vmcs, + u_long host_rip, u_long host_rsp, u_long ept_pml4, + uint32_t pinbased_ctls, uint32_t procbased_ctls, + uint32_t procbased_ctls2, uint32_t exit_ctls, + uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid) +{ + int error, codesel, datasel, tsssel; + u_long cr0, cr4, efer; + uint64_t eptp, pat; + uint32_t exc_bitmap; + + codesel = GSEL(GCODE_SEL, SEL_KPL); + datasel = GSEL(GDATA_SEL, SEL_KPL); + tsssel = GSEL(GPROC0_SEL, SEL_KPL); + + /* + * Make sure we have a "current" VMCS to work with. 
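vmcs_set_msr_save() above deliberately aims the VM-exit MSR-store and VM-entry MSR-load pointers at the same physical buffer, so whatever the processor dumps on exit is reloaded unchanged on the next entry, with no software copying in between. A minimal sketch of a caller, mirroring msr_save_area_init() further down in vmx.c (the array must be physically contiguous wired memory for vtophys() to be valid):

/* Sketch only: one saved/restored guest MSR, as in msr_save_area_init(). */
static struct msr_entry guest_msrs[] = {
	{ MSR_KGSBASE, 0, 0 },		/* index, reserved, value */
};

static void
setup_msr_save_area(struct vmcs *vmcs)
{
	int count, error;

	count = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
	error = vmcs_set_msr_save(vmcs, vtophys(guest_msrs), count);
	if (error != 0)
		panic("vmcs_set_msr_save error %d", error);
}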
+ */ + VMPTRLD(vmcs); + + /* + * Load the VMX controls + */ + if ((error = vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls)) != 0) + goto done; + if ((error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls)) != 0) + goto done; + if ((error = vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2)) != 0) + goto done; + if ((error = vmwrite(VMCS_EXIT_CTLS, exit_ctls)) != 0) + goto done; + if ((error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls)) != 0) + goto done; + + /* Guest state */ + + /* Initialize guest IA32_PAT MSR with the default value */ + pat = PAT_VALUE(0, PAT_WRITE_BACK) | + PAT_VALUE(1, PAT_WRITE_THROUGH) | + PAT_VALUE(2, PAT_UNCACHED) | + PAT_VALUE(3, PAT_UNCACHEABLE) | + PAT_VALUE(4, PAT_WRITE_BACK) | + PAT_VALUE(5, PAT_WRITE_THROUGH) | + PAT_VALUE(6, PAT_UNCACHED) | + PAT_VALUE(7, PAT_UNCACHEABLE); + if ((error = vmwrite(VMCS_GUEST_IA32_PAT, pat)) != 0) + goto done; + + /* Host state */ + + /* Initialize host IA32_PAT MSR */ + pat = rdmsr(MSR_PAT); + if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0) + goto done; + + /* Load the IA32_EFER MSR */ + efer = rdmsr(MSR_EFER); + if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0) + goto done; + + /* Load the control registers */ + cr0 = rcr0(); + if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0) + goto done; + + cr4 = rcr4(); + if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0) + goto done; + + /* Load the segment selectors */ + if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_TR_SELECTOR, tsssel)) != 0) + goto done; + + /* + * Load the Base-Address for %fs and idtr. + * + * Note that we exclude %gs, tss and gdtr here because their base + * address is pcpu specific. + */ + if ((error = vmwrite(VMCS_HOST_FS_BASE, 0)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_IDTR_BASE, r_idt.rd_base)) != 0) + goto done; + + /* instruction pointer */ + if ((error = vmwrite(VMCS_HOST_RIP, host_rip)) != 0) + goto done; + + /* stack pointer */ + if ((error = vmwrite(VMCS_HOST_RSP, host_rsp)) != 0) + goto done; + + /* eptp */ + eptp = EPTP(ept_pml4); + if ((error = vmwrite(VMCS_EPTP, eptp)) != 0) + goto done; + + /* vpid */ + if ((error = vmwrite(VMCS_VPID, vpid)) != 0) + goto done; + + /* msr bitmap */ + if ((error = vmwrite(VMCS_MSR_BITMAP, msr_bitmap)) != 0) + goto done; + + /* exception bitmap */ + exc_bitmap = 1 << IDT_MC; + if ((error = vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap)) != 0) + goto done; + + /* link pointer */ + if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0) + goto done; +done: + VMCLEAR(vmcs); + return (error); +} + +uint64_t +vmcs_read(uint32_t encoding) +{ + int error; + uint64_t val; + + error = vmread(encoding, &val); + if (error != 0) + panic("vmcs_read(%u) error %d", encoding, error); + + return (val); +} diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h new file mode 100644 index 000000000000..c633a5957a07 --- /dev/null +++ b/sys/amd64/vmm/intel/vmcs.h @@ -0,0 +1,324 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMCS_H_ +#define _VMCS_H_ + +#ifdef _KERNEL +struct vmcs { + uint32_t identifier; + uint32_t abort_code; + char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2]; +}; +CTASSERT(sizeof(struct vmcs) == PAGE_SIZE); + +/* MSR save region is composed of an array of 'struct msr_entry' */ +struct msr_entry { + uint32_t index; + uint32_t reserved; + uint64_t val; + +}; + +int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count); +int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp, + u_long ept_pml4, + uint32_t pinbased_ctls, uint32_t procbased_ctls, + uint32_t procbased_ctls2, uint32_t exit_ctls, + uint32_t entry_ctls, u_long msr_bitmap, + uint16_t vpid); +int vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval); +int vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val); +int vmcs_getdesc(struct vmcs *vmcs, int ident, + struct seg_desc *desc); +int vmcs_setdesc(struct vmcs *vmcs, int ident, + struct seg_desc *desc); +uint64_t vmcs_read(uint32_t encoding); + +#define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH) +#define vmcs_guest_rip() vmcs_read(VMCS_GUEST_RIP) +#define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR) +#define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff) +#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION) + +#endif /* _KERNEL */ + +#define VMCS_IDENT(encoding) ((encoding) | 0x80000000) +/* + * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B. 
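The encodings listed below are structured, not arbitrary: in the Appendix H scheme, bit 0 is the access type (0 = full), bits 9:1 the index, bits 11:10 the field type (0 control, 1 read-only data, 2 guest state, 3 host state) and bits 14:13 the width (0 16-bit, 1 64-bit, 2 32-bit, 3 natural). Decoding one constant from the table as a sanity check:

#include <assert.h>
#include <stdint.h>

#define VMCS_GUEST_CS_SELECTOR	0x00000802

int
main(void)
{
	uint32_t enc = VMCS_GUEST_CS_SELECTOR;

	assert((enc & 0x1) == 0);		/* access type: full */
	assert(((enc >> 1) & 0x1ff) == 1);	/* index 1 (ES is 0, CS is 1) */
	assert(((enc >> 10) & 0x3) == 2);	/* field type: guest state */
	assert(((enc >> 13) & 0x3) == 0);	/* width: 16-bit */
	return (0);
}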
+ */ +#define VMCS_INVALID_ENCODING 0xffffffff + +/* 16-bit control fields */ +#define VMCS_VPID 0x00000000 + +/* 16-bit guest-state fields */ +#define VMCS_GUEST_ES_SELECTOR 0x00000800 +#define VMCS_GUEST_CS_SELECTOR 0x00000802 +#define VMCS_GUEST_SS_SELECTOR 0x00000804 +#define VMCS_GUEST_DS_SELECTOR 0x00000806 +#define VMCS_GUEST_FS_SELECTOR 0x00000808 +#define VMCS_GUEST_GS_SELECTOR 0x0000080A +#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C +#define VMCS_GUEST_TR_SELECTOR 0x0000080E + +/* 16-bit host-state fields */ +#define VMCS_HOST_ES_SELECTOR 0x00000C00 +#define VMCS_HOST_CS_SELECTOR 0x00000C02 +#define VMCS_HOST_SS_SELECTOR 0x00000C04 +#define VMCS_HOST_DS_SELECTOR 0x00000C06 +#define VMCS_HOST_FS_SELECTOR 0x00000C08 +#define VMCS_HOST_GS_SELECTOR 0x00000C0A +#define VMCS_HOST_TR_SELECTOR 0x00000C0C + +/* 64-bit control fields */ +#define VMCS_IO_BITMAP_A 0x00002000 +#define VMCS_IO_BITMAP_B 0x00002002 +#define VMCS_MSR_BITMAP 0x00002004 +#define VMCS_EXIT_MSR_STORE 0x00002006 +#define VMCS_EXIT_MSR_LOAD 0x00002008 +#define VMCS_ENTRY_MSR_LOAD 0x0000200A +#define VMCS_EXECUTIVE_VMCS 0x0000200C +#define VMCS_TSC_OFFSET 0x00002010 +#define VMCS_VIRTUAL_APIC 0x00002012 +#define VMCS_APIC_ACCESS 0x00002014 +#define VMCS_EPTP 0x0000201A + +/* 64-bit read-only fields */ +#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 + +/* 64-bit guest-state fields */ +#define VMCS_LINK_POINTER 0x00002800 +#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802 +#define VMCS_GUEST_IA32_PAT 0x00002804 +#define VMCS_GUEST_IA32_EFER 0x00002806 +#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808 +#define VMCS_GUEST_PDPTE0 0x0000280A +#define VMCS_GUEST_PDPTE1 0x0000280C +#define VMCS_GUEST_PDPTE2 0x0000280E +#define VMCS_GUEST_PDPTE3 0x00002810 + +/* 64-bit host-state fields */ +#define VMCS_HOST_IA32_PAT 0x00002C00 +#define VMCS_HOST_IA32_EFER 0x00002C02 +#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04 + +/* 32-bit control fields */ +#define VMCS_PIN_BASED_CTLS 0x00004000 +#define VMCS_PRI_PROC_BASED_CTLS 0x00004002 +#define VMCS_EXCEPTION_BITMAP 0x00004004 +#define VMCS_PF_ERROR_MASK 0x00004006 +#define VMCS_PF_ERROR_MATCH 0x00004008 +#define VMCS_CR3_TARGET_COUNT 0x0000400A +#define VMCS_EXIT_CTLS 0x0000400C +#define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E +#define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010 +#define VMCS_ENTRY_CTLS 0x00004012 +#define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014 +#define VMCS_ENTRY_INTR_INFO 0x00004016 +#define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018 +#define VMCS_ENTRY_INST_LENGTH 0x0000401A +#define VMCS_TPR_THRESHOLD 0x0000401C +#define VMCS_SEC_PROC_BASED_CTLS 0x0000401E +#define VMCS_PLE_GAP 0x00004020 +#define VMCS_PLE_WINDOW 0x00004022 + +/* 32-bit read-only data fields */ +#define VMCS_INSTRUCTION_ERROR 0x00004400 +#define VMCS_EXIT_REASON 0x00004402 +#define VMCS_EXIT_INTERRUPTION_INFO 0x00004404 +#define VMCS_EXIT_INTERRUPTION_ERROR 0x00004406 +#define VMCS_IDT_VECTORING_INFO 0x00004408 +#define VMCS_IDT_VECTORING_ERROR 0x0000440A +#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C +#define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E + +/* 32-bit guest-state fields */ +#define VMCS_GUEST_ES_LIMIT 0x00004800 +#define VMCS_GUEST_CS_LIMIT 0x00004802 +#define VMCS_GUEST_SS_LIMIT 0x00004804 +#define VMCS_GUEST_DS_LIMIT 0x00004806 +#define VMCS_GUEST_FS_LIMIT 0x00004808 +#define VMCS_GUEST_GS_LIMIT 0x0000480A +#define VMCS_GUEST_LDTR_LIMIT 0x0000480C +#define VMCS_GUEST_TR_LIMIT 0x0000480E +#define VMCS_GUEST_GDTR_LIMIT 0x00004810 +#define VMCS_GUEST_IDTR_LIMIT 0x00004812 +#define 
VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814 +#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816 +#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818 +#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A +#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C +#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E +#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820 +#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822 +#define VMCS_GUEST_INTERRUPTIBILITY 0x00004824 +#define VMCS_GUEST_ACTIVITY 0x00004826 +#define VMCS_GUEST_SMBASE 0x00004828 +#define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A +#define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E + +/* 32-bit host state fields */ +#define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00 + +/* Natural Width control fields */ +#define VMCS_CR0_MASK 0x00006000 +#define VMCS_CR4_MASK 0x00006002 +#define VMCS_CR0_SHADOW 0x00006004 +#define VMCS_CR4_SHADOW 0x00006006 +#define VMCS_CR3_TARGET0 0x00006008 +#define VMCS_CR3_TARGET1 0x0000600A +#define VMCS_CR3_TARGET2 0x0000600C +#define VMCS_CR3_TARGET3 0x0000600E + +/* Natural Width read-only fields */ +#define VMCS_EXIT_QUALIFICATION 0x00006400 +#define VMCS_IO_RCX 0x00006402 +#define VMCS_IO_RSI 0x00006404 +#define VMCS_IO_RDI 0x00006406 +#define VMCS_IO_RIP 0x00006408 +#define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A + +/* Natural Width guest-state fields */ +#define VMCS_GUEST_CR0 0x00006800 +#define VMCS_GUEST_CR3 0x00006802 +#define VMCS_GUEST_CR4 0x00006804 +#define VMCS_GUEST_ES_BASE 0x00006806 +#define VMCS_GUEST_CS_BASE 0x00006808 +#define VMCS_GUEST_SS_BASE 0x0000680A +#define VMCS_GUEST_DS_BASE 0x0000680C +#define VMCS_GUEST_FS_BASE 0x0000680E +#define VMCS_GUEST_GS_BASE 0x00006810 +#define VMCS_GUEST_LDTR_BASE 0x00006812 +#define VMCS_GUEST_TR_BASE 0x00006814 +#define VMCS_GUEST_GDTR_BASE 0x00006816 +#define VMCS_GUEST_IDTR_BASE 0x00006818 +#define VMCS_GUEST_DR7 0x0000681A +#define VMCS_GUEST_RSP 0x0000681C +#define VMCS_GUEST_RIP 0x0000681E +#define VMCS_GUEST_RFLAGS 0x00006820 +#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822 +#define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824 +#define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826 + +/* Natural Width host-state fields */ +#define VMCS_HOST_CR0 0x00006C00 +#define VMCS_HOST_CR3 0x00006C02 +#define VMCS_HOST_CR4 0x00006C04 +#define VMCS_HOST_FS_BASE 0x00006C06 +#define VMCS_HOST_GS_BASE 0x00006C08 +#define VMCS_HOST_TR_BASE 0x00006C0A +#define VMCS_HOST_GDTR_BASE 0x00006C0C +#define VMCS_HOST_IDTR_BASE 0x00006C0E +#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10 +#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12 +#define VMCS_HOST_RSP 0x00006C14 +#define VMCS_HOST_RIP 0x00006c16 + +/* + * VM instruction error numbers + */ +#define VMRESUME_WITH_NON_LAUNCHED_VMCS 5 + +/* + * VMCS exit reasons + */ +#define EXIT_REASON_EXCEPTION 0 +#define EXIT_REASON_EXT_INTR 1 +#define EXIT_REASON_TRIPLE_FAULT 2 +#define EXIT_REASON_INIT 3 +#define EXIT_REASON_SIPI 4 +#define EXIT_REASON_IO_SMI 5 +#define EXIT_REASON_SMI 6 +#define EXIT_REASON_INTR_WINDOW 7 +#define EXIT_REASON_NMI_WINDOW 8 +#define EXIT_REASON_TASK_SWITCH 9 +#define EXIT_REASON_CPUID 10 +#define EXIT_REASON_GETSEC 11 +#define EXIT_REASON_HLT 12 +#define EXIT_REASON_INVD 13 +#define EXIT_REASON_INVLPG 14 +#define EXIT_REASON_RDPMC 15 +#define EXIT_REASON_RDTSC 16 +#define EXIT_REASON_RSM 17 +#define EXIT_REASON_VMCALL 18 +#define EXIT_REASON_VMCLEAR 19 +#define EXIT_REASON_VMLAUNCH 20 +#define EXIT_REASON_VMPTRLD 21 +#define EXIT_REASON_VMPTRST 22 +#define EXIT_REASON_VMREAD 23 +#define EXIT_REASON_VMRESUME 24 +#define EXIT_REASON_VMWRITE 25 +#define 
EXIT_REASON_VMXOFF 26 +#define EXIT_REASON_VMXON 27 +#define EXIT_REASON_CR_ACCESS 28 +#define EXIT_REASON_DR_ACCESS 29 +#define EXIT_REASON_INOUT 30 +#define EXIT_REASON_RDMSR 31 +#define EXIT_REASON_WRMSR 32 +#define EXIT_REASON_INVAL_VMCS 33 +#define EXIT_REASON_INVAL_MSR 34 +#define EXIT_REASON_MWAIT 36 +#define EXIT_REASON_MTF 37 +#define EXIT_REASON_MONITOR 39 +#define EXIT_REASON_PAUSE 40 +#define EXIT_REASON_MCE 41 +#define EXIT_REASON_TPR 43 +#define EXIT_REASON_APIC 44 +#define EXIT_REASON_GDTR_IDTR 46 +#define EXIT_REASON_LDTR_TR 47 +#define EXIT_REASON_EPT_FAULT 48 +#define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_INVEPT 50 +#define EXIT_REASON_RDTSCP 51 +#define EXIT_REASON_VMX_PREEMPT 52 +#define EXIT_REASON_INVVPID 53 +#define EXIT_REASON_WBINVD 54 +#define EXIT_REASON_XSETBV 55 + +/* + * VMCS interrupt information fields + */ +#define VMCS_INTERRUPTION_INFO_VALID (1 << 31) +#define VMCS_INTERRUPTION_INFO_HW_INTR (0 << 8) +#define VMCS_INTERRUPTION_INFO_NMI (2 << 8) + +/* + * VMCS Guest interruptibility field + */ +#define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0) +#define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1 << 1) +#define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1 << 2) +#define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1 << 3) + +/* + * Exit qualification for EXIT_REASON_INVAL_VMCS + */ +#define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3 + +#endif diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c new file mode 100644 index 000000000000..ec181c40ab90 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx.c @@ -0,0 +1,1673 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
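One detail worth noting about the exit reason table above: vmcs_exit_reason() masks VMCS_EXIT_REASON with 0xffff because only bits 15:0 hold the basic reason; the upper half carries status flags, notably bit 31, which is set when the VM entry itself failed (the case EXIT_REASON_INVAL_VMCS reports). A small illustration — the ENTRY_FAIL macro name here is ours, not part of the commit:

#include <assert.h>
#include <stdint.h>

#define EXIT_REASON_INVAL_VMCS		33
#define VMX_EXIT_REASON_ENTRY_FAIL	(1U << 31)	/* hypothetical name */

int
main(void)
{
	/* a failed entry due to invalid guest state might read back as: */
	uint32_t raw = VMX_EXIT_REASON_ENTRY_FAIL | EXIT_REASON_INVAL_VMCS;

	assert((raw & 0xffff) == EXIT_REASON_INVAL_VMCS);
	return (0);
}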
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/smp.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/pcpu.h> +#include <sys/proc.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/psl.h> +#include <machine/cpufunc.h> +#include <machine/pmap.h> +#include <machine/segments.h> +#include <machine/vmparam.h> + +#include <machine/vmm.h> +#include "vmm_lapic.h" +#include "vmm_msr.h" +#include "vmm_ktr.h" +#include "vmm_stat.h" + +#include "vmx_msr.h" +#include "ept.h" +#include "vmx_cpufunc.h" +#include "vmx.h" +#include "x86.h" +#include "vmx_controls.h" + +#define CR4_VMXE (1UL << 13) + +#define PINBASED_CTLS_ONE_SETTING \ + (PINBASED_EXTINT_EXITING | \ + PINBASED_NMI_EXITING | \ + PINBASED_VIRTUAL_NMI) +#define PINBASED_CTLS_ZERO_SETTING 0 + +#define PROCBASED_CTLS_WINDOW_SETTING \ + (PROCBASED_INT_WINDOW_EXITING | \ + PROCBASED_NMI_WINDOW_EXITING) + +#define PROCBASED_CTLS_ONE_SETTING \ + (PROCBASED_SECONDARY_CONTROLS | \ + PROCBASED_IO_EXITING | \ + PROCBASED_MSR_BITMAPS | \ + PROCBASED_CTLS_WINDOW_SETTING) +#define PROCBASED_CTLS_ZERO_SETTING \ + (PROCBASED_CR3_LOAD_EXITING | \ + PROCBASED_CR3_STORE_EXITING | \ + PROCBASED_IO_BITMAPS) + +#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT +#define PROCBASED_CTLS2_ZERO_SETTING 0 + +#define VM_EXIT_CTLS_ONE_SETTING \ + (VM_EXIT_HOST_LMA | \ + VM_EXIT_SAVE_EFER | \ + VM_EXIT_SAVE_PAT | \ + VM_EXIT_LOAD_PAT | \ + VM_EXIT_LOAD_EFER) +#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS + +#define VM_ENTRY_CTLS_ONE_SETTING \ + (VM_ENTRY_LOAD_PAT | \ + VM_ENTRY_LOAD_EFER) +#define VM_ENTRY_CTLS_ZERO_SETTING \ + (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ + VM_ENTRY_INTO_SMM | \ + VM_ENTRY_DEACTIVATE_DUAL_MONITOR) + +#define guest_msr_rw(vmx, msr) \ + msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW) + +#define HANDLED 1 +#define UNHANDLED 0 + +MALLOC_DEFINE(M_VMX, "vmx", "vmx"); + +extern struct pcpu __pcpu[]; + +static int vmxon_enabled[MAXCPU]; +static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); + +static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; +static uint32_t exit_ctls, entry_ctls; + +static uint64_t cr0_ones_mask, cr0_zeros_mask; +static uint64_t cr4_ones_mask, cr4_zeros_mask; + +static volatile u_int nextvpid; + +/* + * Virtual NMI blocking conditions. + * + * Some processor implementations also require NMI to be blocked if + * the STI_BLOCKING bit is set. It is possible to detect this at runtime + * based on the (exit_reason,exit_qual) tuple being set to + * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING). + * + * We take the easy way out and also include STI_BLOCKING as one of the + * gating items for vNMI injection. 
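The *_ONE_SETTING/*_ZERO_SETTING pairs above express, per control word, which bits vmx_init() insists on and which it forbids; vmx_set_ctlreg() (vmx_msr.c, elsewhere in this commit) validates them against the capability MSRs, whose low 32 bits give the controls that must be 1 and whose high 32 bits give those that may be 1. A hedged sketch of the check it presumably performs:

#include <errno.h>
#include <stdint.h>

/* Sketch under stated assumptions; not the real vmx_set_ctlreg(). */
static int
check_ctls(uint64_t cap, uint32_t ones, uint32_t zeros, uint32_t *retval)
{
	uint32_t must_be_one = (uint32_t)cap;		/* allowed 0-settings */
	uint32_t may_be_one = (uint32_t)(cap >> 32);	/* allowed 1-settings */

	if ((ones & may_be_one) != ones)
		return (EINVAL);	/* a required bit cannot be set */
	if ((zeros & must_be_one) != 0)
		return (EINVAL);	/* a forbidden bit is forced on */

	*retval = must_be_one | ones;	/* forced bits plus requested bits */
	return (0);
}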
+ */ +static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING | + VMCS_INTERRUPTIBILITY_NMI_BLOCKING | + VMCS_INTERRUPTIBILITY_STI_BLOCKING; + +/* + * Optional capabilities + */ +static int cap_halt_exit; +static int cap_pause_exit; +static int cap_unrestricted_guest; +static int cap_monitor_trap; + +/* statistics */ +static VMM_STAT_DEFINE(VCPU_MIGRATIONS, "vcpu migration across host cpus"); +static VMM_STAT_DEFINE(VMEXIT_EXTINT, "vm exits due to external interrupt"); + +#ifdef KTR +static const char * +exit_reason_to_str(int reason) +{ + static char reasonbuf[32]; + + switch (reason) { + case EXIT_REASON_EXCEPTION: + return "exception"; + case EXIT_REASON_EXT_INTR: + return "extint"; + case EXIT_REASON_TRIPLE_FAULT: + return "triplefault"; + case EXIT_REASON_INIT: + return "init"; + case EXIT_REASON_SIPI: + return "sipi"; + case EXIT_REASON_IO_SMI: + return "iosmi"; + case EXIT_REASON_SMI: + return "smi"; + case EXIT_REASON_INTR_WINDOW: + return "intrwindow"; + case EXIT_REASON_NMI_WINDOW: + return "nmiwindow"; + case EXIT_REASON_TASK_SWITCH: + return "taskswitch"; + case EXIT_REASON_CPUID: + return "cpuid"; + case EXIT_REASON_GETSEC: + return "getsec"; + case EXIT_REASON_HLT: + return "hlt"; + case EXIT_REASON_INVD: + return "invd"; + case EXIT_REASON_INVLPG: + return "invlpg"; + case EXIT_REASON_RDPMC: + return "rdpmc"; + case EXIT_REASON_RDTSC: + return "rdtsc"; + case EXIT_REASON_RSM: + return "rsm"; + case EXIT_REASON_VMCALL: + return "vmcall"; + case EXIT_REASON_VMCLEAR: + return "vmclear"; + case EXIT_REASON_VMLAUNCH: + return "vmlaunch"; + case EXIT_REASON_VMPTRLD: + return "vmptrld"; + case EXIT_REASON_VMPTRST: + return "vmptrst"; + case EXIT_REASON_VMREAD: + return "vmread"; + case EXIT_REASON_VMRESUME: + return "vmresume"; + case EXIT_REASON_VMWRITE: + return "vmwrite"; + case EXIT_REASON_VMXOFF: + return "vmxoff"; + case EXIT_REASON_VMXON: + return "vmxon"; + case EXIT_REASON_CR_ACCESS: + return "craccess"; + case EXIT_REASON_DR_ACCESS: + return "draccess"; + case EXIT_REASON_INOUT: + return "inout"; + case EXIT_REASON_RDMSR: + return "rdmsr"; + case EXIT_REASON_WRMSR: + return "wrmsr"; + case EXIT_REASON_INVAL_VMCS: + return "invalvmcs"; + case EXIT_REASON_INVAL_MSR: + return "invalmsr"; + case EXIT_REASON_MWAIT: + return "mwait"; + case EXIT_REASON_MTF: + return "mtf"; + case EXIT_REASON_MONITOR: + return "monitor"; + case EXIT_REASON_PAUSE: + return "pause"; + case EXIT_REASON_MCE: + return "mce"; + case EXIT_REASON_TPR: + return "tpr"; + case EXIT_REASON_APIC: + return "apic"; + case EXIT_REASON_GDTR_IDTR: + return "gdtridtr"; + case EXIT_REASON_LDTR_TR: + return "ldtrtr"; + case EXIT_REASON_EPT_FAULT: + return "eptfault"; + case EXIT_REASON_EPT_MISCONFIG: + return "eptmisconfig"; + case EXIT_REASON_INVEPT: + return "invept"; + case EXIT_REASON_RDTSCP: + return "rdtscp"; + case EXIT_REASON_VMX_PREEMPT: + return "vmxpreempt"; + case EXIT_REASON_INVVPID: + return "invvpid"; + case EXIT_REASON_WBINVD: + return "wbinvd"; + case EXIT_REASON_XSETBV: + return "xsetbv"; + default: + snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); + return (reasonbuf); + } +} + +#ifdef SETJMP_TRACE +static const char * +vmx_setjmp_rc2str(int rc) +{ + switch (rc) { + case VMX_RETURN_DIRECT: + return "direct"; + case VMX_RETURN_LONGJMP: + return "longjmp"; + case VMX_RETURN_VMRESUME: + return "vmresume"; + case VMX_RETURN_VMLAUNCH: + return "vmlaunch"; + default: + return "unknown"; + } +} + +#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \ + VMM_CTR1((vmx)->vm, (vcpu), 
"setjmp trace " #regname " 0x%016lx", \ + (vmxctx)->regname) + +static void +vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) +{ + uint64_t host_rip, host_rsp; + + if (vmxctx != &vmx->ctx[vcpu]) + panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p", + vmxctx, &vmx->ctx[vcpu]); + + VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx); + VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)", + vmx_setjmp_rc2str(rc), rc); + + host_rsp = host_rip = ~0; + vmread(VMCS_HOST_RIP, &host_rip); + vmread(VMCS_HOST_RSP, &host_rsp); + VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx", + host_rip, host_rsp); + + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip); + + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2); +} +#endif +#else +static void __inline +vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) +{ + return; +} +#endif /* KTR */ + +u_long +vmx_fix_cr0(u_long cr0) +{ + + return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); +} + +u_long +vmx_fix_cr4(u_long cr4) +{ + + return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); +} + +static void +msr_save_area_init(struct msr_entry *g_area, int *g_count) +{ + int cnt; + + static struct msr_entry guest_msrs[] = { + { MSR_KGSBASE, 0, 0 }, + }; + + cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]); + if (cnt > GUEST_MSR_MAX_ENTRIES) + panic("guest msr save area overrun"); + bcopy(guest_msrs, g_area, sizeof(guest_msrs)); + *g_count = cnt; +} + +static void +vmx_disable(void *arg __unused) +{ + struct invvpid_desc invvpid_desc = { 0 }; + struct invept_desc invept_desc = { 0 }; + + if (vmxon_enabled[curcpu]) { + /* + * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. + * + * VMXON or VMXOFF are not required to invalidate any TLB + * caching structures. This prevents potential retention of + * cached information in the TLB between distinct VMX episodes. 
+ */ + invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); + invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); + vmxoff(); + } + load_cr4(rcr4() & ~CR4_VMXE); +} + +static int +vmx_cleanup(void) +{ + + smp_rendezvous(NULL, vmx_disable, NULL, NULL); + + return (0); +} + +static void +vmx_enable(void *arg __unused) +{ + int error; + + load_cr4(rcr4() | CR4_VMXE); + + *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); + error = vmxon(vmxon_region[curcpu]); + if (error == 0) + vmxon_enabled[curcpu] = 1; +} + +static int +vmx_init(void) +{ + int error; + unsigned int regs[4]; + uint64_t fixed0, fixed1; + uint32_t tmp; + + /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ + do_cpuid(1, regs); + if ((regs[2] & CPUID_0000_0001_FEAT0_VMX) == 0) { + printf("vmx_init: processor does not support VMX operation\n"); + return (ENXIO); + } + + /* Check support for primary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_CTLS_ONE_SETTING, + PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); + if (error) { + printf("vmx_init: processor does not support desired primary " + "processor-based controls\n"); + return (error); + } + + /* Clear the processor-based ctl bits that are set on demand */ + procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; + + /* Check support for secondary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, + PROCBASED_CTLS2_ONE_SETTING, + PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); + if (error) { + printf("vmx_init: processor does not support desired secondary " + "processor-based controls\n"); + return (error); + } + + /* Check support for VPID */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_ENABLE_VPID, 0, &tmp); + if (error == 0) + procbased_ctls2 |= PROCBASED2_ENABLE_VPID; + + /* Check support for pin-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, + PINBASED_CTLS_ONE_SETTING, + PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "pin-based controls\n"); + return (error); + } + + /* Check support for VM-exit controls */ + error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, + VM_EXIT_CTLS_ONE_SETTING, + VM_EXIT_CTLS_ZERO_SETTING, + &exit_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "exit controls\n"); + return (error); + } + + /* Check support for VM-entry controls */ + error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, + VM_ENTRY_CTLS_ONE_SETTING, + VM_ENTRY_CTLS_ZERO_SETTING, + &entry_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "entry controls\n"); + return (error); + } + + /* + * Check support for optional features by testing them + * as individual bits + */ + cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_HLT_EXITING, 0, + &tmp) == 0); + + cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_PROCBASED_CTLS, + PROCBASED_MTF, 0, + &tmp) == 0); + + cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_PAUSE_EXITING, 0, + &tmp) == 0); + + cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_UNRESTRICTED_GUEST, 0, + &tmp) == 0); + + /* Initialize EPT */ + error = ept_init(); + if (error) { + 
printf("vmx_init: ept initialization failed (%d)\n", error); + return (error); + } + + /* + * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 + */ + fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); + fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); + cr0_ones_mask = fixed0 & fixed1; + cr0_zeros_mask = ~fixed0 & ~fixed1; + + /* + * CR0_PE and CR0_PG can be set to zero in VMX non-root operation + * if unrestricted guest execution is allowed. + */ + if (cap_unrestricted_guest) + cr0_ones_mask &= ~(CR0_PG | CR0_PE); + + /* + * Do not allow the guest to set CR0_NW or CR0_CD. + */ + cr0_zeros_mask |= (CR0_NW | CR0_CD); + + fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); + fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); + cr4_ones_mask = fixed0 & fixed1; + cr4_zeros_mask = ~fixed0 & ~fixed1; + + /* enable VMX operation */ + smp_rendezvous(NULL, vmx_enable, NULL, NULL); + + return (0); +} + +/* + * If this processor does not support VPIDs then simply return 0. + * + * Otherwise generate the next value of VPID to use. Any value is alright + * as long as it is non-zero. + * + * We always execute in VMX non-root context with EPT enabled. Thus all + * combined mappings are tagged with the (EP4TA, VPID, PCID) tuple. This + * in turn means that multiple VMs can share the same VPID as long as + * they have distinct EPT page tables. + * + * XXX + * We should optimize this so that it returns VPIDs that are not in + * use. Then we will not unnecessarily invalidate mappings in + * vmx_set_pcpu_defaults() just because two or more vcpus happen to + * use the same 'vpid'. + */ +static uint16_t +vmx_vpid(void) +{ + uint16_t vpid = 0; + + if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) != 0) { + do { + vpid = atomic_fetchadd_int(&nextvpid, 1); + } while (vpid == 0); + } + + return (vpid); +} + +static int +vmx_setup_cr0_shadow(struct vmcs *vmcs) +{ + int error; + uint64_t mask, shadow; + + mask = cr0_ones_mask | cr0_zeros_mask; + error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_CR0_MASK), mask); + if (error) + return (error); + + shadow = cr0_ones_mask; + error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_CR0_SHADOW), shadow); + if (error) + return (error); + + return (0); +} + +static void * +vmx_vminit(struct vm *vm) +{ + uint16_t vpid; + int i, error, guest_msr_count; + struct vmx *vmx; + + vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); + if ((uintptr_t)vmx & PAGE_MASK) { + panic("malloc of struct vmx not aligned on %d byte boundary", + PAGE_SIZE); + } + vmx->vm = vm; + + /* + * Clean up EPTP-tagged guest physical and combined mappings + * + * VMX transitions are not required to invalidate any guest physical + * mappings. So, it may be possible for stale guest physical mappings + * to be present in the processor TLBs. + * + * Combined mappings for this EP4TA are also invalidated for all VPIDs. + */ + ept_invalidate_mappings(vtophys(vmx->pml4ept)); + + msr_bitmap_initialize(vmx->msr_bitmap); + + /* + * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. + * The guest FSBASE and GSBASE are saved and restored during + * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are + * always restored from the vmcs host state area on vm-exit. + * + * Guest KGSBASE is saved and restored in the guest MSR save area. + * Host KGSBASE is restored before returning to userland from the pcb. + * There will be a window of time when we are executing in the host + * kernel context with a value of KGSBASE from the guest. This is ok + * because the value of KGSBASE is inconsequential in kernel context. 
+ * + * MSR_EFER is saved and restored in the guest VMCS area on a + * VM exit and entry respectively. It is also restored from the + * host VMCS area on a VM exit. + * + * MSR_PAT is saved and restored in the guest VMCS are on a VM exit + * and entry respectively. It is also restored from the host VMCS + * area on a VM exit. + */ + if (guest_msr_rw(vmx, MSR_GSBASE) || + guest_msr_rw(vmx, MSR_FSBASE) || + guest_msr_rw(vmx, MSR_KGSBASE) || + guest_msr_rw(vmx, MSR_EFER) || + guest_msr_rw(vmx, MSR_PAT)) + panic("vmx_vminit: error setting guest msr access"); + + for (i = 0; i < VM_MAXCPU; i++) { + vmx->vmcs[i].identifier = vmx_revision(); + error = vmclear(&vmx->vmcs[i]); + if (error != 0) { + panic("vmx_vminit: vmclear error %d on vcpu %d\n", + error, i); + } + + vpid = vmx_vpid(); + + error = vmcs_set_defaults(&vmx->vmcs[i], + (u_long)vmx_longjmp, + (u_long)&vmx->ctx[i], + vtophys(vmx->pml4ept), + pinbased_ctls, + procbased_ctls, + procbased_ctls2, + exit_ctls, entry_ctls, + vtophys(vmx->msr_bitmap), + vpid); + + if (error != 0) + panic("vmx_vminit: vmcs_set_defaults error %d", error); + + vmx->cap[i].set = 0; + vmx->cap[i].proc_ctls = procbased_ctls; + + vmx->state[i].request_nmi = 0; + vmx->state[i].lastcpu = -1; + vmx->state[i].vpid = vpid; + + msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count); + + error = vmcs_set_msr_save(&vmx->vmcs[i], + vtophys(vmx->guest_msrs[i]), + guest_msr_count); + if (error != 0) + panic("vmcs_set_msr_save error %d", error); + + error = vmx_setup_cr0_shadow(&vmx->vmcs[i]); + } + + return (vmx); +} + +static int +vmx_handle_cpuid(struct vmxctx *vmxctx) +{ + int handled, func; + + func = vmxctx->guest_rax; + + handled = x86_emulate_cpuid((uint32_t*)(&vmxctx->guest_rax), + (uint32_t*)(&vmxctx->guest_rbx), (uint32_t*)(&vmxctx->guest_rcx), + (uint32_t*)(&vmxctx->guest_rdx)); +#if 0 + printf("%s: func %x rax %lx rbx %lx rcx %lx rdx %lx handled %d\n", + __func__, func, vmxctx->guest_rax, vmxctx->guest_rbx, + vmxctx->guest_rcx, vmxctx->guest_rdx, handled); +#endif + + return (handled); +} + +static __inline void +vmx_run_trace(struct vmx *vmx, int vcpu) +{ +#ifdef KTR + VMM_CTR1(vmx->vm, vcpu, "Resume execution at 0x%0lx", vmcs_guest_rip()); +#endif +} + +static __inline void +vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, + int handled, int astpending) +{ +#ifdef KTR + VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", + handled ? "handled" : "unhandled", + exit_reason_to_str(exit_reason), rip); + + if (astpending) + VMM_CTR0(vmx->vm, vcpu, "astpending"); +#endif +} + +static int +vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) +{ + int error, lastcpu; + struct vmxstate *vmxstate; + struct invvpid_desc invvpid_desc = { 0 }; + + vmxstate = &vmx->state[vcpu]; + lastcpu = vmxstate->lastcpu; + vmxstate->lastcpu = curcpu; + + if (lastcpu == curcpu) { + error = 0; + goto done; + } + + vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); + + error = vmwrite(VMCS_HOST_TR_BASE, (u_long)PCPU_GET(tssp)); + if (error != 0) + goto done; + + error = vmwrite(VMCS_HOST_GDTR_BASE, (u_long)&gdt[NGDT * curcpu]); + if (error != 0) + goto done; + + error = vmwrite(VMCS_HOST_GS_BASE, (u_long)&__pcpu[curcpu]); + if (error != 0) + goto done; + + /* + * If we are using VPIDs then invalidate all mappings tagged with 'vpid' + * + * We do this because this vcpu was executing on a different host + * cpu when it last ran. We do not track whether it invalidated + * mappings associated with its 'vpid' during that run. 
So we must + * assume that the mappings associated with 'vpid' on 'curcpu' are + * stale and invalidate them. + * + * Note that we incur this penalty only when the scheduler chooses to + * move the thread associated with this vcpu between host cpus. + * + * Note also that this will invalidate mappings tagged with 'vpid' + * for "all" EP4TAs. + */ + if (vmxstate->vpid != 0) { + invvpid_desc.vpid = vmxstate->vpid; + invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); + } +done: + return (error); +} + +static void +vm_exit_update_rip(struct vm_exit *vmexit) +{ + int error; + + error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length); + if (error) + panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); +} + +/* + * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. + */ +CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); + +static void __inline +vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) +{ + int error; + + vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; + + error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + if (error) + panic("vmx_set_int_window_exiting: vmwrite error %d", error); +} + +static void __inline +vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) +{ + int error; + + vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; + + error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + if (error) + panic("vmx_clear_int_window_exiting: vmwrite error %d", error); +} + +static void __inline +vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) +{ + int error; + + vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; + + error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + if (error) + panic("vmx_set_nmi_window_exiting: vmwrite error %d", error); +} + +static void __inline +vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) +{ + int error; + + vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; + + error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + if (error) + panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error); +} + +static int +vmx_inject_nmi(struct vmx *vmx, int vcpu) +{ + int error; + uint64_t info, interruptibility; + + /* Bail out if no NMI requested */ + if (vmx->state[vcpu].request_nmi == 0) + return (0); + + error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility); + if (error) { + panic("vmx_inject_nmi: vmread(interruptibility) %d", + error); + } + if (interruptibility & nmi_blocking_bits) + goto nmiblocked; + + /* + * Inject the virtual NMI. The vector must be the NMI IDT entry + * or the VMCS entry check will fail. + */ + info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID; + info |= IDT_NMI; + + error = vmwrite(VMCS_ENTRY_INTR_INFO, info); + if (error) + panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error); + + VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI"); + + /* Clear the request */ + vmx->state[vcpu].request_nmi = 0; + return (1); + +nmiblocked: + /* + * Set the NMI Window Exiting execution control so we can inject + * the virtual NMI as soon as blocking condition goes away. 
+ */ + vmx_set_nmi_window_exiting(vmx, vcpu); + + VMM_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); + return (1); +} + +static void +vmx_inject_interrupts(struct vmx *vmx, int vcpu) +{ + int error, vector; + uint64_t info, rflags, interruptibility; + + const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING | + VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING; + +#if 1 + /* + * XXX + * If an event is being injected from userland then just return. + * For e.g. we may inject a breakpoint exception to cause the + * guest to enter the debugger so we can inspect its state. + */ + error = vmread(VMCS_ENTRY_INTR_INFO, &info); + if (error) + panic("vmx_inject_interrupts: vmread(intrinfo) %d", error); + if (info & VMCS_INTERRUPTION_INFO_VALID) + return; +#endif + /* + * NMI injection has priority so deal with those first + */ + if (vmx_inject_nmi(vmx, vcpu)) + return; + + /* Ask the local apic for a vector to inject */ + vector = lapic_pending_intr(vmx->vm, vcpu); + if (vector < 0) + return; + + if (vector < 32 || vector > 255) + panic("vmx_inject_interrupts: invalid vector %d\n", vector); + + /* Check RFLAGS.IF and the interruptibility state of the guest */ + error = vmread(VMCS_GUEST_RFLAGS, &rflags); + if (error) + panic("vmx_inject_interrupts: vmread(rflags) %d", error); + + if ((rflags & PSL_I) == 0) + goto cantinject; + + error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility); + if (error) { + panic("vmx_inject_interrupts: vmread(interruptibility) %d", + error); + } + if (interruptibility & HWINTR_BLOCKED) + goto cantinject; + + /* Inject the interrupt */ + info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID; + info |= vector; + error = vmwrite(VMCS_ENTRY_INTR_INFO, info); + if (error) + panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error); + + /* Update the Local APIC ISR */ + lapic_intr_accepted(vmx->vm, vcpu, vector); + + VMM_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); + + return; + +cantinject: + /* + * Set the Interrupt Window Exiting execution control so we can inject + * the interrupt as soon as blocking condition goes away. + */ + vmx_set_int_window_exiting(vmx, vcpu); + + VMM_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); +} + +static int +vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual) +{ + int error; + uint64_t regval; + const struct vmxctx *vmxctx; + + /* We only handle mov to %cr0 at this time */ + if ((exitqual & 0xff) != 0x00) + return (UNHANDLED); + + vmxctx = &vmx->ctx[vcpu]; + + /* + * We must use vmwrite() directly here because vmcs_setreg() will + * call vmclear(vmcs) as a side-effect which we certainly don't want. 
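+ *
+ * In the exit qualification for a control-register access, bits 3:0
+ * identify the control register and bits 11:8 identify the source
+ * general-purpose register; the switch below decodes the latter.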
+ */ + switch ((exitqual >> 8) & 0xf) { + case 0: + regval = vmxctx->guest_rax; + break; + case 1: + regval = vmxctx->guest_rcx; + break; + case 2: + regval = vmxctx->guest_rdx; + break; + case 3: + regval = vmxctx->guest_rbx; + break; + case 4: + error = vmread(VMCS_GUEST_RSP, ®val); + if (error) { + panic("vmx_emulate_cr_access: " + "error %d reading guest rsp", error); + } + break; + case 5: + regval = vmxctx->guest_rbp; + break; + case 6: + regval = vmxctx->guest_rsi; + break; + case 7: + regval = vmxctx->guest_rdi; + break; + case 8: + regval = vmxctx->guest_r8; + break; + case 9: + regval = vmxctx->guest_r9; + break; + case 10: + regval = vmxctx->guest_r10; + break; + case 11: + regval = vmxctx->guest_r11; + break; + case 12: + regval = vmxctx->guest_r12; + break; + case 13: + regval = vmxctx->guest_r13; + break; + case 14: + regval = vmxctx->guest_r14; + break; + case 15: + regval = vmxctx->guest_r15; + break; + } + + regval |= cr0_ones_mask; + regval &= ~cr0_zeros_mask; + error = vmwrite(VMCS_GUEST_CR0, regval); + if (error) + panic("vmx_emulate_cr_access: error %d writing cr0", error); + + return (HANDLED); +} + +static int +vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) +{ + int handled; + struct vmcs *vmcs; + struct vmxctx *vmxctx; + uint32_t eax, ecx, edx; + uint64_t qual; + + handled = 0; + vmcs = &vmx->vmcs[vcpu]; + vmxctx = &vmx->ctx[vcpu]; + qual = vmexit->u.vmx.exit_qualification; + vmexit->exitcode = VM_EXITCODE_BOGUS; + + switch (vmexit->u.vmx.exit_reason) { + case EXIT_REASON_CR_ACCESS: + handled = vmx_emulate_cr_access(vmx, vcpu, qual); + break; + case EXIT_REASON_RDMSR: + ecx = vmxctx->guest_rcx; + handled = emulate_rdmsr(vmx->vm, vcpu, ecx); + if (!handled) { + vmexit->exitcode = VM_EXITCODE_RDMSR; + vmexit->u.msr.code = ecx; + } + break; + case EXIT_REASON_WRMSR: + eax = vmxctx->guest_rax; + ecx = vmxctx->guest_rcx; + edx = vmxctx->guest_rdx; + handled = emulate_wrmsr(vmx->vm, vcpu, ecx, + (uint64_t)edx << 32 | eax); + if (!handled) { + vmexit->exitcode = VM_EXITCODE_WRMSR; + vmexit->u.msr.code = ecx; + vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; + } + break; + case EXIT_REASON_HLT: + vmexit->exitcode = VM_EXITCODE_HLT; + break; + case EXIT_REASON_MTF: + vmexit->exitcode = VM_EXITCODE_MTRAP; + break; + case EXIT_REASON_PAUSE: + vmexit->exitcode = VM_EXITCODE_PAUSE; + break; + case EXIT_REASON_INTR_WINDOW: + vmx_clear_int_window_exiting(vmx, vcpu); + VMM_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); + /* FALLTHRU */ + case EXIT_REASON_EXT_INTR: + /* + * External interrupts serve only to cause VM exits and allow + * the host interrupt handler to run. + * + * If this external interrupt triggers a virtual interrupt + * to a VM, then that state will be recorded by the + * host interrupt handler in the VM's softc. We will inject + * this virtual interrupt during the subsequent VM enter. + */ + + /* + * This is special. We want to treat this as an 'handled' + * VM-exit but not increment the instruction pointer. + */ + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); + return (1); + case EXIT_REASON_NMI_WINDOW: + /* Exit to allow the pending virtual NMI to be injected */ + vmx_clear_nmi_window_exiting(vmx, vcpu); + VMM_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); + return (1); + case EXIT_REASON_INOUT: + vmexit->exitcode = VM_EXITCODE_INOUT; + vmexit->u.inout.bytes = (qual & 0x7) + 1; + vmexit->u.inout.in = (qual & 0x8) ? 1 : 0; + vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; + vmexit->u.inout.rep = (qual & 0x20) ? 
1 : 0; + vmexit->u.inout.port = (uint16_t)(qual >> 16); + vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); + break; + case EXIT_REASON_CPUID: + handled = vmx_handle_cpuid(vmxctx); + break; + default: + break; + } + + if (handled) { + /* + * It is possible that control is returned to userland + * even though we were able to handle the VM exit in the + * kernel (for e.g. 'astpending' is set in the run loop). + * + * In such a case we want to make sure that the userland + * restarts guest execution at the instruction *after* + * the one we just processed. Therefore we update the + * guest rip in the VMCS and in 'vmexit'. + */ + vm_exit_update_rip(vmexit); + vmexit->rip += vmexit->inst_length; + vmexit->inst_length = 0; + } else { + if (vmexit->exitcode == VM_EXITCODE_BOGUS) { + /* + * If this VM exit was not claimed by anybody then + * treat it as a generic VMX exit. + */ + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.error = 0; + } else { + /* + * The exitcode and collateral have been populated. + * The VM exit will be processed further in userland. + */ + } + } + return (handled); +} + +static int +vmx_run(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit) +{ + int error, vie, rc, handled, astpending, loopstart; + uint32_t exit_reason; + struct vmx *vmx; + struct vmxctx *vmxctx; + struct vmcs *vmcs; + + vmx = arg; + vmcs = &vmx->vmcs[vcpu]; + vmxctx = &vmx->ctx[vcpu]; + loopstart = 1; + + /* + * XXX Can we avoid doing this every time we do a vm run? + */ + VMPTRLD(vmcs); + + /* + * XXX + * We do this every time because we may setup the virtual machine + * from a different process than the one that actually runs it. + * + * If the life of a virtual machine was spent entirely in the context + * of a single process we could do this once in vmcs_set_defaults(). + */ + if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0) + panic("vmx_run: error %d writing to VMCS_HOST_CR3", error); + + if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0) + panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); + + if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0) + panic("vmx_run: error %d setting up pcpu defaults", error); + + do { + lapic_timer_tick(vmx->vm, vcpu); + vmx_inject_interrupts(vmx, vcpu); + vmx_run_trace(vmx, vcpu); + rc = vmx_setjmp(vmxctx); +#ifdef SETJMP_TRACE + vmx_setjmp_trace(vmx, vcpu, vmxctx, rc); +#endif + switch (rc) { + case VMX_RETURN_DIRECT: + if (loopstart) { + loopstart = 0; + vmx_launch(vmxctx); + } else + vmx_resume(vmxctx); + panic("vmx_launch/resume should not return"); + break; + case VMX_RETURN_LONGJMP: + break; /* vm exit */ + case VMX_RETURN_VMRESUME: + vie = vmcs_instruction_error(); + if (vmxctx->launch_error == VM_FAIL_INVALID || + vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) { + printf("vmresume error %d vmcs inst error %d\n", + vmxctx->launch_error, vie); + goto err_exit; + } + vmx_launch(vmxctx); /* try to launch the guest */ + panic("vmx_launch should not return"); + break; + case VMX_RETURN_VMLAUNCH: + vie = vmcs_instruction_error(); +#if 1 + printf("vmlaunch error %d vmcs inst error %d\n", + vmxctx->launch_error, vie); +#endif + goto err_exit; + default: + panic("vmx_setjmp returned %d", rc); + } + + /* + * XXX locking? + * See comments in exception.S about checking for ASTs + * atomically while interrupts are disabled. But it is + * not clear that they apply in our case. 
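+ *
+ * For now the flag is simply sampled while interrupts are still
+ * disabled and is re-checked on every iteration of the loop below.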
+ */ + astpending = curthread->td_flags & TDF_ASTPENDING; + + /* enable interrupts */ + enable_intr(); + + /* collect some basic information for VM exit processing */ + vmexit->rip = rip = vmcs_guest_rip(); + vmexit->inst_length = vmexit_instruction_length(); + vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); + vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); + + handled = vmx_exit_process(vmx, vcpu, vmexit); + + vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled, + astpending); + } while (handled && !astpending); + + /* + * If a VM exit has been handled then the exitcode must be BOGUS + * If a VM exit is not handled then the exitcode must not be BOGUS + */ + if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || + (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { + panic("Mismatch between handled (%d) and exitcode (%d)", + handled, vmexit->exitcode); + } + + VMM_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d",vmexit->exitcode); + + /* + * XXX + * We need to do this to ensure that any VMCS state cached by the + * processor is flushed to memory. We need to do this in case the + * VM moves to a different cpu the next time it runs. + * + * Can we avoid doing this? + */ + VMCLEAR(vmcs); + return (0); + +err_exit: + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.exit_reason = (uint32_t)-1; + vmexit->u.vmx.exit_qualification = (uint32_t)-1; + vmexit->u.vmx.error = vie; + VMCLEAR(vmcs); + return (ENOEXEC); +} + +static void +vmx_vmcleanup(void *arg) +{ + int error; + struct vmx *vmx = arg; + + /* + * XXXSMP we also need to clear the VMCS active on the other vcpus. + */ + error = vmclear(&vmx->vmcs[0]); + if (error != 0) + panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error); + + ept_vmcleanup(vmx); + free(vmx, M_VMX); + + return; +} + +static register_t * +vmxctx_regptr(struct vmxctx *vmxctx, int reg) +{ + + switch (reg) { + case VM_REG_GUEST_RAX: + return (&vmxctx->guest_rax); + case VM_REG_GUEST_RBX: + return (&vmxctx->guest_rbx); + case VM_REG_GUEST_RCX: + return (&vmxctx->guest_rcx); + case VM_REG_GUEST_RDX: + return (&vmxctx->guest_rdx); + case VM_REG_GUEST_RSI: + return (&vmxctx->guest_rsi); + case VM_REG_GUEST_RDI: + return (&vmxctx->guest_rdi); + case VM_REG_GUEST_RBP: + return (&vmxctx->guest_rbp); + case VM_REG_GUEST_R8: + return (&vmxctx->guest_r8); + case VM_REG_GUEST_R9: + return (&vmxctx->guest_r9); + case VM_REG_GUEST_R10: + return (&vmxctx->guest_r10); + case VM_REG_GUEST_R11: + return (&vmxctx->guest_r11); + case VM_REG_GUEST_R12: + return (&vmxctx->guest_r12); + case VM_REG_GUEST_R13: + return (&vmxctx->guest_r13); + case VM_REG_GUEST_R14: + return (&vmxctx->guest_r14); + case VM_REG_GUEST_R15: + return (&vmxctx->guest_r15); + default: + break; + } + return (NULL); +} + +static int +vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) +{ + register_t *regp; + + if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { + *retval = *regp; + return (0); + } else + return (EINVAL); +} + +static int +vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) +{ + register_t *regp; + + if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { + *regp = val; + return (0); + } else + return (EINVAL); +} + +static int +vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) +{ + struct vmx *vmx = arg; + + if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) + return (0); + + /* + * If the vcpu is running then don't mess with the VMCS. 
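+ * Accesses that can be satisfied from the in-memory 'vmxctx' have
+ * already been handled above without touching the VMCS.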
+ * + * vmcs_getreg will VMCLEAR the vmcs when it is done which will cause + * the subsequent vmlaunch/vmresume to fail. + */ + if (vcpu_is_running(vmx->vm, vcpu, NULL)) + panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); + + return (vmcs_getreg(&vmx->vmcs[vcpu], reg, retval)); +} + +static int +vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) +{ + int error; + uint64_t ctls; + struct vmx *vmx = arg; + + /* + * XXX Allow caller to set contents of the guest registers saved in + * the 'vmxctx' even though the vcpu might be running. We need this + * specifically to support the rdmsr emulation that will set the + * %eax and %edx registers during vm exit processing. + */ + if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) + return (0); + + /* + * If the vcpu is running then don't mess with the VMCS. + * + * vmcs_setreg will VMCLEAR the vmcs when it is done which will cause + * the subsequent vmlaunch/vmresume to fail. + */ + if (vcpu_is_running(vmx->vm, vcpu, NULL)) + panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); + + error = vmcs_setreg(&vmx->vmcs[vcpu], reg, val); + + if (error == 0) { + /* + * If the "load EFER" VM-entry control is 1 then the + * value of EFER.LMA must be identical to "IA-32e mode guest" + * bit in the VM-entry control. + */ + if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && + (reg == VM_REG_GUEST_EFER)) { + vmcs_getreg(&vmx->vmcs[vcpu], + VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); + if (val & EFER_LMA) + ctls |= VM_ENTRY_GUEST_LMA; + else + ctls &= ~VM_ENTRY_GUEST_LMA; + vmcs_setreg(&vmx->vmcs[vcpu], + VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); + } + } + + return (error); +} + +static int +vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + struct vmx *vmx = arg; + + return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc)); +} + +static int +vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + struct vmx *vmx = arg; + + return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc)); +} + +static int +vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code, + int code_valid) +{ + int error; + uint32_t info; + struct vmx *vmx = arg; + struct vmcs *vmcs = &vmx->vmcs[vcpu]; + + static uint32_t type_map[VM_EVENT_MAX] = { + 0x1, /* VM_EVENT_NONE */ + 0x0, /* VM_HW_INTR */ + 0x2, /* VM_NMI */ + 0x3, /* VM_HW_EXCEPTION */ + 0x4, /* VM_SW_INTR */ + 0x5, /* VM_PRIV_SW_EXCEPTION */ + 0x6, /* VM_SW_EXCEPTION */ + }; + + info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0); + info |= VMCS_INTERRUPTION_INFO_VALID; + error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info); + if (error != 0) + return (error); + + if (code_valid) { + error = vmcs_setreg(vmcs, + VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR), + code); + } + return (error); +} + +static int +vmx_nmi(void *arg, int vcpu) +{ + struct vmx *vmx = arg; + + atomic_set_int(&vmx->state[vcpu].request_nmi, 1); + + return (0); +} + +static int +vmx_getcap(void *arg, int vcpu, int type, int *retval) +{ + struct vmx *vmx = arg; + int vcap; + int ret; + + ret = ENOENT; + + vcap = vmx->cap[vcpu].set; + + switch (type) { + case VM_CAP_HALT_EXIT: + if (cap_halt_exit) + ret = 0; + break; + case VM_CAP_PAUSE_EXIT: + if (cap_pause_exit) + ret = 0; + break; + case VM_CAP_MTRAP_EXIT: + if (cap_monitor_trap) + ret = 0; + break; + case VM_CAP_UNRESTRICTED_GUEST: + if (cap_unrestricted_guest) + ret = 0; + break; + default: + break; + } + + if (ret == 0) + *retval = (vcap & (1 << type)) ? 
1 : 0; + + return (ret); +} + +static int +vmx_setcap(void *arg, int vcpu, int type, int val) +{ + struct vmx *vmx = arg; + struct vmcs *vmcs = &vmx->vmcs[vcpu]; + uint32_t baseval; + uint32_t *pptr; + int error; + int flag; + int reg; + int retval; + + retval = ENOENT; + pptr = NULL; + + switch (type) { + case VM_CAP_HALT_EXIT: + if (cap_halt_exit) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_HLT_EXITING; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_MTRAP_EXIT: + if (cap_monitor_trap) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_MTF; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_PAUSE_EXIT: + if (cap_pause_exit) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_PAUSE_EXITING; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_UNRESTRICTED_GUEST: + if (cap_unrestricted_guest) { + retval = 0; + baseval = procbased_ctls2; + flag = PROCBASED2_UNRESTRICTED_GUEST; + reg = VMCS_SEC_PROC_BASED_CTLS; + } + break; + default: + break; + } + + if (retval == 0) { + if (val) { + baseval |= flag; + } else { + baseval &= ~flag; + } + VMPTRLD(vmcs); + error = vmwrite(reg, baseval); + VMCLEAR(vmcs); + + if (error) { + retval = error; + } else { + /* + * Update optional stored flags, and record + * setting + */ + if (pptr != NULL) { + *pptr = baseval; + } + + if (val) { + vmx->cap[vcpu].set |= (1 << type); + } else { + vmx->cap[vcpu].set &= ~(1 << type); + } + } + } + + return (retval); +} + +struct vmm_ops vmm_ops_intel = { + vmx_init, + vmx_cleanup, + vmx_vminit, + vmx_run, + vmx_vmcleanup, + ept_vmmmap, + vmx_getreg, + vmx_setreg, + vmx_getdesc, + vmx_setdesc, + vmx_inject, + vmx_nmi, + vmx_getcap, + vmx_setcap +}; diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h new file mode 100644 index 000000000000..69697f8e48d4 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx.h @@ -0,0 +1,115 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMX_H_ +#define _VMX_H_ + +#include "vmcs.h" + +#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */ + +struct vmxctx { + register_t guest_rdi; /* Guest state */ + register_t guest_rsi; + register_t guest_rdx; + register_t guest_rcx; + register_t guest_r8; + register_t guest_r9; + register_t guest_rax; + register_t guest_rbx; + register_t guest_rbp; + register_t guest_r10; + register_t guest_r11; + register_t guest_r12; + register_t guest_r13; + register_t guest_r14; + register_t guest_r15; + register_t guest_cr2; + + register_t host_r15; /* Host state */ + register_t host_r14; + register_t host_r13; + register_t host_r12; + register_t host_rbp; + register_t host_rsp; + register_t host_rbx; + register_t host_rip; + /* + * XXX todo debug registers and fpu state + */ + + int launch_error; +}; + +struct vmxcap { + int set; + uint32_t proc_ctls; +}; + +struct vmxstate { + int request_nmi; + int lastcpu; /* host cpu that this 'vcpu' last ran on */ + uint16_t vpid; +}; + +/* virtual machine softc */ +struct vmx { + pml4_entry_t pml4ept[NPML4EPG]; + struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */ + char msr_bitmap[PAGE_SIZE]; + struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES]; + struct vmxctx ctx[VM_MAXCPU]; + struct vmxcap cap[VM_MAXCPU]; + struct vmxstate state[VM_MAXCPU]; + struct vm *vm; +}; +CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0); + +#define VMX_RETURN_DIRECT 0 +#define VMX_RETURN_LONGJMP 1 +#define VMX_RETURN_VMRESUME 2 +#define VMX_RETURN_VMLAUNCH 3 +/* + * vmx_setjmp() returns: + * - 0 when it returns directly + * - 1 when it returns from vmx_longjmp + * - 2 when it returns from vmx_resume (which would only be in the error case) + * - 3 when it returns from vmx_launch (which would only be in the error case) + */ +int vmx_setjmp(struct vmxctx *ctx); +void vmx_longjmp(void); /* returns via vmx_setjmp */ +void vmx_launch(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */ +void vmx_resume(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */ + +u_long vmx_fix_cr0(u_long cr0); +u_long vmx_fix_cr4(u_long cr4); + +#endif diff --git a/sys/amd64/vmm/intel/vmx_controls.h b/sys/amd64/vmm/intel/vmx_controls.h new file mode 100644 index 000000000000..31f29f8e6bba --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_controls.h @@ -0,0 +1,92 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMX_CONTROLS_H_ +#define _VMX_CONTROLS_H_ + +/* Pin-Based VM-Execution Controls */ +#define PINBASED_EXTINT_EXITING (1 << 0) +#define PINBASED_NMI_EXITING (1 << 3) +#define PINBASED_VIRTUAL_NMI (1 << 5) +#define PINBASED_PREMPTION_TIMER (1 << 6) + +/* Primary Processor-Based VM-Execution Controls */ +#define PROCBASED_INT_WINDOW_EXITING (1 << 2) +#define PROCBASED_TSC_OFFSET (1 << 3) +#define PROCBASED_HLT_EXITING (1 << 7) +#define PROCBASED_INVLPG_EXITING (1 << 9) +#define PROCBASED_MWAIT_EXITING (1 << 10) +#define PROCBASED_RDPMC_EXITING (1 << 11) +#define PROCBASED_RDTSC_EXITING (1 << 12) +#define PROCBASED_CR3_LOAD_EXITING (1 << 15) +#define PROCBASED_CR3_STORE_EXITING (1 << 16) +#define PROCBASED_CR8_LOAD_EXITING (1 << 19) +#define PROCBASED_CR8_STORE_EXITING (1 << 20) +#define PROCBASED_USE_TPR_SHADOW (1 << 21) +#define PROCBASED_NMI_WINDOW_EXITING (1 << 22) +#define PROCBASED_MOV_DR_EXITING (1 << 23) +#define PROCBASED_IO_EXITING (1 << 24) +#define PROCBASED_IO_BITMAPS (1 << 25) +#define PROCBASED_MTF (1 << 27) +#define PROCBASED_MSR_BITMAPS (1 << 28) +#define PROCBASED_MONITOR_EXITING (1 << 29) +#define PROCBASED_PAUSE_EXITING (1 << 30) +#define PROCBASED_SECONDARY_CONTROLS (1 << 31) + +/* Secondary Processor-Based VM-Execution Controls */ +#define PROCBASED2_VIRTUALIZE_APIC (1 << 0) +#define PROCBASED2_ENABLE_EPT (1 << 1) +#define PROCBASED2_DESC_TABLE_EXITING (1 << 2) +#define PROCBASED2_ENABLE_RDTSCP (1 << 3) +#define PROCBASED2_VIRTUALIZE_X2APIC (1 << 4) +#define PROCBASED2_ENABLE_VPID (1 << 5) +#define PROCBASED2_WBINVD_EXITING (1 << 6) +#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7) +#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10) + +/* VM Exit Controls */ +#define VM_EXIT_SAVE_DEBUG_CONTROLS (1 << 2) +#define VM_EXIT_HOST_LMA (1 << 9) +#define VM_EXIT_LOAD_PERF_GLOBAL_CTRL (1 << 12) +#define VM_EXIT_ACKNOWLEDGE_INTERRUPT (1 << 15) +#define VM_EXIT_SAVE_PAT (1 << 18) +#define VM_EXIT_LOAD_PAT (1 << 19) +#define VM_EXIT_SAVE_EFER (1 << 20) +#define VM_EXIT_LOAD_EFER (1 << 21) +#define VM_EXIT_SAVE_PREEMPTION_TIMER (1 << 22) + +/* VM Entry Controls */ +#define VM_ENTRY_LOAD_DEBUG_CONTROLS (1 << 2) +#define VM_ENTRY_GUEST_LMA (1 << 9) +#define VM_ENTRY_INTO_SMM (1 << 10) +#define VM_ENTRY_DEACTIVATE_DUAL_MONITOR (1 << 11) +#define VM_ENTRY_LOAD_PERF_GLOBAL_CTRL (1 << 13) +#define VM_ENTRY_LOAD_PAT (1 << 14) +#define VM_ENTRY_LOAD_EFER (1 << 15) + +#endif diff --git a/sys/amd64/vmm/intel/vmx_cpufunc.h b/sys/amd64/vmm/intel/vmx_cpufunc.h new file mode 100644 index 000000000000..e9f6c6dcaab3 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_cpufunc.h @@ -0,0 +1,199 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMX_CPUFUNC_H_ +#define _VMX_CPUFUNC_H_ + +struct vmcs; + +/* + * Section 5.2 "Conventions" from Intel Architecture Manual 2B. + * + * error + * VMsucceed 0 + * VMFailInvalid 1 + * VMFailValid 2 see also VMCS VM-Instruction Error Field + */ +#define VM_SUCCESS 0 +#define VM_FAIL_INVALID 1 +#define VM_FAIL_VALID 2 +#define VMX_SET_ERROR_CODE(varname) \ + do { \ + __asm __volatile(" jnc 1f;" \ + " mov $1, %0;" /* CF: error = 1 */ \ + " jmp 3f;" \ + "1: jnz 2f;" \ + " mov $2, %0;" /* ZF: error = 2 */ \ + " jmp 3f;" \ + "2: mov $0, %0;" \ + "3: nop" \ + :"=r" (varname)); \ + } while (0) + +/* returns 0 on success and non-zero on failure */ +static __inline int +vmxon(char *region) +{ + int error; + uint64_t addr; + + addr = vtophys(region); + __asm __volatile("vmxon %0" : : "m" (*(uint64_t *)&addr) : "memory"); + VMX_SET_ERROR_CODE(error); + return (error); +} + +/* returns 0 on success and non-zero on failure */ +static __inline int +vmclear(struct vmcs *vmcs) +{ + int error; + uint64_t addr; + + addr = vtophys(vmcs); + __asm __volatile("vmclear %0" : : "m" (*(uint64_t *)&addr) : "memory"); + VMX_SET_ERROR_CODE(error); + return (error); +} + +static __inline void +vmxoff(void) +{ + __asm __volatile("vmxoff"); +} + +static __inline void +vmptrst(uint64_t *addr) +{ + __asm __volatile("vmptrst %0" : : "m" (*addr) : "memory"); +} + +static __inline int +vmptrld(struct vmcs *vmcs) +{ + int error; + uint64_t addr; + + addr = vtophys(vmcs); + __asm __volatile("vmptrld %0" : : "m" (*(uint64_t *)&addr) : "memory"); + VMX_SET_ERROR_CODE(error); + return (error); +} + +static __inline int +vmwrite(uint64_t reg, uint64_t val) +{ + int error; + + __asm __volatile("vmwrite %0, %1" : : "r" (val), "r" (reg) : "memory"); + + VMX_SET_ERROR_CODE(error); + + return (error); +} + +static __inline int +vmread(uint64_t r, uint64_t *addr) +{ + int error; + + __asm __volatile("vmread %0, %1" : : "r" (r), "m" (*addr) : "memory"); + + VMX_SET_ERROR_CODE(error); + + return (error); +} + +static void __inline +VMCLEAR(struct vmcs *vmcs) +{ + int err; + + err = vmclear(vmcs); + if (err != 0) + panic("%s: vmclear(%p) error %d", __func__, vmcs, err); + + critical_exit(); +} + +static void __inline +VMPTRLD(struct vmcs *vmcs) +{ + int err; + + critical_enter(); + + err = vmptrld(vmcs); + if (err != 0) + panic("%s: vmptrld(%p) error %d", __func__, vmcs, err); +} + +#define INVVPID_TYPE_ADDRESS 0UL +#define INVVPID_TYPE_SINGLE_CONTEXT 1UL +#define INVVPID_TYPE_ALL_CONTEXTS 2UL + +struct invvpid_desc { + uint16_t vpid; + uint16_t 
_res1; + uint32_t _res2; + uint64_t linear_addr; +}; +CTASSERT(sizeof(struct invvpid_desc) == 16); + +static void __inline +invvpid(uint64_t type, struct invvpid_desc desc) +{ + int error; + + __asm __volatile("invvpid %0, %1" :: "m" (desc), "r" (type) : "memory"); + + VMX_SET_ERROR_CODE(error); + if (error) + panic("invvpid error %d", error); +} + +#define INVEPT_TYPE_SINGLE_CONTEXT 1UL +#define INVEPT_TYPE_ALL_CONTEXTS 2UL +struct invept_desc { + uint64_t eptp; + uint64_t _res; +}; +CTASSERT(sizeof(struct invept_desc) == 16); + +static void __inline +invept(uint64_t type, struct invept_desc desc) +{ + int error; + + __asm __volatile("invept %0, %1" :: "m" (desc), "r" (type) : "memory"); + + VMX_SET_ERROR_CODE(error); + if (error) + panic("invept error %d", error); +} +#endif diff --git a/sys/amd64/vmm/intel/vmx_genassym.c b/sys/amd64/vmm/intel/vmx_genassym.c new file mode 100644 index 000000000000..c4b1efcd7451 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_genassym.c @@ -0,0 +1,81 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/assym.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/pmap.h> + +#include <machine/vmm.h> +#include "vmx.h" +#include "vmx_cpufunc.h" + +ASSYM(VMXCTX_GUEST_RDI, offsetof(struct vmxctx, guest_rdi)); +ASSYM(VMXCTX_GUEST_RSI, offsetof(struct vmxctx, guest_rsi)); +ASSYM(VMXCTX_GUEST_RDX, offsetof(struct vmxctx, guest_rdx)); +ASSYM(VMXCTX_GUEST_RCX, offsetof(struct vmxctx, guest_rcx)); +ASSYM(VMXCTX_GUEST_R8, offsetof(struct vmxctx, guest_r8)); +ASSYM(VMXCTX_GUEST_R9, offsetof(struct vmxctx, guest_r9)); +ASSYM(VMXCTX_GUEST_RAX, offsetof(struct vmxctx, guest_rax)); +ASSYM(VMXCTX_GUEST_RBX, offsetof(struct vmxctx, guest_rbx)); +ASSYM(VMXCTX_GUEST_RBP, offsetof(struct vmxctx, guest_rbp)); +ASSYM(VMXCTX_GUEST_R10, offsetof(struct vmxctx, guest_r10)); +ASSYM(VMXCTX_GUEST_R11, offsetof(struct vmxctx, guest_r11)); +ASSYM(VMXCTX_GUEST_R12, offsetof(struct vmxctx, guest_r12)); +ASSYM(VMXCTX_GUEST_R13, offsetof(struct vmxctx, guest_r13)); +ASSYM(VMXCTX_GUEST_R14, offsetof(struct vmxctx, guest_r14)); +ASSYM(VMXCTX_GUEST_R15, offsetof(struct vmxctx, guest_r15)); +ASSYM(VMXCTX_GUEST_CR2, offsetof(struct vmxctx, guest_cr2)); + +ASSYM(VMXCTX_HOST_R15, offsetof(struct vmxctx, host_r15)); +ASSYM(VMXCTX_HOST_R14, offsetof(struct vmxctx, host_r14)); +ASSYM(VMXCTX_HOST_R13, offsetof(struct vmxctx, host_r13)); +ASSYM(VMXCTX_HOST_R12, offsetof(struct vmxctx, host_r12)); +ASSYM(VMXCTX_HOST_RBP, offsetof(struct vmxctx, host_rbp)); +ASSYM(VMXCTX_HOST_RSP, offsetof(struct vmxctx, host_rsp)); +ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx)); +ASSYM(VMXCTX_HOST_RIP, offsetof(struct vmxctx, host_rip)); + +ASSYM(VMXCTX_LAUNCH_ERROR, offsetof(struct vmxctx, launch_error)); + +ASSYM(VM_SUCCESS, VM_SUCCESS); +ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID); +ASSYM(VM_FAIL_VALID, VM_FAIL_VALID); + +ASSYM(VMX_RETURN_DIRECT, VMX_RETURN_DIRECT); +ASSYM(VMX_RETURN_LONGJMP, VMX_RETURN_LONGJMP); +ASSYM(VMX_RETURN_VMRESUME, VMX_RETURN_VMRESUME); +ASSYM(VMX_RETURN_VMLAUNCH, VMX_RETURN_VMLAUNCH); diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c new file mode 100644 index 000000000000..1e9a837a78d6 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_msr.c @@ -0,0 +1,172 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> + +#include <machine/cpufunc.h> + +#include "vmx_msr.h" + +static boolean_t +vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos) +{ + + if (msr_val & (1UL << (bitpos + 32))) + return (TRUE); + else + return (FALSE); +} + +static boolean_t +vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos) +{ + + if ((msr_val & (1UL << bitpos)) == 0) + return (TRUE); + else + return (FALSE); +} + +uint32_t +vmx_revision(void) +{ + + return (rdmsr(MSR_VMX_BASIC) & 0xffffffff); +} + +/* + * Generate a bitmask to be used for the VMCS execution control fields. + * + * The caller specifies what bits should be set to one in 'ones_mask' + * and what bits should be set to zero in 'zeros_mask'. The don't-care + * bits are set to the default value. The default values are obtained + * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining + * VMX Capabilities". + * + * Returns zero on success and non-zero on error. + */ +int +vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, + uint32_t zeros_mask, uint32_t *retval) +{ + int i; + uint64_t val, trueval; + boolean_t true_ctls_avail, one_allowed, zero_allowed; + + /* We cannot ask the same bit to be set to both '1' and '0' */ + if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask)) + return (EINVAL); + + if (rdmsr(MSR_VMX_BASIC) & (1UL << 55)) + true_ctls_avail = TRUE; + else + true_ctls_avail = FALSE; + + val = rdmsr(ctl_reg); + if (true_ctls_avail) + trueval = rdmsr(true_ctl_reg); /* step c */ + else + trueval = val; /* step a */ + + for (i = 0; i < 32; i++) { + one_allowed = vmx_ctl_allows_one_setting(trueval, i); + zero_allowed = vmx_ctl_allows_zero_setting(trueval, i); + + KASSERT(one_allowed || zero_allowed, + ("invalid zero/one setting for bit %d of ctl 0x%0x, " + "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg)); + + if (zero_allowed && !one_allowed) { /* b(i),c(i) */ + if (ones_mask & (1 << i)) + return (EINVAL); + *retval &= ~(1 << i); + } else if (one_allowed && !zero_allowed) { /* b(i),c(i) */ + if (zeros_mask & (1 << i)) + return (EINVAL); + *retval |= 1 << i; + } else { + if (zeros_mask & (1 << i)) /* b(ii),c(ii) */ + *retval &= ~(1 << i); + else if (ones_mask & (1 << i)) /* b(ii), c(ii) */ + *retval |= 1 << i; + else if (!true_ctls_avail) + *retval &= ~(1 << i); /* b(iii) */ + else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/ + *retval &= ~(1 << i); + else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */ + *retval |= 1 << i; + else { + panic("vmx_set_ctlreg: unable to determine " + "correct value of ctl bit %d for msr " + "0x%0x and true msr 0x%0x", i, ctl_reg, + true_ctl_reg); + } + } + } + + return (0); +} + +void +msr_bitmap_initialize(char *bitmap) +{ + + memset(bitmap, 0xff, PAGE_SIZE); +} + +int +msr_bitmap_change_access(char *bitmap, u_int msr, int access) +{ + int byte, bit; + + if (msr >= 0x00000000 && msr <= 0x00001FFF) + byte = msr / 8; + else if (msr 
>= 0xC0000000 && msr <= 0xC0001FFF) + byte = 1024 + (msr - 0xC0000000) / 8; + else + return (EINVAL); + + bit = msr & 0x7; + + if (access & MSR_BITMAP_ACCESS_READ) + bitmap[byte] &= ~(1 << bit); + else + bitmap[byte] |= 1 << bit; + + byte += 2048; + if (access & MSR_BITMAP_ACCESS_WRITE) + bitmap[byte] &= ~(1 << bit); + else + bitmap[byte] |= 1 << bit; + + return (0); +} diff --git a/sys/amd64/vmm/intel/vmx_msr.h b/sys/amd64/vmm/intel/vmx_msr.h new file mode 100644 index 000000000000..e6379a93d155 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_msr.h @@ -0,0 +1,78 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMX_MSR_H_ +#define _VMX_MSR_H_ + +#define MSR_VMX_BASIC 0x480 +#define MSR_VMX_EPT_VPID_CAP 0x48C + +#define MSR_VMX_PROCBASED_CTLS 0x482 +#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48E + +#define MSR_VMX_PINBASED_CTLS 0x481 +#define MSR_VMX_TRUE_PINBASED_CTLS 0x48D + +#define MSR_VMX_PROCBASED_CTLS2 0x48B + +#define MSR_VMX_EXIT_CTLS 0x483 +#define MSR_VMX_TRUE_EXIT_CTLS 0x48f + +#define MSR_VMX_ENTRY_CTLS 0x484 +#define MSR_VMX_TRUE_ENTRY_CTLS 0x490 + +#define MSR_VMX_CR0_FIXED0 0x486 +#define MSR_VMX_CR0_FIXED1 0x487 + +#define MSR_VMX_CR4_FIXED0 0x488 +#define MSR_VMX_CR4_FIXED1 0x489 + +uint32_t vmx_revision(void); + +int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, + uint32_t zeros_mask, uint32_t *retval); + +/* + * According to Section 21.10.4 "Software Access to Related Structures", + * changes to data structures pointed to by the VMCS must be made only when + * there is no logical processor with a current VMCS that points to the + * data structure. + * + * This pretty much limits us to configuring the MSR bitmap before VMCS + * initialization for SMP VMs. Unless of course we do it the hard way - which + * would involve some form of synchronization between the vcpus to vmclear + * all VMCSs' that point to the bitmap. 
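+ *
+ * For illustration, a typical setup sequence (a sketch, run once
+ * before the VMCS is initialized) would be:
+ *
+ *     msr_bitmap_initialize(bitmap);  (all MSR accesses cause exits)
+ *     msr_bitmap_change_access(bitmap, MSR_GSBASE, MSR_BITMAP_ACCESS_RW);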
+ */ +#define MSR_BITMAP_ACCESS_NONE 0x0 +#define MSR_BITMAP_ACCESS_READ 0x1 +#define MSR_BITMAP_ACCESS_WRITE 0x2 +#define MSR_BITMAP_ACCESS_RW (MSR_BITMAP_ACCESS_READ|MSR_BITMAP_ACCESS_WRITE) +void msr_bitmap_initialize(char *bitmap); +int msr_bitmap_change_access(char *bitmap, u_int msr, int access); + +#endif diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S new file mode 100644 index 000000000000..4d1bf1da13df --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_support.S @@ -0,0 +1,204 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <machine/asmacros.h> + +#include "vmx_assym.s" + +/* + * Assumes that %rdi holds a pointer to the 'vmxctx' + */ +#define VMX_GUEST_RESTORE \ + /* \ + * Make sure that interrupts are disabled before restoring CR2. \ + * Otherwise there could be a page fault during the interrupt \ + * handler execution that would end up trashing CR2. \ + */ \ + cli; \ + movq VMXCTX_GUEST_CR2(%rdi),%rsi; \ + movq %rsi,%cr2; \ + movq VMXCTX_GUEST_RSI(%rdi),%rsi; \ + movq VMXCTX_GUEST_RDX(%rdi),%rdx; \ + movq VMXCTX_GUEST_RCX(%rdi),%rcx; \ + movq VMXCTX_GUEST_R8(%rdi),%r8; \ + movq VMXCTX_GUEST_R9(%rdi),%r9; \ + movq VMXCTX_GUEST_RAX(%rdi),%rax; \ + movq VMXCTX_GUEST_RBX(%rdi),%rbx; \ + movq VMXCTX_GUEST_RBP(%rdi),%rbp; \ + movq VMXCTX_GUEST_R10(%rdi),%r10; \ + movq VMXCTX_GUEST_R11(%rdi),%r11; \ + movq VMXCTX_GUEST_R12(%rdi),%r12; \ + movq VMXCTX_GUEST_R13(%rdi),%r13; \ + movq VMXCTX_GUEST_R14(%rdi),%r14; \ + movq VMXCTX_GUEST_R15(%rdi),%r15; \ + movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */ + +#define VM_INSTRUCTION_ERROR(reg) \ + jnc 1f; \ + movl $VM_FAIL_INVALID,reg; /* CF is set */ \ + jmp 3f; \ +1: jnz 2f; \ + movl $VM_FAIL_VALID,reg; /* ZF is set */ \ + jmp 3f; \ +2: movl $VM_SUCCESS,reg; \ +3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp) + + .text +/* + * int vmx_setjmp(ctxp) + * %rdi = ctxp + * + * Return value is '0' when it returns directly from here. + * Return value is '1' when it returns after a vm exit through vmx_longjmp. 
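+ * Return value is '2' when it returns after a failed vmresume and
+ * '3' when it returns after a failed vmlaunch, in both cases via
+ * vmx_return().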
+ */ +ENTRY(vmx_setjmp) + movq (%rsp),%rax /* return address */ + movq %r15,VMXCTX_HOST_R15(%rdi) + movq %r14,VMXCTX_HOST_R14(%rdi) + movq %r13,VMXCTX_HOST_R13(%rdi) + movq %r12,VMXCTX_HOST_R12(%rdi) + movq %rbp,VMXCTX_HOST_RBP(%rdi) + movq %rsp,VMXCTX_HOST_RSP(%rdi) + movq %rbx,VMXCTX_HOST_RBX(%rdi) + movq %rax,VMXCTX_HOST_RIP(%rdi) + + /* + * XXX save host debug registers + */ + movl $VMX_RETURN_DIRECT,%eax + ret +END(vmx_setjmp) + +/* + * void vmx_return(struct vmxctx *ctxp, int retval) + * %rdi = ctxp + * %rsi = retval + * Return to vmm context through vmx_setjmp() with a value of 'retval'. + */ +ENTRY(vmx_return) + /* Restore host context. */ + movq VMXCTX_HOST_R15(%rdi),%r15 + movq VMXCTX_HOST_R14(%rdi),%r14 + movq VMXCTX_HOST_R13(%rdi),%r13 + movq VMXCTX_HOST_R12(%rdi),%r12 + movq VMXCTX_HOST_RBP(%rdi),%rbp + movq VMXCTX_HOST_RSP(%rdi),%rsp + movq VMXCTX_HOST_RBX(%rdi),%rbx + movq VMXCTX_HOST_RIP(%rdi),%rax + movq %rax,(%rsp) /* return address */ + + /* + * XXX restore host debug registers + */ + movl %esi,%eax + ret +END(vmx_return) + +/* + * void vmx_longjmp(void) + * %rsp points to the struct vmxctx + */ +ENTRY(vmx_longjmp) + /* + * Save guest state that is not automatically saved in the vmcs. + */ + movq %rdi,VMXCTX_GUEST_RDI(%rsp) + movq %rsi,VMXCTX_GUEST_RSI(%rsp) + movq %rdx,VMXCTX_GUEST_RDX(%rsp) + movq %rcx,VMXCTX_GUEST_RCX(%rsp) + movq %r8,VMXCTX_GUEST_R8(%rsp) + movq %r9,VMXCTX_GUEST_R9(%rsp) + movq %rax,VMXCTX_GUEST_RAX(%rsp) + movq %rbx,VMXCTX_GUEST_RBX(%rsp) + movq %rbp,VMXCTX_GUEST_RBP(%rsp) + movq %r10,VMXCTX_GUEST_R10(%rsp) + movq %r11,VMXCTX_GUEST_R11(%rsp) + movq %r12,VMXCTX_GUEST_R12(%rsp) + movq %r13,VMXCTX_GUEST_R13(%rsp) + movq %r14,VMXCTX_GUEST_R14(%rsp) + movq %r15,VMXCTX_GUEST_R15(%rsp) + + movq %cr2,%rdi + movq %rdi,VMXCTX_GUEST_CR2(%rsp) + + movq %rsp,%rdi + movq $VMX_RETURN_LONGJMP,%rsi + callq vmx_return +END(vmx_longjmp) + +/* + * void vmx_resume(struct vmxctx *ctxp) + * %rdi = ctxp + * + * Although the return type is a 'void' this function may return indirectly + * through vmx_setjmp() with a return value of 2. + */ +ENTRY(vmx_resume) + /* + * Restore guest state that is not automatically loaded from the vmcs. + */ + VMX_GUEST_RESTORE + + vmresume + + /* + * Capture the reason why vmresume failed. + */ + VM_INSTRUCTION_ERROR(%eax) + + /* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */ + movq %rsp,%rdi + movq $VMX_RETURN_VMRESUME,%rsi + callq vmx_return +END(vmx_resume) + +/* + * void vmx_launch(struct vmxctx *ctxp) + * %rdi = ctxp + * + * Although the return type is a 'void' this function may return indirectly + * through vmx_setjmp() with a return value of 3. + */ +ENTRY(vmx_launch) + /* + * Restore guest state that is not automatically loaded from the vmcs. + */ + VMX_GUEST_RESTORE + + vmlaunch + + /* + * Capture the reason why vmlaunch failed. + */ + VM_INSTRUCTION_ERROR(%eax) + + /* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */ + movq %rsp,%rdi + movq $VMX_RETURN_VMLAUNCH,%rsi + callq vmx_return +END(vmx_launch) diff --git a/sys/amd64/vmm/intel/vtd.c b/sys/amd64/vmm/intel/vtd.c new file mode 100644 index 000000000000..24495a977477 --- /dev/null +++ b/sys/amd64/vmm/intel/vtd.c @@ -0,0 +1,637 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <dev/pci/pcireg.h> + +#include <machine/pmap.h> +#include <machine/vmparam.h> +#include <machine/pci_cfgreg.h> + +#include "io/iommu.h" + +/* + * Documented in the "Intel Virtualization Technology for Directed I/O", + * Architecture Spec, September 2008. + */ + +/* Section 10.4 "Register Descriptions" */ +struct vtdmap { + volatile uint32_t version; + volatile uint32_t res0; + volatile uint64_t cap; + volatile uint64_t ext_cap; + volatile uint32_t gcr; + volatile uint32_t gsr; + volatile uint64_t rta; + volatile uint64_t ccr; +}; + +#define VTD_CAP_SAGAW(cap) (((cap) >> 8) & 0x1F) +#define VTD_CAP_ND(cap) ((cap) & 0x7) +#define VTD_CAP_CM(cap) (((cap) >> 7) & 0x1) +#define VTD_CAP_SPS(cap) (((cap) >> 34) & 0xF) +#define VTD_CAP_RWBF(cap) (((cap) >> 4) & 0x1) + +#define VTD_ECAP_DI(ecap) (((ecap) >> 2) & 0x1) +#define VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1) +#define VTD_ECAP_IRO(ecap) (((ecap) >> 8) & 0x3FF) + +#define VTD_GCR_WBF (1 << 27) +#define VTD_GCR_SRTP (1 << 30) +#define VTD_GCR_TE (1 << 31) + +#define VTD_GSR_WBFS (1 << 27) +#define VTD_GSR_RTPS (1 << 30) +#define VTD_GSR_TES (1 << 31) + +#define VTD_CCR_ICC (1UL << 63) /* invalidate context cache */ +#define VTD_CCR_CIRG_GLOBAL (1UL << 61) /* global invalidation */ + +#define VTD_IIR_IVT (1UL << 63) /* invalidation IOTLB */ +#define VTD_IIR_IIRG_GLOBAL (1ULL << 60) /* global IOTLB invalidation */ +#define VTD_IIR_IIRG_DOMAIN (2ULL << 60) /* domain IOTLB invalidation */ +#define VTD_IIR_IIRG_PAGE (3ULL << 60) /* page IOTLB invalidation */ +#define VTD_IIR_DRAIN_READS (1ULL << 49) /* drain pending DMA reads */ +#define VTD_IIR_DRAIN_WRITES (1ULL << 48) /* drain pending DMA writes */ +#define VTD_IIR_DOMAIN_P 32 + +#define VTD_ROOT_PRESENT 0x1 +#define VTD_CTX_PRESENT 0x1 +#define VTD_CTX_TT_ALL (1UL << 2) + +#define VTD_PTE_RD (1UL << 0) +#define VTD_PTE_WR (1UL << 1) +#define VTD_PTE_SUPERPAGE (1UL << 7) +#define VTD_PTE_ADDR_M (0x000FFFFFFFFFF000UL) + +struct domain { + uint64_t *ptp; /* first level page table page */ + int pt_levels; /* number of page table levels */ + int addrwidth; /* 'AW' field in context entry */ + int spsmask; /* supported super page sizes */ + u_int id; /* domain id */ + vm_paddr_t 
maxaddr; /* highest address to be mapped */ + SLIST_ENTRY(domain) next; +}; + +static SLIST_HEAD(, domain) domhead; + +#define DRHD_MAX_UNITS 8 +static int drhd_num; +static struct vtdmap *vtdmaps[DRHD_MAX_UNITS]; +static int max_domains; +typedef int (*drhd_ident_func_t)(void); + +static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); +static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); + +static MALLOC_DEFINE(M_VTD, "vtd", "vtd"); + +/* + * Config space register definitions from the "Intel 5520 and 5500" datasheet. + */ +static int +tylersburg_vtd_ident(void) +{ + int units, nlbus; + uint16_t did, vid; + uint32_t miscsts, vtbar; + + const int bus = 0; + const int slot = 20; + const int func = 0; + + units = 0; + + vid = pci_cfgregread(bus, slot, func, PCIR_VENDOR, 2); + did = pci_cfgregread(bus, slot, func, PCIR_DEVICE, 2); + if (vid != 0x8086 || did != 0x342E) + goto done; + + /* + * Check if this is a dual IOH configuration. + */ + miscsts = pci_cfgregread(bus, slot, func, 0x9C, 4); + if (miscsts & (1 << 25)) + nlbus = pci_cfgregread(bus, slot, func, 0x160, 1); + else + nlbus = -1; + + vtbar = pci_cfgregread(bus, slot, func, 0x180, 4); + if (vtbar & 0x1) { + vtdmaps[units++] = (struct vtdmap *) + PHYS_TO_DMAP(vtbar & 0xffffe000); + } else if (bootverbose) + printf("VT-d unit in legacy IOH is disabled!\n"); + + if (nlbus != -1) { + vtbar = pci_cfgregread(nlbus, slot, func, 0x180, 4); + if (vtbar & 0x1) { + vtdmaps[units++] = (struct vtdmap *) + PHYS_TO_DMAP(vtbar & 0xffffe000); + } else if (bootverbose) + printf("VT-d unit in non-legacy IOH is disabled!\n"); + } +done: + return (units); +} + +static drhd_ident_func_t drhd_ident_funcs[] = { + tylersburg_vtd_ident, + NULL +}; + +static int +vtd_max_domains(struct vtdmap *vtdmap) +{ + int nd; + + nd = VTD_CAP_ND(vtdmap->cap); + + switch (nd) { + case 0: + return (16); + case 1: + return (64); + case 2: + return (256); + case 3: + return (1024); + case 4: + return (4 * 1024); + case 5: + return (16 * 1024); + case 6: + return (64 * 1024); + default: + panic("vtd_max_domains: invalid value of nd (0x%0x)", nd); + } +} + +static u_int +domain_id(void) +{ + u_int id; + struct domain *dom; + + /* Skip domain id 0 - it is reserved when Caching Mode field is set */ + for (id = 1; id < max_domains; id++) { + SLIST_FOREACH(dom, &domhead, next) { + if (dom->id == id) + break; + } + if (dom == NULL) + break; /* found it */ + } + + if (id >= max_domains) + panic("domain ids exhausted"); + + return (id); +} + +static void +vtd_wbflush(struct vtdmap *vtdmap) +{ + + if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0) + pmap_invalidate_cache(); + + if (VTD_CAP_RWBF(vtdmap->cap)) { + vtdmap->gcr = VTD_GCR_WBF; + while ((vtdmap->gsr & VTD_GSR_WBFS) != 0) + ; + } +} + +static void +vtd_ctx_global_invalidate(struct vtdmap *vtdmap) +{ + + vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL; + while ((vtdmap->ccr & VTD_CCR_ICC) != 0) + ; +} + +static void +vtd_iotlb_global_invalidate(struct vtdmap *vtdmap) +{ + int offset; + volatile uint64_t *iotlb_reg, val; + + vtd_wbflush(vtdmap); + + offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16; + iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8); + + *iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL | + VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES; + + while (1) { + val = *iotlb_reg; + if ((val & VTD_IIR_IVT) == 0) + break; + } +} + +static void +vtd_translation_enable(struct vtdmap *vtdmap) +{ + + vtdmap->gcr = VTD_GCR_TE; + while ((vtdmap->gsr & VTD_GSR_TES) == 0) + ; +} + 
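+/*
+ * Commands issued through the global command register take effect
+ * asynchronously; the hardware reflects their completion in the global
+ * status register, which is why the routines above and below poll
+ * 'gsr' after writing 'gcr'.
+ */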
+static void
+vtd_translation_disable(struct vtdmap *vtdmap)
+{
+
+ vtdmap->gcr = 0;
+ while ((vtdmap->gsr & VTD_GSR_TES) != 0)
+ ;
+}
+
+static int
+vtd_init(void)
+{
+ int i, units;
+ struct vtdmap *vtdmap;
+ vm_paddr_t ctx_paddr;
+
+ for (i = 0; drhd_ident_funcs[i] != NULL; i++) {
+ units = (*drhd_ident_funcs[i])();
+ if (units > 0)
+ break;
+ }
+
+ if (units <= 0)
+ return (ENXIO);
+
+ drhd_num = units;
+ vtdmap = vtdmaps[0];
+
+ if (VTD_CAP_CM(vtdmap->cap) != 0)
+ panic("vtd_init: invalid caching mode");
+
+ max_domains = vtd_max_domains(vtdmap);
+
+ /*
+ * Set up the root-table to point to the context-entry tables.
+ */
+ for (i = 0; i < 256; i++) {
+ ctx_paddr = vtophys(ctx_tables[i]);
+ if (ctx_paddr & PAGE_MASK)
+ panic("ctx table (0x%0lx) not page aligned", ctx_paddr);
+
+ root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT;
+ }
+
+ return (0);
+}
+
+static void
+vtd_cleanup(void)
+{
+}
+
+static void
+vtd_enable(void)
+{
+ int i;
+ struct vtdmap *vtdmap;
+
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_wbflush(vtdmap);
+
+ /* Update the root table address */
+ vtdmap->rta = vtophys(root_table);
+ vtdmap->gcr = VTD_GCR_SRTP;
+ while ((vtdmap->gsr & VTD_GSR_RTPS) == 0)
+ ;
+
+ vtd_ctx_global_invalidate(vtdmap);
+ vtd_iotlb_global_invalidate(vtdmap);
+
+ vtd_translation_enable(vtdmap);
+ }
+}
+
+static void
+vtd_disable(void)
+{
+ int i;
+ struct vtdmap *vtdmap;
+
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_translation_disable(vtdmap);
+ }
+}
+
+static void
+vtd_add_device(void *arg, int bus, int slot, int func)
+{
+ int idx;
+ uint64_t *ctxp;
+ struct domain *dom = arg;
+ vm_paddr_t pt_paddr;
+ struct vtdmap *vtdmap;
+
+ if (bus < 0 || bus > PCI_BUSMAX ||
+ slot < 0 || slot > PCI_SLOTMAX ||
+ func < 0 || func > PCI_FUNCMAX)
+ panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func);
+
+ vtdmap = vtdmaps[0];
+ ctxp = ctx_tables[bus];
+ pt_paddr = vtophys(dom->ptp);
+ idx = (slot << 3 | func) * 2;
+
+ if (ctxp[idx] & VTD_CTX_PRESENT) {
+ panic("vtd_add_device: device %d/%d/%d is already owned by "
+ "domain %d", bus, slot, func,
+ (uint16_t)(ctxp[idx + 1] >> 8));
+ }
+
+ /*
+ * Order is important. The 'present' bit is set only after all fields
+ * of the context pointer are initialized.
+ */
+ ctxp[idx + 1] = dom->addrwidth | (dom->id << 8);
+
+ if (VTD_ECAP_DI(vtdmap->ext_cap))
+ ctxp[idx] = VTD_CTX_TT_ALL;
+ else
+ ctxp[idx] = 0;
+
+ ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT;
+
+ /*
+ * 'Not Present' entries are not cached in either the Context Cache
+ * or in the IOTLB, so there is no need to invalidate either of them.
+ */
+}
+
+static void
+vtd_remove_device(void *arg, int bus, int slot, int func)
+{
+ int i, idx;
+ uint64_t *ctxp;
+ struct vtdmap *vtdmap;
+
+ if (bus < 0 || bus > PCI_BUSMAX ||
+ slot < 0 || slot > PCI_SLOTMAX ||
+ func < 0 || func > PCI_FUNCMAX)
+ panic("vtd_remove_device: invalid bsf %d/%d/%d", bus, slot, func);
+
+ ctxp = ctx_tables[bus];
+ idx = (slot << 3 | func) * 2;
+
+ /*
+ * Order is important. The 'present' bit must be cleared first.
+ */
+ ctxp[idx] = 0;
+ ctxp[idx + 1] = 0;
+
+ /*
+ * Invalidate the Context Cache and the IOTLB.
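+ * The global invalidations below are correct but coarser than
+ * strictly necessary.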
+ * + * XXX use device-selective invalidation for Context Cache + * XXX use domain-selective invalidation for IOTLB + */ + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_ctx_global_invalidate(vtdmap); + vtd_iotlb_global_invalidate(vtdmap); + } +} + +static uint64_t +vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +{ + struct domain *dom; + int i, spshift, ptpshift, ptpindex, nlevels; + uint64_t spsize, *ptp; + + dom = arg; + ptpindex = 0; + ptpshift = 0; + + if (gpa & PAGE_MASK) + panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa); + + if (hpa & PAGE_MASK) + panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa); + + if (len & PAGE_MASK) + panic("vtd_create_mapping: unaligned len 0x%0lx", len); + + /* + * Compute the size of the mapping that we can accomodate. + * + * This is based on three factors: + * - supported super page size + * - alignment of the region starting at 'gpa' and 'hpa' + * - length of the region 'len' + */ + spshift = 48; + for (i = 3; i >= 0; i--) { + spsize = 1UL << spshift; + if ((dom->spsmask & (1 << i)) != 0 && + (gpa & (spsize - 1)) == 0 && + (hpa & (spsize - 1)) == 0 && + (len >= spsize)) { + break; + } + spshift -= 9; + } + + ptp = dom->ptp; + nlevels = dom->pt_levels; + while (--nlevels >= 0) { + ptpshift = 12 + nlevels * 9; + ptpindex = (gpa >> ptpshift) & 0x1FF; + + /* We have reached the leaf mapping */ + if (spshift >= ptpshift) { + break; + } + + /* + * We are working on a non-leaf page table page. + * + * Create a downstream page table page if necessary and point + * to it from the current page table. + */ + if (ptp[ptpindex] == 0) { + void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO); + ptp[ptpindex] = vtophys(nlp)| VTD_PTE_RD | VTD_PTE_WR; + } + + ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M); + } + + if ((gpa & ((1UL << ptpshift) - 1)) != 0) + panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift); + + /* + * Create a 'gpa' -> 'hpa' mapping + */ + ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR; + + if (nlevels > 0) + ptp[ptpindex] |= VTD_PTE_SUPERPAGE; + + return (1UL << ptpshift); +} + +static void * +vtd_create_domain(vm_paddr_t maxaddr) +{ + struct domain *dom; + vm_paddr_t addr; + int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth; + struct vtdmap *vtdmap; + + if (drhd_num <= 0) + panic("vtd_create_domain: no dma remapping hardware available"); + + vtdmap = vtdmaps[0]; + + /* + * Calculate AGAW. + * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec. + */ + addr = 0; + for (gaw = 0; addr < maxaddr; gaw++) + addr = 1ULL << gaw; + + res = (gaw - 12) % 9; + if (res == 0) + agaw = gaw; + else + agaw = gaw + 9 - res; + + if (agaw > 64) + agaw = 64; + + /* + * Select the smallest Supported AGAW and the corresponding number + * of page table levels. 
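+	 * In the loop below, SAGAW bit 'i' corresponds to an adjusted
+	 * address width of 30 + 9 * i bits and a 2 + i level page table.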
+ */ + pt_levels = 2; + sagaw = 30; + addrwidth = 0; + tmp = VTD_CAP_SAGAW(vtdmap->cap); + for (i = 0; i < 5; i++) { + if ((tmp & (1 << i)) != 0 && sagaw >= agaw) + break; + pt_levels++; + addrwidth++; + sagaw += 9; + if (sagaw > 64) + sagaw = 64; + } + + if (i >= 5) { + panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d", + VTD_CAP_SAGAW(vtdmap->cap), agaw); + } + + dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK); + dom->pt_levels = pt_levels; + dom->addrwidth = addrwidth; + dom->spsmask = VTD_CAP_SPS(vtdmap->cap); + dom->id = domain_id(); + dom->maxaddr = maxaddr; + dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK); + if ((uintptr_t)dom->ptp & PAGE_MASK) + panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp); + + SLIST_INSERT_HEAD(&domhead, dom, next); + + return (dom); +} + +static void +vtd_free_ptp(uint64_t *ptp, int level) +{ + int i; + uint64_t *nlp; + + if (level > 1) { + for (i = 0; i < 512; i++) { + if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0) + continue; + if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0) + continue; + nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M); + vtd_free_ptp(nlp, level - 1); + } + } + + bzero(ptp, PAGE_SIZE); + free(ptp, M_VTD); +} + +static void +vtd_destroy_domain(void *arg) +{ + struct domain *dom; + + dom = arg; + + SLIST_REMOVE(&domhead, dom, domain, next); + vtd_free_ptp(dom->ptp, dom->pt_levels); + free(dom, M_VTD); +} + +struct iommu_ops iommu_ops_intel = { + vtd_init, + vtd_cleanup, + vtd_enable, + vtd_disable, + vtd_create_domain, + vtd_destroy_domain, + vtd_create_mapping, + vtd_add_device, + vtd_remove_device, +}; diff --git a/sys/amd64/vmm/io/iommu.c b/sys/amd64/vmm/io/iommu.c new file mode 100644 index 000000000000..baf2447c4fa0 --- /dev/null +++ b/sys/amd64/vmm/io/iommu.c @@ -0,0 +1,230 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/bus.h> + +#include <dev/pci/pcivar.h> +#include <dev/pci/pcireg.h> + +#include <machine/md_var.h> + +#include "vmm_util.h" +#include "iommu.h" + +static boolean_t iommu_avail; +static struct iommu_ops *ops; +static void *host_domain; + +static __inline int +IOMMU_INIT(void) +{ + if (ops != NULL) + return ((*ops->init)()); + else + return (ENXIO); +} + +static __inline void +IOMMU_CLEANUP(void) +{ + if (ops != NULL && iommu_avail) + (*ops->cleanup)(); +} + +static __inline void * +IOMMU_CREATE_DOMAIN(vm_paddr_t maxaddr) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->create_domain)(maxaddr)); + else + return (NULL); +} + +static __inline void +IOMMU_DESTROY_DOMAIN(void *dom) +{ + + if (ops != NULL && iommu_avail) + (*ops->destroy_domain)(dom); +} + +static __inline uint64_t +IOMMU_CREATE_MAPPING(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->create_mapping)(domain, gpa, hpa, len)); + else + return (len); /* XXX */ +} + +static __inline void +IOMMU_ADD_DEVICE(void *domain, int bus, int slot, int func) +{ + + if (ops != NULL && iommu_avail) + (*ops->add_device)(domain, bus, slot, func); +} + +static __inline void +IOMMU_REMOVE_DEVICE(void *domain, int bus, int slot, int func) +{ + + if (ops != NULL && iommu_avail) + (*ops->remove_device)(domain, bus, slot, func); +} + +static __inline void +IOMMU_ENABLE(void) +{ + + if (ops != NULL && iommu_avail) + (*ops->enable)(); +} + +static __inline void +IOMMU_DISABLE(void) +{ + + if (ops != NULL && iommu_avail) + (*ops->disable)(); +} + +void +iommu_init(void) +{ + int error, bus, slot, func; + vm_paddr_t maxaddr; + const char *name; + device_t dev; + + if (vmm_is_intel()) + ops = &iommu_ops_intel; + else if (vmm_is_amd()) + ops = &iommu_ops_amd; + else + ops = NULL; + + error = IOMMU_INIT(); + if (error) + return; + + iommu_avail = TRUE; + + /* + * Create a domain for the devices owned by the host + */ + maxaddr = ptoa(Maxmem); + host_domain = IOMMU_CREATE_DOMAIN(maxaddr); + if (host_domain == NULL) + panic("iommu_init: unable to create a host domain"); + + /* + * Create 1:1 mappings from '0' to 'Maxmem' for devices assigned to + * the host + */ + iommu_create_mapping(host_domain, 0, 0, maxaddr); + + for (bus = 0; bus <= PCI_BUSMAX; bus++) { + for (slot = 0; slot <= PCI_SLOTMAX; slot++) { + for (func = 0; func <= PCI_FUNCMAX; func++) { + dev = pci_find_dbsf(0, bus, slot, func); + if (dev == NULL) + continue; + + /* skip passthrough devices */ + name = device_get_name(dev); + if (name != NULL && strcmp(name, "ppt") == 0) + continue; + + /* everything else belongs to the host domain */ + iommu_add_device(host_domain, bus, slot, func); + } + } + } + IOMMU_ENABLE(); + +} + +void +iommu_cleanup(void) +{ + IOMMU_DISABLE(); + IOMMU_DESTROY_DOMAIN(host_domain); + IOMMU_CLEANUP(); +} + +void * +iommu_create_domain(vm_paddr_t maxaddr) +{ + + return (IOMMU_CREATE_DOMAIN(maxaddr)); +} + +void +iommu_destroy_domain(void *dom) +{ + + IOMMU_DESTROY_DOMAIN(dom); +} + +void +iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len) +{ + uint64_t mapped, remaining; + + remaining = len; + + while (remaining > 0) { + mapped = IOMMU_CREATE_MAPPING(dom, gpa, hpa, remaining); + gpa += mapped; + hpa += mapped; + remaining -= mapped; + } +} + +void +iommu_add_device(void *dom, int bus, int slot, int func) +{ + + 
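+	/* Delegate to the vendor-specific ops vector, if one is registered. */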
IOMMU_ADD_DEVICE(dom, bus, slot, func); +} + +void +iommu_remove_device(void *dom, int bus, int slot, int func) +{ + + IOMMU_REMOVE_DEVICE(dom, bus, slot, func); +} diff --git a/sys/amd64/vmm/io/iommu.h b/sys/amd64/vmm/io/iommu.h new file mode 100644 index 000000000000..e4f722914fdd --- /dev/null +++ b/sys/amd64/vmm/io/iommu.h @@ -0,0 +1,67 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IO_IOMMU_H_ +#define _IO_IOMMU_H_ + +typedef int (*iommu_init_func_t)(void); +typedef void (*iommu_cleanup_func_t)(void); +typedef void (*iommu_enable_func_t)(void); +typedef void (*iommu_disable_func_t)(void); +typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr); +typedef void (*iommu_destroy_domain_t)(void *domain); +typedef uint64_t (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa, + vm_paddr_t hpa, uint64_t len); +typedef void (*iommu_add_device_t)(void *domain, int bus, int slot, int func); +typedef void (*iommu_remove_device_t)(void *dom, int bus, int slot, int func); + +struct iommu_ops { + iommu_init_func_t init; /* module wide */ + iommu_cleanup_func_t cleanup; + iommu_enable_func_t enable; + iommu_disable_func_t disable; + + iommu_create_domain_t create_domain; /* domain-specific */ + iommu_destroy_domain_t destroy_domain; + iommu_create_mapping_t create_mapping; + iommu_add_device_t add_device; + iommu_remove_device_t remove_device; +}; + +extern struct iommu_ops iommu_ops_intel; +extern struct iommu_ops iommu_ops_amd; + +void iommu_init(void); +void iommu_cleanup(void); +void *iommu_create_domain(vm_paddr_t maxaddr); +void iommu_destroy_domain(void *dom); +void iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, + size_t len); +void iommu_add_device(void *dom, int bus, int slot, int func); +void iommu_remove_device(void *dom, int bus, int slot, int func); +#endif diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c new file mode 100644 index 000000000000..dc2f326b1e85 --- /dev/null +++ b/sys/amd64/vmm/io/ppt.c @@ -0,0 +1,449 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/bus.h> +#include <sys/pciio.h> +#include <sys/rman.h> + +#include <dev/pci/pcivar.h> +#include <dev/pci/pcireg.h> + +#include <machine/resource.h> + +#include <machine/vmm.h> +#include <machine/vmm_dev.h> + +#include "vmm_lapic.h" +#include "vmm_ktr.h" + +#include "iommu.h" +#include "ppt.h" + +#define MAX_PPTDEVS (sizeof(pptdevs) / sizeof(pptdevs[0])) +#define MAX_MMIOSEGS (PCIR_MAX_BAR_0 + 1) +#define MAX_MSIMSGS 32 + +struct pptintr_arg { /* pptintr(pptintr_arg) */ + struct pptdev *pptdev; + int msg; +}; + +static struct pptdev { + device_t dev; + struct vm *vm; /* owner of this device */ + struct vm_memory_segment mmio[MAX_MMIOSEGS]; + struct { + int num_msgs; /* guest state */ + int vector; + int vcpu; + + int startrid; /* host state */ + struct resource *res[MAX_MSIMSGS]; + void *cookie[MAX_MSIMSGS]; + struct pptintr_arg arg[MAX_MSIMSGS]; + } msi; +} pptdevs[32]; + +static int num_pptdevs; + +static int +ppt_probe(device_t dev) +{ + int bus, slot, func; + struct pci_devinfo *dinfo; + + dinfo = (struct pci_devinfo *)device_get_ivars(dev); + + bus = pci_get_bus(dev); + slot = pci_get_slot(dev); + func = pci_get_function(dev); + + /* + * To qualify as a pci passthrough device a device must: + * - be allowed by administrator to be used in this role + * - be an endpoint device + */ + if (vmm_is_pptdev(bus, slot, func) && + (dinfo->cfg.hdrtype & PCIM_HDRTYPE) == PCIM_HDRTYPE_NORMAL) + return (0); + else + return (ENXIO); +} + +static int +ppt_attach(device_t dev) +{ + int n; + + if (num_pptdevs >= MAX_PPTDEVS) { + printf("ppt_attach: maximum number of pci passthrough devices " + "exceeded\n"); + return (ENXIO); + } + + n = num_pptdevs++; + pptdevs[n].dev = dev; + + if (bootverbose) + device_printf(dev, "attached\n"); + + return (0); +} + +static int +ppt_detach(device_t dev) +{ + /* + * XXX check whether there are any pci passthrough devices assigned + * to guests before we allow this driver to detach. 
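+	 * Note also that num_pptdevs is not decremented here, so a detach
+	 * leaves a stale slot behind in the pptdevs[] array.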
+ */ + + return (0); +} + +static device_method_t ppt_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, ppt_probe), + DEVMETHOD(device_attach, ppt_attach), + DEVMETHOD(device_detach, ppt_detach), + {0, 0} +}; + +static devclass_t ppt_devclass; +DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, 0); +DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL); + +static struct pptdev * +ppt_find(int bus, int slot, int func) +{ + device_t dev; + int i, b, s, f; + + for (i = 0; i < num_pptdevs; i++) { + dev = pptdevs[i].dev; + b = pci_get_bus(dev); + s = pci_get_slot(dev); + f = pci_get_function(dev); + if (bus == b && slot == s && func == f) + return (&pptdevs[i]); + } + return (NULL); +} + +static void +ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt) +{ + int i; + struct vm_memory_segment *seg; + + for (i = 0; i < MAX_MMIOSEGS; i++) { + seg = &ppt->mmio[i]; + if (seg->len == 0) + continue; + (void)vm_unmap_mmio(vm, seg->gpa, seg->len); + bzero(seg, sizeof(struct vm_memory_segment)); + } +} + +static void +ppt_teardown_msi(struct pptdev *ppt) +{ + int i, rid; + void *cookie; + struct resource *res; + + if (ppt->msi.num_msgs == 0) + return; + + for (i = 0; i < ppt->msi.num_msgs; i++) { + rid = ppt->msi.startrid + i; + res = ppt->msi.res[i]; + cookie = ppt->msi.cookie[i]; + + if (cookie != NULL) + bus_teardown_intr(ppt->dev, res, cookie); + + if (res != NULL) + bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res); + + ppt->msi.res[i] = NULL; + ppt->msi.cookie[i] = NULL; + } + + if (ppt->msi.startrid == 1) + pci_release_msi(ppt->dev); + + ppt->msi.num_msgs = 0; +} + +int +ppt_assign_device(struct vm *vm, int bus, int slot, int func) +{ + struct pptdev *ppt; + + ppt = ppt_find(bus, slot, func); + if (ppt != NULL) { + /* + * If this device is owned by a different VM then we + * cannot change its owner. + */ + if (ppt->vm != NULL && ppt->vm != vm) + return (EBUSY); + + ppt->vm = vm; + iommu_add_device(vm_iommu_domain(vm), bus, slot, func); + return (0); + } + return (ENOENT); +} + +int +ppt_unassign_device(struct vm *vm, int bus, int slot, int func) +{ + struct pptdev *ppt; + + ppt = ppt_find(bus, slot, func); + if (ppt != NULL) { + /* + * If this device is not owned by this 'vm' then bail out. 
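+		 * EBUSY rather than ENOENT: the device exists but is owned
+		 * by another virtual machine.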
+ */ + if (ppt->vm != vm) + return (EBUSY); + ppt_unmap_mmio(vm, ppt); + ppt_teardown_msi(ppt); + iommu_remove_device(vm_iommu_domain(vm), bus, slot, func); + ppt->vm = NULL; + return (0); + } + return (ENOENT); +} + +int +ppt_unassign_all(struct vm *vm) +{ + int i, bus, slot, func; + device_t dev; + + for (i = 0; i < num_pptdevs; i++) { + if (pptdevs[i].vm == vm) { + dev = pptdevs[i].dev; + bus = pci_get_bus(dev); + slot = pci_get_slot(dev); + func = pci_get_function(dev); + ppt_unassign_device(vm, bus, slot, func); + } + } + + return (0); +} + +int +ppt_map_mmio(struct vm *vm, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa) +{ + int i, error; + struct vm_memory_segment *seg; + struct pptdev *ppt; + + ppt = ppt_find(bus, slot, func); + if (ppt != NULL) { + if (ppt->vm != vm) + return (EBUSY); + + for (i = 0; i < MAX_MMIOSEGS; i++) { + seg = &ppt->mmio[i]; + if (seg->len == 0) { + error = vm_map_mmio(vm, gpa, len, hpa); + if (error == 0) { + seg->gpa = gpa; + seg->len = len; + seg->hpa = hpa; + } + return (error); + } + } + return (ENOSPC); + } + return (ENOENT); +} + +static int +pptintr(void *arg) +{ + int vec; + struct pptdev *ppt; + struct pptintr_arg *pptarg; + + pptarg = arg; + ppt = pptarg->pptdev; + vec = ppt->msi.vector + pptarg->msg; + + if (ppt->vm != NULL) + (void) lapic_set_intr(ppt->vm, ppt->msi.vcpu, vec); + else { + /* + * XXX + * This is not expected to happen - panic? + */ + } + + /* + * For legacy interrupts give other filters a chance in case + * the interrupt was not generated by the passthrough device. + */ + if (ppt->msi.startrid == 0) + return (FILTER_STRAY); + else + return (FILTER_HANDLED); +} + +/* + * XXX + * When we try to free the MSI resource the kernel will bind the thread to + * the host cpu was originally handling the MSI. The function freeing the + * MSI vector (apic_free_vector()) will panic the kernel if the thread + * is already bound to a cpu. + * + * So, we temporarily unbind the vcpu thread before freeing the MSI resource. + */ +static void +PPT_TEARDOWN_MSI(struct vm *vm, int vcpu, struct pptdev *ppt) +{ + int pincpu = -1; + + vm_get_pinning(vm, vcpu, &pincpu); + + if (pincpu >= 0) + vm_set_pinning(vm, vcpu, -1); + + ppt_teardown_msi(ppt); + + if (pincpu >= 0) + vm_set_pinning(vm, vcpu, pincpu); +} + +int +ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, + int destcpu, int vector, int numvec) +{ + int i, rid, flags; + int msi_count, startrid, error, tmp; + struct pptdev *ppt; + + if ((destcpu >= VM_MAXCPU || destcpu < 0) || + (vector < 0 || vector > 255) || + (numvec < 0 || numvec > MAX_MSIMSGS)) + return (EINVAL); + + ppt = ppt_find(bus, slot, func); + if (ppt == NULL) + return (ENOENT); + if (ppt->vm != vm) /* Make sure we own this device */ + return (EBUSY); + + /* Free any allocated resources */ + PPT_TEARDOWN_MSI(vm, vcpu, ppt); + + if (numvec == 0) /* nothing more to do */ + return (0); + + flags = RF_ACTIVE; + msi_count = pci_msi_count(ppt->dev); + if (msi_count == 0) { + startrid = 0; /* legacy interrupt */ + msi_count = 1; + flags |= RF_SHAREABLE; + } else + startrid = 1; /* MSI */ + + /* + * The device must be capable of supporting the number of vectors + * the guest wants to allocate. + */ + if (numvec > msi_count) + return (EINVAL); + + /* + * Make sure that we can allocate all the MSI vectors that are needed + * by the guest. 
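+	 * pci_alloc_msi() may allocate fewer messages than requested; in
+	 * that case the partial allocation is released and ENOSPC returned.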
+ */ + if (startrid == 1) { + tmp = numvec; + error = pci_alloc_msi(ppt->dev, &tmp); + if (error) + return (error); + else if (tmp != numvec) { + pci_release_msi(ppt->dev); + return (ENOSPC); + } else { + /* success */ + } + } + + ppt->msi.vector = vector; + ppt->msi.vcpu = destcpu; + ppt->msi.startrid = startrid; + + /* + * Allocate the irq resource and attach it to the interrupt handler. + */ + for (i = 0; i < numvec; i++) { + ppt->msi.num_msgs = i + 1; + ppt->msi.cookie[i] = NULL; + + rid = startrid + i; + ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ, + &rid, flags); + if (ppt->msi.res[i] == NULL) + break; + + ppt->msi.arg[i].pptdev = ppt; + ppt->msi.arg[i].msg = i; + + error = bus_setup_intr(ppt->dev, ppt->msi.res[i], + INTR_TYPE_NET | INTR_MPSAFE | INTR_FAST, + pptintr, NULL, &ppt->msi.arg[i], + &ppt->msi.cookie[i]); + if (error != 0) + break; + } + + if (i < numvec) { + PPT_TEARDOWN_MSI(vm, vcpu, ppt); + return (ENXIO); + } + + return (0); +} diff --git a/sys/amd64/vmm/io/ppt.h b/sys/amd64/vmm/io/ppt.h new file mode 100644 index 000000000000..95f3ad08b44c --- /dev/null +++ b/sys/amd64/vmm/io/ppt.h @@ -0,0 +1,40 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IO_PPT_H_ +#define _IO_PPT_H_ + +int ppt_assign_device(struct vm *vm, int bus, int slot, int func); +int ppt_unassign_device(struct vm *vm, int bus, int slot, int func); +int ppt_unassign_all(struct vm *vm); +int ppt_map_mmio(struct vm *vm, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, + int destcpu, int vector, int numvec); + +#endif diff --git a/sys/amd64/vmm/io/vdev.c b/sys/amd64/vmm/io/vdev.c new file mode 100644 index 000000000000..cd6c5d1b39c9 --- /dev/null +++ b/sys/amd64/vmm/io/vdev.c @@ -0,0 +1,270 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> + +#include "vdev.h" + +struct vdev { + SLIST_ENTRY(vdev) entry; + struct vdev_ops *ops; + void *dev; +}; +static SLIST_HEAD(, vdev) vdev_head; +static int vdev_count; + +struct vdev_region { + SLIST_ENTRY(vdev_region) entry; + struct vdev_ops *ops; + void *dev; + struct io_region *io; +}; +static SLIST_HEAD(, vdev_region) region_head; +static int region_count; + +static MALLOC_DEFINE(M_VDEV, "vdev", "vdev"); + +#define VDEV_INIT (0) +#define VDEV_RESET (1) +#define VDEV_HALT (2) + +// static const char* vdev_event_str[] = {"VDEV_INIT", "VDEV_RESET", "VDEV_HALT"}; + +static int +vdev_system_event(int event) +{ + struct vdev *vd; + int rc; + + // TODO: locking + SLIST_FOREACH(vd, &vdev_head, entry) { + // printf("%s : %s Device %s\n", __func__, vdev_event_str[event], vd->ops->name); + switch (event) { + case VDEV_INIT: + rc = vd->ops->init(vd->dev); + break; + case VDEV_RESET: + rc = vd->ops->reset(vd->dev); + break; + case VDEV_HALT: + rc = vd->ops->halt(vd->dev); + break; + default: + break; + } + if (rc) { + printf("vdev %s init failed rc=%d\n", + vd->ops->name, rc); + return rc; + } + } + return 0; +} + +int +vdev_init(void) +{ + return vdev_system_event(VDEV_INIT); +} + +int +vdev_reset(void) +{ + return vdev_system_event(VDEV_RESET); +} + +int +vdev_halt(void) +{ + return vdev_system_event(VDEV_HALT); +} + +void +vdev_vm_init(void) +{ + SLIST_INIT(&vdev_head); + vdev_count = 0; + + SLIST_INIT(®ion_head); + region_count = 0; +} +void +vdev_vm_cleanup(void) +{ + struct vdev *vd; + + // TODO: locking + while (!SLIST_EMPTY(&vdev_head)) { + vd = SLIST_FIRST(&vdev_head); + SLIST_REMOVE_HEAD(&vdev_head, entry); + free(vd, M_VDEV); + vdev_count--; + } +} + +int +vdev_register(struct vdev_ops *ops, void *dev) +{ + struct vdev *vd; + vd = malloc(sizeof(*vd), M_VDEV, M_WAITOK | M_ZERO); + vd->ops = ops; + vd->dev = dev; + + // TODO: locking + SLIST_INSERT_HEAD(&vdev_head, vd, entry); + vdev_count++; + return 0; +} + +void +vdev_unregister(void *dev) +{ + struct vdev *vd, *found; + + found = NULL; + // TODO: locking + SLIST_FOREACH(vd, &vdev_head, entry) { + if (vd->dev == dev) { + found = vd; + } + } + + if (found) { + SLIST_REMOVE(&vdev_head, found, vdev, entry); + free(found, M_VDEV); + } +} + +#define IN_RANGE(val, start, end) \ + (((val) >= (start)) && ((val) < (end))) + +static struct vdev_region* +vdev_find_region(struct io_region *io, void *dev) +{ + struct 
vdev_region *region, *found; + uint64_t region_base; + uint64_t region_end; + + found = NULL; + + // TODO: locking + // FIXME: we should verify we are in the context the current + // vcpu here as well. + SLIST_FOREACH(region, ®ion_head, entry) { + region_base = region->io->base; + region_end = region_base + region->io->len; + if (IN_RANGE(io->base, region_base, region_end) && + IN_RANGE(io->base+io->len, region_base, region_end+1) && + (dev && dev == region->dev)) { + found = region; + break; + } + } + return found; +} + +int +vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io) +{ + struct vdev_region *region; + + region = vdev_find_region(io, dev); + if (region) { + return -EEXIST; + } + + region = malloc(sizeof(*region), M_VDEV, M_WAITOK | M_ZERO); + region->io = io; + region->ops = ops; + region->dev = dev; + + // TODO: locking + SLIST_INSERT_HEAD(®ion_head, region, entry); + region_count++; + + return 0; +} + +void +vdev_unregister_region(void *dev, struct io_region *io) +{ + struct vdev_region *region; + + region = vdev_find_region(io, dev); + + if (region) { + SLIST_REMOVE(®ion_head, region, vdev_region, entry); + free(region, M_VDEV); + region_count--; + } +} + +static int +vdev_memrw(uint64_t gpa, opsize_t size, uint64_t *data, int read) +{ + struct vdev_region *region; + struct io_region io; + region_attr_t attr; + int rc; + + io.base = gpa; + io.len = size; + + region = vdev_find_region(&io, NULL); + if (!region) + return -EINVAL; + + attr = (read) ? MMIO_READ : MMIO_WRITE; + if (!(region->io->attr & attr)) + return -EPERM; + + if (read) + rc = region->ops->memread(region->dev, gpa, size, data); + else + rc = region->ops->memwrite(region->dev, gpa, size, *data); + + return rc; +} + +int +vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data) +{ + return vdev_memrw(gpa, size, data, 1); +} + +int +vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data) +{ + return vdev_memrw(gpa, size, &data, 0); +} diff --git a/sys/amd64/vmm/io/vdev.h b/sys/amd64/vmm/io/vdev.h new file mode 100644 index 000000000000..6feeba87b7c0 --- /dev/null +++ b/sys/amd64/vmm/io/vdev.h @@ -0,0 +1,84 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VDEV_H_ +#define _VDEV_H_ + +typedef enum { + BYTE = 1, + WORD = 2, + DWORD = 4, + QWORD = 8, +} opsize_t; + +typedef enum { + MMIO_READ = 1, + MMIO_WRITE = 2, +} region_attr_t; + +struct io_region { + uint64_t base; + uint64_t len; + region_attr_t attr; + int vcpu; +}; + +typedef int (*vdev_init_t)(void* dev); +typedef int (*vdev_reset_t)(void* dev); +typedef int (*vdev_halt_t)(void* dev); +typedef int (*vdev_memread_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t *data); +typedef int (*vdev_memwrite_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t data); + + +struct vdev_ops { + const char *name; + vdev_init_t init; + vdev_reset_t reset; + vdev_halt_t halt; + vdev_memread_t memread; + vdev_memwrite_t memwrite; +}; + + +void vdev_vm_init(void); +void vdev_vm_cleanup(void); + +int vdev_register(struct vdev_ops *ops, void *dev); +void vdev_unregister(void *dev); + +int vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io); +void vdev_unregister_region(void *dev, struct io_region *io); + +int vdev_init(void); +int vdev_reset(void); +int vdev_halt(void); +int vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data); +int vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data); + +#endif /* _VDEV_H_ */ + diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c new file mode 100644 index 000000000000..a21addfd5d32 --- /dev/null +++ b/sys/amd64/vmm/io/vlapic.c @@ -0,0 +1,812 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/systm.h> + +#include <machine/clock.h> +#include <machine/apicreg.h> + +#include <machine/vmm.h> + +#include "vmm_lapic.h" +#include "vmm_ktr.h" +#include "vdev.h" +#include "vlapic.h" + +#define VLAPIC_CTR0(vlapic, format) \ + VMM_CTR0((vlapic)->vm, (vlapic)->vcpuid, format) + +#define VLAPIC_CTR1(vlapic, format, p1) \ + VMM_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1) + +#define VLAPIC_CTR_IRR(vlapic, msg) \ +do { \ + uint32_t *irrptr = &(vlapic)->apic.irr0; \ + irrptr[0] = irrptr[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", irrptr[7 << 2]); \ +} while (0) + +#define VLAPIC_CTR_ISR(vlapic, msg) \ +do { \ + uint32_t *isrptr = &(vlapic)->apic.isr0; \ + isrptr[0] = isrptr[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \ +} while (0) + +static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); + +#define PRIO(x) ((x) >> 4) + +#define VLAPIC_VERSION (16) +#define VLAPIC_MAXLVT_ENTRIES (5) + +struct vlapic { + struct vm *vm; + int vcpuid; + + struct io_region *mmio; + struct vdev_ops *ops; + struct LAPIC apic; + + int esr_update; + + int divisor; + int ccr_ticks; + + /* + * The 'isrvec_stk' is a stack of vectors injected by the local apic. + * A vector is popped from the stack when the processor does an EOI. + * The vector on the top of the stack is used to compute the + * Processor Priority in conjunction with the TPR. 
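+	 * The entry at index 0 is always 0 and stands for the ISRV value
+	 * when no bits are set in the ISRx registers.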
+	 */
+	uint8_t	isrvec_stk[ISRVEC_STK_SIZE];
+	int	isrvec_stk_top;
+};
+
+static void
+vlapic_mask_lvts(uint32_t *lvts, int num_lvt)
+{
+	int i;
+	for (i = 0; i < num_lvt; i++) {
+		*lvts |= APIC_LVT_M;
+		lvts += 4;
+	}
+}
+
+#if 0
+static inline void
+vlapic_dump_lvt(uint32_t offset, uint32_t *lvt)
+{
+	printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset,
+	    *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS,
+	    *lvt & APIC_LVTT_M);
+}
+#endif
+
+static uint64_t
+vlapic_get_ccr(struct vlapic *vlapic)
+{
+	struct LAPIC *lapic = &vlapic->apic;
+	return lapic->ccr_timer;
+}
+
+static void
+vlapic_update_errors(struct vlapic *vlapic)
+{
+	struct LAPIC *lapic = &vlapic->apic;
+	lapic->esr = 0; // XXX
+}
+
+static void
+vlapic_init_ipi(struct vlapic *vlapic)
+{
+	struct LAPIC *lapic = &vlapic->apic;
+	lapic->version = VLAPIC_VERSION;
+	lapic->version |= (VLAPIC_MAXLVT_ENTRIES << MAXLVTSHIFT);
+	lapic->dfr = 0xffffffff;
+	lapic->svr = APIC_SVR_VECTOR;
+	vlapic_mask_lvts(&lapic->lvt_timer, VLAPIC_MAXLVT_ENTRIES+1);
+}
+
+static int
+vlapic_op_reset(void* dev)
+{
+	struct vlapic *vlapic = (struct vlapic*)dev;
+	struct LAPIC *lapic = &vlapic->apic;
+
+	memset(lapic, 0, sizeof(*lapic));
+	lapic->id = vlapic->vcpuid << 24;
+	lapic->apr = vlapic->vcpuid;
+	vlapic_init_ipi(vlapic);
+
+	return 0;
+}
+
+static int
+vlapic_op_init(void* dev)
+{
+	struct vlapic *vlapic = (struct vlapic*)dev;
+	vdev_register_region(vlapic->ops, vlapic, vlapic->mmio);
+	return vlapic_op_reset(dev);
+}
+
+static int
+vlapic_op_halt(void* dev)
+{
+	struct vlapic *vlapic = (struct vlapic*)dev;
+	vdev_unregister_region(vlapic, vlapic->mmio);
+	return 0;
+}
+
+void
+vlapic_set_intr_ready(struct vlapic *vlapic, int vector)
+{
+	struct LAPIC *lapic = &vlapic->apic;
+	uint32_t *irrptr;
+	int idx;
+
+	if (vector < 0 || vector >= 256)
+		panic("vlapic_set_intr_ready: invalid vector %d\n", vector);
+
+	idx = (vector / 32) * 4;
+	irrptr = &lapic->irr0;
+	atomic_set_int(&irrptr[idx], 1 << (vector % 32));
+	VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready");
+}
+
+#define VLAPIC_BUS_FREQ	tsc_freq
+#define VLAPIC_DCR(x)	((x->dcr_timer & 0x8) >> 1)|(x->dcr_timer & 0x3)
+
+static int
+vlapic_timer_divisor(uint32_t dcr)
+{
+	switch (dcr & 0xB) {
+	case APIC_TDCR_2:
+		return (2);
+	case APIC_TDCR_4:
+		return (4);
+	case APIC_TDCR_8:
+		return (8);
+	case APIC_TDCR_16:
+		return (16);
+	case APIC_TDCR_32:
+		return (32);
+	case APIC_TDCR_64:
+		return (64);
+	case APIC_TDCR_128:
+		return (128);
+	default:
+		panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr);
+	}
+}
+
+static void
+vlapic_start_timer(struct vlapic *vlapic, uint32_t elapsed)
+{
+	uint32_t icr_timer;
+
+	icr_timer = vlapic->apic.icr_timer;
+
+	vlapic->ccr_ticks = ticks;
+	if (elapsed < icr_timer)
+		vlapic->apic.ccr_timer = icr_timer - elapsed;
+	else {
+		/*
+		 * This can happen when the guest is trying to run its local
+		 * apic timer higher than the setting of 'hz' in the host.
+		 *
+		 * We deal with this by running the guest local apic timer
+		 * at the rate of the host's 'hz' setting.
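+		 * The current count is simply clamped to zero in that case.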
+		 */
+		vlapic->apic.ccr_timer = 0;
+	}
+}
+
+static __inline uint32_t *
+vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset)
+{
+	struct LAPIC *lapic = &vlapic->apic;
+	int i;
+
+	if (offset < APIC_OFFSET_TIMER_LVT || offset > APIC_OFFSET_ERROR_LVT) {
+		panic("vlapic_get_lvt: invalid LVT\n");
+	}
+	i = (offset - APIC_OFFSET_TIMER_LVT) >> 2;
+	return ((&lapic->lvt_timer) + i);
+}
+
+#if 1
+static void
+dump_isrvec_stk(struct vlapic *vlapic)
+{
+	int i;
+	uint32_t *isrptr;
+
+	isrptr = &vlapic->apic.isr0;
+	for (i = 0; i < 8; i++)
+		printf("ISR%d 0x%08x\n", i, isrptr[i * 4]);
+
+	for (i = 0; i <= vlapic->isrvec_stk_top; i++)
+		printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]);
+}
+#endif
+
+/*
+ * Algorithm adopted from section "Interrupt, Task and Processor Priority"
+ * in Intel Architecture Manual Vol 3a.
+ */
+static void
+vlapic_update_ppr(struct vlapic *vlapic)
+{
+	int isrvec, tpr, ppr;
+
+	/*
+	 * Note that the value on the stack at index 0 is always 0.
+	 *
+	 * This is a placeholder for the value of ISRV when none of the
+	 * bits is set in the ISRx registers.
+	 */
+	isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top];
+	tpr = vlapic->apic.tpr;
+
+#if 1
+	{
+		int i, lastprio, curprio, vector, idx;
+		uint32_t *isrptr;
+
+		if (vlapic->isrvec_stk_top == 0 && isrvec != 0)
+			panic("isrvec_stk is corrupted: %d", isrvec);
+
+		/*
+		 * Make sure that the priority of the nested interrupts is
+		 * always increasing.
+		 */
+		lastprio = -1;
+		for (i = 1; i <= vlapic->isrvec_stk_top; i++) {
+			curprio = PRIO(vlapic->isrvec_stk[i]);
+			if (curprio <= lastprio) {
+				dump_isrvec_stk(vlapic);
+				panic("isrvec_stk does not satisfy invariant");
+			}
+			lastprio = curprio;
+		}
+
+		/*
+		 * Make sure that each bit set in the ISRx registers has a
+		 * corresponding entry on the isrvec stack.
+		 */
+		i = 1;
+		isrptr = &vlapic->apic.isr0;
+		for (vector = 0; vector < 256; vector++) {
+			idx = (vector / 32) * 4;
+			if (isrptr[idx] & (1 << (vector % 32))) {
+				if (i > vlapic->isrvec_stk_top ||
+				    vlapic->isrvec_stk[i] != vector) {
+					dump_isrvec_stk(vlapic);
+					panic("ISR and isrvec_stk out of sync");
+				}
+				i++;
+			}
+		}
+	}
+#endif
+
+	if (PRIO(tpr) >= PRIO(isrvec))
+		ppr = tpr;
+	else
+		ppr = isrvec & 0xf0;
+
+	vlapic->apic.ppr = ppr;
+	VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr);
+}
+
+static void
+vlapic_process_eoi(struct vlapic *vlapic)
+{
+	struct LAPIC *lapic = &vlapic->apic;
+	uint32_t *isrptr;
+	int i, idx, bitpos;
+
+	isrptr = &lapic->isr0;
+
+	/*
+	 * The x86 architecture reserves the first 32 vectors for use
+	 * by the processor.
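+	 * Scanning the ISR words from index 7 down to 1 therefore covers
+	 * vectors 255 down to 32 and skips the reserved range entirely.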
+ */ + for (i = 7; i > 0; i--) { + idx = i * 4; + bitpos = fls(isrptr[idx]); + if (bitpos != 0) { + if (vlapic->isrvec_stk_top <= 0) { + panic("invalid vlapic isrvec_stk_top %d", + vlapic->isrvec_stk_top); + } + isrptr[idx] &= ~(1 << (bitpos - 1)); + VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi"); + vlapic->isrvec_stk_top--; + vlapic_update_ppr(vlapic); + return; + } + } +} + +static __inline int +vlapic_get_lvt_field(uint32_t *lvt, uint32_t mask) +{ + return (*lvt & mask); +} + +static __inline int +vlapic_periodic_timer(struct vlapic *vlapic) +{ + uint32_t *lvt; + + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); + + return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC)); +} + +static void +vlapic_fire_timer(struct vlapic *vlapic) +{ + int vector; + uint32_t *lvt; + + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); + + if (!vlapic_get_lvt_field(lvt, APIC_LVTT_M)) { + vector = vlapic_get_lvt_field(lvt,APIC_LVTT_VECTOR); + vlapic_set_intr_ready(vlapic, vector); + } +} + +static int +lapic_process_icr(struct vlapic *vlapic, uint64_t icrval) +{ + int i; + cpumask_t dmask, thiscpumask; + uint32_t dest, vec, mode; + + thiscpumask = vcpu_mask(vlapic->vcpuid); + + dmask = 0; + dest = icrval >> 32; + vec = icrval & APIC_VECTOR_MASK; + mode = icrval & APIC_DELMODE_MASK; + + if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) { + switch (icrval & APIC_DEST_MASK) { + case APIC_DEST_DESTFLD: + dmask = vcpu_mask(dest); + break; + case APIC_DEST_SELF: + dmask = thiscpumask; + break; + case APIC_DEST_ALLISELF: + dmask = vm_active_cpus(vlapic->vm); + break; + case APIC_DEST_ALLESELF: + dmask = vm_active_cpus(vlapic->vm) & ~thiscpumask; + break; + } + + for (i = 0; i < VM_MAXCPU; i++) { + if (dmask & vcpu_mask(i)) { + if (mode == APIC_DELMODE_FIXED) + lapic_set_intr(vlapic->vm, i, vec); + else + vm_inject_nmi(vlapic->vm, i); + } + } + + return (0); /* handled completely in the kernel */ + } + + /* + * XXX this assumes that the startup IPI always succeeds + */ + if (mode == APIC_DELMODE_STARTUP) + vm_activate_cpu(vlapic->vm, dest); + + /* + * This will cause a return to userland. + */ + return (1); +} + +int +vlapic_pending_intr(struct vlapic *vlapic) +{ + struct LAPIC *lapic = &vlapic->apic; + int idx, i, bitpos, vector; + uint32_t *irrptr, val; + + irrptr = &lapic->irr0; + + /* + * The x86 architecture reserves the the first 32 vectors for use + * by the processor. + */ + for (i = 7; i > 0; i--) { + idx = i * 4; + val = atomic_load_acq_int(&irrptr[idx]); + bitpos = fls(val); + if (bitpos != 0) { + vector = i * 32 + (bitpos - 1); + if (PRIO(vector) > PRIO(lapic->ppr)) { + VLAPIC_CTR1(vlapic, "pending intr %d", vector); + return (vector); + } else + break; + } + } + VLAPIC_CTR0(vlapic, "no pending intr"); + return (-1); +} + +void +vlapic_intr_accepted(struct vlapic *vlapic, int vector) +{ + struct LAPIC *lapic = &vlapic->apic; + uint32_t *irrptr, *isrptr; + int idx, stk_top; + + /* + * clear the ready bit for vector being accepted in irr + * and set the vector as in service in isr. 
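+	 * The IRR/ISR words are spaced 16 bytes apart in struct LAPIC, so
+	 * the index below uses a stride of 4 uint32_t's per 32 vectors.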
+ */ + idx = (vector / 32) * 4; + + irrptr = &lapic->irr0; + atomic_clear_int(&irrptr[idx], 1 << (vector % 32)); + VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted"); + + isrptr = &lapic->isr0; + isrptr[idx] |= 1 << (vector % 32); + VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted"); + + /* + * Update the PPR + */ + vlapic->isrvec_stk_top++; + + stk_top = vlapic->isrvec_stk_top; + if (stk_top >= ISRVEC_STK_SIZE) + panic("isrvec_stk_top overflow %d", stk_top); + + vlapic->isrvec_stk[stk_top] = vector; + vlapic_update_ppr(vlapic); +} + +int +vlapic_op_mem_read(void* dev, uint64_t gpa, opsize_t size, uint64_t *data) +{ + struct vlapic *vlapic = (struct vlapic*)dev; + struct LAPIC *lapic = &vlapic->apic; + uint64_t offset = gpa & ~(PAGE_SIZE); + uint32_t *reg; + int i; + + if (offset > sizeof(*lapic)) { + *data = 0; + return 0; + } + + offset &= ~3; + switch(offset) + { + case APIC_OFFSET_ID: + *data = lapic->id; + break; + case APIC_OFFSET_VER: + *data = lapic->version; + break; + case APIC_OFFSET_TPR: + *data = lapic->tpr; + break; + case APIC_OFFSET_APR: + *data = lapic->apr; + break; + case APIC_OFFSET_PPR: + *data = lapic->ppr; + break; + case APIC_OFFSET_EOI: + *data = lapic->eoi; + break; + case APIC_OFFSET_LDR: + *data = lapic->ldr; + break; + case APIC_OFFSET_DFR: + *data = lapic->dfr; + break; + case APIC_OFFSET_SVR: + *data = lapic->svr; + break; + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + i = (offset - APIC_OFFSET_ISR0) >> 2; + reg = &lapic->isr0; + *data = *(reg + i); + break; + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + i = (offset - APIC_OFFSET_TMR0) >> 2; + reg = &lapic->tmr0; + *data = *(reg + i); + break; + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + i = (offset - APIC_OFFSET_IRR0) >> 2; + reg = &lapic->irr0; + *data = atomic_load_acq_int(reg + i); + break; + case APIC_OFFSET_ESR: + *data = lapic->esr; + break; + case APIC_OFFSET_ICR_LOW: + *data = lapic->icr_lo; + break; + case APIC_OFFSET_ICR_HI: + *data = lapic->icr_hi; + break; + case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: + reg = vlapic_get_lvt(vlapic, offset); + *data = *(reg); + break; + case APIC_OFFSET_ICR: + *data = lapic->icr_timer; + break; + case APIC_OFFSET_CCR: + *data = vlapic_get_ccr(vlapic); + break; + case APIC_OFFSET_DCR: + *data = lapic->dcr_timer; + break; + case APIC_OFFSET_RRR: + default: + *data = 0; + break; + } + return 0; +} + +int +vlapic_op_mem_write(void* dev, uint64_t gpa, opsize_t size, uint64_t data) +{ + struct vlapic *vlapic = (struct vlapic*)dev; + struct LAPIC *lapic = &vlapic->apic; + uint64_t offset = gpa & ~(PAGE_SIZE); + uint32_t *reg; + int retval; + + if (offset > sizeof(*lapic)) { + return 0; + } + + retval = 0; + offset &= ~3; + switch(offset) + { + case APIC_OFFSET_ID: + lapic->id = data; + break; + case APIC_OFFSET_TPR: + lapic->tpr = data & 0xff; + vlapic_update_ppr(vlapic); + break; + case APIC_OFFSET_EOI: + vlapic_process_eoi(vlapic); + break; + case APIC_OFFSET_LDR: + break; + case APIC_OFFSET_DFR: + break; + case APIC_OFFSET_SVR: + lapic->svr = data; + break; + case APIC_OFFSET_ICR_LOW: + retval = lapic_process_icr(vlapic, data); + break; + case APIC_OFFSET_TIMER_LVT ... 
APIC_OFFSET_ERROR_LVT: + reg = vlapic_get_lvt(vlapic, offset); + if (!(lapic->svr & APIC_SVR_ENABLE)) { + data |= APIC_LVT_M; + } + *reg = data; + // vlapic_dump_lvt(offset, reg); + break; + case APIC_OFFSET_ICR: + lapic->icr_timer = data; + vlapic_start_timer(vlapic, 0); + break; + + case APIC_OFFSET_DCR: + lapic->dcr_timer = data; + vlapic->divisor = vlapic_timer_divisor(data); + break; + + case APIC_OFFSET_ESR: + vlapic_update_errors(vlapic); + break; + case APIC_OFFSET_VER: + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + case APIC_OFFSET_CCR: + default: + // Read only. + break; + } + + return (retval); +} + +void +vlapic_timer_tick(struct vlapic *vlapic) +{ + int curticks, delta, periodic; + uint32_t ccr; + uint32_t decrement, remainder; + + curticks = ticks; + + /* Common case */ + delta = curticks - vlapic->ccr_ticks; + if (delta == 0) + return; + + /* Local APIC timer is disabled */ + if (vlapic->apic.icr_timer == 0) + return; + + /* One-shot mode and timer has already counted down to zero */ + periodic = vlapic_periodic_timer(vlapic); + if (!periodic && vlapic->apic.ccr_timer == 0) + return; + /* + * The 'curticks' and 'ccr_ticks' are out of sync by more than + * 2^31 ticks. We deal with this by restarting the timer. + */ + if (delta < 0) { + vlapic_start_timer(vlapic, 0); + return; + } + + ccr = vlapic->apic.ccr_timer; + decrement = (VLAPIC_BUS_FREQ / vlapic->divisor) / hz; + while (delta-- > 0) { + if (ccr <= decrement) { + remainder = decrement - ccr; + vlapic_fire_timer(vlapic); + if (periodic) { + vlapic_start_timer(vlapic, remainder); + ccr = vlapic->apic.ccr_timer; + } else { + /* + * One-shot timer has counted down to zero. + */ + ccr = 0; + break; + } + } else + ccr -= decrement; + } + + vlapic->ccr_ticks = curticks; + vlapic->apic.ccr_timer = ccr; +} + +struct vdev_ops vlapic_dev_ops = { + .name = "vlapic", + .init = vlapic_op_init, + .reset = vlapic_op_reset, + .halt = vlapic_op_halt, + .memread = vlapic_op_mem_read, + .memwrite = vlapic_op_mem_write, +}; +static struct io_region vlapic_mmio[VM_MAXCPU]; + +struct vlapic * +vlapic_init(struct vm *vm, int vcpuid) +{ + struct vlapic *vlapic; + + vlapic = malloc(sizeof(struct vlapic), M_VLAPIC, M_WAITOK | M_ZERO); + vlapic->vm = vm; + vlapic->vcpuid = vcpuid; + vlapic->ops = &vlapic_dev_ops; + + vlapic->mmio = vlapic_mmio + vcpuid; + vlapic->mmio->base = DEFAULT_APIC_BASE; + vlapic->mmio->len = PAGE_SIZE; + vlapic->mmio->attr = MMIO_READ|MMIO_WRITE; + vlapic->mmio->vcpu = vcpuid; + + vdev_register(&vlapic_dev_ops, vlapic); + + vlapic_op_init(vlapic); + + return (vlapic); +} + +void +vlapic_cleanup(struct vlapic *vlapic) +{ + vdev_unregister(vlapic); + free(vlapic, M_VLAPIC); +} diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h new file mode 100644 index 000000000000..861ea8c9c380 --- /dev/null +++ b/sys/amd64/vmm/io/vlapic.h @@ -0,0 +1,105 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VLAPIC_H_
+#define	_VLAPIC_H_
+
+#include "vdev.h"
+
+struct vm;
+
+/*
+ * Map of APIC Registers:	Offset	Description			Access
+ */
+#define	APIC_OFFSET_ID		0x20	// Local APIC ID		R/W
+#define	APIC_OFFSET_VER		0x30	// Local APIC Version		R
+#define	APIC_OFFSET_TPR		0x80	// Task Priority Register	R/W
+#define	APIC_OFFSET_APR		0x90	// Arbitration Priority Register	R
+#define	APIC_OFFSET_PPR		0xA0	// Processor Priority Register	R
+#define	APIC_OFFSET_EOI		0xB0	// EOI Register			W
+#define	APIC_OFFSET_RRR		0xC0	// Remote read			R
+#define	APIC_OFFSET_LDR		0xD0	// Logical Destination		R/W
+#define	APIC_OFFSET_DFR		0xE0	// Destination Format Register	0..27 R; 28..31 R/W
+#define	APIC_OFFSET_SVR		0xF0	// Spurious Interrupt Vector Reg. 0..3 R; 4..9 R/W
+#define	APIC_OFFSET_ISR0	0x100	// ISR 000-031			R
+#define	APIC_OFFSET_ISR1	0x110	// ISR 032-063			R
+#define	APIC_OFFSET_ISR2	0x120	// ISR 064-095			R
+#define	APIC_OFFSET_ISR3	0x130	// ISR 096-127			R
+#define	APIC_OFFSET_ISR4	0x140	// ISR 128-159			R
+#define	APIC_OFFSET_ISR5	0x150	// ISR 160-191			R
+#define	APIC_OFFSET_ISR6	0x160	// ISR 192-223			R
+#define	APIC_OFFSET_ISR7	0x170	// ISR 224-255			R
+#define	APIC_OFFSET_TMR0	0x180	// TMR 000-031			R
+#define	APIC_OFFSET_TMR1	0x190	// TMR 032-063			R
+#define	APIC_OFFSET_TMR2	0x1A0	// TMR 064-095			R
+#define	APIC_OFFSET_TMR3	0x1B0	// TMR 096-127			R
+#define	APIC_OFFSET_TMR4	0x1C0	// TMR 128-159			R
+#define	APIC_OFFSET_TMR5	0x1D0	// TMR 160-191			R
+#define	APIC_OFFSET_TMR6	0x1E0	// TMR 192-223			R
+#define	APIC_OFFSET_TMR7	0x1F0	// TMR 224-255			R
+#define	APIC_OFFSET_IRR0	0x200	// IRR 000-031			R
+#define	APIC_OFFSET_IRR1	0x210	// IRR 032-063			R
+#define	APIC_OFFSET_IRR2	0x220	// IRR 064-095			R
+#define	APIC_OFFSET_IRR3	0x230	// IRR 096-127			R
+#define	APIC_OFFSET_IRR4	0x240	// IRR 128-159			R
+#define	APIC_OFFSET_IRR5	0x250	// IRR 160-191			R
+#define	APIC_OFFSET_IRR6	0x260	// IRR 192-223			R
+#define	APIC_OFFSET_IRR7	0x270	// IRR 224-255			R
+#define	APIC_OFFSET_ESR		0x280	// Error Status Register	R
+#define	APIC_OFFSET_ICR_LOW	0x300	// Interrupt Command Reg. (0-31)	R/W
+#define	APIC_OFFSET_ICR_HI	0x310	// Interrupt Command Reg.
(32-63) R/W +#define APIC_OFFSET_TIMER_LVT 0x320 // Local Vector Table (Timer) R/W +#define APIC_OFFSET_THERM_LVT 0x330 // Local Vector Table (Thermal) R/W (PIV+) +#define APIC_OFFSET_PERF_LVT 0x340 // Local Vector Table (Performance) R/W (P6+) +#define APIC_OFFSET_LINT0_LVT 0x350 // Local Vector Table (LINT0) R/W +#define APIC_OFFSET_LINT1_LVT 0x360 // Local Vector Table (LINT1) R/W +#define APIC_OFFSET_ERROR_LVT 0x370 // Local Vector Table (ERROR) R/W +#define APIC_OFFSET_ICR 0x380 // Initial Count Reg. for Timer R/W +#define APIC_OFFSET_CCR 0x390 // Current Count of Timer R +#define APIC_OFFSET_DCR 0x3E0 // Timer Divide Configuration Reg. R/W + +/* + * 16 priority levels with at most one vector injected per level. + */ +#define ISRVEC_STK_SIZE (16 + 1) + +struct vlapic *vlapic_init(struct vm *vm, int vcpuid); +void vlapic_cleanup(struct vlapic *vlapic); + +int vlapic_op_mem_write(void* dev, uint64_t gpa, + opsize_t size, uint64_t data); + +int vlapic_op_mem_read(void* dev, uint64_t gpa, + opsize_t size, uint64_t *data); + +int vlapic_pending_intr(struct vlapic *vlapic); +void vlapic_intr_accepted(struct vlapic *vlapic, int vector); +void vlapic_set_intr_ready(struct vlapic *vlapic, int vector); +void vlapic_timer_tick(struct vlapic *vlapic); + +#endif /* _VLAPIC_H_ */ diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c new file mode 100644 index 000000000000..c93c31e772e8 --- /dev/null +++ b/sys/amd64/vmm/vmm.c @@ -0,0 +1,737 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/pcpu.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/sched.h> +#include <sys/smp.h> +#include <sys/systm.h> + +#include <vm/vm.h> + +#include <machine/vm.h> +#include <machine/pcb.h> +#include <machine/apicreg.h> + +#include <machine/vmm.h> +#include "vmm_mem.h" +#include "vmm_util.h" +#include <machine/vmm_dev.h> +#include "vlapic.h" +#include "vmm_msr.h" +#include "vmm_ipi.h" +#include "vmm_stat.h" + +#include "io/ppt.h" +#include "io/iommu.h" + +struct vlapic; + +struct vcpu { + int flags; + int pincpu; /* host cpuid this vcpu is bound to */ + int hostcpu; /* host cpuid this vcpu last ran on */ + uint64_t guest_msrs[VMM_MSR_NUM]; + struct vlapic *vlapic; + int vcpuid; + struct savefpu savefpu; /* guest fpu state */ + void *stats; +}; +#define VCPU_F_PINNED 0x0001 +#define VCPU_F_RUNNING 0x0002 + +#define VCPU_PINCPU(vm, vcpuid) \ + ((vm->vcpu[vcpuid].flags & VCPU_F_PINNED) ? vm->vcpu[vcpuid].pincpu : -1) + +#define VCPU_UNPIN(vm, vcpuid) (vm->vcpu[vcpuid].flags &= ~VCPU_F_PINNED) + +#define VCPU_PIN(vm, vcpuid, host_cpuid) \ +do { \ + vm->vcpu[vcpuid].flags |= VCPU_F_PINNED; \ + vm->vcpu[vcpuid].pincpu = host_cpuid; \ +} while(0) + +#define VM_MAX_MEMORY_SEGMENTS 2 + +struct vm { + void *cookie; /* processor-specific data */ + void *iommu; /* iommu-specific data */ + struct vcpu vcpu[VM_MAXCPU]; + int num_mem_segs; + struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS]; + char name[VM_MAX_NAMELEN]; + + /* + * Mask of active vcpus. + * An active vcpu is one that has been started implicitly (BSP) or + * explicitly (AP) by sending it a startup ipi. + */ + cpumask_t active_cpus; +}; + +static struct vmm_ops *ops; +#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0) +#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) + +#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL) +#define VMRUN(vmi, vcpu, rip, vmexit) \ + (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, vmexit) : ENXIO) +#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) +#define VMMMAP(vmi, gpa, hpa, len, attr, prot, spm) \ + (ops != NULL ? (*ops->vmmmap)(vmi, gpa, hpa, len, attr, prot, spm) : ENXIO) +#define VMGETREG(vmi, vcpu, num, retval) \ + (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) +#define VMSETREG(vmi, vcpu, num, val) \ + (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO) +#define VMGETDESC(vmi, vcpu, num, desc) \ + (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO) +#define VMSETDESC(vmi, vcpu, num, desc) \ + (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO) +#define VMINJECT(vmi, vcpu, type, vec, ec, ecv) \ + (ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO) +#define VMNMI(vmi, vcpu) \ + (ops != NULL ? (*ops->vmnmi)(vmi, vcpu) : ENXIO) +#define VMGETCAP(vmi, vcpu, num, retval) \ + (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) +#define VMSETCAP(vmi, vcpu, num, val) \ + (ops != NULL ? 
(*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) + +#define fxrstor(addr) __asm("fxrstor %0" : : "m" (*(addr))) +#define fxsave(addr) __asm __volatile("fxsave %0" : "=m" (*(addr))) +#define fpu_start_emulating() __asm("smsw %%ax; orb %0,%%al; lmsw %%ax" \ + : : "n" (CR0_TS) : "ax") +#define fpu_stop_emulating() __asm("clts") + +static MALLOC_DEFINE(M_VM, "vm", "vm"); +CTASSERT(VMM_MSR_NUM <= 64); /* msr_mask can keep track of up to 64 msrs */ + +/* statistics */ +static VMM_STAT_DEFINE(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); + +static void +vcpu_cleanup(struct vcpu *vcpu) +{ + vlapic_cleanup(vcpu->vlapic); + vmm_stat_free(vcpu->stats); +} + +static void +vcpu_init(struct vm *vm, uint32_t vcpu_id) +{ + struct vcpu *vcpu; + + vcpu = &vm->vcpu[vcpu_id]; + + vcpu->hostcpu = -1; + vcpu->vcpuid = vcpu_id; + vcpu->vlapic = vlapic_init(vm, vcpu_id); + fpugetregs(curthread, &vcpu->savefpu); + vcpu->stats = vmm_stat_alloc(); +} + +static int +vmm_init(void) +{ + int error; + + vmm_ipi_init(); + + error = vmm_mem_init(); + if (error) + return (error); + + if (vmm_is_intel()) + ops = &vmm_ops_intel; + else if (vmm_is_amd()) + ops = &vmm_ops_amd; + else + return (ENXIO); + + vmm_msr_init(); + + return (VMM_INIT()); +} + +static int +vmm_handler(module_t mod, int what, void *arg) +{ + int error; + + switch (what) { + case MOD_LOAD: + vmmdev_init(); + iommu_init(); + error = vmm_init(); + break; + case MOD_UNLOAD: + vmmdev_cleanup(); + iommu_cleanup(); + vmm_ipi_cleanup(); + error = VMM_CLEANUP(); + break; + default: + error = 0; + break; + } + return (error); +} + +static moduledata_t vmm_kmod = { + "vmm", + vmm_handler, + NULL +}; + +/* + * Execute the module load handler after the pci passthru driver has had + * a chance to claim devices. We need this information at the time we do + * iommu initialization. 
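+ *
+ * For example, with the DECLARE_MODULE() below at SI_SUB_CONFIGURE + 1,
+ * any pci devices named in the 'pptdevs' tunable have already been
+ * claimed by the passthru driver by the time vmm_handler(MOD_LOAD)
+ * calls iommu_init().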
+ */
+DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_CONFIGURE + 1, SI_ORDER_ANY);
+MODULE_VERSION(vmm, 1);
+
+SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
+
+struct vm *
+vm_create(const char *name)
+{
+	int i;
+	struct vm *vm;
+	vm_paddr_t maxaddr;
+
+	const int BSP = 0;
+
+	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
+		return (NULL);
+
+	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
+	strcpy(vm->name, name);
+	vm->cookie = VMINIT(vm);
+
+	for (i = 0; i < VM_MAXCPU; i++) {
+		vcpu_init(vm, i);
+		guest_msrs_init(vm, i);
+	}
+
+	maxaddr = vmm_mem_maxaddr();
+	vm->iommu = iommu_create_domain(maxaddr);
+	vm_activate_cpu(vm, BSP);
+
+	return (vm);
+}
+
+void
+vm_destroy(struct vm *vm)
+{
+	int i;
+
+	ppt_unassign_all(vm);
+
+	for (i = 0; i < vm->num_mem_segs; i++)
+		vmm_mem_free(vm->mem_segs[i].hpa, vm->mem_segs[i].len);
+
+	for (i = 0; i < VM_MAXCPU; i++)
+		vcpu_cleanup(&vm->vcpu[i]);
+
+	iommu_destroy_domain(vm->iommu);
+
+	VMCLEANUP(vm->cookie);
+
+	free(vm, M_VM);
+}
+
+const char *
+vm_name(struct vm *vm)
+{
+	return (vm->name);
+}
+
+int
+vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
+{
+	const boolean_t spok = TRUE;	/* superpage mappings are ok */
+
+	return (VMMMAP(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE,
+		       VM_PROT_RW, spok));
+}
+
+int
+vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
+{
+	const boolean_t spok = TRUE;	/* superpage mappings are ok */
+
+	return (VMMMAP(vm->cookie, gpa, 0, len, VM_MEMATTR_UNCACHEABLE,
+		       VM_PROT_NONE, spok));
+}
+
+int
+vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t *ret_hpa)
+{
+	int error;
+	vm_paddr_t hpa;
+
+	const boolean_t spok = TRUE;	/* superpage mappings are ok */
+
+	/*
+	 * Find the hpa if it was already vm_malloc'd.
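+	 * A second call with the same 'gpa' and 'len' is idempotent: it
+	 * finds the existing segment and returns its hpa without
+	 * allocating a new one.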
+ */ + hpa = vm_gpa2hpa(vm, gpa, len); + if (hpa != ((vm_paddr_t)-1)) + goto out; + + if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) + return (E2BIG); + + hpa = vmm_mem_alloc(len); + if (hpa == 0) + return (ENOMEM); + + error = VMMMAP(vm->cookie, gpa, hpa, len, VM_MEMATTR_WRITE_BACK, + VM_PROT_ALL, spok); + if (error) { + vmm_mem_free(hpa, len); + return (error); + } + + iommu_create_mapping(vm->iommu, gpa, hpa, len); + + vm->mem_segs[vm->num_mem_segs].gpa = gpa; + vm->mem_segs[vm->num_mem_segs].hpa = hpa; + vm->mem_segs[vm->num_mem_segs].len = len; + vm->num_mem_segs++; +out: + *ret_hpa = hpa; + return (0); +} + +vm_paddr_t +vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len) +{ + int i; + vm_paddr_t gpabase, gpalimit, hpabase; + + for (i = 0; i < vm->num_mem_segs; i++) { + hpabase = vm->mem_segs[i].hpa; + gpabase = vm->mem_segs[i].gpa; + gpalimit = gpabase + vm->mem_segs[i].len; + if (gpa >= gpabase && gpa + len <= gpalimit) + return ((gpa - gpabase) + hpabase); + } + return ((vm_paddr_t)-1); +} + +int +vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, + struct vm_memory_segment *seg) +{ + int i; + + for (i = 0; i < vm->num_mem_segs; i++) { + if (gpabase == vm->mem_segs[i].gpa) { + *seg = vm->mem_segs[i]; + return (0); + } + } + return (-1); +} + +int +vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) +{ + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + return (VMGETREG(vm->cookie, vcpu, reg, retval)); +} + +int +vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val) +{ + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + return (VMSETREG(vm->cookie, vcpu, reg, val)); +} + +static boolean_t +is_descriptor_table(int reg) +{ + + switch (reg) { + case VM_REG_GUEST_IDTR: + case VM_REG_GUEST_GDTR: + return (TRUE); + default: + return (FALSE); + } +} + +static boolean_t +is_segment_register(int reg) +{ + + switch (reg) { + case VM_REG_GUEST_ES: + case VM_REG_GUEST_CS: + case VM_REG_GUEST_SS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_TR: + case VM_REG_GUEST_LDTR: + return (TRUE); + default: + return (FALSE); + } +} + +int +vm_get_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc) +{ + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (!is_segment_register(reg) && !is_descriptor_table(reg)) + return (EINVAL); + + return (VMGETDESC(vm->cookie, vcpu, reg, desc)); +} + +int +vm_set_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (!is_segment_register(reg) && !is_descriptor_table(reg)) + return (EINVAL); + + return (VMSETDESC(vm->cookie, vcpu, reg, desc)); +} + +int +vm_get_pinning(struct vm *vm, int vcpuid, int *cpuid) +{ + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + *cpuid = VCPU_PINCPU(vm, vcpuid); + + return (0); +} + +int +vm_set_pinning(struct vm *vm, int vcpuid, int host_cpuid) +{ + struct thread *td; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + td = curthread; /* XXXSMP only safe when muxing vcpus */ + + /* unpin */ + if (host_cpuid < 0) { + VCPU_UNPIN(vm, vcpuid); + thread_lock(td); + sched_unbind(td); + thread_unlock(td); + return (0); + } + + if (CPU_ABSENT(host_cpuid)) + return (EINVAL); + + /* + * XXX we should check that 'host_cpuid' has not already been pinned + * by another vm. 
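+	 * Nothing enforces that exclusivity yet, so two vms that pin
+	 * vcpus to the same host cpu will simply time-share it via
+	 * sched_bind().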
+ */ + thread_lock(td); + sched_bind(td, host_cpuid); + thread_unlock(td); + VCPU_PIN(vm, vcpuid, host_cpuid); + + return (0); +} + +static void +restore_guest_fpustate(struct vcpu *vcpu) +{ + register_t s; + + s = intr_disable(); + fpu_stop_emulating(); + fxrstor(&vcpu->savefpu); + fpu_start_emulating(); + intr_restore(s); +} + +static void +save_guest_fpustate(struct vcpu *vcpu) +{ + register_t s; + + s = intr_disable(); + fpu_stop_emulating(); + fxsave(&vcpu->savefpu); + fpu_start_emulating(); + intr_restore(s); +} + +int +vm_run(struct vm *vm, struct vm_run *vmrun) +{ + int error, vcpuid; + struct vcpu *vcpu; + struct pcb *pcb; + uint64_t tscval; + + vcpuid = vmrun->cpuid; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + critical_enter(); + + tscval = rdtsc(); + + pcb = PCPU_GET(curpcb); + pcb->pcb_full_iret = 1; + + vcpu->hostcpu = curcpu; + + fpuexit(curthread); + restore_guest_msrs(vm, vcpuid); + restore_guest_fpustate(vcpu); + error = VMRUN(vm->cookie, vcpuid, vmrun->rip, &vmrun->vm_exit); + save_guest_fpustate(vcpu); + restore_host_msrs(vm, vcpuid); + + vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); + + critical_exit(); + + return (error); +} + +int +vm_inject_event(struct vm *vm, int vcpuid, int type, + int vector, uint32_t code, int code_valid) +{ + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0) + return (EINVAL); + + if (vector < 0 || vector > 255) + return (EINVAL); + + return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid)); +} + +int +vm_inject_nmi(struct vm *vm, int vcpu) +{ + int error; + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + error = VMNMI(vm->cookie, vcpu); + vm_interrupt_hostcpu(vm, vcpu); + return (error); +} + +int +vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + return (VMGETCAP(vm->cookie, vcpu, type, retval)); +} + +int +vm_set_capability(struct vm *vm, int vcpu, int type, int val) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + return (VMSETCAP(vm->cookie, vcpu, type, val)); +} + +uint64_t * +vm_guest_msrs(struct vm *vm, int cpu) +{ + return (vm->vcpu[cpu].guest_msrs); +} + +struct vlapic * +vm_lapic(struct vm *vm, int cpu) +{ + return (vm->vcpu[cpu].vlapic); +} + +boolean_t +vmm_is_pptdev(int bus, int slot, int func) +{ + int found, b, s, f, n; + char *val, *cp, *cp2; + + /* + * setenv pptdevs "1/2/3 4/5/6 7/8/9 10/11/12" + */ + found = 0; + cp = val = getenv("pptdevs"); + while (cp != NULL && *cp != '\0') { + if ((cp2 = strchr(cp, ' ')) != NULL) + *cp2 = '\0'; + + n = sscanf(cp, "%d/%d/%d", &b, &s, &f); + if (n == 3 && bus == b && slot == s && func == f) { + found = 1; + break; + } + + if (cp2 != NULL) + *cp2++ = ' '; + + cp = cp2; + } + freeenv(val); + return (found); +} + +void * +vm_iommu_domain(struct vm *vm) +{ + + return (vm->iommu); +} + +void +vm_set_run_state(struct vm *vm, int vcpuid, int state) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_set_run_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + if (state == VCPU_RUNNING) { + if (vcpu->flags & VCPU_F_RUNNING) { + panic("vm_set_run_state: %s[%d] is already running", + vm_name(vm), vcpuid); + } + vcpu->flags |= VCPU_F_RUNNING; + } else { + if ((vcpu->flags & 
VCPU_F_RUNNING) == 0) { + panic("vm_set_run_state: %s[%d] is already stopped", + vm_name(vm), vcpuid); + } + vcpu->flags &= ~VCPU_F_RUNNING; + } +} + +int +vm_get_run_state(struct vm *vm, int vcpuid, int *cpuptr) +{ + int retval, hostcpu; + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_get_run_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + if (vcpu->flags & VCPU_F_RUNNING) { + retval = VCPU_RUNNING; + hostcpu = vcpu->hostcpu; + } else { + retval = VCPU_STOPPED; + hostcpu = -1; + } + + if (cpuptr) + *cpuptr = hostcpu; + + return (retval); +} + +void +vm_activate_cpu(struct vm *vm, int vcpuid) +{ + + if (vcpuid >= 0 && vcpuid < VM_MAXCPU) + vm->active_cpus |= vcpu_mask(vcpuid); +} + +cpumask_t +vm_active_cpus(struct vm *vm) +{ + + return (vm->active_cpus); +} + +void * +vcpu_stats(struct vm *vm, int vcpuid) +{ + + return (vm->vcpu[vcpuid].stats); +} diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c new file mode 100644 index 000000000000..cf443fc1235c --- /dev/null +++ b/sys/amd64/vmm/vmm_dev.c @@ -0,0 +1,468 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/queue.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/malloc.h> +#include <sys/conf.h> +#include <sys/sysctl.h> +#include <sys/libkern.h> +#include <sys/ioccom.h> +#include <sys/mman.h> +#include <sys/uio.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/pmap.h> +#include <machine/vmparam.h> + +#include <machine/vmm.h> +#include "vmm_lapic.h" +#include "vmm_stat.h" +#include "io/ppt.h" +#include <machine/vmm_dev.h> + +struct vmmdev_softc { + struct vm *vm; /* vm instance cookie */ + struct cdev *cdev; + SLIST_ENTRY(vmmdev_softc) link; +}; +static SLIST_HEAD(, vmmdev_softc) head; + +static struct mtx vmmdev_mtx; + +static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); + +SYSCTL_DECL(_hw_vmm); + +static struct vmmdev_softc * +vmmdev_lookup(const char *name) +{ + struct vmmdev_softc *sc; + +#ifdef notyet /* XXX kernel is not compiled with invariants */ + mtx_assert(&vmmdev_mtx, MA_OWNED); +#endif + + SLIST_FOREACH(sc, &head, link) { + if (strcmp(name, vm_name(sc->vm)) == 0) + break; + } + + return (sc); +} + +static struct vmmdev_softc * +vmmdev_lookup2(struct cdev *cdev) +{ + struct vmmdev_softc *sc; + +#ifdef notyet /* XXX kernel is not compiled with invariants */ + mtx_assert(&vmmdev_mtx, MA_OWNED); +#endif + + SLIST_FOREACH(sc, &head, link) { + if (sc->cdev == cdev) + break; + } + + return (sc); +} + +static int +vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) +{ + int error, off, c; + vm_paddr_t hpa, gpa; + struct vmmdev_softc *sc; + + static char zerobuf[PAGE_SIZE]; + + error = 0; + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup2(cdev); + + while (uio->uio_resid > 0 && error == 0) { + gpa = uio->uio_offset; + off = gpa & PAGE_MASK; + c = min(uio->uio_resid, PAGE_SIZE - off); + + /* + * The VM has a hole in its physical memory map. If we want to + * use 'dd' to inspect memory beyond the hole we need to + * provide bogus data for memory that lies in the hole. + * + * Since this device does not support lseek(2), dd(1) will + * read(2) blocks of data to simulate the lseek(2). + */ + hpa = vm_gpa2hpa(sc->vm, gpa, c); + if (hpa == (vm_paddr_t)-1) { + if (uio->uio_rw == UIO_READ) + error = uiomove(zerobuf, c, uio); + else + error = EFAULT; + } else + error = uiomove((void *)PHYS_TO_DMAP(hpa), c, uio); + } + + mtx_unlock(&vmmdev_mtx); + return (error); +} + +static int +vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, + struct thread *td) +{ + int error, vcpu; + struct vmmdev_softc *sc; + struct vm_memory_segment *seg; + struct vm_register *vmreg; + struct vm_seg_desc* vmsegdesc; + struct vm_pin *vmpin; + struct vm_run *vmrun; + struct vm_event *vmevent; + struct vm_lapic_irq *vmirq; + struct vm_capability *vmcap; + struct vm_pptdev *pptdev; + struct vm_pptdev_mmio *pptmmio; + struct vm_pptdev_msi *pptmsi; + struct vm_nmi *vmnmi; + struct vm_stats *vmstats; + struct vm_stat_desc *statdesc; + + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup2(cdev); + if (sc == NULL) { + mtx_unlock(&vmmdev_mtx); + return (ENXIO); + } + + /* + * Some VMM ioctls can operate only on vcpus that are not running. 
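+	 * Each such ioctl is expected to carry the target vcpu as the
+	 * first field of its argument structure; if that vcpu is inside
+	 * vm_run() the ioctl fails with EBUSY.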
+ */ + switch (cmd) { + case VM_RUN: + case VM_SET_PINNING: + case VM_GET_REGISTER: + case VM_SET_REGISTER: + case VM_GET_SEGMENT_DESCRIPTOR: + case VM_SET_SEGMENT_DESCRIPTOR: + case VM_INJECT_EVENT: + case VM_GET_CAPABILITY: + case VM_SET_CAPABILITY: + case VM_PPTDEV_MSI: + /* + * XXX fragile, handle with care + * Assumes that the first field of the ioctl data is the vcpu. + */ + vcpu = *(int *)data; + if (vcpu < 0 || vcpu >= VM_MAXCPU) { + error = EINVAL; + goto done; + } + + if (vcpu_is_running(sc->vm, vcpu, NULL)) { + error = EBUSY; + goto done; + } + break; + default: + break; + } + + switch(cmd) { + case VM_RUN: + vmrun = (struct vm_run *)data; + + vm_set_run_state(sc->vm, vmrun->cpuid, VCPU_RUNNING); + mtx_unlock(&vmmdev_mtx); + + error = vm_run(sc->vm, vmrun); + + mtx_lock(&vmmdev_mtx); + vm_set_run_state(sc->vm, vmrun->cpuid, VCPU_STOPPED); + break; + case VM_STAT_DESC: { + const char *desc; + statdesc = (struct vm_stat_desc *)data; + desc = vmm_stat_desc(statdesc->index); + if (desc != NULL) { + error = 0; + strlcpy(statdesc->desc, desc, sizeof(statdesc->desc)); + } else + error = EINVAL; + break; + } + case VM_STATS: { + CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_TYPES); + vmstats = (struct vm_stats *)data; + getmicrotime(&vmstats->tv); + error = vmm_stat_copy(sc->vm, vmstats->cpuid, + &vmstats->num_entries, vmstats->statbuf); + break; + } + case VM_PPTDEV_MSI: + pptmsi = (struct vm_pptdev_msi *)data; + error = ppt_setup_msi(sc->vm, pptmsi->vcpu, + pptmsi->bus, pptmsi->slot, pptmsi->func, + pptmsi->destcpu, pptmsi->vector, + pptmsi->numvec); + break; + case VM_MAP_PPTDEV_MMIO: + pptmmio = (struct vm_pptdev_mmio *)data; + error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot, + pptmmio->func, pptmmio->gpa, pptmmio->len, + pptmmio->hpa); + break; + case VM_BIND_PPTDEV: + pptdev = (struct vm_pptdev *)data; + error = ppt_assign_device(sc->vm, pptdev->bus, pptdev->slot, + pptdev->func); + break; + case VM_UNBIND_PPTDEV: + pptdev = (struct vm_pptdev *)data; + error = ppt_unassign_device(sc->vm, pptdev->bus, pptdev->slot, + pptdev->func); + break; + case VM_INJECT_EVENT: + vmevent = (struct vm_event *)data; + error = vm_inject_event(sc->vm, vmevent->cpuid, vmevent->type, + vmevent->vector, + vmevent->error_code, + vmevent->error_code_valid); + break; + case VM_INJECT_NMI: + vmnmi = (struct vm_nmi *)data; + error = vm_inject_nmi(sc->vm, vmnmi->cpuid); + break; + case VM_LAPIC_IRQ: + vmirq = (struct vm_lapic_irq *)data; + error = lapic_set_intr(sc->vm, vmirq->cpuid, vmirq->vector); + break; + case VM_SET_PINNING: + vmpin = (struct vm_pin *)data; + error = vm_set_pinning(sc->vm, vmpin->vm_cpuid, + vmpin->host_cpuid); + break; + case VM_GET_PINNING: + vmpin = (struct vm_pin *)data; + error = vm_get_pinning(sc->vm, vmpin->vm_cpuid, + &vmpin->host_cpuid); + break; + case VM_MAP_MEMORY: + seg = (struct vm_memory_segment *)data; + error = vm_malloc(sc->vm, seg->gpa, seg->len, &seg->hpa); + break; + case VM_GET_MEMORY_SEG: + seg = (struct vm_memory_segment *)data; + seg->hpa = seg->len = 0; + (void)vm_gpabase2memseg(sc->vm, seg->gpa, seg); + error = 0; + break; + case VM_GET_REGISTER: + vmreg = (struct vm_register *)data; + error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum, + &vmreg->regval); + break; + case VM_SET_REGISTER: + vmreg = (struct vm_register *)data; + error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum, + vmreg->regval); + break; + case VM_SET_SEGMENT_DESCRIPTOR: + vmsegdesc = (struct vm_seg_desc *)data; + error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid, + 
vmsegdesc->regnum, + &vmsegdesc->desc); + break; + case VM_GET_SEGMENT_DESCRIPTOR: + vmsegdesc = (struct vm_seg_desc *)data; + error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid, + vmsegdesc->regnum, + &vmsegdesc->desc); + break; + case VM_GET_CAPABILITY: + vmcap = (struct vm_capability *)data; + error = vm_get_capability(sc->vm, vmcap->cpuid, + vmcap->captype, + &vmcap->capval); + break; + case VM_SET_CAPABILITY: + vmcap = (struct vm_capability *)data; + error = vm_set_capability(sc->vm, vmcap->cpuid, + vmcap->captype, + vmcap->capval); + break; + default: + error = ENOTTY; + break; + } +done: + mtx_unlock(&vmmdev_mtx); + + return (error); +} + +static int +vmmdev_mmap(struct cdev *cdev, vm_offset_t offset, vm_paddr_t *paddr, int nprot) +{ + int error; + struct vmmdev_softc *sc; + + error = -1; + mtx_lock(&vmmdev_mtx); + + sc = vmmdev_lookup2(cdev); + if (sc != NULL && (nprot & PROT_EXEC) == 0) { + *paddr = vm_gpa2hpa(sc->vm, (vm_paddr_t)offset, PAGE_SIZE); + if (*paddr != (vm_paddr_t)-1) + error = 0; + } + + mtx_unlock(&vmmdev_mtx); + + return (error); +} + +static void +vmmdev_destroy(struct vmmdev_softc *sc) +{ + +#ifdef notyet /* XXX kernel is not compiled with invariants */ + mtx_assert(&vmmdev_mtx, MA_OWNED); +#endif + + /* + * XXX must stop virtual machine instances that may be still + * running and cleanup their state. + */ + SLIST_REMOVE(&head, sc, vmmdev_softc, link); + destroy_dev(sc->cdev); + vm_destroy(sc->vm); + free(sc, M_VMMDEV); +} + +static int +sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) +{ + int error; + char buf[VM_MAX_NAMELEN]; + struct vmmdev_softc *sc; + + strlcpy(buf, "beavis", sizeof(buf)); + error = sysctl_handle_string(oidp, buf, sizeof(buf), req); + if (error != 0 || req->newptr == NULL) + return (error); + + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup(buf); + if (sc == NULL) { + mtx_unlock(&vmmdev_mtx); + return (EINVAL); + } + vmmdev_destroy(sc); + mtx_unlock(&vmmdev_mtx); + return (0); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW, + NULL, 0, sysctl_vmm_destroy, "A", NULL); + +static struct cdevsw vmmdevsw = { + .d_name = "vmmdev", + .d_version = D_VERSION, + .d_ioctl = vmmdev_ioctl, + .d_mmap = vmmdev_mmap, + .d_read = vmmdev_rw, + .d_write = vmmdev_rw, +}; + +static int +sysctl_vmm_create(SYSCTL_HANDLER_ARGS) +{ + int error; + struct vm *vm; + struct vmmdev_softc *sc; + char buf[VM_MAX_NAMELEN]; + + strlcpy(buf, "beavis", sizeof(buf)); + error = sysctl_handle_string(oidp, buf, sizeof(buf), req); + if (error != 0 || req->newptr == NULL) + return (error); + + mtx_lock(&vmmdev_mtx); + + sc = vmmdev_lookup(buf); + if (sc != NULL) { + mtx_unlock(&vmmdev_mtx); + return (EEXIST); + } + + vm = vm_create(buf); + if (vm == NULL) { + mtx_unlock(&vmmdev_mtx); + return (EINVAL); + } + + sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); + sc->vm = vm; + sc->cdev = make_dev(&vmmdevsw, 0, UID_ROOT, GID_WHEEL, 0600, + "vmm/%s", buf); + sc->cdev->si_drv1 = sc; + SLIST_INSERT_HEAD(&head, sc, link); + + mtx_unlock(&vmmdev_mtx); + return (0); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW, + NULL, 0, sysctl_vmm_create, "A", NULL); + +void +vmmdev_init(void) +{ + mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF); +} + +void +vmmdev_cleanup(void) +{ + struct vmmdev_softc *sc, *sc2; + + mtx_lock(&vmmdev_mtx); + + SLIST_FOREACH_SAFE(sc, &head, link, sc2) + vmmdev_destroy(sc); + + mtx_unlock(&vmmdev_mtx); +} diff --git a/sys/amd64/vmm/vmm_ipi.c b/sys/amd64/vmm/vmm_ipi.c new file mode 100644 index 
000000000000..c8e795bedcc9 --- /dev/null +++ b/sys/amd64/vmm/vmm_ipi.c @@ -0,0 +1,103 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/bus.h> + +#include <machine/intr_machdep.h> +#include <machine/apicvar.h> +#include <machine/segments.h> +#include <machine/md_var.h> +#include <machine/smp.h> + +#include <machine/vmm.h> +#include "vmm_ipi.h" + +extern inthand_t IDTVEC(rsvd), IDTVEC(justreturn); + +/* + * The default is to use the IPI_AST to interrupt a vcpu. + */ +static int ipinum = IPI_AST; + +CTASSERT(APIC_SPURIOUS_INT == 255); + +void +vmm_ipi_init(void) +{ + int idx; + uintptr_t func; + struct gate_descriptor *ip; + + /* + * Search backwards from the highest IDT vector available for use + * as our IPI vector. We install the 'justreturn' handler at that + * vector and use it to interrupt the vcpus. + * + * We do this because the IPI_AST is heavyweight and saves all + * registers in the trapframe. This is overkill for our use case + * which is simply to EOI the interrupt and return. + */ + idx = APIC_SPURIOUS_INT; + while (--idx >= APIC_IPI_INTS) { + ip = &idt[idx]; + func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); + if (func == (uintptr_t)&IDTVEC(rsvd)) { + ipinum = idx; + setidt(ipinum, IDTVEC(justreturn), SDT_SYSIGT, + SEL_KPL, 0); + break; + } + } + + if (ipinum != IPI_AST && bootverbose) { + printf("vmm_ipi_init: installing ipi handler to interrupt " + "vcpus at vector %d\n", ipinum); + } +} + +void +vmm_ipi_cleanup(void) +{ + if (ipinum != IPI_AST) + setidt(ipinum, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); +} + +void +vm_interrupt_hostcpu(struct vm *vm, int vcpu) +{ + int hostcpu; + + if (vcpu_is_running(vm, vcpu, &hostcpu) && hostcpu != curcpu) + ipi_selected((cpumask_t)1 << hostcpu, ipinum); +} diff --git a/sys/amd64/vmm/vmm_ipi.h b/sys/amd64/vmm/vmm_ipi.h new file mode 100644 index 000000000000..7ab94bfdf073 --- /dev/null +++ b/sys/amd64/vmm/vmm_ipi.h @@ -0,0 +1,38 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_IPI_H_ +#define _VMM_IPI_H_ + +struct vm; + +void vmm_ipi_init(void); +void vmm_ipi_cleanup(void); +void vm_interrupt_hostcpu(struct vm *vm, int vcpu); + +#endif diff --git a/sys/amd64/vmm/vmm_ktr.h b/sys/amd64/vmm/vmm_ktr.h new file mode 100644 index 000000000000..e691c61af68e --- /dev/null +++ b/sys/amd64/vmm/vmm_ktr.h @@ -0,0 +1,51 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMM_KTR_H_ +#define _VMM_KTR_H_ + +#include <sys/ktr.h> +#include <sys/pcpu.h> + +#define KTR_VMM KTR_GEN + +#define VMM_CTR0(vm, vcpuid, format) \ +CTR3(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu) + +#define VMM_CTR1(vm, vcpuid, format, p1) \ +CTR4(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \ + (p1)) + +#define VMM_CTR2(vm, vcpuid, format, p1, p2) \ +CTR5(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \ + (p1), (p2)) + +#define VMM_CTR3(vm, vcpuid, format, p1, p2, p3) \ +CTR6(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \ + (p1), (p2), (p3)) +#endif diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c new file mode 100644 index 000000000000..8704fcf4c754 --- /dev/null +++ b/sys/amd64/vmm/vmm_lapic.c @@ -0,0 +1,121 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> + +#include <machine/vmm.h> +#include "vmm_ipi.h" +#include "vmm_lapic.h" +#include "vlapic.h" + +int +lapic_write(struct vm *vm, int cpu, u_int offset, uint64_t val) +{ + int handled; + + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + if (vlapic_op_mem_write(vlapic, offset, DWORD, val) == 0) + handled = 1; + else + handled = 0; + + return (handled); +} + +int +lapic_read(struct vm *vm, int cpu, u_int offset, uint64_t *rv) +{ + int handled; + + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + if (vlapic_op_mem_read(vlapic, offset, DWORD, rv) == 0) + handled = 1; + else + handled = 0; + + return (handled); +} + +int +lapic_pending_intr(struct vm *vm, int cpu) +{ + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + return (vlapic_pending_intr(vlapic)); +} + +void +lapic_intr_accepted(struct vm *vm, int cpu, int vector) +{ + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + vlapic_intr_accepted(vlapic, vector); +} + +int +lapic_set_intr(struct vm *vm, int cpu, int vector) +{ + struct vlapic *vlapic; + + if (cpu < 0 || cpu >= VM_MAXCPU) + return (EINVAL); + + if (vector < 32 || vector > 255) + return (EINVAL); + + vlapic = vm_lapic(vm, cpu); + vlapic_set_intr_ready(vlapic, vector); + + vm_interrupt_hostcpu(vm, cpu); + + return (0); +} + +void +lapic_timer_tick(struct vm *vm, int cpu) +{ + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + vlapic_timer_tick(vlapic); +} diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h new file mode 100644 index 000000000000..815b2f7937b7 --- /dev/null +++ b/sys/amd64/vmm/vmm_lapic.h @@ -0,0 +1,64 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_LAPIC_H_ +#define _VMM_LAPIC_H_ + +struct vm; + +int lapic_write(struct vm *vm, int cpu, u_int offset, uint64_t val); +int lapic_read(struct vm *vm, int cpu, u_int offset, uint64_t *retval); +void lapic_timer_tick(struct vm *vm, int cpu); + +/* + * Returns a vector between 32 and 255 if an interrupt is pending in the + * IRR that can be delivered based on the current state of ISR and TPR. 
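+ * (Roughly: the vector's priority class, vector >> 4, must be higher
+ * than both the task priority class and the class of the highest
+ * vector already in service.)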
+ * + * Note that the vector does not automatically transition to the ISR as a + * result of calling this function. + * + * Returns -1 if there is no eligible vector that can be delivered to the + * guest at this time. + */ +int lapic_pending_intr(struct vm *vm, int cpu); + +/* + * Transition 'vector' from IRR to ISR. This function is called with the + * vector returned by 'lapic_pending_intr()' when the guest is able to + * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that + * block interrupt delivery). + */ +void lapic_intr_accepted(struct vm *vm, int cpu, int vector); + +/* + * Signals to the LAPIC that an interrupt at 'vector' needs to be generated + * to the 'cpu', the state is recorded in IRR. + */ +int lapic_set_intr(struct vm *vm, int cpu, int vector); + +#endif diff --git a/sys/amd64/vmm/vmm_mem.c b/sys/amd64/vmm/vmm_mem.c new file mode 100644 index 000000000000..9ce1e800f323 --- /dev/null +++ b/sys/amd64/vmm/vmm_mem.c @@ -0,0 +1,413 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/linker.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/kernel.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/md_var.h> +#include <machine/metadata.h> +#include <machine/pc/bios.h> +#include <machine/vmparam.h> +#include <machine/pmap.h> + +#include "vmm_util.h" +#include "vmm_mem.h" + +static MALLOC_DEFINE(M_VMM_MEM, "vmm memory", "vmm memory"); + +#define MB (1024 * 1024) +#define GB (1024 * MB) + +#define VMM_MEM_MAXSEGS 64 + +/* protected by vmm_mem_mtx */ +static struct { + vm_paddr_t base; + vm_size_t length; +} vmm_mem_avail[VMM_MEM_MAXSEGS]; + +static int vmm_mem_nsegs; + +static vm_paddr_t maxaddr; + +static struct mtx vmm_mem_mtx; + +/* + * Steal any memory that was deliberately hidden from FreeBSD either by + * the use of MAXMEM kernel config option or the hw.physmem loader tunable. 
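+ *
+ * For example, booting with hw.physmem="4G" in /boot/loader.conf on a
+ * host with more RAM leaves everything above 4GB present in the SMAP
+ * but beyond Maxmem; vmm_mem_steal_memory() below claims those ranges
+ * for guest use.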
+ */ +static int +vmm_mem_steal_memory(void) +{ + int nsegs; + caddr_t kmdp; + uint32_t smapsize; + uint64_t base, length; + struct bios_smap *smapbase, *smap, *smapend; + + /* + * Borrowed from hammer_time() and getmemsize() in machdep.c + */ + kmdp = preload_search_by_type("elf kernel"); + if (kmdp == NULL) + kmdp = preload_search_by_type("elf64 kernel"); + + smapbase = (struct bios_smap *)preload_search_info(kmdp, + MODINFO_METADATA | MODINFOMD_SMAP); + if (smapbase == NULL) + panic("No BIOS smap info from loader!"); + + smapsize = *((uint32_t *)smapbase - 1); + smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); + + nsegs = 0; + for (smap = smapbase; smap < smapend; smap++) { + /* + * XXX + * Assuming non-overlapping, monotonically increasing + * memory segments. + */ + if (smap->type != SMAP_TYPE_MEMORY) + continue; + if (smap->length == 0) + break; + + base = roundup(smap->base, NBPDR); + length = rounddown(smap->length, NBPDR); + + /* Skip this segment if FreeBSD is using all of it. */ + if (base + length <= ptoa(Maxmem)) + continue; + + /* + * If FreeBSD is using part of this segment then adjust + * 'base' and 'length' accordingly. + */ + if (base < ptoa(Maxmem)) { + uint64_t used; + used = roundup(ptoa(Maxmem), NBPDR) - base; + base += used; + length -= used; + } + + if (length == 0) + continue; + + vmm_mem_avail[nsegs].base = base; + vmm_mem_avail[nsegs].length = length; + + if (base + length > maxaddr) + maxaddr = base + length; + + if (0 && bootverbose) { + printf("vmm_mem_populate: index %d, base 0x%0lx, " + "length %ld\n", + nsegs, vmm_mem_avail[nsegs].base, + vmm_mem_avail[nsegs].length); + } + + nsegs++; + if (nsegs >= VMM_MEM_MAXSEGS) { + printf("vmm_mem_populate: maximum number of vmm memory " + "segments reached!\n"); + return (ENOSPC); + } + } + + vmm_mem_nsegs = nsegs; + + return (0); +} + +static void +vmm_mem_direct_map(vm_paddr_t start, vm_paddr_t end) +{ + vm_paddr_t addr, remaining; + int pdpi, pdi, superpage_size; + pml4_entry_t *pml4p; + pdp_entry_t *pdp; + pd_entry_t *pd; + uint64_t page_attr_bits; + + if (end >= NBPML4) + panic("Cannot map memory beyond %ldGB", NBPML4 / GB); + + /* XXX FreeBSD 8.1 does not use 1G superpages in the direct map */ + if (0 && vmm_supports_1G_pages()) + superpage_size = NBPDP; + else + superpage_size = NBPDR; + + /* + * Get the page directory pointer page that contains the direct + * map address mappings. + */ + pml4p = kernel_pmap->pm_pml4; + pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4p[DMPML4I] & ~PAGE_MASK); + + page_attr_bits = PG_RW | PG_V | PG_PS | PG_G; + addr = start; + while (addr < end) { + remaining = end - addr; + pdpi = addr / NBPDP; + if (superpage_size == NBPDP && + remaining >= NBPDP && + addr % NBPDP == 0) { + /* + * If there isn't a mapping for this address then + * create one but if there is one already make sure + * it matches what we expect it to be. 
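+			 * An existing entry must already be the identical
+			 * global, writable superpage mapping of this address;
+			 * anything else causes a panic rather than silently
+			 * aliasing memory.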
+ */ + if (pdp[pdpi] == 0) { + pdp[pdpi] = addr | page_attr_bits; + if (0 && bootverbose) { + printf("vmm_mem_populate: mapping " + "0x%lx with 1GB page at " + "pdpi %d\n", addr, pdpi); + } + } else { + pdp_entry_t pdpe = pdp[pdpi]; + if ((pdpe & ~PAGE_MASK) != addr || + (pdpe & page_attr_bits) != page_attr_bits) { + panic("An invalid mapping 0x%016lx " + "already exists for 0x%016lx\n", + pdpe, addr); + } + } + addr += NBPDP; + } else { + if (remaining < NBPDR) { + panic("vmm_mem_populate: remaining (%ld) must " + "be greater than NBPDR (%d)\n", + remaining, NBPDR); + } + if (pdp[pdpi] == 0) { + /* + * XXX we lose this memory forever because + * we do not keep track of the virtual address + * that would be required to free this page. + */ + pd = malloc(PAGE_SIZE, M_VMM_MEM, + M_WAITOK | M_ZERO); + if ((uintptr_t)pd & PAGE_MASK) { + panic("vmm_mem_populate: page directory" + "page not aligned on %d " + "boundary\n", PAGE_SIZE); + } + pdp[pdpi] = vtophys(pd); + pdp[pdpi] |= PG_RW | PG_V | PG_U; + if (0 && bootverbose) { + printf("Creating page directory " + "at pdp index %d for 0x%016lx\n", + pdpi, addr); + } + } + pdi = (addr % NBPDP) / NBPDR; + pd = (pd_entry_t *)PHYS_TO_DMAP(pdp[pdpi] & ~PAGE_MASK); + + /* + * Create a new mapping if one doesn't already exist + * or validate it if it does. + */ + if (pd[pdi] == 0) { + pd[pdi] = addr | page_attr_bits; + if (0 && bootverbose) { + printf("vmm_mem_populate: mapping " + "0x%lx with 2MB page at " + "pdpi %d, pdi %d\n", + addr, pdpi, pdi); + } + } else { + pd_entry_t pde = pd[pdi]; + if ((pde & ~PAGE_MASK) != addr || + (pde & page_attr_bits) != page_attr_bits) { + panic("An invalid mapping 0x%016lx " + "already exists for 0x%016lx\n", + pde, addr); + } + } + addr += NBPDR; + } + } +} + +static int +vmm_mem_populate(void) +{ + int seg, error; + vm_paddr_t start, end; + + /* populate the vmm_mem_avail[] array */ + error = vmm_mem_steal_memory(); + if (error) + return (error); + + /* + * Now map the memory that was hidden from FreeBSD in + * the direct map VA space. 
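+	 * Each stolen segment is already 2MB aligned (base and length
+	 * were rounded to NBPDR in vmm_mem_steal_memory()), which is what
+	 * lets vmm_mem_direct_map() use superpage mappings throughout.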
+ */ + for (seg = 0; seg < vmm_mem_nsegs; seg++) { + start = vmm_mem_avail[seg].base; + end = start + vmm_mem_avail[seg].length; + if ((start & PDRMASK) != 0 || (end & PDRMASK) != 0) { + panic("start (0x%016lx) and end (0x%016lx) must be " + "aligned on a %dMB boundary\n", + start, end, NBPDR / MB); + } + vmm_mem_direct_map(start, end); + } + + return (0); +} + +int +vmm_mem_init(void) +{ + int error; + + mtx_init(&vmm_mem_mtx, "vmm_mem_mtx", NULL, MTX_DEF); + + error = vmm_mem_populate(); + if (error) + return (error); + + return (0); +} + +vm_paddr_t +vmm_mem_alloc(size_t size) +{ + int i; + vm_paddr_t addr; + + if ((size & PDRMASK) != 0) { + panic("vmm_mem_alloc: size 0x%0lx must be " + "aligned on a 0x%0x boundary\n", size, NBPDR); + } + + addr = 0; + + mtx_lock(&vmm_mem_mtx); + for (i = 0; i < vmm_mem_nsegs; i++) { + if (vmm_mem_avail[i].length >= size) { + addr = vmm_mem_avail[i].base; + vmm_mem_avail[i].base += size; + vmm_mem_avail[i].length -= size; + /* remove a zero length segment */ + if (vmm_mem_avail[i].length == 0) { + memmove(&vmm_mem_avail[i], + &vmm_mem_avail[i + 1], + (vmm_mem_nsegs - (i + 1)) * + sizeof(vmm_mem_avail[0])); + vmm_mem_nsegs--; + } + break; + } + } + mtx_unlock(&vmm_mem_mtx); + + return (addr); +} + +void +vmm_mem_free(vm_paddr_t base, size_t length) +{ + int i; + + if ((base & PDRMASK) != 0 || (length & PDRMASK) != 0) { + panic("vmm_mem_free: base 0x%0lx and length 0x%0lx must be " + "aligned on a 0x%0x boundary\n", base, length, NBPDR); + } + + mtx_lock(&vmm_mem_mtx); + + for (i = 0; i < vmm_mem_nsegs; i++) { + if (vmm_mem_avail[i].base > base) + break; + } + + if (vmm_mem_nsegs >= VMM_MEM_MAXSEGS) + panic("vmm_mem_free: cannot free any more segments"); + + /* Create a new segment at index 'i' */ + memmove(&vmm_mem_avail[i + 1], &vmm_mem_avail[i], + (vmm_mem_nsegs - i) * sizeof(vmm_mem_avail[0])); + + vmm_mem_avail[i].base = base; + vmm_mem_avail[i].length = length; + + vmm_mem_nsegs++; + +coalesce_some_more: + for (i = 0; i < vmm_mem_nsegs - 1; i++) { + if (vmm_mem_avail[i].base + vmm_mem_avail[i].length == + vmm_mem_avail[i + 1].base) { + vmm_mem_avail[i].length += vmm_mem_avail[i + 1].length; + memmove(&vmm_mem_avail[i + 1], &vmm_mem_avail[i + 2], + (vmm_mem_nsegs - (i + 2)) * sizeof(vmm_mem_avail[0])); + vmm_mem_nsegs--; + goto coalesce_some_more; + } + } + + mtx_unlock(&vmm_mem_mtx); +} + +vm_paddr_t +vmm_mem_maxaddr(void) +{ + + return (maxaddr); +} + +void +vmm_mem_dump(void) +{ + int i; + vm_paddr_t base; + vm_size_t length; + + mtx_lock(&vmm_mem_mtx); + for (i = 0; i < vmm_mem_nsegs; i++) { + base = vmm_mem_avail[i].base; + length = vmm_mem_avail[i].length; + printf("%-4d0x%016lx 0x%016lx\n", i, base, base + length); + } + mtx_unlock(&vmm_mem_mtx); +} diff --git a/sys/amd64/vmm/vmm_mem.h b/sys/amd64/vmm/vmm_mem.h new file mode 100644 index 000000000000..ef1bf1aee32b --- /dev/null +++ b/sys/amd64/vmm/vmm_mem.h @@ -0,0 +1,38 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_MEM_H_ +#define _VMM_MEM_H_ + +int vmm_mem_init(void); +vm_paddr_t vmm_mem_alloc(size_t size); +void vmm_mem_free(vm_paddr_t start, size_t size); +vm_paddr_t vmm_mem_maxaddr(void); +void vmm_mem_dump(void); + +#endif diff --git a/sys/amd64/vmm/vmm_msr.c b/sys/amd64/vmm/vmm_msr.c new file mode 100644 index 000000000000..152aa7b17e1b --- /dev/null +++ b/sys/amd64/vmm/vmm_msr.c @@ -0,0 +1,264 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> + +#include <machine/specialreg.h> +#include <machine/apicreg.h> + +#include <machine/vmm.h> +#include "vmm_lapic.h" +#include "vmm_msr.h" + +#define VMM_MSR_F_EMULATE 0x01 +#define VMM_MSR_F_READONLY 0x02 + +struct vmm_msr { + int num; + int flags; + uint64_t hostval; +}; + +static struct vmm_msr vmm_msr[] = { + { MSR_LSTAR, 0 }, + { MSR_CSTAR, 0 }, + { MSR_STAR, 0 }, + { MSR_SF_MASK, 0 }, + { MSR_APICBASE, VMM_MSR_F_EMULATE }, + { MSR_BIOS_SIGN,VMM_MSR_F_EMULATE }, + { MSR_MCG_CAP, VMM_MSR_F_EMULATE | VMM_MSR_F_READONLY }, +}; + +#define vmm_msr_num (sizeof(vmm_msr) / sizeof(vmm_msr[0])) +CTASSERT(VMM_MSR_NUM >= vmm_msr_num); + +#define readonly_msr(idx) \ + ((vmm_msr[(idx)].flags & VMM_MSR_F_READONLY) != 0) + +#define emulated_msr(idx) \ + ((vmm_msr[(idx)].flags & VMM_MSR_F_EMULATE) != 0) + +void +vmm_msr_init(void) +{ + int i; + + for (i = 0; i < vmm_msr_num; i++) { + if (emulated_msr(i)) + continue; + /* + * XXX this assumes that the value of the host msr does not + * change after we have cached it. + */ + vmm_msr[i].hostval = rdmsr(vmm_msr[i].num); + } +} + +void +guest_msrs_init(struct vm *vm, int cpu) +{ + int i; + uint64_t *guest_msrs; + + guest_msrs = vm_guest_msrs(vm, cpu); + + for (i = 0; i < vmm_msr_num; i++) { + switch (vmm_msr[i].num) { + case MSR_LSTAR: + case MSR_CSTAR: + case MSR_STAR: + case MSR_SF_MASK: + case MSR_BIOS_SIGN: + case MSR_MCG_CAP: + guest_msrs[i] = 0; + break; + case MSR_APICBASE: + guest_msrs[i] = DEFAULT_APIC_BASE | APICBASE_ENABLED | + APICBASE_X2APIC; + if (cpu == 0) + guest_msrs[i] |= APICBASE_BSP; + break; + default: + panic("guest_msrs_init: missing initialization for msr " + "0x%0x", vmm_msr[i].num); + } + } +} + +static boolean_t +x2apic_msr(u_int num) +{ + + if (num >= 0x800 && num <= 0xBFF) + return (TRUE); + else + return (FALSE); +} + +static u_int +x2apic_msr_to_regoff(u_int msr) +{ + + return ((msr - 0x800) << 4); +} + +static boolean_t +x2apic_msr_id(u_int num) +{ + return (num == 0x802); +} + +static int +msr_num_to_idx(u_int num) +{ + int i; + + for (i = 0; i < vmm_msr_num; i++) + if (vmm_msr[i].num == num) + return (i); + + return (-1); +} + +int +emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val) +{ + int handled, idx; + uint64_t *guest_msrs; + + handled = 0; + + if (x2apic_msr(num)) + return (lapic_write(vm, cpu, x2apic_msr_to_regoff(num), val)); + + idx = msr_num_to_idx(num); + if (idx < 0) + goto done; + + if (!readonly_msr(idx)) { + guest_msrs = vm_guest_msrs(vm, cpu); + + /* Stash the value */ + guest_msrs[idx] = val; + + /* Update processor state for non-emulated MSRs */ + if (!emulated_msr(idx)) + wrmsr(vmm_msr[idx].num, val); + } + + handled = 1; +done: + return (handled); +} + +int +emulate_rdmsr(struct vm *vm, int cpu, u_int num) +{ + int error, handled, idx; + uint32_t eax, edx; + uint64_t result, *guest_msrs; + + handled = 0; + + if (x2apic_msr(num)) { + handled = lapic_read(vm, cpu, x2apic_msr_to_regoff(num), + &result); + /* + * The version ID needs to be massaged + */ + if (x2apic_msr_id(num)) { + result = result >> 24; + } + goto done; + } + + idx = msr_num_to_idx(num); + if (idx < 0) + goto done; + + guest_msrs = vm_guest_msrs(vm, cpu); + result = guest_msrs[idx]; + + /* + * If this is not an emulated msr register make sure that the processor + * state matches our cached state. 
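+	 * This is purely a consistency check: restore_guest_msrs() loads
+	 * guest_msrs[idx] into the real register before guest entry, so a
+	 * mismatch here indicates a bookkeeping bug.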
+ */ + if (!emulated_msr(idx) && (rdmsr(num) != result)) { + panic("emulate_rdmsr: msr 0x%0x has inconsistent cached " + "(0x%016lx) and actual (0x%016lx) values", num, + result, rdmsr(num)); + } + + handled = 1; + +done: + if (handled) { + eax = result; + edx = result >> 32; + error = vm_set_register(vm, cpu, VM_REG_GUEST_RAX, eax); + if (error) + panic("vm_set_register(rax) error %d", error); + error = vm_set_register(vm, cpu, VM_REG_GUEST_RDX, edx); + if (error) + panic("vm_set_register(rdx) error %d", error); + } + return (handled); +} + +void +restore_guest_msrs(struct vm *vm, int cpu) +{ + int i; + uint64_t *guest_msrs; + + guest_msrs = vm_guest_msrs(vm, cpu); + + for (i = 0; i < vmm_msr_num; i++) { + if (emulated_msr(i)) + continue; + else + wrmsr(vmm_msr[i].num, guest_msrs[i]); + } +} + +void +restore_host_msrs(struct vm *vm, int cpu) +{ + int i; + + for (i = 0; i < vmm_msr_num; i++) { + if (emulated_msr(i)) + continue; + else + wrmsr(vmm_msr[i].num, vmm_msr[i].hostval); + } +} diff --git a/sys/amd64/vmm/vmm_msr.h b/sys/amd64/vmm/vmm_msr.h new file mode 100644 index 000000000000..1e157876e8af --- /dev/null +++ b/sys/amd64/vmm/vmm_msr.h @@ -0,0 +1,42 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_MSR_H_ +#define _VMM_MSR_H_ + +#define VMM_MSR_NUM 16 +struct vm; + +void vmm_msr_init(void); +int emulate_wrmsr(struct vm *vm, int vcpu, u_int msr, uint64_t val); +int emulate_rdmsr(struct vm *vm, int vcpu, u_int msr); +void guest_msrs_init(struct vm *vm, int cpu); +void restore_host_msrs(struct vm *vm, int cpu); +void restore_guest_msrs(struct vm *vm, int cpu); + +#endif diff --git a/sys/amd64/vmm/vmm_stat.c b/sys/amd64/vmm/vmm_stat.c new file mode 100644 index 000000000000..e6f5c48d2d3a --- /dev/null +++ b/sys/amd64/vmm/vmm_stat.c @@ -0,0 +1,103 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> + +#include <machine/vmm.h> +#include "vmm_stat.h" + +static int vstnum; +static struct vmm_stat_type *vsttab[MAX_VMM_STAT_TYPES]; + +static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat"); + +void +vmm_stat_init(void *arg) +{ + struct vmm_stat_type *vst = arg; + + /* We require all stats to identify themselves with a description */ + if (vst->desc == NULL) + return; + + if (vstnum >= MAX_VMM_STAT_TYPES) { + printf("Cannot accomodate vmm stat type \"%s\"!\n", vst->desc); + return; + } + + vst->index = vstnum; + vsttab[vstnum++] = vst; +} + +int +vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf) +{ + int i; + uint64_t *stats; + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + stats = vcpu_stats(vm, vcpu); + for (i = 0; i < vstnum; i++) + buf[i] = stats[i]; + *num_stats = vstnum; + return (0); +} + +void * +vmm_stat_alloc(void) +{ + u_long size; + + size = vstnum * sizeof(uint64_t); + + return (malloc(size, M_VMM_STAT, M_ZERO | M_WAITOK)); +} + +void +vmm_stat_free(void *vp) +{ + free(vp, M_VMM_STAT); +} + +const char * +vmm_stat_desc(int index) +{ + + if (index >= 0 && index < vstnum) + return (vsttab[index]->desc); + else + return (NULL); +} diff --git a/sys/amd64/vmm/vmm_stat.h b/sys/amd64/vmm/vmm_stat.h new file mode 100644 index 000000000000..7c075a6a7602 --- /dev/null +++ b/sys/amd64/vmm/vmm_stat.h @@ -0,0 +1,71 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_STAT_H_ +#define _VMM_STAT_H_ + +struct vm; + +#define MAX_VMM_STAT_TYPES 64 /* arbitrary */ + +struct vmm_stat_type { + const char *desc; /* description of statistic */ + int index; /* position in the stats buffer */ +}; + +void vmm_stat_init(void *arg); + +#define VMM_STAT_DEFINE(type, desc) \ + struct vmm_stat_type type[1] = { \ + { desc, -1 } \ + }; \ + SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_init, type) + +void *vmm_stat_alloc(void); +void vmm_stat_free(void *vp); + +/* + * 'buf' should be at least fit 'MAX_VMM_STAT_TYPES' entries + */ +int vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf); +const char *vmm_stat_desc(int index); + +static void __inline +vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x) +{ +#ifdef VMM_KEEP_STATS + uint64_t *stats = vcpu_stats(vm, vcpu); + if (vst->index >= 0) + stats[vst->index] += x; +#endif +} + +#endif diff --git a/sys/amd64/vmm/vmm_support.S b/sys/amd64/vmm/vmm_support.S new file mode 100644 index 000000000000..2afc608ae71e --- /dev/null +++ b/sys/amd64/vmm/vmm_support.S @@ -0,0 +1,42 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
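To illustrate the statistics interface defined in vmm_stat.h above: VMM_STAT_DEFINE registers a stat type with vmm_stat_init() at module load via SYSINIT, and vmm_stat_incr() bumps the per-vcpu slot, but only when the kernel is built with -DVMM_KEEP_STATS. A minimal sketch, with the counter name VMEXIT_COUNT invented for illustration:

	/* Hypothetical stat; registered at SI_SUB_KLD time by the SYSINIT above */
	VMM_STAT_DEFINE(VMEXIT_COUNT, "total number of vm exits");

	static void
	count_exit(struct vm *vm, int vcpu)
	{
		/* Compiles to a no-op unless VMM_KEEP_STATS is defined */
		vmm_stat_incr(vm, vcpu, VMEXIT_COUNT, 1);
	}
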
+ * + * $FreeBSD$ + */ + +#define LOCORE + +#include <machine/asmacros.h> + +#define LA_EOI 0xB0 + + .text + SUPERALIGN_TEXT +IDTVEC(justreturn) + pushq %rax + movq lapic, %rax + movl $0, LA_EOI(%rax) + popq %rax + iretq diff --git a/sys/amd64/vmm/vmm_util.c b/sys/amd64/vmm/vmm_util.c new file mode 100644 index 000000000000..f245f922120f --- /dev/null +++ b/sys/amd64/vmm/vmm_util.c @@ -0,0 +1,111 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/libkern.h> + +#include <machine/md_var.h> + +#include "vmm_util.h" + +boolean_t +vmm_is_intel(void) +{ + + if (strcmp(cpu_vendor, "GenuineIntel") == 0) + return (TRUE); + else + return (FALSE); +} + +boolean_t +vmm_is_amd(void) +{ + if (strcmp(cpu_vendor, "AuthenticAMD") == 0) + return (TRUE); + else + return (FALSE); +} + +boolean_t +vmm_supports_1G_pages(void) +{ + unsigned int regs[4]; + + /* + * CPUID.80000001:EDX[bit 26] = 1 indicates support for 1GB pages + * + * Both Intel and AMD support this bit. + */ + if (cpu_exthigh >= 0x80000001) { + do_cpuid(0x80000001, regs); + if (regs[3] & (1 << 26)) + return (TRUE); + } + return (FALSE); +} + +#include <sys/proc.h> +#include <machine/frame.h> +#define DUMP_REG(x) printf(#x "\t\t0x%016lx\n", (long)(tf->tf_ ## x)) +#define DUMP_SEG(x) printf(#x "\t\t0x%04x\n", (unsigned)(tf->tf_ ## x)) +void +dump_trapframe(struct trapframe *tf) +{ + DUMP_REG(rdi); + DUMP_REG(rsi); + DUMP_REG(rdx); + DUMP_REG(rcx); + DUMP_REG(r8); + DUMP_REG(r9); + DUMP_REG(rax); + DUMP_REG(rbx); + DUMP_REG(rbp); + DUMP_REG(r10); + DUMP_REG(r11); + DUMP_REG(r12); + DUMP_REG(r13); + DUMP_REG(r14); + DUMP_REG(r15); + DUMP_REG(trapno); + DUMP_REG(addr); + DUMP_REG(flags); + DUMP_REG(err); + DUMP_REG(rip); + DUMP_REG(rflags); + DUMP_REG(rsp); + DUMP_SEG(cs); + DUMP_SEG(ss); + DUMP_SEG(fs); + DUMP_SEG(gs); + DUMP_SEG(es); + DUMP_SEG(ds); +} diff --git a/sys/amd64/vmm/vmm_util.h b/sys/amd64/vmm/vmm_util.h new file mode 100644 index 000000000000..7f82332923a0 --- /dev/null +++ b/sys/amd64/vmm/vmm_util.h @@ -0,0 +1,40 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
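vmm_supports_1G_pages() above keys off CPUID leaf 0x80000001, EDX bit 26, the Page1GB bit that Intel and AMD share. A user-space analogue of the same probe, sketched with the GCC/Clang cpuid.h helper instead of the kernel's do_cpuid():

	#include <cpuid.h>
	#include <stdbool.h>

	static bool
	has_1g_pages(void)
	{
		unsigned int eax, ebx, ecx, edx;

		/* __get_cpuid() returns 0 if the leaf is out of range */
		if (!__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx))
			return (false);
		return ((edx & (1u << 26)) != 0);	/* Page1GB */
	}
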
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_UTIL_H_ +#define _VMM_UTIL_H_ + +struct trapframe; + +boolean_t vmm_is_intel(void); +boolean_t vmm_is_amd(void); +boolean_t vmm_supports_1G_pages(void); + +void dump_trapframe(struct trapframe *tf); + +#endif diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c new file mode 100644 index 000000000000..45c4c53c199a --- /dev/null +++ b/sys/amd64/vmm/x86.c @@ -0,0 +1,113 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include <machine/cpufunc.h> +#include <machine/specialreg.h> + +#include "x86.h" + +int +x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) +{ + unsigned int func, regs[4]; + + func = *eax; + + cpuid_count(*eax, *ecx, regs); + + switch(func) { + case CPUID_0000_0000: + case CPUID_0000_0002: + case CPUID_0000_0003: + case CPUID_0000_0004: + case CPUID_0000_000A: + break; + + case CPUID_8000_0000: + case CPUID_8000_0001: + case CPUID_8000_0002: + case CPUID_8000_0003: + case CPUID_8000_0004: + case CPUID_8000_0006: + case CPUID_8000_0007: + case CPUID_8000_0008: + + break; + + case CPUID_0000_0001: + /* + * Override the APIC ID only in ebx + */ + regs[1] &= ~(CPUID_0000_0001_APICID_MASK); + /* + * XXX fixme for MP case, set apicid properly for cpu. + */ + regs[1] |= (0 << CPUID_0000_0001_APICID_SHIFT); + + /* + * Don't expose VMX capability. + * Advertise x2APIC capability. + */ + regs[2] &= ~CPUID_0000_0001_FEAT0_VMX; + regs[2] |= CPUID2_X2APIC; + + /* + * Machine check handling is done in the host. + * Hide MTRR capability. + */ + regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR); + + break; + + case CPUID_0000_000B: + /* + * XXXSMP fixme + * Processor topology enumeration + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = *ecx & 0xff; + regs[3] = 0; + break; + + default: + return (0); + } + + *eax = regs[0]; + *ebx = regs[1]; + *ecx = regs[2]; + *edx = regs[3]; + return (1); +} + diff --git a/sys/amd64/vmm/x86.h b/sys/amd64/vmm/x86.h new file mode 100644 index 000000000000..bc4f8a45c78a --- /dev/null +++ b/sys/amd64/vmm/x86.h @@ -0,0 +1,62 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
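The leaf-1 filtering in x86_emulate_cpuid() above is plain bit surgery on the host's CPUID values: CPUID.1:ECX bit 5 is VMX and bit 21 is x2APIC, matching CPUID_0000_0001_FEAT0_VMX in x86.h and CPUID2_X2APIC in specialreg.h. The ECX half of that transformation, condensed:

	static uint32_t
	filter_leaf1_ecx(uint32_t host_ecx)
	{
		host_ecx &= ~(1u << 5);		/* VMX: hidden from the guest */
		host_ecx |= (1u << 21);		/* x2APIC: advertised to the guest */
		return (host_ecx);
	}
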
+ * + * $FreeBSD$ + */ + +#ifndef _X86_H_ +#define _X86_H_ + +#define CPUID_0000_0000 (0x0) +#define CPUID_0000_0001 (0x1) +#define CPUID_0000_0002 (0x2) +#define CPUID_0000_0003 (0x3) +#define CPUID_0000_0004 (0x4) +#define CPUID_0000_000A (0xA) +#define CPUID_0000_000B (0xB) +#define CPUID_8000_0000 (0x80000000) +#define CPUID_8000_0001 (0x80000001) +#define CPUID_8000_0002 (0x80000002) +#define CPUID_8000_0003 (0x80000003) +#define CPUID_8000_0004 (0x80000004) +#define CPUID_8000_0006 (0x80000006) +#define CPUID_8000_0007 (0x80000007) +#define CPUID_8000_0008 (0x80000008) + +/* + * CPUID instruction Fn0000_0001: + */ +#define CPUID_0000_0001_APICID_MASK (0xff<<24) +#define CPUID_0000_0001_APICID_SHIFT 24 + +/* + * CPUID instruction Fn0000_0001 ECX + */ +#define CPUID_0000_0001_FEAT0_VMX (1<<5) + +int x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, + uint32_t *edx); + +#endif diff --git a/sys/modules/Makefile b/sys/modules/Makefile index e781da0a33f5..127c3e01a2d3 100644 --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -290,6 +290,7 @@ SUBDIR= ${_3dfx} \ ${_vesa} \ vge \ vkbd \ + ${_vmm} \ ${_vpo} \ vr \ vx \ @@ -557,6 +558,7 @@ _sppp= sppp _tmpfs= tmpfs _twa= twa _vesa= vesa +_vmm= vmm _x86bios= x86bios _wi= wi _wpi= wpi diff --git a/sys/modules/vmm/Makefile b/sys/modules/vmm/Makefile new file mode 100644 index 000000000000..67c5b0f33ccc --- /dev/null +++ b/sys/modules/vmm/Makefile @@ -0,0 +1,66 @@ +# $FreeBSD$ + +# *REQUIRES* binutils 2.20.1 for VT-x instructions +AS= /usr/local/bin/as +LD= /usr/local/bin/ld +CFLAGS+= -B /usr/local/bin + +KMOD= vmm + +SRCS= device_if.h bus_if.h pci_if.h + +CFLAGS+= -DVMM_KEEP_STATS +CFLAGS+= -DOLD_BINUTILS +CFLAGS+= -I${.CURDIR}/../../amd64/vmm +CFLAGS+= -I${.CURDIR}/../../amd64/vmm/io +CFLAGS+= -I${.CURDIR}/../../amd64/vmm/intel + +# generic vmm support +.PATH: ${.CURDIR}/../../amd64/vmm +SRCS+= vmm.c \ + vmm_dev.c \ + vmm_ipi.c \ + vmm_lapic.c \ + vmm_mem.c \ + vmm_msr.c \ + vmm_stat.c \ + vmm_util.c \ + x86.c \ + vmm_support.S + +.PATH: ${.CURDIR}/../../amd64/vmm/io +SRCS+= iommu.c \ + ppt.c \ + vdev.c \ + vlapic.c + +# intel-specific files +.PATH: ${.CURDIR}/../../amd64/vmm/intel +SRCS+= ept.c \ + vmcs.c \ + vmx_msr.c \ + vmx.c \ + vtd.c + +# amd-specific files +.PATH: ${.CURDIR}/../../amd64/vmm/amd +SRCS+= amdv.c + +OBJS= vmx_support.o + +CLEANFILES= vmx_assym.s vmx_genassym.o + +vmx_assym.s: vmx_genassym.o +.if exists(@) +vmx_assym.s: @/kern/genassym.sh +.endif + sh @/kern/genassym.sh vmx_genassym.o > ${.TARGET} + +vmx_support.o: vmx_support.S vmx_assym.s + ${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \ + ${.IMPSRC} -o ${.TARGET} + +vmx_genassym.o: vmx_genassym.c @ machine + ${CC} -c ${CFLAGS:N-fno-common} ${.IMPSRC} + +.include <bsd.kmod.mk> diff --git a/usr.sbin/Makefile b/usr.sbin/Makefile index 44f20a49cee8..fc527b746725 100644 --- a/usr.sbin/Makefile +++ b/usr.sbin/Makefile @@ -19,6 +19,7 @@ SUBDIR= ${_ac} \ ${_auditd} \ ${_auditreduce} \ ${_authpf} \ + ${_bhyve} \ ${_bluetooth} \ ${_boot0cfg} \ ${_boot98cfg} \ @@ -194,6 +195,7 @@ SUBDIR= ${_ac} \ ${_usbdevs} \ ${_usbconfig} \ ${_vidcontrol} \ + ${_vmmctl} \ vipw \ wake \ watch \ @@ -477,6 +479,7 @@ _boot98cfg= boot98cfg _acpi= acpi .endif _asf= asf +_bhyve= bhyve _boot0cfg= boot0cfg .if ${MK_TOOLCHAIN} != "no" _btxld= btxld @@ -494,6 +497,7 @@ _ndiscvt= ndiscvt .endif _sicontrol= sicontrol _spkrtest= spkrtest +_vmmctl= vmmctl _zzz= zzz .endif diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile new file mode 100644 index 000000000000..71df082dcf7a --- 
/dev/null +++ b/usr.sbin/bhyve/Makefile @@ -0,0 +1,18 @@ +# +# $FreeBSD$ +# + +PROG= bhyve + +SRCS= atpic.c consport.c dbgport.c elcr.c fbsdrun.c inout.c mevent.c +SRCS+= pci_emul.c pci_hostbridge.c pci_passthru.c pci_virtio_block.c +SRCS+= pci_virtio_net.c pit_8254.c post.c rtc.c uart.c xmsr.c + +NO_MAN= + +DPADD= ${LIBVMMAPI} ${LIBMD} ${LIBPTHREAD} +LDADD= -lvmmapi -lmd -lpthread + +CFLAGS+= -I${.CURDIR}/../../sys + +.include <bsd.prog.mk> diff --git a/usr.sbin/bhyve/atpic.c b/usr.sbin/bhyve/atpic.c new file mode 100644 index 000000000000..a9fb0842c358 --- /dev/null +++ b/usr.sbin/bhyve/atpic.c @@ -0,0 +1,68 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> + +#include <stdio.h> +#include <string.h> +#include <assert.h> + +#include "inout.h" + +/* + * FreeBSD only writes to the 8259 interrupt controllers to put them in a + * shutdown state. + * + * So, we just ignore the writes. + */ + +#define IO_ICU1 0x20 +#define IO_ICU2 0xA0 +#define ICU_IMR_OFFSET 1 + +static int +atpic_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + if (bytes != 1) + return (-1); + + if (in) + return (-1); + + /* Pretend all writes to the 8259 are alright */ + return (0); +} + +INOUT_PORT(atpic, IO_ICU1, IOPORT_F_INOUT, atpic_handler); +INOUT_PORT(atpic, IO_ICU1 + ICU_IMR_OFFSET, IOPORT_F_INOUT, atpic_handler); +INOUT_PORT(atpic, IO_ICU2, IOPORT_F_INOUT, atpic_handler); +INOUT_PORT(atpic, IO_ICU2 + ICU_IMR_OFFSET, IOPORT_F_INOUT, atpic_handler); diff --git a/usr.sbin/bhyve/consport.c b/usr.sbin/bhyve/consport.c new file mode 100644 index 000000000000..34f94a6cc775 --- /dev/null +++ b/usr.sbin/bhyve/consport.c @@ -0,0 +1,121 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/select.h> + +#include <stdio.h> +#include <stdlib.h> +#include <termios.h> +#include <unistd.h> +#include <stdbool.h> + +#include "inout.h" + +#define BVM_CONSOLE_PORT 0x220 + +static struct termios tio_orig, tio_new; + +static void +ttyclose(void) +{ + tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig); +} + +static void +ttyopen(void) +{ + tcgetattr(STDIN_FILENO, &tio_orig); + + cfmakeraw(&tio_new); + tcsetattr(STDIN_FILENO, TCSANOW, &tio_new); + + atexit(ttyclose); +} + +static bool +tty_char_available(void) +{ + fd_set rfds; + struct timeval tv; + + FD_ZERO(&rfds); + FD_SET(STDIN_FILENO, &rfds); + tv.tv_sec = 0; + tv.tv_usec = 0; + if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) { + return (true); + } else { + return (false); + } +} + +static int +ttyread(void) +{ + char rb; + + if (tty_char_available()) { + read(STDIN_FILENO, &rb, 1); + return (rb & 0xff); + } else { + return (-1); + } +} + +static void +ttywrite(unsigned char wb) +{ + (void) write(STDOUT_FILENO, &wb, 1); +} + +static int +console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + static int opened; + + if (bytes != 4) + return (-1); + + if (!opened) { + ttyopen(); + opened = 1; + } + + if (in) + *eax = ttyread(); + else + ttywrite(*eax); + + return (0); +} +INOUT_PORT(console, BVM_CONSOLE_PORT, IOPORT_F_INOUT, console_handler); diff --git a/usr.sbin/bhyve/dbgport.c b/usr.sbin/bhyve/dbgport.c new file mode 100644 index 000000000000..be919e1ea590 --- /dev/null +++ b/usr.sbin/bhyve/dbgport.c @@ -0,0 +1,124 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
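console_handler() above rejects anything but 4-byte accesses, which suggests the guest drives this paravirtual console with 32-bit inl/outl on port 0x220. A sketch of what the guest side of one transfer could look like; the inline asm is the conventional FreeBSD outl idiom, and treating this as the guest driver's actual mechanism is an inference from the bytes != 4 check, not stated here.

	#define BVM_CONSOLE_PORT	0x220

	/* Guest-side sketch: emit one character via a 32-bit port write */
	static inline void
	bvm_putc(int c)
	{
		__asm __volatile("outl %0, %w1" : : "a" (c), "d" (BVM_CONSOLE_PORT));
	}
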
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <sys/uio.h> + +#include <stdio.h> +#include <stdlib.h> +#include <fcntl.h> +#include <unistd.h> +#include <errno.h> + +#include "inout.h" + +#define BVM_DBG_PORT 0x224 + +static int listen_fd, conn_fd; + +static struct sockaddr_in sin; + +void +init_dbgport(int sport) +{ + conn_fd = -1; + + if ((listen_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + perror("socket"); + exit(1); + } + + sin.sin_len = sizeof(sin); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(sport); + + if (bind(listen_fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) { + perror("bind"); + exit(1); + } + + if (listen(listen_fd, 1) < 0) { + perror("listen"); + exit(1); + } +} + +static int +dbg_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + char ch; + int nwritten, nread, printonce; + + if (bytes != 4) + return (-1); + +again: + printonce = 0; + while (conn_fd < 0) { + if (!printonce) { + printf("Waiting for connection from gdb\r\n"); + printonce = 1; + } + conn_fd = accept(listen_fd, NULL, NULL); + if (conn_fd >= 0) + fcntl(conn_fd, F_SETFL, O_NONBLOCK); + else if (errno != EINTR) + perror("accept"); + } + + if (in) { + nread = read(conn_fd, &ch, 1); + if (nread == -1 && errno == EAGAIN) + *eax = -1; + else if (nread == 1) + *eax = ch; + else { + close(conn_fd); + conn_fd = -1; + goto again; + } + } else { + ch = *eax; + nwritten = write(conn_fd, &ch, 1); + if (nwritten != 1) { + close(conn_fd); + conn_fd = -1; + goto again; + } + } + return (0); +} + +INOUT_PORT(dbg, BVM_DBG_PORT, IOPORT_F_INOUT, dbg_handler); diff --git a/usr.sbin/bhyve/dbgport.h b/usr.sbin/bhyve/dbgport.h new file mode 100644 index 000000000000..8c7dab7f83cc --- /dev/null +++ b/usr.sbin/bhyve/dbgport.h @@ -0,0 +1,36 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _DBGPORT_H_ +#define _DBGPORT_H_ + +#define DEFAULT_GDB_PORT 6466 + +void init_dbgport(int port); + +#endif diff --git a/usr.sbin/bhyve/elcr.c b/usr.sbin/bhyve/elcr.c new file mode 100644 index 000000000000..2417ae1bb276 --- /dev/null +++ b/usr.sbin/bhyve/elcr.c @@ -0,0 +1,65 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include "inout.h" + +/* + * EISA interrupt Level Control Register. + * + * This is a 16-bit register with one bit for each of the IRQ0 through IRQ15. + * A level triggered irq is indicated by setting the corresponding bit to '1'. + */ +#define ELCR_PORT 0x4d0 + +static uint8_t elcr[2] = { 0x00, 0x00 }; + +static int +elcr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int idx; + + if (bytes != 1) + return (-1); + + idx = port - ELCR_PORT; + + if (in) + *eax = elcr[idx]; + else + elcr[idx] = *eax; + + return (0); +} +INOUT_PORT(elcr, ELCR_PORT + 0, IOPORT_F_INOUT, elcr_handler); +INOUT_PORT(elcr, ELCR_PORT + 1, IOPORT_F_INOUT, elcr_handler); diff --git a/usr.sbin/bhyve/fbsdrun.c b/usr.sbin/bhyve/fbsdrun.c new file mode 100644 index 000000000000..ddbe709b8d57 --- /dev/null +++ b/usr.sbin/bhyve/fbsdrun.c @@ -0,0 +1,650 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/time.h> + +#include <machine/segments.h> + +#include <stdio.h> +#include <stdlib.h> +#include <libgen.h> +#include <unistd.h> +#include <assert.h> +#include <errno.h> +#include <signal.h> +#include <pthread.h> + +#include <machine/vmm.h> +#include <vmmapi.h> + +#include "fbsdrun.h" +#include "inout.h" +#include "dbgport.h" +#include "mevent.h" +#include "pci_emul.h" +#include "xmsr.h" + +#define DEFAULT_GUEST_HZ 100 +#define DEFAULT_GUEST_TSLICE 200 + +#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ + +#define VMEXIT_SWITCH 0 /* force vcpu switch in mux mode */ +#define VMEXIT_CONTINUE 1 /* continue from next instruction */ +#define VMEXIT_RESTART 2 /* restart current instruction */ +#define VMEXIT_ABORT 3 /* abort the vm run loop */ +#define VMEXIT_RESET 4 /* guest machine has reset */ + +#define MB (1024UL * 1024) +#define GB (1024UL * MB) + +typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); + +int guest_tslice = DEFAULT_GUEST_TSLICE; +int guest_hz = DEFAULT_GUEST_HZ; +char *vmname; + +u_long lomem_sz; +u_long himem_sz; + +int guest_ncpus; + +static int pincpu = -1; +static int guest_vcpu_mux; +static int guest_vmexit_on_hlt, guest_vmexit_on_pause; + +static int foundcpus; + +static char *lomem_addr; +static char *himem_addr; + +static char *progname; +static const int BSP = 0; + +static int cpumask; + +static void *oem_tbl_start; +static int oem_tbl_size; + +static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); + +struct vm_exit vmexit[VM_MAXCPU]; + +struct fbsdstats { + uint64_t vmexit_bogus; + uint64_t vmexit_bogus_switch; + uint64_t vmexit_hlt; + uint64_t vmexit_pause; + uint64_t vmexit_mtrap; + uint64_t cpu_switch_rotate; + uint64_t cpu_switch_direct; + int io_reset; +} stats; + +struct mt_vmm_info { + pthread_t mt_thr; + struct vmctx *mt_ctx; + int mt_vcpu; +} mt_vmm_info[VM_MAXCPU]; + +static void +usage(int code) +{ + + fprintf(stderr, + "Usage: %s [-hBHP][-g <gdb port>][-z <hz>][-s <pci>][-p pincpu]" + "[-n <pci>][-m lowmem][-M highmem] <vm>\n" + " -g: gdb port (default is %d and 0 means don't open)\n" + " -c: # cpus (default 1)\n" + " -p: pin vcpu 'n' to host cpu 'pincpu + n'\n" + " -B: inject breakpoint exception on vm entry\n" + " -H: vmexit from the guest on hlt\n" + " -P: vmexit from the guest on pause\n" + " -h: help\n" + " -z: guest hz (default is %d)\n" + " -s: <slot,driver,configinfo> PCI slot config\n" + " -n: <slot,name> 
PCI slot naming\n" + " -m: lowmem in MB\n" + " -M: highmem in MB\n" + " -x: mux vcpus to 1 hcpu\n" + " -t: mux vcpu timeslice hz (default %d)\n", + progname, DEFAULT_GDB_PORT, DEFAULT_GUEST_HZ, + DEFAULT_GUEST_TSLICE); + exit(code); +} + +void * +paddr_guest2host(uintptr_t gaddr) +{ + if (lomem_sz == 0) + return (NULL); + + if (gaddr < lomem_sz) { + return ((void *)(lomem_addr + gaddr)); + } else if (gaddr >= 4*GB && gaddr < (4*GB + himem_sz)) { + return ((void *)(himem_addr + gaddr - 4*GB)); + } else + return (NULL); +} + +void +fbsdrun_add_oemtbl(void *tbl, int tblsz) +{ + oem_tbl_start = tbl; + oem_tbl_size = tblsz; +} + +int +fbsdrun_vmexit_on_pause(void) +{ + + return (guest_vmexit_on_pause); +} + +int +fbsdrun_vmexit_on_hlt(void) +{ + + return (guest_vmexit_on_hlt); +} + +int +fbsdrun_muxed(void) +{ + + return (guest_vcpu_mux); +} + +void * +fbsdrun_start_thread(void *param) +{ + int vcpu; + struct mt_vmm_info *mtp = param; + + vcpu = mtp->mt_vcpu; + vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); + + /* not reached */ + exit(1); + return (NULL); +} + +void +fbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t rip) +{ + int error; + + if (cpumask & (1 << vcpu)) { + printf("addcpu: attempting to add existing cpu %d\n", vcpu); + exit(1); + } + + cpumask |= 1 << vcpu; + foundcpus++; + + /* + * Set up the vmexit struct to allow execution to start + * at the given RIP + */ + vmexit[vcpu].rip = rip; + vmexit[vcpu].inst_length = 0; + + if (vcpu == BSP || !guest_vcpu_mux){ + mt_vmm_info[vcpu].mt_ctx = ctx; + mt_vmm_info[vcpu].mt_vcpu = vcpu; + + error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL, + fbsdrun_start_thread, &mt_vmm_info[vcpu]); + assert(error == 0); + } +} + +static int +fbsdrun_get_next_cpu(int curcpu) +{ + + /* + * Get the next available CPU. Assumes they arrive + * in ascending order with no gaps. + */ + return ((curcpu + 1) % foundcpus); +} + +int +vmexit_catch_reset(void) +{ + stats.io_reset++; + return (VMEXIT_RESET); +} + +int +vmexit_catch_inout(void) +{ + return (VMEXIT_ABORT); +} + +int +vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, + uint32_t eax) +{ +#if PG_DEBUG /* put all types of debug here */ + if (eax == 0) { + pause_noswitch = 1; + } else if (eax == 1) { + pause_noswitch = 0; + } else { + pause_noswitch = 0; + if (eax == 5) { + vm_set_capability(ctx, *pvcpu, VM_CAP_MTRAP_EXIT, 1); + } + } +#endif + return (VMEXIT_CONTINUE); +} + +static int +vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + int error; + int bytes, port, in, out; + uint32_t eax; + int vcpu; + + vcpu = *pvcpu; + + port = vme->u.inout.port; + bytes = vme->u.inout.bytes; + eax = vme->u.inout.eax; + in = vme->u.inout.in; + out = !in; + + /* We don't deal with these */ + if (vme->u.inout.string || vme->u.inout.rep) + return (VMEXIT_ABORT); + + /* Special case of guest reset */ + if (out && port == 0x64 && (uint8_t)eax == 0xFE) + return (vmexit_catch_reset()); + + /* Extra-special case of host notifications */ + if (out && port == GUEST_NIO_PORT) + return (vmexit_handle_notify(ctx, vme, pvcpu, eax)); + + error = emulate_inout(ctx, vcpu, in, port, bytes, &eax); + if (error == 0 && in) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, eax); + + if (error == 0) + return (VMEXIT_CONTINUE); + else { + fprintf(stderr, "Unhandled %s%c 0x%04x\n", + in ? "in" : "out", + bytes == 1 ? 'b' : (bytes == 2 ? 
'w' : 'l'), port); + return (vmexit_catch_inout()); + } +} + +static int +vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + printf("vm exit rdmsr 0x%x, cpu %d\n", vme->u.msr.code, *pvcpu); + return (VMEXIT_ABORT); +} + +static int +vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + int newcpu; + int retval = VMEXIT_CONTINUE; + + newcpu = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code,vme->u.msr.wval); + + if (guest_vcpu_mux && *pvcpu != newcpu) { + retval = VMEXIT_SWITCH; + *pvcpu = newcpu; + } + + return (retval); +} + +static int +vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + printf("vm exit[%d]\n", *pvcpu); + printf("\treason\t\tVMX\n"); + printf("\trip\t\t0x%016lx\n", vmexit->rip); + printf("\tinst_length\t%d\n", vmexit->inst_length); + printf("\terror\t\t%d\n", vmexit->u.vmx.error); + printf("\texit_reason\t%u\n", vmexit->u.vmx.exit_reason); + printf("\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification); + + return (VMEXIT_ABORT); +} + +static int bogus_noswitch = 1; + +static int +vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + stats.vmexit_bogus++; + + if (!guest_vcpu_mux || guest_ncpus == 1 || bogus_noswitch) { + return (VMEXIT_RESTART); + } else { + stats.vmexit_bogus_switch++; + vmexit->inst_length = 0; + *pvcpu = -1; + return (VMEXIT_SWITCH); + } +} + +static int +vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + stats.vmexit_hlt++; + if (fbsdrun_muxed()) { + *pvcpu = -1; + return (VMEXIT_SWITCH); + } else { + /* + * Just continue execution with the next instruction. We use + * the HLT VM exit as a way to be friendly with the host + * scheduler. + */ + return (VMEXIT_CONTINUE); + } +} + +static int pause_noswitch; + +static int +vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + stats.vmexit_pause++; + + if (fbsdrun_muxed() && !pause_noswitch) { + *pvcpu = -1; + return (VMEXIT_SWITCH); + } else { + return (VMEXIT_CONTINUE); + } +} + +static int +vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + stats.vmexit_mtrap++; + + return (VMEXIT_RESTART); +} + +static void +sigalrm(int sig) +{ + return; +} + +static void +setup_timeslice(void) +{ + struct sigaction sa; + struct itimerval itv; + int error; + + /* + * Setup a realtime timer to generate a SIGALRM at a + * frequency of 'guest_tslice' ticks per second. 
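Concretely, with the default guest_tslice of 200 the interval computed below works out to 1000000 / 200 = 5000 microseconds, so the muxing thread takes a SIGALRM every 5 ms to force the blocking vm_run() out and rotate vcpus:

	/* guest_tslice = 200  =>  it_interval.tv_usec = 1000000 / 200 = 5000 (5 ms) */
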
+ */ + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + sa.sa_handler = sigalrm; + + error = sigaction(SIGALRM, &sa, NULL); + assert(error == 0); + + itv.it_interval.tv_sec = 0; + itv.it_interval.tv_usec = 1000000 / guest_tslice; + itv.it_value.tv_sec = 0; + itv.it_value.tv_usec = 1000000 / guest_tslice; + + error = setitimer(ITIMER_REAL, &itv, NULL); + assert(error == 0); +} + +static vmexit_handler_t handler[VM_EXITCODE_MAX] = { + [VM_EXITCODE_INOUT] = vmexit_inout, + [VM_EXITCODE_VMX] = vmexit_vmx, + [VM_EXITCODE_BOGUS] = vmexit_bogus, + [VM_EXITCODE_RDMSR] = vmexit_rdmsr, + [VM_EXITCODE_WRMSR] = vmexit_wrmsr, + [VM_EXITCODE_MTRAP] = vmexit_mtrap, +}; + +static void +vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip) +{ + int error, rc, prevcpu; + + if (guest_vcpu_mux) + setup_timeslice(); + + if (pincpu >= 0) { + error = vm_set_pinning(ctx, vcpu, pincpu + vcpu); + assert(error == 0); + } + + while (1) { + error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]); + if (error != 0) + break; + + prevcpu = vcpu; + rc = (*handler[vmexit[vcpu].exitcode])(ctx, &vmexit[vcpu], + &vcpu); + switch (rc) { + case VMEXIT_SWITCH: + assert(guest_vcpu_mux); + if (vcpu == -1) { + stats.cpu_switch_rotate++; + vcpu = fbsdrun_get_next_cpu(prevcpu); + } else { + stats.cpu_switch_direct++; + } + /* fall through */ + case VMEXIT_CONTINUE: + rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length; + break; + case VMEXIT_RESTART: + rip = vmexit[vcpu].rip; + break; + case VMEXIT_RESET: + exit(0); + default: + exit(1); + } + } + fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); +} + + +int +main(int argc, char *argv[]) +{ + int c, error, gdb_port, inject_bkpt, tmp, err; + struct vmctx *ctx; + uint64_t rip; + + inject_bkpt = 0; + progname = basename(argv[0]); + gdb_port = DEFAULT_GDB_PORT; + guest_ncpus = 1; + + while ((c = getopt(argc, argv, "hBHPxp:g:c:z:s:n:m:M:")) != -1) { + switch (c) { + case 'B': + inject_bkpt = 1; + break; + case 'x': + guest_vcpu_mux = 1; + break; + case 'p': + pincpu = atoi(optarg); + break; + case 'c': + guest_ncpus = atoi(optarg); + break; + case 'g': + gdb_port = atoi(optarg); + break; + case 'z': + guest_hz = atoi(optarg); + break; + case 't': + guest_tslice = atoi(optarg); + break; + case 's': + pci_parse_slot(optarg); + break; + case 'n': + pci_parse_name(optarg); + break; + case 'm': + lomem_sz = strtoul(optarg, NULL, 0) * MB; + break; + case 'M': + himem_sz = strtoul(optarg, NULL, 0) * MB; + break; + case 'H': + guest_vmexit_on_hlt = 1; + break; + case 'P': + guest_vmexit_on_pause = 1; + break; + case 'h': + usage(0); + default: + usage(1); + } + } + argc -= optind; + argv += optind; + + if (argc != 1) + usage(1); + + /* No need to mux if guest is uni-processor */ + if (guest_ncpus <= 1) + guest_vcpu_mux = 0; + + /* vmexit on hlt if guest is muxed */ + if (guest_vcpu_mux) { + guest_vmexit_on_hlt = 1; + guest_vmexit_on_pause = 1; + } + + vmname = argv[0]; + + ctx = vm_open(vmname); + if (ctx == NULL) { + perror("vm_open"); + exit(1); + } + + if (fbsdrun_vmexit_on_hlt()) { + err = vm_get_capability(ctx, BSP, VM_CAP_HALT_EXIT, &tmp); + if (err < 0) { + printf("VM exit on HLT not supported\n"); + exit(1); + } + vm_set_capability(ctx, BSP, VM_CAP_HALT_EXIT, 1); + handler[VM_EXITCODE_HLT] = vmexit_hlt; + } + + if (fbsdrun_vmexit_on_pause()) { + /* + * pause exit support required for this mode + */ + err = vm_get_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, &tmp); + if (err < 0) { + printf("SMP mux requested, no pause support\n"); + exit(1); + } + vm_set_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, 1); 
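The handler[] table and vm_loop() above define a small contract: a handler returns VMEXIT_CONTINUE to resume at rip + inst_length, VMEXIT_RESTART to re-execute the exiting instruction, or VMEXIT_SWITCH/-1 to rotate vcpus under muxing. A hedged sketch of wiring in an additional handler, where vmexit_mycode and VM_EXITCODE_MYCODE are invented names for illustration:

	/* Hypothetical exit handler following the vmexit_handler_t contract */
	static int
	vmexit_mycode(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
	{
		/* Re-run the instruction that caused the exit */
		return (VMEXIT_RESTART);
	}

	/* ... in setup code, mirroring the HLT/PAUSE registration above ... */
	handler[VM_EXITCODE_MYCODE] = vmexit_mycode;
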
+ handler[VM_EXITCODE_PAUSE] = vmexit_pause; + } + + if (lomem_sz != 0) { + lomem_addr = vm_map_memory(ctx, 0, lomem_sz); + if (lomem_addr == (char *) MAP_FAILED) { + lomem_sz = 0; + } else if (himem_sz != 0) { + himem_addr = vm_map_memory(ctx, 4*GB, himem_sz); + if (himem_addr == (char *) MAP_FAILED) { + lomem_sz = 0; + himem_sz = 0; + } + } + } + + init_inout(); + init_pci(ctx); + + if (gdb_port != 0) + init_dbgport(gdb_port); + + error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); + assert(error == 0); + + if (inject_bkpt) { + error = vm_inject_event(ctx, BSP, VM_HW_EXCEPTION, IDT_BP); + assert(error == 0); + } + + /* + * build the guest tables, MP etc. + */ + vm_build_tables(ctx, guest_ncpus, oem_tbl_start, oem_tbl_size); + + /* + * Add CPU 0 + */ + fbsdrun_addcpu(ctx, BSP, rip); + + /* + * Head off to the main event dispatch loop + */ + mevent_dispatch(); + + exit(1); +} diff --git a/usr.sbin/bhyve/fbsdrun.h b/usr.sbin/bhyve/fbsdrun.h new file mode 100644 index 000000000000..81061222a1a1 --- /dev/null +++ b/usr.sbin/bhyve/fbsdrun.h @@ -0,0 +1,53 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _FBSDRUN_H_ +#define _FBSDRUN_H_ + +#ifndef CTASSERT /* Allow lint to override */ +#define CTASSERT(x) _CTASSERT(x, __LINE__) +#define _CTASSERT(x, y) __CTASSERT(x, y) +#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1] +#endif + +struct vmctx; +extern int guest_hz; +extern int guest_tslice; +extern int guest_ncpus; +extern char *vmname; + +extern u_long lomem_sz, himem_sz; + +void *paddr_guest2host(uintptr_t); + +void fbsdrun_addcpu(struct vmctx *ctx, int cpu, uint64_t rip); +void fbsdrun_add_oemtbl(void *tbl, int tblsz); +int fbsdrun_muxed(void); +int fbsdrun_vmexit_on_hlt(void); +int fbsdrun_vmexit_on_pause(void); +#endif diff --git a/usr.sbin/bhyve/inout.c b/usr.sbin/bhyve/inout.c new file mode 100644 index 000000000000..84445b1f0a99 --- /dev/null +++ b/usr.sbin/bhyve/inout.c @@ -0,0 +1,98 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/linker_set.h> + +#include <stdio.h> +#include <assert.h> + +#include "inout.h" + +SET_DECLARE(inout_port_set, struct inout_port); + +#define MAX_IOPORTS (1 << 16) + +static struct { + const char *name; + int flags; + inout_func_t handler; + void *arg; +} inout_handlers[MAX_IOPORTS]; + +int +emulate_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax) +{ + int flags; + inout_func_t handler; + void *arg; + + assert(port < MAX_IOPORTS); + + if ((handler = inout_handlers[port].handler) == NULL) + return (-1); + + flags = inout_handlers[port].flags; + arg = inout_handlers[port].arg; + + if ((in && (flags & IOPORT_F_IN)) || (!in && (flags & IOPORT_F_OUT))) + return ((*handler)(ctx, vcpu, in, port, bytes, eax, arg)); + else + return (-1); +} + +void +init_inout(void) +{ + struct inout_port **iopp, *iop; + + SET_FOREACH(iopp, inout_port_set) { + iop = *iopp; + assert(iop->port < MAX_IOPORTS); + inout_handlers[iop->port].name = iop->name; + inout_handlers[iop->port].flags = iop->flags; + inout_handlers[iop->port].handler = iop->handler; + inout_handlers[iop->port].arg = NULL; + } +} + +int +register_inout(struct inout_port *iop) +{ + assert(iop->port < MAX_IOPORTS); + inout_handlers[iop->port].name = iop->name; + inout_handlers[iop->port].flags = iop->flags; + inout_handlers[iop->port].handler = iop->handler; + inout_handlers[iop->port].arg = iop->arg; + + return (0); +} diff --git a/usr.sbin/bhyve/inout.h b/usr.sbin/bhyve/inout.h new file mode 100644 index 000000000000..7b8a4a6edd38 --- /dev/null +++ b/usr.sbin/bhyve/inout.h @@ -0,0 +1,64 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _INOUT_H_ +#define _INOUT_H_ + +#include <sys/linker_set.h> + +struct vmctx; + +typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port, + int bytes, uint32_t *eax, void *arg); + +struct inout_port { + const char *name; + int port; + int flags; + inout_func_t handler; + void *arg; +}; +#define IOPORT_F_IN 0x1 +#define IOPORT_F_OUT 0x2 +#define IOPORT_F_INOUT 0x3 + +#define INOUT_PORT(name, port, flags, handler) \ + static struct inout_port __CONCAT(__inout_port, __LINE__) = { \ + #name, \ + (port), \ + (flags), \ + (handler) \ + }; \ + DATA_SET(inout_port_set, __CONCAT(__inout_port, __LINE__)) + +void init_inout(void); +int emulate_inout(struct vmctx *, int vcpu, int in, int port, int bytes, + uint32_t *eax); +int register_inout(struct inout_port *iop); + +#endif /* _INOUT_H_ */ diff --git a/usr.sbin/bhyve/mevent.c b/usr.sbin/bhyve/mevent.c new file mode 100644 index 000000000000..0d3b2872e1b4 --- /dev/null +++ b/usr.sbin/bhyve/mevent.c @@ -0,0 +1,419 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Micro event library for FreeBSD, designed for a single i/o thread + * using kqueue, and having events be persistent by default. 
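The INOUT_PORT macro above drops a struct inout_port into the inout_port_set linker set, so handlers register themselves at program start with no explicit call; init_inout() then walks the set, and register_inout() covers handlers that need a runtime arg. A minimal sketch of a new port handler, with the port number 0x510 and the name invented for illustration:

	static int
	myport_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
	    uint32_t *eax, void *arg)
	{
		if (bytes != 1)
			return (-1);
		if (in)
			*eax = 0xff;	/* reads as all-ones; writes are discarded */
		return (0);
	}
	INOUT_PORT(myport, 0x510, IOPORT_F_INOUT, myport_handler);
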
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <assert.h> +#include <errno.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> + +#include <sys/types.h> +#include <sys/event.h> +#include <sys/time.h> + +#include <pthread.h> + +#include "mevent.h" + +#define MEVENT_MAX 64 + +#define MEV_ENABLE 1 +#define MEV_DISABLE 2 +#define MEV_DEL_PENDING 3 + +static pthread_t mevent_tid; +static int mevent_pipefd[2]; +static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER; + +struct mevent { + void (*me_func)(int, enum ev_type, void *); + int me_fd; + enum ev_type me_type; + void *me_param; + int me_cq; + int me_state; + int me_closefd; + LIST_ENTRY(mevent) me_list; +}; + +static LIST_HEAD(listhead, mevent) global_head, change_head; + +static void +mevent_qlock(void) +{ + pthread_mutex_lock(&mevent_lmutex); +} + +static void +mevent_qunlock(void) +{ + pthread_mutex_unlock(&mevent_lmutex); +} + +static void +mevent_pipe_read(int fd, enum ev_type type, void *param) +{ + char buf[MEVENT_MAX]; + int status; + + /* + * Drain the pipe read side. The fd is non-blocking so this is + * safe to do. + */ + do { + status = read(fd, buf, sizeof(buf)); + } while (status == MEVENT_MAX); +} + +static void +mevent_notify(void) +{ + char c; + + /* + * If calling from outside the i/o thread, write a byte on the + * pipe to force the i/o thread to exit the blocking kevent call. + */ + if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) { + write(mevent_pipefd[1], &c, 1); + } +} + +static int +mevent_kq_filter(struct mevent *mevp) +{ + int retval; + + retval = 0; + + if (mevp->me_type == EVF_READ) + retval = EVFILT_READ; + + if (mevp->me_type == EVF_WRITE) + retval = EVFILT_WRITE; + + return (retval); +} + +static int +mevent_kq_flags(struct mevent *mevp) +{ + int ret; + + switch (mevp->me_state) { + case MEV_ENABLE: + ret = EV_ADD; + break; + case MEV_DISABLE: + ret = EV_DISABLE; + break; + case MEV_DEL_PENDING: + ret = EV_DELETE; + break; + } + + return (ret); +} + +static int +mevent_kq_fflags(struct mevent *mevp) +{ + /* XXX nothing yet, perhaps EV_EOF for reads ? */ + return (0); +} + +static int +mevent_build(int mfd, struct kevent *kev) +{ + struct mevent *mevp, *tmpp; + int i; + + i = 0; + + mevent_qlock(); + + LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) { + if (mevp->me_closefd) { + /* + * A close of the file descriptor will remove the + * event + */ + close(mevp->me_fd); + } else { + kev[i].ident = mevp->me_fd; + kev[i].filter = mevent_kq_filter(mevp); + kev[i].flags = mevent_kq_flags(mevp); + kev[i].fflags = mevent_kq_fflags(mevp); + kev[i].data = 0; + kev[i].udata = mevp; + i++; + } + + mevp->me_cq = 0; + LIST_REMOVE(mevp, me_list); + + if (mevp->me_state == MEV_DEL_PENDING) { + free(mevp); + } else { + LIST_INSERT_HEAD(&global_head, mevp, me_list); + } + + assert(i < MEVENT_MAX); + } + + mevent_qunlock(); + + return (i); +} + +static void +mevent_handle(struct kevent *kev, int numev) +{ + struct mevent *mevp; + int i; + + for (i = 0; i < numev; i++) { + mevp = kev[i].udata; + + /* XXX check for EV_ERROR ? 
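+		 * A hedged sketch of such a check: kqueue sets EV_ERROR in
+		 * kev[i].flags and returns the error number in kev[i].data,
+		 * so it could look like
+		 *
+		 *	if (kev[i].flags & EV_ERROR)
+		 *		continue;
+		 *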
*/ + + (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param); + } +} + +struct mevent * +mevent_add(int fd, enum ev_type type, + void (*func)(int, enum ev_type, void *), void *param) +{ + struct mevent *lp, *mevp; + + if (fd < 0 || func == NULL) { + return (NULL); + } + + mevp = NULL; + + mevent_qlock(); + + /* + * Verify that the fd/type tuple is not present in any list + */ + LIST_FOREACH(lp, &global_head, me_list) { + if (lp->me_fd == fd && lp->me_type == type) { + goto exit; + } + } + + LIST_FOREACH(lp, &change_head, me_list) { + if (lp->me_fd == fd && lp->me_type == type) { + goto exit; + } + } + + /* + * Allocate an entry, populate it, and add it to the change list. + */ + mevp = malloc(sizeof(struct mevent)); + if (mevp == NULL) { + goto exit; + } + + memset(mevp, 0, sizeof(struct mevent)); + mevp->me_fd = fd; + mevp->me_type = type; + mevp->me_func = func; + mevp->me_param = param; + + LIST_INSERT_HEAD(&change_head, mevp, me_list); + mevp->me_cq = 1; + mevp->me_state = MEV_ENABLE; + mevent_notify(); + +exit: + mevent_qunlock(); + + return (mevp); +} + +static int +mevent_update(struct mevent *evp, int newstate) +{ + /* + * It's not possible to enable/disable a deleted event + */ + if (evp->me_state == MEV_DEL_PENDING) + return (EINVAL); + + /* + * No update needed if state isn't changing + */ + if (evp->me_state == newstate) + return (0); + + mevent_qlock(); + + evp->me_state = newstate; + + /* + * Place the entry onto the changed list if not already there. + */ + if (evp->me_cq == 0) { + evp->me_cq = 1; + LIST_REMOVE(evp, me_list); + LIST_INSERT_HEAD(&change_head, evp, me_list); + mevent_notify(); + } + + mevent_qunlock(); + + return (0); +} + +int +mevent_enable(struct mevent *evp) +{ + + return (mevent_update(evp, MEV_ENABLE)); +} + +int +mevent_disable(struct mevent *evp) +{ + + return (mevent_update(evp, MEV_DISABLE)); +} + +static int +mevent_delete_event(struct mevent *evp, int closefd) +{ + mevent_qlock(); + + /* + * Place the entry onto the changed list if not already there, and + * mark as to be deleted. + */ + if (evp->me_cq == 0) { + evp->me_cq = 1; + LIST_REMOVE(evp, me_list); + LIST_INSERT_HEAD(&change_head, evp, me_list); + mevent_notify(); + } + evp->me_state = MEV_DEL_PENDING; + + if (closefd) + evp->me_closefd = 1; + + mevent_qunlock(); + + return (0); +} + +int +mevent_delete(struct mevent *evp) +{ + + return (mevent_delete_event(evp, 0)); +} + +int +mevent_delete_close(struct mevent *evp) +{ + + return (mevent_delete_event(evp, 1)); +} + +void +mevent_dispatch(void) +{ + struct kevent changelist[MEVENT_MAX]; + struct kevent eventlist[MEVENT_MAX]; + struct mevent *pipev; + int mfd; + int numev; + int ret; + + mevent_tid = pthread_self(); + + mfd = kqueue(); + assert(mfd > 0); + + /* + * Open the pipe that will be used for other threads to force + * the blocking kqueue call to exit by writing to it. Set the + * descriptor to non-blocking. + */ + ret = pipe(mevent_pipefd); + if (ret < 0) { + perror("pipe"); + exit(0); + } + + /* + * Add internal event handler for the pipe write fd + */ + pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL); + assert(pipev != NULL); + + for (;;) { + /* + * Build changelist if required. + * XXX the changelist can be put into the blocking call + * to eliminate the extra syscall. Currently better for + * debug. 
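+		 * That combined form would look roughly like (sketch only):
+		 *
+		 *	ret = kevent(mfd, changelist, numev,
+		 *		     eventlist, MEVENT_MAX, NULL);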
+ */ + numev = mevent_build(mfd, changelist); + if (numev) { + ret = kevent(mfd, changelist, numev, NULL, 0, NULL); + if (ret == -1) { + perror("Error return from kevent change"); + } + } + + /* + * Block awaiting events + */ + ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL); + if (ret == -1) { + perror("Error return from kevent monitor"); + } + + /* + * Handle reported events + */ + mevent_handle(eventlist, ret); + } +} diff --git a/usr.sbin/bhyve/mevent.h b/usr.sbin/bhyve/mevent.h new file mode 100644 index 000000000000..32a9d74ab566 --- /dev/null +++ b/usr.sbin/bhyve/mevent.h @@ -0,0 +1,49 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _MEVENT_H_ +#define _MEVENT_H_ + +enum ev_type { + EVF_READ, + EVF_WRITE +}; + +struct mevent; + +struct mevent *mevent_add(int fd, enum ev_type type, + void (*func)(int, enum ev_type, void *), + void *param); +int mevent_enable(struct mevent *evp); +int mevent_disable(struct mevent *evp); +int mevent_delete(struct mevent *evp); +int mevent_delete_close(struct mevent *evp); + +void mevent_dispatch(void); + +#endif /* _MEVENT_H_ */ diff --git a/usr.sbin/bhyve/mevent_test.c b/usr.sbin/bhyve/mevent_test.c new file mode 100644 index 000000000000..c72a49718188 --- /dev/null +++ b/usr.sbin/bhyve/mevent_test.c @@ -0,0 +1,180 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Test program for the micro event library. Set up a simple TCP echo + * service. + * + * cc mevent_test.c mevent.c -lpthread + */ + +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> + +#include <stdio.h> +#include <stdlib.h> +#include <pthread.h> + +#include "mevent.h" + +#define TEST_PORT 4321 + +static pthread_mutex_t accept_mutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t accept_condvar = PTHREAD_COND_INITIALIZER; + +#define MEVENT_ECHO + +#ifdef MEVENT_ECHO +struct esync { + pthread_mutex_t e_mt; + pthread_cond_t e_cond; +}; + +static void +echoer_callback(int fd, enum ev_type type, void *param) +{ + struct esync *sync = param; + + pthread_mutex_lock(&sync->e_mt); + pthread_cond_signal(&sync->e_cond); + pthread_mutex_unlock(&sync->e_mt); +} + +static void * +echoer(void *param) +{ + struct esync sync; + struct mevent *mev; + char buf[128]; + int fd = (int)(uintptr_t) param; + int len; + + pthread_mutex_init(&sync.e_mt, NULL); + pthread_cond_init(&sync.e_cond, NULL); + + pthread_mutex_lock(&sync.e_mt); + + mev = mevent_add(fd, EVF_READ, echoer_callback, &sync); + if (mev == NULL) { + printf("Could not allocate echoer event\n"); + exit(1); + } + + while (!pthread_cond_wait(&sync.e_cond, &sync.e_mt)) { + len = read(fd, buf, sizeof(buf)); + if (len > 0) { + write(fd, buf, len); + write(0, buf, len); + } else { + break; + } + } + + mevent_delete_close(mev); + + pthread_mutex_unlock(&sync.e_mt); + pthread_mutex_destroy(&sync.e_mt); + pthread_cond_destroy(&sync.e_cond); +} + +#else + +static void * +echoer(void *param) +{ + char buf[128]; + int fd = (int)(uintptr_t) param; + int len; + + while ((len = read(fd, buf, sizeof(buf))) > 0) { + write(1, buf, len); + } +} +#endif /* MEVENT_ECHO */ + +static void +acceptor_callback(int fd, enum ev_type type, void *param) +{ + pthread_mutex_lock(&accept_mutex); + pthread_cond_signal(&accept_condvar); + pthread_mutex_unlock(&accept_mutex); +} + +static void * +acceptor(void *param) +{ + struct sockaddr_in sin; + pthread_t tid; + int news; + int s; + + if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + perror("socket"); + exit(1); + } + + sin.sin_len = sizeof(sin); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(TEST_PORT); + + if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) { + perror("bind"); + exit(1); + } + + if (listen(s, 1) < 0) { + perror("listen"); + exit(1); + } + + (void) mevent_add(s, EVF_READ, acceptor_callback, NULL); + + pthread_mutex_lock(&accept_mutex); + + while (!pthread_cond_wait(&accept_condvar, &accept_mutex)) { + news = accept(s, NULL, NULL); + if (news < 0) { + perror("accept error"); + } else { + printf("incoming connection, spawning thread\n"); + pthread_create(&tid, NULL, echoer, + (void *)(uintptr_t)news); + } + } +} + +main() +{ + pthread_t tid; + + pthread_create(&tid, NULL, acceptor, NULL); + + mevent_dispatch(); +} diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c new 
file mode 100644 index 000000000000..273c6a2c78c5 --- /dev/null +++ b/usr.sbin/bhyve/pci_emul.c @@ -0,0 +1,976 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/linker_set.h> + +#include <ctype.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <assert.h> + +#include <machine/vmm.h> +#include <vmmapi.h> + +#include "fbsdrun.h" +#include "inout.h" +#include "pci_emul.h" + +#define CONF1_ADDR_PORT 0x0cf8 +#define CONF1_DATA_PORT 0x0cfc + +#define CFGWRITE(pi,off,val,b) \ +do { \ + if ((b) == 1) { \ + pci_set_cfgdata8((pi),(off),(val)); \ + } else if ((b) == 2) { \ + pci_set_cfgdata16((pi),(off),(val)); \ + } else { \ + pci_set_cfgdata32((pi),(off),(val)); \ + } \ +} while (0) + +#define MAXSLOTS 32 + +static struct slotinfo { + char *si_name; + char *si_param; + struct pci_devinst *si_devi; + int si_titled; + int si_pslot; + char si_prefix; + char si_suffix; +} pci_slotinfo[MAXSLOTS]; + +/* + * NetApp specific: + * struct used to build an in-core OEM table to supply device names + * to driver instances + */ +static struct mptable_pci_devnames { +#define MPT_HDR_BASE 0 +#define MPT_HDR_NAME 2 + uint16_t md_hdrtype; + uint16_t md_entries; + uint16_t md_cksum; + uint16_t md_pad; +#define MPT_NTAP_SIG \ + ((uint32_t)(('P' << 24) | ('A' << 16) | ('T' << 8) | 'N')) + uint32_t md_sig; + uint32_t md_rsvd; + struct mptable_pci_slotinfo { + uint16_t mds_type; + uint16_t mds_phys_slot; + uint8_t mds_bus; + uint8_t mds_slot; + uint8_t mds_func; + uint8_t mds_pad; + uint16_t mds_vid; + uint16_t mds_did; + uint8_t mds_suffix[4]; + uint8_t mds_prefix[4]; + uint32_t mds_rsvd[3]; + } md_slotinfo[MAXSLOTS]; +} pci_devnames; + +SET_DECLARE(pci_devemu_set, struct pci_devemu); + +static uint64_t pci_emul_iobase; +static uint64_t pci_emul_membase32; +static uint64_t pci_emul_membase64; + +#define PCI_EMUL_IOBASE 0x2000 +#define PCI_EMUL_IOLIMIT 0x10000 + +#define PCI_EMUL_MEMBASE32 (lomem_sz) +#define PCI_EMUL_MEMLIMIT32 0xE0000000 /* 3.5GB */ + +#define PCI_EMUL_MEMBASE64 0xD000000000UL +#define PCI_EMUL_MEMLIMIT64 0xFD00000000UL + +static int pci_emul_devices; +static int 
devname_elems; + +/* + * I/O access + */ + +/* + * Slot options are in the form: + * + * <slot>,<emul>[,<config>] + * + * slot is 0..31 + * emul is a string describing the type of PCI device e.g. virtio-net + * config is an optional string, depending on the device, that can be + * used for configuration. + * Examples are: + * 1,virtio-net,tap0 + * 3,dummy + */ +static void +pci_parse_slot_usage(char *aopt) +{ + printf("Invalid PCI slot info field \"%s\"\n", aopt); + free(aopt); +} + +void +pci_parse_slot(char *opt) +{ + char *slot, *emul, *config; + char *str, *cpy; + int snum; + + str = cpy = strdup(opt); + config = NULL; + + slot = strsep(&str, ","); + emul = strsep(&str, ","); + if (str != NULL) { + config = strsep(&str, ","); + } + + if (emul == NULL) { + pci_parse_slot_usage(cpy); + return; + } + + snum = 255; + snum = atoi(slot); + if (snum < 0 || snum >= MAXSLOTS) { + pci_parse_slot_usage(cpy); + } else { + pci_slotinfo[snum].si_name = emul; + pci_slotinfo[snum].si_param = config; + } +} + + +/* + * + * PCI MPTable names are of the form: + * + * <slot>,[prefix]<digit><suffix> + * + * .. with <prefix> an alphabetic char, <digit> a 1 or 2-digit string, + * and <suffix> a single char. + * + * Examples: + * 1,e0c + * 4,e0P + * 6,43a + * 7,0f + * 10,1 + * 12,e0M + * 2,12a + * + * Note that this is NetApp-specific, but is ignored on other o/s's. + */ +static void +pci_parse_name_usage(char *aopt) +{ + printf("Invalid PCI slot name field \"%s\"\n", aopt); +} + +void +pci_parse_name(char *opt) +{ + char csnum[4]; + char *namestr; + char *slotend; + char prefix, suffix; + int i; + int pslot; + int snum; + + pslot = -1; + prefix = suffix = 0; + slotend = strchr(opt, ','); + + /* + * A comma must be present, and can't be the first character + * or no slot would be present. Also, the slot number can't be + * more than 2 characters. 
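+	 *
+	 * Hedged examples against the checks below: "7,0f" and "10,e0M"
+	 * parse; ",e0c" (empty slot) and "123,4a" (3-character slot) are
+	 * rejected.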
+ */ + if (slotend == NULL || slotend == opt || (slotend - opt > 2)) { + pci_parse_name_usage(opt); + return; + } + + for (i = 0; i < (slotend - opt); i++) { + csnum[i] = opt[i]; + } + csnum[i] = '\0'; + + snum = 255; + snum = atoi(csnum); + if (snum < 0 || snum >= MAXSLOTS) { + pci_parse_name_usage(opt); + return; + } + + namestr = slotend + 1; + + if (strlen(namestr) > 3) { + pci_parse_name_usage(opt); + return; + } + + if (isalpha(*namestr)) { + prefix = *namestr++; + } + + if (!isdigit(*namestr)) { + pci_parse_name_usage(opt); + } else { + pslot = *namestr++ - '0'; + if (isnumber(*namestr)) { + pslot = 10*pslot + *namestr++ - '0'; + + } + if (isalpha(*namestr) && *(namestr + 1) == 0) { + suffix = *namestr; + pci_slotinfo[snum].si_titled = 1; + pci_slotinfo[snum].si_pslot = pslot; + pci_slotinfo[snum].si_prefix = prefix; + pci_slotinfo[snum].si_suffix = suffix; + + } else { + pci_parse_name_usage(opt); + } + } +} + +static void +pci_add_mptable_name(struct slotinfo *si) +{ + struct mptable_pci_slotinfo *ms; + + /* + * If naming information has been supplied for this slot, populate + * the next available mptable OEM entry + */ + if (si->si_titled) { + ms = &pci_devnames.md_slotinfo[devname_elems]; + + ms->mds_type = MPT_HDR_NAME; + ms->mds_phys_slot = si->si_pslot; + ms->mds_bus = si->si_devi->pi_bus; + ms->mds_slot = si->si_devi->pi_slot; + ms->mds_func = si->si_devi->pi_func; + ms->mds_vid = pci_get_cfgdata16(si->si_devi, PCIR_VENDOR); + ms->mds_did = pci_get_cfgdata16(si->si_devi, PCIR_DEVICE); + ms->mds_suffix[0] = si->si_suffix; + ms->mds_prefix[0] = si->si_prefix; + + devname_elems++; + } +} + +static void +pci_finish_mptable_names(void) +{ + int size; + + if (devname_elems) { + pci_devnames.md_hdrtype = MPT_HDR_BASE; + pci_devnames.md_entries = devname_elems; + pci_devnames.md_cksum = 0; /* XXX */ + pci_devnames.md_sig = MPT_NTAP_SIG; + + size = (uintptr_t)&pci_devnames.md_slotinfo[devname_elems] - + (uintptr_t)&pci_devnames; + + fbsdrun_add_oemtbl(&pci_devnames, size); + } +} + +static int +pci_emul_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + struct pci_devinst *pdi = arg; + struct pci_devemu *pe = pdi->pi_d; + int offset, i; + + for (i = 0; i <= PCI_BARMAX; i++) { + if (pdi->pi_bar[i].type == PCIBAR_IO && + port >= pdi->pi_bar[i].addr && + port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) { + offset = port - pdi->pi_bar[i].addr; + if (in) + *eax = (*pe->pe_ior)(pdi, i, offset, bytes); + else + (*pe->pe_iow)(pdi, i, offset, bytes, *eax); + return (0); + } + } + return (-1); +} + +static int +pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size, + uint64_t *addr) +{ + uint64_t base; + + assert((size & (size - 1)) == 0); /* must be a power of 2 */ + + base = roundup2(*baseptr, size); + + if (base + size <= limit) { + *addr = base; + *baseptr = base + size; + return (0); + } else + return (-1); +} + +int +pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, uint64_t hostbase, + enum pcibar_type type, uint64_t size) +{ + int i, error; + uint64_t *baseptr, limit, addr, mask, lobits, bar; + struct inout_port iop; + + assert(idx >= 0 && idx <= PCI_BARMAX); + + if ((size & (size - 1)) != 0) + size = 1UL << flsl(size); /* round up to a power of 2 */ + + switch (type) { + case PCIBAR_NONE: + baseptr = NULL; + addr = mask = lobits = 0; + break; + case PCIBAR_IO: + baseptr = &pci_emul_iobase; + limit = PCI_EMUL_IOLIMIT; + mask = PCIM_BAR_IO_BASE; + lobits = PCIM_BAR_IO_SPACE; + break; + case PCIBAR_MEM64: + 
/* + * XXX + * Some drivers do not work well if the 64-bit BAR is allocated + * above 4GB. Allow for this by allocating small requests under + * 4GB unless the allocation size is larger than some arbitrary + * number (32MB currently). + */ + if (size > 32 * 1024 * 1024) { + /* + * XXX special case for device requiring peer-to-peer DMA + */ + if (size == 0x100000000UL) + baseptr = &hostbase; + else + baseptr = &pci_emul_membase64; + limit = PCI_EMUL_MEMLIMIT64; + mask = PCIM_BAR_MEM_BASE; + lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | + PCIM_BAR_MEM_PREFETCH; + break; + } + /* fallthrough */ + case PCIBAR_MEM32: + baseptr = &pci_emul_membase32; + limit = PCI_EMUL_MEMLIMIT32; + mask = PCIM_BAR_MEM_BASE; + lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; + break; + default: + printf("pci_emul_alloc_bar: invalid bar type %d\n", type); + assert(0); + } + + if (baseptr != NULL) { + error = pci_emul_alloc_resource(baseptr, limit, size, &addr); + if (error != 0) + return (error); + } + + pdi->pi_bar[idx].type = type; + pdi->pi_bar[idx].addr = addr; + pdi->pi_bar[idx].size = size; + + /* Initialize the BAR register in config space */ + bar = (addr & mask) | lobits; + pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar); + + if (type == PCIBAR_MEM64) { + assert(idx + 1 <= PCI_BARMAX); + pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64; + pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32); + } + + /* add a handler to intercept accesses to the I/O bar */ + if (type == PCIBAR_IO) { + iop.name = pdi->pi_name; + iop.flags = IOPORT_F_INOUT; + iop.handler = pci_emul_handler; + iop.arg = pdi; + + for (i = 0; i < size; i++) { + iop.port = addr + i; + register_inout(&iop); + } + } + + return (0); +} + +#define CAP_START_OFFSET 0x40 +static int +pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen) +{ + int i, capoff, capid, reallen; + uint16_t sts; + + static u_char endofcap[4] = { + PCIY_RESERVED, 0, 0, 0 + }; + + assert(caplen > 0 && capdata[0] != PCIY_RESERVED); + + reallen = roundup2(caplen, 4); /* dword aligned */ + + sts = pci_get_cfgdata16(pi, PCIR_STATUS); + if ((sts & PCIM_STATUS_CAPPRESENT) == 0) { + capoff = CAP_START_OFFSET; + pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff); + pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT); + } else { + capoff = pci_get_cfgdata8(pi, PCIR_CAP_PTR); + while (1) { + assert((capoff & 0x3) == 0); + capid = pci_get_cfgdata8(pi, capoff); + if (capid == PCIY_RESERVED) + break; + capoff = pci_get_cfgdata8(pi, capoff + 1); + } + } + + /* Check if we have enough space */ + if (capoff + reallen + sizeof(endofcap) > PCI_REGMAX + 1) + return (-1); + + /* Copy the capability */ + for (i = 0; i < caplen; i++) + pci_set_cfgdata8(pi, capoff + i, capdata[i]); + + /* Set the next capability pointer */ + pci_set_cfgdata8(pi, capoff + 1, capoff + reallen); + + /* Copy of the reserved capability which serves as the end marker */ + for (i = 0; i < sizeof(endofcap); i++) + pci_set_cfgdata8(pi, capoff + reallen + i, endofcap[i]); + + return (0); +} + +static struct pci_devemu * +pci_emul_finddev(char *name) +{ + struct pci_devemu **pdpp, *pdp; + + SET_FOREACH(pdpp, pci_devemu_set) { + pdp = *pdpp; + if (!strcmp(pdp->pe_emu, name)) { + return (pdp); + } + } + + return (NULL); +} + +static void +pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int slot, char *params) +{ + struct pci_devinst *pdi; + pdi = malloc(sizeof(struct pci_devinst)); + bzero(pdi, sizeof(*pdi)); + + pdi->pi_vmctx = ctx; + pdi->pi_bus = 0; + pdi->pi_slot = slot; + pdi->pi_func = 0; + pdi->pi_d
= pde; + snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot); + + /* Disable legacy interrupts */ + pci_set_cfgdata8(pdi, PCIR_INTLINE, 255); + pci_set_cfgdata8(pdi, PCIR_INTPIN, 0); + + pci_set_cfgdata8(pdi, PCIR_COMMAND, + PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN); + + if ((*pde->pe_init)(ctx, pdi, params) != 0) { + free(pdi); + } else { + pci_emul_devices++; + pci_slotinfo[slot].si_devi = pdi; + } +} + +void +pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr) +{ + int mmc; + + CTASSERT(sizeof(struct msicap) == 14); + + /* Number of msi messages must be a power of 2 between 1 and 32 */ + assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32); + mmc = ffs(msgnum) - 1; + + bzero(msicap, sizeof(struct msicap)); + msicap->capid = PCIY_MSI; + msicap->nextptr = nextptr; + msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1); +} + +int +pci_emul_add_msicap(struct pci_devinst *pi, int msgnum) +{ + struct msicap msicap; + + pci_populate_msicap(&msicap, msgnum, 0); + + return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap))); +} + +void +msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val) +{ + uint16_t msgctrl, rwmask, msgdata, mme; + uint32_t addrlo; + + /* + * If guest is writing to the message control register make sure + * we do not overwrite read-only fields. + */ + if ((offset - capoff) == 2 && bytes == 2) { + rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE; + msgctrl = pci_get_cfgdata16(pi, offset); + msgctrl &= ~rwmask; + msgctrl |= val & rwmask; + val = msgctrl; + + addrlo = pci_get_cfgdata32(pi, capoff + 4); + if (msgctrl & PCIM_MSICTRL_64BIT) + msgdata = pci_get_cfgdata16(pi, capoff + 12); + else + msgdata = pci_get_cfgdata16(pi, capoff + 8); + + /* + * XXX check delivery mode, destination mode etc + */ + mme = msgctrl & PCIM_MSICTRL_MME_MASK; + pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0; + if (pi->pi_msi.enabled) { + pi->pi_msi.cpu = (addrlo >> 12) & 0xff; + pi->pi_msi.vector = msgdata & 0xff; + pi->pi_msi.msgnum = 1 << (mme >> 4); + } else { + pi->pi_msi.cpu = 0; + pi->pi_msi.vector = 0; + pi->pi_msi.msgnum = 0; + } + } + + CFGWRITE(pi, offset, val, bytes); +} + +/* + * This function assumes that 'coff' is in the capabilities region of the + * config space. 
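+ *
+ * Illustrative layout, assuming a single MSI capability was installed
+ * by pci_emul_add_capability() above: cfgdata[0x40] holds PCIY_MSI,
+ * cfgdata[0x41] points at 0x50 (14 bytes rounded up to 16), and
+ * cfgdata[0x50] holds the PCIY_RESERVED end marker that terminates
+ * the walk below.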
+ */ +static void +pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val) +{ + int capid; + uint8_t capoff, nextoff; + + /* Do not allow un-aligned writes */ + if ((offset & (bytes - 1)) != 0) + return; + + /* Find the capability that we want to update */ + capoff = CAP_START_OFFSET; + while (1) { + capid = pci_get_cfgdata8(pi, capoff); + if (capid == PCIY_RESERVED) + break; + + nextoff = pci_get_cfgdata8(pi, capoff + 1); + if (offset >= capoff && offset < nextoff) + break; + + capoff = nextoff; + } + assert(offset >= capoff); + + /* + * Capability ID and Next Capability Pointer are readonly + */ + if (offset == capoff || offset == capoff + 1) + return; + + switch (capid) { + case PCIY_MSI: + msicap_cfgwrite(pi, capoff, offset, bytes, val); + break; + default: + break; + } +} + +static int +pci_emul_iscap(struct pci_devinst *pi, int offset) +{ + int found; + uint16_t sts; + uint8_t capid, lastoff; + + found = 0; + sts = pci_get_cfgdata16(pi, PCIR_STATUS); + if ((sts & PCIM_STATUS_CAPPRESENT) != 0) { + lastoff = pci_get_cfgdata8(pi, PCIR_CAP_PTR); + while (1) { + assert((lastoff & 0x3) == 0); + capid = pci_get_cfgdata8(pi, lastoff); + if (capid == PCIY_RESERVED) + break; + lastoff = pci_get_cfgdata8(pi, lastoff + 1); + } + if (offset >= CAP_START_OFFSET && offset <= lastoff) + found = 1; + } + return (found); +} + +void +init_pci(struct vmctx *ctx) +{ + struct pci_devemu *pde; + struct slotinfo *si; + int i; + + pci_emul_iobase = PCI_EMUL_IOBASE; + pci_emul_membase32 = PCI_EMUL_MEMBASE32; + pci_emul_membase64 = PCI_EMUL_MEMBASE64; + + si = pci_slotinfo; + + for (i = 0; i < MAXSLOTS; i++, si++) { + if (si->si_name != NULL) { + pde = pci_emul_finddev(si->si_name); + if (pde != NULL) { + pci_emul_init(ctx, pde, i, si->si_param); + pci_add_mptable_name(si); + } + } + } + pci_finish_mptable_names(); +} + +int +pci_msi_enabled(struct pci_devinst *pi) +{ + return (pi->pi_msi.enabled); +} + +int +pci_msi_msgnum(struct pci_devinst *pi) +{ + if (pi->pi_msi.enabled) + return (pi->pi_msi.msgnum); + else + return (0); +} + +void +pci_generate_msi(struct pci_devinst *pi, int msg) +{ + + if (pci_msi_enabled(pi) && msg < pci_msi_msgnum(pi)) { + vm_lapic_irq(pi->pi_vmctx, + pi->pi_msi.cpu, + pi->pi_msi.vector + msg); + } +} + +static int cfgbus, cfgslot, cfgfunc, cfgoff; + +static int +pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + uint32_t x; + + assert(!in); + + if (bytes != 4) + return (-1); + + x = *eax; + cfgoff = x & PCI_REGMAX; + cfgfunc = (x >> 8) & PCI_FUNCMAX; + cfgslot = (x >> 11) & PCI_SLOTMAX; + cfgbus = (x >> 16) & PCI_BUSMAX; + + return (0); +} +INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_OUT, pci_emul_cfgaddr); + +static int +pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + struct pci_devinst *pi; + struct pci_devemu *pe; + int coff, idx; + uint64_t mask, bar; + + assert(bytes == 1 || bytes == 2 || bytes == 4); + + pi = pci_slotinfo[cfgslot].si_devi; + coff = cfgoff + (port - CONF1_DATA_PORT); + +#if 0 + printf("pcicfg-%s from 0x%0x of %d bytes (%d/%d/%d)\n\r", + in ? 
"read" : "write", coff, bytes, cfgbus, cfgslot, cfgfunc); +#endif + + if (pi == NULL || cfgfunc != 0) { + if (in) + *eax = 0xffffffff; + return (0); + } + + pe = pi->pi_d; + + /* + * Config read + */ + if (in) { + /* Let the device emulation override the default handler */ + if (pe->pe_cfgread != NULL && + (*pe->pe_cfgread)(ctx, vcpu, pi, coff, bytes, eax) == 0) + return (0); + + if (bytes == 1) + *eax = pci_get_cfgdata8(pi, coff); + else if (bytes == 2) + *eax = pci_get_cfgdata16(pi, coff); + else + *eax = pci_get_cfgdata32(pi, coff); + } else { + /* Let the device emulation override the default handler */ + if (pe->pe_cfgwrite != NULL && + (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0) + return (0); + + /* + * Special handling for write to BAR registers + */ + if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) { + /* + * Ignore writes to BAR registers that are not + * 4-byte aligned. + */ + if (bytes != 4 || (coff & 0x3) != 0) + return (0); + idx = (coff - PCIR_BAR(0)) / 4; + switch (pi->pi_bar[idx].type) { + case PCIBAR_NONE: + bar = 0; + break; + case PCIBAR_IO: + mask = ~(pi->pi_bar[idx].size - 1); + mask &= PCIM_BAR_IO_BASE; + bar = (*eax & mask) | PCIM_BAR_IO_SPACE; + break; + case PCIBAR_MEM32: + mask = ~(pi->pi_bar[idx].size - 1); + mask &= PCIM_BAR_MEM_BASE; + bar = *eax & mask; + bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; + break; + case PCIBAR_MEM64: + mask = ~(pi->pi_bar[idx].size - 1); + mask &= PCIM_BAR_MEM_BASE; + bar = *eax & mask; + bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | + PCIM_BAR_MEM_PREFETCH; + break; + case PCIBAR_MEMHI64: + mask = ~(pi->pi_bar[idx - 1].size - 1); + mask &= PCIM_BAR_MEM_BASE; + bar = ((uint64_t)*eax << 32) & mask; + bar = bar >> 32; + break; + default: + assert(0); + } + pci_set_cfgdata32(pi, coff, bar); + } else if (pci_emul_iscap(pi, coff)) { + pci_emul_capwrite(pi, coff, bytes, *eax); + } else { + CFGWRITE(pi, coff, *eax, bytes); + } + } + + return (0); +} + +INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata); +INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata); +INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata); +INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata); + +/* + * I/O ports to configure PCI IRQ routing. We ignore all writes to it. 
+ */ +static int +pci_irq_port_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + assert(in == 0); + return (0); +} +INOUT_PORT(pci_irq, 0xC00, IOPORT_F_OUT, pci_irq_port_handler); +INOUT_PORT(pci_irq, 0xC01, IOPORT_F_OUT, pci_irq_port_handler); + +#define PCI_EMUL_TEST +#ifdef PCI_EMUL_TEST +/* + * Define a dummy test device + */ +#define DREGSZ 20 +struct pci_emul_dsoftc { + uint8_t regs[DREGSZ]; +}; + +#define PCI_EMUL_MSGS 4 + +int +pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + int error; + struct pci_emul_dsoftc *sc; + + sc = malloc(sizeof(struct pci_emul_dsoftc)); + memset(sc, 0, sizeof(struct pci_emul_dsoftc)); + + pi->pi_arg = sc; + + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD); + pci_set_cfgdata8(pi, PCIR_CLASS, 0x02); + + error = pci_emul_alloc_bar(pi, 0, 0, PCIBAR_IO, DREGSZ); + assert(error == 0); + + error = pci_emul_add_msicap(pi, PCI_EMUL_MSGS); + assert(error == 0); + + return (0); +} + +void +pci_emul_diow(struct pci_devinst *pi, int baridx, int offset, int size, + uint32_t value) +{ + int i; + struct pci_emul_dsoftc *sc = pi->pi_arg; + + if (offset + size > DREGSZ) { + printf("diow: too large, offset %d size %d\n", offset, size); + return; + } + + if (size == 1) { + sc->regs[offset] = value & 0xff; + } else if (size == 2) { + *(uint16_t *)&sc->regs[offset] = value & 0xffff; + } else { + *(uint32_t *)&sc->regs[offset] = value; + } + + /* + * Special magic value to generate an interrupt + */ + if (offset == 4 && size == 4 && pci_msi_enabled(pi)) + pci_generate_msi(pi, value % pci_msi_msgnum(pi)); + + if (value == 0xabcdef) { + for (i = 0; i < pci_msi_msgnum(pi); i++) + pci_generate_msi(pi, i); + } +} + +uint32_t +pci_emul_dior(struct pci_devinst *pi, int baridx, int offset, int size) +{ + struct pci_emul_dsoftc *sc = pi->pi_arg; + uint32_t value; + + if (offset + size > DREGSZ) { + printf("dior: too large, offset %d size %d\n", offset, size); + return (0); + } + + if (size == 1) { + value = sc->regs[offset]; + } else if (size == 2) { + value = *(uint16_t *) &sc->regs[offset]; + } else { + value = *(uint32_t *) &sc->regs[offset]; + } + + return (value); +} + +struct pci_devemu pci_dummy = { + .pe_emu = "dummy", + .pe_init = pci_emul_dinit, + .pe_iow = pci_emul_diow, + .pe_ior = pci_emul_dior +}; +PCI_EMUL_SET(pci_dummy); + +#endif /* PCI_EMUL_TEST */ diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h new file mode 100644 index 000000000000..f5f8e228c55e --- /dev/null +++ b/usr.sbin/bhyve/pci_emul.h @@ -0,0 +1,171 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _PCI_EMUL_H_ +#define _PCI_EMUL_H_ + +#include <sys/types.h> +#include <sys/queue.h> +#include <sys/kernel.h> + +#include <dev/pci/pcireg.h> + +#include <assert.h> + +#define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */ +#define PCIY_RESERVED 0x00 + +struct vmctx; +struct pci_devinst; + +struct pci_devemu { + char *pe_emu; /* Name of device emulation */ + + /* instance creation */ + int (*pe_init)(struct vmctx *, struct pci_devinst *, char *opts); + + /* config space read/write callbacks */ + int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int offset, + int bytes, uint32_t val); + int (*pe_cfgread)(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int offset, + int bytes, uint32_t *retval); + + /* I/O space read/write callbacks */ + void (*pe_iow)(struct pci_devinst *pi, int baridx, + int offset, int size, uint32_t value); + uint32_t (*pe_ior)(struct pci_devinst *pi, int baridx, + int offset, int size); +}; +#define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x); + +enum pcibar_type { + PCIBAR_NONE, + PCIBAR_IO, + PCIBAR_MEM32, + PCIBAR_MEM64, + PCIBAR_MEMHI64 +}; + +struct pcibar { + enum pcibar_type type; /* io or memory */ + uint64_t size; + uint64_t addr; +}; + +#define PI_NAMESZ 40 + +struct pci_devinst { + struct pci_devemu *pi_d; + struct vmctx *pi_vmctx; + uint8_t pi_bus, pi_slot, pi_func; + char pi_name[PI_NAMESZ]; + uint16_t pi_iobase; + int pi_bar_getsize; + + struct { + int enabled; + int cpu; + int vector; + int msgnum; + } pi_msi; + + void *pi_arg; /* devemu-private data */ + + u_char pi_cfgdata[PCI_REGMAX + 1]; + struct pcibar pi_bar[PCI_BARMAX + 1]; +}; + +struct msicap { + uint8_t capid; + uint8_t nextptr; + uint16_t msgctrl; + uint32_t addrlo; + uint32_t addrhi; + uint16_t msgdata; +} __packed; + +void init_pci(struct vmctx *ctx); +void pci_parse_slot(char *opt); +void pci_parse_name(char *opt); +void pci_callback(void); +int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, uint64_t hostbase, + enum pcibar_type type, uint64_t size); +int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum); +void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val); + +void pci_generate_msi(struct pci_devinst *pi, int msgnum); +int pci_msi_enabled(struct pci_devinst *pi); +int pci_msi_msgnum(struct pci_devinst *pi); +void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr); + +static __inline void +pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val) +{ + assert(offset <= PCI_REGMAX); + *(uint8_t *)(pi->pi_cfgdata + offset) = val; +} + +static __inline void +pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val) +{ + assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); + *(uint16_t *)(pi->pi_cfgdata + offset) = val; +} + +static __inline void +pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val) +{ + assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); + *(uint32_t 
*)(pi->pi_cfgdata + offset) = val; +} + +static __inline uint8_t +pci_get_cfgdata8(struct pci_devinst *pi, int offset) +{ + assert(offset <= PCI_REGMAX); + return (*(uint8_t *)(pi->pi_cfgdata + offset)); +} + +static __inline uint16_t +pci_get_cfgdata16(struct pci_devinst *pi, int offset) +{ + assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); + return (*(uint16_t *)(pi->pi_cfgdata + offset)); +} + +static __inline uint32_t +pci_get_cfgdata32(struct pci_devinst *pi, int offset) +{ + assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); + return (*(uint32_t *)(pi->pi_cfgdata + offset)); +} + +#endif /* _PCI_EMUL_H_ */ diff --git a/usr.sbin/bhyve/pci_hostbridge.c b/usr.sbin/bhyve/pci_hostbridge.c new file mode 100644 index 000000000000..c77762d8f921 --- /dev/null +++ b/usr.sbin/bhyve/pci_hostbridge.c @@ -0,0 +1,52 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "pci_emul.h" + +static int +pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + + /* config space */ + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1275); /* NetApp */ + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1275); /* NetApp */ + pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_BRIDGE); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST); + + return (0); +} + +struct pci_devemu pci_de_hostbridge = { + .pe_emu = "hostbridge", + .pe_init = pci_hostbridge_init, +}; +PCI_EMUL_SET(pci_de_hostbridge); diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c new file mode 100644 index 000000000000..1c417fd58725 --- /dev/null +++ b/usr.sbin/bhyve/pci_passthru.c @@ -0,0 +1,508 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/pciio.h> +#include <sys/ioctl.h> + +#include <dev/io/iodev.h> +#include <machine/iodev.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> + +#include <machine/vmm.h> +#include <vmmapi.h> +#include "pci_emul.h" + +#ifndef _PATH_DEVPCI +#define _PATH_DEVPCI "/dev/pci" +#endif + +#ifndef _PATH_DEVIO +#define _PATH_DEVIO "/dev/io" +#endif + +#define LEGACY_SUPPORT 1 + +static int pcifd = -1; +static int iofd = -1; + +struct passthru_softc { + struct pci_devinst *psc_pi; + struct pcibar psc_bar[PCI_BARMAX + 1]; + struct { + int capoff; + int msgctrl; + int emulated; + } psc_msi; + struct pcisel psc_sel; +}; + +static int +msi_caplen(int msgctrl) +{ + int len; + + len = 10; /* minimum length of msi capability */ + + if (msgctrl & PCIM_MSICTRL_64BIT) + len += 4; + +#if 0 + /* + * Ignore the 'mask' and 'pending' bits in the MSI capability. + * We'll let the guest manipulate them directly. + */ + if (msgctrl & PCIM_MSICTRL_VECTOR) + len += 10; +#endif + + return (len); +} + +static uint32_t +read_config(const struct pcisel *sel, long reg, int width) +{ + struct pci_io pi; + + bzero(&pi, sizeof(pi)); + pi.pi_sel = *sel; + pi.pi_reg = reg; + pi.pi_width = width; + + if (ioctl(pcifd, PCIOCREAD, &pi) < 0) + return (0); /* XXX */ + else + return (pi.pi_data); +} + +static void +write_config(const struct pcisel *sel, long reg, int width, uint32_t data) +{ + struct pci_io pi; + + bzero(&pi, sizeof(pi)); + pi.pi_sel = *sel; + pi.pi_reg = reg; + pi.pi_width = width; + pi.pi_data = data; + + (void)ioctl(pcifd, PCIOCWRITE, &pi); /* XXX */ +} + +#ifdef LEGACY_SUPPORT +static int +passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr) +{ + int capoff, i; + struct msicap msicap; + u_char *capdata; + + pci_populate_msicap(&msicap, msgnum, nextptr); + + /* + * XXX + * Copy the msi capability structure in the last 16 bytes of the + * config space. This is wrong because it could shadow something + * useful to the device. 
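+	 * Worked out: sizeof(struct msicap) is 14, roundup(14, 4) is 16,
+	 * so capoff below always lands at 256 - 16 == 0xf0.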
+ */ + capoff = 256 - roundup(sizeof(msicap), 4); + capdata = (u_char *)&msicap; + for (i = 0; i < sizeof(msicap); i++) + pci_set_cfgdata8(pi, capoff + i, capdata[i]); + + return (capoff); +} +#endif /* LEGACY_SUPPORT */ + +static int +cfginitmsi(struct passthru_softc *sc) +{ + int ptr, cap, sts, caplen; + uint32_t u32; + struct pcisel sel; + struct pci_devinst *pi; + + pi = sc->psc_pi; + sel = sc->psc_sel; + + /* + * Parse the capabilities and cache the location of the MSI + * capability. + */ + sts = read_config(&sel, PCIR_STATUS, 2); + if (sts & PCIM_STATUS_CAPPRESENT) { + ptr = read_config(&sel, PCIR_CAP_PTR, 1); + while (ptr != 0 && ptr != 0xff) { + cap = read_config(&sel, ptr + PCICAP_ID, 1); + if (cap == PCIY_MSI) { + /* + * Copy the MSI capability into the config + * space of the emulated pci device + */ + sc->psc_msi.capoff = ptr; + sc->psc_msi.msgctrl = read_config(&sel, + ptr + 2, 2); + sc->psc_msi.emulated = 0; + caplen = msi_caplen(sc->psc_msi.msgctrl); + while (caplen > 0) { + u32 = read_config(&sel, ptr, 4); + pci_set_cfgdata32(pi, ptr, u32); + caplen -= 4; + ptr += 4; + } + break; + } + ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1); + } + } + +#ifdef LEGACY_SUPPORT + /* + * If the passthrough device does not support MSI then craft a + * MSI capability for it. We link the new MSI capability at the + * head of the list of capabilities. + */ + if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) { + int origptr, msiptr; + origptr = read_config(&sel, PCIR_CAP_PTR, 1); + msiptr = passthru_add_msicap(pi, 1, origptr); + sc->psc_msi.capoff = msiptr; + sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2); + sc->psc_msi.emulated = 1; + pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr); + } +#endif + + if (sc->psc_msi.capoff == 0) /* MSI or bust */ + return (-1); + else + return (0); +} + +static int +cfginitbar(struct vmctx *ctx, struct passthru_softc *sc) +{ + int i, error; + struct pci_devinst *pi; + struct pci_bar_io bar; + enum pcibar_type bartype; + uint64_t base; + + pi = sc->psc_pi; + + /* + * Initialize BAR registers + */ + for (i = 0; i <= PCI_BARMAX; i++) { + bzero(&bar, sizeof(bar)); + bar.pbi_sel = sc->psc_sel; + bar.pbi_reg = PCIR_BAR(i); + + if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0) + continue; + + if (PCI_BAR_IO(bar.pbi_base)) { + bartype = PCIBAR_IO; + base = bar.pbi_base & PCIM_BAR_IO_BASE; + } else { + switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) { + case PCIM_BAR_MEM_64: + bartype = PCIBAR_MEM64; + break; + default: + bartype = PCIBAR_MEM32; + break; + } + base = bar.pbi_base & PCIM_BAR_MEM_BASE; + } + + /* Cache information about the "real" BAR */ + sc->psc_bar[i].type = bartype; + sc->psc_bar[i].size = bar.pbi_length; + sc->psc_bar[i].addr = base; + + /* Allocate the BAR in the guest I/O or MMIO space */ + error = pci_emul_alloc_bar(pi, i, base, bartype, + bar.pbi_length); + if (error) + return (-1); + + /* + * Map the physical MMIO space in the guest MMIO space + */ + if (bartype != PCIBAR_IO) { + error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus, + sc->psc_sel.pc_dev, sc->psc_sel.pc_func, + pi->pi_bar[i].addr, pi->pi_bar[i].size, base); + if (error) + return (-1); + } + + /* + * 64-bit BAR takes up two slots so skip the next one. 
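+		 * E.g. a 64-bit BAR at PCIR_BAR(2) also consumes
+		 * PCIR_BAR(3); the upper half is typed PCIBAR_MEMHI64 so
+		 * later config-space writes recognize it is not a fresh BAR.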
+ */ + if (bartype == PCIBAR_MEM64) { + i++; + assert(i <= PCI_BARMAX); + sc->psc_bar[i].type = PCIBAR_MEMHI64; + } + } + return (0); +} + +static int +cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func) +{ + int error; + struct passthru_softc *sc; + + error = 1; + sc = pi->pi_arg; + + bzero(&sc->psc_sel, sizeof(struct pcisel)); + sc->psc_sel.pc_bus = bus; + sc->psc_sel.pc_dev = slot; + sc->psc_sel.pc_func = func; + + if (cfginitbar(ctx, sc) != 0) + goto done; + + if (cfginitmsi(sc) != 0) + goto done; + + error = 0; /* success */ +done: + return (error); +} + +static int +passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + int bus, slot, func, error; + struct passthru_softc *sc; + + sc = NULL; + error = 1; + + if (pcifd < 0) { + pcifd = open(_PATH_DEVPCI, O_RDWR, 0); + if (pcifd < 0) + goto done; + } + + if (iofd < 0) { + iofd = open(_PATH_DEVIO, O_RDWR, 0); + if (iofd < 0) + goto done; + } + + if (opts == NULL || sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3) + goto done; + + if (vm_assign_pptdev(ctx, bus, slot, func) != 0) + goto done; + + sc = malloc(sizeof(struct passthru_softc)); + memset(sc, 0, sizeof(struct passthru_softc)); + + pi->pi_arg = sc; + sc->psc_pi = pi; + + /* initialize config space */ + if (cfginit(ctx, pi, bus, slot, func) != 0) + goto done; + + error = 0; /* success */ +done: + if (error) { + free(sc); + vm_unassign_pptdev(ctx, bus, slot, func); + } + return (error); +} + +static int +bar_access(int coff) +{ + if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) + return (1); + else + return (0); +} + +static int +msicap_access(struct passthru_softc *sc, int coff) +{ + int caplen; + + if (sc->psc_msi.capoff == 0) + return (0); + + caplen = msi_caplen(sc->psc_msi.msgctrl); + + if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen) + return (1); + else + return (0); +} + +static int +passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, + int bytes, uint32_t *rv) +{ + struct passthru_softc *sc; + + sc = pi->pi_arg; + + /* + * PCI BARs and MSI capability is emulated. + */ + if (bar_access(coff) || msicap_access(sc, coff)) + return (-1); + +#ifdef LEGACY_SUPPORT + /* + * Emulate PCIR_CAP_PTR if this device does not support MSI capability + * natively. + */ + if (sc->psc_msi.emulated) { + if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4) + return (-1); + } +#endif + + /* Everything else just read from the device's config space */ + *rv = read_config(&sc->psc_sel, coff, bytes); + + return (0); +} + +static int +passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff, + int bytes, uint32_t val) +{ + int error; + struct passthru_softc *sc; + + sc = pi->pi_arg; + + /* + * PCI BARs are emulated + */ + if (bar_access(coff)) + return (-1); + + /* + * MSI capability is emulated + */ + if (msicap_access(sc, coff)) { + msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val); + + error = vm_setup_msi(ctx, vcpu, sc->psc_sel.pc_bus, + sc->psc_sel.pc_dev, sc->psc_sel.pc_func, pi->pi_msi.cpu, + pi->pi_msi.vector, pi->pi_msi.msgnum); + if (error != 0) { + printf("vm_setup_msi returned error %d\r\n", errno); + exit(1); + } + return (0); + } + +#ifdef LEGACY_SUPPORT + /* + * If this device does not support MSI natively then we cannot let + * the guest disable legacy interrupts from the device. It is the + * legacy interrupt that is triggering the virtual MSI to the guest. 
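+	 * Hence the masking below: a 2-byte guest write to PCIR_COMMAND
+	 * has PCIM_CMD_INTxDIS cleared before it reaches the real device.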
+ */ + if (sc->psc_msi.emulated && pci_msi_enabled(pi)) { + if (coff == PCIR_COMMAND && bytes == 2) + val &= ~PCIM_CMD_INTxDIS; + } +#endif + + write_config(&sc->psc_sel, coff, bytes, val); + + return (0); +} + +static void +passthru_iow(struct pci_devinst *pi, int baridx, int offset, int size, + uint32_t value) +{ + struct passthru_softc *sc; + struct iodev_pio_req pio; + + sc = pi->pi_arg; + + bzero(&pio, sizeof(struct iodev_pio_req)); + pio.access = IODEV_PIO_WRITE; + pio.port = sc->psc_bar[baridx].addr + offset; + pio.width = size; + pio.val = value; + + (void)ioctl(iofd, IODEV_PIO, &pio); +} + +static uint32_t +passthru_ior(struct pci_devinst *pi, int baridx, int offset, int size) +{ + struct passthru_softc *sc; + struct iodev_pio_req pio; + + sc = pi->pi_arg; + + bzero(&pio, sizeof(struct iodev_pio_req)); + pio.access = IODEV_PIO_READ; + pio.port = sc->psc_bar[baridx].addr + offset; + pio.width = size; + pio.val = 0; + + (void)ioctl(iofd, IODEV_PIO, &pio); + + return (pio.val); +} + +struct pci_devemu passthru = { + .pe_emu = "passthru", + .pe_init = passthru_init, + .pe_cfgwrite = passthru_cfgwrite, + .pe_cfgread = passthru_cfgread, + .pe_iow = passthru_iow, + .pe_ior = passthru_ior, +}; +PCI_EMUL_SET(passthru); diff --git a/usr.sbin/bhyve/pci_virtio_block.c b/usr.sbin/bhyve/pci_virtio_block.c new file mode 100644 index 000000000000..b86e21dff64e --- /dev/null +++ b/usr.sbin/bhyve/pci_virtio_block.c @@ -0,0 +1,502 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/linker_set.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/ioctl.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <strings.h> +#include <unistd.h> +#include <assert.h> +#include <pthread.h> + +#include "fbsdrun.h" +#include "pci_emul.h" +#include "virtio.h" + +#define VTBLK_RINGSZ 64 + +#define VTBLK_CFGSZ 28 + +#define VTBLK_R_CFG VTCFG_R_CFG0 +#define VTBLK_R_CFG_END VTBLK_R_CFG + VTBLK_CFGSZ -1 +#define VTBLK_R_MAX VTBLK_R_CFG_END + +#define VTBLK_REGSZ VTBLK_R_MAX+1 + +#define VTBLK_MAXSEGS 32 + +#define VTBLK_S_OK 0 +#define VTBLK_S_IOERR 1 + +/* + * Host capabilities + */ +#define VTBLK_S_HOSTCAPS \ + ( 0x00000004 | /* host maximum request segments */ \ + 0x10000000 ) /* supports indirect descriptors */ + +struct vring_hqueue { + /* Internal state */ + uint16_t hq_size; + uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */ + + /* Host-context pointers to the queue */ + struct virtio_desc *hq_dtable; + uint16_t *hq_avail_flags; + uint16_t *hq_avail_idx; /* monotonically increasing */ + uint16_t *hq_avail_ring; + + uint16_t *hq_used_flags; + uint16_t *hq_used_idx; /* monotonically increasing */ + struct virtio_used *hq_used_ring; +}; + +/* + * Config space + */ +struct vtblk_config { + uint64_t vbc_capacity; + uint32_t vbc_size_max; + uint32_t vbc_seg_max; + uint16_t vbc_geom_c; + uint8_t vbc_geom_h; + uint8_t vbc_geom_s; + uint32_t vbc_blk_size; + uint32_t vbc_sectors_max; +} __packed; +CTASSERT(sizeof(struct vtblk_config) == VTBLK_CFGSZ); + +/* + * Fixed-size block header + */ +struct virtio_blk_hdr { +#define VBH_OP_READ 0 +#define VBH_OP_WRITE 1 + uint32_t vbh_type; + uint32_t vbh_ioprio; + uint64_t vbh_sector; +} __packed; + +/* + * Debug printf + */ +static int pci_vtblk_debug; +#define DPRINTF(params) if (pci_vtblk_debug) printf params +#define WPRINTF(params) printf params + +/* + * Per-device softc + */ +struct pci_vtblk_softc { + struct pci_devinst *vbsc_pi; + int vbsc_fd; + int vbsc_status; + int vbsc_isr; + int vbsc_lastq; + uint32_t vbsc_features; + uint64_t vbsc_pfn; + struct vring_hqueue vbsc_q; + struct vtblk_config vbsc_cfg; +}; + +/* + * Return the number of available descriptors in the vring taking care + * of the 16-bit index wraparound. + */ +static int +hq_num_avail(struct vring_hqueue *hq) +{ + int ndesc; + + if (*hq->hq_avail_idx >= hq->hq_cur_aidx) + ndesc = *hq->hq_avail_idx - hq->hq_cur_aidx; + else + ndesc = UINT16_MAX - hq->hq_cur_aidx + *hq->hq_avail_idx + 1; + + assert(ndesc >= 0 && ndesc <= hq->hq_size); + + return (ndesc); +} + +static void +pci_vtblk_update_status(struct pci_vtblk_softc *sc, uint32_t value) +{ + if (value == 0) { + DPRINTF(("vtblk: device reset requested !\n")); + } + + sc->vbsc_status = value; +} + +static void +pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vring_hqueue *hq) +{ + struct iovec iov[VTBLK_MAXSEGS]; + struct virtio_blk_hdr *vbh; + struct virtio_desc *vd, *vid; + struct virtio_used *vu; + uint8_t *status; + int i; + int err; + int iolen; + int nsegs; + int uidx, aidx, didx; + int writeop; + off_t offset; + + uidx = *hq->hq_used_idx; + aidx = hq->hq_cur_aidx; + didx = hq->hq_avail_ring[aidx % hq->hq_size]; + assert(didx >= 0 && didx < hq->hq_size); + + vd = &hq->hq_dtable[didx]; + + /* + * Verify that the descriptor is indirect, and obtain + * the pointer to the indirect descriptor. 
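+	 * (Indirect descriptors are expected here because bit 28,
+	 * 0x10000000, is advertised in VTBLK_S_HOSTCAPS above; this is
+	 * the standard virtio indirect-descriptor feature bit.)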
+ * There has to be space for at least 3 descriptors + * in the indirect descriptor array: the block header, + * 1 or more data descriptors, and a status byte. + */ + assert(vd->vd_flags & VRING_DESC_F_INDIRECT); + + nsegs = vd->vd_len / sizeof(struct virtio_desc); + assert(nsegs >= 3); + assert(nsegs < VTBLK_MAXSEGS + 2); + + vid = paddr_guest2host(vd->vd_addr); + assert((vid->vd_flags & VRING_DESC_F_INDIRECT) == 0); + + /* + * The first descriptor will be the read-only fixed header + */ + vbh = paddr_guest2host(vid[0].vd_addr); + assert(vid[0].vd_len == sizeof(struct virtio_blk_hdr)); + assert(vid[0].vd_flags & VRING_DESC_F_NEXT); + assert((vid[0].vd_flags & VRING_DESC_F_WRITE) == 0); + + writeop = (vbh->vbh_type == VBH_OP_WRITE); + + offset = vbh->vbh_sector * DEV_BSIZE; + + /* + * Build up the iovec based on the guest's data descriptors + */ + for (i = 1, iolen = 0; i < nsegs - 1; i++) { + iov[i-1].iov_base = paddr_guest2host(vid[i].vd_addr); + iov[i-1].iov_len = vid[i].vd_len; + iolen += vid[i].vd_len; + + assert(vid[i].vd_flags & VRING_DESC_F_NEXT); + assert((vid[i].vd_flags & VRING_DESC_F_INDIRECT) == 0); + + /* + * - write op implies read-only descriptor, + * - read op implies write-only descriptor, + * therefore test the inverse of the descriptor bit + * to the op. + */ + assert(((vid[i].vd_flags & VRING_DESC_F_WRITE) == 0) == + writeop); + } + + /* Lastly, get the address of the status byte */ + status = paddr_guest2host(vid[nsegs - 1].vd_addr); + assert(vid[nsegs - 1].vd_len == 1); + assert((vid[nsegs - 1].vd_flags & VRING_DESC_F_NEXT) == 0); + assert(vid[nsegs - 1].vd_flags & VRING_DESC_F_WRITE); + + DPRINTF(("virtio-block: %s op, %d bytes, %d segs, offset %ld\n\r", + writeop ? "write" : "read", iolen, nsegs - 2, offset)); + + if (writeop){ + err = pwritev(sc->vbsc_fd, iov, nsegs - 2, offset); + } else { + err = preadv(sc->vbsc_fd, iov, nsegs - 2, offset); + } + + *status = err < 0 ? 
VTBLK_S_IOERR : VTBLK_S_OK; + + /* + * Return the single indirect descriptor back to the host + */ + vu = &hq->hq_used_ring[uidx % hq->hq_size]; + vu->vu_idx = didx; + vu->vu_tlen = 1; + hq->hq_cur_aidx++; + *hq->hq_used_idx += 1; +} + +static void +pci_vtblk_qnotify(struct pci_vtblk_softc *sc) +{ + struct vring_hqueue *hq = &sc->vbsc_q; + int i; + int ndescs; + + /* + * Calculate number of ring entries to process + */ + ndescs = hq_num_avail(hq); + + if (ndescs == 0) + return; + + /* + * Run through all the entries, placing them into iovecs and + * sending when an end-of-packet is found + */ + for (i = 0; i < ndescs; i++) + pci_vtblk_proc(sc, hq); + + /* + * Generate an interrupt if able + */ + if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0 && + sc->vbsc_isr == 0) { + sc->vbsc_isr = 1; + pci_generate_msi(sc->vbsc_pi, 0); + } + +} + +static void +pci_vtblk_ring_init(struct pci_vtblk_softc *sc, uint64_t pfn) +{ + struct vring_hqueue *hq; + + sc->vbsc_pfn = pfn << VRING_PFN; + + /* + * Set up host pointers to the various parts of the + * queue + */ + hq = &sc->vbsc_q; + hq->hq_size = VTBLK_RINGSZ; + + hq->hq_dtable = paddr_guest2host(pfn << VRING_PFN); + hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size); + hq->hq_avail_idx = hq->hq_avail_flags + 1; + hq->hq_avail_ring = hq->hq_avail_flags + 2; + hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring, + VRING_ALIGN); + hq->hq_used_idx = hq->hq_used_flags + 1; + hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2); + + /* + * Initialize queue indexes + */ + hq->hq_cur_aidx = 0; +} + +static int +pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct stat sbuf; + struct pci_vtblk_softc *sc; + int fd; + + if (opts == NULL) { + printf("virtio-block: backing device required\n"); + return (1); + } + + /* + * Access to guest memory is required. 
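+	 * paddr_guest2host(0) doubles as a cheap probe: it returns
+	 * NULL when guest physical memory has not yet been mapped
+	 * into this process.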
Fail if + * memory not mapped + */ + if (paddr_guest2host(0) == NULL) + return (1); + + /* + * The supplied backing file has to exist + */ + fd = open(opts, O_RDWR); + if (fd < 0) { + perror("Could not open backing file"); + return (1); + } + + if (fstat(fd, &sbuf) < 0) { + perror("Could not stat backing file"); + close(fd); + return (1); + } + + sc = malloc(sizeof(struct pci_vtblk_softc)); + memset(sc, 0, sizeof(struct pci_vtblk_softc)); + + pi->pi_arg = sc; + sc->vbsc_pi = pi; + sc->vbsc_fd = fd; + + /* setup virtio block config space */ + sc->vbsc_cfg.vbc_capacity = sbuf.st_size / DEV_BSIZE; + sc->vbsc_cfg.vbc_seg_max = VTBLK_MAXSEGS; + sc->vbsc_cfg.vbc_blk_size = DEV_BSIZE; + sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */ + sc->vbsc_cfg.vbc_geom_c = 0; /* no geometry */ + sc->vbsc_cfg.vbc_geom_h = 0; + sc->vbsc_cfg.vbc_geom_s = 0; + sc->vbsc_cfg.vbc_sectors_max = 0; + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK); + pci_emul_alloc_bar(pi, 0, 0, PCIBAR_IO, VTBLK_REGSZ); + pci_emul_add_msicap(pi, 1); + + return (0); +} + +static void +pci_vtblk_write(struct pci_devinst *pi, int baridx, int offset, int size, + uint32_t value) +{ + struct pci_vtblk_softc *sc = pi->pi_arg; + + if (offset + size > VTBLK_REGSZ) { + DPRINTF(("vtblk_write: 2big, offset %d size %d\n", + offset, size)); + return; + } + + switch (offset) { + case VTCFG_R_GUESTCAP: + assert(size == 4); + sc->vbsc_features = value & VTBLK_S_HOSTCAPS; + break; + case VTCFG_R_PFN: + assert(size == 4); + pci_vtblk_ring_init(sc, value); + break; + case VTCFG_R_QSEL: + assert(size == 2); + sc->vbsc_lastq = value; + break; + case VTCFG_R_QNOTIFY: + assert(size == 2); + assert(value == 0); + pci_vtblk_qnotify(sc); + break; + case VTCFG_R_STATUS: + assert(size == 1); + pci_vtblk_update_status(sc, value); + break; + case VTCFG_R_HOSTCAP: + case VTCFG_R_QNUM: + case VTCFG_R_ISR: + case VTBLK_R_CFG ... VTBLK_R_CFG_END: + DPRINTF(("vtblk: write to readonly reg %d\n\r", offset)); + break; + default: + DPRINTF(("vtblk: unknown i/o write offset %d\n\r", offset)); + value = 0; + break; + } +} + +uint32_t +pci_vtblk_read(struct pci_devinst *pi, int baridx, int offset, int size) +{ + struct pci_vtblk_softc *sc = pi->pi_arg; + uint32_t value; + + if (offset + size > VTBLK_REGSZ) { + DPRINTF(("vtblk_read: 2big, offset %d size %d\n", + offset, size)); + return (0); + } + + switch (offset) { + case VTCFG_R_HOSTCAP: + assert(size == 4); + value = VTBLK_S_HOSTCAPS; + break; + case VTCFG_R_GUESTCAP: + assert(size == 4); + value = sc->vbsc_features; /* XXX never read ? */ + break; + case VTCFG_R_PFN: + assert(size == 4); + value = sc->vbsc_pfn >> VRING_PFN; + break; + case VTCFG_R_QNUM: + value = (sc->vbsc_lastq == 0) ? VTBLK_RINGSZ: 0; + break; + case VTCFG_R_QSEL: + assert(size == 2); + value = sc->vbsc_lastq; /* XXX never read ? */ + break; + case VTCFG_R_QNOTIFY: + assert(size == 2); + value = 0; /* XXX never read ? */ + break; + case VTCFG_R_STATUS: + assert(size == 1); + value = sc->vbsc_status; + break; + case VTCFG_R_ISR: + assert(size == 1); + value = sc->vbsc_isr; + sc->vbsc_isr = 0; /* a read clears this flag */ + break; + case VTBLK_R_CFG ... 
VTBLK_R_CFG_END: + assert(size == 1); + value = *((uint8_t *)&sc->vbsc_cfg + offset - VTBLK_R_CFG); + break; + default: + DPRINTF(("vtblk: unknown i/o read offset %d\n\r", offset)); + value = 0; + break; + } + + return (value); +} + +struct pci_devemu pci_de_vblk = { + .pe_emu = "virtio-blk", + .pe_init = pci_vtblk_init, + .pe_iow = pci_vtblk_write, + .pe_ior = pci_vtblk_read, +}; +PCI_EMUL_SET(pci_de_vblk); diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c new file mode 100644 index 000000000000..5db1eb7f5a73 --- /dev/null +++ b/usr.sbin/bhyve/pci_virtio_net.c @@ -0,0 +1,739 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/linker_set.h> +#include <sys/select.h> +#include <sys/uio.h> +#include <sys/ioctl.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <strings.h> +#include <unistd.h> +#include <assert.h> +#include <md5.h> +#include <pthread.h> + +#include "fbsdrun.h" +#include "pci_emul.h" +#include "mevent.h" +#include "virtio.h" + +#define VTNET_RINGSZ 256 + +#define VTNET_MAXSEGS 32 + +/* + * PCI config-space register offsets + */ +#define VTNET_R_CFG0 20 +#define VTNET_R_CFG1 21 +#define VTNET_R_CFG2 22 +#define VTNET_R_CFG3 23 +#define VTNET_R_CFG4 24 +#define VTNET_R_CFG5 25 +#define VTNET_R_CFG6 26 +#define VTNET_R_CFG7 27 +#define VTNET_R_MAX 27 + +#define VTNET_REGSZ VTNET_R_MAX+1 + +/* + * Host capabilities + */ +#define VTNET_S_HOSTCAPS \ + ( 0x00000020 | /* host supplies MAC */ \ + 0x00008000 | /* host can merge Rx buffers */ \ + 0x00010000 ) /* config status available */ + +/* + * Queue definitions. 
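+ * Three virtqueues are defined: receive (0), transmit (1), and a
+ * control queue (2) that is not yet implemented and so reports a
+ * size of 0 from pci_vtnet_qsize().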
+ */ +#define VTNET_RXQ 0 +#define VTNET_TXQ 1 +#define VTNET_CTLQ 2 + +#define VTNET_MAXQ 3 + +struct vring_hqueue { + /* Internal state */ + uint16_t hq_size; + uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */ + + /* Host-context pointers to the queue */ + struct virtio_desc *hq_dtable; + uint16_t *hq_avail_flags; + uint16_t *hq_avail_idx; /* monotonically increasing */ + uint16_t *hq_avail_ring; + + uint16_t *hq_used_flags; + uint16_t *hq_used_idx; /* monotonically increasing */ + struct virtio_used *hq_used_ring; +}; + +/* + * Fixed network header size + */ +struct virtio_net_rxhdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; + uint16_t vrh_bufs; +} __packed; + +/* + * Debug printf + */ +static int pci_vtnet_debug; +#define DPRINTF(params) if (pci_vtnet_debug) printf params +#define WPRINTF(params) printf params + +/* + * Per-device softc + */ +struct pci_vtnet_softc { + struct pci_devinst *vsc_pi; + pthread_mutex_t vsc_mtx; + struct mevent *vsc_mevp; + + int vsc_curq; + int vsc_status; + int vsc_isr; + int vsc_tapfd; + int vsc_rx_ready; + int vsc_rxpend; + + uint32_t vsc_features; + uint8_t vsc_macaddr[6]; + + uint64_t vsc_pfn[VTNET_MAXQ]; + struct vring_hqueue vsc_hq[VTNET_MAXQ]; +}; + +/* + * Return the number of available descriptors in the vring taking care + * of the 16-bit index wraparound. + */ +static int +hq_num_avail(struct vring_hqueue *hq) +{ + int ndesc; + + if (*hq->hq_avail_idx >= hq->hq_cur_aidx) + ndesc = *hq->hq_avail_idx - hq->hq_cur_aidx; + else + ndesc = UINT16_MAX - hq->hq_cur_aidx + *hq->hq_avail_idx + 1; + + assert(ndesc >= 0 && ndesc <= hq->hq_size); + + return (ndesc); +} + +static uint16_t +pci_vtnet_qsize(int qnum) +{ + /* XXX no ctl queue currently */ + if (qnum == VTNET_CTLQ) { + return (0); + } + + /* XXX fixed currently. Maybe different for tx/rx/ctl */ + return (VTNET_RINGSZ); +} + +static void +pci_vtnet_update_status(struct pci_vtnet_softc *sc, uint32_t value) +{ + if (value == 0) { + DPRINTF(("vtnet: device reset requested !\n")); + } + + sc->vsc_status = value; +} + +/* + * Called to send a buffer chain out to the tap device + */ +static void +pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, + int len) +{ + char pad[60]; + + if (sc->vsc_tapfd == -1) + return; + + /* + * If the length is < 60, pad out to that and add the + * extra zero'd segment to the iov. It is guaranteed that + * there is always an extra iov available by the caller. + */ + if (len < 60) { + memset(pad, 0, 60 - len); + iov[iovcnt].iov_base = pad; + iov[iovcnt].iov_len = 60 - len; + iovcnt++; + } + (void) writev(sc->vsc_tapfd, iov, iovcnt); +} + +/* + * Called when there is read activity on the tap file descriptor. + * Each buffer posted by the guest is assumed to be able to contain + * an entire ethernet frame + rx header. + * MP note: the dummybuf is only used for discarding frames, so there + * is no need for it to be per-vtnet or locked. + */ +static uint8_t dummybuf[2048]; + +static void +pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) +{ + struct virtio_desc *vd; + struct virtio_used *vu; + struct vring_hqueue *hq; + struct virtio_net_rxhdr *vrx; + uint8_t *buf; + int i; + int len; + int ndescs; + int didx, uidx, aidx; /* descriptor, avail and used index */ + + /* + * Should never be called without a valid tap fd + */ + assert(sc->vsc_tapfd != -1); + + /* + * But, will be called when the rx ring hasn't yet + * been set up. 
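+ * In that case the frame is consumed into the scratch dummybuf
+ * and dropped, since there is nowhere in guest memory to put it.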
+ */
+	if (sc->vsc_rx_ready == 0) {
+		/*
+		 * Drop the packet and try later.
+		 */
+		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+		return;
+	}
+
+	/*
+	 * Calculate the number of available rx buffers
+	 */
+	hq = &sc->vsc_hq[VTNET_RXQ];
+
+	ndescs = hq_num_avail(hq);
+
+	if (ndescs == 0) {
+		/*
+		 * Need to wait for host notification to read
+		 */
+		if (sc->vsc_rxpend == 0) {
+			WPRINTF(("vtnet: no rx descriptors!\n"));
+			sc->vsc_rxpend = 1;
+		}
+
+		/*
+		 * Drop the packet and try later
+		 */
+		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+		return;
+	}
+
+	aidx = hq->hq_cur_aidx;
+	uidx = *hq->hq_used_idx;
+	for (i = 0; i < ndescs; i++) {
+		/*
+		 * 'aidx' indexes into an array of descriptor indexes
+		 */
+		didx = hq->hq_avail_ring[aidx % hq->hq_size];
+		assert(didx >= 0 && didx < hq->hq_size);
+
+		vd = &hq->hq_dtable[didx];
+
+		/*
+		 * Get a pointer to the rx header, and use the
+		 * data immediately following it for the packet buffer.
+		 */
+		vrx = (struct virtio_net_rxhdr *)paddr_guest2host(vd->vd_addr);
+		buf = (uint8_t *)(vrx + 1);
+
+		len = read(sc->vsc_tapfd, buf,
+			   vd->vd_len - sizeof(struct virtio_net_rxhdr));
+
+		if (len < 0 && errno == EWOULDBLOCK) {
+			break;
+		}
+
+		/*
+		 * The only valid field in the rx packet header is the
+		 * number of buffers, which is always 1 without TSO
+		 * support.
+		 */
+		memset(vrx, 0, sizeof(struct virtio_net_rxhdr));
+		vrx->vrh_bufs = 1;
+
+		/*
+		 * Write this descriptor into the used ring
+		 */
+		vu = &hq->hq_used_ring[uidx % hq->hq_size];
+		vu->vu_idx = didx;
+		vu->vu_tlen = len + sizeof(struct virtio_net_rxhdr);
+		uidx++;
+		aidx++;
+	}
+
+	/*
+	 * Update the used pointer, and signal an interrupt if allowed
+	 */
+	*hq->hq_used_idx = uidx;
+	hq->hq_cur_aidx = aidx;
+
+	if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
+		sc->vsc_isr |= 1;
+		pci_generate_msi(sc->vsc_pi, 0);
+	}
+}
+
+static void
+pci_vtnet_tap_callback(int fd, enum ev_type type, void *param)
+{
+	struct pci_vtnet_softc *sc = param;
+
+	pthread_mutex_lock(&sc->vsc_mtx);
+	pci_vtnet_tap_rx(sc);
+	pthread_mutex_unlock(&sc->vsc_mtx);
+}
+
+static void
+pci_vtnet_ping_rxq(struct pci_vtnet_softc *sc)
+{
+	/*
+	 * A qnotify means that the rx process can now begin
+	 */
+	if (sc->vsc_rx_ready == 0) {
+		sc->vsc_rx_ready = 1;
+	}
+
+	/*
+	 * If the rx queue was empty, attempt to receive a
+	 * packet that was previously blocked due to no rx bufs
+	 * available
+	 */
+	if (sc->vsc_rxpend) {
+		WPRINTF(("vtnet: rx resumed\n\r"));
+		sc->vsc_rxpend = 0;
+		pci_vtnet_tap_rx(sc);
+	}
+}
+
+static void
+pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vring_hqueue *hq)
+{
+	struct iovec iov[VTNET_MAXSEGS + 1];
+	struct virtio_desc *vd;
+	struct virtio_used *vu;
+	int i;
+	int plen;
+	int tlen;
+	int uidx, aidx, didx;
+
+	uidx = *hq->hq_used_idx;
+	aidx = hq->hq_cur_aidx;
+	didx = hq->hq_avail_ring[aidx % hq->hq_size];
+	assert(didx >= 0 && didx < hq->hq_size);
+
+	vd = &hq->hq_dtable[didx];
+
+	/*
+	 * Run through the chain of descriptors, ignoring the
+	 * first header descriptor. However, include the header
+	 * length in the total length that will be put into the
+	 * used queue.
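+	 * For example (illustrative sizes only): a 10-byte header
+	 * followed by data segments of 1500 and 42 bytes gives
+	 * plen = 1542 handed to the tap, but vu_tlen = 1552 reported
+	 * back in the used ring.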
+ */ + tlen = vd->vd_len; + vd = &hq->hq_dtable[vd->vd_next]; + + for (i = 0, plen = 0; + i < VTNET_MAXSEGS; + i++, vd = &hq->hq_dtable[vd->vd_next]) { + iov[i].iov_base = paddr_guest2host(vd->vd_addr); + iov[i].iov_len = vd->vd_len; + plen += vd->vd_len; + tlen += vd->vd_len; + + if ((vd->vd_flags & VRING_DESC_F_NEXT) == 0) + break; + } + assert(i < VTNET_MAXSEGS); + + DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, i + 1)); + pci_vtnet_tap_tx(sc, iov, i + 1, plen); + + /* + * Return this chain back to the host + */ + vu = &hq->hq_used_ring[uidx % hq->hq_size]; + vu->vu_idx = didx; + vu->vu_tlen = tlen; + hq->hq_cur_aidx = aidx + 1; + *hq->hq_used_idx = uidx + 1; + + /* + * Generate an interrupt if able + */ + if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { + sc->vsc_isr |= 1; + pci_generate_msi(sc->vsc_pi, 0); + } +} + +static void +pci_vtnet_ping_txq(struct pci_vtnet_softc *sc) +{ + struct vring_hqueue *hq = &sc->vsc_hq[VTNET_TXQ]; + int i; + int ndescs; + + /* + * Calculate number of ring entries to process + */ + ndescs = hq_num_avail(hq); + + if (ndescs == 0) + return; + + /* + * Run through all the entries, placing them into iovecs and + * sending when an end-of-packet is found + */ + for (i = 0; i < ndescs; i++) + pci_vtnet_proctx(sc, hq); +} + +static void +pci_vtnet_ping_ctlq(struct pci_vtnet_softc *sc) +{ + + DPRINTF(("vtnet: control qnotify!\n\r")); +} + +static void +pci_vtnet_ring_init(struct pci_vtnet_softc *sc, uint64_t pfn) +{ + struct vring_hqueue *hq; + int qnum = sc->vsc_curq; + + assert(qnum < VTNET_MAXQ); + + sc->vsc_pfn[qnum] = pfn << VRING_PFN; + + /* + * Set up host pointers to the various parts of the + * queue + */ + hq = &sc->vsc_hq[qnum]; + hq->hq_size = pci_vtnet_qsize(qnum); + + hq->hq_dtable = paddr_guest2host(pfn << VRING_PFN); + hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size); + hq->hq_avail_idx = hq->hq_avail_flags + 1; + hq->hq_avail_ring = hq->hq_avail_flags + 2; + hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring, + VRING_ALIGN); + hq->hq_used_idx = hq->hq_used_flags + 1; + hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2); + + /* + * Initialize queue indexes + */ + hq->hq_cur_aidx = 0; +} + +static int +pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + MD5_CTX mdctx; + unsigned char digest[16]; + char nstr[80]; + struct pci_vtnet_softc *sc; + + /* + * Access to guest memory is required. 
Fail if + * memory not mapped + */ + if (paddr_guest2host(0) == NULL) + return (1); + + sc = malloc(sizeof(struct pci_vtnet_softc)); + memset(sc, 0, sizeof(struct pci_vtnet_softc)); + + pi->pi_arg = sc; + sc->vsc_pi = pi; + + pthread_mutex_init(&sc->vsc_mtx, NULL); + + /* + * Attempt to open the tap device + */ + sc->vsc_tapfd = -1; + if (opts != NULL) { + char tbuf[80]; + + strcpy(tbuf, "/dev/"); + strncat(tbuf, opts, sizeof(tbuf) - strlen(tbuf)); + + sc->vsc_tapfd = open(tbuf, O_RDWR); + if (sc->vsc_tapfd == -1) { + WPRINTF(("open of tap device %s failed\n", tbuf)); + } else { + /* + * Set non-blocking and register for read + * notifications with the event loop + */ + int opt = 1; + if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) { + WPRINTF(("tap device O_NONBLOCK failed\n")); + close(sc->vsc_tapfd); + sc->vsc_tapfd = -1; + } + + sc->vsc_mevp = mevent_add(sc->vsc_tapfd, + EVF_READ, + pci_vtnet_tap_callback, + sc); + if (sc->vsc_mevp == NULL) { + WPRINTF(("Could not register event\n")); + close(sc->vsc_tapfd); + sc->vsc_tapfd = -1; + } + } + } + + /* + * The MAC address is the standard NetApp OUI of 00-a0-98, + * followed by an MD5 of the vm name. The slot number is + * prepended to this for slots other than 1, so that + * CFE can netboot from the equivalent of slot 1. + */ + if (pi->pi_slot == 1) { + strncpy(nstr, vmname, sizeof(nstr)); + } else { + snprintf(nstr, sizeof(nstr), "%d-%s", pi->pi_slot, vmname); + } + + MD5Init(&mdctx); + MD5Update(&mdctx, nstr, strlen(nstr)); + MD5Final(digest, &mdctx); + + sc->vsc_macaddr[0] = 0x00; + sc->vsc_macaddr[1] = 0xa0; + sc->vsc_macaddr[2] = 0x98; + sc->vsc_macaddr[3] = digest[0]; + sc->vsc_macaddr[4] = digest[1]; + sc->vsc_macaddr[5] = digest[2]; + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET); + pci_emul_alloc_bar(pi, 0, 0, PCIBAR_IO, VTNET_REGSZ); + pci_emul_add_msicap(pi, 1); + + return (0); +} + +/* + * Function pointer array to handle queue notifications + */ +static void (*pci_vtnet_qnotify[VTNET_MAXQ])(struct pci_vtnet_softc *) = { + pci_vtnet_ping_rxq, + pci_vtnet_ping_txq, + pci_vtnet_ping_ctlq +}; + +static void +pci_vtnet_write(struct pci_devinst *pi, int baridx, int offset, int size, + uint32_t value) +{ + struct pci_vtnet_softc *sc = pi->pi_arg; + + if (offset + size > VTNET_REGSZ) { + DPRINTF(("vtnet_write: 2big, offset %d size %d\n", + offset, size)); + return; + } + + pthread_mutex_lock(&sc->vsc_mtx); + + switch (offset) { + case VTCFG_R_GUESTCAP: + assert(size == 4); + sc->vsc_features = value & VTNET_S_HOSTCAPS; + break; + case VTCFG_R_PFN: + assert(size == 4); + pci_vtnet_ring_init(sc, value); + break; + case VTCFG_R_QSEL: + assert(size == 2); + assert(value < VTNET_MAXQ); + sc->vsc_curq = value; + break; + case VTCFG_R_QNOTIFY: + assert(size == 2); + assert(value < VTNET_MAXQ); + (*pci_vtnet_qnotify[value])(sc); + break; + case VTCFG_R_STATUS: + assert(size == 1); + pci_vtnet_update_status(sc, value); + break; + case VTNET_R_CFG0: + case VTNET_R_CFG1: + case VTNET_R_CFG2: + case VTNET_R_CFG3: + case VTNET_R_CFG4: + case VTNET_R_CFG5: + /* + * The driver is allowed to change the MAC address + */ + assert(size == 1); + sc->vsc_macaddr[offset - VTNET_R_CFG0] = value; + break; + case VTCFG_R_HOSTCAP: + case VTCFG_R_QNUM: + case VTCFG_R_ISR: + case VTNET_R_CFG6: + case VTNET_R_CFG7: + DPRINTF(("vtnet: write to readonly reg %d\n\r", 
offset)); + break; + default: + DPRINTF(("vtnet: unknown i/o write offset %d\n\r", offset)); + value = 0; + break; + } + + pthread_mutex_unlock(&sc->vsc_mtx); +} + +uint32_t +pci_vtnet_read(struct pci_devinst *pi, int baridx, int offset, int size) +{ + struct pci_vtnet_softc *sc = pi->pi_arg; + uint32_t value; + + if (offset + size > VTNET_REGSZ) { + DPRINTF(("vtnet_read: 2big, offset %d size %d\n", + offset, size)); + return (0); + } + + pthread_mutex_lock(&sc->vsc_mtx); + + switch (offset) { + case VTCFG_R_HOSTCAP: + assert(size == 4); + value = VTNET_S_HOSTCAPS; + break; + case VTCFG_R_GUESTCAP: + assert(size == 4); + value = sc->vsc_features; /* XXX never read ? */ + break; + case VTCFG_R_PFN: + assert(size == 4); + value = sc->vsc_pfn[sc->vsc_curq] >> VRING_PFN; + break; + case VTCFG_R_QNUM: + assert(size == 2); + value = pci_vtnet_qsize(sc->vsc_curq); + break; + case VTCFG_R_QSEL: + assert(size == 2); + value = sc->vsc_curq; /* XXX never read ? */ + break; + case VTCFG_R_QNOTIFY: + assert(size == 2); + value = sc->vsc_curq; /* XXX never read ? */ + break; + case VTCFG_R_STATUS: + assert(size == 1); + value = sc->vsc_status; + break; + case VTCFG_R_ISR: + assert(size == 1); + value = sc->vsc_isr; + sc->vsc_isr = 0; /* a read clears this flag */ + break; + case VTNET_R_CFG0: + case VTNET_R_CFG1: + case VTNET_R_CFG2: + case VTNET_R_CFG3: + case VTNET_R_CFG4: + case VTNET_R_CFG5: + assert(size == 1); + value = sc->vsc_macaddr[offset - VTNET_R_CFG0]; + break; + case VTNET_R_CFG6: + assert(size == 1); + value = 0x01; /* XXX link always up */ + break; + case VTNET_R_CFG7: + assert(size == 1); + value = 0; /* link status is in the LSB */ + break; + default: + DPRINTF(("vtnet: unknown i/o read offset %d\n\r", offset)); + value = 0; + break; + } + + pthread_mutex_unlock(&sc->vsc_mtx); + + return (value); +} + +struct pci_devemu pci_de_vnet = { + .pe_emu = "virtio-net", + .pe_init = pci_vtnet_init, + .pe_iow = pci_vtnet_write, + .pe_ior = pci_vtnet_read, +}; +PCI_EMUL_SET(pci_de_vnet); diff --git a/usr.sbin/bhyve/pit_8254.c b/usr.sbin/bhyve/pit_8254.c new file mode 100644 index 000000000000..b5101616b035 --- /dev/null +++ b/usr.sbin/bhyve/pit_8254.c @@ -0,0 +1,196 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/time.h> + +#include <machine/clock.h> + +#include <stdio.h> +#include <assert.h> + +#include "fbsdrun.h" +#include "inout.h" +#include "pit_8254.h" + +#define TIMER_SEL_MASK 0xc0 +#define TIMER_RW_MASK 0x30 +#define TIMER_MODE_MASK 0x0f +#define TIMER_SEL_READBACK 0xc0 + +#define TIMER_DIV(freq, hz) (((freq) + (hz) / 2) / (hz)) + +#define PIT_8254_FREQ 1193182 +static const int nsecs_per_tick = 1000000000 / PIT_8254_FREQ; + +struct counter { + struct timeval tv; /* uptime when counter was loaded */ + uint16_t initial; /* initial counter value */ + uint8_t cr[2]; + uint8_t ol[2]; + int crbyte; + int olbyte; +}; + +static void +timevalfix(struct timeval *t1) +{ + + if (t1->tv_usec < 0) { + t1->tv_sec--; + t1->tv_usec += 1000000; + } + if (t1->tv_usec >= 1000000) { + t1->tv_sec++; + t1->tv_usec -= 1000000; + } +} + +static void +timevalsub(struct timeval *t1, const struct timeval *t2) +{ + + t1->tv_sec -= t2->tv_sec; + t1->tv_usec -= t2->tv_usec; + timevalfix(t1); +} + +static void +latch(struct counter *c) +{ + struct timeval tv2; + uint16_t lval; + uint64_t delta_nsecs, delta_ticks; + + /* cannot latch a new value until the old one has been consumed */ + if (c->olbyte != 0) + return; + + if (c->initial == 0 || c->initial == 1) { + /* + * XXX the program that runs the VM can be stopped and + * restarted at any time. This means that state that was + * created by the guest is destroyed between invocations + * of the program. + * + * If the counter's initial value is not programmed we + * assume a value that would be set to generate 'guest_hz' + * interrupts per second. + */ + c->initial = TIMER_DIV(PIT_8254_FREQ, guest_hz); + gettimeofday(&c->tv, NULL); + } + + (void)gettimeofday(&tv2, NULL); + timevalsub(&tv2, &c->tv); + delta_nsecs = tv2.tv_sec * 1000000000 + tv2.tv_usec * 1000; + delta_ticks = delta_nsecs / nsecs_per_tick; + + lval = c->initial - delta_ticks % c->initial; + c->olbyte = 2; + c->ol[1] = lval; /* LSB */ + c->ol[0] = lval >> 8; /* MSB */ +} + +static int +pit_8254_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int sel, rw, mode; + uint8_t val; + struct counter *c; + + static struct counter counter[3]; + + if (bytes != 1) + return (-1); + + val = *eax; + + if (port == TIMER_MODE) { + assert(in == 0); + sel = val & TIMER_SEL_MASK; + rw = val & TIMER_RW_MASK; + mode = val & TIMER_MODE_MASK; + + if (sel == TIMER_SEL_READBACK) + return (-1); + if (rw != TIMER_LATCH && rw != TIMER_16BIT) + return (-1); + + if (rw != TIMER_LATCH) { + /* + * Counter mode is not affected when issuing a + * latch command. + */ + if (mode != TIMER_RATEGEN && mode != TIMER_SQWAVE) + return (-1); + } + + c = &counter[sel >> 6]; + if (rw == TIMER_LATCH) + latch(c); + else + c->olbyte = 0; /* reset latch after reprogramming */ + + return (0); + } + + /* counter ports */ + assert(port >= TIMER_CNTR0 && port <= TIMER_CNTR2); + c = &counter[port - TIMER_CNTR0]; + + if (in) { + /* + * XXX + * The spec says that once the output latch is completely + * read it should revert to "following" the counter. We don't + * do this because it is hard and any reasonable OS should + * always latch the counter before trying to read it. 
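+	 * (If that behaviour were ever needed, one sketch would be to
+	 * call latch(c) again when the latch runs dry, rather than
+	 * replaying the stale ol[] contents as is done below.)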
+ */ + if (c->olbyte == 0) + c->olbyte = 2; + *eax = c->ol[--c->olbyte]; + } else { + c->cr[c->crbyte++] = *eax; + if (c->crbyte == 2) { + c->crbyte = 0; + c->initial = c->cr[0] | (uint16_t)c->cr[1] << 8; + gettimeofday(&c->tv, NULL); + } + } + + return (0); +} + +INOUT_PORT(8254, TIMER_MODE, IOPORT_F_OUT, pit_8254_handler); +INOUT_PORT(8254, TIMER_CNTR0, IOPORT_F_INOUT, pit_8254_handler); +INOUT_PORT(8254, TIMER_CNTR1, IOPORT_F_INOUT, pit_8254_handler); +INOUT_PORT(8254, TIMER_CNTR2, IOPORT_F_INOUT, pit_8254_handler); diff --git a/usr.sbin/bhyve/pit_8254.h b/usr.sbin/bhyve/pit_8254.h new file mode 100644 index 000000000000..61bd15d13b1c --- /dev/null +++ b/usr.sbin/bhyve/pit_8254.h @@ -0,0 +1,45 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _PIT_8254_H_ +#define _PIT_8254_H_ + +/* + * Borrowed from amd64/include/timerreg.h because in that file it is + * conditionally compiled for #ifdef _KERNEL only. + */ + +#include <dev/ic/i8253reg.h> + +#define IO_TIMER1 0x40 /* 8253 Timer #1 */ +#define TIMER_CNTR0 (IO_TIMER1 + TIMER_REG_CNTR0) +#define TIMER_CNTR1 (IO_TIMER1 + TIMER_REG_CNTR1) +#define TIMER_CNTR2 (IO_TIMER1 + TIMER_REG_CNTR2) +#define TIMER_MODE (IO_TIMER1 + TIMER_REG_MODE) + +#endif /* _PIT_8254_H_ */ diff --git a/usr.sbin/bhyve/post.c b/usr.sbin/bhyve/post.c new file mode 100644 index 000000000000..092a551d87b3 --- /dev/null +++ b/usr.sbin/bhyve/post.c @@ -0,0 +1,51 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include <assert.h> + +#include "inout.h" + +static int +post_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + assert(in == 1); + + if (bytes != 1) + return (-1); + + *eax = 0xff; /* return some garbage */ + return (0); +} + +INOUT_PORT(post, 0x84, IOPORT_F_IN, post_data_handler); diff --git a/usr.sbin/bhyve/rtc.c b/usr.sbin/bhyve/rtc.c new file mode 100644 index 000000000000..a6f44e00bcd8 --- /dev/null +++ b/usr.sbin/bhyve/rtc.c @@ -0,0 +1,268 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/time.h> + +#include <stdio.h> +#include <time.h> +#include <assert.h> + +#include "inout.h" + +#define IO_RTC 0x70 + +#define RTC_SEC 0x00 /* seconds */ +#define RTC_MIN 0x02 +#define RTC_HRS 0x04 +#define RTC_WDAY 0x06 +#define RTC_DAY 0x07 +#define RTC_MONTH 0x08 +#define RTC_YEAR 0x09 +#define RTC_CENTURY 0x32 /* current century */ + +#define RTC_STATUSA 0xA +#define RTCSA_TUP 0x80 /* time update, don't look now */ + +#define RTC_STATUSB 0xB +#define RTCSB_DST 0x01 +#define RTCSB_24HR 0x02 +#define RTCSB_BIN 0x04 /* 0 = BCD, 1 = Binary */ +#define RTCSB_PINTR 0x40 /* 1 = enable periodic clock interrupt */ +#define RTCSB_HALT 0x80 /* stop clock updates */ + +#define RTC_INTR 0x0c /* status register C (R) interrupt source */ + +#define RTC_STATUSD 0x0d /* status register D (R) Lost Power */ +#define RTCSD_PWR 0x80 /* clock power OK */ + +#define RTC_DIAG 0x0e + +#define RTC_RSTCODE 0x0f + +static int addr; + +/* XXX initialize these to default values as they would be from BIOS */ +static uint8_t status_a, status_b, rstcode; + +static u_char const bin2bcd_data[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99 +}; +#define bin2bcd(bin) (bin2bcd_data[bin]) + +#define rtcout(val) ((status_b & RTCSB_BIN) ? (val) : bin2bcd((val))) + +static void +timevalfix(struct timeval *t1) +{ + + if (t1->tv_usec < 0) { + t1->tv_sec--; + t1->tv_usec += 1000000; + } + if (t1->tv_usec >= 1000000) { + t1->tv_sec++; + t1->tv_usec -= 1000000; + } +} + +static void +timevalsub(struct timeval *t1, const struct timeval *t2) +{ + + t1->tv_sec -= t2->tv_sec; + t1->tv_usec -= t2->tv_usec; + timevalfix(t1); +} + +static int +rtc_addr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + assert(in == 0); + + if (bytes != 1) + return (-1); + + switch (*eax) { + case RTC_SEC: + case RTC_MIN: + case RTC_HRS: + case RTC_WDAY: + case RTC_DAY: + case RTC_MONTH: + case RTC_YEAR: + case RTC_CENTURY: + case RTC_STATUSA: + case RTC_STATUSB: + case RTC_INTR: + case RTC_STATUSD: + case RTC_DIAG: + case RTC_RSTCODE: + break; + default: + return (-1); + } + + addr = *eax; + return (0); +} + +static int +rtc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int hour; + time_t t; + struct timeval cur, delta; + + static struct timeval last; + static struct tm tm; + + if (bytes != 1) + return (-1); + + gettimeofday(&cur, NULL); + + /* + * Increment the cached time only once per second so we can guarantee + * that the guest has at least one second to read the hour:min:sec + * separately and still get a coherent view of the time. 
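+ * For example, a guest that reads 23:59:59 just before a minute
+ * rollover still gets all three fields from the same cached
+ * timestamp, rather than a mix of old and new values.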
+ */ + delta = cur; + timevalsub(&delta, &last); + if (delta.tv_sec >= 1 && (status_b & RTCSB_HALT) == 0) { + t = cur.tv_sec; + localtime_r(&t, &tm); + last = cur; + } + + if (in) { + switch (addr) { + case RTC_SEC: + *eax = rtcout(tm.tm_sec); + return (0); + case RTC_MIN: + *eax = rtcout(tm.tm_min); + return (0); + case RTC_HRS: + if (status_b & RTCSB_24HR) + hour = tm.tm_hour; + else + hour = (tm.tm_hour % 12) + 1; + + *eax = rtcout(hour); + + /* + * If we are representing time in the 12-hour format + * then set the MSB to indicate PM. + */ + if ((status_b & RTCSB_24HR) == 0 && tm.tm_hour >= 12) + *eax |= 0x80; + + return (0); + case RTC_WDAY: + *eax = rtcout(tm.tm_wday + 1); + return (0); + case RTC_DAY: + *eax = rtcout(tm.tm_mday); + return (0); + case RTC_MONTH: + *eax = rtcout(tm.tm_mon + 1); + return (0); + case RTC_YEAR: + *eax = rtcout(tm.tm_year % 100); + return (0); + case RTC_CENTURY: + *eax = rtcout(tm.tm_year / 100); + break; + case RTC_STATUSA: + *eax = status_a; + return (0); + case RTC_INTR: + *eax = 0; + return (0); + case RTC_STATUSD: + *eax = RTCSD_PWR; + return (0); + case RTC_DIAG: + *eax = 0; + return (0); + case RTC_RSTCODE: + *eax = rstcode; + return (0); + default: + return (-1); + } + } + + switch (addr) { + case RTC_STATUSA: + status_a = *eax & ~RTCSA_TUP; + break; + case RTC_STATUSB: + /* XXX not implemented yet XXX */ + if (*eax & RTCSB_PINTR) + return (-1); + status_b = *eax; + break; + case RTC_RSTCODE: + rstcode = *eax; + break; + case RTC_SEC: + case RTC_MIN: + case RTC_HRS: + case RTC_WDAY: + case RTC_DAY: + case RTC_MONTH: + case RTC_YEAR: + case RTC_CENTURY: + /* + * Ignore writes to the time of day registers + */ + break; + default: + return (-1); + } + return (0); +} + +INOUT_PORT(rtc, IO_RTC, IOPORT_F_OUT, rtc_addr_handler); +INOUT_PORT(rtc, IO_RTC + 1, IOPORT_F_INOUT, rtc_data_handler); diff --git a/usr.sbin/bhyve/uart.c b/usr.sbin/bhyve/uart.c new file mode 100644 index 000000000000..640f3bf0f70d --- /dev/null +++ b/usr.sbin/bhyve/uart.c @@ -0,0 +1,60 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <assert.h> + +#include "inout.h" + +#define COM1 0x3F8 +#define COM2 0x2F8 + +#define REG_IIR 2 + +static int +com_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + assert(in); + + if (bytes != 1) + return (-1); + + /* + * COM port is not implemented so we return 0xFF for all registers + */ + *eax = 0xFF; + + return (0); +} + +INOUT_PORT(uart, COM1 + REG_IIR, IOPORT_F_IN, com_handler); +INOUT_PORT(uart, COM2 + REG_IIR, IOPORT_F_IN, com_handler); diff --git a/usr.sbin/bhyve/virtio.h b/usr.sbin/bhyve/virtio.h new file mode 100644 index 000000000000..474e244c5965 --- /dev/null +++ b/usr.sbin/bhyve/virtio.h @@ -0,0 +1,85 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VIRTIO_H_ +#define _VIRTIO_H_ + +#define VRING_ALIGN 4096 + +#define VRING_DESC_F_NEXT (1 << 0) +#define VRING_DESC_F_WRITE (1 << 1) +#define VRING_DESC_F_INDIRECT (1 << 2) + +#define VRING_AVAIL_F_NO_INTERRUPT 1 + +struct virtio_desc { + uint64_t vd_addr; + uint32_t vd_len; + uint16_t vd_flags; + uint16_t vd_next; +} __packed; + +struct virtio_used { + uint32_t vu_idx; + uint32_t vu_tlen; +} __packed; + +/* + * PFN register shift amount + */ +#define VRING_PFN 12 + +/* + * Virtio device types + */ +#define VIRTIO_TYPE_NET 1 +#define VIRTIO_TYPE_BLOCK 2 + +/* + * PCI vendor/device IDs + */ +#define VIRTIO_VENDOR 0x1AF4 +#define VIRTIO_DEV_NET 0x1000 +#define VIRTIO_DEV_BLOCK 0x1001 + +/* + * PCI config space constants + */ +#define VTCFG_R_HOSTCAP 0 +#define VTCFG_R_GUESTCAP 4 +#define VTCFG_R_PFN 8 +#define VTCFG_R_QNUM 12 +#define VTCFG_R_QSEL 14 +#define VTCFG_R_QNOTIFY 16 +#define VTCFG_R_STATUS 18 +#define VTCFG_R_ISR 19 +#define VTCFG_R_CFG0 20 /* No MSI-X */ +#define VTCFG_R_CFG1 24 /* With MSI-X */ +#define VTCFG_R_MSIX 20 + +#endif /* _VIRTIO_H_ */ diff --git a/usr.sbin/bhyve/xmsr.c b/usr.sbin/bhyve/xmsr.c new file mode 100644 index 000000000000..931b7d7f791c --- /dev/null +++ b/usr.sbin/bhyve/xmsr.c @@ -0,0 +1,261 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <machine/apicreg.h>
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "fbsdrun.h"
+#include "xmsr.h"
+
+/*
+ * Trampoline for hypervisor direct 64-bit jump.
+ *
+ *  0  - signature for guest->host verification
+ *  8  - kernel virtual address of trampoline
+ * 16  - instruction virtual address
+ * 24  - stack pointer virtual address
+ * 32  - CR3, physical address of kernel page table
+ * 40  - 24-byte area for null/code/data GDT entries
+ */
+#define MP_V64T_SIG	0xcafebabecafebabeULL
+struct mp_v64tramp {
+	uint64_t	mt_sig;
+	uint64_t	mt_virt;
+	uint64_t	mt_eip;
+	uint64_t	mt_rsp;
+	uint64_t	mt_cr3;
+	uint64_t	mt_gdtr[3];
+};
+
+/*
+ * CPU 0 is considered to be the BSP and is set to the RUNNING state.
+ * All other CPUs are set up in the INIT state.
+ */
+#define BSP	0
+enum cpu_bstate {
+	CPU_S_INIT,
+	CPU_S_SIPI,
+	CPU_S_RUNNING
+} static cpu_b[VM_MAXCPU] = { [BSP] = CPU_S_RUNNING };
+
+static void spinup_ap(struct vmctx *, int, int, uint64_t *);
+static void spinup_ap_direct64(struct vmctx *, int, uintptr_t, uint64_t *);
+
+int
+emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val)
+{
+	int dest;
+	int mode;
+	int thiscpu;
+	int vec;
+	int error, retval;
+	uint64_t rip;
+
+	retval = vcpu;
+	thiscpu = 1 << vcpu;
+
+	/*
+	 * The only MSR handled is the x2apic ICR register
+	 */
+	if (code != 0x830) {
+		printf("Unknown WRMSR code %x, val %lx, cpu %d\n",
+		    code, val, vcpu);
+		exit(1);
+	}
+
+	/*
+	 * The value written to the MSR will generate an IPI to
+	 * a set of CPUs. If this is a SIPI, create the initial
+	 * state for the CPU and switch to it. Otherwise, inject
+	 * an interrupt for the destination CPU(s), and request
+	 * a switch to the next available one by returning -1
+	 */
+	dest = val >> 32;
+	vec = val & APIC_VECTOR_MASK;
+	mode = val & APIC_DELMODE_MASK;
+
+	switch (mode) {
+	case APIC_DELMODE_INIT:
+		assert(dest != 0);
+		assert(dest < guest_ncpus);
+
+		/*
+		 * Ignore legacy de-assert INITs in x2apic mode
+		 */
+		if ((val & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) {
+			break;
+		}
+		assert(cpu_b[dest] == CPU_S_INIT);
+
+		/*
+		 * Move CPU to wait-for-SIPI state
+		 */
+		error = vcpu_reset(ctx, dest);
+		assert(error == 0);
+
+		cpu_b[dest] = CPU_S_SIPI;
+		break;
+
+	case APIC_DELMODE_STARTUP:
+		assert(dest != 0);
+		assert(dest < guest_ncpus);
+		/*
+		 * Ignore SIPIs in any state other than wait-for-SIPI
+		 */
+		if (cpu_b[dest] != CPU_S_SIPI) {
+			break;
+		}
+
+		/*
+		 * Bring up the AP and signal the main loop that it is
+		 * available and to switch to it.
+		 */
+		spinup_ap(ctx, dest, vec, &rip);
+		cpu_b[dest] = CPU_S_RUNNING;
+		fbsdrun_addcpu(ctx, dest, rip);
+		retval = dest;
+		break;
+
+	default:
+		printf("APIC delivery mode %lx not supported!\n",
+		    val & APIC_DELMODE_MASK);
+		exit(1);
+	}
+
+	return (retval);
+}
+
+/*
+ * There are 2 startup modes possible here:
+ * - if the CPU supports 'unrestricted guest' mode, the spinup can
+ *   set up the processor state in power-on 16-bit mode, with the CS:IP
+ *   init'd to the specified low-mem 4K page.
+ * - if the guest has requested a 64-bit trampoline in the low-mem 4K
+ *   page by placing the specified signature in it, set up the register
+ *   state from the values in the signature block. Note that this
+ *   requires accessing guest physical memory to read the signature
+ *   while 'unrestricted mode' does not.
+ */
+static void
+spinup_ap(struct vmctx *ctx, int newcpu, int vector, uint64_t *rip)
+{
+	int error;
+	uint16_t cs;
+	uint64_t desc_base;
+	uint32_t desc_limit, desc_access;
+
+	if (fbsdrun_vmexit_on_hlt()) {
+		error = vm_set_capability(ctx, newcpu, VM_CAP_HALT_EXIT, 1);
+		assert(error == 0);
+	}
+
+	if (fbsdrun_vmexit_on_pause()) {
+		error = vm_set_capability(ctx, newcpu, VM_CAP_PAUSE_EXIT, 1);
+		assert(error == 0);
+	}
+
+	error = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
+	if (error) {
+		/*
+		 * If the guest does not support real-mode execution then
+		 * we will bring up the AP directly in 64-bit mode.
+		 */
+		spinup_ap_direct64(ctx, newcpu, vector << PAGE_SHIFT, rip);
+	} else {
+		/*
+		 * Update the %cs and %rip of the guest so that it starts
+		 * executing real mode code at 'vector << 12'.
+		 */
+		*rip = 0;
+		error = vm_set_register(ctx, newcpu, VM_REG_GUEST_RIP, *rip);
+		assert(error == 0);
+
+		error = vm_get_desc(ctx, newcpu, VM_REG_GUEST_CS, &desc_base,
+		    &desc_limit, &desc_access);
+		assert(error == 0);
+
+		desc_base = vector << PAGE_SHIFT;
+		error = vm_set_desc(ctx, newcpu, VM_REG_GUEST_CS,
+		    desc_base, desc_limit, desc_access);
+		assert(error == 0);
+
+		cs = (vector << PAGE_SHIFT) >> 4;
+		error = vm_set_register(ctx, newcpu, VM_REG_GUEST_CS, cs);
+		assert(error == 0);
+	}
+}
+
+static void
+spinup_ap_direct64(struct vmctx *ctx, int newcpu, uintptr_t gaddr,
+    uint64_t *rip)
+{
+	struct mp_v64tramp *mvt;
+	char *errstr;
+	int error;
+	uint64_t gdtbase;
+
+	mvt = paddr_guest2host(gaddr);
+
+	assert(mvt->mt_sig == MP_V64T_SIG);
+
+	/*
+	 * Set up the 3-entry GDT using memory supplied in the
+	 * guest's trampoline structure.
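+	 * The resulting layout, matching the selectors a guest loader
+	 * is expected to use (e.g. %cs = 0x08, data segments = 0x10):
+	 *   entry 0 - null descriptor
+	 *   entry 1 - 64-bit code segment
+	 *   entry 2 - data segment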
+ */ + vm_setup_freebsd_gdt(mvt->mt_gdtr); + +#define CHECK_ERROR(msg) \ + if (error != 0) { \ + errstr = msg; \ + goto err_exit; \ + } + + /* entry point */ + *rip = mvt->mt_eip; + + /* Get the guest virtual address of the GDT */ + gdtbase = mvt->mt_virt + __offsetof(struct mp_v64tramp, mt_gdtr); + + error = vm_setup_freebsd_registers(ctx, newcpu, mvt->mt_eip, + mvt->mt_cr3, gdtbase, mvt->mt_rsp); + CHECK_ERROR("vm_setup_freebsd_registers"); + + return; +err_exit: + printf("spinup_ap_direct64: machine state error: %s", errstr); + exit(1); +} diff --git a/usr.sbin/bhyve/xmsr.h b/usr.sbin/bhyve/xmsr.h new file mode 100644 index 000000000000..8cebcea0cd55 --- /dev/null +++ b/usr.sbin/bhyve/xmsr.h @@ -0,0 +1,34 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _XMSR_H_ +#define _XMSR_H_ + +int emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val); + +#endif diff --git a/usr.sbin/vmmctl/Makefile b/usr.sbin/vmmctl/Makefile new file mode 100644 index 000000000000..1f529b561f1b --- /dev/null +++ b/usr.sbin/vmmctl/Makefile @@ -0,0 +1,15 @@ +# +# $FreeBSD$ +# + +PROG= vmmctl +SRCS= vmmctl.c + +NO_MAN= + +DPADD= ${LIBVMMAPI} +LDADD= -lvmmapi + +CFLAGS+= -I${.CURDIR}/../../sys/amd64/vmm + +.include <bsd.prog.mk> diff --git a/usr.sbin/vmmctl/sample.sh b/usr.sbin/vmmctl/sample.sh new file mode 100755 index 000000000000..f38d0dadb882 --- /dev/null +++ b/usr.sbin/vmmctl/sample.sh @@ -0,0 +1,75 @@ +#!/bin/sh + +# $FreeBSD$ + +VMMCTL="sudo ./vmmctl" +VMNAME=sample + +${VMMCTL} --vm=${VMNAME} --create +${VMMCTL} --vm=${VMNAME} --set-lowmem=128 --set-highmem=256 +${VMMCTL} --vm=${VMNAME} --get-lowmem --get-highmem + +CR0_PE=$((1 << 0)) +CR0_PG=$((1 << 31)) +CR0=$(($CR0_PE | $CR0_PG)) +${VMMCTL} --vm=${VMNAME} --set-cr0=${CR0} --get-cr0 + +# XXX this is bogus the value of %cr3 should come from the loader +CR3=0 +${VMMCTL} --vm=${VMNAME} --set-cr3=${CR3} --get-cr3 + +CR4_PAE=$((1 << 5)) +CR4=$((${CR4_PAE})) +${VMMCTL} --vm=${VMNAME} --set-cr4=${CR4} --get-cr4 + +DR7=0x00000400 # Table 9-1 from Intel Architecture Manual 3A +${VMMCTL} --vm=${VMNAME} --set-dr7=${DR7} --get-dr7 + +# +# XXX the values of rsp and rip are bogus and should come from the loader. 
+# +RSP=0xa5a5a5a5 +RIP=0x0000bfbfbfbf0000 +RFLAGS=0x2 +${VMMCTL} --vm=${VMNAME} --set-rsp=${RSP} --get-rsp +${VMMCTL} --vm=${VMNAME} --set-rip=${RIP} --get-rip +${VMMCTL} --vm=${VMNAME} --set-rflags=${RFLAGS} --get-rflags + +# Set "hidden" state of %cs descriptor to indicate long mode code segment. +# +# Note that this should match the contents of the entry pointed to by the +# segment selector in the GDTR. +# +${VMMCTL} --vm=${VMNAME} --set-desc-cs --desc-access=0x00002098 --get-desc-cs + +# Set "hidden" state of all data descriptors to indicate a usable segment. +# The only useful fields are the "Present" and "Descriptor Type" bits. +${VMMCTL} --vm=${VMNAME} --set-desc-ds --desc-access=0x00000090 --get-desc-ds +${VMMCTL} --vm=${VMNAME} --set-desc-es --desc-access=0x00000090 --get-desc-es +${VMMCTL} --vm=${VMNAME} --set-desc-fs --desc-access=0x00000090 --get-desc-fs +${VMMCTL} --vm=${VMNAME} --set-desc-gs --desc-access=0x00000090 --get-desc-gs +${VMMCTL} --vm=${VMNAME} --set-desc-ss --desc-access=0x00000090 --get-desc-ss + +# +# Set the code segment selector to point to entry at offset 8 in the GDTR. +# +${VMMCTL} --vm=${VMNAME} --set-cs=0x0008 --get-cs + +# Set all the remaining data segment selectors to point to entry at offset +# 16 in the GDTR. +${VMMCTL} --vm=${VMNAME} --set-ds=0x0010 --get-ds +${VMMCTL} --vm=${VMNAME} --set-es=0x0010 --get-es +${VMMCTL} --vm=${VMNAME} --set-fs=0x0010 --get-fs +${VMMCTL} --vm=${VMNAME} --set-gs=0x0010 --get-gs +${VMMCTL} --vm=${VMNAME} --set-ss=0x0010 --get-ss + +# XXX the value of the GDTR should come from the loader. +# Set the GDTR +GDTR_BASE=0xffff0000 +GDTR_LIMIT=0x10 +${VMMCTL} --vm=${VMNAME} --set-desc-gdtr --desc-base=${GDTR_BASE} --desc-limit=${GDTR_LIMIT} --get-desc-gdtr + +${VMMCTL} --vm=${VMNAME} --set-pinning=0 --get-pinning +${VMMCTL} --vm=${VMNAME} --set-pinning=-1 --get-pinning + +${VMMCTL} --vm=${VMNAME} --destroy diff --git a/usr.sbin/vmmctl/vmmctl.c b/usr.sbin/vmmctl/vmmctl.c new file mode 100644 index 000000000000..678f98b0734b --- /dev/null +++ b/usr.sbin/vmmctl/vmmctl.c @@ -0,0 +1,1485 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <sys/errno.h>
+#include <sys/mman.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <libutil.h>
+#include <fcntl.h>
+#include <string.h>
+#include <getopt.h>
+#include <assert.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "intel/vmcs.h"
+
+#define	MB	(1UL << 20)
+#define	GB	(1UL << 30)
+
+#define	REQ_ARG		required_argument
+#define	NO_ARG		no_argument
+#define	OPT_ARG		optional_argument
+
+static const char *progname;
+
+static void
+usage(void)
+{
+
+	(void)fprintf(stderr,
+	"Usage: %s --vm=<name>\n"
+	"       [--cpu=<vcpu_number>]\n"
+	"       [--create]\n"
+	"       [--destroy]\n"
+	"       [--get-stats]\n"
+	"       [--set-desc-ds]\n"
+	"       [--get-desc-ds]\n"
+	"       [--set-desc-es]\n"
+	"       [--get-desc-es]\n"
+	"       [--set-desc-gs]\n"
+	"       [--get-desc-gs]\n"
+	"       [--set-desc-fs]\n"
+	"       [--get-desc-fs]\n"
+	"       [--set-desc-cs]\n"
+	"       [--get-desc-cs]\n"
+	"       [--set-desc-ss]\n"
+	"       [--get-desc-ss]\n"
+	"       [--set-desc-tr]\n"
+	"       [--get-desc-tr]\n"
+	"       [--set-desc-ldtr]\n"
+	"       [--get-desc-ldtr]\n"
+	"       [--set-desc-gdtr]\n"
+	"       [--get-desc-gdtr]\n"
+	"       [--set-desc-idtr]\n"
+	"       [--get-desc-idtr]\n"
+	"       [--run]\n"
+	"       [--capname=<capname>]\n"
+	"       [--getcap]\n"
+	"       [--setcap=<0|1>]\n"
+	"       [--desc-base=<BASE>]\n"
+	"       [--desc-limit=<LIMIT>]\n"
+	"       [--desc-access=<ACCESS>]\n"
+	"       [--set-cr0=<CR0>]\n"
+	"       [--get-cr0]\n"
+	"       [--set-cr3=<CR3>]\n"
+	"       [--get-cr3]\n"
+	"       [--set-cr4=<CR4>]\n"
+	"       [--get-cr4]\n"
+	"       [--set-dr7=<DR7>]\n"
+	"       [--get-dr7]\n"
+	"       [--set-rsp=<RSP>]\n"
+	"       [--get-rsp]\n"
+	"       [--set-rip=<RIP>]\n"
+	"       [--get-rip]\n"
+	"       [--get-rax]\n"
+	"       [--set-rax=<RAX>]\n"
+	"       [--get-rbx]\n"
+	"       [--get-rcx]\n"
+	"       [--get-rdx]\n"
+	"       [--get-rsi]\n"
+	"       [--get-rdi]\n"
+	"       [--get-rbp]\n"
+	"       [--get-r8]\n"
+	"       [--get-r9]\n"
+	"       [--get-r10]\n"
+	"       [--get-r11]\n"
+	"       [--get-r12]\n"
+	"       [--get-r13]\n"
+	"       [--get-r14]\n"
+	"       [--get-r15]\n"
+	"       [--set-rflags=<RFLAGS>]\n"
+	"       [--get-rflags]\n"
+	"       [--set-cs=<CS>]\n"
+	"       [--get-cs]\n"
+	"       [--set-ds=<DS>]\n"
+	"       [--get-ds]\n"
+	"       [--set-es=<ES>]\n"
+	"       [--get-es]\n"
+	"       [--set-fs=<FS>]\n"
+	"       [--get-fs]\n"
+	"       [--set-gs=<GS>]\n"
+	"       [--get-gs]\n"
+	"       [--set-ss=<SS>]\n"
+	"       [--get-ss]\n"
+	"       [--get-tr]\n"
+	"       [--get-ldtr]\n"
+	"       [--get-vmcs-pinbased-ctls]\n"
+	"       [--get-vmcs-procbased-ctls]\n"
+	"       [--get-vmcs-procbased-ctls2]\n"
+	"       [--get-vmcs-entry-interruption-info]\n"
+	"       [--set-vmcs-entry-interruption-info=<info>]\n"
+	"       [--get-vmcs-eptp]\n"
+	"       [--get-vmcs-guest-physical-address]\n"
+	"       [--get-vmcs-guest-linear-address]\n"
+	"       [--set-vmcs-exception-bitmap=<bitmap>]\n"
+	"       [--get-vmcs-exception-bitmap]\n"
+	"       [--get-vmcs-io-bitmap-address]\n"
+	"       [--get-vmcs-tsc-offset]\n"
+	"       [--get-vmcs-guest-pat]\n"
+	"       [--get-vmcs-host-pat]\n"
+	"       [--get-vmcs-host-cr0]\n"
+	"       [--get-vmcs-host-cr3]\n"
+	"       [--get-vmcs-host-cr4]\n"
+	"       [--get-vmcs-host-rip]\n"
+	"       [--get-vmcs-host-rsp]\n"
+	"       [--get-vmcs-cr0-mask]\n"
+	"       [--get-vmcs-cr0-shadow]\n"
+	"       [--get-vmcs-cr4-mask]\n"
+	"       [--get-vmcs-cr4-shadow]\n"
+	"       [--get-vmcs-cr3-targets]\n"
+	"       [--get-vmcs-apic-access-address]\n"
+	"       [--get-vmcs-virtual-apic-address]\n"
+	"       [--get-vmcs-tpr-threshold]\n"
+	"       [--get-vmcs-msr-bitmap]\n"
+	"       [--get-vmcs-msr-bitmap-address]\n"
+	"       [--get-vmcs-vpid]\n"
+	"       [--get-vmcs-ple-gap]\n"
+	"       [--get-vmcs-ple-window]\n"
+	"       [--get-vmcs-instruction-error]\n"
+	"       [--get-vmcs-exit-ctls]\n"
+	"       [--get-vmcs-entry-ctls]\n"
+	"       [--get-vmcs-guest-sysenter]\n"
+	"       
[--get-vmcs-link]\n" + " [--get-vmcs-exit-reason]\n" + " [--get-vmcs-exit-qualification]\n" + " [--get-vmcs-exit-interruption-info]\n" + " [--get-vmcs-exit-interruption-error]\n" + " [--get-vmcs-interruptibility]\n" + " [--set-pinning=<host_cpuid>]\n" + " [--get-pinning]\n" + " [--set-lowmem=<memory below 4GB in units of MB>]\n" + " [--get-lowmem]\n" + " [--set-highmem=<memory above 4GB in units of MB>]\n" + " [--get-highmem]\n", + progname); + exit(1); +} + +static int get_stats, getcap, setcap, capval; +static const char *capname; +static int create, destroy, get_lowmem, get_highmem; +static uint64_t lowmem, highmem; +static int set_cr0, get_cr0, set_cr3, get_cr3, set_cr4, get_cr4; +static int set_efer, get_efer; +static int set_dr7, get_dr7; +static int set_rsp, get_rsp, set_rip, get_rip, set_rflags, get_rflags; +static int set_rax, get_rax; +static int get_rbx, get_rcx, get_rdx, get_rsi, get_rdi, get_rbp; +static int get_r8, get_r9, get_r10, get_r11, get_r12, get_r13, get_r14, get_r15; +static int set_desc_ds, get_desc_ds; +static int set_desc_es, get_desc_es; +static int set_desc_fs, get_desc_fs; +static int set_desc_gs, get_desc_gs; +static int set_desc_cs, get_desc_cs; +static int set_desc_ss, get_desc_ss; +static int set_desc_gdtr, get_desc_gdtr; +static int set_desc_idtr, get_desc_idtr; +static int set_desc_tr, get_desc_tr; +static int set_desc_ldtr, get_desc_ldtr; +static int set_cs, set_ds, set_es, set_fs, set_gs, set_ss, set_tr, set_ldtr; +static int get_cs, get_ds, get_es, get_fs, get_gs, get_ss, get_tr, get_ldtr; +static int set_pinning, get_pinning, pincpu; +static int run; + +/* + * VMCS-specific fields + */ +static int get_pinbased_ctls, get_procbased_ctls, get_procbased_ctls2; +static int get_eptp, get_io_bitmap, get_tsc_offset; +static int get_vmcs_entry_interruption_info, set_vmcs_entry_interruption_info; +static int get_vmcs_interruptibility; +uint32_t vmcs_entry_interruption_info; +static int get_vmcs_gpa, get_vmcs_gla; +static int get_exception_bitmap, set_exception_bitmap, exception_bitmap; +static int get_cr0_mask, get_cr0_shadow; +static int get_cr4_mask, get_cr4_shadow; +static int get_cr3_targets; +static int get_apic_access_addr, get_virtual_apic_addr, get_tpr_threshold; +static int get_msr_bitmap, get_msr_bitmap_address; +static int get_vpid, get_ple_gap, get_ple_window; +static int get_inst_err, get_exit_ctls, get_entry_ctls; +static int get_host_cr0, get_host_cr3, get_host_cr4; +static int get_host_rip, get_host_rsp; +static int get_guest_pat, get_host_pat; +static int get_guest_sysenter, get_vmcs_link; +static int get_vmcs_exit_reason, get_vmcs_exit_qualification; +static int get_vmcs_exit_interruption_info, get_vmcs_exit_interruption_error; + +static uint64_t desc_base; +static uint32_t desc_limit, desc_access; + +static void +dump_vm_run_exitcode(struct vm_exit *vmexit, int vcpu) +{ + printf("vm exit[%d]\n", vcpu); + printf("\trip\t\t0x%016lx\n", vmexit->rip); + printf("\tinst_length\t%d\n", vmexit->inst_length); + switch (vmexit->exitcode) { + case VM_EXITCODE_INOUT: + printf("\treason\t\tINOUT\n"); + printf("\tdirection\t%s\n", vmexit->u.inout.in ? "IN" : "OUT"); + printf("\tbytes\t\t%d\n", vmexit->u.inout.bytes); + printf("\tflags\t\t%s%s\n", + vmexit->u.inout.string ? "STRING " : "", + vmexit->u.inout.rep ? 
"REP " : ""); + printf("\tport\t\t0x%04x\n", vmexit->u.inout.port); + printf("\teax\t\t0x%08x\n", vmexit->u.inout.eax); + break; + case VM_EXITCODE_VMX: + printf("\treason\t\tVMX\n"); + printf("\terror\t\t%d\n", vmexit->u.vmx.error); + printf("\texit_reason\t0x%08x (%u)\n", + vmexit->u.vmx.exit_reason, vmexit->u.vmx.exit_reason); + printf("\tqualification\t0x%016lx\n", + vmexit->u.vmx.exit_qualification); + break; + default: + printf("*** unknown vm run exitcode %d\n", vmexit->exitcode); + break; + } +} + +static int +dump_vmcs_msr_bitmap(int vcpu, u_long addr) +{ + int error, fd, byte, bit, readable, writeable; + u_int msr; + const char *bitmap; + + error = -1; + bitmap = MAP_FAILED; + + fd = open("/dev/mem", O_RDONLY, 0); + if (fd < 0) + goto done; + + bitmap = mmap(NULL, PAGE_SIZE, PROT_READ, 0, fd, addr); + if (bitmap == MAP_FAILED) + goto done; + + for (msr = 0; msr < 0x2000; msr++) { + byte = msr / 8; + bit = msr & 0x7; + + /* Look at MSRs in the range 0x00000000 to 0x00001FFF */ + readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; + writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; + if (readable || writeable) { + printf("msr 0x%08x[%d]\t\t%c%c\n", msr, vcpu, + readable ? 'R' : '-', + writeable ? 'W' : '-'); + } + + /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */ + byte += 1024; + readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; + writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; + if (readable || writeable) { + printf("msr 0x%08x[%d]\t\t%c%c\n", + 0xc0000000 + msr, vcpu, + readable ? 'R' : '-', + writeable ? 'W' : '-'); + } + } + + error = 0; +done: + if (bitmap != MAP_FAILED) + munmap((void *)bitmap, PAGE_SIZE); + if (fd >= 0) + close(fd); + return (error); +} + +static int +vm_get_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t *ret_val) +{ + + return (vm_get_register(ctx, vcpu, VMCS_IDENT(field), ret_val)); +} + +static int +vm_set_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t val) +{ + + return (vm_set_register(ctx, vcpu, VMCS_IDENT(field), val)); +} + +enum { + VMNAME = 1000, /* avoid collision with return values from getopt */ + VCPU, + SET_LOWMEM, + SET_HIGHMEM, + SET_EFER, + SET_CR0, + SET_CR3, + SET_CR4, + SET_DR7, + SET_RSP, + SET_RIP, + SET_RAX, + SET_RFLAGS, + DESC_BASE, + DESC_LIMIT, + DESC_ACCESS, + SET_CS, + SET_DS, + SET_ES, + SET_FS, + SET_GS, + SET_SS, + SET_TR, + SET_LDTR, + SET_PINNING, + SET_VMCS_EXCEPTION_BITMAP, + SET_VMCS_ENTRY_INTERRUPTION_INFO, + SET_CAP, + CAPNAME, +}; + +int +main(int argc, char *argv[]) +{ + char *vmname; + int error, ch, vcpu; + vm_paddr_t hpa; + size_t len; + struct vm_exit vmexit; + uint64_t ctl, eptp, bm, tsc_off, addr, u64; + struct vmctx *ctx; + + uint64_t cr0, cr3, cr4, dr7, rsp, rip, rflags, efer, pat; + uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp; + uint64_t r8, r9, r10, r11, r12, r13, r14, r15; + uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; + + struct option opts[] = { + { "vm", REQ_ARG, 0, VMNAME }, + { "cpu", REQ_ARG, 0, VCPU }, + { "set-lowmem", REQ_ARG, 0, SET_LOWMEM }, + { "set-highmem",REQ_ARG, 0, SET_HIGHMEM }, + { "set-efer", REQ_ARG, 0, SET_EFER }, + { "set-cr0", REQ_ARG, 0, SET_CR0 }, + { "set-cr3", REQ_ARG, 0, SET_CR3 }, + { "set-cr4", REQ_ARG, 0, SET_CR4 }, + { "set-dr7", REQ_ARG, 0, SET_DR7 }, + { "set-rsp", REQ_ARG, 0, SET_RSP }, + { "set-rip", REQ_ARG, 0, SET_RIP }, + { "set-rax", REQ_ARG, 0, SET_RAX }, + { "set-rflags", REQ_ARG, 0, SET_RFLAGS }, + { "desc-base", REQ_ARG, 0, DESC_BASE }, + { "desc-limit", REQ_ARG, 0, DESC_LIMIT }, + { "desc-access",REQ_ARG, 0, DESC_ACCESS 
}, + { "set-cs", REQ_ARG, 0, SET_CS }, + { "set-ds", REQ_ARG, 0, SET_DS }, + { "set-es", REQ_ARG, 0, SET_ES }, + { "set-fs", REQ_ARG, 0, SET_FS }, + { "set-gs", REQ_ARG, 0, SET_GS }, + { "set-ss", REQ_ARG, 0, SET_SS }, + { "set-tr", REQ_ARG, 0, SET_TR }, + { "set-ldtr", REQ_ARG, 0, SET_LDTR }, + { "set-pinning",REQ_ARG, 0, SET_PINNING }, + { "set-vmcs-exception-bitmap", + REQ_ARG, 0, SET_VMCS_EXCEPTION_BITMAP }, + { "set-vmcs-entry-interruption-info", + REQ_ARG, 0, SET_VMCS_ENTRY_INTERRUPTION_INFO }, + { "capname", REQ_ARG, 0, CAPNAME }, + { "setcap", REQ_ARG, 0, SET_CAP }, + { "getcap", NO_ARG, &getcap, 1 }, + { "get-stats", NO_ARG, &get_stats, 1 }, + { "get-desc-ds",NO_ARG, &get_desc_ds, 1 }, + { "set-desc-ds",NO_ARG, &set_desc_ds, 1 }, + { "get-desc-es",NO_ARG, &get_desc_es, 1 }, + { "set-desc-es",NO_ARG, &set_desc_es, 1 }, + { "get-desc-ss",NO_ARG, &get_desc_ss, 1 }, + { "set-desc-ss",NO_ARG, &set_desc_ss, 1 }, + { "get-desc-cs",NO_ARG, &get_desc_cs, 1 }, + { "set-desc-cs",NO_ARG, &set_desc_cs, 1 }, + { "get-desc-fs",NO_ARG, &get_desc_fs, 1 }, + { "set-desc-fs",NO_ARG, &set_desc_fs, 1 }, + { "get-desc-gs",NO_ARG, &get_desc_gs, 1 }, + { "set-desc-gs",NO_ARG, &set_desc_gs, 1 }, + { "get-desc-tr",NO_ARG, &get_desc_tr, 1 }, + { "set-desc-tr",NO_ARG, &set_desc_tr, 1 }, + { "set-desc-ldtr", NO_ARG, &set_desc_ldtr, 1 }, + { "get-desc-ldtr", NO_ARG, &get_desc_ldtr, 1 }, + { "set-desc-gdtr", NO_ARG, &set_desc_gdtr, 1 }, + { "get-desc-gdtr", NO_ARG, &get_desc_gdtr, 1 }, + { "set-desc-idtr", NO_ARG, &set_desc_idtr, 1 }, + { "get-desc-idtr", NO_ARG, &get_desc_idtr, 1 }, + { "get-lowmem", NO_ARG, &get_lowmem, 1 }, + { "get-highmem",NO_ARG, &get_highmem, 1 }, + { "get-efer", NO_ARG, &get_efer, 1 }, + { "get-cr0", NO_ARG, &get_cr0, 1 }, + { "get-cr3", NO_ARG, &get_cr3, 1 }, + { "get-cr4", NO_ARG, &get_cr4, 1 }, + { "get-dr7", NO_ARG, &get_dr7, 1 }, + { "get-rsp", NO_ARG, &get_rsp, 1 }, + { "get-rip", NO_ARG, &get_rip, 1 }, + { "get-rax", NO_ARG, &get_rax, 1 }, + { "get-rbx", NO_ARG, &get_rbx, 1 }, + { "get-rcx", NO_ARG, &get_rcx, 1 }, + { "get-rdx", NO_ARG, &get_rdx, 1 }, + { "get-rsi", NO_ARG, &get_rsi, 1 }, + { "get-rdi", NO_ARG, &get_rdi, 1 }, + { "get-rbp", NO_ARG, &get_rbp, 1 }, + { "get-r8", NO_ARG, &get_r8, 1 }, + { "get-r9", NO_ARG, &get_r9, 1 }, + { "get-r10", NO_ARG, &get_r10, 1 }, + { "get-r11", NO_ARG, &get_r11, 1 }, + { "get-r12", NO_ARG, &get_r12, 1 }, + { "get-r13", NO_ARG, &get_r13, 1 }, + { "get-r14", NO_ARG, &get_r14, 1 }, + { "get-r15", NO_ARG, &get_r15, 1 }, + { "get-rflags", NO_ARG, &get_rflags, 1 }, + { "get-cs", NO_ARG, &get_cs, 1 }, + { "get-ds", NO_ARG, &get_ds, 1 }, + { "get-es", NO_ARG, &get_es, 1 }, + { "get-fs", NO_ARG, &get_fs, 1 }, + { "get-gs", NO_ARG, &get_gs, 1 }, + { "get-ss", NO_ARG, &get_ss, 1 }, + { "get-tr", NO_ARG, &get_tr, 1 }, + { "get-ldtr", NO_ARG, &get_ldtr, 1 }, + { "get-vmcs-pinbased-ctls", + NO_ARG, &get_pinbased_ctls, 1 }, + { "get-vmcs-procbased-ctls", + NO_ARG, &get_procbased_ctls, 1 }, + { "get-vmcs-procbased-ctls2", + NO_ARG, &get_procbased_ctls2, 1 }, + { "get-vmcs-guest-linear-address", + NO_ARG, &get_vmcs_gla, 1 }, + { "get-vmcs-guest-physical-address", + NO_ARG, &get_vmcs_gpa, 1 }, + { "get-vmcs-entry-interruption-info", + NO_ARG, &get_vmcs_entry_interruption_info, 1}, + { "get-vmcs-eptp", NO_ARG, &get_eptp, 1 }, + { "get-vmcs-exception-bitmap", + NO_ARG, &get_exception_bitmap, 1 }, + { "get-vmcs-io-bitmap-address", + NO_ARG, &get_io_bitmap, 1 }, + { "get-vmcs-tsc-offset", NO_ARG,&get_tsc_offset, 1 }, + { "get-vmcs-cr0-mask", NO_ARG, 
&get_cr0_mask, 1 }, + { "get-vmcs-cr0-shadow", NO_ARG,&get_cr0_shadow, 1 }, + { "get-vmcs-cr4-mask", NO_ARG, &get_cr4_mask, 1 }, + { "get-vmcs-cr4-shadow", NO_ARG,&get_cr4_shadow, 1 }, + { "get-vmcs-cr3-targets", NO_ARG, &get_cr3_targets, 1}, + { "get-vmcs-apic-access-address", + NO_ARG, &get_apic_access_addr, 1}, + { "get-vmcs-virtual-apic-address", + NO_ARG, &get_virtual_apic_addr, 1}, + { "get-vmcs-tpr-threshold", + NO_ARG, &get_tpr_threshold, 1 }, + { "get-vmcs-msr-bitmap", + NO_ARG, &get_msr_bitmap, 1 }, + { "get-vmcs-msr-bitmap-address", + NO_ARG, &get_msr_bitmap_address, 1 }, + { "get-vmcs-vpid", NO_ARG, &get_vpid, 1 }, + { "get-vmcs-ple-gap", NO_ARG, &get_ple_gap, 1 }, + { "get-vmcs-ple-window", NO_ARG,&get_ple_window,1 }, + { "get-vmcs-instruction-error", + NO_ARG, &get_inst_err, 1 }, + { "get-vmcs-exit-ctls", NO_ARG, &get_exit_ctls, 1 }, + { "get-vmcs-entry-ctls", + NO_ARG, &get_entry_ctls, 1 }, + { "get-vmcs-guest-pat", NO_ARG, &get_guest_pat, 1 }, + { "get-vmcs-host-pat", NO_ARG, &get_host_pat, 1 }, + { "get-vmcs-host-cr0", + NO_ARG, &get_host_cr0, 1 }, + { "get-vmcs-host-cr3", + NO_ARG, &get_host_cr3, 1 }, + { "get-vmcs-host-cr4", + NO_ARG, &get_host_cr4, 1 }, + { "get-vmcs-host-rip", + NO_ARG, &get_host_rip, 1 }, + { "get-vmcs-host-rsp", + NO_ARG, &get_host_rsp, 1 }, + { "get-vmcs-guest-sysenter", + NO_ARG, &get_guest_sysenter, 1 }, + { "get-vmcs-link", NO_ARG, &get_vmcs_link, 1 }, + { "get-vmcs-exit-reason", + NO_ARG, &get_vmcs_exit_reason, 1 }, + { "get-vmcs-exit-qualification", + NO_ARG, &get_vmcs_exit_qualification, 1 }, + { "get-vmcs-exit-interruption-info", + NO_ARG, &get_vmcs_exit_interruption_info, 1}, + { "get-vmcs-exit-interruption-error", + NO_ARG, &get_vmcs_exit_interruption_error, 1}, + { "get-vmcs-interruptibility", + NO_ARG, &get_vmcs_interruptibility, 1 }, + { "get-pinning",NO_ARG, &get_pinning, 1 }, + { "run", NO_ARG, &run, 1 }, + { "create", NO_ARG, &create, 1 }, + { "destroy", NO_ARG, &destroy, 1 }, + { NULL, 0, NULL, 0 } + }; + + vcpu = 0; + progname = basename(argv[0]); + + while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) { + switch (ch) { + case 0: + break; + case VMNAME: + vmname = optarg; + break; + case VCPU: + vcpu = atoi(optarg); + break; + case SET_LOWMEM: + lowmem = atoi(optarg) * MB; + lowmem = roundup(lowmem, 2 * MB); + break; + case SET_HIGHMEM: + highmem = atoi(optarg) * MB; + highmem = roundup(highmem, 2 * MB); + break; + case SET_EFER: + efer = strtoul(optarg, NULL, 0); + set_efer = 1; + break; + case SET_CR0: + cr0 = strtoul(optarg, NULL, 0); + set_cr0 = 1; + break; + case SET_CR3: + cr3 = strtoul(optarg, NULL, 0); + set_cr3 = 1; + break; + case SET_CR4: + cr4 = strtoul(optarg, NULL, 0); + set_cr4 = 1; + break; + case SET_DR7: + dr7 = strtoul(optarg, NULL, 0); + set_dr7 = 1; + break; + case SET_RSP: + rsp = strtoul(optarg, NULL, 0); + set_rsp = 1; + break; + case SET_RIP: + rip = strtoul(optarg, NULL, 0); + set_rip = 1; + break; + case SET_RAX: + rax = strtoul(optarg, NULL, 0); + set_rax = 1; + break; + case SET_RFLAGS: + rflags = strtoul(optarg, NULL, 0); + set_rflags = 1; + break; + case DESC_BASE: + desc_base = strtoul(optarg, NULL, 0); + break; + case DESC_LIMIT: + desc_limit = strtoul(optarg, NULL, 0); + break; + case DESC_ACCESS: + desc_access = strtoul(optarg, NULL, 0); + break; + case SET_CS: + cs = strtoul(optarg, NULL, 0); + set_cs = 1; + break; + case SET_DS: + ds = strtoul(optarg, NULL, 0); + set_ds = 1; + break; + case SET_ES: + es = strtoul(optarg, NULL, 0); + set_es = 1; + break; + case SET_FS: + fs = 
strtoul(optarg, NULL, 0); + set_fs = 1; + break; + case SET_GS: + gs = strtoul(optarg, NULL, 0); + set_gs = 1; + break; + case SET_SS: + ss = strtoul(optarg, NULL, 0); + set_ss = 1; + break; + case SET_TR: + tr = strtoul(optarg, NULL, 0); + set_tr = 1; + break; + case SET_LDTR: + ldtr = strtoul(optarg, NULL, 0); + set_ldtr = 1; + break; + case SET_PINNING: + pincpu = strtol(optarg, NULL, 0); + set_pinning = 1; + break; + case SET_VMCS_EXCEPTION_BITMAP: + exception_bitmap = strtoul(optarg, NULL, 0); + set_exception_bitmap = 1; + break; + case SET_VMCS_ENTRY_INTERRUPTION_INFO: + vmcs_entry_interruption_info = strtoul(optarg, NULL, 0); + set_vmcs_entry_interruption_info = 1; + break; + case SET_CAP: + capval = strtoul(optarg, NULL, 0); + setcap = 1; + break; + case CAPNAME: + capname = optarg; + break; + default: + usage(); + } + } + argc -= optind; + argv += optind; + + if (vmname == NULL) + usage(); + + error = 0; + + if (!error && create) + error = vm_create(vmname); + + if (!error) { + ctx = vm_open(vmname); + if (ctx == NULL) + error = -1; + } + + if (!error && lowmem) + error = vm_setup_memory(ctx, 0, lowmem, NULL); + + if (!error && highmem) + error = vm_setup_memory(ctx, 4 * GB, highmem, NULL); + + if (!error && set_efer) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_EFER, efer); + + if (!error && set_cr0) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR0, cr0); + + if (!error && set_cr3) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR3, cr3); + + if (!error && set_cr4) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR4, cr4); + + if (!error && set_dr7) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR7, dr7); + + if (!error && set_rsp) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RSP, rsp); + + if (!error && set_rip) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, rip); + + if (!error && set_rax) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, rax); + + if (!error && set_rflags) { + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RFLAGS, + rflags); + } + + if (!error && set_desc_ds) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_DS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_es) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_ES, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_ss) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_SS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_cs) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_CS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_fs) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_FS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_gs) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_tr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_TR, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_ldtr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_LDTR, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_gdtr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GDTR, + desc_base, desc_limit, 0); + } + + if (!error && set_desc_idtr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_IDTR, + desc_base, desc_limit, 0); + } + + if (!error && set_cs) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CS, cs); + + if (!error && set_ds) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DS, ds); + + if (!error && set_es) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_ES, es); + + if (!error 
&& set_fs) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_FS, fs); + + if (!error && set_gs) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_GS, gs); + + if (!error && set_ss) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_SS, ss); + + if (!error && set_tr) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_TR, tr); + + if (!error && set_ldtr) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_LDTR, ldtr); + + if (!error && set_pinning) + error = vm_set_pinning(ctx, vcpu, pincpu); + + if (!error && set_exception_bitmap) { + error = vm_set_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP, + exception_bitmap); + } + + if (!error && set_vmcs_entry_interruption_info) { + error = vm_set_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO, + vmcs_entry_interruption_info); + } + + if (!error && get_lowmem) { + error = vm_get_memory_seg(ctx, 0, &hpa, &len); + if (error == 0) + printf("lowmem\t\t0x%016lx/%ld\n", hpa, len); + } + + if (!error && get_highmem) { + error = vm_get_memory_seg(ctx, 4 * GB, &hpa, &len); + if (error == 0) + printf("highmem\t\t0x%016lx/%ld\n", hpa, len); + } + + if (!error && get_efer) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_EFER, &efer); + if (error == 0) + printf("efer[%d]\t\t0x%016lx\n", vcpu, efer); + } + + if (!error && get_cr0) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR0, &cr0); + if (error == 0) + printf("cr0[%d]\t\t0x%016lx\n", vcpu, cr0); + } + + if (!error && get_cr3) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR3, &cr3); + if (error == 0) + printf("cr3[%d]\t\t0x%016lx\n", vcpu, cr3); + } + + if (!error && get_cr4) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR4, &cr4); + if (error == 0) + printf("cr4[%d]\t\t0x%016lx\n", vcpu, cr4); + } + + if (!error && get_dr7) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR7, &dr7); + if (error == 0) + printf("dr7[%d]\t\t0x%016lx\n", vcpu, dr7); + } + + if (!error && get_rsp) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSP, &rsp); + if (error == 0) + printf("rsp[%d]\t\t0x%016lx\n", vcpu, rsp); + } + + if (!error && get_rip) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip); + if (error == 0) + printf("rip[%d]\t\t0x%016lx\n", vcpu, rip); + } + + if (!error && get_rax) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RAX, &rax); + if (error == 0) + printf("rax[%d]\t\t0x%016lx\n", vcpu, rax); + } + + if (!error && get_rbx) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBX, &rbx); + if (error == 0) + printf("rbx[%d]\t\t0x%016lx\n", vcpu, rbx); + } + + if (!error && get_rcx) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RCX, &rcx); + if (error == 0) + printf("rcx[%d]\t\t0x%016lx\n", vcpu, rcx); + } + + if (!error && get_rdx) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDX, &rdx); + if (error == 0) + printf("rdx[%d]\t\t0x%016lx\n", vcpu, rdx); + } + + if (!error && get_rsi) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSI, &rsi); + if (error == 0) + printf("rsi[%d]\t\t0x%016lx\n", vcpu, rsi); + } + + if (!error && get_rdi) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDI, &rdi); + if (error == 0) + printf("rdi[%d]\t\t0x%016lx\n", vcpu, rdi); + } + + if (!error && get_rbp) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBP, &rbp); + if (error == 0) + printf("rbp[%d]\t\t0x%016lx\n", vcpu, rbp); + } + + if (!error && get_r8) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R8, &r8); + if (error == 0) + printf("r8[%d]\t\t0x%016lx\n", vcpu, r8); + } + + if (!error && get_r9) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R9, &r9); 
+ if (error == 0) + printf("r9[%d]\t\t0x%016lx\n", vcpu, r9); + } + + if (!error && get_r10) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R10, &r10); + if (error == 0) + printf("r10[%d]\t\t0x%016lx\n", vcpu, r10); + } + + if (!error && get_r11) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R11, &r11); + if (error == 0) + printf("r11[%d]\t\t0x%016lx\n", vcpu, r11); + } + + if (!error && get_r12) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R12, &r12); + if (error == 0) + printf("r12[%d]\t\t0x%016lx\n", vcpu, r12); + } + + if (!error && get_r13) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R13, &r13); + if (error == 0) + printf("r13[%d]\t\t0x%016lx\n", vcpu, r13); + } + + if (!error && get_r14) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R14, &r14); + if (error == 0) + printf("r14[%d]\t\t0x%016lx\n", vcpu, r14); + } + + if (!error && get_r15) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R15, &r15); + if (error == 0) + printf("r15[%d]\t\t0x%016lx\n", vcpu, r15); + } + + if (!error && get_rflags) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RFLAGS, + &rflags); + if (error == 0) + printf("rflags[%d]\t0x%016lx\n", vcpu, rflags); + } + + if (!error && get_stats) { + int i, num_stats; + uint64_t *stats; + struct timeval tv; + const char *desc; + + stats = vm_get_stats(ctx, vcpu, &tv, &num_stats); + if (stats != NULL) { + printf("vcpu%d\n", vcpu); + for (i = 0; i < num_stats; i++) { + desc = vm_get_stat_desc(ctx, i); + printf("%-32s\t%ld\n", desc, stats[i]); + } + } + } + + if (!error && get_desc_ds) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_DS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("ds desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && get_desc_es) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_ES, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("es desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && get_desc_fs) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_FS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("fs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && get_desc_gs) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("gs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && get_desc_ss) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("ss desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && get_desc_cs) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_CS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("cs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && get_desc_tr) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("tr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && get_desc_ldtr) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_LDTR, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("ldtr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && 
get_desc_gdtr) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GDTR, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("gdtr[%d]\t\t0x%016lx/0x%08x\n", + vcpu, desc_base, desc_limit); + } + } + + if (!error && get_desc_idtr) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_IDTR, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("idtr[%d]\t\t0x%016lx/0x%08x\n", + vcpu, desc_base, desc_limit); + } + } + + if (!error && get_cs) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CS, &cs); + if (error == 0) + printf("cs[%d]\t\t0x%04lx\n", vcpu, cs); + } + + if (!error && get_ds) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DS, &ds); + if (error == 0) + printf("ds[%d]\t\t0x%04lx\n", vcpu, ds); + } + + if (!error && get_es) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_ES, &es); + if (error == 0) + printf("es[%d]\t\t0x%04lx\n", vcpu, es); + } + + if (!error && get_fs) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_FS, &fs); + if (error == 0) + printf("fs[%d]\t\t0x%04lx\n", vcpu, fs); + } + + if (!error && get_gs) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_GS, &gs); + if (error == 0) + printf("gs[%d]\t\t0x%04lx\n", vcpu, gs); + } + + if (!error && get_ss) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_SS, &ss); + if (error == 0) + printf("ss[%d]\t\t0x%04lx\n", vcpu, ss); + } + + if (!error && get_tr) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_TR, &tr); + if (error == 0) + printf("tr[%d]\t\t0x%04lx\n", vcpu, tr); + } + + if (!error && get_ldtr) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_LDTR, &ldtr); + if (error == 0) + printf("ldtr[%d]\t\t0x%04lx\n", vcpu, ldtr); + } + + if (!error && get_pinning) { + error = vm_get_pinning(ctx, vcpu, &pincpu); + if (error == 0) { + if (pincpu < 0) + printf("pincpu[%d]\tunpinned\n", vcpu); + else + printf("pincpu[%d]\t%d\n", vcpu, pincpu); + } + } + + if (!error && get_pinbased_ctls) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_PIN_BASED_CTLS, &ctl); + if (error == 0) + printf("pinbased_ctls[%d]\t0x%08x\n", vcpu, ctl); + } + + if (!error && get_procbased_ctls) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_PRI_PROC_BASED_CTLS, &ctl); + if (error == 0) + printf("procbased_ctls[%d]\t0x%08x\n", vcpu, ctl); + } + + if (!error && get_procbased_ctls2) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_SEC_PROC_BASED_CTLS, &ctl); + if (error == 0) + printf("procbased_ctls2[%d]\t0x%08x\n", vcpu, ctl); + } + + if (!error && get_vmcs_gla) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_LINEAR_ADDRESS, &u64); + if (error == 0) + printf("gla[%d]\t\t0x%016lx\n", vcpu, u64); + } + + if (!error && get_vmcs_gpa) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_PHYSICAL_ADDRESS, &u64); + if (error == 0) + printf("gpa[%d]\t\t0x%016lx\n", vcpu, u64); + } + + if (!error && get_vmcs_entry_interruption_info) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,&u64); + if (error == 0) { + printf("entry_interruption_info[%d]\t0x%08x\n", + vcpu, u64); + } + } + + if (!error && get_eptp) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EPTP, &eptp); + if (error == 0) + printf("eptp[%d]\t\t0x%016lx\n", vcpu, eptp); + } + + if (!error && get_exception_bitmap) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP, + &bm); + if (error == 0) + printf("exception_bitmap[%d]\t0x%08x\n", vcpu, bm); + } + + if (!error && get_io_bitmap) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_A, &bm); + if (error == 0) + printf("io_bitmap_a[%d]\t0x%08x\n", vcpu, bm); + error = 
vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_B, &bm);
+		if (error == 0)
+			printf("io_bitmap_b[%d]\t0x%08x\n", vcpu, bm);
+	}
+
+	if (!error && get_tsc_offset) {
+		uint64_t tscoff;
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_TSC_OFFSET, &tscoff);
+		if (error == 0)
+			printf("tsc_offset[%d]\t0x%016lx\n", vcpu, tscoff);
+	}
+
+	if (!error && get_cr0_mask) {
+		uint64_t cr0mask;
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_MASK, &cr0mask);
+		if (error == 0)
+			printf("cr0_mask[%d]\t\t0x%016lx\n", vcpu, cr0mask);
+	}
+
+	if (!error && get_cr0_shadow) {
+		uint64_t cr0shadow;
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_SHADOW,
+					  &cr0shadow);
+		if (error == 0)
+			printf("cr0_shadow[%d]\t\t0x%016lx\n", vcpu, cr0shadow);
+	}
+
+	if (!error && get_cr4_mask) {
+		uint64_t cr4mask;
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_MASK, &cr4mask);
+		if (error == 0)
+			printf("cr4_mask[%d]\t\t0x%016lx\n", vcpu, cr4mask);
+	}
+
+	if (!error && get_cr4_shadow) {
+		uint64_t cr4shadow;
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_SHADOW,
+					  &cr4shadow);
+		if (error == 0)
+			printf("cr4_shadow[%d]\t\t0x%016lx\n", vcpu, cr4shadow);
+	}
+
+	if (!error && get_cr3_targets) {
+		uint64_t target_count, target_addr;
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET_COUNT,
+					  &target_count);
+		if (error == 0) {
+			printf("cr3_target_count[%d]\t0x%08x\n",
+				vcpu, target_count);
+		}
+
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET0,
+					  &target_addr);
+		if (error == 0) {
+			printf("cr3_target0[%d]\t\t0x%016lx\n",
+				vcpu, target_addr);
+		}
+
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET1,
+					  &target_addr);
+		if (error == 0) {
+			printf("cr3_target1[%d]\t\t0x%016lx\n",
+				vcpu, target_addr);
+		}
+
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET2,
+					  &target_addr);
+		if (error == 0) {
+			printf("cr3_target2[%d]\t\t0x%016lx\n",
+				vcpu, target_addr);
+		}
+
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET3,
+					  &target_addr);
+		if (error == 0) {
+			printf("cr3_target3[%d]\t\t0x%016lx\n",
+				vcpu, target_addr);
+		}
+	}
+
+	if (!error && get_apic_access_addr) {
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_APIC_ACCESS, &addr);
+		if (error == 0)
+			printf("apic_access_addr[%d]\t0x%016lx\n", vcpu, addr);
+	}
+
+	if (!error && get_virtual_apic_addr) {
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_VIRTUAL_APIC, &addr);
+		if (error == 0)
+			printf("virtual_apic_addr[%d]\t0x%016lx\n", vcpu, addr);
+	}
+
+	if (!error && get_tpr_threshold) {
+		uint64_t threshold;
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_TPR_THRESHOLD,
+					  &threshold);
+		if (error == 0)
+			printf("tpr_threshold[%d]\t0x%08x\n", vcpu, threshold);
+	}
+
+	if (!error && get_msr_bitmap_address) {
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr);
+		if (error == 0)
+			printf("msr_bitmap[%d]\t\t0x%016lx\n", vcpu, addr);
+	}
+
+	if (!error && get_msr_bitmap) {
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr);
+		if (error == 0)
+			error = dump_vmcs_msr_bitmap(vcpu, addr);
+	}
+
+	if (!error && get_vpid) {
+		uint64_t vpid;
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid);
+		if (error == 0)
+			printf("vpid[%d]\t\t0x%04x\n", vcpu, vpid);
+	}
+
+	if (!error && get_ple_window) {
+		uint64_t window;
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_WINDOW, &window);
+		if (error == 0)
+			printf("ple_window[%d]\t\t0x%08x\n", vcpu, window);
+	}
+
+	if (!error && get_ple_gap) {
+		uint64_t gap;
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_GAP, &gap);
+		if (error == 0)
+			printf("ple_gap[%d]\t\t0x%08x\n", vcpu, gap);
+	}
+
+	if (!error && get_inst_err) {
+		uint64_t insterr;
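+		/*
+		 * As with the other VMCS accesses above, the field is read
+		 * through vm_get_vmcs_field(), which tags the field encoding
+		 * with VMCS_IDENT() and hands it to vm_get_register().  A
+		 * minimal standalone read (assuming an open context "ctx")
+		 * would look like:
+		 *
+		 *	uint64_t reason;
+		 *
+		 *	if (vm_get_vmcs_field(ctx, 0, VMCS_EXIT_REASON,
+		 *	    &reason) == 0)
+		 *		printf("exit reason 0x%lx\n", reason);
+		 */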
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_INSTRUCTION_ERROR,
+					  &insterr);
+		if (error == 0) {
+			printf("instruction_error[%d]\t0x%08x\n",
+				vcpu, insterr);
+		}
+	}
+
+	if (!error && get_exit_ctls) {
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_CTLS, &ctl);
+		if (error == 0)
+			printf("exit_ctls[%d]\t\t0x%08x\n", vcpu, ctl);
+	}
+
+	if (!error && get_entry_ctls) {
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_CTLS, &ctl);
+		if (error == 0)
+			printf("entry_ctls[%d]\t\t0x%08x\n", vcpu, ctl);
+	}
+
+	if (!error && get_host_pat) {
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_IA32_PAT, &pat);
+		if (error == 0)
+			printf("host_pat[%d]\t\t0x%016lx\n", vcpu, pat);
+	}
+
+	if (!error && get_guest_pat) {
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_PAT, &pat);
+		if (error == 0)
+			printf("guest_pat[%d]\t\t0x%016lx\n", vcpu, pat);
+	}
+
+	if (!error && get_host_cr0) {
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR0, &cr0);
+		if (error == 0)
+			printf("host_cr0[%d]\t\t0x%016lx\n", vcpu, cr0);
+	}
+
+	if (!error && get_host_cr3) {
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR3, &cr3);
+		if (error == 0)
+			printf("host_cr3[%d]\t\t0x%016lx\n", vcpu, cr3);
+	}
+
+	if (!error && get_host_cr4) {
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR4, &cr4);
+		if (error == 0)
+			printf("host_cr4[%d]\t\t0x%016lx\n", vcpu, cr4);
+	}
+
+	if (!error && get_host_rip) {
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RIP, &rip);
+		if (error == 0)
+			printf("host_rip[%d]\t\t0x%016lx\n", vcpu, rip);
+	}
+
+	if (!error && get_host_rsp) {
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RSP, &rsp);
+		if (error == 0)
+			printf("host_rsp[%d]\t\t0x%016lx\n", vcpu, rsp);
+	}
+
+	if (!error && get_guest_sysenter) {
+		error = vm_get_vmcs_field(ctx, vcpu,
+					  VMCS_GUEST_IA32_SYSENTER_CS, &cs);
+		if (error == 0)
+			printf("guest_sysenter_cs[%d]\t0x%08x\n", vcpu, cs);
+
+		error = vm_get_vmcs_field(ctx, vcpu,
+					  VMCS_GUEST_IA32_SYSENTER_ESP, &rsp);
+		if (error == 0)
+			printf("guest_sysenter_sp[%d]\t0x%016lx\n", vcpu, rsp);
+		error = vm_get_vmcs_field(ctx, vcpu,
+					  VMCS_GUEST_IA32_SYSENTER_EIP, &rip);
+		if (error == 0)
+			printf("guest_sysenter_ip[%d]\t0x%016lx\n", vcpu, rip);
+	}
+
+	if (!error && get_vmcs_link) {
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_LINK_POINTER, &addr);
+		if (error == 0)
+			printf("vmcs_pointer[%d]\t0x%016lx\n", vcpu, addr);
+	}
+
+	if (!error && get_vmcs_exit_reason) {
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_REASON, &u64);
+		if (error == 0)
+			printf("vmcs_exit_reason[%d]\t0x%016lx\n", vcpu, u64);
+	}
+
+	if (!error && get_vmcs_exit_qualification) {
+		error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_QUALIFICATION,
+					  &u64);
+		if (error == 0)
+			printf("vmcs_exit_qualification[%d]\t0x%016lx\n",
+				vcpu, u64);
+	}
+
+	if (!error && get_vmcs_exit_interruption_info) {
+		error = vm_get_vmcs_field(ctx, vcpu,
+					  VMCS_EXIT_INTERRUPTION_INFO, &u64);
+		if (error == 0) {
+			printf("vmcs_exit_interruption_info[%d]\t0x%08x\n",
+				vcpu, u64);
+		}
+	}
+
+	if (!error && get_vmcs_exit_interruption_error) {
+		error = vm_get_vmcs_field(ctx, vcpu,
+					  VMCS_EXIT_INTERRUPTION_ERROR, &u64);
+		if (error == 0) {
+			printf("vmcs_exit_interruption_error[%d]\t0x%08x\n",
+				vcpu, u64);
+		}
+	}
+
+	if (!error && get_vmcs_interruptibility) {
+		error = vm_get_vmcs_field(ctx, vcpu,
+					  VMCS_GUEST_INTERRUPTIBILITY, &u64);
+		if (error == 0) {
+			printf("vmcs_guest_interruptibility[%d]\t0x%08x\n",
+				vcpu, u64);
+		}
+	}
+
+	if (!error && setcap) {
+		int captype;
+		captype = vm_capability_name2type(capname);
+		error = vm_set_capability(ctx, vcpu, captype, capval);
+		if (error != 0 && errno == ENOENT)
+			printf("Capability \"%s\" is not available\n", capname);
+	}
+
+	if (!error && getcap) {
+		int captype, val;
+		captype = vm_capability_name2type(capname);
+		error = vm_get_capability(ctx, vcpu, captype, &val);
+		if (error == 0) {
+			printf("Capability \"%s\" is %s on vcpu %d\n", capname,
+			    val ? "set" : "not set", vcpu);
+		} else if (errno == ENOENT) {
+			printf("Capability \"%s\" is not available\n", capname);
+		}
+	}
+
+	if (!error && run) {
+		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
+		assert(error == 0);
+
+		error = vm_run(ctx, vcpu, rip, &vmexit);
+		if (error == 0)
+			dump_vm_run_exitcode(&vmexit, vcpu);
+		else
+			printf("vm_run error %d\n", error);
+	}
+
+	if (error)
+		printf("errno = %d\n", errno);
+
+	if (!error && destroy)
+		vm_destroy(ctx);
+
+	exit(error);
+}
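For reference, sample.sh above drives vmmctl, which is itself a thin wrapper
over the libvmmapi calls added by this change. A minimal sketch of the same
create/configure/run/destroy flow coded directly against libvmmapi
(illustrative only, not part of the commit; error handling is elided, and the
memory size and %rip value are the arbitrary placeholders from sample.sh):

	#include <sys/types.h>

	#include <stdio.h>

	#include <machine/vmm.h>
	#include <vmmapi.h>

	int
	main(void)
	{
		struct vmctx *ctx;
		struct vm_exit vmexit;
		uint64_t rip;

		if (vm_create("sample") != 0)		/* --create */
			return (1);
		if ((ctx = vm_open("sample")) == NULL)	/* --vm=sample */
			return (1);
		/* --set-lowmem=128 (bytes, rounded by the caller) */
		vm_setup_memory(ctx, 0, 128UL << 20, NULL);
		/* --set-rip=... */
		vm_set_register(ctx, 0, VM_REG_GUEST_RIP, 0x0000bfbfbfbf0000UL);
		vm_get_register(ctx, 0, VM_REG_GUEST_RIP, &rip);
		if (vm_run(ctx, 0, rip, &vmexit) == 0)	/* --run */
			printf("exitcode %d\n", vmexit.exitcode);
		vm_destroy(ctx);			/* --destroy */
		return (0);
	}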