authorPeter Grehan <grehan@FreeBSD.org>2011-05-13 04:54:01 +0000
committerPeter Grehan <grehan@FreeBSD.org>2011-05-13 04:54:01 +0000
commit366f60834ff8ef709f132fe8976c96a5e2caace9 (patch)
tree4af898a91c7d67e7068687610ebc68f1cbdf3b2e
parent9adee7d03f2e4c91e6330410f88fb5addaf2a24a (diff)
downloadsrc-366f60834ff8ef709f132fe8976c96a5e2caace9.tar.gz
src-366f60834ff8ef709f132fe8976c96a5e2caace9.zip
-rw-r--r--  lib/Makefile | 2
-rw-r--r--  lib/libvmmapi/Makefile | 9
-rw-r--r--  lib/libvmmapi/mptable.c | 336
-rw-r--r--  lib/libvmmapi/mptable.h | 171
-rw-r--r--  lib/libvmmapi/vmmapi.c | 647
-rw-r--r--  lib/libvmmapi/vmmapi.h | 98
-rw-r--r--  lib/libvmmapi/vmmapi_freebsd.c | 187
-rw-r--r--  share/mk/bsd.libnames.mk | 1
-rw-r--r--  sys/amd64/include/specialreg.h | 1
-rw-r--r--  sys/amd64/include/vmm.h | 268
-rw-r--r--  sys/amd64/include/vmm_dev.h | 191
-rw-r--r--  sys/amd64/vmm/amd/amdv.c | 247
-rw-r--r--  sys/amd64/vmm/intel/ept.c | 312
-rw-r--r--  sys/amd64/vmm/intel/ept.h | 42
-rw-r--r--  sys/amd64/vmm/intel/vmcs.c | 451
-rw-r--r--  sys/amd64/vmm/intel/vmcs.h | 324
-rw-r--r--  sys/amd64/vmm/intel/vmx.c | 1673
-rw-r--r--  sys/amd64/vmm/intel/vmx.h | 115
-rw-r--r--  sys/amd64/vmm/intel/vmx_controls.h | 92
-rw-r--r--  sys/amd64/vmm/intel/vmx_cpufunc.h | 199
-rw-r--r--  sys/amd64/vmm/intel/vmx_genassym.c | 81
-rw-r--r--  sys/amd64/vmm/intel/vmx_msr.c | 172
-rw-r--r--  sys/amd64/vmm/intel/vmx_msr.h | 78
-rw-r--r--  sys/amd64/vmm/intel/vmx_support.S | 204
-rw-r--r--  sys/amd64/vmm/intel/vtd.c | 637
-rw-r--r--  sys/amd64/vmm/io/iommu.c | 230
-rw-r--r--  sys/amd64/vmm/io/iommu.h | 67
-rw-r--r--  sys/amd64/vmm/io/ppt.c | 449
-rw-r--r--  sys/amd64/vmm/io/ppt.h | 40
-rw-r--r--  sys/amd64/vmm/io/vdev.c | 270
-rw-r--r--  sys/amd64/vmm/io/vdev.h | 84
-rw-r--r--  sys/amd64/vmm/io/vlapic.c | 812
-rw-r--r--  sys/amd64/vmm/io/vlapic.h | 105
-rw-r--r--  sys/amd64/vmm/vmm.c | 737
-rw-r--r--  sys/amd64/vmm/vmm_dev.c | 468
-rw-r--r--  sys/amd64/vmm/vmm_ipi.c | 103
-rw-r--r--  sys/amd64/vmm/vmm_ipi.h | 38
-rw-r--r--  sys/amd64/vmm/vmm_ktr.h | 51
-rw-r--r--  sys/amd64/vmm/vmm_lapic.c | 121
-rw-r--r--  sys/amd64/vmm/vmm_lapic.h | 64
-rw-r--r--  sys/amd64/vmm/vmm_mem.c | 413
-rw-r--r--  sys/amd64/vmm/vmm_mem.h | 38
-rw-r--r--  sys/amd64/vmm/vmm_msr.c | 264
-rw-r--r--  sys/amd64/vmm/vmm_msr.h | 42
-rw-r--r--  sys/amd64/vmm/vmm_stat.c | 103
-rw-r--r--  sys/amd64/vmm/vmm_stat.h | 71
-rw-r--r--  sys/amd64/vmm/vmm_support.S | 42
-rw-r--r--  sys/amd64/vmm/vmm_util.c | 111
-rw-r--r--  sys/amd64/vmm/vmm_util.h | 40
-rw-r--r--  sys/amd64/vmm/x86.c | 113
-rw-r--r--  sys/amd64/vmm/x86.h | 62
-rw-r--r--  sys/modules/Makefile | 2
-rw-r--r--  sys/modules/vmm/Makefile | 66
-rw-r--r--  usr.sbin/Makefile | 4
-rw-r--r--  usr.sbin/bhyve/Makefile | 18
-rw-r--r--  usr.sbin/bhyve/atpic.c | 68
-rw-r--r--  usr.sbin/bhyve/consport.c | 121
-rw-r--r--  usr.sbin/bhyve/dbgport.c | 124
-rw-r--r--  usr.sbin/bhyve/dbgport.h | 36
-rw-r--r--  usr.sbin/bhyve/elcr.c | 65
-rw-r--r--  usr.sbin/bhyve/fbsdrun.c | 650
-rw-r--r--  usr.sbin/bhyve/fbsdrun.h | 53
-rw-r--r--  usr.sbin/bhyve/inout.c | 98
-rw-r--r--  usr.sbin/bhyve/inout.h | 64
-rw-r--r--  usr.sbin/bhyve/mevent.c | 419
-rw-r--r--  usr.sbin/bhyve/mevent.h | 49
-rw-r--r--  usr.sbin/bhyve/mevent_test.c | 180
-rw-r--r--  usr.sbin/bhyve/pci_emul.c | 976
-rw-r--r--  usr.sbin/bhyve/pci_emul.h | 171
-rw-r--r--  usr.sbin/bhyve/pci_hostbridge.c | 52
-rw-r--r--  usr.sbin/bhyve/pci_passthru.c | 508
-rw-r--r--  usr.sbin/bhyve/pci_virtio_block.c | 502
-rw-r--r--  usr.sbin/bhyve/pci_virtio_net.c | 739
-rw-r--r--  usr.sbin/bhyve/pit_8254.c | 196
-rw-r--r--  usr.sbin/bhyve/pit_8254.h | 45
-rw-r--r--  usr.sbin/bhyve/post.c | 51
-rw-r--r--  usr.sbin/bhyve/rtc.c | 268
-rw-r--r--  usr.sbin/bhyve/uart.c | 60
-rw-r--r--  usr.sbin/bhyve/virtio.h | 85
-rw-r--r--  usr.sbin/bhyve/xmsr.c | 261
-rw-r--r--  usr.sbin/bhyve/xmsr.h | 34
-rw-r--r--  usr.sbin/vmmctl/Makefile | 15
-rwxr-xr-x  usr.sbin/vmmctl/sample.sh | 75
-rw-r--r--  usr.sbin/vmmctl/vmmctl.c | 1485
84 files changed, 19016 insertions, 0 deletions
diff --git a/lib/Makefile b/lib/Makefile
index e9f4ec36b823..8f33206f44a2 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -102,6 +102,7 @@ SUBDIR= ${SUBDIR_ORDERED} \
${_libusbhid} \
${_libusb} \
${_libvgl} \
+ ${_libvmmapi} \
libwrap \
liby \
libz \
@@ -177,6 +178,7 @@ _libncp= libncp
.endif
_libsmb= libsmb
_libvgl= libvgl
+_libvmmapi= libvmmapi
.endif
.if ${MACHINE_ARCH} == "powerpc"
diff --git a/lib/libvmmapi/Makefile b/lib/libvmmapi/Makefile
new file mode 100644
index 000000000000..492391f9f85c
--- /dev/null
+++ b/lib/libvmmapi/Makefile
@@ -0,0 +1,9 @@
+# $FreeBSD$
+
+LIB= vmmapi
+SRCS= vmmapi.c vmmapi_freebsd.c mptable.c
+INCS= vmmapi.h
+
+CFLAGS+= -I${.CURDIR}
+
+.include <bsd.lib.mk>
diff --git a/lib/libvmmapi/mptable.c b/lib/libvmmapi/mptable.c
new file mode 100644
index 000000000000..1aea61a2ad7c
--- /dev/null
+++ b/lib/libvmmapi/mptable.c
@@ -0,0 +1,336 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/mman.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <machine/vmm.h>
+#include <machine/vmm_dev.h>
+
+#include "vmmapi.h"
+#include "mptable.h"
+
+#define LAPIC_PADDR (0xFEE00000)
+#define LAPIC_VERSION (16)
+
+#define IOAPIC_PADDR (0xFEC00000)
+#define IOAPIC_VERSION (0x11)
+
+/*
+ * Return the value that makes the byte-wise sum of 'len' bytes at 'base',
+ * including this checksum byte itself, come out to zero modulo 256.
+ */
+static uint8_t
+mp_compute_checksum(void *base, size_t len)
+{
+	uint8_t *bytes = base;
+	uint8_t sum = 0;
+
+	for (; len > 0; len--)
+		sum += *bytes++;
+
+	return (256 - sum);
+}
+
+static void
+mp_build_mpfp(struct mp_floating_pointer *mpfp, vm_paddr_t mpfp_gpa)
+{
+ memset(mpfp, 0, sizeof(*mpfp));
+ memcpy(mpfp->signature, MPFP_SIGNATURE, MPFP_SIGNATURE_LEN);
+ mpfp->mptable_paddr = mpfp_gpa + sizeof(*mpfp);
+ mpfp->specrev = MP_SPECREV;
+ mpfp->feature2 = 0;
+ mpfp->checksum = mp_compute_checksum(mpfp, sizeof(*mpfp));
+}
+
+static void
+mp_build_mpch(struct mp_config_hdr *mpch)
+{
+ memset(mpch, 0, sizeof(*mpch));
+ mpch->specrev = MP_SPECREV;
+ memcpy(mpch->signature, MPCH_SIGNATURE, MPCH_SIGNATURE_LEN);
+ memcpy(mpch->oemid, MPCH_OEMID, MPCH_OEMID_LEN);
+ memcpy(mpch->prodid, MPCH_PRODID, MPCH_PRODID_LEN);
+ mpch->lapic_paddr = LAPIC_PADDR;
+}
+
+static void
+mp_build_proc_entries(struct mpe_proc *mpep, int num_proc)
+{
+ int i;
+
+ for (i = 0; i < num_proc; i++) {
+ memset(mpep, 0, sizeof(*mpep));
+ mpep->entry_type = MP_ENTRY_PROC;
+		mpep->lapic_id = i;	/* XXX assumes local APIC ID == vcpu number */
+ mpep->lapic_version = LAPIC_VERSION;
+ mpep->proc_flags = (i == 0)?MPEP_FLAGS_BSP:0;
+ mpep->proc_flags |= MPEP_FLAGS_EN;
+ mpep->proc_signature = MPEP_SIGNATURE;
+ mpep->feature_flags = MPEP_FEATURES;
+ mpep++;
+ }
+}
+
+static void
+mp_build_bus_entries(struct mpe_bus *mpeb)
+{
+ memset(mpeb, 0, sizeof(*mpeb));
+ mpeb->entry_type = MP_ENTRY_BUS;
+ mpeb->busid = MPE_BUSID_ISA;
+ memcpy(mpeb->busname, MPE_BUSNAME_ISA, MPE_BUSNAME_LEN);
+ mpeb++;
+
+ memset(mpeb, 0, sizeof(*mpeb));
+ mpeb->entry_type = MP_ENTRY_BUS;
+ mpeb->busid = MPE_BUSID_PCI;
+ memcpy(mpeb->busname, MPE_BUSNAME_PCI, MPE_BUSNAME_LEN);
+}
+
+static void
+mp_build_ioapic_entries(struct mpe_ioapic *mpei)
+{
+ memset(mpei, 0, sizeof(*mpei));
+ mpei->entry_type = MP_ENTRY_IOAPIC;
+ mpei->ioapic_id = MPE_IOAPIC_ID;
+ mpei->ioapic_version = IOAPIC_VERSION;
+ mpei->ioapic_flags = MPE_IOAPIC_FLAG_EN;
+ mpei->ioapic_paddr = IOAPIC_PADDR;
+}
+
+static void
+mp_build_ioint_entries(struct mpe_ioint *mpeii, int num_pins)
+{
+ int pin;
+
+	/*
+	 * The following config is taken from the kernel's mptable.c,
+	 * mptable_parse_default_config_ints().  For now just use the
+	 * default config; tweak later if needed.
+	 */
+
+	/* Run through all of the pins. */
+ for (pin = 0; pin < num_pins; pin++) {
+ memset(mpeii, 0, sizeof(*mpeii));
+ mpeii->entry_type = MP_ENTRY_IOINT;
+ mpeii->src_bus_id = MPE_BUSID_ISA;
+ mpeii->dst_apic_id = MPE_IOAPIC_ID;
+
+ /*
+ * All default configs route IRQs from bus 0 to the first 16 pins
+ * of the first I/O APIC with an APIC ID of 2.
+ */
+ mpeii->dst_apic_intin = pin;
+ switch (pin) {
+ case 0:
+ /* Pin 0 is an ExtINT pin. */
+ mpeii->intr_type = MPEII_INTR_EXTINT;
+ break;
+ case 2:
+ /* IRQ 0 is routed to pin 2. */
+ mpeii->intr_type = MPEII_INTR_INT;
+ mpeii->src_bus_irq = 0;
+ break;
+		case 5:
+		case 10:
+		case 11:
+			/*
+			 * PCI IRQs are level triggered.  Fall through to
+			 * pick up the identity interrupt mapping as well.
+			 */
+			mpeii->intr_flags = MPEII_FLAGS_TRIGMODE_LEVEL;
+			mpeii->src_bus_id = MPE_BUSID_PCI;
+			/* FALLTHROUGH */
+		default:
+ /* All other pins are identity mapped. */
+ mpeii->intr_type = MPEII_INTR_INT;
+ mpeii->src_bus_irq = pin;
+ break;
+ }
+ mpeii++;
+ }
+}
+
+#define COPYSTR(dest, src, bytes)		\
+	do {					\
+		memcpy(dest, src, bytes);	\
+		(dest)[bytes] = 0;		\
+	} while (0)
+
+static void
+mptable_dump(struct mp_floating_pointer *mpfp, struct mp_config_hdr *mpch)
+{
+ static char str[16];
+ int i;
+ char *cur;
+
+ union mpe {
+ struct mpe_proc *proc;
+ struct mpe_bus *bus;
+ struct mpe_ioapic *ioapic;
+ struct mpe_ioint *ioint;
+		struct mpe_lint *lint;
+ char *p;
+ };
+
+ union mpe mpe;
+
+ printf(" MP Floating Pointer :\n");
+ COPYSTR(str, mpfp->signature, 4);
+ printf(" signature: %s\n", str);
+ printf(" mpch paddr: %x\n", mpfp->mptable_paddr);
+ printf(" length: %x\n", mpfp->length);
+	printf("	specrev: %x\n", mpfp->specrev);
+ printf(" checksum: %x\n", mpfp->checksum);
+ printf(" feature1: %x\n", mpfp->feature1);
+ printf(" feature2: %x\n", mpfp->feature2);
+ printf(" feature3: %x\n", mpfp->feature3);
+ printf(" feature4: %x\n", mpfp->feature4);
+
+ printf(" MP Configuration Header :\n");
+ COPYSTR(str, mpch->signature, 4);
+ printf(" signature: %s\n", str);
+ printf(" length: %x\n", mpch->length);
+	printf("	specrev: %x\n", mpch->specrev);
+ printf(" checksum: %x\n", mpch->checksum);
+ COPYSTR(str, mpch->oemid, MPCH_OEMID_LEN);
+ printf(" oemid: %s\n", str);
+ COPYSTR(str, mpch->prodid, MPCH_PRODID_LEN);
+ printf(" prodid: %s\n", str);
+ printf(" oem_ptr: %x\n", mpch->oem_ptr);
+ printf(" oem_sz: %x\n", mpch->oem_sz);
+ printf(" nr_entries: %x\n", mpch->nr_entries);
+ printf(" apic paddr: %x\n", mpch->lapic_paddr);
+ printf(" ext_length: %x\n", mpch->ext_length);
+ printf(" ext_checksum: %x\n", mpch->ext_checksum);
+
+ cur = (char *)mpch + sizeof(*mpch);
+ for (i = 0; i < mpch->nr_entries; i++) {
+ mpe.p = cur;
+ switch(*mpe.p) {
+ case MP_ENTRY_PROC:
+ printf(" MP Processor Entry :\n");
+ printf(" lapic_id: %x\n", mpe.proc->lapic_id);
+ printf(" lapic_version: %x\n", mpe.proc->lapic_version);
+ printf(" proc_flags: %x\n", mpe.proc->proc_flags);
+ printf(" proc_signature: %x\n", mpe.proc->proc_signature);
+ printf(" feature_flags: %x\n", mpe.proc->feature_flags);
+ cur += sizeof(struct mpe_proc);
+ break;
+ case MP_ENTRY_BUS:
+ printf(" MP Bus Entry :\n");
+ printf(" busid: %x\n", mpe.bus->busid);
+ COPYSTR(str, mpe.bus->busname, MPE_BUSNAME_LEN);
+ printf(" busname: %s\n", str);
+ cur += sizeof(struct mpe_bus);
+ break;
+ case MP_ENTRY_IOAPIC:
+ printf(" MP IOAPIC Entry :\n");
+			printf("	ioapic_id: %x\n", mpe.ioapic->ioapic_id);
+			printf("	ioapic_version: %x\n", mpe.ioapic->ioapic_version);
+			printf("	ioapic_flags: %x\n", mpe.ioapic->ioapic_flags);
+			printf("	ioapic_paddr: %x\n", mpe.ioapic->ioapic_paddr);
+ cur += sizeof(struct mpe_ioapic);
+ break;
+ case MP_ENTRY_IOINT:
+ printf(" MP IO Interrupt Entry :\n");
+ printf(" intr_type: %x\n", mpe.ioint->intr_type);
+ printf(" intr_flags: %x\n", mpe.ioint->intr_flags);
+ printf(" src_bus_id: %x\n", mpe.ioint->src_bus_id);
+ printf(" src_bus_irq: %x\n", mpe.ioint->src_bus_irq);
+ printf(" dst_apic_id: %x\n", mpe.ioint->dst_apic_id);
+ printf(" dst_apic_intin: %x\n", mpe.ioint->dst_apic_intin);
+ cur += sizeof(struct mpe_ioint);
+ break;
+ case MP_ENTRY_LINT:
+ printf(" MP Local Interrupt Entry :\n");
+ cur += sizeof(struct mpe_lint);
+ break;
+ }
+	}
+}
+
+int
+vm_build_mptable(struct vmctx *ctx, vm_paddr_t gpa, int len, int ncpu,
+ void *oemp, int oemsz)
+{
+ struct mp_config_hdr *mpch;
+ char *mapaddr;
+ char *startaddr;
+	int error;
+
+	error = 0;
+	mapaddr = vm_map_memory(ctx, gpa, len);
+	if (mapaddr == MAP_FAILED) {
+		printf("%s\n", strerror(errno));
+		error = errno;
+		goto err;
+	}
+ startaddr = mapaddr;
+
+ mp_build_mpfp((struct mp_floating_pointer*) mapaddr, gpa);
+ mapaddr += sizeof(struct mp_floating_pointer);
+
+ mpch = (struct mp_config_hdr*)mapaddr;
+ mp_build_mpch(mpch);
+ mapaddr += sizeof(struct mp_config_hdr);
+
+ mp_build_proc_entries((struct mpe_proc*) mapaddr, ncpu);
+ mapaddr += (sizeof(struct mpe_proc)*ncpu);
+ mpch->nr_entries += ncpu;
+
+ mp_build_bus_entries((struct mpe_bus*)mapaddr);
+ mapaddr += (sizeof(struct mpe_bus)*MPE_NUM_BUSES);
+ mpch->nr_entries += MPE_NUM_BUSES;
+#if 0
+ mp_build_ioapic_entries((struct mpe_ioapic*)mapaddr);
+ mapaddr += sizeof(struct mpe_ioapic);
+ mpch->nr_entries++;
+
+ mp_build_ioint_entries((struct mpe_ioint*)mapaddr, MPEII_MAX_IRQ);
+ mapaddr += sizeof(struct mpe_ioint)*MPEII_MAX_IRQ;
+ mpch->nr_entries += MPEII_MAX_IRQ;
+
+#endif
+ if (oemp) {
+ mpch->oem_ptr = mapaddr - startaddr + gpa;
+ mpch->oem_sz = oemsz;
+ memcpy(mapaddr, oemp, oemsz);
+ }
+	mpch->length = mapaddr - (char *)mpch;
+
+	/* The checksum must cover the entire base table, not just the header. */
+	mpch->checksum = mp_compute_checksum(mpch, mpch->length);
+
+#if 0
+	mptable_dump((struct mp_floating_pointer *)startaddr, mpch);
+#endif
+err:
+ return (error);
+}
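
The mptable code above is driven through vm_build_tables() in vmmapi.c, which places the table in the BIOS ROM window. A minimal sketch of a hypothetical caller, assuming the VM context already exists (error handling elided):

    #include <machine/vmm.h>
    #include "vmmapi.h"

    /*
     * Hypothetical: lay down an MP table for a 2-vcpu guest with no OEM
     * table.  vm_build_tables() maps the 0xf0000-0xfffff BIOS ROM window
     * and calls vm_build_mptable() into it.
     */
    static int
    build_guest_tables(struct vmctx *ctx)
    {
    	return (vm_build_tables(ctx, 2, NULL, 0));
    }
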
diff --git a/lib/libvmmapi/mptable.h b/lib/libvmmapi/mptable.h
new file mode 100644
index 000000000000..cad8834a47b2
--- /dev/null
+++ b/lib/libvmmapi/mptable.h
@@ -0,0 +1,171 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MPTABLE_h_
+#define _MPTABLE_h_
+
+#define MP_SPECREV			(4)	// MP spec revision 1.4
+
+/*
+ * MP Floating Pointer Structure
+ */
+#define MPFP_SIGNATURE "_MP_"
+#define MPFP_SIGNATURE_LEN (4)
+#define MPFP_FEATURE2 (0x80) // IMCR is present
+struct mp_floating_pointer {
+ uint8_t signature[MPFP_SIGNATURE_LEN];
+ uint32_t mptable_paddr;
+ uint8_t length;
+ uint8_t specrev;
+ uint8_t checksum;
+ uint8_t feature1;
+ uint8_t feature2;
+ uint8_t feature3;
+ uint8_t feature4;
+ uint8_t feature5;
+};
+
+
+/*
+ * MP Configuration Table Header
+ */
+#define MPCH_SIGNATURE "PCMP"
+#define MPCH_SIGNATURE_LEN (4)
+
+#define MPCH_OEMID "NETAPP "
+#define MPCH_OEMID_LEN (8)
+#define MPCH_PRODID "vFiler "
+#define MPCH_PRODID_LEN (12)
+
+struct mp_config_hdr {
+ uint8_t signature[MPCH_SIGNATURE_LEN];
+ uint16_t length;
+ uint8_t specrev;
+ uint8_t checksum;
+ uint8_t oemid[MPCH_OEMID_LEN];
+ uint8_t prodid[MPCH_PRODID_LEN];
+ uint32_t oem_ptr;
+ uint16_t oem_sz;
+ uint16_t nr_entries;
+ uint32_t lapic_paddr;
+ uint16_t ext_length;
+ uint8_t ext_checksum;
+ uint8_t reserved;
+};
+
+#define MP_ENTRY_PROC (0)
+#define MP_ENTRY_BUS (1)
+#define MP_ENTRY_IOAPIC (2)
+#define MP_ENTRY_IOINT (3)
+#define MP_ENTRY_LINT (4)
+
+/*
+ * MP Processor Entry
+ */
+
+#define MPEP_FLAGS_EN (0x1)
+#define MPEP_FLAGS_BSP (0x2)
+
+#define MPEP_SIG_FAMILY (6)
+#define MPEP_SIG_MODEL (26)
+#define MPEP_SIG_STEPPING (5)
+#define MPEP_SIGNATURE ((MPEP_SIG_FAMILY << 8) | (MPEP_SIG_MODEL << 4) \
+ | (MPEP_SIG_STEPPING))
+
+#define MPEP_FEATURES (0xBFEBFBFF) // Value from Intel i7 CPUID
+
+struct mpe_proc {
+ uint8_t entry_type;
+ uint8_t lapic_id;
+ uint8_t lapic_version;
+ uint8_t proc_flags;
+ uint32_t proc_signature;
+ uint32_t feature_flags;
+ uint8_t reserved[8];
+};
+
+/*
+ * MP Bus Entry
+ */
+
+#define MPE_NUM_BUSES (2)
+#define MPE_BUSNAME_LEN (6)
+#define MPE_BUSID_ISA (0)
+#define MPE_BUSID_PCI (1)
+#define MPE_BUSNAME_ISA "ISA "
+#define MPE_BUSNAME_PCI "PCI "
+struct mpe_bus {
+ uint8_t entry_type;
+ uint8_t busid;
+ uint8_t busname[MPE_BUSNAME_LEN];
+};
+
+/*
+ * MP IO APIC Entry
+ */
+#define MPE_IOAPIC_ID (2)
+#define MPE_IOAPIC_FLAG_EN (1)
+struct mpe_ioapic {
+ uint8_t entry_type;
+ uint8_t ioapic_id;
+ uint8_t ioapic_version;
+ uint8_t ioapic_flags;
+ uint32_t ioapic_paddr;
+};
+
+/*
+ * MP IO Interrupt Assignment Entry
+ */
+#define MPEII_INTR_INT (0)
+#define MPEII_INTR_NMI (1)
+#define MPEII_INTR_SMI (2)
+#define MPEII_INTR_EXTINT (3)
+#define MPEII_PCI_IRQ_MASK (0x0c20U) /* IRQ 5,10,11 are PCI connected */
+#define MPEII_MAX_IRQ (16)
+#define MPEII_FLAGS_TRIGMODE_LEVEL (0x3)
+struct mpe_ioint {
+ uint8_t entry_type;
+ uint8_t intr_type;
+ uint16_t intr_flags;
+ uint8_t src_bus_id;
+ uint8_t src_bus_irq;
+ uint8_t dst_apic_id;
+ uint8_t dst_apic_intin;
+};
+
+/*
+ * MP Local Interrupt Assignment Entry
+ */
+struct mpe_lint {
+ uint8_t entry_type;
+};
+
+int vm_build_mptable(struct vmctx *ctxt, vm_paddr_t gpa, int len,
+ int ncpu, void *oemp, int oemsz);
+#endif /* _MPTABLE_h_ */
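
One quirk worth noting in the constants above: MPEP_SIG_MODEL is 26 (0x1a), which does not fit in the 4-bit model field that the shift implies, so the packed value bleeds into the family bits. A quick check of the arithmetic (the assert value is computed here, not taken from the source; the overflow is presumably harmless to guests that only sanity-check the table):

    #include <assert.h>

    int
    main(void)
    {
    	/*
    	 * (6 << 8) | (26 << 4) | 5 == 0x600 | 0x1a0 | 0x5 == 0x7a5.
    	 * Representing model 26 faithfully would require the CPUID
    	 * extended-model encoding.
    	 */
    	assert(((6 << 8) | (26 << 4) | 5) == 0x7a5);
    	return (0);
    }
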
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
new file mode 100644
index 000000000000..7f95fea9b603
--- /dev/null
+++ b/lib/libvmmapi/vmmapi.c
@@ -0,0 +1,647 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include <machine/specialreg.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <machine/vmm.h>
+#include <machine/vmm_dev.h>
+
+#include "vmmapi.h"
+#include "mptable.h"
+
+#ifndef CR4_VMXE
+#define CR4_VMXE (1UL << 13)
+#endif
+
+#define BIOS_ROM_BASE (0xf0000)
+#define BIOS_ROM_SIZE (0x10000)
+
+struct vmctx {
+ int fd;
+ char *name;
+};
+
+#define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x)))
+#define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x)))
+
+static int
+vm_device_open(const char *name)
+{
+ int fd, len;
+ char *vmfile;
+
+ len = strlen("/dev/vmm/") + strlen(name) + 1;
+ vmfile = malloc(len);
+ assert(vmfile != NULL);
+ snprintf(vmfile, len, "/dev/vmm/%s", name);
+
+ /* Open the device file */
+ fd = open(vmfile, O_RDWR, 0);
+
+ free(vmfile);
+ return (fd);
+}
+
+int
+vm_create(const char *name)
+{
+
+ return (CREATE((char *)name));
+}
+
+struct vmctx *
+vm_open(const char *name)
+{
+ struct vmctx *vm;
+
+ vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
+ assert(vm != NULL);
+
+ vm->fd = -1;
+ vm->name = (char *)(vm + 1);
+ strcpy(vm->name, name);
+
+ if ((vm->fd = vm_device_open(vm->name)) < 0)
+ goto err;
+
+ return (vm);
+err:
+ vm_destroy(vm);
+ return (NULL);
+}
+
+void
+vm_destroy(struct vmctx *vm)
+{
+ assert(vm != NULL);
+
+ DESTROY(vm->name);
+ if (vm->fd >= 0)
+ close(vm->fd);
+ free(vm);
+}
+
+int
+vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa,
+ vm_paddr_t *ret_hpa, size_t *ret_len)
+{
+ int error;
+ struct vm_memory_segment seg;
+
+ bzero(&seg, sizeof(seg));
+ seg.gpa = gpa;
+	error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg);
+	if (error == 0) {
+		*ret_hpa = seg.hpa;
+		*ret_len = seg.len;
+	}
+	return (error);
+}
+
+int
+vm_setup_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char **mapaddr)
+{
+ int error;
+ struct vm_memory_segment seg;
+
+ /*
+ * Create and optionally map 'len' bytes of memory at guest
+ * physical address 'gpa'
+ */
+ bzero(&seg, sizeof(seg));
+ seg.gpa = gpa;
+ seg.len = len;
+ error = ioctl(ctx->fd, VM_MAP_MEMORY, &seg);
+ if (error == 0 && mapaddr != NULL) {
+ *mapaddr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
+ ctx->fd, gpa);
+ }
+ return (error);
+}
+
+char *
+vm_map_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len)
+{
+
+ /* Map 'len' bytes of memory at guest physical address 'gpa' */
+ return ((char *)mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
+ ctx->fd, gpa));
+}
+
+int
+vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
+ uint64_t base, uint32_t limit, uint32_t access)
+{
+ int error;
+ struct vm_seg_desc vmsegdesc;
+
+ bzero(&vmsegdesc, sizeof(vmsegdesc));
+ vmsegdesc.cpuid = vcpu;
+ vmsegdesc.regnum = reg;
+ vmsegdesc.desc.base = base;
+ vmsegdesc.desc.limit = limit;
+ vmsegdesc.desc.access = access;
+
+ error = ioctl(ctx->fd, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc);
+ return (error);
+}
+
+int
+vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
+ uint64_t *base, uint32_t *limit, uint32_t *access)
+{
+ int error;
+ struct vm_seg_desc vmsegdesc;
+
+ bzero(&vmsegdesc, sizeof(vmsegdesc));
+ vmsegdesc.cpuid = vcpu;
+ vmsegdesc.regnum = reg;
+
+ error = ioctl(ctx->fd, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc);
+ if (error == 0) {
+ *base = vmsegdesc.desc.base;
+ *limit = vmsegdesc.desc.limit;
+ *access = vmsegdesc.desc.access;
+ }
+ return (error);
+}
+
+int
+vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
+{
+ int error;
+ struct vm_register vmreg;
+
+ bzero(&vmreg, sizeof(vmreg));
+ vmreg.cpuid = vcpu;
+ vmreg.regnum = reg;
+ vmreg.regval = val;
+
+ error = ioctl(ctx->fd, VM_SET_REGISTER, &vmreg);
+ return (error);
+}
+
+int
+vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val)
+{
+ int error;
+ struct vm_register vmreg;
+
+ bzero(&vmreg, sizeof(vmreg));
+ vmreg.cpuid = vcpu;
+ vmreg.regnum = reg;
+
+	error = ioctl(ctx->fd, VM_GET_REGISTER, &vmreg);
+	if (error == 0)
+		*ret_val = vmreg.regval;
+	return (error);
+}
+
+int
+vm_get_pinning(struct vmctx *ctx, int vcpu, int *host_cpuid)
+{
+ int error;
+ struct vm_pin vmpin;
+
+ bzero(&vmpin, sizeof(vmpin));
+ vmpin.vm_cpuid = vcpu;
+
+	error = ioctl(ctx->fd, VM_GET_PINNING, &vmpin);
+	if (error == 0)
+		*host_cpuid = vmpin.host_cpuid;
+	return (error);
+}
+
+int
+vm_set_pinning(struct vmctx *ctx, int vcpu, int host_cpuid)
+{
+ int error;
+ struct vm_pin vmpin;
+
+ bzero(&vmpin, sizeof(vmpin));
+ vmpin.vm_cpuid = vcpu;
+ vmpin.host_cpuid = host_cpuid;
+
+ error = ioctl(ctx->fd, VM_SET_PINNING, &vmpin);
+ return (error);
+}
+
+int
+vm_run(struct vmctx *ctx, int vcpu, uint64_t rip, struct vm_exit *vmexit)
+{
+ int error;
+ struct vm_run vmrun;
+
+ bzero(&vmrun, sizeof(vmrun));
+ vmrun.cpuid = vcpu;
+ vmrun.rip = rip;
+
+ error = ioctl(ctx->fd, VM_RUN, &vmrun);
+ bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit));
+ return (error);
+}
+
+static int
+vm_inject_event_real(struct vmctx *ctx, int vcpu, enum vm_event_type type,
+ int vector, int error_code, int error_code_valid)
+{
+ struct vm_event ev;
+
+ bzero(&ev, sizeof(ev));
+ ev.cpuid = vcpu;
+ ev.type = type;
+ ev.vector = vector;
+ ev.error_code = error_code;
+ ev.error_code_valid = error_code_valid;
+
+ return (ioctl(ctx->fd, VM_INJECT_EVENT, &ev));
+}
+
+int
+vm_inject_event(struct vmctx *ctx, int vcpu, enum vm_event_type type,
+ int vector)
+{
+
+ return (vm_inject_event_real(ctx, vcpu, type, vector, 0, 0));
+}
+
+int
+vm_inject_event2(struct vmctx *ctx, int vcpu, enum vm_event_type type,
+ int vector, int error_code)
+{
+
+ return (vm_inject_event_real(ctx, vcpu, type, vector, error_code, 1));
+}
+
+int
+vm_build_tables(struct vmctx *ctxt, int ncpu, void *oemtbl, int oemtblsz)
+{
+
+ return (vm_build_mptable(ctxt, BIOS_ROM_BASE, BIOS_ROM_SIZE, ncpu,
+ oemtbl, oemtblsz));
+}
+
+int
+vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector)
+{
+ struct vm_lapic_irq vmirq;
+
+ bzero(&vmirq, sizeof(vmirq));
+ vmirq.cpuid = vcpu;
+ vmirq.vector = vector;
+
+ return (ioctl(ctx->fd, VM_LAPIC_IRQ, &vmirq));
+}
+
+int
+vm_inject_nmi(struct vmctx *ctx, int vcpu)
+{
+ struct vm_nmi vmnmi;
+
+ bzero(&vmnmi, sizeof(vmnmi));
+ vmnmi.cpuid = vcpu;
+
+ return (ioctl(ctx->fd, VM_INJECT_NMI, &vmnmi));
+}
+
+int
+vm_capability_name2type(const char *capname)
+{
+ int i;
+
+ static struct {
+ const char *name;
+ int type;
+ } capstrmap[] = {
+ { "hlt_exit", VM_CAP_HALT_EXIT },
+ { "mtrap_exit", VM_CAP_MTRAP_EXIT },
+ { "pause_exit", VM_CAP_PAUSE_EXIT },
+ { "unrestricted_guest", VM_CAP_UNRESTRICTED_GUEST },
+ { 0 }
+ };
+
+ for (i = 0; capstrmap[i].name != NULL && capname != NULL; i++) {
+ if (strcmp(capstrmap[i].name, capname) == 0)
+ return (capstrmap[i].type);
+ }
+
+ return (-1);
+}
+
+int
+vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
+ int *retval)
+{
+ int error;
+ struct vm_capability vmcap;
+
+ bzero(&vmcap, sizeof(vmcap));
+ vmcap.cpuid = vcpu;
+ vmcap.captype = cap;
+
+ error = ioctl(ctx->fd, VM_GET_CAPABILITY, &vmcap);
+ *retval = vmcap.capval;
+ return (error);
+}
+
+int
+vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val)
+{
+ struct vm_capability vmcap;
+
+ bzero(&vmcap, sizeof(vmcap));
+ vmcap.cpuid = vcpu;
+ vmcap.captype = cap;
+ vmcap.capval = val;
+
+ return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap));
+}
+
+int
+vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func)
+{
+ struct vm_pptdev pptdev;
+
+ bzero(&pptdev, sizeof(pptdev));
+ pptdev.bus = bus;
+ pptdev.slot = slot;
+ pptdev.func = func;
+
+ return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev));
+}
+
+int
+vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func)
+{
+ struct vm_pptdev pptdev;
+
+ bzero(&pptdev, sizeof(pptdev));
+ pptdev.bus = bus;
+ pptdev.slot = slot;
+ pptdev.func = func;
+
+ return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev));
+}
+
+int
+vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
+ vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
+{
+ struct vm_pptdev_mmio pptmmio;
+
+ bzero(&pptmmio, sizeof(pptmmio));
+ pptmmio.bus = bus;
+ pptmmio.slot = slot;
+ pptmmio.func = func;
+ pptmmio.gpa = gpa;
+ pptmmio.len = len;
+ pptmmio.hpa = hpa;
+
+ return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio));
+}
+
+int
+vm_setup_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
+ int destcpu, int vector, int numvec)
+{
+ struct vm_pptdev_msi pptmsi;
+
+ bzero(&pptmsi, sizeof(pptmsi));
+ pptmsi.vcpu = vcpu;
+ pptmsi.bus = bus;
+ pptmsi.slot = slot;
+ pptmsi.func = func;
+ pptmsi.destcpu = destcpu;
+ pptmsi.vector = vector;
+ pptmsi.numvec = numvec;
+
+ return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi));
+}
+
+uint64_t *
+vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv,
+ int *ret_entries)
+{
+ int error;
+
+ static struct vm_stats vmstats;
+
+ vmstats.cpuid = vcpu;
+
+ error = ioctl(ctx->fd, VM_STATS, &vmstats);
+ if (error == 0) {
+ if (ret_entries)
+ *ret_entries = vmstats.num_entries;
+ if (ret_tv)
+ *ret_tv = vmstats.tv;
+ return (vmstats.statbuf);
+ } else
+ return (NULL);
+}
+
+const char *
+vm_get_stat_desc(struct vmctx *ctx, int index)
+{
+ int error;
+
+ static struct vm_stat_desc statdesc;
+
+ statdesc.index = index;
+ if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
+ return (statdesc.desc);
+ else
+ return (NULL);
+}
+
+/*
+ * From Intel Vol 3a:
+ * Table 9-1. IA-32 Processor States Following Power-up, Reset or INIT
+ */
+int
+vcpu_reset(struct vmctx *vmctx, int vcpu)
+{
+ int error;
+ uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx;
+ uint32_t desc_access, desc_limit;
+ uint16_t sel;
+
+ zero = 0;
+
+ rflags = 0x2;
+ error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags);
+ if (error)
+ goto done;
+
+ rip = 0xfff0;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0)
+ goto done;
+
+ cr0 = CR0_NE;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0)
+ goto done;
+
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, zero)) != 0)
+ goto done;
+
+ cr4 = CR4_VMXE;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0)
+ goto done;
+
+ /*
+ * CS: present, r/w, accessed, 16-bit, byte granularity, usable
+ */
+ desc_base = 0xffff0000;
+ desc_limit = 0xffff;
+ desc_access = 0x0093;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ sel = 0xf000;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, sel)) != 0)
+ goto done;
+
+ /*
+ * SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity
+ */
+ desc_base = 0;
+ desc_limit = 0xffff;
+ desc_access = 0x0093;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ sel = 0;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, sel)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, sel)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, sel)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, sel)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, sel)) != 0)
+ goto done;
+
+ /* General purpose registers */
+ rdx = 0xf00;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RAX, zero)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBX, zero)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RCX, zero)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDX, rdx)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSI, zero)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDI, zero)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBP, zero)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, zero)) != 0)
+ goto done;
+
+ /* GDTR, IDTR */
+ desc_base = 0;
+ desc_limit = 0xffff;
+ desc_access = 0;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR,
+ desc_base, desc_limit, desc_access);
+ if (error != 0)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_IDTR,
+ desc_base, desc_limit, desc_access);
+ if (error != 0)
+ goto done;
+
+ /* TR */
+ desc_base = 0;
+ desc_limit = 0xffff;
+ desc_access = 0x0000008b;
+	error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, desc_base,
+	    desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ sel = 0;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, sel)) != 0)
+ goto done;
+
+ /* LDTR */
+ desc_base = 0;
+ desc_limit = 0xffff;
+ desc_access = 0x00000082;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, desc_base,
+ desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ sel = 0;
+	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, sel)) != 0)
+ goto done;
+
+ /* XXX cr2, debug registers */
+
+ error = 0;
+done:
+ return (error);
+}
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
new file mode 100644
index 000000000000..38533a894b17
--- /dev/null
+++ b/lib/libvmmapi/vmmapi.h
@@ -0,0 +1,98 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMMAPI_H_
+#define _VMMAPI_H_
+
+struct vmctx;
+
+int vm_create(const char *name);
+struct vmctx *vm_open(const char *name);
+void vm_destroy(struct vmctx *ctx);
+int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa,
+ vm_paddr_t *ret_hpa, size_t *ret_len);
+/*
+ * Create a memory segment of 'len' bytes in the guest physical address space
+ * at offset 'gpa'.
+ *
+ * If 'mapaddr' is not NULL then this region is mmap'ed into the address
+ * space of the calling process. If there is an mmap error then *mapaddr
+ * will be set to MAP_FAILED.
+ */
+
+int vm_setup_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len,
+ char **mapaddr);
+char * vm_map_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len);
+int vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
+ uint64_t base, uint32_t limit, uint32_t access);
+int vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
+ uint64_t *base, uint32_t *limit, uint32_t *access);
+int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val);
+int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval);
+int vm_get_pinning(struct vmctx *ctx, int vcpu, int *host_cpuid);
+int vm_set_pinning(struct vmctx *ctx, int vcpu, int host_cpuid);
+int vm_run(struct vmctx *ctx, int vcpu, uint64_t rip,
+ struct vm_exit *ret_vmexit);
+int vm_build_tables(struct vmctx *ctxt, int ncpus, void *oemtbl,
+ int oemtblsz);
+int vm_inject_event(struct vmctx *ctx, int vcpu, enum vm_event_type type,
+ int vector);
+int vm_inject_event2(struct vmctx *ctx, int vcpu, enum vm_event_type type,
+ int vector, int error_code);
+int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector);
+int vm_inject_nmi(struct vmctx *ctx, int vcpu);
+int vm_capability_name2type(const char *capname);
+int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
+ int *retval);
+int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
+ int val);
+int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
+int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
+int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
+ vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
+int vm_setup_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
+ int dest, int vector, int numvec);
+
+/*
+ * Return a pointer to the statistics buffer. Note that this is not MT-safe.
+ */
+uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv,
+ int *ret_entries);
+const char *vm_get_stat_desc(struct vmctx *ctx, int index);
+
+/* Reset vcpu register state */
+int vcpu_reset(struct vmctx *ctx, int vcpu);
+
+/*
+ * FreeBSD specific APIs
+ */
+int vm_setup_freebsd_registers(struct vmctx *ctx, int vcpu,
+ uint64_t rip, uint64_t cr3, uint64_t gdtbase,
+ uint64_t rsp);
+void vm_setup_freebsd_gdt(uint64_t *gdtr);
+#endif /* _VMMAPI_H_ */
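
The declarations above compose into the canonical create/open/run loop. A minimal sketch, assuming a VM named "testvm" whose guest memory and boot registers have already been set up (the name and the omitted setup are hypothetical; error handling is elided):

    #include <sys/types.h>
    #include <machine/vmm.h>
    #include "vmmapi.h"

    static void
    run_vcpu0(uint64_t rip)
    {
    	struct vm_exit vmexit;
    	struct vmctx *ctx;

    	vm_create("testvm");		/* creates /dev/vmm/testvm */
    	ctx = vm_open("testvm");
    	vcpu_reset(ctx, 0);		/* power-on register state */

    	/*
    	 * Guest memory (vm_setup_memory) and an entry point would be
    	 * configured here before the first vm_run().
    	 */
    	for (;;) {
    		if (vm_run(ctx, 0, rip, &vmexit) != 0)
    			break;
    		/* Dispatch on vmexit.exitcode, then resume. */
    		rip = vmexit.rip + vmexit.inst_length;
    	}
    	vm_destroy(ctx);
    }
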
diff --git a/lib/libvmmapi/vmmapi_freebsd.c b/lib/libvmmapi/vmmapi_freebsd.c
new file mode 100644
index 000000000000..c4ad9898ede1
--- /dev/null
+++ b/lib/libvmmapi/vmmapi_freebsd.c
@@ -0,0 +1,187 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <machine/specialreg.h>
+#include <machine/segments.h>
+#include <machine/vmm.h>
+
+#include "vmmapi.h"
+
+#ifndef CR4_VMXE
+#define CR4_VMXE (1UL << 13)
+#endif
+
+#define DESC_UNUSABLE 0x00010000
+
+#define GUEST_NULL_SEL 0
+#define GUEST_CODE_SEL 1
+#define GUEST_DATA_SEL 2
+#define GUEST_GDTR_LIMIT (3 * 8 - 1)
+
+void
+vm_setup_freebsd_gdt(uint64_t *gdtr)
+{
+ gdtr[GUEST_NULL_SEL] = 0;
+ gdtr[GUEST_CODE_SEL] = 0x0020980000000000;
+ gdtr[GUEST_DATA_SEL] = 0x0000900000000000;
+}
+
+/*
+ * Setup the 'vcpu' register set such that it will begin execution at
+ * 'rip' in long mode.
+ */
+int
+vm_setup_freebsd_registers(struct vmctx *vmctx, int vcpu,
+ uint64_t rip, uint64_t cr3, uint64_t gdtbase,
+ uint64_t rsp)
+{
+ int error;
+ uint64_t cr0, cr4, efer, rflags, desc_base;
+ uint32_t desc_access, desc_limit;
+ uint16_t gsel;
+
+ cr0 = CR0_PE | CR0_PG | CR0_NE;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0)
+ goto done;
+
+ cr4 = CR4_PAE | CR4_VMXE;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0)
+ goto done;
+
+ efer = EFER_LME | EFER_LMA;
+	if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_EFER, efer)) != 0)
+ goto done;
+
+ rflags = 0x2;
+ error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags);
+ if (error)
+ goto done;
+
+ desc_base = 0;
+ desc_limit = 0;
+ desc_access = 0x0000209B;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ desc_access = 0x00000093;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+	/*
+	 * XXX TR points at the null selector even though the TSS segment
+	 * is marked usable with a base address and limit of 0.
+	 */
+ desc_access = 0x0000008b;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, 0, 0,
+ DESC_UNUSABLE);
+ if (error)
+ goto done;
+
+ gsel = GSEL(GUEST_CODE_SEL, SEL_KPL);
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, gsel)) != 0)
+ goto done;
+
+ gsel = GSEL(GUEST_DATA_SEL, SEL_KPL);
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, gsel)) != 0)
+ goto done;
+
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, gsel)) != 0)
+ goto done;
+
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, gsel)) != 0)
+ goto done;
+
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, gsel)) != 0)
+ goto done;
+
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, gsel)) != 0)
+ goto done;
+
+ /* XXX TR is pointing to the null selector */
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, 0)) != 0)
+ goto done;
+
+ /* LDTR is pointing to the null selector */
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0)
+ goto done;
+
+ /* entry point */
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0)
+ goto done;
+
+ /* page table base */
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, cr3)) != 0)
+ goto done;
+
+ desc_base = gdtbase;
+ desc_limit = GUEST_GDTR_LIMIT;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR,
+ desc_base, desc_limit, 0);
+ if (error != 0)
+ goto done;
+
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, rsp)) != 0)
+ goto done;
+
+ error = 0;
+done:
+ return (error);
+}
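
The selector values loaded above come from GSEL() and SEL_KPL in <machine/segments.h>; a GDT selector is just (descriptor index << 3) | RPL with the TI bit clear. For the three-entry guest GDT built by vm_setup_freebsd_gdt() this yields:

    #include <assert.h>
    #include <machine/segments.h>

    int
    main(void)
    {
    	/* GSEL(index, rpl) == ((index) << 3) | (rpl); SEL_KPL == 0. */
    	assert(GSEL(1, SEL_KPL) == 0x08);	/* GUEST_CODE_SEL */
    	assert(GSEL(2, SEL_KPL) == 0x10);	/* GUEST_DATA_SEL */
    	return (0);
    }
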
diff --git a/share/mk/bsd.libnames.mk b/share/mk/bsd.libnames.mk
index ff1a11ae05b5..5fc1b3cbf40a 100644
--- a/share/mk/bsd.libnames.mk
+++ b/share/mk/bsd.libnames.mk
@@ -155,6 +155,7 @@ LIBUSB?= ${DESTDIR}${LIBDIR}/libusb.a
LIBUTIL?= ${DESTDIR}${LIBDIR}/libutil.a
LIBUUTIL?= ${DESTDIR}${LIBDIR}/libuutil.a
LIBVGL?= ${DESTDIR}${LIBDIR}/libvgl.a
+LIBVMMAPI?= ${DESTDIR}${LIBDIR}/libvmmapi.a
LIBWRAP?= ${DESTDIR}${LIBDIR}/libwrap.a
LIBXPG4?= ${DESTDIR}${LIBDIR}/libxpg4.a
LIBY?= ${DESTDIR}${LIBDIR}/liby.a
diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h
index 895619cf6f7e..c95fee05f5eb 100644
--- a/sys/amd64/include/specialreg.h
+++ b/sys/amd64/include/specialreg.h
@@ -297,6 +297,7 @@
*/
#define APICBASE_RESERVED 0x000006ff
#define APICBASE_BSP 0x00000100
+#define APICBASE_X2APIC 0x00000400
#define APICBASE_ENABLED 0x00000800
#define APICBASE_ADDRESS 0xfffff000
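
The new APICBASE_X2APIC define is bit 10 of the IA32_APIC_BASE MSR; combined with APICBASE_ENABLED (bit 11) it distinguishes disabled, xAPIC, and x2APIC operation. A small decode sketch (the helper is illustrative, not part of this change):

    #include <stdint.h>
    #include <machine/specialreg.h>

    static const char *
    apic_mode(uint64_t apicbase)
    {
    	if ((apicbase & APICBASE_ENABLED) == 0)
    		return ("disabled");
    	if (apicbase & APICBASE_X2APIC)
    		return ("x2APIC");
    	return ("xAPIC");
    }
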
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
new file mode 100644
index 000000000000..0f4c356b6d5f
--- /dev/null
+++ b/sys/amd64/include/vmm.h
@@ -0,0 +1,268 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: vmm.h 482 2011-05-09 21:22:43Z grehan $
+ */
+
+#ifndef _VMM_H_
+#define _VMM_H_
+
+#ifdef _KERNEL
+
+#define VM_MAX_NAMELEN 32
+
+struct vm;
+struct vm_memory_segment;
+struct seg_desc;
+struct vm_exit;
+struct vm_run;
+struct vlapic;
+
+typedef int (*vmm_init_func_t)(void);
+typedef int (*vmm_cleanup_func_t)(void);
+typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */
+typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip,
+ struct vm_exit *vmexit);
+typedef void (*vmi_cleanup_func_t)(void *vmi);
+typedef int (*vmi_mmap_func_t)(void *vmi, vm_paddr_t gpa, vm_paddr_t hpa,
+ size_t length, vm_memattr_t attr,
+ int prot, boolean_t superpages_ok);
+typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num,
+ uint64_t *retval);
+typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num,
+ uint64_t val);
+typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num,
+ struct seg_desc *desc);
+typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num,
+ struct seg_desc *desc);
+typedef int (*vmi_inject_event_t)(void *vmi, int vcpu,
+ int type, int vector,
+ uint32_t code, int code_valid);
+typedef int (*vmi_inject_nmi_t)(void *vmi, int vcpu);
+typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
+typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
+
+struct vmm_ops {
+ vmm_init_func_t init; /* module wide initialization */
+ vmm_cleanup_func_t cleanup;
+
+ vmi_init_func_t vminit; /* vm-specific initialization */
+ vmi_run_func_t vmrun;
+ vmi_cleanup_func_t vmcleanup;
+ vmi_mmap_func_t vmmmap;
+ vmi_get_register_t vmgetreg;
+ vmi_set_register_t vmsetreg;
+ vmi_get_desc_t vmgetdesc;
+ vmi_set_desc_t vmsetdesc;
+ vmi_inject_event_t vminject;
+ vmi_inject_nmi_t vmnmi;
+ vmi_get_cap_t vmgetcap;
+ vmi_set_cap_t vmsetcap;
+};
+
+extern struct vmm_ops vmm_ops_intel;
+extern struct vmm_ops vmm_ops_amd;
+
+struct vm *vm_create(const char *name);
+void vm_destroy(struct vm *vm);
+const char *vm_name(struct vm *vm);
+int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t *ret_hpa);
+int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
+int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
+vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size);
+int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
+ struct vm_memory_segment *seg);
+int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
+int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
+int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *ret_desc);
+int vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *desc);
+int vm_get_pinning(struct vm *vm, int vcpu, int *cpuid);
+int vm_set_pinning(struct vm *vm, int vcpu, int cpuid);
+int vm_run(struct vm *vm, struct vm_run *vmrun);
+int vm_inject_event(struct vm *vm, int vcpu, int type,
+ int vector, uint32_t error_code, int error_code_valid);
+int vm_inject_nmi(struct vm *vm, int vcpu);
+uint64_t *vm_guest_msrs(struct vm *vm, int cpu);
+struct vlapic *vm_lapic(struct vm *vm, int cpu);
+int vm_get_capability(struct vm *vm, int vcpu, int type, int *val);
+int vm_set_capability(struct vm *vm, int vcpu, int type, int val);
+void vm_activate_cpu(struct vm *vm, int vcpu);
+cpumask_t vm_active_cpus(struct vm *vm);
+
+/*
+ * Return 1 if device indicated by bus/slot/func is supposed to be a
+ * pci passthrough device.
+ *
+ * Return 0 otherwise.
+ */
+int vmm_is_pptdev(int bus, int slot, int func);
+
+void *vm_iommu_domain(struct vm *vm);
+
+#define VCPU_STOPPED 0
+#define VCPU_RUNNING 1
+void vm_set_run_state(struct vm *vm, int vcpu, int running);
+int vm_get_run_state(struct vm *vm, int vcpu, int *hostcpu);
+
+void *vcpu_stats(struct vm *vm, int vcpu);
+
+static __inline int
+vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)
+{
+ return (vm_get_run_state(vm, vcpu, hostcpu) == VCPU_RUNNING);
+}
+
+static __inline cpumask_t
+vcpu_mask(int vcpuid)
+{
+ return ((cpumask_t)1 << vcpuid);
+}
+
+#endif /* _KERNEL */
+
+#define VM_MAXCPU 8 /* maximum virtual cpus */
+
+/*
+ * Identifiers for events that can be injected into the VM
+ */
+enum vm_event_type {
+ VM_EVENT_NONE,
+ VM_HW_INTR,
+ VM_NMI,
+ VM_HW_EXCEPTION,
+ VM_SW_INTR,
+ VM_PRIV_SW_EXCEPTION,
+ VM_SW_EXCEPTION,
+ VM_EVENT_MAX
+};
+
+/*
+ * Identifiers for architecturally defined registers.
+ */
+enum vm_reg_name {
+ VM_REG_GUEST_RAX,
+ VM_REG_GUEST_RBX,
+ VM_REG_GUEST_RCX,
+ VM_REG_GUEST_RDX,
+ VM_REG_GUEST_RSI,
+ VM_REG_GUEST_RDI,
+ VM_REG_GUEST_RBP,
+ VM_REG_GUEST_R8,
+ VM_REG_GUEST_R9,
+ VM_REG_GUEST_R10,
+ VM_REG_GUEST_R11,
+ VM_REG_GUEST_R12,
+ VM_REG_GUEST_R13,
+ VM_REG_GUEST_R14,
+ VM_REG_GUEST_R15,
+ VM_REG_GUEST_CR0,
+ VM_REG_GUEST_CR3,
+ VM_REG_GUEST_CR4,
+ VM_REG_GUEST_DR7,
+ VM_REG_GUEST_RSP,
+ VM_REG_GUEST_RIP,
+ VM_REG_GUEST_RFLAGS,
+ VM_REG_GUEST_ES,
+ VM_REG_GUEST_CS,
+ VM_REG_GUEST_SS,
+ VM_REG_GUEST_DS,
+ VM_REG_GUEST_FS,
+ VM_REG_GUEST_GS,
+ VM_REG_GUEST_LDTR,
+ VM_REG_GUEST_TR,
+ VM_REG_GUEST_IDTR,
+ VM_REG_GUEST_GDTR,
+ VM_REG_GUEST_EFER,
+ VM_REG_LAST
+};
+
+/*
+ * Identifiers for optional vmm capabilities
+ */
+enum vm_cap_type {
+ VM_CAP_HALT_EXIT,
+ VM_CAP_MTRAP_EXIT,
+ VM_CAP_PAUSE_EXIT,
+ VM_CAP_UNRESTRICTED_GUEST,
+ VM_CAP_MAX
+};
+
+/*
+ * The 'access' field has the format specified in Table 21-2 of the Intel
+ * Architecture Manual vol 3b.
+ *
+ * XXX The contents of the 'access' field are architecturally defined except
+ * bit 16 - Segment Unusable.
+ */
+struct seg_desc {
+ uint64_t base;
+ uint32_t limit;
+ uint32_t access;
+};
+
+enum vm_exitcode {
+ VM_EXITCODE_INOUT,
+ VM_EXITCODE_VMX,
+ VM_EXITCODE_BOGUS,
+ VM_EXITCODE_RDMSR,
+ VM_EXITCODE_WRMSR,
+ VM_EXITCODE_HLT,
+ VM_EXITCODE_MTRAP,
+ VM_EXITCODE_PAUSE,
+ VM_EXITCODE_MAX,
+};
+
+struct vm_exit {
+ enum vm_exitcode exitcode;
+ int inst_length; /* 0 means unknown */
+ uint64_t rip;
+ union {
+ struct {
+ uint16_t bytes:3; /* 1 or 2 or 4 */
+ uint16_t in:1; /* out is 0, in is 1 */
+ uint16_t string:1;
+ uint16_t rep:1;
+ uint16_t port;
+ uint32_t eax; /* valid for out */
+ } inout;
+ /*
+ * VMX specific payload. Used when there is no "better"
+ * exitcode to represent the VM-exit.
+ */
+ struct {
+ int error; /* vmx inst error */
+ uint32_t exit_reason;
+ uint64_t exit_qualification;
+ } vmx;
+ struct {
+ uint32_t code; /* ecx value */
+ uint64_t wval;
+ } msr;
+ } u;
+};
+
+#endif /* _VMM_H_ */
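
struct vm_exit is the kernel-to-userland contract: exitcode selects which arm of the union is valid. A hedged sketch of the dispatch a run loop might perform over the definitions above:

    #include <machine/vmm.h>

    static void
    handle_exit(const struct vm_exit *vme)
    {
    	switch (vme->exitcode) {
    	case VM_EXITCODE_INOUT:
    		/*
    		 * Port I/O: u.inout carries the port, access width in
    		 * bytes, direction ('in'), and the outgoing eax value.
    		 */
    		break;
    	case VM_EXITCODE_RDMSR:
    	case VM_EXITCODE_WRMSR:
    		/*
    		 * u.msr.code is the MSR number (ecx); u.msr.wval is the
    		 * value being written for WRMSR.
    		 */
    		break;
    	case VM_EXITCODE_VMX:
    		/* Unclassified exit: raw exit_reason/qualification. */
    		break;
    	default:
    		break;
    	}
    }
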
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
new file mode 100644
index 000000000000..1b143b527e46
--- /dev/null
+++ b/sys/amd64/include/vmm_dev.h
@@ -0,0 +1,191 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: vmm_dev.h 482 2011-05-09 21:22:43Z grehan $
+ */
+
+#ifndef _VMM_DEV_H_
+#define _VMM_DEV_H_
+
+#ifdef _KERNEL
+void vmmdev_init(void);
+void vmmdev_cleanup(void);
+#endif
+
+struct vm_memory_segment {
+ vm_paddr_t hpa; /* out */
+ vm_paddr_t gpa; /* in */
+ size_t len; /* in */
+};
+
+struct vm_register {
+ int cpuid;
+ int regnum; /* enum vm_reg_name */
+ uint64_t regval;
+};
+
+struct vm_seg_desc { /* data or code segment */
+ int cpuid;
+ int regnum; /* enum vm_reg_name */
+ struct seg_desc desc;
+};
+
+struct vm_pin {
+ int vm_cpuid;
+ int host_cpuid; /* -1 to unpin */
+};
+
+struct vm_run {
+ int cpuid;
+ uint64_t rip; /* start running here */
+ struct vm_exit vm_exit;
+};
+
+struct vm_event {
+ int cpuid;
+ enum vm_event_type type;
+ int vector;
+ uint32_t error_code;
+ int error_code_valid;
+};
+
+struct vm_lapic_irq {
+ int cpuid;
+ int vector;
+};
+
+struct vm_capability {
+ int cpuid;
+ enum vm_cap_type captype;
+ int capval;
+ int allcpus;
+};
+
+struct vm_pptdev {
+ int bus;
+ int slot;
+ int func;
+};
+
+struct vm_pptdev_mmio {
+ int bus;
+ int slot;
+ int func;
+ vm_paddr_t gpa;
+ vm_paddr_t hpa;
+ size_t len;
+};
+
+struct vm_pptdev_msi {
+ int vcpu;
+ int bus;
+ int slot;
+ int func;
+ int numvec; /* 0 means disabled */
+ int vector;
+ int destcpu;
+};
+
+struct vm_nmi {
+ int cpuid;
+};
+
+#define MAX_VM_STATS 64
+struct vm_stats {
+ int cpuid; /* in */
+ int num_entries; /* out */
+ struct timeval tv;
+ uint64_t statbuf[MAX_VM_STATS];
+};
+
+struct vm_stat_desc {
+ int index; /* in */
+ char desc[128]; /* out */
+};
+
+enum {
+ IOCNUM_RUN,
+ IOCNUM_SET_PINNING,
+ IOCNUM_GET_PINNING,
+ IOCNUM_MAP_MEMORY,
+ IOCNUM_GET_MEMORY_SEG,
+ IOCNUM_SET_REGISTER,
+ IOCNUM_GET_REGISTER,
+ IOCNUM_SET_SEGMENT_DESCRIPTOR,
+ IOCNUM_GET_SEGMENT_DESCRIPTOR,
+ IOCNUM_INJECT_EVENT,
+ IOCNUM_LAPIC_IRQ,
+ IOCNUM_SET_CAPABILITY,
+ IOCNUM_GET_CAPABILITY,
+ IOCNUM_BIND_PPTDEV,
+ IOCNUM_UNBIND_PPTDEV,
+ IOCNUM_MAP_PPTDEV_MMIO,
+ IOCNUM_PPTDEV_MSI,
+ IOCNUM_INJECT_NMI,
+ IOCNUM_VM_STATS,
+ IOCNUM_VM_STAT_DESC,
+};
+
+#define VM_RUN \
+ _IOWR('v', IOCNUM_RUN, struct vm_run)
+#define VM_SET_PINNING \
+ _IOW('v', IOCNUM_SET_PINNING, struct vm_pin)
+#define VM_GET_PINNING \
+ _IOWR('v', IOCNUM_GET_PINNING, struct vm_pin)
+#define VM_MAP_MEMORY \
+ _IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment)
+#define VM_GET_MEMORY_SEG \
+ _IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment)
+#define VM_SET_REGISTER \
+ _IOW('v', IOCNUM_SET_REGISTER, struct vm_register)
+#define VM_GET_REGISTER \
+ _IOWR('v', IOCNUM_GET_REGISTER, struct vm_register)
+#define VM_SET_SEGMENT_DESCRIPTOR \
+ _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
+#define VM_GET_SEGMENT_DESCRIPTOR \
+ _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
+#define VM_INJECT_EVENT \
+ _IOW('v', IOCNUM_INJECT_EVENT, struct vm_event)
+#define VM_LAPIC_IRQ \
+ _IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq)
+#define VM_SET_CAPABILITY \
+ _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability)
+#define VM_GET_CAPABILITY \
+ _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability)
+#define VM_BIND_PPTDEV \
+ _IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev)
+#define VM_UNBIND_PPTDEV \
+ _IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev)
+#define VM_MAP_PPTDEV_MMIO \
+ _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio)
+#define VM_PPTDEV_MSI \
+ _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi)
+#define VM_INJECT_NMI \
+ _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi)
+#define VM_STATS \
+ _IOWR('v', IOCNUM_VM_STATS, struct vm_stats)
+#define VM_STAT_DESC \
+ _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc)
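+
+/*
+ * Illustrative usage sketch (not part of this header). A userland
+ * consumer issues these ioctls against an open VM device node; the
+ * "/dev/vmm/<name>" path and the 'fd' and 'entry_rip' variables are
+ * assumptions for the example, based on how libvmmapi uses this
+ * interface:
+ *
+ *	struct vm_run vmrun;
+ *
+ *	vmrun.cpuid = 0;
+ *	vmrun.rip = entry_rip;
+ *	error = ioctl(fd, VM_RUN, &vmrun);
+ *	(on success, vmrun.vm_exit describes why the vcpu stopped)
+ */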
+#endif
diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c
new file mode 100644
index 000000000000..41e937a20da9
--- /dev/null
+++ b/sys/amd64/vmm/amd/amdv.c
@@ -0,0 +1,247 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+
+#include <machine/vmm.h>
+#include "io/iommu.h"
+
+static int
+amdv_init(void)
+{
+
+ printf("amdv_init: not implemented\n");
+ return (ENXIO);
+}
+
+static int
+amdv_cleanup(void)
+{
+
+ printf("amdv_cleanup: not implemented\n");
+ return (ENXIO);
+}
+
+static void *
+amdv_vminit(struct vm *vm)
+{
+
+ printf("amdv_vminit: not implemented\n");
+ return (NULL);
+}
+
+static int
+amdv_vmrun(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit)
+{
+
+ printf("amdv_vmrun: not implemented\n");
+ return (ENXIO);
+}
+
+static void
+amdv_vmcleanup(void *arg)
+{
+
+ printf("amdv_vmcleanup: not implemented\n");
+ return;
+}
+
+static int
+amdv_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, int prot, boolean_t spok)
+{
+
+ printf("amdv_vmmmap: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval)
+{
+
+ printf("amdv_getreg: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_setreg(void *arg, int vcpu, int regnum, uint64_t val)
+{
+
+ printf("amdv_setreg: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_getdesc(void *vmi, int vcpu, int num, struct seg_desc *desc)
+{
+
+ printf("amdv_get_desc: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_setdesc(void *vmi, int vcpu, int num, struct seg_desc *desc)
+{
+
+ printf("amdv_get_desc: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_inject_event(void *vmi, int vcpu, int type, int vector,
+ uint32_t error_code, int error_code_valid)
+{
+
+ printf("amdv_inject_event: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_nmi(void *arg, int vcpu)
+{
+
+ printf("amdv_nmi: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_getcap(void *arg, int vcpu, int type, int *retval)
+{
+
+ printf("amdv_getcap: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_setcap(void *arg, int vcpu, int type, int val)
+{
+
+ printf("amdv_setcap: not implemented\n");
+ return (EINVAL);
+}
+
+struct vmm_ops vmm_ops_amd = {
+ amdv_init,
+ amdv_cleanup,
+ amdv_vminit,
+ amdv_vmrun,
+ amdv_vmcleanup,
+ amdv_vmmmap,
+ amdv_getreg,
+ amdv_setreg,
+ amdv_getdesc,
+ amdv_setdesc,
+ amdv_inject_event,
+ amdv_nmi,
+ amdv_getcap,
+ amdv_setcap
+};
+
+static int
+amd_iommu_init(void)
+{
+
+ printf("amd_iommu_init: not implemented\n");
+ return (ENXIO);
+}
+
+static void
+amd_iommu_cleanup(void)
+{
+
+ printf("amd_iommu_cleanup: not implemented\n");
+}
+
+static void
+amd_iommu_enable(void)
+{
+
+ printf("amd_iommu_enable: not implemented\n");
+}
+
+static void
+amd_iommu_disable(void)
+{
+
+ printf("amd_iommu_disable: not implemented\n");
+}
+
+static void *
+amd_iommu_create_domain(vm_paddr_t maxaddr)
+{
+
+ printf("amd_iommu_create_domain: not implemented\n");
+ return (NULL);
+}
+
+static void
+amd_iommu_destroy_domain(void *domain)
+{
+
+ printf("amd_iommu_destroy_domain: not implemented\n");
+}
+
+static uint64_t
+amd_iommu_create_mapping(void *domain, vm_paddr_t gpa, vm_paddr_t hpa,
+ uint64_t len)
+{
+
+ printf("amd_iommu_create_mapping: not implemented\n");
+ return (0);
+}
+
+static void
+amd_iommu_add_device(void *domain, int bus, int slot, int func)
+{
+
+ printf("amd_iommu_add_device: not implemented\n");
+}
+
+static void
+amd_iommu_remove_device(void *domain, int bus, int slot, int func)
+{
+
+ printf("amd_iommu_remove_device: not implemented\n");
+}
+
+struct iommu_ops iommu_ops_amd = {
+ amd_iommu_init,
+ amd_iommu_cleanup,
+ amd_iommu_enable,
+ amd_iommu_disable,
+ amd_iommu_create_domain,
+ amd_iommu_destroy_domain,
+ amd_iommu_create_mapping,
+ amd_iommu_add_device,
+ amd_iommu_remove_device,
+};
diff --git a/sys/amd64/vmm/intel/ept.c b/sys/amd64/vmm/intel/ept.c
new file mode 100644
index 000000000000..c9fca9d5abee
--- /dev/null
+++ b/sys/amd64/vmm/intel/ept.c
@@ -0,0 +1,312 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/smp.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/param.h>
+#include <machine/cpufunc.h>
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+
+#include <machine/vmm.h>
+#include "vmx_cpufunc.h"
+#include "vmx_msr.h"
+#include "vmx.h"
+#include "ept.h"
+
+#define EPT_PWL4(cap) ((cap) & (1UL << 6))
+#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14))
+#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */
+#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */
+#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32))
+#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20))
+
+#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL
+#define INVVPID_ALL_TYPES_SUPPORTED(cap) \
+ (((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK)
+
+#define INVEPT_ALL_TYPES_MASK 0x6000000UL
+#define INVEPT_ALL_TYPES_SUPPORTED(cap) \
+ (((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK)
+
+#define EPT_PG_RD (1 << 0)
+#define EPT_PG_WR (1 << 1)
+#define EPT_PG_EX (1 << 2)
+#define EPT_PG_MEMORY_TYPE(x) ((x) << 3)
+#define EPT_PG_IGNORE_PAT (1 << 6)
+#define EPT_PG_SUPERPAGE (1 << 7)
+
+#define EPT_ADDR_MASK ((uint64_t)-1 << 12)
+
+MALLOC_DECLARE(M_VMX);
+
+static uint64_t page_sizes_mask;
+
+int
+ept_init(void)
+{
+ int page_shift;
+ uint64_t cap;
+
+ cap = rdmsr(MSR_VMX_EPT_VPID_CAP);
+
+ /*
+ * Verify that:
+ * - page walk length is 4 steps
+ * - extended page tables can be laid out in write-back memory
+ * - invvpid instruction with all possible types is supported
+ * - invept instruction with all possible types is supported
+ */
+ if (!EPT_PWL4(cap) ||
+ !EPT_MEMORY_TYPE_WB(cap) ||
+ !INVVPID_SUPPORTED(cap) ||
+ !INVVPID_ALL_TYPES_SUPPORTED(cap) ||
+ !INVEPT_SUPPORTED(cap) ||
+ !INVEPT_ALL_TYPES_SUPPORTED(cap))
+ return (EINVAL);
+
+ /* Set bits in 'page_sizes_mask' for each valid page size */
+ page_shift = PAGE_SHIFT;
+ page_sizes_mask = 1UL << page_shift; /* 4KB page */
+
+ page_shift += 9;
+ if (EPT_PDE_SUPERPAGE(cap))
+ page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */
+
+ page_shift += 9;
+ if (EPT_PDPTE_SUPERPAGE(cap))
+ page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */
+
+ return (0);
+}
+
+static size_t
+ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, vm_prot_t prot, boolean_t spok)
+{
+ int spshift, ptpshift, ptpindex, nlevels;
+
+ /*
+	 * Compute the size of the mapping that we can accommodate.
+ *
+ * This is based on three factors:
+ * - super page sizes supported by the processor
+ * - alignment of the region starting at 'gpa' and 'hpa'
+ * - length of the region 'len'
+ */
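+	/*
+	 * Worked example (hypothetical values): if 'gpa' and 'hpa' are
+	 * both 2MB aligned, 'length' is at least 2MB and the processor
+	 * supports 2MB superpages, the loop below settles on spshift = 21
+	 * and a single PDE-level superpage mapping is used.
+	 */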
+ spshift = PAGE_SHIFT;
+ if (spok)
+ spshift += (EPT_PWLEVELS - 1) * 9;
+ while (spshift >= PAGE_SHIFT) {
+ uint64_t spsize = 1UL << spshift;
+ if ((page_sizes_mask & spsize) != 0 &&
+ (gpa & (spsize - 1)) == 0 &&
+ (hpa & (spsize - 1)) == 0 &&
+ length >= spsize) {
+ break;
+ }
+ spshift -= 9;
+ }
+
+ if (spshift < PAGE_SHIFT) {
+ panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, "
+ "length 0x%016lx, page_sizes_mask 0x%016lx",
+ gpa, hpa, length, page_sizes_mask);
+ }
+
+ nlevels = EPT_PWLEVELS;
+ while (--nlevels >= 0) {
+ ptpshift = PAGE_SHIFT + nlevels * 9;
+ ptpindex = (gpa >> ptpshift) & 0x1FF;
+
+ /* We have reached the leaf mapping */
+ if (spshift >= ptpshift)
+ break;
+
+ /*
+ * We are working on a non-leaf page table page.
+ *
+ * Create the next level page table page if necessary and point
+ * to it from the current page table.
+ */
+ if (ptp[ptpindex] == 0) {
+ void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO);
+ ptp[ptpindex] = vtophys(nlp);
+ ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX;
+ }
+
+ /* Work our way down to the next level page table page */
+ ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK);
+ }
+
+ if ((gpa & ((1UL << ptpshift) - 1)) != 0) {
+ panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d "
+ "mismatch\n", gpa, ptpshift);
+ }
+
+ /* Do the mapping */
+ ptp[ptpindex] = hpa;
+
+ /* Apply the access controls */
+ if (prot & VM_PROT_READ)
+ ptp[ptpindex] |= EPT_PG_RD;
+ if (prot & VM_PROT_WRITE)
+ ptp[ptpindex] |= EPT_PG_WR;
+ if (prot & VM_PROT_EXECUTE)
+ ptp[ptpindex] |= EPT_PG_EX;
+
+ /*
+	 * XXX should we enforce this memory type by setting the 'ignore PAT'
+	 * bit to 1?
+ */
+ ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr);
+
+ if (nlevels > 0)
+ ptp[ptpindex] |= EPT_PG_SUPERPAGE;
+
+ return (1UL << ptpshift);
+}
+
+static void
+ept_free_pt_entry(pt_entry_t pte)
+{
+ if (pte == 0)
+ return;
+
+ /* sanity check */
+ if ((pte & EPT_PG_SUPERPAGE) != 0)
+ panic("ept_free_pt_entry: pte cannot have superpage bit");
+
+ return;
+}
+
+static void
+ept_free_pd_entry(pd_entry_t pde)
+{
+ pt_entry_t *pt;
+ int i;
+
+ if (pde == 0)
+ return;
+
+ if ((pde & EPT_PG_SUPERPAGE) == 0) {
+ pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK);
+ for (i = 0; i < NPTEPG; i++)
+ ept_free_pt_entry(pt[i]);
+ free(pt, M_VMX); /* free the page table page */
+ }
+}
+
+static void
+ept_free_pdp_entry(pdp_entry_t pdpe)
+{
+ pd_entry_t *pd;
+ int i;
+
+ if (pdpe == 0)
+ return;
+
+ if ((pdpe & EPT_PG_SUPERPAGE) == 0) {
+ pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK);
+ for (i = 0; i < NPDEPG; i++)
+ ept_free_pd_entry(pd[i]);
+ free(pd, M_VMX); /* free the page directory page */
+ }
+}
+
+static void
+ept_free_pml4_entry(pml4_entry_t pml4e)
+{
+ pdp_entry_t *pdp;
+ int i;
+
+ if (pml4e == 0)
+ return;
+
+ if ((pml4e & EPT_PG_SUPERPAGE) == 0) {
+ pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK);
+ for (i = 0; i < NPDPEPG; i++)
+ ept_free_pdp_entry(pdp[i]);
+ free(pdp, M_VMX); /* free the page directory ptr page */
+ }
+}
+
+void
+ept_vmcleanup(struct vmx *vmx)
+{
+ int i;
+
+ for (i = 0; i < NPML4EPG; i++)
+ ept_free_pml4_entry(vmx->pml4ept[i]);
+}
+
+int
+ept_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len,
+ vm_memattr_t attr, int prot, boolean_t spok)
+{
+ size_t n;
+ struct vmx *vmx = arg;
+
+ while (len > 0) {
+ n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr,
+ prot, spok);
+ len -= n;
+ gpa += n;
+ hpa += n;
+ }
+
+ return (0);
+}
+
+static void
+invept_single_context(void *arg)
+{
+ struct invept_desc desc = *(struct invept_desc *)arg;
+
+ invept(INVEPT_TYPE_SINGLE_CONTEXT, desc);
+}
+
+void
+ept_invalidate_mappings(u_long pml4ept)
+{
+ struct invept_desc invept_desc = { 0 };
+
+ invept_desc.eptp = EPTP(pml4ept);
+
+ smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc);
+}
diff --git a/sys/amd64/vmm/intel/ept.h b/sys/amd64/vmm/intel/ept.h
new file mode 100644
index 000000000000..013c330ed41a
--- /dev/null
+++ b/sys/amd64/vmm/intel/ept.h
@@ -0,0 +1,42 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _EPT_H_
+#define _EPT_H_
+
+struct vmx;
+
+#define EPT_PWLEVELS 4 /* page walk levels */
+#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK)
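+
+/*
+ * Explanatory note: EPTP() packs the EPT paging-structure memory type
+ * into bits 2:0 (PAT_WRITE_BACK == 6) and the page-walk length minus
+ * one into bits 5:3 of the page-aligned PML4 physical address.
+ */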
+
+int ept_init(void);
+int ept_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings);
+void ept_invalidate_mappings(u_long ept_pml4);
+void ept_vmcleanup(struct vmx *vmx);
+#endif
diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c
new file mode 100644
index 000000000000..80d45ccebb25
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmcs.c
@@ -0,0 +1,451 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/pcpu.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/segments.h>
+#include <machine/pmap.h>
+
+#include <machine/vmm.h>
+#include "vmcs.h"
+#include "vmx_cpufunc.h"
+#include "ept.h"
+#include "vmx.h"
+
+static uint64_t
+vmcs_fix_regval(uint32_t encoding, uint64_t val)
+{
+
+ switch (encoding) {
+ case VMCS_GUEST_CR0:
+ val = vmx_fix_cr0(val);
+ break;
+ case VMCS_GUEST_CR4:
+ val = vmx_fix_cr4(val);
+ break;
+ default:
+ break;
+ }
+ return (val);
+}
+
+static uint32_t
+vmcs_field_encoding(int ident)
+{
+ switch (ident) {
+ case VM_REG_GUEST_CR0:
+ return (VMCS_GUEST_CR0);
+ case VM_REG_GUEST_CR3:
+ return (VMCS_GUEST_CR3);
+ case VM_REG_GUEST_CR4:
+ return (VMCS_GUEST_CR4);
+ case VM_REG_GUEST_DR7:
+ return (VMCS_GUEST_DR7);
+ case VM_REG_GUEST_RSP:
+ return (VMCS_GUEST_RSP);
+ case VM_REG_GUEST_RIP:
+ return (VMCS_GUEST_RIP);
+ case VM_REG_GUEST_RFLAGS:
+ return (VMCS_GUEST_RFLAGS);
+ case VM_REG_GUEST_ES:
+ return (VMCS_GUEST_ES_SELECTOR);
+ case VM_REG_GUEST_CS:
+ return (VMCS_GUEST_CS_SELECTOR);
+ case VM_REG_GUEST_SS:
+ return (VMCS_GUEST_SS_SELECTOR);
+ case VM_REG_GUEST_DS:
+ return (VMCS_GUEST_DS_SELECTOR);
+ case VM_REG_GUEST_FS:
+ return (VMCS_GUEST_FS_SELECTOR);
+ case VM_REG_GUEST_GS:
+ return (VMCS_GUEST_GS_SELECTOR);
+ case VM_REG_GUEST_TR:
+ return (VMCS_GUEST_TR_SELECTOR);
+ case VM_REG_GUEST_LDTR:
+ return (VMCS_GUEST_LDTR_SELECTOR);
+ case VM_REG_GUEST_EFER:
+ return (VMCS_GUEST_IA32_EFER);
+ default:
+ return (-1);
+ }
+}
+
+static int
+vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc)
+{
+
+ switch (seg) {
+ case VM_REG_GUEST_ES:
+ *base = VMCS_GUEST_ES_BASE;
+ *lim = VMCS_GUEST_ES_LIMIT;
+ *acc = VMCS_GUEST_ES_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_CS:
+ *base = VMCS_GUEST_CS_BASE;
+ *lim = VMCS_GUEST_CS_LIMIT;
+ *acc = VMCS_GUEST_CS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_SS:
+ *base = VMCS_GUEST_SS_BASE;
+ *lim = VMCS_GUEST_SS_LIMIT;
+ *acc = VMCS_GUEST_SS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_DS:
+ *base = VMCS_GUEST_DS_BASE;
+ *lim = VMCS_GUEST_DS_LIMIT;
+ *acc = VMCS_GUEST_DS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_FS:
+ *base = VMCS_GUEST_FS_BASE;
+ *lim = VMCS_GUEST_FS_LIMIT;
+ *acc = VMCS_GUEST_FS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_GS:
+ *base = VMCS_GUEST_GS_BASE;
+ *lim = VMCS_GUEST_GS_LIMIT;
+ *acc = VMCS_GUEST_GS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_TR:
+ *base = VMCS_GUEST_TR_BASE;
+ *lim = VMCS_GUEST_TR_LIMIT;
+ *acc = VMCS_GUEST_TR_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_LDTR:
+ *base = VMCS_GUEST_LDTR_BASE;
+ *lim = VMCS_GUEST_LDTR_LIMIT;
+ *acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_IDTR:
+ *base = VMCS_GUEST_IDTR_BASE;
+ *lim = VMCS_GUEST_IDTR_LIMIT;
+ *acc = VMCS_INVALID_ENCODING;
+ break;
+ case VM_REG_GUEST_GDTR:
+ *base = VMCS_GUEST_GDTR_BASE;
+ *lim = VMCS_GUEST_GDTR_LIMIT;
+ *acc = VMCS_INVALID_ENCODING;
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+int
+vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval)
+{
+ int error;
+ uint32_t encoding;
+
+ /*
+ * If we need to get at vmx-specific state in the VMCS we can bypass
+ * the translation of 'ident' to 'encoding' by simply setting the
+	 * sign bit. As it so happens the upper 16 bits are reserved (i.e.
+ * set to 0) in the encodings for the VMCS so we are free to use the
+ * sign bit.
+ */
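+	/*
+	 * For example (illustrative): vmcs_getreg(vmcs, VM_REG_GUEST_RIP,
+	 * &val) goes through vmcs_field_encoding(), whereas
+	 * vmcs_getreg(vmcs, VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &val)
+	 * reads the raw field directly because VMCS_IDENT() sets bit 31.
+	 */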
+ if (ident < 0)
+ encoding = ident & 0x7fffffff;
+ else
+ encoding = vmcs_field_encoding(ident);
+
+ if (encoding == (uint32_t)-1)
+ return (EINVAL);
+
+ VMPTRLD(vmcs);
+ error = vmread(encoding, retval);
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val)
+{
+ int error;
+ uint32_t encoding;
+
+ if (ident < 0)
+ encoding = ident & 0x7fffffff;
+ else
+ encoding = vmcs_field_encoding(ident);
+
+ if (encoding == (uint32_t)-1)
+ return (EINVAL);
+
+ val = vmcs_fix_regval(encoding, val);
+
+ VMPTRLD(vmcs);
+ error = vmwrite(encoding, val);
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_setdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc)
+{
+ int error;
+ uint32_t base, limit, access;
+
+ error = vmcs_seg_desc_encoding(seg, &base, &limit, &access);
+ if (error != 0)
+ panic("vmcs_setdesc: invalid segment register %d", seg);
+
+ VMPTRLD(vmcs);
+ if ((error = vmwrite(base, desc->base)) != 0)
+ goto done;
+
+ if ((error = vmwrite(limit, desc->limit)) != 0)
+ goto done;
+
+ if (access != VMCS_INVALID_ENCODING) {
+ if ((error = vmwrite(access, desc->access)) != 0)
+ goto done;
+ }
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_getdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc)
+{
+ int error;
+ uint32_t base, limit, access;
+ uint64_t u64;
+
+ error = vmcs_seg_desc_encoding(seg, &base, &limit, &access);
+ if (error != 0)
+ panic("vmcs_getdesc: invalid segment register %d", seg);
+
+ VMPTRLD(vmcs);
+ if ((error = vmread(base, &u64)) != 0)
+ goto done;
+ desc->base = u64;
+
+ if ((error = vmread(limit, &u64)) != 0)
+ goto done;
+ desc->limit = u64;
+
+ if (access != VMCS_INVALID_ENCODING) {
+ if ((error = vmread(access, &u64)) != 0)
+ goto done;
+ desc->access = u64;
+ }
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count)
+{
+ int error;
+
+ VMPTRLD(vmcs);
+
+ /*
+ * Guest MSRs are saved in the VM-exit MSR-store area.
+ * Guest MSRs are loaded from the VM-entry MSR-load area.
+ * Both areas point to the same location in memory.
+ */
+ if ((error = vmwrite(VMCS_EXIT_MSR_STORE, g_area)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count)) != 0)
+ goto done;
+
+ error = 0;
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_set_defaults(struct vmcs *vmcs,
+ u_long host_rip, u_long host_rsp, u_long ept_pml4,
+ uint32_t pinbased_ctls, uint32_t procbased_ctls,
+ uint32_t procbased_ctls2, uint32_t exit_ctls,
+ uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid)
+{
+ int error, codesel, datasel, tsssel;
+ u_long cr0, cr4, efer;
+ uint64_t eptp, pat;
+ uint32_t exc_bitmap;
+
+ codesel = GSEL(GCODE_SEL, SEL_KPL);
+ datasel = GSEL(GDATA_SEL, SEL_KPL);
+ tsssel = GSEL(GPROC0_SEL, SEL_KPL);
+
+ /*
+ * Make sure we have a "current" VMCS to work with.
+ */
+ VMPTRLD(vmcs);
+
+ /*
+ * Load the VMX controls
+ */
+ if ((error = vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_EXIT_CTLS, exit_ctls)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls)) != 0)
+ goto done;
+
+ /* Guest state */
+
+ /* Initialize guest IA32_PAT MSR with the default value */
+ pat = PAT_VALUE(0, PAT_WRITE_BACK) |
+ PAT_VALUE(1, PAT_WRITE_THROUGH) |
+ PAT_VALUE(2, PAT_UNCACHED) |
+ PAT_VALUE(3, PAT_UNCACHEABLE) |
+ PAT_VALUE(4, PAT_WRITE_BACK) |
+ PAT_VALUE(5, PAT_WRITE_THROUGH) |
+ PAT_VALUE(6, PAT_UNCACHED) |
+ PAT_VALUE(7, PAT_UNCACHEABLE);
+ if ((error = vmwrite(VMCS_GUEST_IA32_PAT, pat)) != 0)
+ goto done;
+
+ /* Host state */
+
+ /* Initialize host IA32_PAT MSR */
+ pat = rdmsr(MSR_PAT);
+ if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0)
+ goto done;
+
+ /* Load the IA32_EFER MSR */
+ efer = rdmsr(MSR_EFER);
+ if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0)
+ goto done;
+
+ /* Load the control registers */
+ cr0 = rcr0();
+ if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0)
+ goto done;
+
+ cr4 = rcr4();
+ if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0)
+ goto done;
+
+ /* Load the segment selectors */
+ if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_TR_SELECTOR, tsssel)) != 0)
+ goto done;
+
+ /*
+	 * Load the base address for %fs and idtr.
+ *
+ * Note that we exclude %gs, tss and gdtr here because their base
+ * address is pcpu specific.
+ */
+ if ((error = vmwrite(VMCS_HOST_FS_BASE, 0)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_IDTR_BASE, r_idt.rd_base)) != 0)
+ goto done;
+
+ /* instruction pointer */
+ if ((error = vmwrite(VMCS_HOST_RIP, host_rip)) != 0)
+ goto done;
+
+ /* stack pointer */
+ if ((error = vmwrite(VMCS_HOST_RSP, host_rsp)) != 0)
+ goto done;
+
+ /* eptp */
+ eptp = EPTP(ept_pml4);
+ if ((error = vmwrite(VMCS_EPTP, eptp)) != 0)
+ goto done;
+
+ /* vpid */
+ if ((error = vmwrite(VMCS_VPID, vpid)) != 0)
+ goto done;
+
+ /* msr bitmap */
+ if ((error = vmwrite(VMCS_MSR_BITMAP, msr_bitmap)) != 0)
+ goto done;
+
+ /* exception bitmap */
+ exc_bitmap = 1 << IDT_MC;
+ if ((error = vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap)) != 0)
+ goto done;
+
+ /* link pointer */
+ if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0)
+ goto done;
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+uint64_t
+vmcs_read(uint32_t encoding)
+{
+ int error;
+ uint64_t val;
+
+ error = vmread(encoding, &val);
+ if (error != 0)
+ panic("vmcs_read(%u) error %d", encoding, error);
+
+ return (val);
+}
diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h
new file mode 100644
index 000000000000..c633a5957a07
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmcs.h
@@ -0,0 +1,324 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMCS_H_
+#define _VMCS_H_
+
+#ifdef _KERNEL
+struct vmcs {
+ uint32_t identifier;
+ uint32_t abort_code;
+ char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2];
+};
+CTASSERT(sizeof(struct vmcs) == PAGE_SIZE);
+
+/* MSR save region is composed of an array of 'struct msr_entry' */
+struct msr_entry {
+ uint32_t index;
+ uint32_t reserved;
+ uint64_t val;
+};
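+
+/*
+ * Explanatory note: this layout mirrors the 128-bit entry format that
+ * VMX defines for the MSR save/load areas: MSR index in bits 31:0,
+ * bits 63:32 reserved, and the MSR value in bits 127:64.
+ */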
+
+int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count);
+int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp,
+ u_long ept_pml4,
+ uint32_t pinbased_ctls, uint32_t procbased_ctls,
+ uint32_t procbased_ctls2, uint32_t exit_ctls,
+ uint32_t entry_ctls, u_long msr_bitmap,
+ uint16_t vpid);
+int vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval);
+int vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val);
+int vmcs_getdesc(struct vmcs *vmcs, int ident,
+ struct seg_desc *desc);
+int vmcs_setdesc(struct vmcs *vmcs, int ident,
+ struct seg_desc *desc);
+uint64_t vmcs_read(uint32_t encoding);
+
+#define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH)
+#define vmcs_guest_rip() vmcs_read(VMCS_GUEST_RIP)
+#define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR)
+#define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff)
+#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION)
+
+#endif /* _KERNEL */
+
+#define VMCS_IDENT(encoding) ((encoding) | 0x80000000)
+/*
+ * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B.
+ */
+#define VMCS_INVALID_ENCODING 0xffffffff
+
+/* 16-bit control fields */
+#define VMCS_VPID 0x00000000
+
+/* 16-bit guest-state fields */
+#define VMCS_GUEST_ES_SELECTOR 0x00000800
+#define VMCS_GUEST_CS_SELECTOR 0x00000802
+#define VMCS_GUEST_SS_SELECTOR 0x00000804
+#define VMCS_GUEST_DS_SELECTOR 0x00000806
+#define VMCS_GUEST_FS_SELECTOR 0x00000808
+#define VMCS_GUEST_GS_SELECTOR 0x0000080A
+#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C
+#define VMCS_GUEST_TR_SELECTOR 0x0000080E
+
+/* 16-bit host-state fields */
+#define VMCS_HOST_ES_SELECTOR 0x00000C00
+#define VMCS_HOST_CS_SELECTOR 0x00000C02
+#define VMCS_HOST_SS_SELECTOR 0x00000C04
+#define VMCS_HOST_DS_SELECTOR 0x00000C06
+#define VMCS_HOST_FS_SELECTOR 0x00000C08
+#define VMCS_HOST_GS_SELECTOR 0x00000C0A
+#define VMCS_HOST_TR_SELECTOR 0x00000C0C
+
+/* 64-bit control fields */
+#define VMCS_IO_BITMAP_A 0x00002000
+#define VMCS_IO_BITMAP_B 0x00002002
+#define VMCS_MSR_BITMAP 0x00002004
+#define VMCS_EXIT_MSR_STORE 0x00002006
+#define VMCS_EXIT_MSR_LOAD 0x00002008
+#define VMCS_ENTRY_MSR_LOAD 0x0000200A
+#define VMCS_EXECUTIVE_VMCS 0x0000200C
+#define VMCS_TSC_OFFSET 0x00002010
+#define VMCS_VIRTUAL_APIC 0x00002012
+#define VMCS_APIC_ACCESS 0x00002014
+#define VMCS_EPTP 0x0000201A
+
+/* 64-bit read-only fields */
+#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400
+
+/* 64-bit guest-state fields */
+#define VMCS_LINK_POINTER 0x00002800
+#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802
+#define VMCS_GUEST_IA32_PAT 0x00002804
+#define VMCS_GUEST_IA32_EFER 0x00002806
+#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808
+#define VMCS_GUEST_PDPTE0 0x0000280A
+#define VMCS_GUEST_PDPTE1 0x0000280C
+#define VMCS_GUEST_PDPTE2 0x0000280E
+#define VMCS_GUEST_PDPTE3 0x00002810
+
+/* 64-bit host-state fields */
+#define VMCS_HOST_IA32_PAT 0x00002C00
+#define VMCS_HOST_IA32_EFER 0x00002C02
+#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04
+
+/* 32-bit control fields */
+#define VMCS_PIN_BASED_CTLS 0x00004000
+#define VMCS_PRI_PROC_BASED_CTLS 0x00004002
+#define VMCS_EXCEPTION_BITMAP 0x00004004
+#define VMCS_PF_ERROR_MASK 0x00004006
+#define VMCS_PF_ERROR_MATCH 0x00004008
+#define VMCS_CR3_TARGET_COUNT 0x0000400A
+#define VMCS_EXIT_CTLS 0x0000400C
+#define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E
+#define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010
+#define VMCS_ENTRY_CTLS 0x00004012
+#define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014
+#define VMCS_ENTRY_INTR_INFO 0x00004016
+#define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018
+#define VMCS_ENTRY_INST_LENGTH 0x0000401A
+#define VMCS_TPR_THRESHOLD 0x0000401C
+#define VMCS_SEC_PROC_BASED_CTLS 0x0000401E
+#define VMCS_PLE_GAP 0x00004020
+#define VMCS_PLE_WINDOW 0x00004022
+
+/* 32-bit read-only data fields */
+#define VMCS_INSTRUCTION_ERROR 0x00004400
+#define VMCS_EXIT_REASON 0x00004402
+#define VMCS_EXIT_INTERRUPTION_INFO 0x00004404
+#define VMCS_EXIT_INTERRUPTION_ERROR 0x00004406
+#define VMCS_IDT_VECTORING_INFO 0x00004408
+#define VMCS_IDT_VECTORING_ERROR 0x0000440A
+#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C
+#define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E
+
+/* 32-bit guest-state fields */
+#define VMCS_GUEST_ES_LIMIT 0x00004800
+#define VMCS_GUEST_CS_LIMIT 0x00004802
+#define VMCS_GUEST_SS_LIMIT 0x00004804
+#define VMCS_GUEST_DS_LIMIT 0x00004806
+#define VMCS_GUEST_FS_LIMIT 0x00004808
+#define VMCS_GUEST_GS_LIMIT 0x0000480A
+#define VMCS_GUEST_LDTR_LIMIT 0x0000480C
+#define VMCS_GUEST_TR_LIMIT 0x0000480E
+#define VMCS_GUEST_GDTR_LIMIT 0x00004810
+#define VMCS_GUEST_IDTR_LIMIT 0x00004812
+#define VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814
+#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816
+#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818
+#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A
+#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C
+#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E
+#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820
+#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822
+#define VMCS_GUEST_INTERRUPTIBILITY 0x00004824
+#define VMCS_GUEST_ACTIVITY 0x00004826
+#define VMCS_GUEST_SMBASE 0x00004828
+#define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A
+#define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E
+
+/* 32-bit host state fields */
+#define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00
+
+/* Natural Width control fields */
+#define VMCS_CR0_MASK 0x00006000
+#define VMCS_CR4_MASK 0x00006002
+#define VMCS_CR0_SHADOW 0x00006004
+#define VMCS_CR4_SHADOW 0x00006006
+#define VMCS_CR3_TARGET0 0x00006008
+#define VMCS_CR3_TARGET1 0x0000600A
+#define VMCS_CR3_TARGET2 0x0000600C
+#define VMCS_CR3_TARGET3 0x0000600E
+
+/* Natural Width read-only fields */
+#define VMCS_EXIT_QUALIFICATION 0x00006400
+#define VMCS_IO_RCX 0x00006402
+#define VMCS_IO_RSI 0x00006404
+#define VMCS_IO_RDI 0x00006406
+#define VMCS_IO_RIP 0x00006408
+#define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A
+
+/* Natural Width guest-state fields */
+#define VMCS_GUEST_CR0 0x00006800
+#define VMCS_GUEST_CR3 0x00006802
+#define VMCS_GUEST_CR4 0x00006804
+#define VMCS_GUEST_ES_BASE 0x00006806
+#define VMCS_GUEST_CS_BASE 0x00006808
+#define VMCS_GUEST_SS_BASE 0x0000680A
+#define VMCS_GUEST_DS_BASE 0x0000680C
+#define VMCS_GUEST_FS_BASE 0x0000680E
+#define VMCS_GUEST_GS_BASE 0x00006810
+#define VMCS_GUEST_LDTR_BASE 0x00006812
+#define VMCS_GUEST_TR_BASE 0x00006814
+#define VMCS_GUEST_GDTR_BASE 0x00006816
+#define VMCS_GUEST_IDTR_BASE 0x00006818
+#define VMCS_GUEST_DR7 0x0000681A
+#define VMCS_GUEST_RSP 0x0000681C
+#define VMCS_GUEST_RIP 0x0000681E
+#define VMCS_GUEST_RFLAGS 0x00006820
+#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822
+#define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824
+#define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826
+
+/* Natural Width host-state fields */
+#define VMCS_HOST_CR0 0x00006C00
+#define VMCS_HOST_CR3 0x00006C02
+#define VMCS_HOST_CR4 0x00006C04
+#define VMCS_HOST_FS_BASE 0x00006C06
+#define VMCS_HOST_GS_BASE 0x00006C08
+#define VMCS_HOST_TR_BASE 0x00006C0A
+#define VMCS_HOST_GDTR_BASE 0x00006C0C
+#define VMCS_HOST_IDTR_BASE 0x00006C0E
+#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10
+#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12
+#define VMCS_HOST_RSP 0x00006C14
+#define	VMCS_HOST_RIP			0x00006C16
+
+/*
+ * VM instruction error numbers
+ */
+#define VMRESUME_WITH_NON_LAUNCHED_VMCS 5
+
+/*
+ * VMCS exit reasons
+ */
+#define EXIT_REASON_EXCEPTION 0
+#define EXIT_REASON_EXT_INTR 1
+#define EXIT_REASON_TRIPLE_FAULT 2
+#define EXIT_REASON_INIT 3
+#define EXIT_REASON_SIPI 4
+#define EXIT_REASON_IO_SMI 5
+#define EXIT_REASON_SMI 6
+#define EXIT_REASON_INTR_WINDOW 7
+#define EXIT_REASON_NMI_WINDOW 8
+#define EXIT_REASON_TASK_SWITCH 9
+#define EXIT_REASON_CPUID 10
+#define EXIT_REASON_GETSEC 11
+#define EXIT_REASON_HLT 12
+#define EXIT_REASON_INVD 13
+#define EXIT_REASON_INVLPG 14
+#define EXIT_REASON_RDPMC 15
+#define EXIT_REASON_RDTSC 16
+#define EXIT_REASON_RSM 17
+#define EXIT_REASON_VMCALL 18
+#define EXIT_REASON_VMCLEAR 19
+#define EXIT_REASON_VMLAUNCH 20
+#define EXIT_REASON_VMPTRLD 21
+#define EXIT_REASON_VMPTRST 22
+#define EXIT_REASON_VMREAD 23
+#define EXIT_REASON_VMRESUME 24
+#define EXIT_REASON_VMWRITE 25
+#define EXIT_REASON_VMXOFF 26
+#define EXIT_REASON_VMXON 27
+#define EXIT_REASON_CR_ACCESS 28
+#define EXIT_REASON_DR_ACCESS 29
+#define EXIT_REASON_INOUT 30
+#define EXIT_REASON_RDMSR 31
+#define EXIT_REASON_WRMSR 32
+#define EXIT_REASON_INVAL_VMCS 33
+#define EXIT_REASON_INVAL_MSR 34
+#define EXIT_REASON_MWAIT 36
+#define EXIT_REASON_MTF 37
+#define EXIT_REASON_MONITOR 39
+#define EXIT_REASON_PAUSE 40
+#define EXIT_REASON_MCE 41
+#define EXIT_REASON_TPR 43
+#define EXIT_REASON_APIC 44
+#define EXIT_REASON_GDTR_IDTR 46
+#define EXIT_REASON_LDTR_TR 47
+#define EXIT_REASON_EPT_FAULT 48
+#define EXIT_REASON_EPT_MISCONFIG 49
+#define EXIT_REASON_INVEPT 50
+#define EXIT_REASON_RDTSCP 51
+#define EXIT_REASON_VMX_PREEMPT 52
+#define EXIT_REASON_INVVPID 53
+#define EXIT_REASON_WBINVD 54
+#define EXIT_REASON_XSETBV 55
+
+/*
+ * VMCS interrupt information fields
+ */
+#define VMCS_INTERRUPTION_INFO_VALID (1 << 31)
+#define VMCS_INTERRUPTION_INFO_HW_INTR (0 << 8)
+#define VMCS_INTERRUPTION_INFO_NMI (2 << 8)
+
+/*
+ * VMCS Guest interruptibility field
+ */
+#define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0)
+#define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1 << 1)
+#define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1 << 2)
+#define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1 << 3)
+
+/*
+ * Exit qualification for EXIT_REASON_INVAL_VMCS
+ */
+#define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3
+
+#endif
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
new file mode 100644
index 000000000000..ec181c40ab90
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -0,0 +1,1673 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/psl.h>
+#include <machine/cpufunc.h>
+#include <machine/pmap.h>
+#include <machine/segments.h>
+#include <machine/vmparam.h>
+
+#include <machine/vmm.h>
+#include "vmm_lapic.h"
+#include "vmm_msr.h"
+#include "vmm_ktr.h"
+#include "vmm_stat.h"
+
+#include "vmx_msr.h"
+#include "ept.h"
+#include "vmx_cpufunc.h"
+#include "vmx.h"
+#include "x86.h"
+#include "vmx_controls.h"
+
+#define CR4_VMXE (1UL << 13)
+
+#define PINBASED_CTLS_ONE_SETTING \
+ (PINBASED_EXTINT_EXITING | \
+ PINBASED_NMI_EXITING | \
+ PINBASED_VIRTUAL_NMI)
+#define PINBASED_CTLS_ZERO_SETTING 0
+
+#define PROCBASED_CTLS_WINDOW_SETTING \
+ (PROCBASED_INT_WINDOW_EXITING | \
+ PROCBASED_NMI_WINDOW_EXITING)
+
+#define PROCBASED_CTLS_ONE_SETTING \
+ (PROCBASED_SECONDARY_CONTROLS | \
+ PROCBASED_IO_EXITING | \
+ PROCBASED_MSR_BITMAPS | \
+ PROCBASED_CTLS_WINDOW_SETTING)
+#define PROCBASED_CTLS_ZERO_SETTING \
+ (PROCBASED_CR3_LOAD_EXITING | \
+ PROCBASED_CR3_STORE_EXITING | \
+ PROCBASED_IO_BITMAPS)
+
+#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT
+#define PROCBASED_CTLS2_ZERO_SETTING 0
+
+#define VM_EXIT_CTLS_ONE_SETTING \
+ (VM_EXIT_HOST_LMA | \
+ VM_EXIT_SAVE_EFER | \
+ VM_EXIT_SAVE_PAT | \
+ VM_EXIT_LOAD_PAT | \
+ VM_EXIT_LOAD_EFER)
+#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS
+
+#define VM_ENTRY_CTLS_ONE_SETTING \
+ (VM_ENTRY_LOAD_PAT | \
+ VM_ENTRY_LOAD_EFER)
+#define VM_ENTRY_CTLS_ZERO_SETTING \
+ (VM_ENTRY_LOAD_DEBUG_CONTROLS | \
+ VM_ENTRY_INTO_SMM | \
+ VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
+
+#define guest_msr_rw(vmx, msr) \
+ msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
+
+#define HANDLED 1
+#define UNHANDLED 0
+
+MALLOC_DEFINE(M_VMX, "vmx", "vmx");
+
+extern struct pcpu __pcpu[];
+
+static int vmxon_enabled[MAXCPU];
+static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
+
+static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
+static uint32_t exit_ctls, entry_ctls;
+
+static uint64_t cr0_ones_mask, cr0_zeros_mask;
+static uint64_t cr4_ones_mask, cr4_zeros_mask;
+
+static volatile u_int nextvpid;
+
+/*
+ * Virtual NMI blocking conditions.
+ *
+ * Some processor implementations also require NMI to be blocked if
+ * the STI_BLOCKING bit is set. It is possible to detect this at runtime
+ * based on the (exit_reason,exit_qual) tuple being set to
+ * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING).
+ *
+ * We take the easy way out and also include STI_BLOCKING as one of the
+ * gating items for vNMI injection.
+ */
+static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING |
+ VMCS_INTERRUPTIBILITY_NMI_BLOCKING |
+ VMCS_INTERRUPTIBILITY_STI_BLOCKING;
+
+/*
+ * Optional capabilities
+ */
+static int cap_halt_exit;
+static int cap_pause_exit;
+static int cap_unrestricted_guest;
+static int cap_monitor_trap;
+
+/* statistics */
+static VMM_STAT_DEFINE(VCPU_MIGRATIONS, "vcpu migration across host cpus");
+static VMM_STAT_DEFINE(VMEXIT_EXTINT, "vm exits due to external interrupt");
+
+#ifdef KTR
+static const char *
+exit_reason_to_str(int reason)
+{
+ static char reasonbuf[32];
+
+ switch (reason) {
+ case EXIT_REASON_EXCEPTION:
+ return "exception";
+ case EXIT_REASON_EXT_INTR:
+ return "extint";
+ case EXIT_REASON_TRIPLE_FAULT:
+ return "triplefault";
+ case EXIT_REASON_INIT:
+ return "init";
+ case EXIT_REASON_SIPI:
+ return "sipi";
+ case EXIT_REASON_IO_SMI:
+ return "iosmi";
+ case EXIT_REASON_SMI:
+ return "smi";
+ case EXIT_REASON_INTR_WINDOW:
+ return "intrwindow";
+ case EXIT_REASON_NMI_WINDOW:
+ return "nmiwindow";
+ case EXIT_REASON_TASK_SWITCH:
+ return "taskswitch";
+ case EXIT_REASON_CPUID:
+ return "cpuid";
+ case EXIT_REASON_GETSEC:
+ return "getsec";
+ case EXIT_REASON_HLT:
+ return "hlt";
+ case EXIT_REASON_INVD:
+ return "invd";
+ case EXIT_REASON_INVLPG:
+ return "invlpg";
+ case EXIT_REASON_RDPMC:
+ return "rdpmc";
+ case EXIT_REASON_RDTSC:
+ return "rdtsc";
+ case EXIT_REASON_RSM:
+ return "rsm";
+ case EXIT_REASON_VMCALL:
+ return "vmcall";
+ case EXIT_REASON_VMCLEAR:
+ return "vmclear";
+ case EXIT_REASON_VMLAUNCH:
+ return "vmlaunch";
+ case EXIT_REASON_VMPTRLD:
+ return "vmptrld";
+ case EXIT_REASON_VMPTRST:
+ return "vmptrst";
+ case EXIT_REASON_VMREAD:
+ return "vmread";
+ case EXIT_REASON_VMRESUME:
+ return "vmresume";
+ case EXIT_REASON_VMWRITE:
+ return "vmwrite";
+ case EXIT_REASON_VMXOFF:
+ return "vmxoff";
+ case EXIT_REASON_VMXON:
+ return "vmxon";
+ case EXIT_REASON_CR_ACCESS:
+ return "craccess";
+ case EXIT_REASON_DR_ACCESS:
+ return "draccess";
+ case EXIT_REASON_INOUT:
+ return "inout";
+ case EXIT_REASON_RDMSR:
+ return "rdmsr";
+ case EXIT_REASON_WRMSR:
+ return "wrmsr";
+ case EXIT_REASON_INVAL_VMCS:
+ return "invalvmcs";
+ case EXIT_REASON_INVAL_MSR:
+ return "invalmsr";
+ case EXIT_REASON_MWAIT:
+ return "mwait";
+ case EXIT_REASON_MTF:
+ return "mtf";
+ case EXIT_REASON_MONITOR:
+ return "monitor";
+ case EXIT_REASON_PAUSE:
+ return "pause";
+ case EXIT_REASON_MCE:
+ return "mce";
+ case EXIT_REASON_TPR:
+ return "tpr";
+ case EXIT_REASON_APIC:
+ return "apic";
+ case EXIT_REASON_GDTR_IDTR:
+ return "gdtridtr";
+ case EXIT_REASON_LDTR_TR:
+ return "ldtrtr";
+ case EXIT_REASON_EPT_FAULT:
+ return "eptfault";
+ case EXIT_REASON_EPT_MISCONFIG:
+ return "eptmisconfig";
+ case EXIT_REASON_INVEPT:
+ return "invept";
+ case EXIT_REASON_RDTSCP:
+ return "rdtscp";
+ case EXIT_REASON_VMX_PREEMPT:
+ return "vmxpreempt";
+ case EXIT_REASON_INVVPID:
+ return "invvpid";
+ case EXIT_REASON_WBINVD:
+ return "wbinvd";
+ case EXIT_REASON_XSETBV:
+ return "xsetbv";
+ default:
+ snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
+ return (reasonbuf);
+ }
+}
+
+#ifdef SETJMP_TRACE
+static const char *
+vmx_setjmp_rc2str(int rc)
+{
+ switch (rc) {
+ case VMX_RETURN_DIRECT:
+ return "direct";
+ case VMX_RETURN_LONGJMP:
+ return "longjmp";
+ case VMX_RETURN_VMRESUME:
+ return "vmresume";
+ case VMX_RETURN_VMLAUNCH:
+ return "vmlaunch";
+ default:
+ return "unknown";
+ }
+}
+
+#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \
+ VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \
+ (vmxctx)->regname)
+
+static void
+vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
+{
+ uint64_t host_rip, host_rsp;
+
+ if (vmxctx != &vmx->ctx[vcpu])
+ panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p",
+ vmxctx, &vmx->ctx[vcpu]);
+
+ VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx);
+ VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)",
+ vmx_setjmp_rc2str(rc), rc);
+
+ host_rsp = host_rip = ~0;
+ vmread(VMCS_HOST_RIP, &host_rip);
+ vmread(VMCS_HOST_RSP, &host_rsp);
+ VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx",
+ host_rip, host_rsp);
+
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip);
+
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2);
+}
+#endif
+#else
+static __inline void
+vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
+{
+ return;
+}
+#endif /* KTR */
+
+u_long
+vmx_fix_cr0(u_long cr0)
+{
+
+ return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
+}
+
+u_long
+vmx_fix_cr4(u_long cr4)
+{
+
+ return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
+}
+
+static void
+msr_save_area_init(struct msr_entry *g_area, int *g_count)
+{
+ int cnt;
+
+ static struct msr_entry guest_msrs[] = {
+ { MSR_KGSBASE, 0, 0 },
+ };
+
+ cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
+ if (cnt > GUEST_MSR_MAX_ENTRIES)
+ panic("guest msr save area overrun");
+ bcopy(guest_msrs, g_area, sizeof(guest_msrs));
+ *g_count = cnt;
+}
+
+static void
+vmx_disable(void *arg __unused)
+{
+ struct invvpid_desc invvpid_desc = { 0 };
+ struct invept_desc invept_desc = { 0 };
+
+ if (vmxon_enabled[curcpu]) {
+ /*
+ * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
+ *
+		 * VMXON or VMXOFF are not required to invalidate any TLB
+		 * caching structures, so flush them explicitly here. This
+		 * prevents potential retention of cached information in the
+		 * TLB between distinct VMX episodes.
+ */
+ invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
+ invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
+ vmxoff();
+ }
+ load_cr4(rcr4() & ~CR4_VMXE);
+}
+
+static int
+vmx_cleanup(void)
+{
+
+ smp_rendezvous(NULL, vmx_disable, NULL, NULL);
+
+ return (0);
+}
+
+static void
+vmx_enable(void *arg __unused)
+{
+ int error;
+
+ load_cr4(rcr4() | CR4_VMXE);
+
+ *(uint32_t *)vmxon_region[curcpu] = vmx_revision();
+ error = vmxon(vmxon_region[curcpu]);
+ if (error == 0)
+ vmxon_enabled[curcpu] = 1;
+}
+
+static int
+vmx_init(void)
+{
+ int error;
+ unsigned int regs[4];
+ uint64_t fixed0, fixed1;
+ uint32_t tmp;
+
+ /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
+ do_cpuid(1, regs);
+ if ((regs[2] & CPUID_0000_0001_FEAT0_VMX) == 0) {
+ printf("vmx_init: processor does not support VMX operation\n");
+ return (ENXIO);
+ }
+
+ /* Check support for primary processor-based VM-execution controls */
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS,
+ PROCBASED_CTLS_ONE_SETTING,
+ PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired primary "
+ "processor-based controls\n");
+ return (error);
+ }
+
+ /* Clear the processor-based ctl bits that are set on demand */
+ procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
+
+ /* Check support for secondary processor-based VM-execution controls */
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
+ MSR_VMX_PROCBASED_CTLS2,
+ PROCBASED_CTLS2_ONE_SETTING,
+ PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
+ if (error) {
+ printf("vmx_init: processor does not support desired secondary "
+ "processor-based controls\n");
+ return (error);
+ }
+
+ /* Check support for VPID */
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
+ PROCBASED2_ENABLE_VPID, 0, &tmp);
+ if (error == 0)
+ procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
+
+ /* Check support for pin-based VM-execution controls */
+ error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
+ MSR_VMX_TRUE_PINBASED_CTLS,
+ PINBASED_CTLS_ONE_SETTING,
+ PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired "
+ "pin-based controls\n");
+ return (error);
+ }
+
+ /* Check support for VM-exit controls */
+ error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
+ VM_EXIT_CTLS_ONE_SETTING,
+ VM_EXIT_CTLS_ZERO_SETTING,
+ &exit_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired "
+ "exit controls\n");
+ return (error);
+ }
+
+ /* Check support for VM-entry controls */
+ error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS,
+ VM_ENTRY_CTLS_ONE_SETTING,
+ VM_ENTRY_CTLS_ZERO_SETTING,
+ &entry_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired "
+ "entry controls\n");
+ return (error);
+ }
+
+ /*
+ * Check support for optional features by testing them
+ * as individual bits
+ */
+ cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS,
+ PROCBASED_HLT_EXITING, 0,
+ &tmp) == 0);
+
+ cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_PROCBASED_CTLS,
+ PROCBASED_MTF, 0,
+ &tmp) == 0);
+
+ cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS,
+ PROCBASED_PAUSE_EXITING, 0,
+ &tmp) == 0);
+
+ cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
+ MSR_VMX_PROCBASED_CTLS2,
+ PROCBASED2_UNRESTRICTED_GUEST, 0,
+ &tmp) == 0);
+
+ /* Initialize EPT */
+ error = ept_init();
+ if (error) {
+ printf("vmx_init: ept initialization failed (%d)\n", error);
+ return (error);
+ }
+
+ /*
+ * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
+ */
+ fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
+ fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
+ cr0_ones_mask = fixed0 & fixed1;
+ cr0_zeros_mask = ~fixed0 & ~fixed1;
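+
+	/*
+	 * Explanatory note: a CR0 bit that is 1 in both FIXED0 and FIXED1
+	 * must be 1 in VMX operation, and a bit that is 0 in both must be
+	 * 0; only the bits on which the two MSRs disagree are freely
+	 * settable by the guest.
+	 */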
+
+ /*
+ * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
+ * if unrestricted guest execution is allowed.
+ */
+ if (cap_unrestricted_guest)
+ cr0_ones_mask &= ~(CR0_PG | CR0_PE);
+
+ /*
+ * Do not allow the guest to set CR0_NW or CR0_CD.
+ */
+ cr0_zeros_mask |= (CR0_NW | CR0_CD);
+
+ fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
+ fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
+ cr4_ones_mask = fixed0 & fixed1;
+ cr4_zeros_mask = ~fixed0 & ~fixed1;
+
+ /* enable VMX operation */
+ smp_rendezvous(NULL, vmx_enable, NULL, NULL);
+
+ return (0);
+}
+
+/*
+ * If this processor does not support VPIDs then simply return 0.
+ *
+ * Otherwise generate the next value of VPID to use. Any value is alright
+ * as long as it is non-zero.
+ *
+ * We always execute in VMX non-root context with EPT enabled. Thus all
+ * combined mappings are tagged with the (EP4TA, VPID, PCID) tuple. This
+ * in turn means that multiple VMs can share the same VPID as long as
+ * they have distinct EPT page tables.
+ *
+ * XXX
+ * We should optimize this so that it returns VPIDs that are not in
+ * use. Then we will not unnecessarily invalidate mappings in
+ * vmx_set_pcpu_defaults() just because two or more vcpus happen to
+ * use the same 'vpid'.
+ */
+static uint16_t
+vmx_vpid(void)
+{
+ uint16_t vpid = 0;
+
+ if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) != 0) {
+ do {
+ vpid = atomic_fetchadd_int(&nextvpid, 1);
+ } while (vpid == 0);
+ }
+
+ return (vpid);
+}
+
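+/*
+ * Sketch of the CR0 shadowing scheme configured below (explanatory,
+ * per the VMX spec): guest reads of the CR0 bits selected by
+ * VMCS_CR0_MASK return the corresponding VMCS_CR0_SHADOW bits, and a
+ * guest write that sets a masked bit to a value different from its
+ * shadow causes a VM exit.
+ */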
+static int
+vmx_setup_cr0_shadow(struct vmcs *vmcs)
+{
+ int error;
+ uint64_t mask, shadow;
+
+ mask = cr0_ones_mask | cr0_zeros_mask;
+ error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_CR0_MASK), mask);
+ if (error)
+ return (error);
+
+ shadow = cr0_ones_mask;
+ error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_CR0_SHADOW), shadow);
+ if (error)
+ return (error);
+
+ return (0);
+}
+
+static void *
+vmx_vminit(struct vm *vm)
+{
+ uint16_t vpid;
+ int i, error, guest_msr_count;
+ struct vmx *vmx;
+
+ vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
+ if ((uintptr_t)vmx & PAGE_MASK) {
+ panic("malloc of struct vmx not aligned on %d byte boundary",
+ PAGE_SIZE);
+ }
+ vmx->vm = vm;
+
+ /*
+ * Clean up EPTP-tagged guest physical and combined mappings
+ *
+ * VMX transitions are not required to invalidate any guest physical
+ * mappings. So, it may be possible for stale guest physical mappings
+ * to be present in the processor TLBs.
+ *
+ * Combined mappings for this EP4TA are also invalidated for all VPIDs.
+ */
+ ept_invalidate_mappings(vtophys(vmx->pml4ept));
+
+ msr_bitmap_initialize(vmx->msr_bitmap);
+
+ /*
+ * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
+ * The guest FSBASE and GSBASE are saved and restored during
+ * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
+ * always restored from the vmcs host state area on vm-exit.
+ *
+ * Guest KGSBASE is saved and restored in the guest MSR save area.
+ * Host KGSBASE is restored before returning to userland from the pcb.
+ * There will be a window of time when we are executing in the host
+ * kernel context with a value of KGSBASE from the guest. This is ok
+ * because the value of KGSBASE is inconsequential in kernel context.
+ *
+ * MSR_EFER is saved and restored in the guest VMCS area on a
+ * VM exit and entry respectively. It is also restored from the
+ * host VMCS area on a VM exit.
+ *
+ * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
+ * and entry respectively. It is also restored from the host VMCS
+ * area on a VM exit.
+ */
+ if (guest_msr_rw(vmx, MSR_GSBASE) ||
+ guest_msr_rw(vmx, MSR_FSBASE) ||
+ guest_msr_rw(vmx, MSR_KGSBASE) ||
+ guest_msr_rw(vmx, MSR_EFER) ||
+ guest_msr_rw(vmx, MSR_PAT))
+ panic("vmx_vminit: error setting guest msr access");
+
+ for (i = 0; i < VM_MAXCPU; i++) {
+ vmx->vmcs[i].identifier = vmx_revision();
+ error = vmclear(&vmx->vmcs[i]);
+ if (error != 0) {
+ panic("vmx_vminit: vmclear error %d on vcpu %d\n",
+ error, i);
+ }
+
+ vpid = vmx_vpid();
+
+ error = vmcs_set_defaults(&vmx->vmcs[i],
+ (u_long)vmx_longjmp,
+ (u_long)&vmx->ctx[i],
+ vtophys(vmx->pml4ept),
+ pinbased_ctls,
+ procbased_ctls,
+ procbased_ctls2,
+ exit_ctls, entry_ctls,
+ vtophys(vmx->msr_bitmap),
+ vpid);
+
+ if (error != 0)
+ panic("vmx_vminit: vmcs_set_defaults error %d", error);
+
+ vmx->cap[i].set = 0;
+ vmx->cap[i].proc_ctls = procbased_ctls;
+
+ vmx->state[i].request_nmi = 0;
+ vmx->state[i].lastcpu = -1;
+ vmx->state[i].vpid = vpid;
+
+ msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
+
+ error = vmcs_set_msr_save(&vmx->vmcs[i],
+ vtophys(vmx->guest_msrs[i]),
+ guest_msr_count);
+ if (error != 0)
+ panic("vmcs_set_msr_save error %d", error);
+
+ error = vmx_setup_cr0_shadow(&vmx->vmcs[i]);
+ if (error != 0)
+ panic("vmx_setup_cr0_shadow error %d", error);
+ }
+
+ return (vmx);
+}
+
+static int
+vmx_handle_cpuid(struct vmxctx *vmxctx)
+{
+ int handled, func;
+
+ func = vmxctx->guest_rax;
+
+ handled = x86_emulate_cpuid((uint32_t*)(&vmxctx->guest_rax),
+ (uint32_t*)(&vmxctx->guest_rbx), (uint32_t*)(&vmxctx->guest_rcx),
+ (uint32_t*)(&vmxctx->guest_rdx));
+#if 0
+ printf("%s: func %x rax %lx rbx %lx rcx %lx rdx %lx handled %d\n",
+ __func__, func, vmxctx->guest_rax, vmxctx->guest_rbx,
+ vmxctx->guest_rcx, vmxctx->guest_rdx, handled);
+#endif
+
+ return (handled);
+}
+
+static __inline void
+vmx_run_trace(struct vmx *vmx, int vcpu)
+{
+#ifdef KTR
+ VMM_CTR1(vmx->vm, vcpu, "Resume execution at 0x%0lx", vmcs_guest_rip());
+#endif
+}
+
+static __inline void
+vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
+ int handled, int astpending)
+{
+#ifdef KTR
+ VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
+ handled ? "handled" : "unhandled",
+ exit_reason_to_str(exit_reason), rip);
+
+ if (astpending)
+ VMM_CTR0(vmx->vm, vcpu, "astpending");
+#endif
+}
+
+static int
+vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu)
+{
+ int error, lastcpu;
+ struct vmxstate *vmxstate;
+ struct invvpid_desc invvpid_desc = { 0 };
+
+ vmxstate = &vmx->state[vcpu];
+ lastcpu = vmxstate->lastcpu;
+ vmxstate->lastcpu = curcpu;
+
+ if (lastcpu == curcpu) {
+ error = 0;
+ goto done;
+ }
+
+ vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
+
+ error = vmwrite(VMCS_HOST_TR_BASE, (u_long)PCPU_GET(tssp));
+ if (error != 0)
+ goto done;
+
+ error = vmwrite(VMCS_HOST_GDTR_BASE, (u_long)&gdt[NGDT * curcpu]);
+ if (error != 0)
+ goto done;
+
+ error = vmwrite(VMCS_HOST_GS_BASE, (u_long)&__pcpu[curcpu]);
+ if (error != 0)
+ goto done;
+
+ /*
+ * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
+ *
+ * We do this because this vcpu was executing on a different host
+ * cpu when it last ran. We do not track whether it invalidated
+ * mappings associated with its 'vpid' during that run. So we must
+ * assume that the mappings associated with 'vpid' on 'curcpu' are
+ * stale and invalidate them.
+ *
+ * Note that we incur this penalty only when the scheduler chooses to
+ * move the thread associated with this vcpu between host cpus.
+ *
+ * Note also that this will invalidate mappings tagged with 'vpid'
+ * for "all" EP4TAs.
+ */
+ if (vmxstate->vpid != 0) {
+ invvpid_desc.vpid = vmxstate->vpid;
+ invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
+ }
+done:
+ return (error);
+}
+
+static void
+vm_exit_update_rip(struct vm_exit *vmexit)
+{
+ int error;
+
+ error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length);
+ if (error)
+ panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);
+}
+
+/*
+ * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
+ */
+CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
+
+static void __inline
+vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
+{
+ int error;
+
+ vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
+
+ error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ if (error)
+ panic("vmx_set_int_window_exiting: vmwrite error %d", error);
+}
+
+static void __inline
+vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
+{
+ int error;
+
+ vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
+
+ error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ if (error)
+ panic("vmx_clear_int_window_exiting: vmwrite error %d", error);
+}
+
+static void __inline
+vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
+{
+ int error;
+
+ vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
+
+ error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ if (error)
+ panic("vmx_set_nmi_window_exiting: vmwrite error %d", error);
+}
+
+static void __inline
+vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
+{
+ int error;
+
+ vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
+
+ error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ if (error)
+ panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error);
+}
+
+static int
+vmx_inject_nmi(struct vmx *vmx, int vcpu)
+{
+ int error;
+ uint64_t info, interruptibility;
+
+ /* Bail out if no NMI requested */
+ if (vmx->state[vcpu].request_nmi == 0)
+ return (0);
+
+ error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
+ if (error) {
+ panic("vmx_inject_nmi: vmread(interruptibility) %d",
+ error);
+ }
+ if (interruptibility & nmi_blocking_bits)
+ goto nmiblocked;
+
+ /*
+ * Inject the virtual NMI. The vector must be the NMI IDT entry
+ * or the VMCS entry check will fail.
+ */
+ info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID;
+ info |= IDT_NMI;
+
+ error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
+ if (error)
+ panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error);
+
+ VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI");
+
+ /* Clear the request */
+ vmx->state[vcpu].request_nmi = 0;
+ return (1);
+
+nmiblocked:
+ /*
+ * Set the NMI Window Exiting execution control so we can inject
+ * the virtual NMI as soon as the blocking condition goes away.
+ */
+ vmx_set_nmi_window_exiting(vmx, vcpu);
+
+ VMM_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
+ return (1);
+}
+
+static void
+vmx_inject_interrupts(struct vmx *vmx, int vcpu)
+{
+ int error, vector;
+ uint64_t info, rflags, interruptibility;
+
+ const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING |
+ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING;
+
+#if 1
+ /*
+ * XXX
+ * If an event is being injected from userland then just return.
+ * For example, we may inject a breakpoint exception to cause the
+ * guest to enter the debugger so we can inspect its state.
+ */
+ error = vmread(VMCS_ENTRY_INTR_INFO, &info);
+ if (error)
+ panic("vmx_inject_interrupts: vmread(intrinfo) %d", error);
+ if (info & VMCS_INTERRUPTION_INFO_VALID)
+ return;
+#endif
+ /*
+ * NMI injection has priority so deal with those first
+ */
+ if (vmx_inject_nmi(vmx, vcpu))
+ return;
+
+ /* Ask the local apic for a vector to inject */
+ vector = lapic_pending_intr(vmx->vm, vcpu);
+ if (vector < 0)
+ return;
+
+ if (vector < 32 || vector > 255)
+ panic("vmx_inject_interrupts: invalid vector %d\n", vector);
+
+ /* Check RFLAGS.IF and the interruptibility state of the guest */
+ error = vmread(VMCS_GUEST_RFLAGS, &rflags);
+ if (error)
+ panic("vmx_inject_interrupts: vmread(rflags) %d", error);
+
+ if ((rflags & PSL_I) == 0)
+ goto cantinject;
+
+ error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
+ if (error) {
+ panic("vmx_inject_interrupts: vmread(interruptibility) %d",
+ error);
+ }
+ if (interruptibility & HWINTR_BLOCKED)
+ goto cantinject;
+
+ /* Inject the interrupt */
+ info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID;
+ info |= vector;
+ error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
+ if (error)
+ panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error);
+
+ /* Update the Local APIC ISR */
+ lapic_intr_accepted(vmx->vm, vcpu, vector);
+
+ VMM_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
+
+ return;
+
+cantinject:
+ /*
+ * Set the Interrupt Window Exiting execution control so we can inject
+ * the interrupt as soon as the blocking condition goes away.
+ */
+ vmx_set_int_window_exiting(vmx, vcpu);
+
+ VMM_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
+}
+
+static int
+vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
+{
+ int error;
+ uint64_t regval;
+ const struct vmxctx *vmxctx;
+
+ /* We only handle mov to %cr0 at this time */
+ if ((exitqual & 0xff) != 0x00)
+ return (UNHANDLED);
+
+ vmxctx = &vmx->ctx[vcpu];
+
+ /*
+ * We must use vmwrite() directly here because vmcs_setreg() will
+ * call vmclear(vmcs) as a side-effect which we certainly don't want.
+ */
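+ /* Bits 11:8 of the exit qualification identify the source register */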
+ switch ((exitqual >> 8) & 0xf) {
+ case 0:
+ regval = vmxctx->guest_rax;
+ break;
+ case 1:
+ regval = vmxctx->guest_rcx;
+ break;
+ case 2:
+ regval = vmxctx->guest_rdx;
+ break;
+ case 3:
+ regval = vmxctx->guest_rbx;
+ break;
+ case 4:
+ error = vmread(VMCS_GUEST_RSP, &regval);
+ if (error) {
+ panic("vmx_emulate_cr_access: "
+ "error %d reading guest rsp", error);
+ }
+ break;
+ case 5:
+ regval = vmxctx->guest_rbp;
+ break;
+ case 6:
+ regval = vmxctx->guest_rsi;
+ break;
+ case 7:
+ regval = vmxctx->guest_rdi;
+ break;
+ case 8:
+ regval = vmxctx->guest_r8;
+ break;
+ case 9:
+ regval = vmxctx->guest_r9;
+ break;
+ case 10:
+ regval = vmxctx->guest_r10;
+ break;
+ case 11:
+ regval = vmxctx->guest_r11;
+ break;
+ case 12:
+ regval = vmxctx->guest_r12;
+ break;
+ case 13:
+ regval = vmxctx->guest_r13;
+ break;
+ case 14:
+ regval = vmxctx->guest_r14;
+ break;
+ case 15:
+ regval = vmxctx->guest_r15;
+ break;
+ }
+
+ regval |= cr0_ones_mask;
+ regval &= ~cr0_zeros_mask;
+ error = vmwrite(VMCS_GUEST_CR0, regval);
+ if (error)
+ panic("vmx_emulate_cr_access: error %d writing cr0", error);
+
+ return (HANDLED);
+}
+
+static int
+vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
+{
+ int handled;
+ struct vmcs *vmcs;
+ struct vmxctx *vmxctx;
+ uint32_t eax, ecx, edx;
+ uint64_t qual;
+
+ handled = 0;
+ vmcs = &vmx->vmcs[vcpu];
+ vmxctx = &vmx->ctx[vcpu];
+ qual = vmexit->u.vmx.exit_qualification;
+ vmexit->exitcode = VM_EXITCODE_BOGUS;
+
+ switch (vmexit->u.vmx.exit_reason) {
+ case EXIT_REASON_CR_ACCESS:
+ handled = vmx_emulate_cr_access(vmx, vcpu, qual);
+ break;
+ case EXIT_REASON_RDMSR:
+ ecx = vmxctx->guest_rcx;
+ handled = emulate_rdmsr(vmx->vm, vcpu, ecx);
+ if (!handled) {
+ vmexit->exitcode = VM_EXITCODE_RDMSR;
+ vmexit->u.msr.code = ecx;
+ }
+ break;
+ case EXIT_REASON_WRMSR:
+ eax = vmxctx->guest_rax;
+ ecx = vmxctx->guest_rcx;
+ edx = vmxctx->guest_rdx;
+ handled = emulate_wrmsr(vmx->vm, vcpu, ecx,
+ (uint64_t)edx << 32 | eax);
+ if (!handled) {
+ vmexit->exitcode = VM_EXITCODE_WRMSR;
+ vmexit->u.msr.code = ecx;
+ vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
+ }
+ break;
+ case EXIT_REASON_HLT:
+ vmexit->exitcode = VM_EXITCODE_HLT;
+ break;
+ case EXIT_REASON_MTF:
+ vmexit->exitcode = VM_EXITCODE_MTRAP;
+ break;
+ case EXIT_REASON_PAUSE:
+ vmexit->exitcode = VM_EXITCODE_PAUSE;
+ break;
+ case EXIT_REASON_INTR_WINDOW:
+ vmx_clear_int_window_exiting(vmx, vcpu);
+ VMM_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
+ /* FALLTHRU */
+ case EXIT_REASON_EXT_INTR:
+ /*
+ * External interrupts serve only to cause VM exits and allow
+ * the host interrupt handler to run.
+ *
+ * If this external interrupt triggers a virtual interrupt
+ * to a VM, then that state will be recorded by the
+ * host interrupt handler in the VM's softc. We will inject
+ * this virtual interrupt during the subsequent VM enter.
+ */
+
+ /*
+ * This is special. We want to treat this as a 'handled'
+ * VM-exit but not increment the instruction pointer.
+ */
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
+ return (1);
+ case EXIT_REASON_NMI_WINDOW:
+ /* Exit to allow the pending virtual NMI to be injected */
+ vmx_clear_nmi_window_exiting(vmx, vcpu);
+ VMM_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
+ return (1);
+ case EXIT_REASON_INOUT:
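+ /*
+ * Exit qualification for I/O instructions: bits 2:0 encode the
+ * access size minus one, bit 3 the direction (1 = IN), bit 4
+ * string, bit 5 REP, and bits 31:16 the port number.
+ */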
+ vmexit->exitcode = VM_EXITCODE_INOUT;
+ vmexit->u.inout.bytes = (qual & 0x7) + 1;
+ vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
+ vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
+ vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
+ vmexit->u.inout.port = (uint16_t)(qual >> 16);
+ vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
+ break;
+ case EXIT_REASON_CPUID:
+ handled = vmx_handle_cpuid(vmxctx);
+ break;
+ default:
+ break;
+ }
+
+ if (handled) {
+ /*
+ * It is possible that control is returned to userland
+ * even though we were able to handle the VM exit in the
+ * kernel (e.g. 'astpending' is set in the run loop).
+ *
+ * In such a case we want to make sure that the userland
+ * restarts guest execution at the instruction *after*
+ * the one we just processed. Therefore we update the
+ * guest rip in the VMCS and in 'vmexit'.
+ */
+ vm_exit_update_rip(vmexit);
+ vmexit->rip += vmexit->inst_length;
+ vmexit->inst_length = 0;
+ } else {
+ if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
+ /*
+ * If this VM exit was not claimed by anybody then
+ * treat it as a generic VMX exit.
+ */
+ vmexit->exitcode = VM_EXITCODE_VMX;
+ vmexit->u.vmx.error = 0;
+ } else {
+ /*
+ * The exitcode and collateral have been populated.
+ * The VM exit will be processed further in userland.
+ */
+ }
+ }
+ return (handled);
+}
+
+static int
+vmx_run(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit)
+{
+ int error, vie, rc, handled, astpending, loopstart;
+ uint32_t exit_reason;
+ struct vmx *vmx;
+ struct vmxctx *vmxctx;
+ struct vmcs *vmcs;
+
+ vmx = arg;
+ vmcs = &vmx->vmcs[vcpu];
+ vmxctx = &vmx->ctx[vcpu];
+ loopstart = 1;
+
+ /*
+ * XXX Can we avoid doing this every time we do a vm run?
+ */
+ VMPTRLD(vmcs);
+
+ /*
+ * XXX
+ * We do this every time because we may setup the virtual machine
+ * from a different process than the one that actually runs it.
+ *
+ * If the life of a virtual machine were spent entirely in the context
+ * of a single process we could do this once in vmcs_set_defaults().
+ */
+ if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0)
+ panic("vmx_run: error %d writing to VMCS_HOST_CR3", error);
+
+ if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0)
+ panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);
+
+ if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0)
+ panic("vmx_run: error %d setting up pcpu defaults", error);
+
+ do {
+ lapic_timer_tick(vmx->vm, vcpu);
+ vmx_inject_interrupts(vmx, vcpu);
+ vmx_run_trace(vmx, vcpu);
+ rc = vmx_setjmp(vmxctx);
+#ifdef SETJMP_TRACE
+ vmx_setjmp_trace(vmx, vcpu, vmxctx, rc);
+#endif
+ switch (rc) {
+ case VMX_RETURN_DIRECT:
+ if (loopstart) {
+ loopstart = 0;
+ vmx_launch(vmxctx);
+ } else
+ vmx_resume(vmxctx);
+ panic("vmx_launch/resume should not return");
+ break;
+ case VMX_RETURN_LONGJMP:
+ break; /* vm exit */
+ case VMX_RETURN_VMRESUME:
+ vie = vmcs_instruction_error();
+ if (vmxctx->launch_error == VM_FAIL_INVALID ||
+ vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) {
+ printf("vmresume error %d vmcs inst error %d\n",
+ vmxctx->launch_error, vie);
+ goto err_exit;
+ }
+ vmx_launch(vmxctx); /* try to launch the guest */
+ panic("vmx_launch should not return");
+ break;
+ case VMX_RETURN_VMLAUNCH:
+ vie = vmcs_instruction_error();
+#if 1
+ printf("vmlaunch error %d vmcs inst error %d\n",
+ vmxctx->launch_error, vie);
+#endif
+ goto err_exit;
+ default:
+ panic("vmx_setjmp returned %d", rc);
+ }
+
+ /*
+ * XXX locking?
+ * See comments in exception.S about checking for ASTs
+ * atomically while interrupts are disabled. But it is
+ * not clear that they apply in our case.
+ */
+ astpending = curthread->td_flags & TDF_ASTPENDING;
+
+ /* enable interrupts */
+ enable_intr();
+
+ /* collect some basic information for VM exit processing */
+ vmexit->rip = rip = vmcs_guest_rip();
+ vmexit->inst_length = vmexit_instruction_length();
+ vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
+ vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
+
+ handled = vmx_exit_process(vmx, vcpu, vmexit);
+
+ vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled,
+ astpending);
+ } while (handled && !astpending);
+
+ /*
+ * If a VM exit has been handled then the exitcode must be BOGUS;
+ * if a VM exit is not handled then the exitcode must not be BOGUS.
+ */
+ if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
+ (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
+ panic("Mismatch between handled (%d) and exitcode (%d)",
+ handled, vmexit->exitcode);
+ }
+
+ VMM_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d", vmexit->exitcode);
+
+ /*
+ * XXX
+ * We need to do this to ensure that any VMCS state cached by the
+ * processor is flushed to memory. We need to do this in case the
+ * VM moves to a different cpu the next time it runs.
+ *
+ * Can we avoid doing this?
+ */
+ VMCLEAR(vmcs);
+ return (0);
+
+err_exit:
+ vmexit->exitcode = VM_EXITCODE_VMX;
+ vmexit->u.vmx.exit_reason = (uint32_t)-1;
+ vmexit->u.vmx.exit_qualification = (uint32_t)-1;
+ vmexit->u.vmx.error = vie;
+ VMCLEAR(vmcs);
+ return (ENOEXEC);
+}
+
+static void
+vmx_vmcleanup(void *arg)
+{
+ int error;
+ struct vmx *vmx = arg;
+
+ /*
+ * XXXSMP we also need to clear the VMCS active on the other vcpus.
+ */
+ error = vmclear(&vmx->vmcs[0]);
+ if (error != 0)
+ panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);
+
+ ept_vmcleanup(vmx);
+ free(vmx, M_VMX);
+}
+
+static register_t *
+vmxctx_regptr(struct vmxctx *vmxctx, int reg)
+{
+
+ switch (reg) {
+ case VM_REG_GUEST_RAX:
+ return (&vmxctx->guest_rax);
+ case VM_REG_GUEST_RBX:
+ return (&vmxctx->guest_rbx);
+ case VM_REG_GUEST_RCX:
+ return (&vmxctx->guest_rcx);
+ case VM_REG_GUEST_RDX:
+ return (&vmxctx->guest_rdx);
+ case VM_REG_GUEST_RSI:
+ return (&vmxctx->guest_rsi);
+ case VM_REG_GUEST_RDI:
+ return (&vmxctx->guest_rdi);
+ case VM_REG_GUEST_RBP:
+ return (&vmxctx->guest_rbp);
+ case VM_REG_GUEST_R8:
+ return (&vmxctx->guest_r8);
+ case VM_REG_GUEST_R9:
+ return (&vmxctx->guest_r9);
+ case VM_REG_GUEST_R10:
+ return (&vmxctx->guest_r10);
+ case VM_REG_GUEST_R11:
+ return (&vmxctx->guest_r11);
+ case VM_REG_GUEST_R12:
+ return (&vmxctx->guest_r12);
+ case VM_REG_GUEST_R13:
+ return (&vmxctx->guest_r13);
+ case VM_REG_GUEST_R14:
+ return (&vmxctx->guest_r14);
+ case VM_REG_GUEST_R15:
+ return (&vmxctx->guest_r15);
+ default:
+ break;
+ }
+ return (NULL);
+}
+
+static int
+vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
+{
+ register_t *regp;
+
+ if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
+ *retval = *regp;
+ return (0);
+ } else
+ return (EINVAL);
+}
+
+static int
+vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
+{
+ register_t *regp;
+
+ if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
+ *regp = val;
+ return (0);
+ } else
+ return (EINVAL);
+}
+
+static int
+vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
+{
+ struct vmx *vmx = arg;
+
+ if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
+ return (0);
+
+ /*
+ * If the vcpu is running then don't mess with the VMCS.
+ *
+ * vmcs_getreg will VMCLEAR the vmcs when it is done which will cause
+ * the subsequent vmlaunch/vmresume to fail.
+ */
+ if (vcpu_is_running(vmx->vm, vcpu, NULL))
+ panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
+
+ return (vmcs_getreg(&vmx->vmcs[vcpu], reg, retval));
+}
+
+static int
+vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
+{
+ int error;
+ uint64_t ctls;
+ struct vmx *vmx = arg;
+
+ /*
+ * XXX Allow caller to set contents of the guest registers saved in
+ * the 'vmxctx' even though the vcpu might be running. We need this
+ * specifically to support the rdmsr emulation that will set the
+ * %eax and %edx registers during vm exit processing.
+ */
+ if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
+ return (0);
+
+ /*
+ * If the vcpu is running then don't mess with the VMCS.
+ *
+ * vmcs_setreg will VMCLEAR the vmcs when it is done which will cause
+ * the subsequent vmlaunch/vmresume to fail.
+ */
+ if (vcpu_is_running(vmx->vm, vcpu, NULL))
+ panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
+
+ error = vmcs_setreg(&vmx->vmcs[vcpu], reg, val);
+
+ if (error == 0) {
+ /*
+ * If the "load EFER" VM-entry control is 1 then the
+ * value of EFER.LMA must be identical to "IA-32e mode guest"
+ * bit in the VM-entry control.
+ */
+ if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
+ (reg == VM_REG_GUEST_EFER)) {
+ vmcs_getreg(&vmx->vmcs[vcpu],
+ VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
+ if (val & EFER_LMA)
+ ctls |= VM_ENTRY_GUEST_LMA;
+ else
+ ctls &= ~VM_ENTRY_GUEST_LMA;
+ vmcs_setreg(&vmx->vmcs[vcpu],
+ VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
+ }
+ }
+
+ return (error);
+}
+
+static int
+vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
+{
+ struct vmx *vmx = arg;
+
+ return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
+}
+
+static int
+vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
+{
+ struct vmx *vmx = arg;
+
+ return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
+}
+
+static int
+vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
+ int code_valid)
+{
+ int error;
+ uint32_t info;
+ struct vmx *vmx = arg;
+ struct vmcs *vmcs = &vmx->vmcs[vcpu];
+
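+ /* Map VM_EVENT types to the VMCS event injection interruption types */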
+ static uint32_t type_map[VM_EVENT_MAX] = {
+ 0x1, /* VM_EVENT_NONE */
+ 0x0, /* VM_HW_INTR */
+ 0x2, /* VM_NMI */
+ 0x3, /* VM_HW_EXCEPTION */
+ 0x4, /* VM_SW_INTR */
+ 0x5, /* VM_PRIV_SW_EXCEPTION */
+ 0x6, /* VM_SW_EXCEPTION */
+ };
+
+ info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
+ info |= VMCS_INTERRUPTION_INFO_VALID;
+ error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
+ if (error != 0)
+ return (error);
+
+ if (code_valid) {
+ error = vmcs_setreg(vmcs,
+ VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR),
+ code);
+ }
+ return (error);
+}
+
+static int
+vmx_nmi(void *arg, int vcpu)
+{
+ struct vmx *vmx = arg;
+
+ atomic_set_int(&vmx->state[vcpu].request_nmi, 1);
+
+ return (0);
+}
+
+static int
+vmx_getcap(void *arg, int vcpu, int type, int *retval)
+{
+ struct vmx *vmx = arg;
+ int vcap;
+ int ret;
+
+ ret = ENOENT;
+
+ vcap = vmx->cap[vcpu].set;
+
+ switch (type) {
+ case VM_CAP_HALT_EXIT:
+ if (cap_halt_exit)
+ ret = 0;
+ break;
+ case VM_CAP_PAUSE_EXIT:
+ if (cap_pause_exit)
+ ret = 0;
+ break;
+ case VM_CAP_MTRAP_EXIT:
+ if (cap_monitor_trap)
+ ret = 0;
+ break;
+ case VM_CAP_UNRESTRICTED_GUEST:
+ if (cap_unrestricted_guest)
+ ret = 0;
+ break;
+ default:
+ break;
+ }
+
+ if (ret == 0)
+ *retval = (vcap & (1 << type)) ? 1 : 0;
+
+ return (ret);
+}
+
+static int
+vmx_setcap(void *arg, int vcpu, int type, int val)
+{
+ struct vmx *vmx = arg;
+ struct vmcs *vmcs = &vmx->vmcs[vcpu];
+ uint32_t baseval;
+ uint32_t *pptr;
+ int error;
+ int flag;
+ int reg;
+ int retval;
+
+ retval = ENOENT;
+ pptr = NULL;
+
+ switch (type) {
+ case VM_CAP_HALT_EXIT:
+ if (cap_halt_exit) {
+ retval = 0;
+ pptr = &vmx->cap[vcpu].proc_ctls;
+ baseval = *pptr;
+ flag = PROCBASED_HLT_EXITING;
+ reg = VMCS_PRI_PROC_BASED_CTLS;
+ }
+ break;
+ case VM_CAP_MTRAP_EXIT:
+ if (cap_monitor_trap) {
+ retval = 0;
+ pptr = &vmx->cap[vcpu].proc_ctls;
+ baseval = *pptr;
+ flag = PROCBASED_MTF;
+ reg = VMCS_PRI_PROC_BASED_CTLS;
+ }
+ break;
+ case VM_CAP_PAUSE_EXIT:
+ if (cap_pause_exit) {
+ retval = 0;
+ pptr = &vmx->cap[vcpu].proc_ctls;
+ baseval = *pptr;
+ flag = PROCBASED_PAUSE_EXITING;
+ reg = VMCS_PRI_PROC_BASED_CTLS;
+ }
+ break;
+ case VM_CAP_UNRESTRICTED_GUEST:
+ if (cap_unrestricted_guest) {
+ retval = 0;
+ baseval = procbased_ctls2;
+ flag = PROCBASED2_UNRESTRICTED_GUEST;
+ reg = VMCS_SEC_PROC_BASED_CTLS;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (retval == 0) {
+ if (val) {
+ baseval |= flag;
+ } else {
+ baseval &= ~flag;
+ }
+ VMPTRLD(vmcs);
+ error = vmwrite(reg, baseval);
+ VMCLEAR(vmcs);
+
+ if (error) {
+ retval = error;
+ } else {
+ /*
+ * Update optional stored flags, and record
+ * setting
+ */
+ if (pptr != NULL) {
+ *pptr = baseval;
+ }
+
+ if (val) {
+ vmx->cap[vcpu].set |= (1 << type);
+ } else {
+ vmx->cap[vcpu].set &= ~(1 << type);
+ }
+ }
+ }
+
+ return (retval);
+}
+
+struct vmm_ops vmm_ops_intel = {
+ vmx_init,
+ vmx_cleanup,
+ vmx_vminit,
+ vmx_run,
+ vmx_vmcleanup,
+ ept_vmmmap,
+ vmx_getreg,
+ vmx_setreg,
+ vmx_getdesc,
+ vmx_setdesc,
+ vmx_inject,
+ vmx_nmi,
+ vmx_getcap,
+ vmx_setcap
+};
diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h
new file mode 100644
index 000000000000..69697f8e48d4
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx.h
@@ -0,0 +1,115 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_H_
+#define _VMX_H_
+
+#include "vmcs.h"
+
+#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */
+
+struct vmxctx {
+ register_t guest_rdi; /* Guest state */
+ register_t guest_rsi;
+ register_t guest_rdx;
+ register_t guest_rcx;
+ register_t guest_r8;
+ register_t guest_r9;
+ register_t guest_rax;
+ register_t guest_rbx;
+ register_t guest_rbp;
+ register_t guest_r10;
+ register_t guest_r11;
+ register_t guest_r12;
+ register_t guest_r13;
+ register_t guest_r14;
+ register_t guest_r15;
+ register_t guest_cr2;
+
+ register_t host_r15; /* Host state */
+ register_t host_r14;
+ register_t host_r13;
+ register_t host_r12;
+ register_t host_rbp;
+ register_t host_rsp;
+ register_t host_rbx;
+ register_t host_rip;
+ /*
+ * XXX todo debug registers and fpu state
+ */
+
+ int launch_error;
+};
+
+struct vmxcap {
+ int set;
+ uint32_t proc_ctls;
+};
+
+struct vmxstate {
+ int request_nmi;
+ int lastcpu; /* host cpu that this 'vcpu' last ran on */
+ uint16_t vpid;
+};
+
+/* virtual machine softc */
+struct vmx {
+ pml4_entry_t pml4ept[NPML4EPG];
+ struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */
+ char msr_bitmap[PAGE_SIZE];
+ struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES];
+ struct vmxctx ctx[VM_MAXCPU];
+ struct vmxcap cap[VM_MAXCPU];
+ struct vmxstate state[VM_MAXCPU];
+ struct vm *vm;
+};
+CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0);
+CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0);
+CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0);
+CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
+
+#define VMX_RETURN_DIRECT 0
+#define VMX_RETURN_LONGJMP 1
+#define VMX_RETURN_VMRESUME 2
+#define VMX_RETURN_VMLAUNCH 3
+/*
+ * vmx_setjmp() returns:
+ * - 0 when it returns directly
+ * - 1 when it returns from vmx_longjmp
+ * - 2 when it returns from vmx_resume (which would only be in the error case)
+ * - 3 when it returns from vmx_launch (which would only be in the error case)
+ */
+int vmx_setjmp(struct vmxctx *ctx);
+void vmx_longjmp(void); /* returns via vmx_setjmp */
+void vmx_launch(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */
+void vmx_resume(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */
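+
+/*
+ * Typical use, as in vmx_run():
+ *
+ *	rc = vmx_setjmp(vmxctx);
+ *	if (rc == VMX_RETURN_DIRECT)
+ *		vmx_launch(vmxctx);	(or vmx_resume(vmxctx))
+ *	else
+ *		handle the vm exit or the failed vmlaunch/vmresume
+ */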
+
+u_long vmx_fix_cr0(u_long cr0);
+u_long vmx_fix_cr4(u_long cr4);
+
+#endif
diff --git a/sys/amd64/vmm/intel/vmx_controls.h b/sys/amd64/vmm/intel/vmx_controls.h
new file mode 100644
index 000000000000..31f29f8e6bba
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_controls.h
@@ -0,0 +1,92 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_CONTROLS_H_
+#define _VMX_CONTROLS_H_
+
+/* Pin-Based VM-Execution Controls */
+#define PINBASED_EXTINT_EXITING (1 << 0)
+#define PINBASED_NMI_EXITING (1 << 3)
+#define PINBASED_VIRTUAL_NMI (1 << 5)
+#define PINBASED_PREMPTION_TIMER (1 << 6)
+
+/* Primary Processor-Based VM-Execution Controls */
+#define PROCBASED_INT_WINDOW_EXITING (1 << 2)
+#define PROCBASED_TSC_OFFSET (1 << 3)
+#define PROCBASED_HLT_EXITING (1 << 7)
+#define PROCBASED_INVLPG_EXITING (1 << 9)
+#define PROCBASED_MWAIT_EXITING (1 << 10)
+#define PROCBASED_RDPMC_EXITING (1 << 11)
+#define PROCBASED_RDTSC_EXITING (1 << 12)
+#define PROCBASED_CR3_LOAD_EXITING (1 << 15)
+#define PROCBASED_CR3_STORE_EXITING (1 << 16)
+#define PROCBASED_CR8_LOAD_EXITING (1 << 19)
+#define PROCBASED_CR8_STORE_EXITING (1 << 20)
+#define PROCBASED_USE_TPR_SHADOW (1 << 21)
+#define PROCBASED_NMI_WINDOW_EXITING (1 << 22)
+#define PROCBASED_MOV_DR_EXITING (1 << 23)
+#define PROCBASED_IO_EXITING (1 << 24)
+#define PROCBASED_IO_BITMAPS (1 << 25)
+#define PROCBASED_MTF (1 << 27)
+#define PROCBASED_MSR_BITMAPS (1 << 28)
+#define PROCBASED_MONITOR_EXITING (1 << 29)
+#define PROCBASED_PAUSE_EXITING (1 << 30)
+#define PROCBASED_SECONDARY_CONTROLS (1 << 31)
+
+/* Secondary Processor-Based VM-Execution Controls */
+#define PROCBASED2_VIRTUALIZE_APIC (1 << 0)
+#define PROCBASED2_ENABLE_EPT (1 << 1)
+#define PROCBASED2_DESC_TABLE_EXITING (1 << 2)
+#define PROCBASED2_ENABLE_RDTSCP (1 << 3)
+#define PROCBASED2_VIRTUALIZE_X2APIC (1 << 4)
+#define PROCBASED2_ENABLE_VPID (1 << 5)
+#define PROCBASED2_WBINVD_EXITING (1 << 6)
+#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7)
+#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10)
+
+/* VM Exit Controls */
+#define VM_EXIT_SAVE_DEBUG_CONTROLS (1 << 2)
+#define VM_EXIT_HOST_LMA (1 << 9)
+#define VM_EXIT_LOAD_PERF_GLOBAL_CTRL (1 << 12)
+#define VM_EXIT_ACKNOWLEDGE_INTERRUPT (1 << 15)
+#define VM_EXIT_SAVE_PAT (1 << 18)
+#define VM_EXIT_LOAD_PAT (1 << 19)
+#define VM_EXIT_SAVE_EFER (1 << 20)
+#define VM_EXIT_LOAD_EFER (1 << 21)
+#define VM_EXIT_SAVE_PREEMPTION_TIMER (1 << 22)
+
+/* VM Entry Controls */
+#define VM_ENTRY_LOAD_DEBUG_CONTROLS (1 << 2)
+#define VM_ENTRY_GUEST_LMA (1 << 9)
+#define VM_ENTRY_INTO_SMM (1 << 10)
+#define VM_ENTRY_DEACTIVATE_DUAL_MONITOR (1 << 11)
+#define VM_ENTRY_LOAD_PERF_GLOBAL_CTRL (1 << 13)
+#define VM_ENTRY_LOAD_PAT (1 << 14)
+#define VM_ENTRY_LOAD_EFER (1 << 15)
+
+#endif
diff --git a/sys/amd64/vmm/intel/vmx_cpufunc.h b/sys/amd64/vmm/intel/vmx_cpufunc.h
new file mode 100644
index 000000000000..e9f6c6dcaab3
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_cpufunc.h
@@ -0,0 +1,199 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_CPUFUNC_H_
+#define _VMX_CPUFUNC_H_
+
+struct vmcs;
+
+/*
+ * Section 5.2 "Conventions" from Intel Architecture Manual 2B.
+ *
+ * error
+ * VMsucceed 0
+ * VMFailInvalid 1
+ * VMFailValid 2 see also VMCS VM-Instruction Error Field
+ */
+#define VM_SUCCESS 0
+#define VM_FAIL_INVALID 1
+#define VM_FAIL_VALID 2
+#define VMX_SET_ERROR_CODE(varname) \
+ do { \
+ __asm __volatile(" jnc 1f;" \
+ " mov $1, %0;" /* CF: error = 1 */ \
+ " jmp 3f;" \
+ "1: jnz 2f;" \
+ " mov $2, %0;" /* ZF: error = 2 */ \
+ " jmp 3f;" \
+ "2: mov $0, %0;" \
+ "3: nop" \
+ :"=r" (varname)); \
+ } while (0)
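+
+/*
+ * Note that VMX_SET_ERROR_CODE() must immediately follow the VMX
+ * instruction whose outcome it decodes, since it examines the CF and
+ * ZF flags left behind by that instruction.
+ */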
+
+/* returns 0 on success and non-zero on failure */
+static __inline int
+vmxon(char *region)
+{
+ int error;
+ uint64_t addr;
+
+ addr = vtophys(region);
+ __asm __volatile("vmxon %0" : : "m" (*(uint64_t *)&addr) : "memory");
+ VMX_SET_ERROR_CODE(error);
+ return (error);
+}
+
+/* returns 0 on success and non-zero on failure */
+static __inline int
+vmclear(struct vmcs *vmcs)
+{
+ int error;
+ uint64_t addr;
+
+ addr = vtophys(vmcs);
+ __asm __volatile("vmclear %0" : : "m" (*(uint64_t *)&addr) : "memory");
+ VMX_SET_ERROR_CODE(error);
+ return (error);
+}
+
+static __inline void
+vmxoff(void)
+{
+ __asm __volatile("vmxoff");
+}
+
+static __inline void
+vmptrst(uint64_t *addr)
+{
+ __asm __volatile("vmptrst %0" : : "m" (*addr) : "memory");
+}
+
+static __inline int
+vmptrld(struct vmcs *vmcs)
+{
+ int error;
+ uint64_t addr;
+
+ addr = vtophys(vmcs);
+ __asm __volatile("vmptrld %0" : : "m" (*(uint64_t *)&addr) : "memory");
+ VMX_SET_ERROR_CODE(error);
+ return (error);
+}
+
+static __inline int
+vmwrite(uint64_t reg, uint64_t val)
+{
+ int error;
+
+ __asm __volatile("vmwrite %0, %1" : : "r" (val), "r" (reg) : "memory");
+
+ VMX_SET_ERROR_CODE(error);
+
+ return (error);
+}
+
+static __inline int
+vmread(uint64_t r, uint64_t *addr)
+{
+ int error;
+
+ __asm __volatile("vmread %0, %1" : : "r" (r), "m" (*addr) : "memory");
+
+ VMX_SET_ERROR_CODE(error);
+
+ return (error);
+}
+
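+/*
+ * VMPTRLD() and VMCLEAR() below are used as a bracketing pair:
+ * VMPTRLD() enters a critical section that the matching VMCLEAR()
+ * exits, so the thread cannot be preempted (and possibly migrated)
+ * while this VMCS is the current VMCS.
+ */
+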
+static void __inline
+VMCLEAR(struct vmcs *vmcs)
+{
+ int err;
+
+ err = vmclear(vmcs);
+ if (err != 0)
+ panic("%s: vmclear(%p) error %d", __func__, vmcs, err);
+
+ critical_exit();
+}
+
+static void __inline
+VMPTRLD(struct vmcs *vmcs)
+{
+ int err;
+
+ critical_enter();
+
+ err = vmptrld(vmcs);
+ if (err != 0)
+ panic("%s: vmptrld(%p) error %d", __func__, vmcs, err);
+}
+
+#define INVVPID_TYPE_ADDRESS 0UL
+#define INVVPID_TYPE_SINGLE_CONTEXT 1UL
+#define INVVPID_TYPE_ALL_CONTEXTS 2UL
+
+struct invvpid_desc {
+ uint16_t vpid;
+ uint16_t _res1;
+ uint32_t _res2;
+ uint64_t linear_addr;
+};
+CTASSERT(sizeof(struct invvpid_desc) == 16);
+
+static void __inline
+invvpid(uint64_t type, struct invvpid_desc desc)
+{
+ int error;
+
+ __asm __volatile("invvpid %0, %1" :: "m" (desc), "r" (type) : "memory");
+
+ VMX_SET_ERROR_CODE(error);
+ if (error)
+ panic("invvpid error %d", error);
+}
+
+#define INVEPT_TYPE_SINGLE_CONTEXT 1UL
+#define INVEPT_TYPE_ALL_CONTEXTS 2UL
+struct invept_desc {
+ uint64_t eptp;
+ uint64_t _res;
+};
+CTASSERT(sizeof(struct invept_desc) == 16);
+
+static void __inline
+invept(uint64_t type, struct invept_desc desc)
+{
+ int error;
+
+ __asm __volatile("invept %0, %1" :: "m" (desc), "r" (type) : "memory");
+
+ VMX_SET_ERROR_CODE(error);
+ if (error)
+ panic("invept error %d", error);
+}
+#endif
diff --git a/sys/amd64/vmm/intel/vmx_genassym.c b/sys/amd64/vmm/intel/vmx_genassym.c
new file mode 100644
index 000000000000..c4b1efcd7451
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_genassym.c
@@ -0,0 +1,81 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/assym.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/pmap.h>
+
+#include <machine/vmm.h>
+#include "vmx.h"
+#include "vmx_cpufunc.h"
+
+ASSYM(VMXCTX_GUEST_RDI, offsetof(struct vmxctx, guest_rdi));
+ASSYM(VMXCTX_GUEST_RSI, offsetof(struct vmxctx, guest_rsi));
+ASSYM(VMXCTX_GUEST_RDX, offsetof(struct vmxctx, guest_rdx));
+ASSYM(VMXCTX_GUEST_RCX, offsetof(struct vmxctx, guest_rcx));
+ASSYM(VMXCTX_GUEST_R8, offsetof(struct vmxctx, guest_r8));
+ASSYM(VMXCTX_GUEST_R9, offsetof(struct vmxctx, guest_r9));
+ASSYM(VMXCTX_GUEST_RAX, offsetof(struct vmxctx, guest_rax));
+ASSYM(VMXCTX_GUEST_RBX, offsetof(struct vmxctx, guest_rbx));
+ASSYM(VMXCTX_GUEST_RBP, offsetof(struct vmxctx, guest_rbp));
+ASSYM(VMXCTX_GUEST_R10, offsetof(struct vmxctx, guest_r10));
+ASSYM(VMXCTX_GUEST_R11, offsetof(struct vmxctx, guest_r11));
+ASSYM(VMXCTX_GUEST_R12, offsetof(struct vmxctx, guest_r12));
+ASSYM(VMXCTX_GUEST_R13, offsetof(struct vmxctx, guest_r13));
+ASSYM(VMXCTX_GUEST_R14, offsetof(struct vmxctx, guest_r14));
+ASSYM(VMXCTX_GUEST_R15, offsetof(struct vmxctx, guest_r15));
+ASSYM(VMXCTX_GUEST_CR2, offsetof(struct vmxctx, guest_cr2));
+
+ASSYM(VMXCTX_HOST_R15, offsetof(struct vmxctx, host_r15));
+ASSYM(VMXCTX_HOST_R14, offsetof(struct vmxctx, host_r14));
+ASSYM(VMXCTX_HOST_R13, offsetof(struct vmxctx, host_r13));
+ASSYM(VMXCTX_HOST_R12, offsetof(struct vmxctx, host_r12));
+ASSYM(VMXCTX_HOST_RBP, offsetof(struct vmxctx, host_rbp));
+ASSYM(VMXCTX_HOST_RSP, offsetof(struct vmxctx, host_rsp));
+ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx));
+ASSYM(VMXCTX_HOST_RIP, offsetof(struct vmxctx, host_rip));
+
+ASSYM(VMXCTX_LAUNCH_ERROR, offsetof(struct vmxctx, launch_error));
+
+ASSYM(VM_SUCCESS, VM_SUCCESS);
+ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID);
+ASSYM(VM_FAIL_VALID, VM_FAIL_VALID);
+
+ASSYM(VMX_RETURN_DIRECT, VMX_RETURN_DIRECT);
+ASSYM(VMX_RETURN_LONGJMP, VMX_RETURN_LONGJMP);
+ASSYM(VMX_RETURN_VMRESUME, VMX_RETURN_VMRESUME);
+ASSYM(VMX_RETURN_VMLAUNCH, VMX_RETURN_VMLAUNCH);
diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c
new file mode 100644
index 000000000000..1e9a837a78d6
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_msr.c
@@ -0,0 +1,172 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <machine/cpufunc.h>
+
+#include "vmx_msr.h"
+
+static boolean_t
+vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos)
+{
+
+ if (msr_val & (1UL << (bitpos + 32)))
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+static boolean_t
+vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos)
+{
+
+ if ((msr_val & (1UL << bitpos)) == 0)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+uint32_t
+vmx_revision(void)
+{
+
+ return (rdmsr(MSR_VMX_BASIC) & 0xffffffff);
+}
+
+/*
+ * Generate a bitmask to be used for the VMCS execution control fields.
+ *
+ * The caller specifies what bits should be set to one in 'ones_mask'
+ * and what bits should be set to zero in 'zeros_mask'. The don't-care
+ * bits are set to the default value. The default values are obtained
+ * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining
+ * VMX Capabilities".
+ *
+ * Returns zero on success and non-zero on error.
+ */
+int
+vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
+ uint32_t zeros_mask, uint32_t *retval)
+{
+ int i;
+ uint64_t val, trueval;
+ boolean_t true_ctls_avail, one_allowed, zero_allowed;
+
+ /* We cannot ask the same bit to be set to both '1' and '0' */
+ if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask))
+ return (EINVAL);
+
+ if (rdmsr(MSR_VMX_BASIC) & (1UL << 55))
+ true_ctls_avail = TRUE;
+ else
+ true_ctls_avail = FALSE;
+
+ val = rdmsr(ctl_reg);
+ if (true_ctls_avail)
+ trueval = rdmsr(true_ctl_reg); /* step c */
+ else
+ trueval = val; /* step a */
+
+ for (i = 0; i < 32; i++) {
+ one_allowed = vmx_ctl_allows_one_setting(trueval, i);
+ zero_allowed = vmx_ctl_allows_zero_setting(trueval, i);
+
+ KASSERT(one_allowed || zero_allowed,
+ ("invalid zero/one setting for bit %d of ctl 0x%0x, "
+ "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg));
+
+ if (zero_allowed && !one_allowed) { /* b(i),c(i) */
+ if (ones_mask & (1 << i))
+ return (EINVAL);
+ *retval &= ~(1 << i);
+ } else if (one_allowed && !zero_allowed) { /* b(i),c(i) */
+ if (zeros_mask & (1 << i))
+ return (EINVAL);
+ *retval |= 1 << i;
+ } else {
+ if (zeros_mask & (1 << i)) /* b(ii),c(ii) */
+ *retval &= ~(1 << i);
+ else if (ones_mask & (1 << i)) /* b(ii), c(ii) */
+ *retval |= 1 << i;
+ else if (!true_ctls_avail)
+ *retval &= ~(1 << i); /* b(iii) */
+ else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/
+ *retval &= ~(1 << i);
+ else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */
+ *retval |= 1 << i;
+ else {
+ panic("vmx_set_ctlreg: unable to determine "
+ "correct value of ctl bit %d for msr "
+ "0x%0x and true msr 0x%0x", i, ctl_reg,
+ true_ctl_reg);
+ }
+ }
+ }
+
+ return (0);
+}
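+
+/*
+ * Example usage, mirroring vmx_init(): require that the "unrestricted
+ * guest" control can be set to 1 and let every other bit assume its
+ * default setting:
+ *
+ *	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
+ *	    MSR_VMX_PROCBASED_CTLS2,
+ *	    PROCBASED2_UNRESTRICTED_GUEST, 0, &tmp);
+ */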
+
+void
+msr_bitmap_initialize(char *bitmap)
+{
+
+ memset(bitmap, 0xff, PAGE_SIZE);
+}
+
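+/*
+ * The 4KB MSR bitmap is laid out as four 1KB regions, in order: the
+ * read bitmap for low MSRs (0x0 - 0x1fff), the read bitmap for high
+ * MSRs (0xc0000000 - 0xc0001fff), and then the corresponding write
+ * bitmaps. A clear bit permits the access. For example, to give the
+ * guest direct read/write access to MSR_GSBASE:
+ *
+ *	msr_bitmap_change_access(bitmap, MSR_GSBASE, MSR_BITMAP_ACCESS_RW);
+ */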
+int
+msr_bitmap_change_access(char *bitmap, u_int msr, int access)
+{
+ int byte, bit;
+
+ if (msr >= 0x00000000 && msr <= 0x00001FFF)
+ byte = msr / 8;
+ else if (msr >= 0xC0000000 && msr <= 0xC0001FFF)
+ byte = 1024 + (msr - 0xC0000000) / 8;
+ else
+ return (EINVAL);
+
+ bit = msr & 0x7;
+
+ if (access & MSR_BITMAP_ACCESS_READ)
+ bitmap[byte] &= ~(1 << bit);
+ else
+ bitmap[byte] |= 1 << bit;
+
+ byte += 2048;
+ if (access & MSR_BITMAP_ACCESS_WRITE)
+ bitmap[byte] &= ~(1 << bit);
+ else
+ bitmap[byte] |= 1 << bit;
+
+ return (0);
+}
diff --git a/sys/amd64/vmm/intel/vmx_msr.h b/sys/amd64/vmm/intel/vmx_msr.h
new file mode 100644
index 000000000000..e6379a93d155
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_msr.h
@@ -0,0 +1,78 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_MSR_H_
+#define _VMX_MSR_H_
+
+#define MSR_VMX_BASIC 0x480
+#define MSR_VMX_EPT_VPID_CAP 0x48C
+
+#define MSR_VMX_PROCBASED_CTLS 0x482
+#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48E
+
+#define MSR_VMX_PINBASED_CTLS 0x481
+#define MSR_VMX_TRUE_PINBASED_CTLS 0x48D
+
+#define MSR_VMX_PROCBASED_CTLS2 0x48B
+
+#define MSR_VMX_EXIT_CTLS 0x483
+#define MSR_VMX_TRUE_EXIT_CTLS 0x48F
+
+#define MSR_VMX_ENTRY_CTLS 0x484
+#define MSR_VMX_TRUE_ENTRY_CTLS 0x490
+
+#define MSR_VMX_CR0_FIXED0 0x486
+#define MSR_VMX_CR0_FIXED1 0x487
+
+#define MSR_VMX_CR4_FIXED0 0x488
+#define MSR_VMX_CR4_FIXED1 0x489
+
+uint32_t vmx_revision(void);
+
+int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
+ uint32_t zeros_mask, uint32_t *retval);
+
+/*
+ * According to Section 21.10.4 "Software Access to Related Structures",
+ * changes to data structures pointed to by the VMCS must be made only when
+ * there is no logical processor with a current VMCS that points to the
+ * data structure.
+ *
+ * This pretty much limits us to configuring the MSR bitmap before VMCS
+ * initialization for SMP VMs. Unless of course we do it the hard way - which
+ * would involve some form of synchronization between the vcpus to vmclear
+ * all of the VMCSs that point to the bitmap.
+ */
+#define MSR_BITMAP_ACCESS_NONE 0x0
+#define MSR_BITMAP_ACCESS_READ 0x1
+#define MSR_BITMAP_ACCESS_WRITE 0x2
+#define MSR_BITMAP_ACCESS_RW (MSR_BITMAP_ACCESS_READ|MSR_BITMAP_ACCESS_WRITE)
+void msr_bitmap_initialize(char *bitmap);
+int msr_bitmap_change_access(char *bitmap, u_int msr, int access);
+
+#endif
diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S
new file mode 100644
index 000000000000..4d1bf1da13df
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_support.S
@@ -0,0 +1,204 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <machine/asmacros.h>
+
+#include "vmx_assym.s"
+
+/*
+ * Assumes that %rdi holds a pointer to the 'vmxctx'
+ */
+#define VMX_GUEST_RESTORE \
+ /* \
+ * Make sure that interrupts are disabled before restoring CR2. \
+ * Otherwise there could be a page fault during the interrupt \
+ * handler execution that would end up trashing CR2. \
+ */ \
+ cli; \
+ movq VMXCTX_GUEST_CR2(%rdi),%rsi; \
+ movq %rsi,%cr2; \
+ movq VMXCTX_GUEST_RSI(%rdi),%rsi; \
+ movq VMXCTX_GUEST_RDX(%rdi),%rdx; \
+ movq VMXCTX_GUEST_RCX(%rdi),%rcx; \
+ movq VMXCTX_GUEST_R8(%rdi),%r8; \
+ movq VMXCTX_GUEST_R9(%rdi),%r9; \
+ movq VMXCTX_GUEST_RAX(%rdi),%rax; \
+ movq VMXCTX_GUEST_RBX(%rdi),%rbx; \
+ movq VMXCTX_GUEST_RBP(%rdi),%rbp; \
+ movq VMXCTX_GUEST_R10(%rdi),%r10; \
+ movq VMXCTX_GUEST_R11(%rdi),%r11; \
+ movq VMXCTX_GUEST_R12(%rdi),%r12; \
+ movq VMXCTX_GUEST_R13(%rdi),%r13; \
+ movq VMXCTX_GUEST_R14(%rdi),%r14; \
+ movq VMXCTX_GUEST_R15(%rdi),%r15; \
+ movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore %rdi last */
+
+#define VM_INSTRUCTION_ERROR(reg) \
+ jnc 1f; \
+ movl $VM_FAIL_INVALID,reg; /* CF is set */ \
+ jmp 3f; \
+1: jnz 2f; \
+ movl $VM_FAIL_VALID,reg; /* ZF is set */ \
+ jmp 3f; \
+2: movl $VM_SUCCESS,reg; \
+3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp)
+
+ .text
+/*
+ * int vmx_setjmp(ctxp)
+ * %rdi = ctxp
+ *
+ * Return value is '0' when it returns directly from here.
+ * Return value is '1' when it returns after a vm exit through vmx_longjmp.
+ */
+ENTRY(vmx_setjmp)
+ movq (%rsp),%rax /* return address */
+ movq %r15,VMXCTX_HOST_R15(%rdi)
+ movq %r14,VMXCTX_HOST_R14(%rdi)
+ movq %r13,VMXCTX_HOST_R13(%rdi)
+ movq %r12,VMXCTX_HOST_R12(%rdi)
+ movq %rbp,VMXCTX_HOST_RBP(%rdi)
+ movq %rsp,VMXCTX_HOST_RSP(%rdi)
+ movq %rbx,VMXCTX_HOST_RBX(%rdi)
+ movq %rax,VMXCTX_HOST_RIP(%rdi)
+
+ /*
+ * XXX save host debug registers
+ */
+ movl $VMX_RETURN_DIRECT,%eax
+ ret
+END(vmx_setjmp)
+
+/*
+ * void vmx_return(struct vmxctx *ctxp, int retval)
+ * %rdi = ctxp
+ * %rsi = retval
+ * Return to vmm context through vmx_setjmp() with a value of 'retval'.
+ */
+ENTRY(vmx_return)
+ /* Restore host context. */
+ movq VMXCTX_HOST_R15(%rdi),%r15
+ movq VMXCTX_HOST_R14(%rdi),%r14
+ movq VMXCTX_HOST_R13(%rdi),%r13
+ movq VMXCTX_HOST_R12(%rdi),%r12
+ movq VMXCTX_HOST_RBP(%rdi),%rbp
+ movq VMXCTX_HOST_RSP(%rdi),%rsp
+ movq VMXCTX_HOST_RBX(%rdi),%rbx
+ movq VMXCTX_HOST_RIP(%rdi),%rax
+ movq %rax,(%rsp) /* return address */
+
+ /*
+ * XXX restore host debug registers
+ */
+ movl %esi,%eax
+ ret
+END(vmx_return)
+
+/*
+ * void vmx_longjmp(void)
+ * %rsp points to the struct vmxctx
+ */
+ENTRY(vmx_longjmp)
+ /*
+ * Save guest state that is not automatically saved in the vmcs.
+ */
+ movq %rdi,VMXCTX_GUEST_RDI(%rsp)
+ movq %rsi,VMXCTX_GUEST_RSI(%rsp)
+ movq %rdx,VMXCTX_GUEST_RDX(%rsp)
+ movq %rcx,VMXCTX_GUEST_RCX(%rsp)
+ movq %r8,VMXCTX_GUEST_R8(%rsp)
+ movq %r9,VMXCTX_GUEST_R9(%rsp)
+ movq %rax,VMXCTX_GUEST_RAX(%rsp)
+ movq %rbx,VMXCTX_GUEST_RBX(%rsp)
+ movq %rbp,VMXCTX_GUEST_RBP(%rsp)
+ movq %r10,VMXCTX_GUEST_R10(%rsp)
+ movq %r11,VMXCTX_GUEST_R11(%rsp)
+ movq %r12,VMXCTX_GUEST_R12(%rsp)
+ movq %r13,VMXCTX_GUEST_R13(%rsp)
+ movq %r14,VMXCTX_GUEST_R14(%rsp)
+ movq %r15,VMXCTX_GUEST_R15(%rsp)
+
+ movq %cr2,%rdi
+ movq %rdi,VMXCTX_GUEST_CR2(%rsp)
+
+ movq %rsp,%rdi
+ movq $VMX_RETURN_LONGJMP,%rsi
+ callq vmx_return
+END(vmx_longjmp)
+
+/*
+ * void vmx_resume(struct vmxctx *ctxp)
+ * %rdi = ctxp
+ *
+ * Although the return type is 'void', this function may return indirectly
+ * through vmx_setjmp() with a return value of 2.
+ */
+ENTRY(vmx_resume)
+ /*
+ * Restore guest state that is not automatically loaded from the vmcs.
+ */
+ VMX_GUEST_RESTORE
+
+ vmresume
+
+ /*
+ * Capture the reason why vmresume failed.
+ */
+ VM_INSTRUCTION_ERROR(%eax)
+
+ /* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */
+ movq %rsp,%rdi
+ movq $VMX_RETURN_VMRESUME,%rsi
+ callq vmx_return
+END(vmx_resume)
+
+/*
+ * void vmx_launch(struct vmxctx *ctxp)
+ * %rdi = ctxp
+ *
+ * Although the return type is 'void', this function may return indirectly
+ * through vmx_setjmp() with a return value of 3.
+ */
+ENTRY(vmx_launch)
+ /*
+ * Restore guest state that is not automatically loaded from the vmcs.
+ */
+ VMX_GUEST_RESTORE
+
+ vmlaunch
+
+ /*
+ * Capture the reason why vmlaunch failed.
+ */
+ VM_INSTRUCTION_ERROR(%eax)
+
+ /* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */
+ movq %rsp,%rdi
+ movq $VMX_RETURN_VMLAUNCH,%rsi
+ callq vmx_return
+END(vmx_launch)
diff --git a/sys/amd64/vmm/intel/vtd.c b/sys/amd64/vmm/intel/vtd.c
new file mode 100644
index 000000000000..24495a977477
--- /dev/null
+++ b/sys/amd64/vmm/intel/vtd.c
@@ -0,0 +1,637 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <dev/pci/pcireg.h>
+
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+#include <machine/pci_cfgreg.h>
+
+#include "io/iommu.h"
+
+/*
+ * Documented in the "Intel Virtualization Technology for Directed I/O",
+ * Architecture Spec, September 2008.
+ */
+
+/* Section 10.4 "Register Descriptions" */
+struct vtdmap {
+ volatile uint32_t version;
+ volatile uint32_t res0;
+ volatile uint64_t cap;
+ volatile uint64_t ext_cap;
+ volatile uint32_t gcr;
+ volatile uint32_t gsr;
+ volatile uint64_t rta;
+ volatile uint64_t ccr;
+};
+
+#define VTD_CAP_SAGAW(cap) (((cap) >> 8) & 0x1F)
+#define VTD_CAP_ND(cap) ((cap) & 0x7)
+#define VTD_CAP_CM(cap) (((cap) >> 7) & 0x1)
+#define VTD_CAP_SPS(cap) (((cap) >> 34) & 0xF)
+#define VTD_CAP_RWBF(cap) (((cap) >> 4) & 0x1)
+
+#define VTD_ECAP_DI(ecap) (((ecap) >> 2) & 0x1)
+#define VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1)
+#define VTD_ECAP_IRO(ecap) (((ecap) >> 8) & 0x3FF)
+
+#define VTD_GCR_WBF (1 << 27)
+#define VTD_GCR_SRTP (1 << 30)
+#define VTD_GCR_TE (1 << 31)
+
+#define VTD_GSR_WBFS (1 << 27)
+#define VTD_GSR_RTPS (1 << 30)
+#define VTD_GSR_TES (1 << 31)
+
+#define VTD_CCR_ICC (1UL << 63) /* invalidate context cache */
+#define VTD_CCR_CIRG_GLOBAL (1UL << 61) /* global invalidation */
+
+#define	VTD_IIR_IVT		(1UL << 63)	/* invalidate IOTLB */
+#define VTD_IIR_IIRG_GLOBAL (1ULL << 60) /* global IOTLB invalidation */
+#define VTD_IIR_IIRG_DOMAIN (2ULL << 60) /* domain IOTLB invalidation */
+#define VTD_IIR_IIRG_PAGE (3ULL << 60) /* page IOTLB invalidation */
+#define VTD_IIR_DRAIN_READS (1ULL << 49) /* drain pending DMA reads */
+#define VTD_IIR_DRAIN_WRITES (1ULL << 48) /* drain pending DMA writes */
+#define VTD_IIR_DOMAIN_P 32
+
+#define VTD_ROOT_PRESENT 0x1
+#define VTD_CTX_PRESENT 0x1
+#define VTD_CTX_TT_ALL (1UL << 2)
+
+#define VTD_PTE_RD (1UL << 0)
+#define VTD_PTE_WR (1UL << 1)
+#define VTD_PTE_SUPERPAGE (1UL << 7)
+#define VTD_PTE_ADDR_M (0x000FFFFFFFFFF000UL)
+
+struct domain {
+ uint64_t *ptp; /* first level page table page */
+ int pt_levels; /* number of page table levels */
+ int addrwidth; /* 'AW' field in context entry */
+ int spsmask; /* supported super page sizes */
+ u_int id; /* domain id */
+ vm_paddr_t maxaddr; /* highest address to be mapped */
+ SLIST_ENTRY(domain) next;
+};
+
+static SLIST_HEAD(, domain) domhead;
+
+#define DRHD_MAX_UNITS 8
+static int drhd_num;
+static struct vtdmap *vtdmaps[DRHD_MAX_UNITS];
+static int max_domains;
+typedef int (*drhd_ident_func_t)(void);
+
+static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
+static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
+
+static MALLOC_DEFINE(M_VTD, "vtd", "vtd");
+
+/*
+ * Config space register definitions from the "Intel 5520 and 5500" datasheet.
+ */
+static int
+tylersburg_vtd_ident(void)
+{
+ int units, nlbus;
+ uint16_t did, vid;
+ uint32_t miscsts, vtbar;
+
+ const int bus = 0;
+ const int slot = 20;
+ const int func = 0;
+
+ units = 0;
+
+ vid = pci_cfgregread(bus, slot, func, PCIR_VENDOR, 2);
+ did = pci_cfgregread(bus, slot, func, PCIR_DEVICE, 2);
+ if (vid != 0x8086 || did != 0x342E)
+ goto done;
+
+ /*
+ * Check if this is a dual IOH configuration.
+ */
+ miscsts = pci_cfgregread(bus, slot, func, 0x9C, 4);
+ if (miscsts & (1 << 25))
+ nlbus = pci_cfgregread(bus, slot, func, 0x160, 1);
+ else
+ nlbus = -1;
+
+ vtbar = pci_cfgregread(bus, slot, func, 0x180, 4);
+ if (vtbar & 0x1) {
+ vtdmaps[units++] = (struct vtdmap *)
+ PHYS_TO_DMAP(vtbar & 0xffffe000);
+ } else if (bootverbose)
+ printf("VT-d unit in legacy IOH is disabled!\n");
+
+ if (nlbus != -1) {
+ vtbar = pci_cfgregread(nlbus, slot, func, 0x180, 4);
+ if (vtbar & 0x1) {
+ vtdmaps[units++] = (struct vtdmap *)
+ PHYS_TO_DMAP(vtbar & 0xffffe000);
+ } else if (bootverbose)
+ printf("VT-d unit in non-legacy IOH is disabled!\n");
+ }
+done:
+ return (units);
+}
+
+static drhd_ident_func_t drhd_ident_funcs[] = {
+ tylersburg_vtd_ident,
+ NULL
+};
+
+static int
+vtd_max_domains(struct vtdmap *vtdmap)
+{
+ int nd;
+
+ nd = VTD_CAP_ND(vtdmap->cap);
+
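+	/* The ND capability field encodes 2^(4 + 2*ND) supported domain ids. */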
+ switch (nd) {
+ case 0:
+ return (16);
+ case 1:
+ return (64);
+ case 2:
+ return (256);
+ case 3:
+ return (1024);
+ case 4:
+ return (4 * 1024);
+ case 5:
+ return (16 * 1024);
+ case 6:
+ return (64 * 1024);
+ default:
+ panic("vtd_max_domains: invalid value of nd (0x%0x)", nd);
+ }
+}
+
+static u_int
+domain_id(void)
+{
+ u_int id;
+ struct domain *dom;
+
+ /* Skip domain id 0 - it is reserved when Caching Mode field is set */
+ for (id = 1; id < max_domains; id++) {
+ SLIST_FOREACH(dom, &domhead, next) {
+ if (dom->id == id)
+ break;
+ }
+ if (dom == NULL)
+ break; /* found it */
+ }
+
+ if (id >= max_domains)
+ panic("domain ids exhausted");
+
+ return (id);
+}
+
+static void
+vtd_wbflush(struct vtdmap *vtdmap)
+{
+
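+	/*
+	 * Flush the CPU caches if the remapping unit is not coherent with
+	 * them, then force a write-buffer flush if the unit requires one.
+	 */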
+ if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0)
+ pmap_invalidate_cache();
+
+ if (VTD_CAP_RWBF(vtdmap->cap)) {
+ vtdmap->gcr = VTD_GCR_WBF;
+ while ((vtdmap->gsr & VTD_GSR_WBFS) != 0)
+ ;
+ }
+}
+
+static void
+vtd_ctx_global_invalidate(struct vtdmap *vtdmap)
+{
+
+ vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL;
+ while ((vtdmap->ccr & VTD_CCR_ICC) != 0)
+ ;
+}
+
+static void
+vtd_iotlb_global_invalidate(struct vtdmap *vtdmap)
+{
+ int offset;
+ volatile uint64_t *iotlb_reg, val;
+
+ vtd_wbflush(vtdmap);
+
+ offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16;
+ iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8);
+
+ *iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL |
+ VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES;
+
+ while (1) {
+ val = *iotlb_reg;
+ if ((val & VTD_IIR_IVT) == 0)
+ break;
+ }
+}
+
+static void
+vtd_translation_enable(struct vtdmap *vtdmap)
+{
+
+ vtdmap->gcr = VTD_GCR_TE;
+ while ((vtdmap->gsr & VTD_GSR_TES) == 0)
+ ;
+}
+
+static void
+vtd_translation_disable(struct vtdmap *vtdmap)
+{
+
+ vtdmap->gcr = 0;
+ while ((vtdmap->gsr & VTD_GSR_TES) != 0)
+ ;
+}
+
+static int
+vtd_init(void)
+{
+ int i, units;
+ struct vtdmap *vtdmap;
+ vm_paddr_t ctx_paddr;
+
+ for (i = 0; drhd_ident_funcs[i] != NULL; i++) {
+ units = (*drhd_ident_funcs[i])();
+ if (units > 0)
+ break;
+ }
+
+ if (units <= 0)
+ return (ENXIO);
+
+ drhd_num = units;
+ vtdmap = vtdmaps[0];
+
+ if (VTD_CAP_CM(vtdmap->cap) != 0)
+ panic("vtd_init: invalid caching mode");
+
+ max_domains = vtd_max_domains(vtdmap);
+
+ /*
+ * Set up the root-table to point to the context-entry tables
+ */
+ for (i = 0; i < 256; i++) {
+ ctx_paddr = vtophys(ctx_tables[i]);
+ if (ctx_paddr & PAGE_MASK)
+ panic("ctx table (0x%0lx) not page aligned", ctx_paddr);
+
+ root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT;
+ }
+
+ return (0);
+}
+
+static void
+vtd_cleanup(void)
+{
+}
+
+static void
+vtd_enable(void)
+{
+ int i;
+ struct vtdmap *vtdmap;
+
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_wbflush(vtdmap);
+
+ /* Update the root table address */
+ vtdmap->rta = vtophys(root_table);
+ vtdmap->gcr = VTD_GCR_SRTP;
+ while ((vtdmap->gsr & VTD_GSR_RTPS) == 0)
+ ;
+
+ vtd_ctx_global_invalidate(vtdmap);
+ vtd_iotlb_global_invalidate(vtdmap);
+
+ vtd_translation_enable(vtdmap);
+ }
+}
+
+static void
+vtd_disable(void)
+{
+ int i;
+ struct vtdmap *vtdmap;
+
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_translation_disable(vtdmap);
+ }
+}
+
+static void
+vtd_add_device(void *arg, int bus, int slot, int func)
+{
+ int idx;
+ uint64_t *ctxp;
+ struct domain *dom = arg;
+ vm_paddr_t pt_paddr;
+ struct vtdmap *vtdmap;
+
+ if (bus < 0 || bus > PCI_BUSMAX ||
+ slot < 0 || slot > PCI_SLOTMAX ||
+ func < 0 || func > PCI_FUNCMAX)
+ panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func);
+
+ vtdmap = vtdmaps[0];
+ ctxp = ctx_tables[bus];
+ pt_paddr = vtophys(dom->ptp);
+ idx = (slot << 3 | func) * 2;
+
+ if (ctxp[idx] & VTD_CTX_PRESENT) {
+ panic("vtd_add_device: device %d/%d/%d is already owned by "
+ "domain %d", bus, slot, func,
+ (uint16_t)(ctxp[idx + 1] >> 8));
+ }
+
+ /*
+ * Order is important. The 'present' bit is set only after all fields
+ * of the context pointer are initialized.
+ */
+ ctxp[idx + 1] = dom->addrwidth | (dom->id << 8);
+
+ if (VTD_ECAP_DI(vtdmap->ext_cap))
+ ctxp[idx] = VTD_CTX_TT_ALL;
+ else
+ ctxp[idx] = 0;
+
+ ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT;
+
+ /*
+ * 'Not Present' entries are not cached in either the Context Cache
+ * or in the IOTLB, so there is no need to invalidate either of them.
+ */
+}
+
+static void
+vtd_remove_device(void *arg, int bus, int slot, int func)
+{
+ int i, idx;
+ uint64_t *ctxp;
+ struct vtdmap *vtdmap;
+
+ if (bus < 0 || bus > PCI_BUSMAX ||
+ slot < 0 || slot > PCI_SLOTMAX ||
+ func < 0 || func > PCI_FUNCMAX)
+ panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func);
+
+ ctxp = ctx_tables[bus];
+ idx = (slot << 3 | func) * 2;
+
+ /*
+	 * Order is important. The 'present' bit must be cleared first.
+ */
+ ctxp[idx] = 0;
+ ctxp[idx + 1] = 0;
+
+ /*
+ * Invalidate the Context Cache and the IOTLB.
+ *
+ * XXX use device-selective invalidation for Context Cache
+ * XXX use domain-selective invalidation for IOTLB
+ */
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_ctx_global_invalidate(vtdmap);
+ vtd_iotlb_global_invalidate(vtdmap);
+ }
+}
+
+static uint64_t
+vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
+{
+ struct domain *dom;
+ int i, spshift, ptpshift, ptpindex, nlevels;
+ uint64_t spsize, *ptp;
+
+ dom = arg;
+ ptpindex = 0;
+ ptpshift = 0;
+
+ if (gpa & PAGE_MASK)
+ panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa);
+
+ if (hpa & PAGE_MASK)
+ panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa);
+
+ if (len & PAGE_MASK)
+ panic("vtd_create_mapping: unaligned len 0x%0lx", len);
+
+ /*
+	 * Compute the size of the mapping that we can accommodate.
+ *
+ * This is based on three factors:
+ * - supported super page size
+ * - alignment of the region starting at 'gpa' and 'hpa'
+ * - length of the region 'len'
+ */
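+	/*
+	 * For example: with 2MB super pages supported (bit 0 of 'spsmask'),
+	 * 'gpa' and 'hpa' both 2MB aligned and 'len' >= 2MB, the loop below
+	 * stops at i = 0 with spshift = 21, i.e. a 2MB leaf mapping.
+	 */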
+ spshift = 48;
+ for (i = 3; i >= 0; i--) {
+ spsize = 1UL << spshift;
+ if ((dom->spsmask & (1 << i)) != 0 &&
+ (gpa & (spsize - 1)) == 0 &&
+ (hpa & (spsize - 1)) == 0 &&
+ (len >= spsize)) {
+ break;
+ }
+ spshift -= 9;
+ }
+
+ ptp = dom->ptp;
+ nlevels = dom->pt_levels;
+ while (--nlevels >= 0) {
+ ptpshift = 12 + nlevels * 9;
+ ptpindex = (gpa >> ptpshift) & 0x1FF;
+
+ /* We have reached the leaf mapping */
+ if (spshift >= ptpshift) {
+ break;
+ }
+
+ /*
+ * We are working on a non-leaf page table page.
+ *
+ * Create a downstream page table page if necessary and point
+ * to it from the current page table.
+ */
+ if (ptp[ptpindex] == 0) {
+ void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO);
+			ptp[ptpindex] = vtophys(nlp) | VTD_PTE_RD | VTD_PTE_WR;
+ }
+
+ ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M);
+ }
+
+ if ((gpa & ((1UL << ptpshift) - 1)) != 0)
+ panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift);
+
+ /*
+ * Create a 'gpa' -> 'hpa' mapping
+ */
+ ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR;
+
+ if (nlevels > 0)
+ ptp[ptpindex] |= VTD_PTE_SUPERPAGE;
+
+ return (1UL << ptpshift);
+}
+
+static void *
+vtd_create_domain(vm_paddr_t maxaddr)
+{
+ struct domain *dom;
+ vm_paddr_t addr;
+ int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth;
+ struct vtdmap *vtdmap;
+
+ if (drhd_num <= 0)
+ panic("vtd_create_domain: no dma remapping hardware available");
+
+ vtdmap = vtdmaps[0];
+
+ /*
+ * Calculate AGAW.
+ * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec.
+ */
+ addr = 0;
+ for (gaw = 0; addr < maxaddr; gaw++)
+ addr = 1ULL << gaw;
+
+ res = (gaw - 12) % 9;
+ if (res == 0)
+ agaw = gaw;
+ else
+ agaw = gaw + 9 - res;
+
+ if (agaw > 64)
+ agaw = 64;
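+
+	/*
+	 * For example: maxaddr = 4GB yields gaw = 33, and since
+	 * (33 - 12) % 9 == 3 the adjusted width is agaw = 33 + 9 - 3 = 39,
+	 * i.e. a 3-level page table.
+	 */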
+
+ /*
+ * Select the smallest Supported AGAW and the corresponding number
+ * of page table levels.
+ */
+ pt_levels = 2;
+ sagaw = 30;
+ addrwidth = 0;
+ tmp = VTD_CAP_SAGAW(vtdmap->cap);
+ for (i = 0; i < 5; i++) {
+ if ((tmp & (1 << i)) != 0 && sagaw >= agaw)
+ break;
+ pt_levels++;
+ addrwidth++;
+ sagaw += 9;
+ if (sagaw > 64)
+ sagaw = 64;
+ }
+
+ if (i >= 5) {
+ panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d",
+ VTD_CAP_SAGAW(vtdmap->cap), agaw);
+ }
+
+ dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK);
+ dom->pt_levels = pt_levels;
+ dom->addrwidth = addrwidth;
+ dom->spsmask = VTD_CAP_SPS(vtdmap->cap);
+ dom->id = domain_id();
+ dom->maxaddr = maxaddr;
+ dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK);
+ if ((uintptr_t)dom->ptp & PAGE_MASK)
+ panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp);
+
+ SLIST_INSERT_HEAD(&domhead, dom, next);
+
+ return (dom);
+}
+
+static void
+vtd_free_ptp(uint64_t *ptp, int level)
+{
+ int i;
+ uint64_t *nlp;
+
+ if (level > 1) {
+ for (i = 0; i < 512; i++) {
+ if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0)
+ continue;
+ if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0)
+ continue;
+ nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M);
+ vtd_free_ptp(nlp, level - 1);
+ }
+ }
+
+ bzero(ptp, PAGE_SIZE);
+ free(ptp, M_VTD);
+}
+
+static void
+vtd_destroy_domain(void *arg)
+{
+ struct domain *dom;
+
+ dom = arg;
+
+ SLIST_REMOVE(&domhead, dom, domain, next);
+ vtd_free_ptp(dom->ptp, dom->pt_levels);
+ free(dom, M_VTD);
+}
+
+struct iommu_ops iommu_ops_intel = {
+ vtd_init,
+ vtd_cleanup,
+ vtd_enable,
+ vtd_disable,
+ vtd_create_domain,
+ vtd_destroy_domain,
+ vtd_create_mapping,
+ vtd_add_device,
+ vtd_remove_device,
+};
diff --git a/sys/amd64/vmm/io/iommu.c b/sys/amd64/vmm/io/iommu.c
new file mode 100644
index 000000000000..baf2447c4fa0
--- /dev/null
+++ b/sys/amd64/vmm/io/iommu.c
@@ -0,0 +1,230 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+
+#include <machine/md_var.h>
+
+#include "vmm_util.h"
+#include "iommu.h"
+
+static boolean_t iommu_avail;
+static struct iommu_ops *ops;
+static void *host_domain;
+
+static __inline int
+IOMMU_INIT(void)
+{
+ if (ops != NULL)
+ return ((*ops->init)());
+ else
+ return (ENXIO);
+}
+
+static __inline void
+IOMMU_CLEANUP(void)
+{
+ if (ops != NULL && iommu_avail)
+ (*ops->cleanup)();
+}
+
+static __inline void *
+IOMMU_CREATE_DOMAIN(vm_paddr_t maxaddr)
+{
+
+ if (ops != NULL && iommu_avail)
+ return ((*ops->create_domain)(maxaddr));
+ else
+ return (NULL);
+}
+
+static __inline void
+IOMMU_DESTROY_DOMAIN(void *dom)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->destroy_domain)(dom);
+}
+
+static __inline uint64_t
+IOMMU_CREATE_MAPPING(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
+{
+
+ if (ops != NULL && iommu_avail)
+ return ((*ops->create_mapping)(domain, gpa, hpa, len));
+ else
+ return (len); /* XXX */
+}
+
+static __inline void
+IOMMU_ADD_DEVICE(void *domain, int bus, int slot, int func)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->add_device)(domain, bus, slot, func);
+}
+
+static __inline void
+IOMMU_REMOVE_DEVICE(void *domain, int bus, int slot, int func)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->remove_device)(domain, bus, slot, func);
+}
+
+static __inline void
+IOMMU_ENABLE(void)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->enable)();
+}
+
+static __inline void
+IOMMU_DISABLE(void)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->disable)();
+}
+
+void
+iommu_init(void)
+{
+ int error, bus, slot, func;
+ vm_paddr_t maxaddr;
+ const char *name;
+ device_t dev;
+
+ if (vmm_is_intel())
+ ops = &iommu_ops_intel;
+ else if (vmm_is_amd())
+ ops = &iommu_ops_amd;
+ else
+ ops = NULL;
+
+ error = IOMMU_INIT();
+ if (error)
+ return;
+
+ iommu_avail = TRUE;
+
+ /*
+ * Create a domain for the devices owned by the host
+ */
+ maxaddr = ptoa(Maxmem);
+ host_domain = IOMMU_CREATE_DOMAIN(maxaddr);
+ if (host_domain == NULL)
+ panic("iommu_init: unable to create a host domain");
+
+ /*
+ * Create 1:1 mappings from '0' to 'Maxmem' for devices assigned to
+ * the host
+ */
+ iommu_create_mapping(host_domain, 0, 0, maxaddr);
+
+ for (bus = 0; bus <= PCI_BUSMAX; bus++) {
+ for (slot = 0; slot <= PCI_SLOTMAX; slot++) {
+ for (func = 0; func <= PCI_FUNCMAX; func++) {
+ dev = pci_find_dbsf(0, bus, slot, func);
+ if (dev == NULL)
+ continue;
+
+ /* skip passthrough devices */
+ name = device_get_name(dev);
+ if (name != NULL && strcmp(name, "ppt") == 0)
+ continue;
+
+ /* everything else belongs to the host domain */
+ iommu_add_device(host_domain, bus, slot, func);
+ }
+ }
+ }
+ IOMMU_ENABLE();
+}
+
+void
+iommu_cleanup(void)
+{
+ IOMMU_DISABLE();
+ IOMMU_DESTROY_DOMAIN(host_domain);
+ IOMMU_CLEANUP();
+}
+
+void *
+iommu_create_domain(vm_paddr_t maxaddr)
+{
+
+ return (IOMMU_CREATE_DOMAIN(maxaddr));
+}
+
+void
+iommu_destroy_domain(void *dom)
+{
+
+ IOMMU_DESTROY_DOMAIN(dom);
+}
+
+void
+iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len)
+{
+ uint64_t mapped, remaining;
+
+ remaining = len;
+
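+	/*
+	 * Each iteration maps as much as alignment and super-page support
+	 * allow; the ops method returns the size actually mapped, so keep
+	 * going until the whole range is covered.
+	 */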
+ while (remaining > 0) {
+ mapped = IOMMU_CREATE_MAPPING(dom, gpa, hpa, remaining);
+ gpa += mapped;
+ hpa += mapped;
+ remaining -= mapped;
+ }
+}
+
+void
+iommu_add_device(void *dom, int bus, int slot, int func)
+{
+
+ IOMMU_ADD_DEVICE(dom, bus, slot, func);
+}
+
+void
+iommu_remove_device(void *dom, int bus, int slot, int func)
+{
+
+ IOMMU_REMOVE_DEVICE(dom, bus, slot, func);
+}
diff --git a/sys/amd64/vmm/io/iommu.h b/sys/amd64/vmm/io/iommu.h
new file mode 100644
index 000000000000..e4f722914fdd
--- /dev/null
+++ b/sys/amd64/vmm/io/iommu.h
@@ -0,0 +1,67 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IO_IOMMU_H_
+#define _IO_IOMMU_H_
+
+typedef int (*iommu_init_func_t)(void);
+typedef void (*iommu_cleanup_func_t)(void);
+typedef void (*iommu_enable_func_t)(void);
+typedef void (*iommu_disable_func_t)(void);
+typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr);
+typedef void (*iommu_destroy_domain_t)(void *domain);
+typedef uint64_t (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa,
+ vm_paddr_t hpa, uint64_t len);
+typedef void (*iommu_add_device_t)(void *domain, int bus, int slot, int func);
+typedef void (*iommu_remove_device_t)(void *dom, int bus, int slot, int func);
+
+struct iommu_ops {
+ iommu_init_func_t init; /* module wide */
+ iommu_cleanup_func_t cleanup;
+ iommu_enable_func_t enable;
+ iommu_disable_func_t disable;
+
+ iommu_create_domain_t create_domain; /* domain-specific */
+ iommu_destroy_domain_t destroy_domain;
+ iommu_create_mapping_t create_mapping;
+ iommu_add_device_t add_device;
+ iommu_remove_device_t remove_device;
+};
+
+extern struct iommu_ops iommu_ops_intel;
+extern struct iommu_ops iommu_ops_amd;
+
+void iommu_init(void);
+void iommu_cleanup(void);
+void *iommu_create_domain(vm_paddr_t maxaddr);
+void iommu_destroy_domain(void *dom);
+void iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa,
+ size_t len);
+void iommu_add_device(void *dom, int bus, int slot, int func);
+void iommu_remove_device(void *dom, int bus, int slot, int func);
+#endif
diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c
new file mode 100644
index 000000000000..dc2f326b1e85
--- /dev/null
+++ b/sys/amd64/vmm/io/ppt.c
@@ -0,0 +1,449 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/bus.h>
+#include <sys/pciio.h>
+#include <sys/rman.h>
+
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+
+#include <machine/resource.h>
+
+#include <machine/vmm.h>
+#include <machine/vmm_dev.h>
+
+#include "vmm_lapic.h"
+#include "vmm_ktr.h"
+
+#include "iommu.h"
+#include "ppt.h"
+
+#define MAX_PPTDEVS (sizeof(pptdevs) / sizeof(pptdevs[0]))
+#define MAX_MMIOSEGS (PCIR_MAX_BAR_0 + 1)
+#define MAX_MSIMSGS 32
+
+struct pptintr_arg { /* pptintr(pptintr_arg) */
+ struct pptdev *pptdev;
+ int msg;
+};
+
+static struct pptdev {
+ device_t dev;
+ struct vm *vm; /* owner of this device */
+ struct vm_memory_segment mmio[MAX_MMIOSEGS];
+ struct {
+ int num_msgs; /* guest state */
+ int vector;
+ int vcpu;
+
+ int startrid; /* host state */
+ struct resource *res[MAX_MSIMSGS];
+ void *cookie[MAX_MSIMSGS];
+ struct pptintr_arg arg[MAX_MSIMSGS];
+ } msi;
+} pptdevs[32];
+
+static int num_pptdevs;
+
+static int
+ppt_probe(device_t dev)
+{
+ int bus, slot, func;
+ struct pci_devinfo *dinfo;
+
+ dinfo = (struct pci_devinfo *)device_get_ivars(dev);
+
+ bus = pci_get_bus(dev);
+ slot = pci_get_slot(dev);
+ func = pci_get_function(dev);
+
+ /*
+ * To qualify as a pci passthrough device a device must:
+ * - be allowed by administrator to be used in this role
+ * - be an endpoint device
+ */
+ if (vmm_is_pptdev(bus, slot, func) &&
+ (dinfo->cfg.hdrtype & PCIM_HDRTYPE) == PCIM_HDRTYPE_NORMAL)
+ return (0);
+ else
+ return (ENXIO);
+}
+
+static int
+ppt_attach(device_t dev)
+{
+ int n;
+
+ if (num_pptdevs >= MAX_PPTDEVS) {
+ printf("ppt_attach: maximum number of pci passthrough devices "
+ "exceeded\n");
+ return (ENXIO);
+ }
+
+ n = num_pptdevs++;
+ pptdevs[n].dev = dev;
+
+ if (bootverbose)
+ device_printf(dev, "attached\n");
+
+ return (0);
+}
+
+static int
+ppt_detach(device_t dev)
+{
+ /*
+ * XXX check whether there are any pci passthrough devices assigned
+ * to guests before we allow this driver to detach.
+ */
+
+ return (0);
+}
+
+static device_method_t ppt_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, ppt_probe),
+ DEVMETHOD(device_attach, ppt_attach),
+ DEVMETHOD(device_detach, ppt_detach),
+ {0, 0}
+};
+
+static devclass_t ppt_devclass;
+DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, 0);
+DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL);
+
+static struct pptdev *
+ppt_find(int bus, int slot, int func)
+{
+ device_t dev;
+ int i, b, s, f;
+
+ for (i = 0; i < num_pptdevs; i++) {
+ dev = pptdevs[i].dev;
+ b = pci_get_bus(dev);
+ s = pci_get_slot(dev);
+ f = pci_get_function(dev);
+ if (bus == b && slot == s && func == f)
+ return (&pptdevs[i]);
+ }
+ return (NULL);
+}
+
+static void
+ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt)
+{
+ int i;
+ struct vm_memory_segment *seg;
+
+ for (i = 0; i < MAX_MMIOSEGS; i++) {
+ seg = &ppt->mmio[i];
+ if (seg->len == 0)
+ continue;
+ (void)vm_unmap_mmio(vm, seg->gpa, seg->len);
+ bzero(seg, sizeof(struct vm_memory_segment));
+ }
+}
+
+static void
+ppt_teardown_msi(struct pptdev *ppt)
+{
+ int i, rid;
+ void *cookie;
+ struct resource *res;
+
+ if (ppt->msi.num_msgs == 0)
+ return;
+
+ for (i = 0; i < ppt->msi.num_msgs; i++) {
+ rid = ppt->msi.startrid + i;
+ res = ppt->msi.res[i];
+ cookie = ppt->msi.cookie[i];
+
+ if (cookie != NULL)
+ bus_teardown_intr(ppt->dev, res, cookie);
+
+ if (res != NULL)
+ bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
+
+ ppt->msi.res[i] = NULL;
+ ppt->msi.cookie[i] = NULL;
+ }
+
+ if (ppt->msi.startrid == 1)
+ pci_release_msi(ppt->dev);
+
+ ppt->msi.num_msgs = 0;
+}
+
+int
+ppt_assign_device(struct vm *vm, int bus, int slot, int func)
+{
+ struct pptdev *ppt;
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt != NULL) {
+ /*
+ * If this device is owned by a different VM then we
+ * cannot change its owner.
+ */
+ if (ppt->vm != NULL && ppt->vm != vm)
+ return (EBUSY);
+
+ ppt->vm = vm;
+ iommu_add_device(vm_iommu_domain(vm), bus, slot, func);
+ return (0);
+ }
+ return (ENOENT);
+}
+
+int
+ppt_unassign_device(struct vm *vm, int bus, int slot, int func)
+{
+ struct pptdev *ppt;
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt != NULL) {
+ /*
+ * If this device is not owned by this 'vm' then bail out.
+ */
+ if (ppt->vm != vm)
+ return (EBUSY);
+ ppt_unmap_mmio(vm, ppt);
+ ppt_teardown_msi(ppt);
+ iommu_remove_device(vm_iommu_domain(vm), bus, slot, func);
+ ppt->vm = NULL;
+ return (0);
+ }
+ return (ENOENT);
+}
+
+int
+ppt_unassign_all(struct vm *vm)
+{
+ int i, bus, slot, func;
+ device_t dev;
+
+ for (i = 0; i < num_pptdevs; i++) {
+ if (pptdevs[i].vm == vm) {
+ dev = pptdevs[i].dev;
+ bus = pci_get_bus(dev);
+ slot = pci_get_slot(dev);
+ func = pci_get_function(dev);
+ ppt_unassign_device(vm, bus, slot, func);
+ }
+ }
+
+ return (0);
+}
+
+int
+ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
+ vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
+{
+ int i, error;
+ struct vm_memory_segment *seg;
+ struct pptdev *ppt;
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt != NULL) {
+ if (ppt->vm != vm)
+ return (EBUSY);
+
+ for (i = 0; i < MAX_MMIOSEGS; i++) {
+ seg = &ppt->mmio[i];
+ if (seg->len == 0) {
+ error = vm_map_mmio(vm, gpa, len, hpa);
+ if (error == 0) {
+ seg->gpa = gpa;
+ seg->len = len;
+ seg->hpa = hpa;
+ }
+ return (error);
+ }
+ }
+ return (ENOSPC);
+ }
+ return (ENOENT);
+}
+
+static int
+pptintr(void *arg)
+{
+ int vec;
+ struct pptdev *ppt;
+ struct pptintr_arg *pptarg;
+
+ pptarg = arg;
+ ppt = pptarg->pptdev;
+ vec = ppt->msi.vector + pptarg->msg;
+
+ if (ppt->vm != NULL)
+ (void) lapic_set_intr(ppt->vm, ppt->msi.vcpu, vec);
+ else {
+ /*
+ * XXX
+ * This is not expected to happen - panic?
+ */
+ }
+
+ /*
+ * For legacy interrupts give other filters a chance in case
+ * the interrupt was not generated by the passthrough device.
+ */
+ if (ppt->msi.startrid == 0)
+ return (FILTER_STRAY);
+ else
+ return (FILTER_HANDLED);
+}
+
+/*
+ * XXX
+ * When we try to free the MSI resource the kernel will bind the thread to
+ * the host cpu that was originally handling the MSI. The function freeing the
+ * MSI vector (apic_free_vector()) will panic the kernel if the thread
+ * is already bound to a cpu.
+ *
+ * So, we temporarily unbind the vcpu thread before freeing the MSI resource.
+ */
+static void
+PPT_TEARDOWN_MSI(struct vm *vm, int vcpu, struct pptdev *ppt)
+{
+ int pincpu = -1;
+
+ vm_get_pinning(vm, vcpu, &pincpu);
+
+ if (pincpu >= 0)
+ vm_set_pinning(vm, vcpu, -1);
+
+ ppt_teardown_msi(ppt);
+
+ if (pincpu >= 0)
+ vm_set_pinning(vm, vcpu, pincpu);
+}
+
+int
+ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
+ int destcpu, int vector, int numvec)
+{
+ int i, rid, flags;
+ int msi_count, startrid, error, tmp;
+ struct pptdev *ppt;
+
+ if ((destcpu >= VM_MAXCPU || destcpu < 0) ||
+ (vector < 0 || vector > 255) ||
+ (numvec < 0 || numvec > MAX_MSIMSGS))
+ return (EINVAL);
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt == NULL)
+ return (ENOENT);
+ if (ppt->vm != vm) /* Make sure we own this device */
+ return (EBUSY);
+
+ /* Free any allocated resources */
+ PPT_TEARDOWN_MSI(vm, vcpu, ppt);
+
+ if (numvec == 0) /* nothing more to do */
+ return (0);
+
+ flags = RF_ACTIVE;
+ msi_count = pci_msi_count(ppt->dev);
+ if (msi_count == 0) {
+ startrid = 0; /* legacy interrupt */
+ msi_count = 1;
+ flags |= RF_SHAREABLE;
+ } else
+ startrid = 1; /* MSI */
+
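+	/*
+	 * Note: SYS_RES_IRQ rid 0 is the legacy INTx interrupt while MSI
+	 * vectors are allocated starting at rid 1, so 'startrid' doubles as
+	 * the legacy-vs-MSI indicator here and in ppt_teardown_msi().
+	 */
+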
+ /*
+ * The device must be capable of supporting the number of vectors
+ * the guest wants to allocate.
+ */
+ if (numvec > msi_count)
+ return (EINVAL);
+
+ /*
+ * Make sure that we can allocate all the MSI vectors that are needed
+ * by the guest.
+ */
+ if (startrid == 1) {
+ tmp = numvec;
+ error = pci_alloc_msi(ppt->dev, &tmp);
+ if (error)
+ return (error);
+ else if (tmp != numvec) {
+ pci_release_msi(ppt->dev);
+ return (ENOSPC);
+ } else {
+ /* success */
+ }
+ }
+
+ ppt->msi.vector = vector;
+ ppt->msi.vcpu = destcpu;
+ ppt->msi.startrid = startrid;
+
+ /*
+ * Allocate the irq resource and attach it to the interrupt handler.
+ */
+ for (i = 0; i < numvec; i++) {
+ ppt->msi.num_msgs = i + 1;
+ ppt->msi.cookie[i] = NULL;
+
+ rid = startrid + i;
+ ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
+ &rid, flags);
+ if (ppt->msi.res[i] == NULL)
+ break;
+
+ ppt->msi.arg[i].pptdev = ppt;
+ ppt->msi.arg[i].msg = i;
+
+ error = bus_setup_intr(ppt->dev, ppt->msi.res[i],
+ INTR_TYPE_NET | INTR_MPSAFE | INTR_FAST,
+ pptintr, NULL, &ppt->msi.arg[i],
+ &ppt->msi.cookie[i]);
+ if (error != 0)
+ break;
+ }
+
+ if (i < numvec) {
+ PPT_TEARDOWN_MSI(vm, vcpu, ppt);
+ return (ENXIO);
+ }
+
+ return (0);
+}
diff --git a/sys/amd64/vmm/io/ppt.h b/sys/amd64/vmm/io/ppt.h
new file mode 100644
index 000000000000..95f3ad08b44c
--- /dev/null
+++ b/sys/amd64/vmm/io/ppt.h
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IO_PPT_H_
+#define _IO_PPT_H_
+
+int ppt_assign_device(struct vm *vm, int bus, int slot, int func);
+int ppt_unassign_device(struct vm *vm, int bus, int slot, int func);
+int ppt_unassign_all(struct vm *vm);
+int ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
+ vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
+int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
+ int destcpu, int vector, int numvec);
+
+#endif
diff --git a/sys/amd64/vmm/io/vdev.c b/sys/amd64/vmm/io/vdev.c
new file mode 100644
index 000000000000..cd6c5d1b39c9
--- /dev/null
+++ b/sys/amd64/vmm/io/vdev.c
@@ -0,0 +1,270 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+
+#include "vdev.h"
+
+struct vdev {
+ SLIST_ENTRY(vdev) entry;
+ struct vdev_ops *ops;
+ void *dev;
+};
+static SLIST_HEAD(, vdev) vdev_head;
+static int vdev_count;
+
+struct vdev_region {
+ SLIST_ENTRY(vdev_region) entry;
+ struct vdev_ops *ops;
+ void *dev;
+ struct io_region *io;
+};
+static SLIST_HEAD(, vdev_region) region_head;
+static int region_count;
+
+static MALLOC_DEFINE(M_VDEV, "vdev", "vdev");
+
+#define VDEV_INIT (0)
+#define VDEV_RESET (1)
+#define VDEV_HALT (2)
+
+// static const char* vdev_event_str[] = {"VDEV_INIT", "VDEV_RESET", "VDEV_HALT"};
+
+static int
+vdev_system_event(int event)
+{
+ struct vdev *vd;
+ int rc;
+
+ // TODO: locking
+ SLIST_FOREACH(vd, &vdev_head, entry) {
+ // printf("%s : %s Device %s\n", __func__, vdev_event_str[event], vd->ops->name);
+ switch (event) {
+ case VDEV_INIT:
+ rc = vd->ops->init(vd->dev);
+ break;
+ case VDEV_RESET:
+ rc = vd->ops->reset(vd->dev);
+ break;
+ case VDEV_HALT:
+ rc = vd->ops->halt(vd->dev);
+ break;
+		default:
+			rc = 0;
+			break;
+		}
+		if (rc) {
+			printf("vdev %s failed to handle event %d rc=%d\n",
+			    vd->ops->name, event, rc);
+ return rc;
+ }
+ }
+ return 0;
+}
+
+int
+vdev_init(void)
+{
+ return vdev_system_event(VDEV_INIT);
+}
+
+int
+vdev_reset(void)
+{
+ return vdev_system_event(VDEV_RESET);
+}
+
+int
+vdev_halt(void)
+{
+ return vdev_system_event(VDEV_HALT);
+}
+
+void
+vdev_vm_init(void)
+{
+ SLIST_INIT(&vdev_head);
+ vdev_count = 0;
+
+ SLIST_INIT(&region_head);
+ region_count = 0;
+}
+
+void
+vdev_vm_cleanup(void)
+{
+ struct vdev *vd;
+
+ // TODO: locking
+ while (!SLIST_EMPTY(&vdev_head)) {
+ vd = SLIST_FIRST(&vdev_head);
+ SLIST_REMOVE_HEAD(&vdev_head, entry);
+ free(vd, M_VDEV);
+ vdev_count--;
+ }
+}
+
+int
+vdev_register(struct vdev_ops *ops, void *dev)
+{
+ struct vdev *vd;
+ vd = malloc(sizeof(*vd), M_VDEV, M_WAITOK | M_ZERO);
+ vd->ops = ops;
+ vd->dev = dev;
+
+ // TODO: locking
+ SLIST_INSERT_HEAD(&vdev_head, vd, entry);
+ vdev_count++;
+ return 0;
+}
+
+void
+vdev_unregister(void *dev)
+{
+ struct vdev *vd, *found;
+
+ found = NULL;
+ // TODO: locking
+ SLIST_FOREACH(vd, &vdev_head, entry) {
+ if (vd->dev == dev) {
+ found = vd;
+ }
+ }
+
+ if (found) {
+ SLIST_REMOVE(&vdev_head, found, vdev, entry);
+ free(found, M_VDEV);
+ }
+}
+
+#define IN_RANGE(val, start, end) \
+ (((val) >= (start)) && ((val) < (end)))
+
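+/*
+ * A region matches when the requested [base, base + len) interval lies
+ * entirely within it; the end check uses region_end + 1 because the
+ * upper bound passed to IN_RANGE is exclusive.
+ */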
+static struct vdev_region*
+vdev_find_region(struct io_region *io, void *dev)
+{
+ struct vdev_region *region, *found;
+ uint64_t region_base;
+ uint64_t region_end;
+
+ found = NULL;
+
+ // TODO: locking
+ // FIXME: we should verify we are in the context the current
+ // vcpu here as well.
+ SLIST_FOREACH(region, &region_head, entry) {
+ region_base = region->io->base;
+ region_end = region_base + region->io->len;
+ if (IN_RANGE(io->base, region_base, region_end) &&
+ IN_RANGE(io->base+io->len, region_base, region_end+1) &&
+ (dev && dev == region->dev)) {
+ found = region;
+ break;
+ }
+ }
+ return found;
+}
+
+int
+vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io)
+{
+ struct vdev_region *region;
+
+ region = vdev_find_region(io, dev);
+ if (region) {
+ return -EEXIST;
+ }
+
+ region = malloc(sizeof(*region), M_VDEV, M_WAITOK | M_ZERO);
+ region->io = io;
+ region->ops = ops;
+ region->dev = dev;
+
+ // TODO: locking
+ SLIST_INSERT_HEAD(&region_head, region, entry);
+ region_count++;
+
+ return 0;
+}
+
+void
+vdev_unregister_region(void *dev, struct io_region *io)
+{
+ struct vdev_region *region;
+
+ region = vdev_find_region(io, dev);
+
+ if (region) {
+ SLIST_REMOVE(&region_head, region, vdev_region, entry);
+ free(region, M_VDEV);
+ region_count--;
+ }
+}
+
+static int
+vdev_memrw(uint64_t gpa, opsize_t size, uint64_t *data, int read)
+{
+ struct vdev_region *region;
+ struct io_region io;
+ region_attr_t attr;
+ int rc;
+
+ io.base = gpa;
+ io.len = size;
+
+ region = vdev_find_region(&io, NULL);
+ if (!region)
+ return -EINVAL;
+
+ attr = (read) ? MMIO_READ : MMIO_WRITE;
+ if (!(region->io->attr & attr))
+ return -EPERM;
+
+ if (read)
+ rc = region->ops->memread(region->dev, gpa, size, data);
+ else
+ rc = region->ops->memwrite(region->dev, gpa, size, *data);
+
+ return rc;
+}
+
+int
+vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data)
+{
+ return vdev_memrw(gpa, size, data, 1);
+}
+
+int
+vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data)
+{
+ return vdev_memrw(gpa, size, &data, 0);
+}
diff --git a/sys/amd64/vmm/io/vdev.h b/sys/amd64/vmm/io/vdev.h
new file mode 100644
index 000000000000..6feeba87b7c0
--- /dev/null
+++ b/sys/amd64/vmm/io/vdev.h
@@ -0,0 +1,84 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VDEV_H_
+#define _VDEV_H_
+
+typedef enum {
+ BYTE = 1,
+ WORD = 2,
+ DWORD = 4,
+ QWORD = 8,
+} opsize_t;
+
+typedef enum {
+ MMIO_READ = 1,
+ MMIO_WRITE = 2,
+} region_attr_t;
+
+struct io_region {
+ uint64_t base;
+ uint64_t len;
+ region_attr_t attr;
+ int vcpu;
+};
+
+typedef int (*vdev_init_t)(void* dev);
+typedef int (*vdev_reset_t)(void* dev);
+typedef int (*vdev_halt_t)(void* dev);
+typedef int (*vdev_memread_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t *data);
+typedef int (*vdev_memwrite_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t data);
+
+
+struct vdev_ops {
+ const char *name;
+ vdev_init_t init;
+ vdev_reset_t reset;
+ vdev_halt_t halt;
+ vdev_memread_t memread;
+ vdev_memwrite_t memwrite;
+};
+
+
+void vdev_vm_init(void);
+void vdev_vm_cleanup(void);
+
+int vdev_register(struct vdev_ops *ops, void *dev);
+void vdev_unregister(void *dev);
+
+int vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io);
+void vdev_unregister_region(void *dev, struct io_region *io);
+
+int vdev_init(void);
+int vdev_reset(void);
+int vdev_halt(void);
+int vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data);
+int vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data);
+
+#endif /* _VDEV_H_ */
+
diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c
new file mode 100644
index 000000000000..a21addfd5d32
--- /dev/null
+++ b/sys/amd64/vmm/io/vlapic.c
@@ -0,0 +1,812 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+
+#include <machine/clock.h>
+#include <machine/apicreg.h>
+
+#include <machine/vmm.h>
+
+#include "vmm_lapic.h"
+#include "vmm_ktr.h"
+#include "vdev.h"
+#include "vlapic.h"
+
+#define VLAPIC_CTR0(vlapic, format) \
+ VMM_CTR0((vlapic)->vm, (vlapic)->vcpuid, format)
+
+#define VLAPIC_CTR1(vlapic, format, p1) \
+ VMM_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1)
+
+#define VLAPIC_CTR_IRR(vlapic, msg) \
+do { \
+ uint32_t *irrptr = &(vlapic)->apic.irr0; \
+ irrptr[0] = irrptr[0]; /* silence compiler */ \
+ VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", irrptr[7 << 2]); \
+} while (0)
+
+#define VLAPIC_CTR_ISR(vlapic, msg) \
+do { \
+ uint32_t *isrptr = &(vlapic)->apic.isr0; \
+ isrptr[0] = isrptr[0]; /* silence compiler */ \
+ VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \
+} while (0)
+
+static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
+
+#define PRIO(x) ((x) >> 4)
+
+#define VLAPIC_VERSION (16)
+#define VLAPIC_MAXLVT_ENTRIES (5)
+
+struct vlapic {
+ struct vm *vm;
+ int vcpuid;
+
+ struct io_region *mmio;
+ struct vdev_ops *ops;
+ struct LAPIC apic;
+
+ int esr_update;
+
+ int divisor;
+ int ccr_ticks;
+
+ /*
+ * The 'isrvec_stk' is a stack of vectors injected by the local apic.
+ * A vector is popped from the stack when the processor does an EOI.
+ * The vector on the top of the stack is used to compute the
+ * Processor Priority in conjunction with the TPR.
+ */
+ uint8_t isrvec_stk[ISRVEC_STK_SIZE];
+ int isrvec_stk_top;
+};
+
+static void
+vlapic_mask_lvts(uint32_t *lvts, int num_lvt)
+{
+ int i;
+ for (i = 0; i < num_lvt; i++) {
+ *lvts |= APIC_LVT_M;
+ lvts += 4;
+ }
+}
+
+#if 0
+static inline void
+vlapic_dump_lvt(uint32_t offset, uint32_t *lvt)
+{
+ printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset,
+ *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS,
+ *lvt & APIC_LVTT_M);
+}
+#endif
+
+static uint64_t
+vlapic_get_ccr(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ return lapic->ccr_timer;
+}
+
+static void
+vlapic_update_errors(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ lapic->esr = 0; // XXX
+}
+
+static void
+vlapic_init_ipi(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ lapic->version = VLAPIC_VERSION;
+	lapic->version |= (VLAPIC_MAXLVT_ENTRIES << MAXLVTSHIFT);
+ lapic->dfr = 0xffffffff;
+ lapic->svr = APIC_SVR_VECTOR;
+ vlapic_mask_lvts(&lapic->lvt_timer, VLAPIC_MAXLVT_ENTRIES+1);
+}
+
+static int
+vlapic_op_reset(void* dev)
+{
+ struct vlapic *vlapic = (struct vlapic*)dev;
+ struct LAPIC *lapic = &vlapic->apic;
+
+ memset(lapic, 0, sizeof(*lapic));
+ lapic->id = vlapic->vcpuid << 24;
+ lapic->apr = vlapic->vcpuid;
+ vlapic_init_ipi(vlapic);
+
+ return 0;
+}
+
+static int
+vlapic_op_init(void* dev)
+{
+ struct vlapic *vlapic = (struct vlapic*)dev;
+ vdev_register_region(vlapic->ops, vlapic, vlapic->mmio);
+ return vlapic_op_reset(dev);
+}
+
+static int
+vlapic_op_halt(void* dev)
+{
+ struct vlapic *vlapic = (struct vlapic*)dev;
+ vdev_unregister_region(vlapic, vlapic->mmio);
+ return 0;
+}
+
+void
+vlapic_set_intr_ready(struct vlapic *vlapic, int vector)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ uint32_t *irrptr;
+ int idx;
+
+ if (vector < 0 || vector >= 256)
+ panic("vlapic_set_intr_ready: invalid vector %d\n", vector);
+
+ idx = (vector / 32) * 4;
+ irrptr = &lapic->irr0;
+ atomic_set_int(&irrptr[idx], 1 << (vector % 32));
+ VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready");
+}
+
+#define VLAPIC_BUS_FREQ tsc_freq
+#define	VLAPIC_DCR(x)	((((x)->dcr_timer & 0x8) >> 1) | ((x)->dcr_timer & 0x3))
+
+static int
+vlapic_timer_divisor(uint32_t dcr)
+{
+ switch (dcr & 0xB) {
+ case APIC_TDCR_2:
+ return (2);
+ case APIC_TDCR_4:
+ return (4);
+ case APIC_TDCR_8:
+ return (8);
+ case APIC_TDCR_16:
+ return (16);
+ case APIC_TDCR_32:
+ return (32);
+ case APIC_TDCR_64:
+ return (64);
+ case APIC_TDCR_128:
+ return (128);
+ default:
+ panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr);
+ }
+}
+
+static void
+vlapic_start_timer(struct vlapic *vlapic, uint32_t elapsed)
+{
+ uint32_t icr_timer;
+
+ icr_timer = vlapic->apic.icr_timer;
+
+ vlapic->ccr_ticks = ticks;
+ if (elapsed < icr_timer)
+ vlapic->apic.ccr_timer = icr_timer - elapsed;
+ else {
+ /*
+ * This can happen when the guest is trying to run its local
+		 * apic timer at a higher rate than the 'hz' setting on the host.
+ *
+ * We deal with this by running the guest local apic timer
+ * at the rate of the host's 'hz' setting.
+ */
+ vlapic->apic.ccr_timer = 0;
+ }
+}
+
+static __inline uint32_t *
+vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ int i;
+
+ if (offset < APIC_OFFSET_TIMER_LVT || offset > APIC_OFFSET_ERROR_LVT) {
+ panic("vlapic_get_lvt: invalid LVT\n");
+ }
+ i = (offset - APIC_OFFSET_TIMER_LVT) >> 2;
+ return ((&lapic->lvt_timer) + i);;
+}
+
+#if 1
+static void
+dump_isrvec_stk(struct vlapic *vlapic)
+{
+ int i;
+ uint32_t *isrptr;
+
+ isrptr = &vlapic->apic.isr0;
+ for (i = 0; i < 8; i++)
+ printf("ISR%d 0x%08x\n", i, isrptr[i * 4]);
+
+ for (i = 0; i <= vlapic->isrvec_stk_top; i++)
+ printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]);
+}
+#endif
+
+/*
+ * Algorithm adopted from section "Interrupt, Task and Processor Priority"
+ * in Intel Architecture Manual Vol 3a.
+ */
+static void
+vlapic_update_ppr(struct vlapic *vlapic)
+{
+ int isrvec, tpr, ppr;
+
+ /*
+ * Note that the value on the stack at index 0 is always 0.
+ *
+ * This is a placeholder for the value of ISRV when none of the
+ * bits is set in the ISRx registers.
+ */
+ isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top];
+ tpr = vlapic->apic.tpr;
+
+#if 1
+ {
+ int i, lastprio, curprio, vector, idx;
+ uint32_t *isrptr;
+
+ if (vlapic->isrvec_stk_top == 0 && isrvec != 0)
+ panic("isrvec_stk is corrupted: %d", isrvec);
+
+ /*
+ * Make sure that the priority of the nested interrupts is
+ * always increasing.
+ */
+ lastprio = -1;
+ for (i = 1; i <= vlapic->isrvec_stk_top; i++) {
+ curprio = PRIO(vlapic->isrvec_stk[i]);
+ if (curprio <= lastprio) {
+ dump_isrvec_stk(vlapic);
+ panic("isrvec_stk does not satisfy invariant");
+ }
+ lastprio = curprio;
+ }
+
+ /*
+ * Make sure that each bit set in the ISRx registers has a
+ * corresponding entry on the isrvec stack.
+ */
+ i = 1;
+ isrptr = &vlapic->apic.isr0;
+ for (vector = 0; vector < 256; vector++) {
+ idx = (vector / 32) * 4;
+ if (isrptr[idx] & (1 << (vector % 32))) {
+ if (i > vlapic->isrvec_stk_top ||
+ vlapic->isrvec_stk[i] != vector) {
+ dump_isrvec_stk(vlapic);
+ panic("ISR and isrvec_stk out of sync");
+ }
+ i++;
+ }
+ }
+ }
+#endif
+
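+	/*
+	 * If the TPR's priority class is at least that of the in-service
+	 * vector then PPR = TPR; otherwise PPR is the in-service vector's
+	 * priority class with the sub-class bits cleared.
+	 */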
+ if (PRIO(tpr) >= PRIO(isrvec))
+ ppr = tpr;
+ else
+ ppr = isrvec & 0xf0;
+
+ vlapic->apic.ppr = ppr;
+ VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr);
+}
+
+static void
+vlapic_process_eoi(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ uint32_t *isrptr;
+ int i, idx, bitpos;
+
+ isrptr = &lapic->isr0;
+
+ /*
+	 * The x86 architecture reserves the first 32 vectors for use
+ * by the processor.
+ */
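+	/*
+	 * Scan downward from the highest vector group: the EOI applies to
+	 * the highest-priority vector currently marked in-service.
+	 */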
+ for (i = 7; i > 0; i--) {
+ idx = i * 4;
+ bitpos = fls(isrptr[idx]);
+ if (bitpos != 0) {
+ if (vlapic->isrvec_stk_top <= 0) {
+ panic("invalid vlapic isrvec_stk_top %d",
+ vlapic->isrvec_stk_top);
+ }
+ isrptr[idx] &= ~(1 << (bitpos - 1));
+ VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi");
+ vlapic->isrvec_stk_top--;
+ vlapic_update_ppr(vlapic);
+ return;
+ }
+ }
+}
+
+static __inline int
+vlapic_get_lvt_field(uint32_t *lvt, uint32_t mask)
+{
+ return (*lvt & mask);
+}
+
+static __inline int
+vlapic_periodic_timer(struct vlapic *vlapic)
+{
+ uint32_t *lvt;
+
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
+
+ return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC));
+}
+
+static void
+vlapic_fire_timer(struct vlapic *vlapic)
+{
+ int vector;
+ uint32_t *lvt;
+
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
+
+ if (!vlapic_get_lvt_field(lvt, APIC_LVTT_M)) {
+		vector = vlapic_get_lvt_field(lvt, APIC_LVTT_VECTOR);
+ vlapic_set_intr_ready(vlapic, vector);
+ }
+}
+
+static int
+lapic_process_icr(struct vlapic *vlapic, uint64_t icrval)
+{
+ int i;
+ cpumask_t dmask, thiscpumask;
+ uint32_t dest, vec, mode;
+
+ thiscpumask = vcpu_mask(vlapic->vcpuid);
+
+ dmask = 0;
+ dest = icrval >> 32;
+ vec = icrval & APIC_VECTOR_MASK;
+ mode = icrval & APIC_DELMODE_MASK;
+
+ if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) {
+ switch (icrval & APIC_DEST_MASK) {
+ case APIC_DEST_DESTFLD:
+ dmask = vcpu_mask(dest);
+ break;
+ case APIC_DEST_SELF:
+ dmask = thiscpumask;
+ break;
+ case APIC_DEST_ALLISELF:
+ dmask = vm_active_cpus(vlapic->vm);
+ break;
+ case APIC_DEST_ALLESELF:
+ dmask = vm_active_cpus(vlapic->vm) & ~thiscpumask;
+ break;
+ }
+
+ for (i = 0; i < VM_MAXCPU; i++) {
+ if (dmask & vcpu_mask(i)) {
+ if (mode == APIC_DELMODE_FIXED)
+ lapic_set_intr(vlapic->vm, i, vec);
+ else
+ vm_inject_nmi(vlapic->vm, i);
+ }
+ }
+
+ return (0); /* handled completely in the kernel */
+ }
+
+ /*
+ * XXX this assumes that the startup IPI always succeeds
+ */
+ if (mode == APIC_DELMODE_STARTUP)
+ vm_activate_cpu(vlapic->vm, dest);
+
+ /*
+ * This will cause a return to userland.
+ */
+ return (1);
+}
+
+int
+vlapic_pending_intr(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ int idx, i, bitpos, vector;
+ uint32_t *irrptr, val;
+
+ irrptr = &lapic->irr0;
+
+ /*
+	 * The x86 architecture reserves the first 32 vectors for use
+ * by the processor.
+ */
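+	/*
+	 * Scan downward from the highest vector group and report the
+	 * highest pending vector, but only if its priority class exceeds
+	 * the current processor priority (PPR).
+	 */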
+ for (i = 7; i > 0; i--) {
+ idx = i * 4;
+ val = atomic_load_acq_int(&irrptr[idx]);
+ bitpos = fls(val);
+ if (bitpos != 0) {
+ vector = i * 32 + (bitpos - 1);
+ if (PRIO(vector) > PRIO(lapic->ppr)) {
+ VLAPIC_CTR1(vlapic, "pending intr %d", vector);
+ return (vector);
+ } else
+ break;
+ }
+ }
+ VLAPIC_CTR0(vlapic, "no pending intr");
+ return (-1);
+}
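
The fls()-driven scan above converts a set bit in an IRR word back into a vector number via vector = i * 32 + (bitpos - 1). The same arithmetic in a userland sketch, using fls(3) from <strings.h>:

    #include <stdio.h>
    #include <strings.h>        /* fls(3) */

    int
    main(void)
    {
            int i = 5;                      /* IRR word 5 covers vectors 160-191 */
            unsigned int val = 0x00010000;  /* bit 16 set */
            int bitpos = fls(val);          /* 17: 1-based index of highest bit */

            if (bitpos != 0)
                    printf("pending vector %d\n", i * 32 + (bitpos - 1)); /* 176 */
            return (0);
    }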
+
+void
+vlapic_intr_accepted(struct vlapic *vlapic, int vector)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ uint32_t *irrptr, *isrptr;
+ int idx, stk_top;
+
+ /*
+ * clear the ready bit for vector being accepted in irr
+ * and set the vector as in service in isr.
+ */
+ idx = (vector / 32) * 4;
+
+ irrptr = &lapic->irr0;
+ atomic_clear_int(&irrptr[idx], 1 << (vector % 32));
+ VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted");
+
+ isrptr = &lapic->isr0;
+ isrptr[idx] |= 1 << (vector % 32);
+ VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted");
+
+ /*
+ * Update the PPR
+ */
+ vlapic->isrvec_stk_top++;
+
+ stk_top = vlapic->isrvec_stk_top;
+ if (stk_top >= ISRVEC_STK_SIZE)
+ panic("isrvec_stk_top overflow %d", stk_top);
+
+ vlapic->isrvec_stk[stk_top] = vector;
+ vlapic_update_ppr(vlapic);
+}
+
+int
+vlapic_op_mem_read(void *dev, uint64_t gpa, opsize_t size, uint64_t *data)
+{
+	struct vlapic *vlapic = (struct vlapic *)dev;
+	struct LAPIC *lapic = &vlapic->apic;
+	uint64_t offset = gpa & (PAGE_SIZE - 1); /* offset into the register page */
+ uint32_t *reg;
+ int i;
+
+ if (offset > sizeof(*lapic)) {
+ *data = 0;
+ return 0;
+ }
+
+ offset &= ~3;
+	switch (offset) {
+ case APIC_OFFSET_ID:
+ *data = lapic->id;
+ break;
+ case APIC_OFFSET_VER:
+ *data = lapic->version;
+ break;
+ case APIC_OFFSET_TPR:
+ *data = lapic->tpr;
+ break;
+ case APIC_OFFSET_APR:
+ *data = lapic->apr;
+ break;
+ case APIC_OFFSET_PPR:
+ *data = lapic->ppr;
+ break;
+ case APIC_OFFSET_EOI:
+ *data = lapic->eoi;
+ break;
+ case APIC_OFFSET_LDR:
+ *data = lapic->ldr;
+ break;
+ case APIC_OFFSET_DFR:
+ *data = lapic->dfr;
+ break;
+ case APIC_OFFSET_SVR:
+ *data = lapic->svr;
+ break;
+ case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
+ i = (offset - APIC_OFFSET_ISR0) >> 2;
+ reg = &lapic->isr0;
+ *data = *(reg + i);
+ break;
+ case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
+ i = (offset - APIC_OFFSET_TMR0) >> 2;
+ reg = &lapic->tmr0;
+ *data = *(reg + i);
+ break;
+ case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
+ i = (offset - APIC_OFFSET_IRR0) >> 2;
+ reg = &lapic->irr0;
+ *data = atomic_load_acq_int(reg + i);
+ break;
+ case APIC_OFFSET_ESR:
+ *data = lapic->esr;
+ break;
+ case APIC_OFFSET_ICR_LOW:
+ *data = lapic->icr_lo;
+ break;
+ case APIC_OFFSET_ICR_HI:
+ *data = lapic->icr_hi;
+ break;
+ case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
+ reg = vlapic_get_lvt(vlapic, offset);
+ *data = *(reg);
+ break;
+ case APIC_OFFSET_ICR:
+ *data = lapic->icr_timer;
+ break;
+ case APIC_OFFSET_CCR:
+ *data = vlapic_get_ccr(vlapic);
+ break;
+ case APIC_OFFSET_DCR:
+ *data = lapic->dcr_timer;
+ break;
+ case APIC_OFFSET_RRR:
+ default:
+ *data = 0;
+ break;
+ }
+ return 0;
+}
+
+int
+vlapic_op_mem_write(void *dev, uint64_t gpa, opsize_t size, uint64_t data)
+{
+	struct vlapic *vlapic = (struct vlapic *)dev;
+	struct LAPIC *lapic = &vlapic->apic;
+	uint64_t offset = gpa & (PAGE_SIZE - 1); /* offset into the register page */
+ uint32_t *reg;
+ int retval;
+
+ if (offset > sizeof(*lapic)) {
+ return 0;
+ }
+
+ retval = 0;
+ offset &= ~3;
+	switch (offset) {
+ case APIC_OFFSET_ID:
+ lapic->id = data;
+ break;
+ case APIC_OFFSET_TPR:
+ lapic->tpr = data & 0xff;
+ vlapic_update_ppr(vlapic);
+ break;
+ case APIC_OFFSET_EOI:
+ vlapic_process_eoi(vlapic);
+ break;
+ case APIC_OFFSET_LDR:
+ break;
+ case APIC_OFFSET_DFR:
+ break;
+ case APIC_OFFSET_SVR:
+ lapic->svr = data;
+ break;
+ case APIC_OFFSET_ICR_LOW:
+ retval = lapic_process_icr(vlapic, data);
+ break;
+ case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
+ reg = vlapic_get_lvt(vlapic, offset);
+ if (!(lapic->svr & APIC_SVR_ENABLE)) {
+ data |= APIC_LVT_M;
+ }
+ *reg = data;
+ // vlapic_dump_lvt(offset, reg);
+ break;
+ case APIC_OFFSET_ICR:
+ lapic->icr_timer = data;
+ vlapic_start_timer(vlapic, 0);
+ break;
+
+ case APIC_OFFSET_DCR:
+ lapic->dcr_timer = data;
+ vlapic->divisor = vlapic_timer_divisor(data);
+ break;
+
+ case APIC_OFFSET_ESR:
+ vlapic_update_errors(vlapic);
+ break;
+ case APIC_OFFSET_VER:
+ case APIC_OFFSET_APR:
+ case APIC_OFFSET_PPR:
+ case APIC_OFFSET_RRR:
+ case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
+ case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
+ case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
+ case APIC_OFFSET_CCR:
+ default:
+ // Read only.
+ break;
+ }
+
+ return (retval);
+}
+
+void
+vlapic_timer_tick(struct vlapic *vlapic)
+{
+ int curticks, delta, periodic;
+ uint32_t ccr;
+ uint32_t decrement, remainder;
+
+ curticks = ticks;
+
+ /* Common case */
+ delta = curticks - vlapic->ccr_ticks;
+ if (delta == 0)
+ return;
+
+ /* Local APIC timer is disabled */
+ if (vlapic->apic.icr_timer == 0)
+ return;
+
+ /* One-shot mode and timer has already counted down to zero */
+ periodic = vlapic_periodic_timer(vlapic);
+ if (!periodic && vlapic->apic.ccr_timer == 0)
+ return;
+
+	/*
+ * The 'curticks' and 'ccr_ticks' are out of sync by more than
+ * 2^31 ticks. We deal with this by restarting the timer.
+ */
+ if (delta < 0) {
+ vlapic_start_timer(vlapic, 0);
+ return;
+ }
+
+ ccr = vlapic->apic.ccr_timer;
+ decrement = (VLAPIC_BUS_FREQ / vlapic->divisor) / hz;
+ while (delta-- > 0) {
+ if (ccr <= decrement) {
+ remainder = decrement - ccr;
+ vlapic_fire_timer(vlapic);
+ if (periodic) {
+ vlapic_start_timer(vlapic, remainder);
+ ccr = vlapic->apic.ccr_timer;
+ } else {
+ /*
+ * One-shot timer has counted down to zero.
+ */
+ ccr = 0;
+ break;
+ }
+ } else
+ ccr -= decrement;
+ }
+
+ vlapic->ccr_ticks = curticks;
+ vlapic->apic.ccr_timer = ccr;
+}
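
The countdown above advances by (VLAPIC_BUS_FREQ / divisor) / hz counts per elapsed host tick, then fires and either rearms with the remainder carried over (periodic mode) or stops (one-shot mode). A worked example with illustrative numbers only; the driver's actual VLAPIC_BUS_FREQ is defined earlier in this file:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
            /* illustrative constants, not the driver's actual values */
            uint32_t bus_freq = 128 * 1024 * 1024;  /* timer counts per second */
            uint32_t divisor = 2;                   /* derived from the DCR */
            int hz = 100;                           /* host ticks per second */

            uint32_t decrement = (bus_freq / divisor) / hz;
            printf("CCR drops by %u per tick\n", decrement); /* 671088 */
            return (0);
    }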
+
+struct vdev_ops vlapic_dev_ops = {
+ .name = "vlapic",
+ .init = vlapic_op_init,
+ .reset = vlapic_op_reset,
+ .halt = vlapic_op_halt,
+ .memread = vlapic_op_mem_read,
+ .memwrite = vlapic_op_mem_write,
+};
+
+static struct io_region vlapic_mmio[VM_MAXCPU];
+
+struct vlapic *
+vlapic_init(struct vm *vm, int vcpuid)
+{
+ struct vlapic *vlapic;
+
+ vlapic = malloc(sizeof(struct vlapic), M_VLAPIC, M_WAITOK | M_ZERO);
+ vlapic->vm = vm;
+ vlapic->vcpuid = vcpuid;
+ vlapic->ops = &vlapic_dev_ops;
+
+ vlapic->mmio = vlapic_mmio + vcpuid;
+ vlapic->mmio->base = DEFAULT_APIC_BASE;
+ vlapic->mmio->len = PAGE_SIZE;
+ vlapic->mmio->attr = MMIO_READ|MMIO_WRITE;
+ vlapic->mmio->vcpu = vcpuid;
+
+ vdev_register(&vlapic_dev_ops, vlapic);
+
+ vlapic_op_init(vlapic);
+
+ return (vlapic);
+}
+
+void
+vlapic_cleanup(struct vlapic *vlapic)
+{
+ vdev_unregister(vlapic);
+ free(vlapic, M_VLAPIC);
+}
diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h
new file mode 100644
index 000000000000..861ea8c9c380
--- /dev/null
+++ b/sys/amd64/vmm/io/vlapic.h
@@ -0,0 +1,105 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VLAPIC_H_
+#define _VLAPIC_H_
+
+#include "vdev.h"
+
+struct vm;
+
+/*
+ * Map of APIC Registers: Offset Description Access
+ */
+#define APIC_OFFSET_ID 0x20 // Local APIC ID R/W
+#define APIC_OFFSET_VER 0x30 // Local APIC Version R
+#define APIC_OFFSET_TPR 0x80 // Task Priority Register R/W
+#define APIC_OFFSET_APR 0x90 // Arbitration Priority Register R
+#define APIC_OFFSET_PPR 0xA0 // Processor Priority Register R
+#define APIC_OFFSET_EOI 0xB0 // EOI Register W
+#define APIC_OFFSET_RRR 0xC0 // Remote read R
+#define APIC_OFFSET_LDR 0xD0 // Logical Destination R/W
+#define APIC_OFFSET_DFR 0xE0 // Destination Format Register 0..27 R; 28..31 R/W
+#define APIC_OFFSET_SVR 0xF0 // Spurious Interrupt Vector Reg. 0..3 R; 4..9 R/W
+#define APIC_OFFSET_ISR0 0x100 // ISR 000-031 R
+#define APIC_OFFSET_ISR1 0x110 // ISR 032-063 R
+#define APIC_OFFSET_ISR2 0x120 // ISR 064-095 R
+#define APIC_OFFSET_ISR3 0x130 // ISR 096-127 R
+#define APIC_OFFSET_ISR4 0x140 // ISR 128-159 R
+#define APIC_OFFSET_ISR5 0x150 // ISR 160-191 R
+#define APIC_OFFSET_ISR6 0x160 // ISR 192-223 R
+#define APIC_OFFSET_ISR7 0x170 // ISR 224-255 R
+#define APIC_OFFSET_TMR0 0x180 // TMR 000-031 R
+#define APIC_OFFSET_TMR1 0x190 // TMR 032-063 R
+#define APIC_OFFSET_TMR2 0x1A0 // TMR 064-095 R
+#define APIC_OFFSET_TMR3 0x1B0 // TMR 096-127 R
+#define APIC_OFFSET_TMR4 0x1C0 // TMR 128-159 R
+#define APIC_OFFSET_TMR5 0x1D0 // TMR 160-191 R
+#define APIC_OFFSET_TMR6 0x1E0 // TMR 192-223 R
+#define APIC_OFFSET_TMR7 0x1F0 // TMR 224-255 R
+#define APIC_OFFSET_IRR0 0x200 // IRR 000-031 R
+#define APIC_OFFSET_IRR1 0x210 // IRR 032-063 R
+#define APIC_OFFSET_IRR2 0x220 // IRR 064-095 R
+#define APIC_OFFSET_IRR3 0x230 // IRR 096-127 R
+#define APIC_OFFSET_IRR4 0x240 // IRR 128-159 R
+#define APIC_OFFSET_IRR5 0x250 // IRR 160-191 R
+#define APIC_OFFSET_IRR6 0x260 // IRR 192-223 R
+#define APIC_OFFSET_IRR7 0x270 // IRR 224-255 R
+#define APIC_OFFSET_ESR 0x280 // Error Status Register R
+#define APIC_OFFSET_ICR_LOW 0x300 // Interrupt Command Reg. (0-31) R/W
+#define APIC_OFFSET_ICR_HI 0x310 // Interrupt Command Reg. (32-63) R/W
+#define APIC_OFFSET_TIMER_LVT 0x320 // Local Vector Table (Timer) R/W
+#define APIC_OFFSET_THERM_LVT 0x330 // Local Vector Table (Thermal) R/W (PIV+)
+#define APIC_OFFSET_PERF_LVT 0x340 // Local Vector Table (Performance) R/W (P6+)
+#define APIC_OFFSET_LINT0_LVT 0x350 // Local Vector Table (LINT0) R/W
+#define APIC_OFFSET_LINT1_LVT 0x360 // Local Vector Table (LINT1) R/W
+#define APIC_OFFSET_ERROR_LVT 0x370 // Local Vector Table (ERROR) R/W
+#define APIC_OFFSET_ICR 0x380 // Initial Count Reg. for Timer R/W
+#define APIC_OFFSET_CCR 0x390 // Current Count of Timer R
+#define APIC_OFFSET_DCR 0x3E0 // Timer Divide Configuration Reg. R/W
+
+/*
+ * 16 priority levels with at most one vector injected per level, plus
+ * an unused slot at the bottom of the stack (index 0).
+ */
+#define ISRVEC_STK_SIZE (16 + 1)
+
+struct vlapic *vlapic_init(struct vm *vm, int vcpuid);
+void vlapic_cleanup(struct vlapic *vlapic);
+
+int vlapic_op_mem_write(void *dev, uint64_t gpa,
+ opsize_t size, uint64_t data);
+
+int vlapic_op_mem_read(void *dev, uint64_t gpa,
+ opsize_t size, uint64_t *data);
+
+int vlapic_pending_intr(struct vlapic *vlapic);
+void vlapic_intr_accepted(struct vlapic *vlapic, int vector);
+void vlapic_set_intr_ready(struct vlapic *vlapic, int vector);
+void vlapic_timer_tick(struct vlapic *vlapic);
+
+#endif /* _VLAPIC_H_ */
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
new file mode 100644
index 000000000000..c93c31e772e8
--- /dev/null
+++ b/sys/amd64/vmm/vmm.c
@@ -0,0 +1,737 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/pcpu.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/systm.h>
+
+#include <vm/vm.h>
+
+#include <machine/vm.h>
+#include <machine/pcb.h>
+#include <machine/apicreg.h>
+
+#include <machine/vmm.h>
+#include "vmm_mem.h"
+#include "vmm_util.h"
+#include <machine/vmm_dev.h>
+#include "vlapic.h"
+#include "vmm_msr.h"
+#include "vmm_ipi.h"
+#include "vmm_stat.h"
+
+#include "io/ppt.h"
+#include "io/iommu.h"
+
+struct vlapic;
+
+struct vcpu {
+ int flags;
+ int pincpu; /* host cpuid this vcpu is bound to */
+ int hostcpu; /* host cpuid this vcpu last ran on */
+ uint64_t guest_msrs[VMM_MSR_NUM];
+ struct vlapic *vlapic;
+ int vcpuid;
+ struct savefpu savefpu; /* guest fpu state */
+ void *stats;
+};
+#define VCPU_F_PINNED 0x0001
+#define VCPU_F_RUNNING 0x0002
+
+#define VCPU_PINCPU(vm, vcpuid) \
+ ((vm->vcpu[vcpuid].flags & VCPU_F_PINNED) ? vm->vcpu[vcpuid].pincpu : -1)
+
+#define VCPU_UNPIN(vm, vcpuid) (vm->vcpu[vcpuid].flags &= ~VCPU_F_PINNED)
+
+#define VCPU_PIN(vm, vcpuid, host_cpuid) \
+do { \
+ vm->vcpu[vcpuid].flags |= VCPU_F_PINNED; \
+ vm->vcpu[vcpuid].pincpu = host_cpuid; \
+} while(0)
+
+#define VM_MAX_MEMORY_SEGMENTS 2
+
+struct vm {
+ void *cookie; /* processor-specific data */
+ void *iommu; /* iommu-specific data */
+ struct vcpu vcpu[VM_MAXCPU];
+ int num_mem_segs;
+ struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS];
+ char name[VM_MAX_NAMELEN];
+
+ /*
+ * Mask of active vcpus.
+ * An active vcpu is one that has been started implicitly (BSP) or
+ * explicitly (AP) by sending it a startup ipi.
+ */
+ cpumask_t active_cpus;
+};
+
+static struct vmm_ops *ops;
+#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0)
+#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0)
+
+#define	VMINIT(vm)	(ops != NULL ? (*ops->vminit)(vm) : NULL)
+#define VMRUN(vmi, vcpu, rip, vmexit) \
+ (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, vmexit) : ENXIO)
+#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
+#define VMMMAP(vmi, gpa, hpa, len, attr, prot, spm) \
+ (ops != NULL ? (*ops->vmmmap)(vmi, gpa, hpa, len, attr, prot, spm) : ENXIO)
+#define VMGETREG(vmi, vcpu, num, retval) \
+ (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
+#define VMSETREG(vmi, vcpu, num, val) \
+ (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
+#define VMGETDESC(vmi, vcpu, num, desc) \
+ (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
+#define VMSETDESC(vmi, vcpu, num, desc) \
+ (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
+#define VMINJECT(vmi, vcpu, type, vec, ec, ecv) \
+ (ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
+#define VMNMI(vmi, vcpu) \
+ (ops != NULL ? (*ops->vmnmi)(vmi, vcpu) : ENXIO)
+#define VMGETCAP(vmi, vcpu, num, retval) \
+ (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
+#define VMSETCAP(vmi, vcpu, num, val) \
+ (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
+
+#define fxrstor(addr) __asm("fxrstor %0" : : "m" (*(addr)))
+#define fxsave(addr) __asm __volatile("fxsave %0" : "=m" (*(addr)))
+#define fpu_start_emulating() __asm("smsw %%ax; orb %0,%%al; lmsw %%ax" \
+ : : "n" (CR0_TS) : "ax")
+#define fpu_stop_emulating() __asm("clts")
+
+static MALLOC_DEFINE(M_VM, "vm", "vm");
+CTASSERT(VMM_MSR_NUM <= 64); /* msr_mask can keep track of up to 64 msrs */
+
+/* statistics */
+static VMM_STAT_DEFINE(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
+
+static void
+vcpu_cleanup(struct vcpu *vcpu)
+{
+ vlapic_cleanup(vcpu->vlapic);
+ vmm_stat_free(vcpu->stats);
+}
+
+static void
+vcpu_init(struct vm *vm, uint32_t vcpu_id)
+{
+ struct vcpu *vcpu;
+
+ vcpu = &vm->vcpu[vcpu_id];
+
+ vcpu->hostcpu = -1;
+ vcpu->vcpuid = vcpu_id;
+ vcpu->vlapic = vlapic_init(vm, vcpu_id);
+ fpugetregs(curthread, &vcpu->savefpu);
+ vcpu->stats = vmm_stat_alloc();
+}
+
+static int
+vmm_init(void)
+{
+ int error;
+
+ vmm_ipi_init();
+
+ error = vmm_mem_init();
+ if (error)
+ return (error);
+
+ if (vmm_is_intel())
+ ops = &vmm_ops_intel;
+ else if (vmm_is_amd())
+ ops = &vmm_ops_amd;
+ else
+ return (ENXIO);
+
+ vmm_msr_init();
+
+ return (VMM_INIT());
+}
+
+static int
+vmm_handler(module_t mod, int what, void *arg)
+{
+ int error;
+
+ switch (what) {
+ case MOD_LOAD:
+ vmmdev_init();
+ iommu_init();
+ error = vmm_init();
+ break;
+ case MOD_UNLOAD:
+ vmmdev_cleanup();
+ iommu_cleanup();
+ vmm_ipi_cleanup();
+ error = VMM_CLEANUP();
+ break;
+ default:
+ error = 0;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t vmm_kmod = {
+ "vmm",
+ vmm_handler,
+ NULL
+};
+
+/*
+ * Execute the module load handler after the pci passthru driver has had
+ * a chance to claim devices. We need this information at the time we do
+ * iommu initialization.
+ */
+DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_CONFIGURE + 1, SI_ORDER_ANY);
+MODULE_VERSION(vmm, 1);
+
+SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
+
+struct vm *
+vm_create(const char *name)
+{
+ int i;
+ struct vm *vm;
+ vm_paddr_t maxaddr;
+
+ const int BSP = 0;
+
+ if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
+ return (NULL);
+
+ vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
+ strcpy(vm->name, name);
+ vm->cookie = VMINIT(vm);
+
+ for (i = 0; i < VM_MAXCPU; i++) {
+ vcpu_init(vm, i);
+ guest_msrs_init(vm, i);
+ }
+
+ maxaddr = vmm_mem_maxaddr();
+ vm->iommu = iommu_create_domain(maxaddr);
+ vm_activate_cpu(vm, BSP);
+
+ return (vm);
+}
+
+void
+vm_destroy(struct vm *vm)
+{
+ int i;
+
+ ppt_unassign_all(vm);
+
+ for (i = 0; i < vm->num_mem_segs; i++)
+ vmm_mem_free(vm->mem_segs[i].hpa, vm->mem_segs[i].len);
+
+ for (i = 0; i < VM_MAXCPU; i++)
+ vcpu_cleanup(&vm->vcpu[i]);
+
+ iommu_destroy_domain(vm->iommu);
+
+ VMCLEANUP(vm->cookie);
+
+ free(vm, M_VM);
+}
+
+const char *
+vm_name(struct vm *vm)
+{
+ return (vm->name);
+}
+
+int
+vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
+{
+ const boolean_t spok = TRUE; /* superpage mappings are ok */
+
+ return (VMMMAP(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE,
+ VM_PROT_RW, spok));
+}
+
+int
+vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
+{
+ const boolean_t spok = TRUE; /* superpage mappings are ok */
+
+ return (VMMMAP(vm->cookie, gpa, 0, len, VM_MEMATTR_UNCACHEABLE,
+ VM_PROT_NONE, spok));
+}
+
+int
+vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t *ret_hpa)
+{
+ int error;
+ vm_paddr_t hpa;
+
+ const boolean_t spok = TRUE; /* superpage mappings are ok */
+
+ /*
+	 * find the hpa if it was already vm_malloc'd.
+ */
+ hpa = vm_gpa2hpa(vm, gpa, len);
+ if (hpa != ((vm_paddr_t)-1))
+ goto out;
+
+ if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
+ return (E2BIG);
+
+ hpa = vmm_mem_alloc(len);
+ if (hpa == 0)
+ return (ENOMEM);
+
+ error = VMMMAP(vm->cookie, gpa, hpa, len, VM_MEMATTR_WRITE_BACK,
+ VM_PROT_ALL, spok);
+ if (error) {
+ vmm_mem_free(hpa, len);
+ return (error);
+ }
+
+ iommu_create_mapping(vm->iommu, gpa, hpa, len);
+
+ vm->mem_segs[vm->num_mem_segs].gpa = gpa;
+ vm->mem_segs[vm->num_mem_segs].hpa = hpa;
+ vm->mem_segs[vm->num_mem_segs].len = len;
+ vm->num_mem_segs++;
+out:
+ *ret_hpa = hpa;
+ return (0);
+}
+
+vm_paddr_t
+vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len)
+{
+ int i;
+ vm_paddr_t gpabase, gpalimit, hpabase;
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ hpabase = vm->mem_segs[i].hpa;
+ gpabase = vm->mem_segs[i].gpa;
+ gpalimit = gpabase + vm->mem_segs[i].len;
+ if (gpa >= gpabase && gpa + len <= gpalimit)
+ return ((gpa - gpabase) + hpabase);
+ }
+ return ((vm_paddr_t)-1);
+}
+
+int
+vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
+ struct vm_memory_segment *seg)
+{
+ int i;
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ if (gpabase == vm->mem_segs[i].gpa) {
+ *seg = vm->mem_segs[i];
+ return (0);
+ }
+ }
+ return (-1);
+}
+
+int
+vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
+{
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (reg >= VM_REG_LAST)
+ return (EINVAL);
+
+ return (VMGETREG(vm->cookie, vcpu, reg, retval));
+}
+
+int
+vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
+{
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (reg >= VM_REG_LAST)
+ return (EINVAL);
+
+ return (VMSETREG(vm->cookie, vcpu, reg, val));
+}
+
+static boolean_t
+is_descriptor_table(int reg)
+{
+
+ switch (reg) {
+ case VM_REG_GUEST_IDTR:
+ case VM_REG_GUEST_GDTR:
+ return (TRUE);
+ default:
+ return (FALSE);
+ }
+}
+
+static boolean_t
+is_segment_register(int reg)
+{
+
+ switch (reg) {
+ case VM_REG_GUEST_ES:
+ case VM_REG_GUEST_CS:
+ case VM_REG_GUEST_SS:
+ case VM_REG_GUEST_DS:
+ case VM_REG_GUEST_FS:
+ case VM_REG_GUEST_GS:
+ case VM_REG_GUEST_TR:
+ case VM_REG_GUEST_LDTR:
+ return (TRUE);
+ default:
+ return (FALSE);
+ }
+}
+
+int
+vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *desc)
+{
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (!is_segment_register(reg) && !is_descriptor_table(reg))
+ return (EINVAL);
+
+ return (VMGETDESC(vm->cookie, vcpu, reg, desc));
+}
+
+int
+vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *desc)
+{
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (!is_segment_register(reg) && !is_descriptor_table(reg))
+ return (EINVAL);
+
+ return (VMSETDESC(vm->cookie, vcpu, reg, desc));
+}
+
+int
+vm_get_pinning(struct vm *vm, int vcpuid, int *cpuid)
+{
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ *cpuid = VCPU_PINCPU(vm, vcpuid);
+
+ return (0);
+}
+
+int
+vm_set_pinning(struct vm *vm, int vcpuid, int host_cpuid)
+{
+ struct thread *td;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ td = curthread; /* XXXSMP only safe when muxing vcpus */
+
+ /* unpin */
+ if (host_cpuid < 0) {
+ VCPU_UNPIN(vm, vcpuid);
+ thread_lock(td);
+ sched_unbind(td);
+ thread_unlock(td);
+ return (0);
+ }
+
+ if (CPU_ABSENT(host_cpuid))
+ return (EINVAL);
+
+ /*
+ * XXX we should check that 'host_cpuid' has not already been pinned
+ * by another vm.
+ */
+ thread_lock(td);
+ sched_bind(td, host_cpuid);
+ thread_unlock(td);
+ VCPU_PIN(vm, vcpuid, host_cpuid);
+
+ return (0);
+}
+
+static void
+restore_guest_fpustate(struct vcpu *vcpu)
+{
+ register_t s;
+
+ s = intr_disable();
+ fpu_stop_emulating();
+ fxrstor(&vcpu->savefpu);
+ fpu_start_emulating();
+ intr_restore(s);
+}
+
+static void
+save_guest_fpustate(struct vcpu *vcpu)
+{
+ register_t s;
+
+ s = intr_disable();
+ fpu_stop_emulating();
+ fxsave(&vcpu->savefpu);
+ fpu_start_emulating();
+ intr_restore(s);
+}
+
+int
+vm_run(struct vm *vm, struct vm_run *vmrun)
+{
+ int error, vcpuid;
+ struct vcpu *vcpu;
+ struct pcb *pcb;
+ uint64_t tscval;
+
+ vcpuid = vmrun->cpuid;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ critical_enter();
+
+ tscval = rdtsc();
+
+ pcb = PCPU_GET(curpcb);
+ pcb->pcb_full_iret = 1;
+
+ vcpu->hostcpu = curcpu;
+
+ fpuexit(curthread);
+ restore_guest_msrs(vm, vcpuid);
+ restore_guest_fpustate(vcpu);
+ error = VMRUN(vm->cookie, vcpuid, vmrun->rip, &vmrun->vm_exit);
+ save_guest_fpustate(vcpu);
+ restore_host_msrs(vm, vcpuid);
+
+ vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
+
+ critical_exit();
+
+ return (error);
+}
+
+int
+vm_inject_event(struct vm *vm, int vcpuid, int type,
+ int vector, uint32_t code, int code_valid)
+{
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+	if (type <= VM_EVENT_NONE || type >= VM_EVENT_MAX)
+ return (EINVAL);
+
+ if (vector < 0 || vector > 255)
+ return (EINVAL);
+
+ return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
+}
+
+int
+vm_inject_nmi(struct vm *vm, int vcpu)
+{
+ int error;
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ error = VMNMI(vm->cookie, vcpu);
+ vm_interrupt_hostcpu(vm, vcpu);
+ return (error);
+}
+
+int
+vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
+{
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (type < 0 || type >= VM_CAP_MAX)
+ return (EINVAL);
+
+ return (VMGETCAP(vm->cookie, vcpu, type, retval));
+}
+
+int
+vm_set_capability(struct vm *vm, int vcpu, int type, int val)
+{
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (type < 0 || type >= VM_CAP_MAX)
+ return (EINVAL);
+
+ return (VMSETCAP(vm->cookie, vcpu, type, val));
+}
+
+uint64_t *
+vm_guest_msrs(struct vm *vm, int cpu)
+{
+ return (vm->vcpu[cpu].guest_msrs);
+}
+
+struct vlapic *
+vm_lapic(struct vm *vm, int cpu)
+{
+ return (vm->vcpu[cpu].vlapic);
+}
+
+boolean_t
+vmm_is_pptdev(int bus, int slot, int func)
+{
+ int found, b, s, f, n;
+ char *val, *cp, *cp2;
+
+ /*
+ * setenv pptdevs "1/2/3 4/5/6 7/8/9 10/11/12"
+ */
+ found = 0;
+ cp = val = getenv("pptdevs");
+ while (cp != NULL && *cp != '\0') {
+ if ((cp2 = strchr(cp, ' ')) != NULL)
+ *cp2 = '\0';
+
+ n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
+ if (n == 3 && bus == b && slot == s && func == f) {
+ found = 1;
+ break;
+ }
+
+ if (cp2 != NULL)
+ *cp2++ = ' ';
+
+ cp = cp2;
+ }
+ freeenv(val);
+ return (found);
+}
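
The loop above parses the 'pptdevs' kernel environment variable as a space-separated list of bus/slot/func triplets, so a host marks PCI devices for passthrough from the loader. For example, in /boot/loader.conf (device addresses illustrative):

    pptdevs="1/2/3 4/5/6"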
+
+void *
+vm_iommu_domain(struct vm *vm)
+{
+
+ return (vm->iommu);
+}
+
+void
+vm_set_run_state(struct vm *vm, int vcpuid, int state)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ if (state == VCPU_RUNNING) {
+ if (vcpu->flags & VCPU_F_RUNNING) {
+ panic("vm_set_run_state: %s[%d] is already running",
+ vm_name(vm), vcpuid);
+ }
+ vcpu->flags |= VCPU_F_RUNNING;
+ } else {
+ if ((vcpu->flags & VCPU_F_RUNNING) == 0) {
+ panic("vm_set_run_state: %s[%d] is already stopped",
+ vm_name(vm), vcpuid);
+ }
+ vcpu->flags &= ~VCPU_F_RUNNING;
+ }
+}
+
+int
+vm_get_run_state(struct vm *vm, int vcpuid, int *cpuptr)
+{
+ int retval, hostcpu;
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+ if (vcpu->flags & VCPU_F_RUNNING) {
+ retval = VCPU_RUNNING;
+ hostcpu = vcpu->hostcpu;
+ } else {
+ retval = VCPU_STOPPED;
+ hostcpu = -1;
+ }
+
+ if (cpuptr)
+ *cpuptr = hostcpu;
+
+ return (retval);
+}
+
+void
+vm_activate_cpu(struct vm *vm, int vcpuid)
+{
+
+ if (vcpuid >= 0 && vcpuid < VM_MAXCPU)
+ vm->active_cpus |= vcpu_mask(vcpuid);
+}
+
+cpumask_t
+vm_active_cpus(struct vm *vm)
+{
+
+ return (vm->active_cpus);
+}
+
+void *
+vcpu_stats(struct vm *vm, int vcpuid)
+{
+
+ return (vm->vcpu[vcpuid].stats);
+}
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
new file mode 100644
index 000000000000..cf443fc1235c
--- /dev/null
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -0,0 +1,468 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/queue.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/malloc.h>
+#include <sys/conf.h>
+#include <sys/sysctl.h>
+#include <sys/libkern.h>
+#include <sys/ioccom.h>
+#include <sys/mman.h>
+#include <sys/uio.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+
+#include <machine/vmm.h>
+#include "vmm_lapic.h"
+#include "vmm_stat.h"
+#include "io/ppt.h"
+#include <machine/vmm_dev.h>
+
+struct vmmdev_softc {
+ struct vm *vm; /* vm instance cookie */
+ struct cdev *cdev;
+ SLIST_ENTRY(vmmdev_softc) link;
+};
+static SLIST_HEAD(, vmmdev_softc) head;
+
+static struct mtx vmmdev_mtx;
+
+static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");
+
+SYSCTL_DECL(_hw_vmm);
+
+static struct vmmdev_softc *
+vmmdev_lookup(const char *name)
+{
+ struct vmmdev_softc *sc;
+
+#ifdef notyet /* XXX kernel is not compiled with invariants */
+ mtx_assert(&vmmdev_mtx, MA_OWNED);
+#endif
+
+ SLIST_FOREACH(sc, &head, link) {
+ if (strcmp(name, vm_name(sc->vm)) == 0)
+ break;
+ }
+
+ return (sc);
+}
+
+static struct vmmdev_softc *
+vmmdev_lookup2(struct cdev *cdev)
+{
+ struct vmmdev_softc *sc;
+
+#ifdef notyet /* XXX kernel is not compiled with invariants */
+ mtx_assert(&vmmdev_mtx, MA_OWNED);
+#endif
+
+ SLIST_FOREACH(sc, &head, link) {
+ if (sc->cdev == cdev)
+ break;
+ }
+
+ return (sc);
+}
+
+static int
+vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
+{
+ int error, off, c;
+ vm_paddr_t hpa, gpa;
+ struct vmmdev_softc *sc;
+
+ static char zerobuf[PAGE_SIZE];
+
+ error = 0;
+ mtx_lock(&vmmdev_mtx);
+ sc = vmmdev_lookup2(cdev);
+
+ while (uio->uio_resid > 0 && error == 0) {
+ gpa = uio->uio_offset;
+ off = gpa & PAGE_MASK;
+ c = min(uio->uio_resid, PAGE_SIZE - off);
+
+ /*
+ * The VM has a hole in its physical memory map. If we want to
+ * use 'dd' to inspect memory beyond the hole we need to
+ * provide bogus data for memory that lies in the hole.
+ *
+ * Since this device does not support lseek(2), dd(1) will
+		 * read(2) blocks of data to simulate the seek.
+ */
+ hpa = vm_gpa2hpa(sc->vm, gpa, c);
+ if (hpa == (vm_paddr_t)-1) {
+ if (uio->uio_rw == UIO_READ)
+ error = uiomove(zerobuf, c, uio);
+ else
+ error = EFAULT;
+ } else
+ error = uiomove((void *)PHYS_TO_DMAP(hpa), c, uio);
+ }
+
+ mtx_unlock(&vmmdev_mtx);
+ return (error);
+}
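
Since the device node is created later in this commit as /dev/vmm/<name>, guest physical memory can be inspected with dd(1) exactly as the comment above anticipates. An illustrative invocation (VM name hypothetical):

    # dump the first 16 pages of guest physical memory
    dd if=/dev/vmm/testvm bs=4k count=16 | hexdump -C | head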
+
+static int
+vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
+ struct thread *td)
+{
+ int error, vcpu;
+ struct vmmdev_softc *sc;
+ struct vm_memory_segment *seg;
+ struct vm_register *vmreg;
+ struct vm_seg_desc* vmsegdesc;
+ struct vm_pin *vmpin;
+ struct vm_run *vmrun;
+ struct vm_event *vmevent;
+ struct vm_lapic_irq *vmirq;
+ struct vm_capability *vmcap;
+ struct vm_pptdev *pptdev;
+ struct vm_pptdev_mmio *pptmmio;
+ struct vm_pptdev_msi *pptmsi;
+ struct vm_nmi *vmnmi;
+ struct vm_stats *vmstats;
+ struct vm_stat_desc *statdesc;
+
+ mtx_lock(&vmmdev_mtx);
+ sc = vmmdev_lookup2(cdev);
+ if (sc == NULL) {
+ mtx_unlock(&vmmdev_mtx);
+ return (ENXIO);
+ }
+
+ /*
+ * Some VMM ioctls can operate only on vcpus that are not running.
+ */
+ switch (cmd) {
+ case VM_RUN:
+ case VM_SET_PINNING:
+ case VM_GET_REGISTER:
+ case VM_SET_REGISTER:
+ case VM_GET_SEGMENT_DESCRIPTOR:
+ case VM_SET_SEGMENT_DESCRIPTOR:
+ case VM_INJECT_EVENT:
+ case VM_GET_CAPABILITY:
+ case VM_SET_CAPABILITY:
+ case VM_PPTDEV_MSI:
+ /*
+ * XXX fragile, handle with care
+ * Assumes that the first field of the ioctl data is the vcpu.
+ */
+ vcpu = *(int *)data;
+ if (vcpu < 0 || vcpu >= VM_MAXCPU) {
+ error = EINVAL;
+ goto done;
+ }
+
+ if (vcpu_is_running(sc->vm, vcpu, NULL)) {
+ error = EBUSY;
+ goto done;
+ }
+ break;
+ default:
+ break;
+ }
+
+	switch (cmd) {
+ case VM_RUN:
+ vmrun = (struct vm_run *)data;
+
+ vm_set_run_state(sc->vm, vmrun->cpuid, VCPU_RUNNING);
+ mtx_unlock(&vmmdev_mtx);
+
+ error = vm_run(sc->vm, vmrun);
+
+ mtx_lock(&vmmdev_mtx);
+ vm_set_run_state(sc->vm, vmrun->cpuid, VCPU_STOPPED);
+ break;
+ case VM_STAT_DESC: {
+ const char *desc;
+ statdesc = (struct vm_stat_desc *)data;
+ desc = vmm_stat_desc(statdesc->index);
+ if (desc != NULL) {
+ error = 0;
+ strlcpy(statdesc->desc, desc, sizeof(statdesc->desc));
+ } else
+ error = EINVAL;
+ break;
+ }
+ case VM_STATS: {
+ CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_TYPES);
+ vmstats = (struct vm_stats *)data;
+ getmicrotime(&vmstats->tv);
+ error = vmm_stat_copy(sc->vm, vmstats->cpuid,
+ &vmstats->num_entries, vmstats->statbuf);
+ break;
+ }
+ case VM_PPTDEV_MSI:
+ pptmsi = (struct vm_pptdev_msi *)data;
+ error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
+ pptmsi->bus, pptmsi->slot, pptmsi->func,
+ pptmsi->destcpu, pptmsi->vector,
+ pptmsi->numvec);
+ break;
+ case VM_MAP_PPTDEV_MMIO:
+ pptmmio = (struct vm_pptdev_mmio *)data;
+ error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
+ pptmmio->func, pptmmio->gpa, pptmmio->len,
+ pptmmio->hpa);
+ break;
+ case VM_BIND_PPTDEV:
+ pptdev = (struct vm_pptdev *)data;
+ error = ppt_assign_device(sc->vm, pptdev->bus, pptdev->slot,
+ pptdev->func);
+ break;
+ case VM_UNBIND_PPTDEV:
+ pptdev = (struct vm_pptdev *)data;
+ error = ppt_unassign_device(sc->vm, pptdev->bus, pptdev->slot,
+ pptdev->func);
+ break;
+ case VM_INJECT_EVENT:
+ vmevent = (struct vm_event *)data;
+ error = vm_inject_event(sc->vm, vmevent->cpuid, vmevent->type,
+ vmevent->vector,
+ vmevent->error_code,
+ vmevent->error_code_valid);
+ break;
+ case VM_INJECT_NMI:
+ vmnmi = (struct vm_nmi *)data;
+ error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
+ break;
+ case VM_LAPIC_IRQ:
+ vmirq = (struct vm_lapic_irq *)data;
+ error = lapic_set_intr(sc->vm, vmirq->cpuid, vmirq->vector);
+ break;
+ case VM_SET_PINNING:
+ vmpin = (struct vm_pin *)data;
+ error = vm_set_pinning(sc->vm, vmpin->vm_cpuid,
+ vmpin->host_cpuid);
+ break;
+ case VM_GET_PINNING:
+ vmpin = (struct vm_pin *)data;
+ error = vm_get_pinning(sc->vm, vmpin->vm_cpuid,
+ &vmpin->host_cpuid);
+ break;
+ case VM_MAP_MEMORY:
+ seg = (struct vm_memory_segment *)data;
+ error = vm_malloc(sc->vm, seg->gpa, seg->len, &seg->hpa);
+ break;
+ case VM_GET_MEMORY_SEG:
+ seg = (struct vm_memory_segment *)data;
+ seg->hpa = seg->len = 0;
+ (void)vm_gpabase2memseg(sc->vm, seg->gpa, seg);
+ error = 0;
+ break;
+ case VM_GET_REGISTER:
+ vmreg = (struct vm_register *)data;
+ error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
+ &vmreg->regval);
+ break;
+ case VM_SET_REGISTER:
+ vmreg = (struct vm_register *)data;
+ error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
+ vmreg->regval);
+ break;
+ case VM_SET_SEGMENT_DESCRIPTOR:
+ vmsegdesc = (struct vm_seg_desc *)data;
+ error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
+ vmsegdesc->regnum,
+ &vmsegdesc->desc);
+ break;
+ case VM_GET_SEGMENT_DESCRIPTOR:
+ vmsegdesc = (struct vm_seg_desc *)data;
+ error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
+ vmsegdesc->regnum,
+ &vmsegdesc->desc);
+ break;
+ case VM_GET_CAPABILITY:
+ vmcap = (struct vm_capability *)data;
+ error = vm_get_capability(sc->vm, vmcap->cpuid,
+ vmcap->captype,
+ &vmcap->capval);
+ break;
+ case VM_SET_CAPABILITY:
+ vmcap = (struct vm_capability *)data;
+ error = vm_set_capability(sc->vm, vmcap->cpuid,
+ vmcap->captype,
+ vmcap->capval);
+ break;
+ default:
+ error = ENOTTY;
+ break;
+ }
+done:
+ mtx_unlock(&vmmdev_mtx);
+
+ return (error);
+}
+
+static int
+vmmdev_mmap(struct cdev *cdev, vm_offset_t offset, vm_paddr_t *paddr, int nprot)
+{
+ int error;
+ struct vmmdev_softc *sc;
+
+ error = -1;
+ mtx_lock(&vmmdev_mtx);
+
+ sc = vmmdev_lookup2(cdev);
+ if (sc != NULL && (nprot & PROT_EXEC) == 0) {
+ *paddr = vm_gpa2hpa(sc->vm, (vm_paddr_t)offset, PAGE_SIZE);
+ if (*paddr != (vm_paddr_t)-1)
+ error = 0;
+ }
+
+ mtx_unlock(&vmmdev_mtx);
+
+ return (error);
+}
+
+static void
+vmmdev_destroy(struct vmmdev_softc *sc)
+{
+
+#ifdef notyet /* XXX kernel is not compiled with invariants */
+ mtx_assert(&vmmdev_mtx, MA_OWNED);
+#endif
+
+ /*
+ * XXX must stop virtual machine instances that may be still
+ * running and cleanup their state.
+ */
+ SLIST_REMOVE(&head, sc, vmmdev_softc, link);
+ destroy_dev(sc->cdev);
+ vm_destroy(sc->vm);
+ free(sc, M_VMMDEV);
+}
+
+static int
+sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ char buf[VM_MAX_NAMELEN];
+ struct vmmdev_softc *sc;
+
+ strlcpy(buf, "beavis", sizeof(buf));
+ error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ mtx_lock(&vmmdev_mtx);
+ sc = vmmdev_lookup(buf);
+ if (sc == NULL) {
+ mtx_unlock(&vmmdev_mtx);
+ return (EINVAL);
+ }
+ vmmdev_destroy(sc);
+ mtx_unlock(&vmmdev_mtx);
+ return (0);
+}
+SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW,
+ NULL, 0, sysctl_vmm_destroy, "A", NULL);
+
+static struct cdevsw vmmdevsw = {
+ .d_name = "vmmdev",
+ .d_version = D_VERSION,
+ .d_ioctl = vmmdev_ioctl,
+ .d_mmap = vmmdev_mmap,
+ .d_read = vmmdev_rw,
+ .d_write = vmmdev_rw,
+};
+
+static int
+sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ struct vm *vm;
+ struct vmmdev_softc *sc;
+ char buf[VM_MAX_NAMELEN];
+
+ strlcpy(buf, "beavis", sizeof(buf));
+ error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ mtx_lock(&vmmdev_mtx);
+
+ sc = vmmdev_lookup(buf);
+ if (sc != NULL) {
+ mtx_unlock(&vmmdev_mtx);
+ return (EEXIST);
+ }
+
+ vm = vm_create(buf);
+ if (vm == NULL) {
+ mtx_unlock(&vmmdev_mtx);
+ return (EINVAL);
+ }
+
+ sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
+ sc->vm = vm;
+ sc->cdev = make_dev(&vmmdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
+ "vmm/%s", buf);
+ sc->cdev->si_drv1 = sc;
+ SLIST_INSERT_HEAD(&head, sc, link);
+
+ mtx_unlock(&vmmdev_mtx);
+ return (0);
+}
+SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW,
+ NULL, 0, sysctl_vmm_create, "A", NULL);
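
Together with the destroy handler above, this gives the complete userland lifecycle for a VM instance; for example (instance name illustrative):

    # create an instance, observe its device node, then tear it down
    sysctl hw.vmm.create=testvm
    ls -l /dev/vmm/testvm
    sysctl hw.vmm.destroy=testvm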
+
+void
+vmmdev_init(void)
+{
+ mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
+}
+
+void
+vmmdev_cleanup(void)
+{
+ struct vmmdev_softc *sc, *sc2;
+
+ mtx_lock(&vmmdev_mtx);
+
+ SLIST_FOREACH_SAFE(sc, &head, link, sc2)
+ vmmdev_destroy(sc);
+
+ mtx_unlock(&vmmdev_mtx);
+}
diff --git a/sys/amd64/vmm/vmm_ipi.c b/sys/amd64/vmm/vmm_ipi.c
new file mode 100644
index 000000000000..c8e795bedcc9
--- /dev/null
+++ b/sys/amd64/vmm/vmm_ipi.c
@@ -0,0 +1,103 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/bus.h>
+
+#include <machine/intr_machdep.h>
+#include <machine/apicvar.h>
+#include <machine/segments.h>
+#include <machine/md_var.h>
+#include <machine/smp.h>
+
+#include <machine/vmm.h>
+#include "vmm_ipi.h"
+
+extern inthand_t IDTVEC(rsvd), IDTVEC(justreturn);
+
+/*
+ * The default is to use the IPI_AST to interrupt a vcpu.
+ */
+static int ipinum = IPI_AST;
+
+CTASSERT(APIC_SPURIOUS_INT == 255);
+
+void
+vmm_ipi_init(void)
+{
+ int idx;
+ uintptr_t func;
+ struct gate_descriptor *ip;
+
+ /*
+ * Search backwards from the highest IDT vector available for use
+ * as our IPI vector. We install the 'justreturn' handler at that
+ * vector and use it to interrupt the vcpus.
+ *
+ * We do this because the IPI_AST is heavyweight and saves all
+ * registers in the trapframe. This is overkill for our use case
+ * which is simply to EOI the interrupt and return.
+ */
+ idx = APIC_SPURIOUS_INT;
+ while (--idx >= APIC_IPI_INTS) {
+ ip = &idt[idx];
+ func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
+ if (func == (uintptr_t)&IDTVEC(rsvd)) {
+ ipinum = idx;
+ setidt(ipinum, IDTVEC(justreturn), SDT_SYSIGT,
+ SEL_KPL, 0);
+ break;
+ }
+ }
+
+ if (ipinum != IPI_AST && bootverbose) {
+ printf("vmm_ipi_init: installing ipi handler to interrupt "
+ "vcpus at vector %d\n", ipinum);
+ }
+}
+
+void
+vmm_ipi_cleanup(void)
+{
+ if (ipinum != IPI_AST)
+ setidt(ipinum, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
+}
+
+void
+vm_interrupt_hostcpu(struct vm *vm, int vcpu)
+{
+ int hostcpu;
+
+ if (vcpu_is_running(vm, vcpu, &hostcpu) && hostcpu != curcpu)
+ ipi_selected((cpumask_t)1 << hostcpu, ipinum);
+}
diff --git a/sys/amd64/vmm/vmm_ipi.h b/sys/amd64/vmm/vmm_ipi.h
new file mode 100644
index 000000000000..7ab94bfdf073
--- /dev/null
+++ b/sys/amd64/vmm/vmm_ipi.h
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_IPI_H_
+#define _VMM_IPI_H_
+
+struct vm;
+
+void vmm_ipi_init(void);
+void vmm_ipi_cleanup(void);
+void vm_interrupt_hostcpu(struct vm *vm, int vcpu);
+
+#endif /* _VMM_IPI_H_ */
diff --git a/sys/amd64/vmm/vmm_ktr.h b/sys/amd64/vmm/vmm_ktr.h
new file mode 100644
index 000000000000..e691c61af68e
--- /dev/null
+++ b/sys/amd64/vmm/vmm_ktr.h
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_KTR_H_
+#define _VMM_KTR_H_
+
+#include <sys/ktr.h>
+#include <sys/pcpu.h>
+
+#define KTR_VMM KTR_GEN
+
+#define VMM_CTR0(vm, vcpuid, format) \
+CTR3(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu)
+
+#define VMM_CTR1(vm, vcpuid, format, p1) \
+CTR4(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \
+ (p1))
+
+#define VMM_CTR2(vm, vcpuid, format, p1, p2) \
+CTR5(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \
+ (p1), (p2))
+
+#define VMM_CTR3(vm, vcpuid, format, p1, p2, p3) \
+CTR6(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \
+ (p1), (p2), (p3))
+
+#endif /* _VMM_KTR_H_ */
diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c
new file mode 100644
index 000000000000..8704fcf4c754
--- /dev/null
+++ b/sys/amd64/vmm/vmm_lapic.c
@@ -0,0 +1,121 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <machine/vmm.h>
+#include "vmm_ipi.h"
+#include "vmm_lapic.h"
+#include "vlapic.h"
+
+int
+lapic_write(struct vm *vm, int cpu, u_int offset, uint64_t val)
+{
+ int handled;
+
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ if (vlapic_op_mem_write(vlapic, offset, DWORD, val) == 0)
+ handled = 1;
+ else
+ handled = 0;
+
+ return (handled);
+}
+
+int
+lapic_read(struct vm *vm, int cpu, u_int offset, uint64_t *rv)
+{
+ int handled;
+
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ if (vlapic_op_mem_read(vlapic, offset, DWORD, rv) == 0)
+ handled = 1;
+ else
+ handled = 0;
+
+ return (handled);
+}
+
+int
+lapic_pending_intr(struct vm *vm, int cpu)
+{
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ return (vlapic_pending_intr(vlapic));
+}
+
+void
+lapic_intr_accepted(struct vm *vm, int cpu, int vector)
+{
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ vlapic_intr_accepted(vlapic, vector);
+}
+
+int
+lapic_set_intr(struct vm *vm, int cpu, int vector)
+{
+ struct vlapic *vlapic;
+
+ if (cpu < 0 || cpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (vector < 32 || vector > 255)
+ return (EINVAL);
+
+ vlapic = vm_lapic(vm, cpu);
+ vlapic_set_intr_ready(vlapic, vector);
+
+ vm_interrupt_hostcpu(vm, cpu);
+
+ return (0);
+}
+
+void
+lapic_timer_tick(struct vm *vm, int cpu)
+{
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ vlapic_timer_tick(vlapic);
+}
diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h
new file mode 100644
index 000000000000..815b2f7937b7
--- /dev/null
+++ b/sys/amd64/vmm/vmm_lapic.h
@@ -0,0 +1,64 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_LAPIC_H_
+#define _VMM_LAPIC_H_
+
+struct vm;
+
+int lapic_write(struct vm *vm, int cpu, u_int offset, uint64_t val);
+int lapic_read(struct vm *vm, int cpu, u_int offset, uint64_t *retval);
+void lapic_timer_tick(struct vm *vm, int cpu);
+
+/*
+ * Returns a vector between 32 and 255 if an interrupt is pending in the
+ * IRR that can be delivered based on the current state of ISR and TPR.
+ *
+ * Note that the vector does not automatically transition to the ISR as a
+ * result of calling this function.
+ *
+ * Returns -1 if there is no eligible vector that can be delivered to the
+ * guest at this time.
+ */
+int lapic_pending_intr(struct vm *vm, int cpu);
+
+/*
+ * Transition 'vector' from IRR to ISR. This function is called with the
+ * vector returned by 'lapic_pending_intr()' when the guest is able to
+ * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that
+ * block interrupt delivery).
+ */
+void lapic_intr_accepted(struct vm *vm, int cpu, int vector);
+
+/*
+ * Signals to the LAPIC that an interrupt at 'vector' needs to be generated
+ * to the 'cpu', the state is recorded in IRR.
+ */
+int lapic_set_intr(struct vm *vm, int cpu, int vector);
+
+#endif /* _VMM_LAPIC_H_ */
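
The comments above describe a two-step delivery protocol: query for a deliverable vector, then commit it once the guest can take it. A sketch of the consumer side, where guest_can_accept_intr() is a placeholder for the RFLAGS.IF and interrupt-window checks done by the processor-specific backend:

    /* Hypothetical consumer of the IRR/ISR protocol documented above. */
    #include "vmm_lapic.h"

    /* placeholder: the real check lives in the processor-specific backend */
    static int
    guest_can_accept_intr(struct vm *vm, int cpu)
    {
            return (1);
    }

    static void
    inject_pending_intr(struct vm *vm, int cpu)
    {
            int vector;

            vector = lapic_pending_intr(vm, cpu);
            if (vector < 0)
                    return;         /* nothing deliverable at the current PPR */

            if (!guest_can_accept_intr(vm, cpu))
                    return;         /* retry when an interrupt window opens */

            /* commit: the vector moves from IRR to ISR */
            lapic_intr_accepted(vm, cpu, vector);
            /* ... arrange for 'vector' to be injected on the next VM entry ... */
    }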
diff --git a/sys/amd64/vmm/vmm_mem.c b/sys/amd64/vmm/vmm_mem.c
new file mode 100644
index 000000000000..9ce1e800f323
--- /dev/null
+++ b/sys/amd64/vmm/vmm_mem.c
@@ -0,0 +1,413 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/linker.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/md_var.h>
+#include <machine/metadata.h>
+#include <machine/pc/bios.h>
+#include <machine/vmparam.h>
+#include <machine/pmap.h>
+
+#include "vmm_util.h"
+#include "vmm_mem.h"
+
+static MALLOC_DEFINE(M_VMM_MEM, "vmm memory", "vmm memory");
+
+#define MB (1024 * 1024)
+#define GB (1024 * MB)
+
+#define VMM_MEM_MAXSEGS 64
+
+/* protected by vmm_mem_mtx */
+static struct {
+ vm_paddr_t base;
+ vm_size_t length;
+} vmm_mem_avail[VMM_MEM_MAXSEGS];
+
+static int vmm_mem_nsegs;
+
+static vm_paddr_t maxaddr;
+
+static struct mtx vmm_mem_mtx;
+
+/*
+ * Steal any memory that was deliberately hidden from FreeBSD either by
+ * the use of MAXMEM kernel config option or the hw.physmem loader tunable.
+ */
+static int
+vmm_mem_steal_memory(void)
+{
+ int nsegs;
+ caddr_t kmdp;
+ uint32_t smapsize;
+ uint64_t base, length;
+ struct bios_smap *smapbase, *smap, *smapend;
+
+ /*
+ * Borrowed from hammer_time() and getmemsize() in machdep.c
+ */
+ kmdp = preload_search_by_type("elf kernel");
+ if (kmdp == NULL)
+ kmdp = preload_search_by_type("elf64 kernel");
+
+ smapbase = (struct bios_smap *)preload_search_info(kmdp,
+ MODINFO_METADATA | MODINFOMD_SMAP);
+ if (smapbase == NULL)
+ panic("No BIOS smap info from loader!");
+
+ smapsize = *((uint32_t *)smapbase - 1);
+ smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
+
+ nsegs = 0;
+ for (smap = smapbase; smap < smapend; smap++) {
+ /*
+ * XXX
+ * Assuming non-overlapping, monotonically increasing
+ * memory segments.
+ */
+ if (smap->type != SMAP_TYPE_MEMORY)
+ continue;
+ if (smap->length == 0)
+ break;
+
+ base = roundup(smap->base, NBPDR);
+ length = rounddown(smap->length, NBPDR);
+
+ /* Skip this segment if FreeBSD is using all of it. */
+ if (base + length <= ptoa(Maxmem))
+ continue;
+
+ /*
+ * If FreeBSD is using part of this segment then adjust
+ * 'base' and 'length' accordingly.
+ */
+ if (base < ptoa(Maxmem)) {
+ uint64_t used;
+ used = roundup(ptoa(Maxmem), NBPDR) - base;
+ base += used;
+ length -= used;
+ }
+
+ if (length == 0)
+ continue;
+
+ vmm_mem_avail[nsegs].base = base;
+ vmm_mem_avail[nsegs].length = length;
+
+ if (base + length > maxaddr)
+ maxaddr = base + length;
+
+ if (0 && bootverbose) {
+ printf("vmm_mem_populate: index %d, base 0x%0lx, "
+ "length %ld\n",
+ nsegs, vmm_mem_avail[nsegs].base,
+ vmm_mem_avail[nsegs].length);
+ }
+
+ nsegs++;
+ if (nsegs >= VMM_MEM_MAXSEGS) {
+ printf("vmm_mem_populate: maximum number of vmm memory "
+ "segments reached!\n");
+ return (ENOSPC);
+ }
+ }
+
+ vmm_mem_nsegs = nsegs;
+
+ return (0);
+}
+
+static void
+vmm_mem_direct_map(vm_paddr_t start, vm_paddr_t end)
+{
+ vm_paddr_t addr, remaining;
+ int pdpi, pdi, superpage_size;
+ pml4_entry_t *pml4p;
+ pdp_entry_t *pdp;
+ pd_entry_t *pd;
+ uint64_t page_attr_bits;
+
+ if (end >= NBPML4)
+ panic("Cannot map memory beyond %ldGB", NBPML4 / GB);
+
+ /* XXX FreeBSD 8.1 does not use 1G superpages in the direct map */
+ if (0 && vmm_supports_1G_pages())
+ superpage_size = NBPDP;
+ else
+ superpage_size = NBPDR;
+
+ /*
+ * Get the page directory pointer page that contains the direct
+ * map address mappings.
+ */
+ pml4p = kernel_pmap->pm_pml4;
+ pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4p[DMPML4I] & ~PAGE_MASK);
+
+ page_attr_bits = PG_RW | PG_V | PG_PS | PG_G;
+ addr = start;
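+	/*
+	 * Walk the range, using 1GB mappings where enabled and falling back
+	 * to 2MB pages otherwise.
+	 */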
+ while (addr < end) {
+ remaining = end - addr;
+ pdpi = addr / NBPDP;
+ if (superpage_size == NBPDP &&
+ remaining >= NBPDP &&
+ addr % NBPDP == 0) {
+ /*
+			 * If there isn't a mapping for this address then
+			 * create one; if one already exists, make sure it
+			 * matches what we expect.
+ */
+ if (pdp[pdpi] == 0) {
+ pdp[pdpi] = addr | page_attr_bits;
+ if (0 && bootverbose) {
+ printf("vmm_mem_populate: mapping "
+ "0x%lx with 1GB page at "
+ "pdpi %d\n", addr, pdpi);
+ }
+ } else {
+ pdp_entry_t pdpe = pdp[pdpi];
+ if ((pdpe & ~PAGE_MASK) != addr ||
+ (pdpe & page_attr_bits) != page_attr_bits) {
+ panic("An invalid mapping 0x%016lx "
+ "already exists for 0x%016lx\n",
+ pdpe, addr);
+ }
+ }
+ addr += NBPDP;
+ } else {
+ if (remaining < NBPDR) {
+ panic("vmm_mem_populate: remaining (%ld) must "
+				    "be at least NBPDR (%d)\n",
+ remaining, NBPDR);
+ }
+ if (pdp[pdpi] == 0) {
+ /*
+ * XXX we lose this memory forever because
+ * we do not keep track of the virtual address
+ * that would be required to free this page.
+ */
+ pd = malloc(PAGE_SIZE, M_VMM_MEM,
+ M_WAITOK | M_ZERO);
+ if ((uintptr_t)pd & PAGE_MASK) {
+					panic("vmm_mem_populate: page directory "
+ "page not aligned on %d "
+ "boundary\n", PAGE_SIZE);
+ }
+ pdp[pdpi] = vtophys(pd);
+ pdp[pdpi] |= PG_RW | PG_V | PG_U;
+ if (0 && bootverbose) {
+ printf("Creating page directory "
+ "at pdp index %d for 0x%016lx\n",
+ pdpi, addr);
+ }
+ }
+ pdi = (addr % NBPDP) / NBPDR;
+ pd = (pd_entry_t *)PHYS_TO_DMAP(pdp[pdpi] & ~PAGE_MASK);
+
+ /*
+ * Create a new mapping if one doesn't already exist
+ * or validate it if it does.
+ */
+ if (pd[pdi] == 0) {
+ pd[pdi] = addr | page_attr_bits;
+ if (0 && bootverbose) {
+ printf("vmm_mem_populate: mapping "
+ "0x%lx with 2MB page at "
+ "pdpi %d, pdi %d\n",
+ addr, pdpi, pdi);
+ }
+ } else {
+ pd_entry_t pde = pd[pdi];
+ if ((pde & ~PAGE_MASK) != addr ||
+ (pde & page_attr_bits) != page_attr_bits) {
+ panic("An invalid mapping 0x%016lx "
+ "already exists for 0x%016lx\n",
+ pde, addr);
+ }
+ }
+ addr += NBPDR;
+ }
+ }
+}
+
+static int
+vmm_mem_populate(void)
+{
+ int seg, error;
+ vm_paddr_t start, end;
+
+ /* populate the vmm_mem_avail[] array */
+ error = vmm_mem_steal_memory();
+ if (error)
+ return (error);
+
+ /*
+ * Now map the memory that was hidden from FreeBSD in
+ * the direct map VA space.
+ */
+ for (seg = 0; seg < vmm_mem_nsegs; seg++) {
+ start = vmm_mem_avail[seg].base;
+ end = start + vmm_mem_avail[seg].length;
+ if ((start & PDRMASK) != 0 || (end & PDRMASK) != 0) {
+ panic("start (0x%016lx) and end (0x%016lx) must be "
+ "aligned on a %dMB boundary\n",
+ start, end, NBPDR / MB);
+ }
+ vmm_mem_direct_map(start, end);
+ }
+
+ return (0);
+}
+
+int
+vmm_mem_init(void)
+{
+ int error;
+
+ mtx_init(&vmm_mem_mtx, "vmm_mem_mtx", NULL, MTX_DEF);
+
+ error = vmm_mem_populate();
+ if (error)
+ return (error);
+
+ return (0);
+}
+
+vm_paddr_t
+vmm_mem_alloc(size_t size)
+{
+ int i;
+ vm_paddr_t addr;
+
+ if ((size & PDRMASK) != 0) {
+ panic("vmm_mem_alloc: size 0x%0lx must be "
+ "aligned on a 0x%0x boundary\n", size, NBPDR);
+ }
+
+ addr = 0;
+
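+	/* First-fit: carve the request from the first segment large enough. */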
+ mtx_lock(&vmm_mem_mtx);
+ for (i = 0; i < vmm_mem_nsegs; i++) {
+ if (vmm_mem_avail[i].length >= size) {
+ addr = vmm_mem_avail[i].base;
+ vmm_mem_avail[i].base += size;
+ vmm_mem_avail[i].length -= size;
+ /* remove a zero length segment */
+ if (vmm_mem_avail[i].length == 0) {
+ memmove(&vmm_mem_avail[i],
+ &vmm_mem_avail[i + 1],
+ (vmm_mem_nsegs - (i + 1)) *
+ sizeof(vmm_mem_avail[0]));
+ vmm_mem_nsegs--;
+ }
+ break;
+ }
+ }
+ mtx_unlock(&vmm_mem_mtx);
+
+ return (addr);
+}
+
+void
+vmm_mem_free(vm_paddr_t base, size_t length)
+{
+ int i;
+
+ if ((base & PDRMASK) != 0 || (length & PDRMASK) != 0) {
+ panic("vmm_mem_free: base 0x%0lx and length 0x%0lx must be "
+ "aligned on a 0x%0x boundary\n", base, length, NBPDR);
+ }
+
+ mtx_lock(&vmm_mem_mtx);
+
+ for (i = 0; i < vmm_mem_nsegs; i++) {
+ if (vmm_mem_avail[i].base > base)
+ break;
+ }
+
+ if (vmm_mem_nsegs >= VMM_MEM_MAXSEGS)
+ panic("vmm_mem_free: cannot free any more segments");
+
+ /* Create a new segment at index 'i' */
+ memmove(&vmm_mem_avail[i + 1], &vmm_mem_avail[i],
+ (vmm_mem_nsegs - i) * sizeof(vmm_mem_avail[0]));
+
+ vmm_mem_avail[i].base = base;
+ vmm_mem_avail[i].length = length;
+
+ vmm_mem_nsegs++;
+
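+	/*
+	 * Merge segments that are now adjacent, restarting the scan after
+	 * each merge since the array has been compacted.
+	 */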
+coalesce_some_more:
+ for (i = 0; i < vmm_mem_nsegs - 1; i++) {
+ if (vmm_mem_avail[i].base + vmm_mem_avail[i].length ==
+ vmm_mem_avail[i + 1].base) {
+ vmm_mem_avail[i].length += vmm_mem_avail[i + 1].length;
+ memmove(&vmm_mem_avail[i + 1], &vmm_mem_avail[i + 2],
+ (vmm_mem_nsegs - (i + 2)) * sizeof(vmm_mem_avail[0]));
+ vmm_mem_nsegs--;
+ goto coalesce_some_more;
+ }
+ }
+
+ mtx_unlock(&vmm_mem_mtx);
+}
+
+vm_paddr_t
+vmm_mem_maxaddr(void)
+{
+
+ return (maxaddr);
+}
+
+void
+vmm_mem_dump(void)
+{
+ int i;
+ vm_paddr_t base;
+ vm_size_t length;
+
+ mtx_lock(&vmm_mem_mtx);
+ for (i = 0; i < vmm_mem_nsegs; i++) {
+ base = vmm_mem_avail[i].base;
+ length = vmm_mem_avail[i].length;
+ printf("%-4d0x%016lx 0x%016lx\n", i, base, base + length);
+ }
+ mtx_unlock(&vmm_mem_mtx);
+}
diff --git a/sys/amd64/vmm/vmm_mem.h b/sys/amd64/vmm/vmm_mem.h
new file mode 100644
index 000000000000..ef1bf1aee32b
--- /dev/null
+++ b/sys/amd64/vmm/vmm_mem.h
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_MEM_H_
+#define _VMM_MEM_H_
+
+int vmm_mem_init(void);
+vm_paddr_t vmm_mem_alloc(size_t size);
+void vmm_mem_free(vm_paddr_t start, size_t size);
+vm_paddr_t vmm_mem_maxaddr(void);
+void vmm_mem_dump(void);
+
+#endif
diff --git a/sys/amd64/vmm/vmm_msr.c b/sys/amd64/vmm/vmm_msr.c
new file mode 100644
index 000000000000..152aa7b17e1b
--- /dev/null
+++ b/sys/amd64/vmm/vmm_msr.c
@@ -0,0 +1,264 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <machine/specialreg.h>
+#include <machine/apicreg.h>
+
+#include <machine/vmm.h>
+#include "vmm_lapic.h"
+#include "vmm_msr.h"
+
+#define VMM_MSR_F_EMULATE 0x01
+#define VMM_MSR_F_READONLY 0x02
+
+struct vmm_msr {
+ int num;
+ int flags;
+ uint64_t hostval;
+};
+
+static struct vmm_msr vmm_msr[] = {
+ { MSR_LSTAR, 0 },
+ { MSR_CSTAR, 0 },
+ { MSR_STAR, 0 },
+ { MSR_SF_MASK, 0 },
+ { MSR_APICBASE, VMM_MSR_F_EMULATE },
+ { MSR_BIOS_SIGN,VMM_MSR_F_EMULATE },
+ { MSR_MCG_CAP, VMM_MSR_F_EMULATE | VMM_MSR_F_READONLY },
+};
+
+#define vmm_msr_num (sizeof(vmm_msr) / sizeof(vmm_msr[0]))
+CTASSERT(VMM_MSR_NUM >= vmm_msr_num);
+
+#define readonly_msr(idx) \
+ ((vmm_msr[(idx)].flags & VMM_MSR_F_READONLY) != 0)
+
+#define emulated_msr(idx) \
+ ((vmm_msr[(idx)].flags & VMM_MSR_F_EMULATE) != 0)
+
+void
+vmm_msr_init(void)
+{
+ int i;
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ if (emulated_msr(i))
+ continue;
+ /*
+ * XXX this assumes that the value of the host msr does not
+ * change after we have cached it.
+ */
+ vmm_msr[i].hostval = rdmsr(vmm_msr[i].num);
+ }
+}
+
+void
+guest_msrs_init(struct vm *vm, int cpu)
+{
+ int i;
+ uint64_t *guest_msrs;
+
+ guest_msrs = vm_guest_msrs(vm, cpu);
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ switch (vmm_msr[i].num) {
+ case MSR_LSTAR:
+ case MSR_CSTAR:
+ case MSR_STAR:
+ case MSR_SF_MASK:
+ case MSR_BIOS_SIGN:
+ case MSR_MCG_CAP:
+ guest_msrs[i] = 0;
+ break;
+ case MSR_APICBASE:
+ guest_msrs[i] = DEFAULT_APIC_BASE | APICBASE_ENABLED |
+ APICBASE_X2APIC;
+ if (cpu == 0)
+ guest_msrs[i] |= APICBASE_BSP;
+ break;
+ default:
+ panic("guest_msrs_init: missing initialization for msr "
+ "0x%0x", vmm_msr[i].num);
+ }
+ }
+}
+
+static boolean_t
+x2apic_msr(u_int num)
+{
+
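+	/* MSRs 0x800 through 0xBFF are architecturally reserved for x2APIC. */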
+ if (num >= 0x800 && num <= 0xBFF)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+static u_int
+x2apic_msr_to_regoff(u_int msr)
+{
+
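+	/* Each x2APIC MSR maps to a 16-byte aligned local APIC register. */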
+ return ((msr - 0x800) << 4);
+}
+
+static boolean_t
+x2apic_msr_id(u_int num)
+{
+ return (num == 0x802);
+}
+
+static int
+msr_num_to_idx(u_int num)
+{
+ int i;
+
+ for (i = 0; i < vmm_msr_num; i++)
+ if (vmm_msr[i].num == num)
+ return (i);
+
+ return (-1);
+}
+
+int
+emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val)
+{
+ int handled, idx;
+ uint64_t *guest_msrs;
+
+ handled = 0;
+
+ if (x2apic_msr(num))
+ return (lapic_write(vm, cpu, x2apic_msr_to_regoff(num), val));
+
+ idx = msr_num_to_idx(num);
+ if (idx < 0)
+ goto done;
+
+ if (!readonly_msr(idx)) {
+ guest_msrs = vm_guest_msrs(vm, cpu);
+
+ /* Stash the value */
+ guest_msrs[idx] = val;
+
+ /* Update processor state for non-emulated MSRs */
+ if (!emulated_msr(idx))
+ wrmsr(vmm_msr[idx].num, val);
+ }
+
+ handled = 1;
+done:
+ return (handled);
+}
+
+int
+emulate_rdmsr(struct vm *vm, int cpu, u_int num)
+{
+ int error, handled, idx;
+ uint32_t eax, edx;
+ uint64_t result, *guest_msrs;
+
+ handled = 0;
+
+ if (x2apic_msr(num)) {
+ handled = lapic_read(vm, cpu, x2apic_msr_to_regoff(num),
+ &result);
+ /*
+		 * The APIC ID is returned in xAPIC format (bits 31:24) and
+		 * needs to be converted to the flat x2APIC ID layout.
+ */
+ if (x2apic_msr_id(num)) {
+ result = result >> 24;
+ }
+ goto done;
+ }
+
+ idx = msr_num_to_idx(num);
+ if (idx < 0)
+ goto done;
+
+ guest_msrs = vm_guest_msrs(vm, cpu);
+ result = guest_msrs[idx];
+
+ /*
+	 * If this is not an emulated MSR make sure that the processor
+ * state matches our cached state.
+ */
+ if (!emulated_msr(idx) && (rdmsr(num) != result)) {
+ panic("emulate_rdmsr: msr 0x%0x has inconsistent cached "
+ "(0x%016lx) and actual (0x%016lx) values", num,
+ result, rdmsr(num));
+ }
+
+ handled = 1;
+
+done:
+ if (handled) {
+ eax = result;
+ edx = result >> 32;
+ error = vm_set_register(vm, cpu, VM_REG_GUEST_RAX, eax);
+ if (error)
+ panic("vm_set_register(rax) error %d", error);
+ error = vm_set_register(vm, cpu, VM_REG_GUEST_RDX, edx);
+ if (error)
+ panic("vm_set_register(rdx) error %d", error);
+ }
+ return (handled);
+}
+
+void
+restore_guest_msrs(struct vm *vm, int cpu)
+{
+ int i;
+ uint64_t *guest_msrs;
+
+ guest_msrs = vm_guest_msrs(vm, cpu);
+
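+	/* Emulated MSRs never touch the hardware; load only the others. */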
+ for (i = 0; i < vmm_msr_num; i++) {
+ if (emulated_msr(i))
+ continue;
+ else
+ wrmsr(vmm_msr[i].num, guest_msrs[i]);
+ }
+}
+
+void
+restore_host_msrs(struct vm *vm, int cpu)
+{
+ int i;
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ if (emulated_msr(i))
+ continue;
+ else
+ wrmsr(vmm_msr[i].num, vmm_msr[i].hostval);
+ }
+}
diff --git a/sys/amd64/vmm/vmm_msr.h b/sys/amd64/vmm/vmm_msr.h
new file mode 100644
index 000000000000..1e157876e8af
--- /dev/null
+++ b/sys/amd64/vmm/vmm_msr.h
@@ -0,0 +1,42 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_MSR_H_
+#define _VMM_MSR_H_
+
+#define VMM_MSR_NUM 16
+struct vm;
+
+void vmm_msr_init(void);
+int emulate_wrmsr(struct vm *vm, int vcpu, u_int msr, uint64_t val);
+int emulate_rdmsr(struct vm *vm, int vcpu, u_int msr);
+void guest_msrs_init(struct vm *vm, int cpu);
+void restore_host_msrs(struct vm *vm, int cpu);
+void restore_guest_msrs(struct vm *vm, int cpu);
+
+#endif
diff --git a/sys/amd64/vmm/vmm_stat.c b/sys/amd64/vmm/vmm_stat.c
new file mode 100644
index 000000000000..e6f5c48d2d3a
--- /dev/null
+++ b/sys/amd64/vmm/vmm_stat.c
@@ -0,0 +1,103 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+
+#include <machine/vmm.h>
+#include "vmm_stat.h"
+
+static int vstnum;
+static struct vmm_stat_type *vsttab[MAX_VMM_STAT_TYPES];
+
+static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat");
+
+void
+vmm_stat_init(void *arg)
+{
+ struct vmm_stat_type *vst = arg;
+
+ /* We require all stats to identify themselves with a description */
+ if (vst->desc == NULL)
+ return;
+
+ if (vstnum >= MAX_VMM_STAT_TYPES) {
+		printf("Cannot accommodate vmm stat type \"%s\"!\n", vst->desc);
+ return;
+ }
+
+ vst->index = vstnum;
+ vsttab[vstnum++] = vst;
+}
+
+int
+vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf)
+{
+ int i;
+ uint64_t *stats;
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ stats = vcpu_stats(vm, vcpu);
+ for (i = 0; i < vstnum; i++)
+ buf[i] = stats[i];
+ *num_stats = vstnum;
+ return (0);
+}
+
+void *
+vmm_stat_alloc(void)
+{
+ u_long size;
+
+ size = vstnum * sizeof(uint64_t);
+
+ return (malloc(size, M_VMM_STAT, M_ZERO | M_WAITOK));
+}
+
+void
+vmm_stat_free(void *vp)
+{
+ free(vp, M_VMM_STAT);
+}
+
+const char *
+vmm_stat_desc(int index)
+{
+
+ if (index >= 0 && index < vstnum)
+ return (vsttab[index]->desc);
+ else
+ return (NULL);
+}
diff --git a/sys/amd64/vmm/vmm_stat.h b/sys/amd64/vmm/vmm_stat.h
new file mode 100644
index 000000000000..7c075a6a7602
--- /dev/null
+++ b/sys/amd64/vmm/vmm_stat.h
@@ -0,0 +1,71 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_STAT_H_
+#define _VMM_STAT_H_
+
+struct vm;
+
+#define MAX_VMM_STAT_TYPES 64 /* arbitrary */
+
+struct vmm_stat_type {
+ const char *desc; /* description of statistic */
+ int index; /* position in the stats buffer */
+};
+
+void vmm_stat_init(void *arg);
+
+#define VMM_STAT_DEFINE(type, desc) \
+ struct vmm_stat_type type[1] = { \
+ { desc, -1 } \
+ }; \
+ SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_init, type)
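+
+/*
+ * For example, a subsystem would declare a counter (hypothetical name):
+ *
+ *	VMM_STAT_DEFINE(VMEXIT_COUNT, "total number of vm exits");
+ *
+ * and bump it at runtime with vmm_stat_incr(vm, vcpu, VMEXIT_COUNT, 1).
+ */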
+
+void *vmm_stat_alloc(void);
+void vmm_stat_free(void *vp);
+
+/*
+ * 'buf' must be large enough to hold 'MAX_VMM_STAT_TYPES' entries
+ */
+int vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf);
+const char *vmm_stat_desc(int index);
+
+static __inline void
+vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x)
+{
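+	/* Statistics are compiled out entirely unless VMM_KEEP_STATS is set. */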
+#ifdef VMM_KEEP_STATS
+ uint64_t *stats = vcpu_stats(vm, vcpu);
+ if (vst->index >= 0)
+ stats[vst->index] += x;
+#endif
+}
+
+#endif
diff --git a/sys/amd64/vmm/vmm_support.S b/sys/amd64/vmm/vmm_support.S
new file mode 100644
index 000000000000..2afc608ae71e
--- /dev/null
+++ b/sys/amd64/vmm/vmm_support.S
@@ -0,0 +1,42 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#define LOCORE
+
+#include <machine/asmacros.h>
+
+#define LA_EOI 0xB0
+
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(justreturn)
+ pushq %rax
+ movq lapic, %rax
+ movl $0, LA_EOI(%rax)
+ popq %rax
+ iretq
diff --git a/sys/amd64/vmm/vmm_util.c b/sys/amd64/vmm/vmm_util.c
new file mode 100644
index 000000000000..f245f922120f
--- /dev/null
+++ b/sys/amd64/vmm/vmm_util.c
@@ -0,0 +1,111 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/libkern.h>
+
+#include <machine/md_var.h>
+
+#include "vmm_util.h"
+
+boolean_t
+vmm_is_intel(void)
+{
+
+ if (strcmp(cpu_vendor, "GenuineIntel") == 0)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+boolean_t
+vmm_is_amd(void)
+{
+ if (strcmp(cpu_vendor, "AuthenticAMD") == 0)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+boolean_t
+vmm_supports_1G_pages(void)
+{
+ unsigned int regs[4];
+
+ /*
+ * CPUID.80000001:EDX[bit 26] = 1 indicates support for 1GB pages
+ *
+ * Both Intel and AMD support this bit.
+ */
+ if (cpu_exthigh >= 0x80000001) {
+ do_cpuid(0x80000001, regs);
+ if (regs[3] & (1 << 26))
+ return (TRUE);
+ }
+ return (FALSE);
+}
+
+#include <sys/proc.h>
+#include <machine/frame.h>
+#define DUMP_REG(x) printf(#x "\t\t0x%016lx\n", (long)(tf->tf_ ## x))
+#define DUMP_SEG(x) printf(#x "\t\t0x%04x\n", (unsigned)(tf->tf_ ## x))
+void
+dump_trapframe(struct trapframe *tf)
+{
+ DUMP_REG(rdi);
+ DUMP_REG(rsi);
+ DUMP_REG(rdx);
+ DUMP_REG(rcx);
+ DUMP_REG(r8);
+ DUMP_REG(r9);
+ DUMP_REG(rax);
+ DUMP_REG(rbx);
+ DUMP_REG(rbp);
+ DUMP_REG(r10);
+ DUMP_REG(r11);
+ DUMP_REG(r12);
+ DUMP_REG(r13);
+ DUMP_REG(r14);
+ DUMP_REG(r15);
+ DUMP_REG(trapno);
+ DUMP_REG(addr);
+ DUMP_REG(flags);
+ DUMP_REG(err);
+ DUMP_REG(rip);
+ DUMP_REG(rflags);
+ DUMP_REG(rsp);
+ DUMP_SEG(cs);
+ DUMP_SEG(ss);
+ DUMP_SEG(fs);
+ DUMP_SEG(gs);
+ DUMP_SEG(es);
+ DUMP_SEG(ds);
+}
diff --git a/sys/amd64/vmm/vmm_util.h b/sys/amd64/vmm/vmm_util.h
new file mode 100644
index 000000000000..7f82332923a0
--- /dev/null
+++ b/sys/amd64/vmm/vmm_util.h
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_UTIL_H_
+#define _VMM_UTIL_H_
+
+struct trapframe;
+
+boolean_t vmm_is_intel(void);
+boolean_t vmm_is_amd(void);
+boolean_t vmm_supports_1G_pages(void);
+
+void dump_trapframe(struct trapframe *tf);
+
+#endif
diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c
new file mode 100644
index 000000000000..45c4c53c199a
--- /dev/null
+++ b/sys/amd64/vmm/x86.c
@@ -0,0 +1,113 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <machine/cpufunc.h>
+#include <machine/specialreg.h>
+
+#include "x86.h"
+
+int
+x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
+{
+ unsigned int func, regs[4];
+
+ func = *eax;
+
+ cpuid_count(*eax, *ecx, regs);
+
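+	/* Start from the host's CPUID values and filter selected leaves. */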
+ switch(func) {
+ case CPUID_0000_0000:
+ case CPUID_0000_0002:
+ case CPUID_0000_0003:
+ case CPUID_0000_0004:
+ case CPUID_0000_000A:
+ break;
+
+ case CPUID_8000_0000:
+ case CPUID_8000_0001:
+ case CPUID_8000_0002:
+ case CPUID_8000_0003:
+ case CPUID_8000_0004:
+ case CPUID_8000_0006:
+ case CPUID_8000_0007:
+ case CPUID_8000_0008:
+
+ break;
+
+ case CPUID_0000_0001:
+ /*
+ * Override the APIC ID only in ebx
+ */
+ regs[1] &= ~(CPUID_0000_0001_APICID_MASK);
+ /*
+ * XXX fixme for MP case, set apicid properly for cpu.
+ */
+ regs[1] |= (0 << CPUID_0000_0001_APICID_SHIFT);
+
+ /*
+ * Don't expose VMX capability.
+ * Advertise x2APIC capability.
+ */
+ regs[2] &= ~CPUID_0000_0001_FEAT0_VMX;
+ regs[2] |= CPUID2_X2APIC;
+
+ /*
+ * Machine check handling is done in the host.
+ * Hide MTRR capability.
+ */
+ regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR);
+
+ break;
+
+ case CPUID_0000_000B:
+ /*
+ * XXXSMP fixme
+ * Processor topology enumeration
+ */
+ regs[0] = 0;
+ regs[1] = 0;
+ regs[2] = *ecx & 0xff;
+ regs[3] = 0;
+ break;
+
+ default:
+ return (0);
+ }
+
+ *eax = regs[0];
+ *ebx = regs[1];
+ *ecx = regs[2];
+ *edx = regs[3];
+ return (1);
+}
+
diff --git a/sys/amd64/vmm/x86.h b/sys/amd64/vmm/x86.h
new file mode 100644
index 000000000000..bc4f8a45c78a
--- /dev/null
+++ b/sys/amd64/vmm/x86.h
@@ -0,0 +1,62 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _X86_H_
+#define _X86_H_
+
+#define CPUID_0000_0000 (0x0)
+#define CPUID_0000_0001 (0x1)
+#define CPUID_0000_0002 (0x2)
+#define CPUID_0000_0003 (0x3)
+#define CPUID_0000_0004 (0x4)
+#define CPUID_0000_000A (0xA)
+#define CPUID_0000_000B (0xB)
+#define CPUID_8000_0000 (0x80000000)
+#define CPUID_8000_0001 (0x80000001)
+#define CPUID_8000_0002 (0x80000002)
+#define CPUID_8000_0003 (0x80000003)
+#define CPUID_8000_0004 (0x80000004)
+#define CPUID_8000_0006 (0x80000006)
+#define CPUID_8000_0007 (0x80000007)
+#define CPUID_8000_0008 (0x80000008)
+
+/*
+ * CPUID instruction Fn0000_0001:
+ */
+#define CPUID_0000_0001_APICID_MASK (0xff<<24)
+#define CPUID_0000_0001_APICID_SHIFT 24
+
+/*
+ * CPUID instruction Fn0000_0001 ECX
+ */
+#define CPUID_0000_0001_FEAT0_VMX (1<<5)
+
+int x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
+ uint32_t *edx);
+
+#endif
diff --git a/sys/modules/Makefile b/sys/modules/Makefile
index e781da0a33f5..127c3e01a2d3 100644
--- a/sys/modules/Makefile
+++ b/sys/modules/Makefile
@@ -290,6 +290,7 @@ SUBDIR= ${_3dfx} \
${_vesa} \
vge \
vkbd \
+ ${_vmm} \
${_vpo} \
vr \
vx \
@@ -557,6 +558,7 @@ _sppp= sppp
_tmpfs= tmpfs
_twa= twa
_vesa= vesa
+_vmm= vmm
_x86bios= x86bios
_wi= wi
_wpi= wpi
diff --git a/sys/modules/vmm/Makefile b/sys/modules/vmm/Makefile
new file mode 100644
index 000000000000..67c5b0f33ccc
--- /dev/null
+++ b/sys/modules/vmm/Makefile
@@ -0,0 +1,66 @@
+# $FreeBSD$
+
+# *REQUIRES* binutils 2.20.1 for VT-x instructions
+AS= /usr/local/bin/as
+LD= /usr/local/bin/ld
+CFLAGS+= -B /usr/local/bin
+
+KMOD= vmm
+
+SRCS= device_if.h bus_if.h pci_if.h
+
+CFLAGS+= -DVMM_KEEP_STATS
+CFLAGS+= -DOLD_BINUTILS
+CFLAGS+= -I${.CURDIR}/../../amd64/vmm
+CFLAGS+= -I${.CURDIR}/../../amd64/vmm/io
+CFLAGS+= -I${.CURDIR}/../../amd64/vmm/intel
+
+# generic vmm support
+.PATH: ${.CURDIR}/../../amd64/vmm
+SRCS+= vmm.c \
+ vmm_dev.c \
+ vmm_ipi.c \
+ vmm_lapic.c \
+ vmm_mem.c \
+ vmm_msr.c \
+ vmm_stat.c \
+ vmm_util.c \
+ x86.c \
+ vmm_support.S
+
+.PATH: ${.CURDIR}/../../amd64/vmm/io
+SRCS+= iommu.c \
+ ppt.c \
+ vdev.c \
+ vlapic.c
+
+# intel-specific files
+.PATH: ${.CURDIR}/../../amd64/vmm/intel
+SRCS+= ept.c \
+ vmcs.c \
+ vmx_msr.c \
+ vmx.c \
+ vtd.c
+
+# amd-specific files
+.PATH: ${.CURDIR}/../../amd64/vmm/amd
+SRCS+= amdv.c
+
+OBJS= vmx_support.o
+
+CLEANFILES= vmx_assym.s vmx_genassym.o
+
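+# genassym.sh post-processes the compiled vmx_genassym.o into assembler
+# constant definitions (vmx_assym.s) used when building vmx_support.S.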
+vmx_assym.s: vmx_genassym.o
+.if exists(@)
+vmx_assym.s: @/kern/genassym.sh
+.endif
+ sh @/kern/genassym.sh vmx_genassym.o > ${.TARGET}
+
+vmx_support.o: vmx_support.S vmx_assym.s
+ ${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \
+ ${.IMPSRC} -o ${.TARGET}
+
+vmx_genassym.o: vmx_genassym.c @ machine
+ ${CC} -c ${CFLAGS:N-fno-common} ${.IMPSRC}
+
+.include <bsd.kmod.mk>
diff --git a/usr.sbin/Makefile b/usr.sbin/Makefile
index 44f20a49cee8..fc527b746725 100644
--- a/usr.sbin/Makefile
+++ b/usr.sbin/Makefile
@@ -19,6 +19,7 @@ SUBDIR= ${_ac} \
${_auditd} \
${_auditreduce} \
${_authpf} \
+ ${_bhyve} \
${_bluetooth} \
${_boot0cfg} \
${_boot98cfg} \
@@ -194,6 +195,7 @@ SUBDIR= ${_ac} \
${_usbdevs} \
${_usbconfig} \
${_vidcontrol} \
+ ${_vmmctl} \
vipw \
wake \
watch \
@@ -477,6 +479,7 @@ _boot98cfg= boot98cfg
_acpi= acpi
.endif
_asf= asf
+_bhyve= bhyve
_boot0cfg= boot0cfg
.if ${MK_TOOLCHAIN} != "no"
_btxld= btxld
@@ -494,6 +497,7 @@ _ndiscvt= ndiscvt
.endif
_sicontrol= sicontrol
_spkrtest= spkrtest
+_vmmctl= vmmctl
_zzz= zzz
.endif
diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile
new file mode 100644
index 000000000000..71df082dcf7a
--- /dev/null
+++ b/usr.sbin/bhyve/Makefile
@@ -0,0 +1,18 @@
+#
+# $FreeBSD$
+#
+
+PROG= bhyve
+
+SRCS= atpic.c consport.c dbgport.c elcr.c fbsdrun.c inout.c mevent.c
+SRCS+= pci_emul.c pci_hostbridge.c pci_passthru.c pci_virtio_block.c
+SRCS+= pci_virtio_net.c pit_8254.c post.c rtc.c uart.c xmsr.c
+
+NO_MAN=
+
+DPADD= ${LIBVMMAPI} ${LIBMD} ${LIBPTHREAD}
+LDADD= -lvmmapi -lmd -lpthread
+
+CFLAGS+= -I${.CURDIR}/../../sys
+
+.include <bsd.prog.mk>
diff --git a/usr.sbin/bhyve/atpic.c b/usr.sbin/bhyve/atpic.c
new file mode 100644
index 000000000000..a9fb0842c358
--- /dev/null
+++ b/usr.sbin/bhyve/atpic.c
@@ -0,0 +1,68 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include "inout.h"
+
+/*
+ * FreeBSD only writes to the 8259 interrupt controllers to put them in a
+ * shutdown state.
+ *
+ * So, we just ignore the writes.
+ */
+
+#define IO_ICU1 0x20
+#define IO_ICU2 0xA0
+#define ICU_IMR_OFFSET 1
+
+static int
+atpic_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ if (bytes != 1)
+ return (-1);
+
+ if (in)
+ return (-1);
+
+ /* Pretend all writes to the 8259 are alright */
+ return (0);
+}
+
+INOUT_PORT(atpic, IO_ICU1, IOPORT_F_INOUT, atpic_handler);
+INOUT_PORT(atpic, IO_ICU1 + ICU_IMR_OFFSET, IOPORT_F_INOUT, atpic_handler);
+INOUT_PORT(atpic, IO_ICU2, IOPORT_F_INOUT, atpic_handler);
+INOUT_PORT(atpic, IO_ICU2 + ICU_IMR_OFFSET, IOPORT_F_INOUT, atpic_handler);
diff --git a/usr.sbin/bhyve/consport.c b/usr.sbin/bhyve/consport.c
new file mode 100644
index 000000000000..34f94a6cc775
--- /dev/null
+++ b/usr.sbin/bhyve/consport.c
@@ -0,0 +1,121 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/select.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <termios.h>
+#include <unistd.h>
+#include <stdbool.h>
+
+#include "inout.h"
+
+#define BVM_CONSOLE_PORT 0x220
+
+static struct termios tio_orig, tio_new;
+
+static void
+ttyclose(void)
+{
+ tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig);
+}
+
+static void
+ttyopen(void)
+{
+ tcgetattr(STDIN_FILENO, &tio_orig);
+
+ cfmakeraw(&tio_new);
+ tcsetattr(STDIN_FILENO, TCSANOW, &tio_new);
+
+ atexit(ttyclose);
+}
+
+static bool
+tty_char_available(void)
+{
+ fd_set rfds;
+ struct timeval tv;
+
+ FD_ZERO(&rfds);
+ FD_SET(STDIN_FILENO, &rfds);
+ tv.tv_sec = 0;
+ tv.tv_usec = 0;
+ if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) {
+ return (true);
+ } else {
+ return (false);
+ }
+}
+
+static int
+ttyread(void)
+{
+ char rb;
+
+ if (tty_char_available()) {
+ read(STDIN_FILENO, &rb, 1);
+ return (rb & 0xff);
+ } else {
+ return (-1);
+ }
+}
+
+static void
+ttywrite(unsigned char wb)
+{
+ (void) write(STDOUT_FILENO, &wb, 1);
+}
+
+static int
+console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ static int opened;
+
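+	/* The console protocol uses full 4-byte in/out accesses only. */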
+ if (bytes != 4)
+ return (-1);
+
+ if (!opened) {
+ ttyopen();
+ opened = 1;
+ }
+
+ if (in)
+ *eax = ttyread();
+ else
+ ttywrite(*eax);
+
+ return (0);
+}
+INOUT_PORT(console, BVM_CONSOLE_PORT, IOPORT_F_INOUT, console_handler);
diff --git a/usr.sbin/bhyve/dbgport.c b/usr.sbin/bhyve/dbgport.c
new file mode 100644
index 000000000000..be919e1ea590
--- /dev/null
+++ b/usr.sbin/bhyve/dbgport.c
@@ -0,0 +1,124 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <sys/uio.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include "inout.h"
+
+#define BVM_DBG_PORT 0x224
+
+static int listen_fd, conn_fd;
+
+static struct sockaddr_in sin;
+
+void
+init_dbgport(int sport)
+{
+ conn_fd = -1;
+
+ if ((listen_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+ perror("socket");
+ exit(1);
+ }
+
+ sin.sin_len = sizeof(sin);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = htonl(INADDR_ANY);
+ sin.sin_port = htons(sport);
+
+ if (bind(listen_fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
+ perror("bind");
+ exit(1);
+ }
+
+ if (listen(listen_fd, 1) < 0) {
+ perror("listen");
+ exit(1);
+ }
+}
+
+static int
+dbg_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ char ch;
+ int nwritten, nread, printonce;
+
+ if (bytes != 4)
+ return (-1);
+
+again:
+ printonce = 0;
+ while (conn_fd < 0) {
+ if (!printonce) {
+ printf("Waiting for connection from gdb\r\n");
+ printonce = 1;
+ }
+ conn_fd = accept(listen_fd, NULL, NULL);
+ if (conn_fd >= 0)
+ fcntl(conn_fd, F_SETFL, O_NONBLOCK);
+ else if (errno != EINTR)
+ perror("accept");
+ }
+
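+	/* The connection is non-blocking: EAGAIN means no byte is pending. */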
+ if (in) {
+ nread = read(conn_fd, &ch, 1);
+ if (nread == -1 && errno == EAGAIN)
+ *eax = -1;
+ else if (nread == 1)
+ *eax = ch;
+ else {
+ close(conn_fd);
+ conn_fd = -1;
+ goto again;
+ }
+ } else {
+ ch = *eax;
+ nwritten = write(conn_fd, &ch, 1);
+ if (nwritten != 1) {
+ close(conn_fd);
+ conn_fd = -1;
+ goto again;
+ }
+ }
+ return (0);
+}
+
+INOUT_PORT(dbg, BVM_DBG_PORT, IOPORT_F_INOUT, dbg_handler);
diff --git a/usr.sbin/bhyve/dbgport.h b/usr.sbin/bhyve/dbgport.h
new file mode 100644
index 000000000000..8c7dab7f83cc
--- /dev/null
+++ b/usr.sbin/bhyve/dbgport.h
@@ -0,0 +1,36 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _DBGPORT_H_
+#define _DBGPORT_H_
+
+#define DEFAULT_GDB_PORT 6466
+
+void init_dbgport(int port);
+
+#endif
diff --git a/usr.sbin/bhyve/elcr.c b/usr.sbin/bhyve/elcr.c
new file mode 100644
index 000000000000..2417ae1bb276
--- /dev/null
+++ b/usr.sbin/bhyve/elcr.c
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include "inout.h"
+
+/*
+ * EISA interrupt Level Control Register.
+ *
+ * This is a 16-bit register with one bit for each of IRQ0 through IRQ15.
+ * A level-triggered IRQ is indicated by setting the corresponding bit to '1'.
+ */
+#define ELCR_PORT 0x4d0
+
+static uint8_t elcr[2] = { 0x00, 0x00 };
+
+static int
+elcr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ int idx;
+
+ if (bytes != 1)
+ return (-1);
+
+ idx = port - ELCR_PORT;
+
+ if (in)
+ *eax = elcr[idx];
+ else
+ elcr[idx] = *eax;
+
+ return (0);
+}
+INOUT_PORT(elcr, ELCR_PORT + 0, IOPORT_F_INOUT, elcr_handler);
+INOUT_PORT(elcr, ELCR_PORT + 1, IOPORT_F_INOUT, elcr_handler);
diff --git a/usr.sbin/bhyve/fbsdrun.c b/usr.sbin/bhyve/fbsdrun.c
new file mode 100644
index 000000000000..ddbe709b8d57
--- /dev/null
+++ b/usr.sbin/bhyve/fbsdrun.c
@@ -0,0 +1,650 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/time.h>
+
+#include <machine/segments.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <libgen.h>
+#include <unistd.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <pthread.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "fbsdrun.h"
+#include "inout.h"
+#include "dbgport.h"
+#include "mevent.h"
+#include "pci_emul.h"
+#include "xmsr.h"
+
+#define DEFAULT_GUEST_HZ 100
+#define DEFAULT_GUEST_TSLICE 200
+
+#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */
+
+#define VMEXIT_SWITCH 0 /* force vcpu switch in mux mode */
+#define VMEXIT_CONTINUE 1 /* continue from next instruction */
+#define VMEXIT_RESTART 2 /* restart current instruction */
+#define VMEXIT_ABORT 3 /* abort the vm run loop */
+#define VMEXIT_RESET 4 /* guest machine has reset */
+
+#define MB (1024UL * 1024)
+#define GB (1024UL * MB)
+
+typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
+
+int guest_tslice = DEFAULT_GUEST_TSLICE;
+int guest_hz = DEFAULT_GUEST_HZ;
+char *vmname;
+
+u_long lomem_sz;
+u_long himem_sz;
+
+int guest_ncpus;
+
+static int pincpu = -1;
+static int guest_vcpu_mux;
+static int guest_vmexit_on_hlt, guest_vmexit_on_pause;
+
+static int foundcpus;
+
+static char *lomem_addr;
+static char *himem_addr;
+
+static char *progname;
+static const int BSP = 0;
+
+static int cpumask;
+
+static void *oem_tbl_start;
+static int oem_tbl_size;
+
+static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
+
+struct vm_exit vmexit[VM_MAXCPU];
+
+struct fbsdstats {
+ uint64_t vmexit_bogus;
+ uint64_t vmexit_bogus_switch;
+ uint64_t vmexit_hlt;
+ uint64_t vmexit_pause;
+ uint64_t vmexit_mtrap;
+ uint64_t cpu_switch_rotate;
+ uint64_t cpu_switch_direct;
+ int io_reset;
+} stats;
+
+struct mt_vmm_info {
+ pthread_t mt_thr;
+ struct vmctx *mt_ctx;
+ int mt_vcpu;
+} mt_vmm_info[VM_MAXCPU];
+
+static void
+usage(int code)
+{
+
+ fprintf(stderr,
+		"Usage: %s [-hBHPx][-g <gdb port>][-z <hz>][-s <pci>][-p pincpu]"
+		"[-c <cpus>][-t <tslice>][-n <pci>][-m lowmem][-M highmem] <vm>\n"
+ " -g: gdb port (default is %d and 0 means don't open)\n"
+ " -c: # cpus (default 1)\n"
+ " -p: pin vcpu 'n' to host cpu 'pincpu + n'\n"
+ " -B: inject breakpoint exception on vm entry\n"
+ " -H: vmexit from the guest on hlt\n"
+ " -P: vmexit from the guest on pause\n"
+ " -h: help\n"
+ " -z: guest hz (default is %d)\n"
+ " -s: <slot,driver,configinfo> PCI slot config\n"
+ " -n: <slot,name> PCI slot naming\n"
+ " -m: lowmem in MB\n"
+ " -M: highmem in MB\n"
+ " -x: mux vcpus to 1 hcpu\n"
+ " -t: mux vcpu timeslice hz (default %d)\n",
+ progname, DEFAULT_GDB_PORT, DEFAULT_GUEST_HZ,
+ DEFAULT_GUEST_TSLICE);
+ exit(code);
+}
+
+void *
+paddr_guest2host(uintptr_t gaddr)
+{
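+	/*
+	 * Guest physical addresses below lomem_sz map into the low segment;
+	 * addresses at or above 4GB map into the high segment. The gap in
+	 * between is unmapped.
+	 */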
+ if (lomem_sz == 0)
+ return (NULL);
+
+ if (gaddr < lomem_sz) {
+ return ((void *)(lomem_addr + gaddr));
+ } else if (gaddr >= 4*GB && gaddr < (4*GB + himem_sz)) {
+ return ((void *)(himem_addr + gaddr - 4*GB));
+ } else
+ return (NULL);
+}
+
+void
+fbsdrun_add_oemtbl(void *tbl, int tblsz)
+{
+ oem_tbl_start = tbl;
+ oem_tbl_size = tblsz;
+}
+
+int
+fbsdrun_vmexit_on_pause(void)
+{
+
+ return (guest_vmexit_on_pause);
+}
+
+int
+fbsdrun_vmexit_on_hlt(void)
+{
+
+ return (guest_vmexit_on_hlt);
+}
+
+int
+fbsdrun_muxed(void)
+{
+
+ return (guest_vcpu_mux);
+}
+
+void *
+fbsdrun_start_thread(void *param)
+{
+ int vcpu;
+ struct mt_vmm_info *mtp = param;
+
+ vcpu = mtp->mt_vcpu;
+ vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip);
+
+ /* not reached */
+ exit(1);
+ return (NULL);
+}
+
+void
+fbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t rip)
+{
+ int error;
+
+ if (cpumask & (1 << vcpu)) {
+ printf("addcpu: attempting to add existing cpu %d\n", vcpu);
+ exit(1);
+ }
+
+ cpumask |= 1 << vcpu;
+ foundcpus++;
+
+ /*
+ * Set up the vmexit struct to allow execution to start
+ * at the given RIP
+ */
+ vmexit[vcpu].rip = rip;
+ vmexit[vcpu].inst_length = 0;
+
+ if (vcpu == BSP || !guest_vcpu_mux){
+ mt_vmm_info[vcpu].mt_ctx = ctx;
+ mt_vmm_info[vcpu].mt_vcpu = vcpu;
+
+ error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL,
+ fbsdrun_start_thread, &mt_vmm_info[vcpu]);
+ assert(error == 0);
+ }
+}
+
+static int
+fbsdrun_get_next_cpu(int curcpu)
+{
+
+ /*
+ * Get the next available CPU. Assumes they arrive
+ * in ascending order with no gaps.
+ */
+ return ((curcpu + 1) % foundcpus);
+}
+
+int
+vmexit_catch_reset(void)
+{
+ stats.io_reset++;
+ return (VMEXIT_RESET);
+}
+
+int
+vmexit_catch_inout(void)
+{
+ return (VMEXIT_ABORT);
+}
+
+int
+vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
+ uint32_t eax)
+{
+#if PG_DEBUG /* put all types of debug here */
+ if (eax == 0) {
+ pause_noswitch = 1;
+ } else if (eax == 1) {
+ pause_noswitch = 0;
+ } else {
+ pause_noswitch = 0;
+ if (eax == 5) {
+ vm_set_capability(ctx, *pvcpu, VM_CAP_MTRAP_EXIT, 1);
+ }
+ }
+#endif
+ return (VMEXIT_CONTINUE);
+}
+
+static int
+vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+ int error;
+ int bytes, port, in, out;
+ uint32_t eax;
+ int vcpu;
+
+ vcpu = *pvcpu;
+
+ port = vme->u.inout.port;
+ bytes = vme->u.inout.bytes;
+ eax = vme->u.inout.eax;
+ in = vme->u.inout.in;
+ out = !in;
+
+ /* We don't deal with these */
+ if (vme->u.inout.string || vme->u.inout.rep)
+ return (VMEXIT_ABORT);
+
+ /* Special case of guest reset */
+ if (out && port == 0x64 && (uint8_t)eax == 0xFE)
+ return (vmexit_catch_reset());
+
+ /* Extra-special case of host notifications */
+ if (out && port == GUEST_NIO_PORT)
+ return (vmexit_handle_notify(ctx, vme, pvcpu, eax));
+
+ error = emulate_inout(ctx, vcpu, in, port, bytes, &eax);
+ if (error == 0 && in)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, eax);
+
+ if (error == 0)
+ return (VMEXIT_CONTINUE);
+ else {
+ fprintf(stderr, "Unhandled %s%c 0x%04x\n",
+ in ? "in" : "out",
+ bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port);
+ return (vmexit_catch_inout());
+ }
+}
+
+static int
+vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+ printf("vm exit rdmsr 0x%x, cpu %d\n", vme->u.msr.code, *pvcpu);
+ return (VMEXIT_ABORT);
+}
+
+static int
+vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+ int newcpu;
+ int retval = VMEXIT_CONTINUE;
+
+	newcpu = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval);
+
+ if (guest_vcpu_mux && *pvcpu != newcpu) {
+ retval = VMEXIT_SWITCH;
+ *pvcpu = newcpu;
+ }
+
+ return (retval);
+}
+
+static int
+vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+
+ printf("vm exit[%d]\n", *pvcpu);
+ printf("\treason\t\tVMX\n");
+ printf("\trip\t\t0x%016lx\n", vmexit->rip);
+ printf("\tinst_length\t%d\n", vmexit->inst_length);
+ printf("\terror\t\t%d\n", vmexit->u.vmx.error);
+ printf("\texit_reason\t%u\n", vmexit->u.vmx.exit_reason);
+ printf("\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification);
+
+ return (VMEXIT_ABORT);
+}
+
+static int bogus_noswitch = 1;
+
+static int
+vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ stats.vmexit_bogus++;
+
+ if (!guest_vcpu_mux || guest_ncpus == 1 || bogus_noswitch) {
+ return (VMEXIT_RESTART);
+ } else {
+ stats.vmexit_bogus_switch++;
+ vmexit->inst_length = 0;
+ *pvcpu = -1;
+ return (VMEXIT_SWITCH);
+ }
+}
+
+static int
+vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ stats.vmexit_hlt++;
+ if (fbsdrun_muxed()) {
+ *pvcpu = -1;
+ return (VMEXIT_SWITCH);
+ } else {
+ /*
+ * Just continue execution with the next instruction. We use
+ * the HLT VM exit as a way to be friendly with the host
+ * scheduler.
+ */
+ return (VMEXIT_CONTINUE);
+ }
+}
+
+static int pause_noswitch;
+
+static int
+vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ stats.vmexit_pause++;
+
+ if (fbsdrun_muxed() && !pause_noswitch) {
+ *pvcpu = -1;
+ return (VMEXIT_SWITCH);
+ } else {
+ return (VMEXIT_CONTINUE);
+ }
+}
+
+static int
+vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ stats.vmexit_mtrap++;
+
+ return (VMEXIT_RESTART);
+}
+
+static void
+sigalrm(int sig)
+{
+ return;
+}
+
+static void
+setup_timeslice(void)
+{
+ struct sigaction sa;
+ struct itimerval itv;
+ int error;
+
+ /*
+ * Setup a realtime timer to generate a SIGALRM at a
+ * frequency of 'guest_tslice' ticks per second.
+ */
+ sigemptyset(&sa.sa_mask);
+ sa.sa_flags = 0;
+ sa.sa_handler = sigalrm;
+
+ error = sigaction(SIGALRM, &sa, NULL);
+ assert(error == 0);
+
+ itv.it_interval.tv_sec = 0;
+ itv.it_interval.tv_usec = 1000000 / guest_tslice;
+ itv.it_value.tv_sec = 0;
+ itv.it_value.tv_usec = 1000000 / guest_tslice;
+
+ error = setitimer(ITIMER_REAL, &itv, NULL);
+ assert(error == 0);
+}
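+
+/*
+ * Illustrative arithmetic (value hypothetical): with guest_tslice = 100,
+ * the interval set above is 1000000 / 100 = 10000us, so a muxed guest
+ * receives a SIGALRM every 10ms. The pending signal forces the in-kernel
+ * vcpu loop back to userspace (seen as a 'bogus' exit handled by
+ * vmexit_bogus() above), letting vm_loop() rotate to the next vcpu.
+ */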
+
+static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
+ [VM_EXITCODE_INOUT] = vmexit_inout,
+ [VM_EXITCODE_VMX] = vmexit_vmx,
+ [VM_EXITCODE_BOGUS] = vmexit_bogus,
+ [VM_EXITCODE_RDMSR] = vmexit_rdmsr,
+ [VM_EXITCODE_WRMSR] = vmexit_wrmsr,
+ [VM_EXITCODE_MTRAP] = vmexit_mtrap,
+};
+
+static void
+vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
+{
+ int error, rc, prevcpu;
+
+ if (guest_vcpu_mux)
+ setup_timeslice();
+
+ if (pincpu >= 0) {
+ error = vm_set_pinning(ctx, vcpu, pincpu + vcpu);
+ assert(error == 0);
+ }
+
+ while (1) {
+ error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]);
+ if (error != 0)
+ break;
+
+ prevcpu = vcpu;
+ rc = (*handler[vmexit[vcpu].exitcode])(ctx, &vmexit[vcpu],
+ &vcpu);
+ switch (rc) {
+ case VMEXIT_SWITCH:
+ assert(guest_vcpu_mux);
+ if (vcpu == -1) {
+ stats.cpu_switch_rotate++;
+ vcpu = fbsdrun_get_next_cpu(prevcpu);
+ } else {
+ stats.cpu_switch_direct++;
+ }
+ /* fall through */
+ case VMEXIT_CONTINUE:
+ rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length;
+ break;
+ case VMEXIT_RESTART:
+ rip = vmexit[vcpu].rip;
+ break;
+ case VMEXIT_RESET:
+ exit(0);
+ default:
+ exit(1);
+ }
+ }
+ fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
+}
+
+
+int
+main(int argc, char *argv[])
+{
+ int c, error, gdb_port, inject_bkpt, tmp, err;
+ struct vmctx *ctx;
+ uint64_t rip;
+
+ inject_bkpt = 0;
+ progname = basename(argv[0]);
+ gdb_port = DEFAULT_GDB_PORT;
+ guest_ncpus = 1;
+
+	while ((c = getopt(argc, argv, "hBHPxp:g:c:z:t:s:n:m:M:")) != -1) {
+ switch (c) {
+ case 'B':
+ inject_bkpt = 1;
+ break;
+ case 'x':
+ guest_vcpu_mux = 1;
+ break;
+ case 'p':
+ pincpu = atoi(optarg);
+ break;
+ case 'c':
+ guest_ncpus = atoi(optarg);
+ break;
+ case 'g':
+ gdb_port = atoi(optarg);
+ break;
+ case 'z':
+ guest_hz = atoi(optarg);
+ break;
+ case 't':
+ guest_tslice = atoi(optarg);
+ break;
+ case 's':
+ pci_parse_slot(optarg);
+ break;
+ case 'n':
+ pci_parse_name(optarg);
+ break;
+ case 'm':
+ lomem_sz = strtoul(optarg, NULL, 0) * MB;
+ break;
+ case 'M':
+ himem_sz = strtoul(optarg, NULL, 0) * MB;
+ break;
+ case 'H':
+ guest_vmexit_on_hlt = 1;
+ break;
+ case 'P':
+ guest_vmexit_on_pause = 1;
+ break;
+ case 'h':
+ usage(0);
+ default:
+ usage(1);
+ }
+ }
+ argc -= optind;
+ argv += optind;
+
+ if (argc != 1)
+ usage(1);
+
+ /* No need to mux if guest is uni-processor */
+ if (guest_ncpus <= 1)
+ guest_vcpu_mux = 0;
+
+	/* vmexit on hlt and pause if guest is muxed */
+ if (guest_vcpu_mux) {
+ guest_vmexit_on_hlt = 1;
+ guest_vmexit_on_pause = 1;
+ }
+
+ vmname = argv[0];
+
+ ctx = vm_open(vmname);
+ if (ctx == NULL) {
+ perror("vm_open");
+ exit(1);
+ }
+
+ if (fbsdrun_vmexit_on_hlt()) {
+ err = vm_get_capability(ctx, BSP, VM_CAP_HALT_EXIT, &tmp);
+ if (err < 0) {
+ printf("VM exit on HLT not supported\n");
+ exit(1);
+ }
+ vm_set_capability(ctx, BSP, VM_CAP_HALT_EXIT, 1);
+ handler[VM_EXITCODE_HLT] = vmexit_hlt;
+ }
+
+ if (fbsdrun_vmexit_on_pause()) {
+ /*
+ * pause exit support required for this mode
+ */
+ err = vm_get_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, &tmp);
+ if (err < 0) {
+ printf("SMP mux requested, no pause support\n");
+ exit(1);
+ }
+ vm_set_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, 1);
+ handler[VM_EXITCODE_PAUSE] = vmexit_pause;
+ }
+
+ if (lomem_sz != 0) {
+ lomem_addr = vm_map_memory(ctx, 0, lomem_sz);
+ if (lomem_addr == (char *) MAP_FAILED) {
+ lomem_sz = 0;
+ } else if (himem_sz != 0) {
+ himem_addr = vm_map_memory(ctx, 4*GB, himem_sz);
+ if (himem_addr == (char *) MAP_FAILED) {
+ lomem_sz = 0;
+ himem_sz = 0;
+ }
+ }
+ }
+
+ init_inout();
+ init_pci(ctx);
+
+ if (gdb_port != 0)
+ init_dbgport(gdb_port);
+
+ error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
+ assert(error == 0);
+
+ if (inject_bkpt) {
+ error = vm_inject_event(ctx, BSP, VM_HW_EXCEPTION, IDT_BP);
+ assert(error == 0);
+ }
+
+ /*
+ * build the guest tables, MP etc.
+ */
+ vm_build_tables(ctx, guest_ncpus, oem_tbl_start, oem_tbl_size);
+
+ /*
+ * Add CPU 0
+ */
+ fbsdrun_addcpu(ctx, BSP, rip);
+
+ /*
+ * Head off to the main event dispatch loop
+ */
+ mevent_dispatch();
+
+ exit(1);
+}
diff --git a/usr.sbin/bhyve/fbsdrun.h b/usr.sbin/bhyve/fbsdrun.h
new file mode 100644
index 000000000000..81061222a1a1
--- /dev/null
+++ b/usr.sbin/bhyve/fbsdrun.h
@@ -0,0 +1,53 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _FBSDRUN_H_
+#define _FBSDRUN_H_
+
+#ifndef CTASSERT /* Allow lint to override */
+#define CTASSERT(x) _CTASSERT(x, __LINE__)
+#define _CTASSERT(x, y) __CTASSERT(x, y)
+#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1]
+#endif
+
+struct vmctx;
+extern int guest_hz;
+extern int guest_tslice;
+extern int guest_ncpus;
+extern char *vmname;
+
+extern u_long lomem_sz, himem_sz;
+
+void *paddr_guest2host(uintptr_t);
+
+void fbsdrun_addcpu(struct vmctx *ctx, int cpu, uint64_t rip);
+void fbsdrun_add_oemtbl(void *tbl, int tblsz);
+int fbsdrun_muxed(void);
+int fbsdrun_vmexit_on_hlt(void);
+int fbsdrun_vmexit_on_pause(void);
+
+#endif /* _FBSDRUN_H_ */
diff --git a/usr.sbin/bhyve/inout.c b/usr.sbin/bhyve/inout.c
new file mode 100644
index 000000000000..84445b1f0a99
--- /dev/null
+++ b/usr.sbin/bhyve/inout.c
@@ -0,0 +1,98 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+
+#include <stdio.h>
+#include <assert.h>
+
+#include "inout.h"
+
+SET_DECLARE(inout_port_set, struct inout_port);
+
+#define MAX_IOPORTS (1 << 16)
+
+static struct {
+ const char *name;
+ int flags;
+ inout_func_t handler;
+ void *arg;
+} inout_handlers[MAX_IOPORTS];
+
+int
+emulate_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax)
+{
+ int flags;
+ inout_func_t handler;
+ void *arg;
+
+ assert(port < MAX_IOPORTS);
+
+ if ((handler = inout_handlers[port].handler) == NULL)
+ return (-1);
+
+ flags = inout_handlers[port].flags;
+ arg = inout_handlers[port].arg;
+
+ if ((in && (flags & IOPORT_F_IN)) || (!in && (flags & IOPORT_F_OUT)))
+ return ((*handler)(ctx, vcpu, in, port, bytes, eax, arg));
+ else
+ return (-1);
+}
+
+void
+init_inout(void)
+{
+	struct inout_port **iopp, *iop;
+
+	SET_FOREACH(iopp, inout_port_set) {
+		iop = *iopp;
+		register_inout(iop);
+	}
+}
+
+int
+register_inout(struct inout_port *iop)
+{
+ assert(iop->port < MAX_IOPORTS);
+ inout_handlers[iop->port].name = iop->name;
+ inout_handlers[iop->port].flags = iop->flags;
+ inout_handlers[iop->port].handler = iop->handler;
+ inout_handlers[iop->port].arg = iop->arg;
+
+ return (0);
+}
diff --git a/usr.sbin/bhyve/inout.h b/usr.sbin/bhyve/inout.h
new file mode 100644
index 000000000000..7b8a4a6edd38
--- /dev/null
+++ b/usr.sbin/bhyve/inout.h
@@ -0,0 +1,64 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _INOUT_H_
+#define _INOUT_H_
+
+#include <sys/linker_set.h>
+
+struct vmctx;
+
+typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port,
+ int bytes, uint32_t *eax, void *arg);
+
+struct inout_port {
+ const char *name;
+ int port;
+ int flags;
+ inout_func_t handler;
+ void *arg;
+};
+#define IOPORT_F_IN 0x1
+#define IOPORT_F_OUT 0x2
+#define IOPORT_F_INOUT 0x3
+
+#define INOUT_PORT(name, port, flags, handler) \
+ static struct inout_port __CONCAT(__inout_port, __LINE__) = { \
+ #name, \
+ (port), \
+ (flags), \
+ (handler) \
+ }; \
+ DATA_SET(inout_port_set, __CONCAT(__inout_port, __LINE__))
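+
+/*
+ * Example registration (a sketch with a hypothetical handler; real
+ * uses appear in the device emulations, e.g. pci_emul.c):
+ *
+ *	static int
+ *	post_handler(struct vmctx *ctx, int vcpu, int in, int port,
+ *	    int bytes, uint32_t *eax, void *arg)
+ *	{
+ *		if (in)
+ *			*eax = 0;
+ *		return (0);
+ *	}
+ *	INOUT_PORT(post, 0x84, IOPORT_F_INOUT, post_handler);
+ */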
+
+void init_inout(void);
+int emulate_inout(struct vmctx *, int vcpu, int in, int port, int bytes,
+ uint32_t *eax);
+int register_inout(struct inout_port *iop);
+
+#endif /* _INOUT_H_ */
diff --git a/usr.sbin/bhyve/mevent.c b/usr.sbin/bhyve/mevent.c
new file mode 100644
index 000000000000..0d3b2872e1b4
--- /dev/null
+++ b/usr.sbin/bhyve/mevent.c
@@ -0,0 +1,419 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Micro event library for FreeBSD, designed for a single i/o thread
+ * using kqueue, with events persistent by default.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+#include <sys/event.h>
+#include <sys/time.h>
+
+#include <pthread.h>
+
+#include "mevent.h"
+
+#define MEVENT_MAX 64
+
+#define MEV_ENABLE 1
+#define MEV_DISABLE 2
+#define MEV_DEL_PENDING 3
+
+static pthread_t mevent_tid;
+static int mevent_pipefd[2];
+static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER;
+
+struct mevent {
+ void (*me_func)(int, enum ev_type, void *);
+ int me_fd;
+ enum ev_type me_type;
+ void *me_param;
+ int me_cq;
+ int me_state;
+ int me_closefd;
+ LIST_ENTRY(mevent) me_list;
+};
+
+static LIST_HEAD(listhead, mevent) global_head, change_head;
+
+static void
+mevent_qlock(void)
+{
+ pthread_mutex_lock(&mevent_lmutex);
+}
+
+static void
+mevent_qunlock(void)
+{
+ pthread_mutex_unlock(&mevent_lmutex);
+}
+
+static void
+mevent_pipe_read(int fd, enum ev_type type, void *param)
+{
+ char buf[MEVENT_MAX];
+ int status;
+
+ /*
+ * Drain the pipe read side. The fd is non-blocking so this is
+ * safe to do.
+ */
+ do {
+ status = read(fd, buf, sizeof(buf));
+ } while (status == MEVENT_MAX);
+}
+
+static void
+mevent_notify(void)
+{
+	char c = '\0';
+
+ /*
+ * If calling from outside the i/o thread, write a byte on the
+ * pipe to force the i/o thread to exit the blocking kevent call.
+ */
+ if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) {
+ write(mevent_pipefd[1], &c, 1);
+ }
+}
+
+static int
+mevent_kq_filter(struct mevent *mevp)
+{
+ int retval;
+
+ retval = 0;
+
+ if (mevp->me_type == EVF_READ)
+ retval = EVFILT_READ;
+
+ if (mevp->me_type == EVF_WRITE)
+ retval = EVFILT_WRITE;
+
+ return (retval);
+}
+
+static int
+mevent_kq_flags(struct mevent *mevp)
+{
+ int ret;
+
+	switch (mevp->me_state) {
+	case MEV_ENABLE:
+		ret = EV_ADD;
+		break;
+	case MEV_DISABLE:
+		ret = EV_DISABLE;
+		break;
+	case MEV_DEL_PENDING:
+		ret = EV_DELETE;
+		break;
+	default:
+		/* me_state is always one of the above; don't return garbage */
+		assert(0);
+		ret = 0;
+		break;
+	}
+
+ return (ret);
+}
+
+static int
+mevent_kq_fflags(struct mevent *mevp)
+{
+ /* XXX nothing yet, perhaps EV_EOF for reads ? */
+ return (0);
+}
+
+static int
+mevent_build(int mfd, struct kevent *kev)
+{
+ struct mevent *mevp, *tmpp;
+ int i;
+
+ i = 0;
+
+ mevent_qlock();
+
+ LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
+ if (mevp->me_closefd) {
+ /*
+ * A close of the file descriptor will remove the
+ * event
+ */
+ close(mevp->me_fd);
+ } else {
+ kev[i].ident = mevp->me_fd;
+ kev[i].filter = mevent_kq_filter(mevp);
+ kev[i].flags = mevent_kq_flags(mevp);
+ kev[i].fflags = mevent_kq_fflags(mevp);
+ kev[i].data = 0;
+ kev[i].udata = mevp;
+ i++;
+ }
+
+ mevp->me_cq = 0;
+ LIST_REMOVE(mevp, me_list);
+
+ if (mevp->me_state == MEV_DEL_PENDING) {
+ free(mevp);
+ } else {
+ LIST_INSERT_HEAD(&global_head, mevp, me_list);
+ }
+
+ assert(i < MEVENT_MAX);
+ }
+
+ mevent_qunlock();
+
+ return (i);
+}
+
+static void
+mevent_handle(struct kevent *kev, int numev)
+{
+ struct mevent *mevp;
+ int i;
+
+ for (i = 0; i < numev; i++) {
+ mevp = kev[i].udata;
+
+ /* XXX check for EV_ERROR ? */
+
+ (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param);
+ }
+}
+
+struct mevent *
+mevent_add(int fd, enum ev_type type,
+ void (*func)(int, enum ev_type, void *), void *param)
+{
+ struct mevent *lp, *mevp;
+
+ if (fd < 0 || func == NULL) {
+ return (NULL);
+ }
+
+ mevp = NULL;
+
+ mevent_qlock();
+
+ /*
+ * Verify that the fd/type tuple is not present in any list
+ */
+ LIST_FOREACH(lp, &global_head, me_list) {
+ if (lp->me_fd == fd && lp->me_type == type) {
+ goto exit;
+ }
+ }
+
+ LIST_FOREACH(lp, &change_head, me_list) {
+ if (lp->me_fd == fd && lp->me_type == type) {
+ goto exit;
+ }
+ }
+
+ /*
+ * Allocate an entry, populate it, and add it to the change list.
+ */
+ mevp = malloc(sizeof(struct mevent));
+ if (mevp == NULL) {
+ goto exit;
+ }
+
+ memset(mevp, 0, sizeof(struct mevent));
+ mevp->me_fd = fd;
+ mevp->me_type = type;
+ mevp->me_func = func;
+ mevp->me_param = param;
+
+ LIST_INSERT_HEAD(&change_head, mevp, me_list);
+ mevp->me_cq = 1;
+ mevp->me_state = MEV_ENABLE;
+ mevent_notify();
+
+exit:
+ mevent_qunlock();
+
+ return (mevp);
+}
+
+static int
+mevent_update(struct mevent *evp, int newstate)
+{
+	mevent_qlock();
+
+	/*
+	 * It's not possible to enable/disable a deleted event. Both this
+	 * check and the no-change check below are done under the queue
+	 * lock to avoid racing a concurrent delete.
+	 */
+	if (evp->me_state == MEV_DEL_PENDING) {
+		mevent_qunlock();
+		return (EINVAL);
+	}
+
+	/*
+	 * No update needed if state isn't changing
+	 */
+	if (evp->me_state == newstate) {
+		mevent_qunlock();
+		return (0);
+	}
+
+	evp->me_state = newstate;
+
+ /*
+ * Place the entry onto the changed list if not already there.
+ */
+ if (evp->me_cq == 0) {
+ evp->me_cq = 1;
+ LIST_REMOVE(evp, me_list);
+ LIST_INSERT_HEAD(&change_head, evp, me_list);
+ mevent_notify();
+ }
+
+ mevent_qunlock();
+
+ return (0);
+}
+
+int
+mevent_enable(struct mevent *evp)
+{
+
+ return (mevent_update(evp, MEV_ENABLE));
+}
+
+int
+mevent_disable(struct mevent *evp)
+{
+
+ return (mevent_update(evp, MEV_DISABLE));
+}
+
+static int
+mevent_delete_event(struct mevent *evp, int closefd)
+{
+ mevent_qlock();
+
+ /*
+ * Place the entry onto the changed list if not already there, and
+ * mark as to be deleted.
+ */
+ if (evp->me_cq == 0) {
+ evp->me_cq = 1;
+ LIST_REMOVE(evp, me_list);
+ LIST_INSERT_HEAD(&change_head, evp, me_list);
+ mevent_notify();
+ }
+ evp->me_state = MEV_DEL_PENDING;
+
+ if (closefd)
+ evp->me_closefd = 1;
+
+ mevent_qunlock();
+
+ return (0);
+}
+
+int
+mevent_delete(struct mevent *evp)
+{
+
+ return (mevent_delete_event(evp, 0));
+}
+
+int
+mevent_delete_close(struct mevent *evp)
+{
+
+ return (mevent_delete_event(evp, 1));
+}
+
+void
+mevent_dispatch(void)
+{
+ struct kevent changelist[MEVENT_MAX];
+ struct kevent eventlist[MEVENT_MAX];
+ struct mevent *pipev;
+ int mfd;
+ int numev;
+ int ret;
+
+ mevent_tid = pthread_self();
+
+ mfd = kqueue();
+	assert(mfd >= 0);
+
+ /*
+ * Open the pipe that will be used for other threads to force
+ * the blocking kqueue call to exit by writing to it. Set the
+ * descriptor to non-blocking.
+ */
+	ret = pipe(mevent_pipefd);
+	if (ret < 0) {
+		perror("pipe");
+		exit(1);
+	}
+
+	/* Make the read side non-blocking, as mevent_pipe_read() assumes */
+	ret = fcntl(mevent_pipefd[0], F_SETFL, O_NONBLOCK);
+	assert(ret != -1);
+
+ /*
+ * Add internal event handler for the pipe write fd
+ */
+ pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL);
+ assert(pipev != NULL);
+
+ for (;;) {
+ /*
+ * Build changelist if required.
+ * XXX the changelist can be put into the blocking call
+ * to eliminate the extra syscall. Currently better for
+ * debug.
+ */
+ numev = mevent_build(mfd, changelist);
+ if (numev) {
+ ret = kevent(mfd, changelist, numev, NULL, 0, NULL);
+ if (ret == -1) {
+ perror("Error return from kevent change");
+ }
+ }
+
+ /*
+ * Block awaiting events
+ */
+ ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL);
+ if (ret == -1) {
+ perror("Error return from kevent monitor");
+ }
+
+ /*
+ * Handle reported events
+ */
+ mevent_handle(eventlist, ret);
+ }
+}
diff --git a/usr.sbin/bhyve/mevent.h b/usr.sbin/bhyve/mevent.h
new file mode 100644
index 000000000000..32a9d74ab566
--- /dev/null
+++ b/usr.sbin/bhyve/mevent.h
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MEVENT_H_
+#define _MEVENT_H_
+
+enum ev_type {
+ EVF_READ,
+ EVF_WRITE
+};
+
+struct mevent;
+
+struct mevent *mevent_add(int fd, enum ev_type type,
+ void (*func)(int, enum ev_type, void *),
+ void *param);
+int mevent_enable(struct mevent *evp);
+int mevent_disable(struct mevent *evp);
+int mevent_delete(struct mevent *evp);
+int mevent_delete_close(struct mevent *evp);
+
+void mevent_dispatch(void);
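+
+/*
+ * Minimal usage sketch (callback name hypothetical): register a
+ * persistent read event on a descriptor, then hand the calling thread
+ * to the dispatch loop, which does not return:
+ *
+ *	mevent_add(fd, EVF_READ, my_callback, my_arg);
+ *	mevent_dispatch();
+ */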
+
+#endif /* _MEVENT_H_ */
diff --git a/usr.sbin/bhyve/mevent_test.c b/usr.sbin/bhyve/mevent_test.c
new file mode 100644
index 000000000000..c72a49718188
--- /dev/null
+++ b/usr.sbin/bhyve/mevent_test.c
@@ -0,0 +1,180 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Test program for the micro event library. Set up a simple TCP echo
+ * service.
+ *
+ * cc mevent_test.c mevent.c -lpthread
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <pthread.h>
+
+#include "mevent.h"
+
+#define TEST_PORT 4321
+
+static pthread_mutex_t accept_mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t accept_condvar = PTHREAD_COND_INITIALIZER;
+
+#define MEVENT_ECHO
+
+#ifdef MEVENT_ECHO
+struct esync {
+ pthread_mutex_t e_mt;
+ pthread_cond_t e_cond;
+};
+
+static void
+echoer_callback(int fd, enum ev_type type, void *param)
+{
+ struct esync *sync = param;
+
+ pthread_mutex_lock(&sync->e_mt);
+ pthread_cond_signal(&sync->e_cond);
+ pthread_mutex_unlock(&sync->e_mt);
+}
+
+static void *
+echoer(void *param)
+{
+ struct esync sync;
+ struct mevent *mev;
+ char buf[128];
+ int fd = (int)(uintptr_t) param;
+ int len;
+
+ pthread_mutex_init(&sync.e_mt, NULL);
+ pthread_cond_init(&sync.e_cond, NULL);
+
+ pthread_mutex_lock(&sync.e_mt);
+
+ mev = mevent_add(fd, EVF_READ, echoer_callback, &sync);
+ if (mev == NULL) {
+ printf("Could not allocate echoer event\n");
+ exit(1);
+ }
+
+ while (!pthread_cond_wait(&sync.e_cond, &sync.e_mt)) {
+ len = read(fd, buf, sizeof(buf));
+ if (len > 0) {
+ write(fd, buf, len);
+			write(1, buf, len);
+ } else {
+ break;
+ }
+ }
+
+ mevent_delete_close(mev);
+
+ pthread_mutex_unlock(&sync.e_mt);
+ pthread_mutex_destroy(&sync.e_mt);
+ pthread_cond_destroy(&sync.e_cond);
+
+	return (NULL);
+}
+
+#else
+
+static void *
+echoer(void *param)
+{
+ char buf[128];
+ int fd = (int)(uintptr_t) param;
+ int len;
+
+ while ((len = read(fd, buf, sizeof(buf))) > 0) {
+ write(1, buf, len);
+ }
+
+	return (NULL);
+}
+#endif /* MEVENT_ECHO */
+
+static void
+acceptor_callback(int fd, enum ev_type type, void *param)
+{
+ pthread_mutex_lock(&accept_mutex);
+ pthread_cond_signal(&accept_condvar);
+ pthread_mutex_unlock(&accept_mutex);
+}
+
+static void *
+acceptor(void *param)
+{
+ struct sockaddr_in sin;
+ pthread_t tid;
+ int news;
+ int s;
+
+ if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+ perror("socket");
+ exit(1);
+ }
+
+ sin.sin_len = sizeof(sin);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = htonl(INADDR_ANY);
+ sin.sin_port = htons(TEST_PORT);
+
+ if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
+ perror("bind");
+ exit(1);
+ }
+
+ if (listen(s, 1) < 0) {
+ perror("listen");
+ exit(1);
+ }
+
+ (void) mevent_add(s, EVF_READ, acceptor_callback, NULL);
+
+ pthread_mutex_lock(&accept_mutex);
+
+ while (!pthread_cond_wait(&accept_condvar, &accept_mutex)) {
+ news = accept(s, NULL, NULL);
+ if (news < 0) {
+ perror("accept error");
+ } else {
+ printf("incoming connection, spawning thread\n");
+ pthread_create(&tid, NULL, echoer,
+ (void *)(uintptr_t)news);
+ }
+ }
+
+	return (NULL);
+}
+
+int
+main(void)
+{
+ pthread_t tid;
+
+ pthread_create(&tid, NULL, acceptor, NULL);
+
+ mevent_dispatch();
+}
diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c
new file mode 100644
index 000000000000..273c6a2c78c5
--- /dev/null
+++ b/usr.sbin/bhyve/pci_emul.c
@@ -0,0 +1,976 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <assert.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "fbsdrun.h"
+#include "inout.h"
+#include "pci_emul.h"
+
+#define CONF1_ADDR_PORT 0x0cf8
+#define CONF1_DATA_PORT 0x0cfc
+
+#define CFGWRITE(pi,off,val,b) \
+do { \
+ if ((b) == 1) { \
+ pci_set_cfgdata8((pi),(off),(val)); \
+ } else if ((b) == 2) { \
+ pci_set_cfgdata16((pi),(off),(val)); \
+ } else { \
+ pci_set_cfgdata32((pi),(off),(val)); \
+ } \
+} while (0)
+
+#define MAXSLOTS 32
+
+static struct slotinfo {
+ char *si_name;
+ char *si_param;
+ struct pci_devinst *si_devi;
+ int si_titled;
+ int si_pslot;
+ char si_prefix;
+ char si_suffix;
+} pci_slotinfo[MAXSLOTS];
+
+/*
+ * NetApp specific:
+ * struct used to build an in-core OEM table to supply device names
+ * to driver instances
+ */
+static struct mptable_pci_devnames {
+#define MPT_HDR_BASE 0
+#define MPT_HDR_NAME 2
+ uint16_t md_hdrtype;
+ uint16_t md_entries;
+ uint16_t md_cksum;
+ uint16_t md_pad;
+#define MPT_NTAP_SIG \
+ ((uint32_t)(('P' << 24) | ('A' << 16) | ('T' << 8) | 'N'))
+ uint32_t md_sig;
+ uint32_t md_rsvd;
+ struct mptable_pci_slotinfo {
+ uint16_t mds_type;
+ uint16_t mds_phys_slot;
+ uint8_t mds_bus;
+ uint8_t mds_slot;
+ uint8_t mds_func;
+ uint8_t mds_pad;
+ uint16_t mds_vid;
+ uint16_t mds_did;
+ uint8_t mds_suffix[4];
+ uint8_t mds_prefix[4];
+ uint32_t mds_rsvd[3];
+ } md_slotinfo[MAXSLOTS];
+} pci_devnames;
+
+SET_DECLARE(pci_devemu_set, struct pci_devemu);
+
+static uint64_t pci_emul_iobase;
+static uint64_t pci_emul_membase32;
+static uint64_t pci_emul_membase64;
+
+#define PCI_EMUL_IOBASE 0x2000
+#define PCI_EMUL_IOLIMIT 0x10000
+
+#define PCI_EMUL_MEMBASE32 (lomem_sz)
+#define PCI_EMUL_MEMLIMIT32 0xE0000000 /* 3.5GB */
+
+#define PCI_EMUL_MEMBASE64 0xD000000000UL
+#define PCI_EMUL_MEMLIMIT64 0xFD00000000UL
+
+static int pci_emul_devices;
+static int devname_elems;
+
+/*
+ * I/O access
+ */
+
+/*
+ * Slot options are in the form:
+ *
+ * <slot>,<emul>[,<config>]
+ *
+ * slot is 0..31
+ * emul is a string describing the type of PCI device e.g. virtio-net
+ * config is an optional string, depending on the device, that can be
+ * used for configuration.
+ * Examples are:
+ * 1,virtio-net,tap0
+ * 3,dummy
+ */
+static void
+pci_parse_slot_usage(char *aopt)
+{
+ printf("Invalid PCI slot info field \"%s\"\n", aopt);
+ free(aopt);
+}
+
+void
+pci_parse_slot(char *opt)
+{
+ char *slot, *emul, *config;
+ char *str, *cpy;
+ int snum;
+
+ str = cpy = strdup(opt);
+ config = NULL;
+
+ slot = strsep(&str, ",");
+ emul = strsep(&str, ",");
+ if (str != NULL) {
+ config = strsep(&str, ",");
+ }
+
+ if (emul == NULL) {
+ pci_parse_slot_usage(cpy);
+ return;
+ }
+
+	snum = atoi(slot);
+ if (snum < 0 || snum >= MAXSLOTS) {
+ pci_parse_slot_usage(cpy);
+ } else {
+ pci_slotinfo[snum].si_name = emul;
+ pci_slotinfo[snum].si_param = config;
+ }
+}
+
+
+/*
+ *
+ * PCI MPTable names are of the form:
+ *
+ * <slot>,[prefix]<digit><suffix>
+ *
+ * .. with <prefix> an alphabetic char, <digit> a 1 or 2-digit string,
+ * and <suffix> a single char.
+ *
+ * Examples:
+ * 1,e0c
+ * 4,e0P
+ * 6,43a
+ * 7,0f
+ * 10,1
+ * 12,e0M
+ * 2,12a
+ *
+ * Note that this is NetApp-specific, but is ignored on other OSes.
+ */
+static void
+pci_parse_name_usage(char *aopt)
+{
+ printf("Invalid PCI slot name field \"%s\"\n", aopt);
+}
+
+void
+pci_parse_name(char *opt)
+{
+ char csnum[4];
+ char *namestr;
+ char *slotend;
+ char prefix, suffix;
+ int i;
+ int pslot;
+ int snum;
+
+ pslot = -1;
+ prefix = suffix = 0;
+ slotend = strchr(opt, ',');
+
+ /*
+ * A comma must be present, and can't be the first character
+ * or no slot would be present. Also, the slot number can't be
+ * more than 2 characters.
+ */
+ if (slotend == NULL || slotend == opt || (slotend - opt > 2)) {
+ pci_parse_name_usage(opt);
+ return;
+ }
+
+ for (i = 0; i < (slotend - opt); i++) {
+ csnum[i] = opt[i];
+ }
+ csnum[i] = '\0';
+
+	snum = atoi(csnum);
+ if (snum < 0 || snum >= MAXSLOTS) {
+ pci_parse_name_usage(opt);
+ return;
+ }
+
+ namestr = slotend + 1;
+
+ if (strlen(namestr) > 3) {
+ pci_parse_name_usage(opt);
+ return;
+ }
+
+ if (isalpha(*namestr)) {
+ prefix = *namestr++;
+ }
+
+ if (!isdigit(*namestr)) {
+ pci_parse_name_usage(opt);
+ } else {
+ pslot = *namestr++ - '0';
+ if (isnumber(*namestr)) {
+ pslot = 10*pslot + *namestr++ - '0';
+
+ }
+ if (isalpha(*namestr) && *(namestr + 1) == 0) {
+ suffix = *namestr;
+ pci_slotinfo[snum].si_titled = 1;
+ pci_slotinfo[snum].si_pslot = pslot;
+ pci_slotinfo[snum].si_prefix = prefix;
+ pci_slotinfo[snum].si_suffix = suffix;
+
+ } else {
+ pci_parse_name_usage(opt);
+ }
+ }
+}
+
+static void
+pci_add_mptable_name(struct slotinfo *si)
+{
+ struct mptable_pci_slotinfo *ms;
+
+ /*
+ * If naming information has been supplied for this slot, populate
+ * the next available mptable OEM entry
+ */
+ if (si->si_titled) {
+ ms = &pci_devnames.md_slotinfo[devname_elems];
+
+ ms->mds_type = MPT_HDR_NAME;
+ ms->mds_phys_slot = si->si_pslot;
+ ms->mds_bus = si->si_devi->pi_bus;
+ ms->mds_slot = si->si_devi->pi_slot;
+ ms->mds_func = si->si_devi->pi_func;
+ ms->mds_vid = pci_get_cfgdata16(si->si_devi, PCIR_VENDOR);
+ ms->mds_did = pci_get_cfgdata16(si->si_devi, PCIR_DEVICE);
+ ms->mds_suffix[0] = si->si_suffix;
+ ms->mds_prefix[0] = si->si_prefix;
+
+ devname_elems++;
+ }
+}
+
+static void
+pci_finish_mptable_names(void)
+{
+ int size;
+
+ if (devname_elems) {
+ pci_devnames.md_hdrtype = MPT_HDR_BASE;
+ pci_devnames.md_entries = devname_elems;
+ pci_devnames.md_cksum = 0; /* XXX */
+ pci_devnames.md_sig = MPT_NTAP_SIG;
+
+ size = (uintptr_t)&pci_devnames.md_slotinfo[devname_elems] -
+ (uintptr_t)&pci_devnames;
+
+ fbsdrun_add_oemtbl(&pci_devnames, size);
+ }
+}
+
+static int
+pci_emul_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ struct pci_devinst *pdi = arg;
+ struct pci_devemu *pe = pdi->pi_d;
+ int offset, i;
+
+ for (i = 0; i <= PCI_BARMAX; i++) {
+ if (pdi->pi_bar[i].type == PCIBAR_IO &&
+ port >= pdi->pi_bar[i].addr &&
+ port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) {
+ offset = port - pdi->pi_bar[i].addr;
+ if (in)
+ *eax = (*pe->pe_ior)(pdi, i, offset, bytes);
+ else
+ (*pe->pe_iow)(pdi, i, offset, bytes, *eax);
+ return (0);
+ }
+ }
+ return (-1);
+}
+
+static int
+pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size,
+ uint64_t *addr)
+{
+ uint64_t base;
+
+ assert((size & (size - 1)) == 0); /* must be a power of 2 */
+
+ base = roundup2(*baseptr, size);
+
+ if (base + size <= limit) {
+ *addr = base;
+ *baseptr = base + size;
+ return (0);
+ } else
+ return (-1);
+}
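+
+/*
+ * Example (values illustrative): with *baseptr = 0x2010, limit =
+ * PCI_EMUL_IOLIMIT and size = 0x20, the base rounds up to 0x2020,
+ * *addr becomes 0x2020 and *baseptr advances to 0x2040. Allocations
+ * are therefore naturally aligned to their power-of-2 size.
+ */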
+
+int
+pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, uint64_t hostbase,
+ enum pcibar_type type, uint64_t size)
+{
+ int i, error;
+ uint64_t *baseptr, limit, addr, mask, lobits, bar;
+ struct inout_port iop;
+
+ assert(idx >= 0 && idx <= PCI_BARMAX);
+
+ if ((size & (size - 1)) != 0)
+ size = 1UL << flsl(size); /* round up to a power of 2 */
+
+ switch (type) {
+ case PCIBAR_NONE:
+ baseptr = NULL;
+ addr = mask = lobits = 0;
+ break;
+ case PCIBAR_IO:
+ baseptr = &pci_emul_iobase;
+ limit = PCI_EMUL_IOLIMIT;
+ mask = PCIM_BAR_IO_BASE;
+ lobits = PCIM_BAR_IO_SPACE;
+ break;
+ case PCIBAR_MEM64:
+ /*
+ * XXX
+ * Some drivers do not work well if the 64-bit BAR is allocated
+ * above 4GB. Allow for this by allocating small requests under
+		 * 4GB unless the allocation size is larger than some arbitrary
+ * number (32MB currently).
+ */
+ if (size > 32 * 1024 * 1024) {
+ /*
+ * XXX special case for device requiring peer-peer DMA
+ */
+ if (size == 0x100000000UL)
+ baseptr = &hostbase;
+ else
+ baseptr = &pci_emul_membase64;
+ limit = PCI_EMUL_MEMLIMIT64;
+ mask = PCIM_BAR_MEM_BASE;
+ lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
+ PCIM_BAR_MEM_PREFETCH;
+ break;
+ }
+ /* fallthrough */
+ case PCIBAR_MEM32:
+ baseptr = &pci_emul_membase32;
+ limit = PCI_EMUL_MEMLIMIT32;
+ mask = PCIM_BAR_MEM_BASE;
+ lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
+ break;
+ default:
+ printf("pci_emul_alloc_base: invalid bar type %d\n", type);
+ assert(0);
+ }
+
+ if (baseptr != NULL) {
+ error = pci_emul_alloc_resource(baseptr, limit, size, &addr);
+ if (error != 0)
+ return (error);
+ }
+
+ pdi->pi_bar[idx].type = type;
+ pdi->pi_bar[idx].addr = addr;
+ pdi->pi_bar[idx].size = size;
+
+ /* Initialize the BAR register in config space */
+ bar = (addr & mask) | lobits;
+ pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar);
+
+ if (type == PCIBAR_MEM64) {
+ assert(idx + 1 <= PCI_BARMAX);
+ pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64;
+ pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32);
+ }
+
+ /* add a handler to intercept accesses to the I/O bar */
+ if (type == PCIBAR_IO) {
+ iop.name = pdi->pi_name;
+ iop.flags = IOPORT_F_INOUT;
+ iop.handler = pci_emul_handler;
+ iop.arg = pdi;
+
+ for (i = 0; i < size; i++) {
+ iop.port = addr + i;
+ register_inout(&iop);
+ }
+ }
+
+ return (0);
+}
+
+#define CAP_START_OFFSET 0x40
+static int
+pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen)
+{
+ int i, capoff, capid, reallen;
+ uint16_t sts;
+
+ static u_char endofcap[4] = {
+ PCIY_RESERVED, 0, 0, 0
+ };
+
+ assert(caplen > 0 && capdata[0] != PCIY_RESERVED);
+
+ reallen = roundup2(caplen, 4); /* dword aligned */
+
+ sts = pci_get_cfgdata16(pi, PCIR_STATUS);
+ if ((sts & PCIM_STATUS_CAPPRESENT) == 0) {
+ capoff = CAP_START_OFFSET;
+ pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff);
+ pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT);
+ } else {
+ capoff = pci_get_cfgdata8(pi, PCIR_CAP_PTR);
+ while (1) {
+ assert((capoff & 0x3) == 0);
+ capid = pci_get_cfgdata8(pi, capoff);
+ if (capid == PCIY_RESERVED)
+ break;
+ capoff = pci_get_cfgdata8(pi, capoff + 1);
+ }
+ }
+
+ /* Check if we have enough space */
+ if (capoff + reallen + sizeof(endofcap) > PCI_REGMAX + 1)
+ return (-1);
+
+ /* Copy the capability */
+ for (i = 0; i < caplen; i++)
+ pci_set_cfgdata8(pi, capoff + i, capdata[i]);
+
+ /* Set the next capability pointer */
+ pci_set_cfgdata8(pi, capoff + 1, capoff + reallen);
+
+ /* Copy of the reserved capability which serves as the end marker */
+ for (i = 0; i < sizeof(endofcap); i++)
+ pci_set_cfgdata8(pi, capoff + reallen + i, endofcap[i]);
+
+ return (0);
+}
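+
+/*
+ * Layout example (illustrative): adding the 14-byte MSI capability to
+ * a device with no prior capabilities places it at CAP_START_OFFSET
+ * (0x40), rounds its length up to 16 bytes, points the next-capability
+ * byte at 0x50 and writes the PCIY_RESERVED end marker there.
+ */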
+
+static struct pci_devemu *
+pci_emul_finddev(char *name)
+{
+ struct pci_devemu **pdpp, *pdp;
+
+ SET_FOREACH(pdpp, pci_devemu_set) {
+ pdp = *pdpp;
+ if (!strcmp(pdp->pe_emu, name)) {
+ return (pdp);
+ }
+ }
+
+ return (NULL);
+}
+
+static void
+pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int slot, char *params)
+{
+ struct pci_devinst *pdi;
+ pdi = malloc(sizeof(struct pci_devinst));
+ bzero(pdi, sizeof(*pdi));
+
+ pdi->pi_vmctx = ctx;
+ pdi->pi_bus = 0;
+ pdi->pi_slot = slot;
+ pdi->pi_func = 0;
+ pdi->pi_d = pde;
+ snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot);
+
+ /* Disable legacy interrupts */
+ pci_set_cfgdata8(pdi, PCIR_INTLINE, 255);
+ pci_set_cfgdata8(pdi, PCIR_INTPIN, 0);
+
+ pci_set_cfgdata8(pdi, PCIR_COMMAND,
+ PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN);
+
+ if ((*pde->pe_init)(ctx, pdi, params) != 0) {
+ free(pdi);
+ } else {
+ pci_emul_devices++;
+ pci_slotinfo[slot].si_devi = pdi;
+ }
+}
+
+void
+pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr)
+{
+ int mmc;
+
+ CTASSERT(sizeof(struct msicap) == 14);
+
+ /* Number of msi messages must be a power of 2 between 1 and 32 */
+ assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32);
+ mmc = ffs(msgnum) - 1;
+
+ bzero(msicap, sizeof(struct msicap));
+ msicap->capid = PCIY_MSI;
+ msicap->nextptr = nextptr;
+ msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1);
+}
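+
+/*
+ * Example (illustrative): for msgnum = 4, mmc = ffs(4) - 1 = 2, so the
+ * Multiple Message Capable field of msgctrl encodes 2 (2^2 = 4
+ * messages) alongside the 64-bit address capability bit.
+ */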
+
+int
+pci_emul_add_msicap(struct pci_devinst *pi, int msgnum)
+{
+ struct msicap msicap;
+
+ pci_populate_msicap(&msicap, msgnum, 0);
+
+ return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap)));
+}
+
+void
+msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+ int bytes, uint32_t val)
+{
+ uint16_t msgctrl, rwmask, msgdata, mme;
+ uint32_t addrlo;
+
+ /*
+ * If guest is writing to the message control register make sure
+ * we do not overwrite read-only fields.
+ */
+ if ((offset - capoff) == 2 && bytes == 2) {
+ rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE;
+ msgctrl = pci_get_cfgdata16(pi, offset);
+ msgctrl &= ~rwmask;
+ msgctrl |= val & rwmask;
+ val = msgctrl;
+
+ addrlo = pci_get_cfgdata32(pi, capoff + 4);
+ if (msgctrl & PCIM_MSICTRL_64BIT)
+ msgdata = pci_get_cfgdata16(pi, capoff + 12);
+ else
+ msgdata = pci_get_cfgdata16(pi, capoff + 8);
+
+ /*
+ * XXX check delivery mode, destination mode etc
+ */
+ mme = msgctrl & PCIM_MSICTRL_MME_MASK;
+ pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0;
+ if (pi->pi_msi.enabled) {
+ pi->pi_msi.cpu = (addrlo >> 12) & 0xff;
+ pi->pi_msi.vector = msgdata & 0xff;
+ pi->pi_msi.msgnum = 1 << (mme >> 4);
+ } else {
+ pi->pi_msi.cpu = 0;
+ pi->pi_msi.vector = 0;
+ pi->pi_msi.msgnum = 0;
+ }
+ }
+
+ CFGWRITE(pi, offset, val, bytes);
+}
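+
+/*
+ * Decode example (register values illustrative): if a guest write
+ * leaves addrlo = 0xfee01000, msgdata = 0x00d1 and a Multiple Message
+ * Enable field of 2, the code above yields pi_msi.cpu = 1 (address
+ * bits 19:12), pi_msi.vector = 0xd1 (low byte of the message data)
+ * and pi_msi.msgnum = 1 << 2 = 4.
+ */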
+
+/*
+ * This function assumes that 'coff' is in the capabilities region of the
+ * config space.
+ */
+static void
+pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val)
+{
+ int capid;
+ uint8_t capoff, nextoff;
+
+ /* Do not allow un-aligned writes */
+ if ((offset & (bytes - 1)) != 0)
+ return;
+
+ /* Find the capability that we want to update */
+ capoff = CAP_START_OFFSET;
+ while (1) {
+ capid = pci_get_cfgdata8(pi, capoff);
+ if (capid == PCIY_RESERVED)
+ break;
+
+ nextoff = pci_get_cfgdata8(pi, capoff + 1);
+ if (offset >= capoff && offset < nextoff)
+ break;
+
+ capoff = nextoff;
+ }
+ assert(offset >= capoff);
+
+ /*
+ * Capability ID and Next Capability Pointer are readonly
+ */
+ if (offset == capoff || offset == capoff + 1)
+ return;
+
+ switch (capid) {
+ case PCIY_MSI:
+ msicap_cfgwrite(pi, capoff, offset, bytes, val);
+ break;
+ default:
+ break;
+ }
+}
+
+static int
+pci_emul_iscap(struct pci_devinst *pi, int offset)
+{
+ int found;
+ uint16_t sts;
+ uint8_t capid, lastoff;
+
+ found = 0;
+ sts = pci_get_cfgdata16(pi, PCIR_STATUS);
+ if ((sts & PCIM_STATUS_CAPPRESENT) != 0) {
+ lastoff = pci_get_cfgdata8(pi, PCIR_CAP_PTR);
+ while (1) {
+ assert((lastoff & 0x3) == 0);
+ capid = pci_get_cfgdata8(pi, lastoff);
+ if (capid == PCIY_RESERVED)
+ break;
+ lastoff = pci_get_cfgdata8(pi, lastoff + 1);
+ }
+ if (offset >= CAP_START_OFFSET && offset <= lastoff)
+ found = 1;
+ }
+ return (found);
+}
+
+void
+init_pci(struct vmctx *ctx)
+{
+ struct pci_devemu *pde;
+ struct slotinfo *si;
+ int i;
+
+ pci_emul_iobase = PCI_EMUL_IOBASE;
+ pci_emul_membase32 = PCI_EMUL_MEMBASE32;
+ pci_emul_membase64 = PCI_EMUL_MEMBASE64;
+
+ si = pci_slotinfo;
+
+ for (i = 0; i < MAXSLOTS; i++, si++) {
+ if (si->si_name != NULL) {
+ pde = pci_emul_finddev(si->si_name);
+ if (pde != NULL) {
+ pci_emul_init(ctx, pde, i, si->si_param);
+ pci_add_mptable_name(si);
+ }
+ }
+ }
+ pci_finish_mptable_names();
+}
+
+int
+pci_msi_enabled(struct pci_devinst *pi)
+{
+ return (pi->pi_msi.enabled);
+}
+
+int
+pci_msi_msgnum(struct pci_devinst *pi)
+{
+ if (pi->pi_msi.enabled)
+ return (pi->pi_msi.msgnum);
+ else
+ return (0);
+}
+
+void
+pci_generate_msi(struct pci_devinst *pi, int msg)
+{
+
+ if (pci_msi_enabled(pi) && msg < pci_msi_msgnum(pi)) {
+ vm_lapic_irq(pi->pi_vmctx,
+ pi->pi_msi.cpu,
+ pi->pi_msi.vector + msg);
+ }
+}
+
+static int cfgbus, cfgslot, cfgfunc, cfgoff;
+
+static int
+pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ uint32_t x;
+
+ assert(!in);
+
+ if (bytes != 4)
+ return (-1);
+
+ x = *eax;
+ cfgoff = x & PCI_REGMAX;
+ cfgfunc = (x >> 8) & PCI_FUNCMAX;
+ cfgslot = (x >> 11) & PCI_SLOTMAX;
+ cfgbus = (x >> 16) & PCI_BUSMAX;
+
+ return (0);
+}
+INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_OUT, pci_emul_cfgaddr);
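+
+/*
+ * Example decode (value illustrative): an OUT of 0x80001808 to port
+ * 0xcf8 selects bus 0 ((x >> 16) & 0xff), slot 3 ((x >> 11) & 0x1f),
+ * function 0 ((x >> 8) & 0x7) and register offset 0x08 (x & 0xff);
+ * subsequent accesses to ports 0xcfc-0xcff then hit that register via
+ * pci_emul_cfgdata() below.
+ */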
+
+static int
+pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ struct pci_devinst *pi;
+ struct pci_devemu *pe;
+ int coff, idx;
+ uint64_t mask, bar;
+
+ assert(bytes == 1 || bytes == 2 || bytes == 4);
+
+ pi = pci_slotinfo[cfgslot].si_devi;
+ coff = cfgoff + (port - CONF1_DATA_PORT);
+
+#if 0
+ printf("pcicfg-%s from 0x%0x of %d bytes (%d/%d/%d)\n\r",
+ in ? "read" : "write", coff, bytes, cfgbus, cfgslot, cfgfunc);
+#endif
+
+ if (pi == NULL || cfgfunc != 0) {
+ if (in)
+ *eax = 0xffffffff;
+ return (0);
+ }
+
+ pe = pi->pi_d;
+
+ /*
+ * Config read
+ */
+ if (in) {
+ /* Let the device emulation override the default handler */
+ if (pe->pe_cfgread != NULL &&
+ (*pe->pe_cfgread)(ctx, vcpu, pi, coff, bytes, eax) == 0)
+ return (0);
+
+ if (bytes == 1)
+ *eax = pci_get_cfgdata8(pi, coff);
+ else if (bytes == 2)
+ *eax = pci_get_cfgdata16(pi, coff);
+ else
+ *eax = pci_get_cfgdata32(pi, coff);
+ } else {
+ /* Let the device emulation override the default handler */
+ if (pe->pe_cfgwrite != NULL &&
+ (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0)
+ return (0);
+
+ /*
+ * Special handling for write to BAR registers
+ */
+ if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) {
+ /*
+ * Ignore writes to BAR registers that are not
+ * 4-byte aligned.
+ */
+ if (bytes != 4 || (coff & 0x3) != 0)
+ return (0);
+ idx = (coff - PCIR_BAR(0)) / 4;
+ switch (pi->pi_bar[idx].type) {
+ case PCIBAR_NONE:
+ bar = 0;
+ break;
+ case PCIBAR_IO:
+ mask = ~(pi->pi_bar[idx].size - 1);
+ mask &= PCIM_BAR_IO_BASE;
+ bar = (*eax & mask) | PCIM_BAR_IO_SPACE;
+ break;
+ case PCIBAR_MEM32:
+ mask = ~(pi->pi_bar[idx].size - 1);
+ mask &= PCIM_BAR_MEM_BASE;
+ bar = *eax & mask;
+ bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
+ break;
+ case PCIBAR_MEM64:
+ mask = ~(pi->pi_bar[idx].size - 1);
+ mask &= PCIM_BAR_MEM_BASE;
+ bar = *eax & mask;
+ bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
+ PCIM_BAR_MEM_PREFETCH;
+ break;
+ case PCIBAR_MEMHI64:
+ mask = ~(pi->pi_bar[idx - 1].size - 1);
+ mask &= PCIM_BAR_MEM_BASE;
+ bar = ((uint64_t)*eax << 32) & mask;
+ bar = bar >> 32;
+ break;
+ default:
+ assert(0);
+ }
+ pci_set_cfgdata32(pi, coff, bar);
+ } else if (pci_emul_iscap(pi, coff)) {
+ pci_emul_capwrite(pi, coff, bytes, *eax);
+ } else {
+ CFGWRITE(pi, coff, *eax, bytes);
+ }
+ }
+
+ return (0);
+}
+
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata);
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata);
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata);
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata);
+
+/*
+ * I/O ports to configure PCI IRQ routing. We ignore all writes to it.
+ */
+static int
+pci_irq_port_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ assert(in == 0);
+ return (0);
+}
+INOUT_PORT(pci_irq, 0xC00, IOPORT_F_OUT, pci_irq_port_handler);
+INOUT_PORT(pci_irq, 0xC01, IOPORT_F_OUT, pci_irq_port_handler);
+
+#define PCI_EMUL_TEST
+#ifdef PCI_EMUL_TEST
+/*
+ * Define a dummy test device
+ */
+#define DREGSZ 20
+struct pci_emul_dsoftc {
+ uint8_t regs[DREGSZ];
+};
+
+#define PCI_EMUL_MSGS 4
+
+int
+pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ int error;
+ struct pci_emul_dsoftc *sc;
+
+ sc = malloc(sizeof(struct pci_emul_dsoftc));
+ memset(sc, 0, sizeof(struct pci_emul_dsoftc));
+
+ pi->pi_arg = sc;
+
+ pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001);
+ pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD);
+ pci_set_cfgdata8(pi, PCIR_CLASS, 0x02);
+
+ error = pci_emul_alloc_bar(pi, 0, 0, PCIBAR_IO, DREGSZ);
+ assert(error == 0);
+
+ error = pci_emul_add_msicap(pi, PCI_EMUL_MSGS);
+ assert(error == 0);
+
+ return (0);
+}
+
+void
+pci_emul_diow(struct pci_devinst *pi, int baridx, int offset, int size,
+ uint32_t value)
+{
+ int i;
+ struct pci_emul_dsoftc *sc = pi->pi_arg;
+
+ if (offset + size > DREGSZ) {
+ printf("diow: too large, offset %d size %d\n", offset, size);
+ return;
+ }
+
+ if (size == 1) {
+ sc->regs[offset] = value & 0xff;
+ } else if (size == 2) {
+ *(uint16_t *)&sc->regs[offset] = value & 0xffff;
+ } else {
+ *(uint32_t *)&sc->regs[offset] = value;
+ }
+
+ /*
+ * Special magic value to generate an interrupt
+ */
+ if (offset == 4 && size == 4 && pci_msi_enabled(pi))
+ pci_generate_msi(pi, value % pci_msi_msgnum(pi));
+
+ if (value == 0xabcdef) {
+ for (i = 0; i < pci_msi_msgnum(pi); i++)
+ pci_generate_msi(pi, i);
+ }
+}
+
+uint32_t
+pci_emul_dior(struct pci_devinst *pi, int baridx, int offset, int size)
+{
+ struct pci_emul_dsoftc *sc = pi->pi_arg;
+ uint32_t value;
+
+ if (offset + size > DREGSZ) {
+ printf("dior: too large, offset %d size %d\n", offset, size);
+ return (0);
+ }
+
+ if (size == 1) {
+ value = sc->regs[offset];
+ } else if (size == 2) {
+ value = *(uint16_t *) &sc->regs[offset];
+ } else {
+ value = *(uint32_t *) &sc->regs[offset];
+ }
+
+ return (value);
+}
+
+struct pci_devemu pci_dummy = {
+ .pe_emu = "dummy",
+ .pe_init = pci_emul_dinit,
+ .pe_iow = pci_emul_diow,
+ .pe_ior = pci_emul_dior
+};
+PCI_EMUL_SET(pci_dummy);
+
+#endif /* PCI_EMUL_TEST */
diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h
new file mode 100644
index 000000000000..f5f8e228c55e
--- /dev/null
+++ b/usr.sbin/bhyve/pci_emul.h
@@ -0,0 +1,171 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PCI_EMUL_H_
+#define _PCI_EMUL_H_
+
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <sys/kernel.h>
+
+#include <dev/pci/pcireg.h>
+
+#include <assert.h>
+
+#define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */
+#define PCIY_RESERVED 0x00
+
+struct vmctx;
+struct pci_devinst;
+
+struct pci_devemu {
+ char *pe_emu; /* Name of device emulation */
+
+ /* instance creation */
+ int (*pe_init)(struct vmctx *, struct pci_devinst *, char *opts);
+
+ /* config space read/write callbacks */
+ int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu,
+ struct pci_devinst *pi, int offset,
+ int bytes, uint32_t val);
+ int (*pe_cfgread)(struct vmctx *ctx, int vcpu,
+ struct pci_devinst *pi, int offset,
+ int bytes, uint32_t *retval);
+
+ /* I/O space read/write callbacks */
+ void (*pe_iow)(struct pci_devinst *pi, int baridx,
+ int offset, int size, uint32_t value);
+ uint32_t (*pe_ior)(struct pci_devinst *pi, int baridx,
+ int offset, int size);
+};
+#define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x);
+
+enum pcibar_type {
+ PCIBAR_NONE,
+ PCIBAR_IO,
+ PCIBAR_MEM32,
+ PCIBAR_MEM64,
+ PCIBAR_MEMHI64
+};
+
+struct pcibar {
+ enum pcibar_type type; /* io or memory */
+ uint64_t size;
+ uint64_t addr;
+};
+
+#define PI_NAMESZ 40
+
+struct pci_devinst {
+ struct pci_devemu *pi_d;
+ struct vmctx *pi_vmctx;
+ uint8_t pi_bus, pi_slot, pi_func;
+ char pi_name[PI_NAMESZ];
+ uint16_t pi_iobase;
+ int pi_bar_getsize;
+
+ struct {
+ int enabled;
+ int cpu;
+ int vector;
+ int msgnum;
+ } pi_msi;
+
+ void *pi_arg; /* devemu-private data */
+
+ u_char pi_cfgdata[PCI_REGMAX + 1];
+ struct pcibar pi_bar[PCI_BARMAX + 1];
+};
+
+struct msicap {
+ uint8_t capid;
+ uint8_t nextptr;
+ uint16_t msgctrl;
+ uint32_t addrlo;
+ uint32_t addrhi;
+ uint16_t msgdata;
+} __packed;
+
+void init_pci(struct vmctx *ctx);
+void pci_parse_slot(char *opt);
+void pci_parse_name(char *opt);
+void pci_callback(void);
+int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, uint64_t hostbase,
+ enum pcibar_type type, uint64_t size);
+int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum);
+void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+ int bytes, uint32_t val);
+
+void pci_generate_msi(struct pci_devinst *pi, int msgnum);
+int pci_msi_enabled(struct pci_devinst *pi);
+int pci_msi_msgnum(struct pci_devinst *pi);
+void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr);
+
+static __inline void
+pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val)
+{
+ assert(offset <= PCI_REGMAX);
+ *(uint8_t *)(pi->pi_cfgdata + offset) = val;
+}
+
+static __inline void
+pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val)
+{
+ assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
+ *(uint16_t *)(pi->pi_cfgdata + offset) = val;
+}
+
+static __inline void
+pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val)
+{
+ assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
+ *(uint32_t *)(pi->pi_cfgdata + offset) = val;
+}
+
+static __inline uint8_t
+pci_get_cfgdata8(struct pci_devinst *pi, int offset)
+{
+ assert(offset <= PCI_REGMAX);
+ return (*(uint8_t *)(pi->pi_cfgdata + offset));
+}
+
+static __inline uint16_t
+pci_get_cfgdata16(struct pci_devinst *pi, int offset)
+{
+ assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
+ return (*(uint16_t *)(pi->pi_cfgdata + offset));
+}
+
+static __inline uint32_t
+pci_get_cfgdata32(struct pci_devinst *pi, int offset)
+{
+ assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
+ return (*(uint32_t *)(pi->pi_cfgdata + offset));
+}
+
+#endif /* _PCI_EMUL_H_ */
diff --git a/usr.sbin/bhyve/pci_hostbridge.c b/usr.sbin/bhyve/pci_hostbridge.c
new file mode 100644
index 000000000000..c77762d8f921
--- /dev/null
+++ b/usr.sbin/bhyve/pci_hostbridge.c
@@ -0,0 +1,52 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "pci_emul.h"
+
+static int
+pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+
+ /* config space */
+ pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1275); /* NetApp */
+ pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1275); /* NetApp */
+ pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_BRIDGE);
+ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE);
+ pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST);
+
+ return (0);
+}
+
+struct pci_devemu pci_de_hostbridge = {
+ .pe_emu = "hostbridge",
+ .pe_init = pci_hostbridge_init,
+};
+PCI_EMUL_SET(pci_de_hostbridge);
diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c
new file mode 100644
index 000000000000..1c417fd58725
--- /dev/null
+++ b/usr.sbin/bhyve/pci_passthru.c
@@ -0,0 +1,508 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/pciio.h>
+#include <sys/ioctl.h>
+
+#include <dev/io/iodev.h>
+#include <machine/iodev.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+#include "pci_emul.h"
+
+#ifndef _PATH_DEVPCI
+#define _PATH_DEVPCI "/dev/pci"
+#endif
+
+#ifndef _PATH_DEVIO
+#define _PATH_DEVIO "/dev/io"
+#endif
+
+#define LEGACY_SUPPORT 1
+
+static int pcifd = -1;
+static int iofd = -1;
+
+struct passthru_softc {
+ struct pci_devinst *psc_pi;
+ struct pcibar psc_bar[PCI_BARMAX + 1];
+ struct {
+ int capoff;
+ int msgctrl;
+ int emulated;
+ } psc_msi;
+ struct pcisel psc_sel;
+};
+
+static int
+msi_caplen(int msgctrl)
+{
+ int len;
+
+ len = 10; /* minimum length of msi capability */
+
+ if (msgctrl & PCIM_MSICTRL_64BIT)
+ len += 4;
+
+#if 0
+ /*
+ * Ignore the 'mask' and 'pending' bits in the MSI capability.
+ * We'll let the guest manipulate them directly.
+ */
+ if (msgctrl & PCIM_MSICTRL_VECTOR)
+ len += 10;
+#endif
+
+ return (len);
+}
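+
+/*
+ * e.g. a 64-bit capable function (PCIM_MSICTRL_64BIT set) yields
+ * 10 + 4 == 14 bytes, i.e. sizeof(struct msicap).
+ */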
+
+static uint32_t
+read_config(const struct pcisel *sel, long reg, int width)
+{
+ struct pci_io pi;
+
+ bzero(&pi, sizeof(pi));
+ pi.pi_sel = *sel;
+ pi.pi_reg = reg;
+ pi.pi_width = width;
+
+ if (ioctl(pcifd, PCIOCREAD, &pi) < 0)
+ return (0); /* XXX */
+ else
+ return (pi.pi_data);
+}
+
+static void
+write_config(const struct pcisel *sel, long reg, int width, uint32_t data)
+{
+ struct pci_io pi;
+
+ bzero(&pi, sizeof(pi));
+ pi.pi_sel = *sel;
+ pi.pi_reg = reg;
+ pi.pi_width = width;
+ pi.pi_data = data;
+
+ (void)ioctl(pcifd, PCIOCWRITE, &pi); /* XXX */
+}
+
+#ifdef LEGACY_SUPPORT
+static int
+passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
+{
+ int capoff, i;
+ struct msicap msicap;
+ u_char *capdata;
+
+ pci_populate_msicap(&msicap, msgnum, nextptr);
+
+	/*
+	 * XXX
+	 * Copy the msi capability structure into the last 16 bytes of the
+	 * config space. This is wrong because it could shadow something
+	 * useful to the device.
+	 */
+ capoff = 256 - roundup(sizeof(msicap), 4);
+ capdata = (u_char *)&msicap;
+ for (i = 0; i < sizeof(msicap); i++)
+ pci_set_cfgdata8(pi, capoff + i, capdata[i]);
+
+ return (capoff);
+}
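+
+/*
+ * With sizeof(struct msicap) == 14 and roundup(14, 4) == 16, the
+ * synthesized capability lands at config offset 240 (0xf0).
+ */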
+#endif /* LEGACY_SUPPORT */
+
+static int
+cfginitmsi(struct passthru_softc *sc)
+{
+ int ptr, cap, sts, caplen;
+ uint32_t u32;
+ struct pcisel sel;
+ struct pci_devinst *pi;
+
+ pi = sc->psc_pi;
+ sel = sc->psc_sel;
+
+ /*
+ * Parse the capabilities and cache the location of the MSI
+ * capability.
+ */
+ sts = read_config(&sel, PCIR_STATUS, 2);
+ if (sts & PCIM_STATUS_CAPPRESENT) {
+ ptr = read_config(&sel, PCIR_CAP_PTR, 1);
+ while (ptr != 0 && ptr != 0xff) {
+ cap = read_config(&sel, ptr + PCICAP_ID, 1);
+ if (cap == PCIY_MSI) {
+ /*
+ * Copy the MSI capability into the config
+ * space of the emulated pci device
+ */
+ sc->psc_msi.capoff = ptr;
+ sc->psc_msi.msgctrl = read_config(&sel,
+ ptr + 2, 2);
+ sc->psc_msi.emulated = 0;
+ caplen = msi_caplen(sc->psc_msi.msgctrl);
+ while (caplen > 0) {
+ u32 = read_config(&sel, ptr, 4);
+ pci_set_cfgdata32(pi, ptr, u32);
+ caplen -= 4;
+ ptr += 4;
+ }
+ break;
+ }
+ ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1);
+ }
+ }
+
+#ifdef LEGACY_SUPPORT
+	/*
+	 * If the passthrough device does not support MSI then craft an
+	 * MSI capability for it. We link the new MSI capability at the
+	 * head of the list of capabilities.
+	 */
+ if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
+ int origptr, msiptr;
+ origptr = read_config(&sel, PCIR_CAP_PTR, 1);
+ msiptr = passthru_add_msicap(pi, 1, origptr);
+ sc->psc_msi.capoff = msiptr;
+ sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
+ sc->psc_msi.emulated = 1;
+ pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
+ }
+#endif
+
+ if (sc->psc_msi.capoff == 0) /* MSI or bust */
+ return (-1);
+ else
+ return (0);
+}
+
+static int
+cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
+{
+ int i, error;
+ struct pci_devinst *pi;
+ struct pci_bar_io bar;
+ enum pcibar_type bartype;
+ uint64_t base;
+
+ pi = sc->psc_pi;
+
+ /*
+ * Initialize BAR registers
+ */
+ for (i = 0; i <= PCI_BARMAX; i++) {
+ bzero(&bar, sizeof(bar));
+ bar.pbi_sel = sc->psc_sel;
+ bar.pbi_reg = PCIR_BAR(i);
+
+ if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0)
+ continue;
+
+ if (PCI_BAR_IO(bar.pbi_base)) {
+ bartype = PCIBAR_IO;
+ base = bar.pbi_base & PCIM_BAR_IO_BASE;
+ } else {
+ switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) {
+ case PCIM_BAR_MEM_64:
+ bartype = PCIBAR_MEM64;
+ break;
+ default:
+ bartype = PCIBAR_MEM32;
+ break;
+ }
+ base = bar.pbi_base & PCIM_BAR_MEM_BASE;
+ }
+
+ /* Cache information about the "real" BAR */
+ sc->psc_bar[i].type = bartype;
+ sc->psc_bar[i].size = bar.pbi_length;
+ sc->psc_bar[i].addr = base;
+
+ /* Allocate the BAR in the guest I/O or MMIO space */
+ error = pci_emul_alloc_bar(pi, i, base, bartype,
+ bar.pbi_length);
+ if (error)
+ return (-1);
+
+ /*
+ * Map the physical MMIO space in the guest MMIO space
+ */
+ if (bartype != PCIBAR_IO) {
+ error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
+ sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
+ pi->pi_bar[i].addr, pi->pi_bar[i].size, base);
+ if (error)
+ return (-1);
+ }
+
+ /*
+ * 64-bit BAR takes up two slots so skip the next one.
+ */
+ if (bartype == PCIBAR_MEM64) {
+ i++;
+ assert(i <= PCI_BARMAX);
+ sc->psc_bar[i].type = PCIBAR_MEMHI64;
+ }
+ }
+ return (0);
+}
+
+static int
+cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func)
+{
+ int error;
+ struct passthru_softc *sc;
+
+ error = 1;
+ sc = pi->pi_arg;
+
+ bzero(&sc->psc_sel, sizeof(struct pcisel));
+ sc->psc_sel.pc_bus = bus;
+ sc->psc_sel.pc_dev = slot;
+ sc->psc_sel.pc_func = func;
+
+ if (cfginitbar(ctx, sc) != 0)
+ goto done;
+
+ if (cfginitmsi(sc) != 0)
+ goto done;
+
+ error = 0; /* success */
+done:
+ return (error);
+}
+
+static int
+passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ int bus, slot, func, error;
+ struct passthru_softc *sc;
+
+	sc = NULL;
+	error = 1;
+	bus = slot = func = 0;	/* keep the cleanup path well-defined */
+
+ if (pcifd < 0) {
+ pcifd = open(_PATH_DEVPCI, O_RDWR, 0);
+ if (pcifd < 0)
+ goto done;
+ }
+
+ if (iofd < 0) {
+ iofd = open(_PATH_DEVIO, O_RDWR, 0);
+ if (iofd < 0)
+ goto done;
+ }
+
+ if (opts == NULL || sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3)
+ goto done;
+
+ if (vm_assign_pptdev(ctx, bus, slot, func) != 0)
+ goto done;
+
+ sc = malloc(sizeof(struct passthru_softc));
+ memset(sc, 0, sizeof(struct passthru_softc));
+
+ pi->pi_arg = sc;
+ sc->psc_pi = pi;
+
+ /* initialize config space */
+ if (cfginit(ctx, pi, bus, slot, func) != 0)
+ goto done;
+
+ error = 0; /* success */
+done:
+ if (error) {
+ free(sc);
+ vm_unassign_pptdev(ctx, bus, slot, func);
+ }
+ return (error);
+}
+
+static int
+bar_access(int coff)
+{
+ if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1))
+ return (1);
+ else
+ return (0);
+}
+
+static int
+msicap_access(struct passthru_softc *sc, int coff)
+{
+ int caplen;
+
+ if (sc->psc_msi.capoff == 0)
+ return (0);
+
+ caplen = msi_caplen(sc->psc_msi.msgctrl);
+
+ if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
+ return (1);
+ else
+ return (0);
+}
+
+static int
+passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff,
+ int bytes, uint32_t *rv)
+{
+ struct passthru_softc *sc;
+
+ sc = pi->pi_arg;
+
+	/*
+	 * PCI BARs and the MSI capability are emulated.
+	 */
+ if (bar_access(coff) || msicap_access(sc, coff))
+ return (-1);
+
+#ifdef LEGACY_SUPPORT
+ /*
+ * Emulate PCIR_CAP_PTR if this device does not support MSI capability
+ * natively.
+ */
+ if (sc->psc_msi.emulated) {
+ if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
+ return (-1);
+ }
+#endif
+
+ /* Everything else just read from the device's config space */
+ *rv = read_config(&sc->psc_sel, coff, bytes);
+
+ return (0);
+}
+
+static int
+passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int coff,
+ int bytes, uint32_t val)
+{
+ int error;
+ struct passthru_softc *sc;
+
+ sc = pi->pi_arg;
+
+ /*
+ * PCI BARs are emulated
+ */
+ if (bar_access(coff))
+ return (-1);
+
+ /*
+ * MSI capability is emulated
+ */
+ if (msicap_access(sc, coff)) {
+ msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val);
+
+ error = vm_setup_msi(ctx, vcpu, sc->psc_sel.pc_bus,
+ sc->psc_sel.pc_dev, sc->psc_sel.pc_func, pi->pi_msi.cpu,
+ pi->pi_msi.vector, pi->pi_msi.msgnum);
+ if (error != 0) {
+ printf("vm_setup_msi returned error %d\r\n", errno);
+ exit(1);
+ }
+ return (0);
+ }
+
+#ifdef LEGACY_SUPPORT
+ /*
+ * If this device does not support MSI natively then we cannot let
+ * the guest disable legacy interrupts from the device. It is the
+ * legacy interrupt that is triggering the virtual MSI to the guest.
+ */
+ if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
+ if (coff == PCIR_COMMAND && bytes == 2)
+ val &= ~PCIM_CMD_INTxDIS;
+ }
+#endif
+
+ write_config(&sc->psc_sel, coff, bytes, val);
+
+ return (0);
+}
+
+static void
+passthru_iow(struct pci_devinst *pi, int baridx, int offset, int size,
+ uint32_t value)
+{
+ struct passthru_softc *sc;
+ struct iodev_pio_req pio;
+
+ sc = pi->pi_arg;
+
+ bzero(&pio, sizeof(struct iodev_pio_req));
+ pio.access = IODEV_PIO_WRITE;
+ pio.port = sc->psc_bar[baridx].addr + offset;
+ pio.width = size;
+ pio.val = value;
+
+ (void)ioctl(iofd, IODEV_PIO, &pio);
+}
+
+static uint32_t
+passthru_ior(struct pci_devinst *pi, int baridx, int offset, int size)
+{
+ struct passthru_softc *sc;
+ struct iodev_pio_req pio;
+
+ sc = pi->pi_arg;
+
+ bzero(&pio, sizeof(struct iodev_pio_req));
+ pio.access = IODEV_PIO_READ;
+ pio.port = sc->psc_bar[baridx].addr + offset;
+ pio.width = size;
+ pio.val = 0;
+
+ (void)ioctl(iofd, IODEV_PIO, &pio);
+
+ return (pio.val);
+}
+
+struct pci_devemu passthru = {
+ .pe_emu = "passthru",
+ .pe_init = passthru_init,
+ .pe_cfgwrite = passthru_cfgwrite,
+ .pe_cfgread = passthru_cfgread,
+ .pe_iow = passthru_iow,
+ .pe_ior = passthru_ior,
+};
+PCI_EMUL_SET(passthru);
diff --git a/usr.sbin/bhyve/pci_virtio_block.c b/usr.sbin/bhyve/pci_virtio_block.c
new file mode 100644
index 000000000000..b86e21dff64e
--- /dev/null
+++ b/usr.sbin/bhyve/pci_virtio_block.c
@@ -0,0 +1,502 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/ioctl.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+
+#include "fbsdrun.h"
+#include "pci_emul.h"
+#include "virtio.h"
+
+#define VTBLK_RINGSZ 64
+
+#define VTBLK_CFGSZ 28
+
+#define VTBLK_R_CFG VTCFG_R_CFG0
+#define VTBLK_R_CFG_END	(VTBLK_R_CFG + VTBLK_CFGSZ - 1)
+#define VTBLK_R_MAX	VTBLK_R_CFG_END
+
+#define VTBLK_REGSZ	(VTBLK_R_MAX + 1)
+
+#define VTBLK_MAXSEGS 32
+
+#define VTBLK_S_OK 0
+#define VTBLK_S_IOERR 1
+
+/*
+ * Host capabilities
+ */
+#define VTBLK_S_HOSTCAPS \
+ ( 0x00000004 | /* host maximum request segments */ \
+ 0x10000000 ) /* supports indirect descriptors */
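+/*
+ * These constants appear to correspond to VIRTIO_BLK_F_SEG_MAX (1 << 2)
+ * and VIRTIO_RING_F_INDIRECT_DESC (1 << 28).
+ */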
+
+struct vring_hqueue {
+ /* Internal state */
+ uint16_t hq_size;
+ uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */
+
+ /* Host-context pointers to the queue */
+ struct virtio_desc *hq_dtable;
+ uint16_t *hq_avail_flags;
+ uint16_t *hq_avail_idx; /* monotonically increasing */
+ uint16_t *hq_avail_ring;
+
+ uint16_t *hq_used_flags;
+ uint16_t *hq_used_idx; /* monotonically increasing */
+ struct virtio_used *hq_used_ring;
+};
+
+/*
+ * Config space
+ */
+struct vtblk_config {
+ uint64_t vbc_capacity;
+ uint32_t vbc_size_max;
+ uint32_t vbc_seg_max;
+ uint16_t vbc_geom_c;
+ uint8_t vbc_geom_h;
+ uint8_t vbc_geom_s;
+ uint32_t vbc_blk_size;
+ uint32_t vbc_sectors_max;
+} __packed;
+CTASSERT(sizeof(struct vtblk_config) == VTBLK_CFGSZ);
+
+/*
+ * Fixed-size block header
+ */
+struct virtio_blk_hdr {
+#define VBH_OP_READ 0
+#define VBH_OP_WRITE 1
+ uint32_t vbh_type;
+ uint32_t vbh_ioprio;
+ uint64_t vbh_sector;
+} __packed;
+
+/*
+ * Debug printf
+ */
+static int pci_vtblk_debug;
+#define DPRINTF(params) do { if (pci_vtblk_debug) printf params; } while (0)
+#define WPRINTF(params) printf params
+
+/*
+ * Per-device softc
+ */
+struct pci_vtblk_softc {
+ struct pci_devinst *vbsc_pi;
+ int vbsc_fd;
+ int vbsc_status;
+ int vbsc_isr;
+ int vbsc_lastq;
+ uint32_t vbsc_features;
+ uint64_t vbsc_pfn;
+ struct vring_hqueue vbsc_q;
+ struct vtblk_config vbsc_cfg;
+};
+
+/*
+ * Return the number of available descriptors in the vring taking care
+ * of the 16-bit index wraparound.
+ */
+static int
+hq_num_avail(struct vring_hqueue *hq)
+{
+ int ndesc;
+
+ if (*hq->hq_avail_idx >= hq->hq_cur_aidx)
+ ndesc = *hq->hq_avail_idx - hq->hq_cur_aidx;
+ else
+ ndesc = UINT16_MAX - hq->hq_cur_aidx + *hq->hq_avail_idx + 1;
+
+ assert(ndesc >= 0 && ndesc <= hq->hq_size);
+
+ return (ndesc);
+}
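+
+/*
+ * Wrap example: hq_cur_aidx == 65534 and *hq_avail_idx == 1 (the guest
+ * published 3 entries across the 16-bit wrap) gives
+ * 65535 - 65534 + 1 + 1 == 3.
+ */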
+
+static void
+pci_vtblk_update_status(struct pci_vtblk_softc *sc, uint32_t value)
+{
+ if (value == 0) {
+ DPRINTF(("vtblk: device reset requested !\n"));
+ }
+
+ sc->vbsc_status = value;
+}
+
+static void
+pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vring_hqueue *hq)
+{
+ struct iovec iov[VTBLK_MAXSEGS];
+ struct virtio_blk_hdr *vbh;
+ struct virtio_desc *vd, *vid;
+ struct virtio_used *vu;
+ uint8_t *status;
+ int i;
+ int err;
+ int iolen;
+ int nsegs;
+ int uidx, aidx, didx;
+ int writeop;
+ off_t offset;
+
+ uidx = *hq->hq_used_idx;
+ aidx = hq->hq_cur_aidx;
+ didx = hq->hq_avail_ring[aidx % hq->hq_size];
+ assert(didx >= 0 && didx < hq->hq_size);
+
+ vd = &hq->hq_dtable[didx];
+
+ /*
+ * Verify that the descriptor is indirect, and obtain
+ * the pointer to the indirect descriptor.
+ * There has to be space for at least 3 descriptors
+ * in the indirect descriptor array: the block header,
+ * 1 or more data descriptors, and a status byte.
+ */
+ assert(vd->vd_flags & VRING_DESC_F_INDIRECT);
+
+ nsegs = vd->vd_len / sizeof(struct virtio_desc);
+ assert(nsegs >= 3);
+ assert(nsegs < VTBLK_MAXSEGS + 2);
+
+ vid = paddr_guest2host(vd->vd_addr);
+ assert((vid->vd_flags & VRING_DESC_F_INDIRECT) == 0);
+
+ /*
+ * The first descriptor will be the read-only fixed header
+ */
+ vbh = paddr_guest2host(vid[0].vd_addr);
+ assert(vid[0].vd_len == sizeof(struct virtio_blk_hdr));
+ assert(vid[0].vd_flags & VRING_DESC_F_NEXT);
+ assert((vid[0].vd_flags & VRING_DESC_F_WRITE) == 0);
+
+ writeop = (vbh->vbh_type == VBH_OP_WRITE);
+
+ offset = vbh->vbh_sector * DEV_BSIZE;
+
+ /*
+ * Build up the iovec based on the guest's data descriptors
+ */
+ for (i = 1, iolen = 0; i < nsegs - 1; i++) {
+ iov[i-1].iov_base = paddr_guest2host(vid[i].vd_addr);
+ iov[i-1].iov_len = vid[i].vd_len;
+ iolen += vid[i].vd_len;
+
+ assert(vid[i].vd_flags & VRING_DESC_F_NEXT);
+ assert((vid[i].vd_flags & VRING_DESC_F_INDIRECT) == 0);
+
+ /*
+ * - write op implies read-only descriptor,
+ * - read op implies write-only descriptor,
+ * therefore test the inverse of the descriptor bit
+ * to the op.
+ */
+ assert(((vid[i].vd_flags & VRING_DESC_F_WRITE) == 0) ==
+ writeop);
+ }
+
+ /* Lastly, get the address of the status byte */
+ status = paddr_guest2host(vid[nsegs - 1].vd_addr);
+ assert(vid[nsegs - 1].vd_len == 1);
+ assert((vid[nsegs - 1].vd_flags & VRING_DESC_F_NEXT) == 0);
+ assert(vid[nsegs - 1].vd_flags & VRING_DESC_F_WRITE);
+
+ DPRINTF(("virtio-block: %s op, %d bytes, %d segs, offset %ld\n\r",
+ writeop ? "write" : "read", iolen, nsegs - 2, offset));
+
+	if (writeop) {
+ err = pwritev(sc->vbsc_fd, iov, nsegs - 2, offset);
+ } else {
+ err = preadv(sc->vbsc_fd, iov, nsegs - 2, offset);
+ }
+
+ *status = err < 0 ? VTBLK_S_IOERR : VTBLK_S_OK;
+
+ /*
+ * Return the single indirect descriptor back to the host
+ */
+ vu = &hq->hq_used_ring[uidx % hq->hq_size];
+ vu->vu_idx = didx;
+ vu->vu_tlen = 1;
+ hq->hq_cur_aidx++;
+ *hq->hq_used_idx += 1;
+}
+
+static void
+pci_vtblk_qnotify(struct pci_vtblk_softc *sc)
+{
+ struct vring_hqueue *hq = &sc->vbsc_q;
+ int i;
+ int ndescs;
+
+ /*
+ * Calculate number of ring entries to process
+ */
+ ndescs = hq_num_avail(hq);
+
+ if (ndescs == 0)
+ return;
+
+ /*
+ * Run through all the entries, placing them into iovecs and
+ * sending when an end-of-packet is found
+ */
+ for (i = 0; i < ndescs; i++)
+ pci_vtblk_proc(sc, hq);
+
+ /*
+ * Generate an interrupt if able
+ */
+ if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0 &&
+ sc->vbsc_isr == 0) {
+ sc->vbsc_isr = 1;
+ pci_generate_msi(sc->vbsc_pi, 0);
+ }
+}
+
+static void
+pci_vtblk_ring_init(struct pci_vtblk_softc *sc, uint64_t pfn)
+{
+ struct vring_hqueue *hq;
+
+ sc->vbsc_pfn = pfn << VRING_PFN;
+
+ /*
+ * Set up host pointers to the various parts of the
+ * queue
+ */
+ hq = &sc->vbsc_q;
+ hq->hq_size = VTBLK_RINGSZ;
+
+ hq->hq_dtable = paddr_guest2host(pfn << VRING_PFN);
+ hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size);
+ hq->hq_avail_idx = hq->hq_avail_flags + 1;
+ hq->hq_avail_ring = hq->hq_avail_flags + 2;
+ hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring,
+ VRING_ALIGN);
+ hq->hq_used_idx = hq->hq_used_flags + 1;
+ hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);
+
+ /*
+ * Initialize queue indexes
+ */
+ hq->hq_cur_aidx = 0;
+}
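+
+/*
+ * For VTBLK_RINGSZ == 64 and 16-byte descriptors, the table above spans
+ * 1024 bytes, the avail ring (2 + 2 + 2 * 64 == 132 bytes) follows it,
+ * and the used ring starts at the next VRING_ALIGN boundary (offset
+ * 4096 with the conventional 4096-byte virtio alignment).
+ */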
+
+static int
+pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ struct stat sbuf;
+ struct pci_vtblk_softc *sc;
+ int fd;
+
+ if (opts == NULL) {
+ printf("virtio-block: backing device required\n");
+ return (1);
+ }
+
+ /*
+ * Access to guest memory is required. Fail if
+ * memory not mapped
+ */
+ if (paddr_guest2host(0) == NULL)
+ return (1);
+
+ /*
+ * The supplied backing file has to exist
+ */
+ fd = open(opts, O_RDWR);
+ if (fd < 0) {
+ perror("Could not open backing file");
+ return (1);
+ }
+
+ if (fstat(fd, &sbuf) < 0) {
+ perror("Could not stat backing file");
+ close(fd);
+ return (1);
+ }
+
+ sc = malloc(sizeof(struct pci_vtblk_softc));
+ memset(sc, 0, sizeof(struct pci_vtblk_softc));
+
+ pi->pi_arg = sc;
+ sc->vbsc_pi = pi;
+ sc->vbsc_fd = fd;
+
+ /* setup virtio block config space */
+ sc->vbsc_cfg.vbc_capacity = sbuf.st_size / DEV_BSIZE;
+ sc->vbsc_cfg.vbc_seg_max = VTBLK_MAXSEGS;
+ sc->vbsc_cfg.vbc_blk_size = DEV_BSIZE;
+ sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */
+ sc->vbsc_cfg.vbc_geom_c = 0; /* no geometry */
+ sc->vbsc_cfg.vbc_geom_h = 0;
+ sc->vbsc_cfg.vbc_geom_s = 0;
+ sc->vbsc_cfg.vbc_sectors_max = 0;
+
+ /* initialize config space */
+ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK);
+ pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
+ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
+ pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK);
+ pci_emul_alloc_bar(pi, 0, 0, PCIBAR_IO, VTBLK_REGSZ);
+ pci_emul_add_msicap(pi, 1);
+
+ return (0);
+}
+
+static void
+pci_vtblk_write(struct pci_devinst *pi, int baridx, int offset, int size,
+ uint32_t value)
+{
+ struct pci_vtblk_softc *sc = pi->pi_arg;
+
+ if (offset + size > VTBLK_REGSZ) {
+ DPRINTF(("vtblk_write: 2big, offset %d size %d\n",
+ offset, size));
+ return;
+ }
+
+ switch (offset) {
+ case VTCFG_R_GUESTCAP:
+ assert(size == 4);
+ sc->vbsc_features = value & VTBLK_S_HOSTCAPS;
+ break;
+ case VTCFG_R_PFN:
+ assert(size == 4);
+ pci_vtblk_ring_init(sc, value);
+ break;
+ case VTCFG_R_QSEL:
+ assert(size == 2);
+ sc->vbsc_lastq = value;
+ break;
+ case VTCFG_R_QNOTIFY:
+ assert(size == 2);
+ assert(value == 0);
+ pci_vtblk_qnotify(sc);
+ break;
+ case VTCFG_R_STATUS:
+ assert(size == 1);
+ pci_vtblk_update_status(sc, value);
+ break;
+ case VTCFG_R_HOSTCAP:
+ case VTCFG_R_QNUM:
+ case VTCFG_R_ISR:
+ case VTBLK_R_CFG ... VTBLK_R_CFG_END:
+ DPRINTF(("vtblk: write to readonly reg %d\n\r", offset));
+ break;
+	default:
+		DPRINTF(("vtblk: unknown i/o write offset %d\n\r", offset));
+		break;
+ }
+}
+
+static uint32_t
+pci_vtblk_read(struct pci_devinst *pi, int baridx, int offset, int size)
+{
+ struct pci_vtblk_softc *sc = pi->pi_arg;
+ uint32_t value;
+
+ if (offset + size > VTBLK_REGSZ) {
+ DPRINTF(("vtblk_read: 2big, offset %d size %d\n",
+ offset, size));
+ return (0);
+ }
+
+ switch (offset) {
+ case VTCFG_R_HOSTCAP:
+ assert(size == 4);
+ value = VTBLK_S_HOSTCAPS;
+ break;
+ case VTCFG_R_GUESTCAP:
+ assert(size == 4);
+		value = sc->vbsc_features; /* XXX never read? */
+ break;
+ case VTCFG_R_PFN:
+ assert(size == 4);
+ value = sc->vbsc_pfn >> VRING_PFN;
+ break;
+	case VTCFG_R_QNUM:
+		assert(size == 2);
+		value = (sc->vbsc_lastq == 0) ? VTBLK_RINGSZ : 0;
+ break;
+ case VTCFG_R_QSEL:
+ assert(size == 2);
+		value = sc->vbsc_lastq; /* XXX never read? */
+ break;
+ case VTCFG_R_QNOTIFY:
+ assert(size == 2);
+		value = 0; /* XXX never read? */
+ break;
+ case VTCFG_R_STATUS:
+ assert(size == 1);
+ value = sc->vbsc_status;
+ break;
+ case VTCFG_R_ISR:
+ assert(size == 1);
+ value = sc->vbsc_isr;
+ sc->vbsc_isr = 0; /* a read clears this flag */
+ break;
+ case VTBLK_R_CFG ... VTBLK_R_CFG_END:
+ assert(size == 1);
+ value = *((uint8_t *)&sc->vbsc_cfg + offset - VTBLK_R_CFG);
+ break;
+ default:
+ DPRINTF(("vtblk: unknown i/o read offset %d\n\r", offset));
+ value = 0;
+ break;
+ }
+
+ return (value);
+}
+
+struct pci_devemu pci_de_vblk = {
+ .pe_emu = "virtio-blk",
+ .pe_init = pci_vtblk_init,
+ .pe_iow = pci_vtblk_write,
+ .pe_ior = pci_vtblk_read,
+};
+PCI_EMUL_SET(pci_de_vblk);
diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c
new file mode 100644
index 000000000000..5db1eb7f5a73
--- /dev/null
+++ b/usr.sbin/bhyve/pci_virtio_net.c
@@ -0,0 +1,739 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/select.h>
+#include <sys/uio.h>
+#include <sys/ioctl.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <md5.h>
+#include <pthread.h>
+
+#include "fbsdrun.h"
+#include "pci_emul.h"
+#include "mevent.h"
+#include "virtio.h"
+
+#define VTNET_RINGSZ 256
+
+#define VTNET_MAXSEGS 32
+
+/*
+ * PCI config-space register offsets
+ */
+#define VTNET_R_CFG0 20
+#define VTNET_R_CFG1 21
+#define VTNET_R_CFG2 22
+#define VTNET_R_CFG3 23
+#define VTNET_R_CFG4 24
+#define VTNET_R_CFG5 25
+#define VTNET_R_CFG6 26
+#define VTNET_R_CFG7 27
+#define VTNET_R_MAX 27
+
+#define VTNET_REGSZ	(VTNET_R_MAX + 1)
+
+/*
+ * Host capabilities
+ */
+#define VTNET_S_HOSTCAPS \
+ ( 0x00000020 | /* host supplies MAC */ \
+ 0x00008000 | /* host can merge Rx buffers */ \
+ 0x00010000 ) /* config status available */
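+/*
+ * These constants appear to correspond to VIRTIO_NET_F_MAC (1 << 5),
+ * VIRTIO_NET_F_MRG_RXBUF (1 << 15) and VIRTIO_NET_F_STATUS (1 << 16).
+ */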
+
+/*
+ * Queue definitions.
+ */
+#define VTNET_RXQ 0
+#define VTNET_TXQ 1
+#define VTNET_CTLQ 2
+
+#define VTNET_MAXQ 3
+
+struct vring_hqueue {
+ /* Internal state */
+ uint16_t hq_size;
+ uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */
+
+ /* Host-context pointers to the queue */
+ struct virtio_desc *hq_dtable;
+ uint16_t *hq_avail_flags;
+ uint16_t *hq_avail_idx; /* monotonically increasing */
+ uint16_t *hq_avail_ring;
+
+ uint16_t *hq_used_flags;
+ uint16_t *hq_used_idx; /* monotonically increasing */
+ struct virtio_used *hq_used_ring;
+};
+
+/*
+ * Fixed network header size
+ */
+struct virtio_net_rxhdr {
+ uint8_t vrh_flags;
+ uint8_t vrh_gso_type;
+ uint16_t vrh_hdr_len;
+ uint16_t vrh_gso_size;
+ uint16_t vrh_csum_start;
+ uint16_t vrh_csum_offset;
+ uint16_t vrh_bufs;
+} __packed;
+
+/*
+ * Debug printf
+ */
+static int pci_vtnet_debug;
+#define DPRINTF(params) do { if (pci_vtnet_debug) printf params; } while (0)
+#define WPRINTF(params) printf params
+
+/*
+ * Per-device softc
+ */
+struct pci_vtnet_softc {
+ struct pci_devinst *vsc_pi;
+ pthread_mutex_t vsc_mtx;
+ struct mevent *vsc_mevp;
+
+ int vsc_curq;
+ int vsc_status;
+ int vsc_isr;
+ int vsc_tapfd;
+ int vsc_rx_ready;
+ int vsc_rxpend;
+
+ uint32_t vsc_features;
+ uint8_t vsc_macaddr[6];
+
+ uint64_t vsc_pfn[VTNET_MAXQ];
+ struct vring_hqueue vsc_hq[VTNET_MAXQ];
+};
+
+/*
+ * Return the number of available descriptors in the vring taking care
+ * of the 16-bit index wraparound.
+ */
+static int
+hq_num_avail(struct vring_hqueue *hq)
+{
+ int ndesc;
+
+ if (*hq->hq_avail_idx >= hq->hq_cur_aidx)
+ ndesc = *hq->hq_avail_idx - hq->hq_cur_aidx;
+ else
+ ndesc = UINT16_MAX - hq->hq_cur_aidx + *hq->hq_avail_idx + 1;
+
+ assert(ndesc >= 0 && ndesc <= hq->hq_size);
+
+ return (ndesc);
+}
+
+static uint16_t
+pci_vtnet_qsize(int qnum)
+{
+ /* XXX no ctl queue currently */
+ if (qnum == VTNET_CTLQ) {
+ return (0);
+ }
+
+ /* XXX fixed currently. Maybe different for tx/rx/ctl */
+ return (VTNET_RINGSZ);
+}
+
+static void
+pci_vtnet_update_status(struct pci_vtnet_softc *sc, uint32_t value)
+{
+ if (value == 0) {
+ DPRINTF(("vtnet: device reset requested !\n"));
+ }
+
+ sc->vsc_status = value;
+}
+
+/*
+ * Called to send a buffer chain out to the tap device
+ */
+static void
+pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
+ int len)
+{
+ char pad[60];
+
+ if (sc->vsc_tapfd == -1)
+ return;
+
+	/*
+	 * If the length is < 60, pad out to the minimum ethernet frame
+	 * size and add the extra zero'd segment to the iov. The caller
+	 * guarantees that an extra iov slot is always available.
+	 */
+ if (len < 60) {
+ memset(pad, 0, 60 - len);
+ iov[iovcnt].iov_base = pad;
+ iov[iovcnt].iov_len = 60 - len;
+ iovcnt++;
+ }
+ (void) writev(sc->vsc_tapfd, iov, iovcnt);
+}
+
+/*
+ * Called when there is read activity on the tap file descriptor.
+ * Each buffer posted by the guest is assumed to be able to contain
+ * an entire ethernet frame + rx header.
+ * MP note: the dummybuf is only used for discarding frames, so there
+ * is no need for it to be per-vtnet or locked.
+ */
+static uint8_t dummybuf[2048];
+
+static void
+pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
+{
+ struct virtio_desc *vd;
+ struct virtio_used *vu;
+ struct vring_hqueue *hq;
+ struct virtio_net_rxhdr *vrx;
+ uint8_t *buf;
+ int i;
+ int len;
+ int ndescs;
+ int didx, uidx, aidx; /* descriptor, avail and used index */
+
+ /*
+ * Should never be called without a valid tap fd
+ */
+ assert(sc->vsc_tapfd != -1);
+
+	/*
+	 * But this may be called before the rx ring has been
+	 * set up.
+	 */
+ if (sc->vsc_rx_ready == 0) {
+ /*
+ * Drop the packet and try later.
+ */
+ (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+ return;
+ }
+
+ /*
+ * Calculate the number of available rx buffers
+ */
+ hq = &sc->vsc_hq[VTNET_RXQ];
+
+ ndescs = hq_num_avail(hq);
+
+ if (ndescs == 0) {
+ /*
+ * Need to wait for host notification to read
+ */
+ if (sc->vsc_rxpend == 0) {
+ WPRINTF(("vtnet: no rx descriptors !\n"));
+ sc->vsc_rxpend = 1;
+ }
+
+ /*
+ * Drop the packet and try later
+ */
+ (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+ return;
+ }
+
+ aidx = hq->hq_cur_aidx;
+ uidx = *hq->hq_used_idx;
+ for (i = 0; i < ndescs; i++) {
+		/*
+		 * 'aidx' indexes into an array of descriptor indexes
+		 */
+ didx = hq->hq_avail_ring[aidx % hq->hq_size];
+ assert(didx >= 0 && didx < hq->hq_size);
+
+ vd = &hq->hq_dtable[didx];
+
+ /*
+ * Get a pointer to the rx header, and use the
+ * data immediately following it for the packet buffer.
+ */
+ vrx = (struct virtio_net_rxhdr *)paddr_guest2host(vd->vd_addr);
+ buf = (uint8_t *)(vrx + 1);
+
+ len = read(sc->vsc_tapfd, buf,
+ vd->vd_len - sizeof(struct virtio_net_rxhdr));
+
+ if (len < 0 && errno == EWOULDBLOCK) {
+ break;
+ }
+
+ /*
+ * The only valid field in the rx packet header is the
+ * number of buffers, which is always 1 without TSO
+ * support.
+ */
+ memset(vrx, 0, sizeof(struct virtio_net_rxhdr));
+ vrx->vrh_bufs = 1;
+
+ /*
+ * Write this descriptor into the used ring
+ */
+ vu = &hq->hq_used_ring[uidx % hq->hq_size];
+ vu->vu_idx = didx;
+ vu->vu_tlen = len + sizeof(struct virtio_net_rxhdr);
+ uidx++;
+ aidx++;
+ }
+
+ /*
+ * Update the used pointer, and signal an interrupt if allowed
+ */
+ *hq->hq_used_idx = uidx;
+ hq->hq_cur_aidx = aidx;
+
+ if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
+ sc->vsc_isr |= 1;
+ pci_generate_msi(sc->vsc_pi, 0);
+ }
+}
+
+static void
+pci_vtnet_tap_callback(int fd, enum ev_type type, void *param)
+{
+ struct pci_vtnet_softc *sc = param;
+
+ pthread_mutex_lock(&sc->vsc_mtx);
+ pci_vtnet_tap_rx(sc);
+ pthread_mutex_unlock(&sc->vsc_mtx);
+}
+
+static void
+pci_vtnet_ping_rxq(struct pci_vtnet_softc *sc)
+{
+ /*
+ * A qnotify means that the rx process can now begin
+ */
+ if (sc->vsc_rx_ready == 0) {
+ sc->vsc_rx_ready = 1;
+ }
+
+ /*
+ * If the rx queue was empty, attempt to receive a
+ * packet that was previously blocked due to no rx bufs
+ * available
+ */
+ if (sc->vsc_rxpend) {
+ WPRINTF(("vtnet: rx resumed\n\r"));
+ sc->vsc_rxpend = 0;
+ pci_vtnet_tap_rx(sc);
+ }
+}
+
+static void
+pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vring_hqueue *hq)
+{
+ struct iovec iov[VTNET_MAXSEGS + 1];
+ struct virtio_desc *vd;
+ struct virtio_used *vu;
+ int i;
+ int plen;
+ int tlen;
+ int uidx, aidx, didx;
+
+ uidx = *hq->hq_used_idx;
+ aidx = hq->hq_cur_aidx;
+ didx = hq->hq_avail_ring[aidx % hq->hq_size];
+ assert(didx >= 0 && didx < hq->hq_size);
+
+ vd = &hq->hq_dtable[didx];
+
+ /*
+ * Run through the chain of descriptors, ignoring the
+ * first header descriptor. However, include the header
+ * length in the total length that will be put into the
+ * used queue.
+ */
+ tlen = vd->vd_len;
+ vd = &hq->hq_dtable[vd->vd_next];
+
+ for (i = 0, plen = 0;
+ i < VTNET_MAXSEGS;
+ i++, vd = &hq->hq_dtable[vd->vd_next]) {
+ iov[i].iov_base = paddr_guest2host(vd->vd_addr);
+ iov[i].iov_len = vd->vd_len;
+ plen += vd->vd_len;
+ tlen += vd->vd_len;
+
+ if ((vd->vd_flags & VRING_DESC_F_NEXT) == 0)
+ break;
+ }
+ assert(i < VTNET_MAXSEGS);
+
+ DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, i + 1));
+ pci_vtnet_tap_tx(sc, iov, i + 1, plen);
+
+ /*
+ * Return this chain back to the host
+ */
+ vu = &hq->hq_used_ring[uidx % hq->hq_size];
+ vu->vu_idx = didx;
+ vu->vu_tlen = tlen;
+ hq->hq_cur_aidx = aidx + 1;
+ *hq->hq_used_idx = uidx + 1;
+
+ /*
+ * Generate an interrupt if able
+ */
+ if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
+ sc->vsc_isr |= 1;
+ pci_generate_msi(sc->vsc_pi, 0);
+ }
+}
+
+static void
+pci_vtnet_ping_txq(struct pci_vtnet_softc *sc)
+{
+ struct vring_hqueue *hq = &sc->vsc_hq[VTNET_TXQ];
+ int i;
+ int ndescs;
+
+ /*
+ * Calculate number of ring entries to process
+ */
+ ndescs = hq_num_avail(hq);
+
+ if (ndescs == 0)
+ return;
+
+ /*
+ * Run through all the entries, placing them into iovecs and
+ * sending when an end-of-packet is found
+ */
+ for (i = 0; i < ndescs; i++)
+ pci_vtnet_proctx(sc, hq);
+}
+
+static void
+pci_vtnet_ping_ctlq(struct pci_vtnet_softc *sc)
+{
+
+ DPRINTF(("vtnet: control qnotify!\n\r"));
+}
+
+static void
+pci_vtnet_ring_init(struct pci_vtnet_softc *sc, uint64_t pfn)
+{
+ struct vring_hqueue *hq;
+ int qnum = sc->vsc_curq;
+
+ assert(qnum < VTNET_MAXQ);
+
+ sc->vsc_pfn[qnum] = pfn << VRING_PFN;
+
+ /*
+ * Set up host pointers to the various parts of the
+ * queue
+ */
+ hq = &sc->vsc_hq[qnum];
+ hq->hq_size = pci_vtnet_qsize(qnum);
+
+ hq->hq_dtable = paddr_guest2host(pfn << VRING_PFN);
+ hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size);
+ hq->hq_avail_idx = hq->hq_avail_flags + 1;
+ hq->hq_avail_ring = hq->hq_avail_flags + 2;
+ hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring,
+ VRING_ALIGN);
+ hq->hq_used_idx = hq->hq_used_flags + 1;
+ hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);
+
+ /*
+ * Initialize queue indexes
+ */
+ hq->hq_cur_aidx = 0;
+}
+
+static int
+pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ MD5_CTX mdctx;
+ unsigned char digest[16];
+ char nstr[80];
+ struct pci_vtnet_softc *sc;
+
+ /*
+ * Access to guest memory is required. Fail if
+ * memory not mapped
+ */
+ if (paddr_guest2host(0) == NULL)
+ return (1);
+
+ sc = malloc(sizeof(struct pci_vtnet_softc));
+ memset(sc, 0, sizeof(struct pci_vtnet_softc));
+
+ pi->pi_arg = sc;
+ sc->vsc_pi = pi;
+
+ pthread_mutex_init(&sc->vsc_mtx, NULL);
+
+ /*
+ * Attempt to open the tap device
+ */
+ sc->vsc_tapfd = -1;
+ if (opts != NULL) {
+ char tbuf[80];
+
+ strcpy(tbuf, "/dev/");
+		strncat(tbuf, opts, sizeof(tbuf) - strlen(tbuf) - 1);
+
+ sc->vsc_tapfd = open(tbuf, O_RDWR);
+ if (sc->vsc_tapfd == -1) {
+ WPRINTF(("open of tap device %s failed\n", tbuf));
+ } else {
+ /*
+ * Set non-blocking and register for read
+ * notifications with the event loop
+ */
+ int opt = 1;
+ if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
+ WPRINTF(("tap device O_NONBLOCK failed\n"));
+ close(sc->vsc_tapfd);
+ sc->vsc_tapfd = -1;
+ }
+
+ sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
+ EVF_READ,
+ pci_vtnet_tap_callback,
+ sc);
+ if (sc->vsc_mevp == NULL) {
+ WPRINTF(("Could not register event\n"));
+ close(sc->vsc_tapfd);
+ sc->vsc_tapfd = -1;
+ }
+ }
+ }
+
+ /*
+ * The MAC address is the standard NetApp OUI of 00-a0-98,
+ * followed by an MD5 of the vm name. The slot number is
+ * prepended to this for slots other than 1, so that
+ * CFE can netboot from the equivalent of slot 1.
+ */
+ if (pi->pi_slot == 1) {
+		snprintf(nstr, sizeof(nstr), "%s", vmname);
+ } else {
+ snprintf(nstr, sizeof(nstr), "%d-%s", pi->pi_slot, vmname);
+ }
+
+ MD5Init(&mdctx);
+ MD5Update(&mdctx, nstr, strlen(nstr));
+ MD5Final(digest, &mdctx);
+
+ sc->vsc_macaddr[0] = 0x00;
+ sc->vsc_macaddr[1] = 0xa0;
+ sc->vsc_macaddr[2] = 0x98;
+ sc->vsc_macaddr[3] = digest[0];
+ sc->vsc_macaddr[4] = digest[1];
+ sc->vsc_macaddr[5] = digest[2];
+
+ /* initialize config space */
+ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
+ pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
+ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
+ pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
+ pci_emul_alloc_bar(pi, 0, 0, PCIBAR_IO, VTNET_REGSZ);
+ pci_emul_add_msicap(pi, 1);
+
+ return (0);
+}
+
+/*
+ * Function pointer array to handle queue notifications
+ */
+static void (*pci_vtnet_qnotify[VTNET_MAXQ])(struct pci_vtnet_softc *) = {
+ pci_vtnet_ping_rxq,
+ pci_vtnet_ping_txq,
+ pci_vtnet_ping_ctlq
+};
+
+static void
+pci_vtnet_write(struct pci_devinst *pi, int baridx, int offset, int size,
+ uint32_t value)
+{
+ struct pci_vtnet_softc *sc = pi->pi_arg;
+
+ if (offset + size > VTNET_REGSZ) {
+ DPRINTF(("vtnet_write: 2big, offset %d size %d\n",
+ offset, size));
+ return;
+ }
+
+ pthread_mutex_lock(&sc->vsc_mtx);
+
+ switch (offset) {
+ case VTCFG_R_GUESTCAP:
+ assert(size == 4);
+ sc->vsc_features = value & VTNET_S_HOSTCAPS;
+ break;
+ case VTCFG_R_PFN:
+ assert(size == 4);
+ pci_vtnet_ring_init(sc, value);
+ break;
+ case VTCFG_R_QSEL:
+ assert(size == 2);
+ assert(value < VTNET_MAXQ);
+ sc->vsc_curq = value;
+ break;
+ case VTCFG_R_QNOTIFY:
+ assert(size == 2);
+ assert(value < VTNET_MAXQ);
+ (*pci_vtnet_qnotify[value])(sc);
+ break;
+ case VTCFG_R_STATUS:
+ assert(size == 1);
+ pci_vtnet_update_status(sc, value);
+ break;
+ case VTNET_R_CFG0:
+ case VTNET_R_CFG1:
+ case VTNET_R_CFG2:
+ case VTNET_R_CFG3:
+ case VTNET_R_CFG4:
+ case VTNET_R_CFG5:
+ /*
+ * The driver is allowed to change the MAC address
+ */
+ assert(size == 1);
+ sc->vsc_macaddr[offset - VTNET_R_CFG0] = value;
+ break;
+ case VTCFG_R_HOSTCAP:
+ case VTCFG_R_QNUM:
+ case VTCFG_R_ISR:
+ case VTNET_R_CFG6:
+ case VTNET_R_CFG7:
+ DPRINTF(("vtnet: write to readonly reg %d\n\r", offset));
+ break;
+	default:
+		DPRINTF(("vtnet: unknown i/o write offset %d\n\r", offset));
+		break;
+ }
+
+ pthread_mutex_unlock(&sc->vsc_mtx);
+}
+
+static uint32_t
+pci_vtnet_read(struct pci_devinst *pi, int baridx, int offset, int size)
+{
+ struct pci_vtnet_softc *sc = pi->pi_arg;
+ uint32_t value;
+
+ if (offset + size > VTNET_REGSZ) {
+ DPRINTF(("vtnet_read: 2big, offset %d size %d\n",
+ offset, size));
+ return (0);
+ }
+
+ pthread_mutex_lock(&sc->vsc_mtx);
+
+ switch (offset) {
+ case VTCFG_R_HOSTCAP:
+ assert(size == 4);
+ value = VTNET_S_HOSTCAPS;
+ break;
+ case VTCFG_R_GUESTCAP:
+ assert(size == 4);
+		value = sc->vsc_features; /* XXX never read? */
+ break;
+ case VTCFG_R_PFN:
+ assert(size == 4);
+ value = sc->vsc_pfn[sc->vsc_curq] >> VRING_PFN;
+ break;
+ case VTCFG_R_QNUM:
+ assert(size == 2);
+ value = pci_vtnet_qsize(sc->vsc_curq);
+ break;
+ case VTCFG_R_QSEL:
+ assert(size == 2);
+		value = sc->vsc_curq; /* XXX never read? */
+ break;
+ case VTCFG_R_QNOTIFY:
+ assert(size == 2);
+		value = sc->vsc_curq; /* XXX never read? */
+ break;
+ case VTCFG_R_STATUS:
+ assert(size == 1);
+ value = sc->vsc_status;
+ break;
+ case VTCFG_R_ISR:
+ assert(size == 1);
+ value = sc->vsc_isr;
+ sc->vsc_isr = 0; /* a read clears this flag */
+ break;
+ case VTNET_R_CFG0:
+ case VTNET_R_CFG1:
+ case VTNET_R_CFG2:
+ case VTNET_R_CFG3:
+ case VTNET_R_CFG4:
+ case VTNET_R_CFG5:
+ assert(size == 1);
+ value = sc->vsc_macaddr[offset - VTNET_R_CFG0];
+ break;
+ case VTNET_R_CFG6:
+ assert(size == 1);
+ value = 0x01; /* XXX link always up */
+ break;
+ case VTNET_R_CFG7:
+ assert(size == 1);
+ value = 0; /* link status is in the LSB */
+ break;
+ default:
+ DPRINTF(("vtnet: unknown i/o read offset %d\n\r", offset));
+ value = 0;
+ break;
+ }
+
+ pthread_mutex_unlock(&sc->vsc_mtx);
+
+ return (value);
+}
+
+struct pci_devemu pci_de_vnet = {
+ .pe_emu = "virtio-net",
+ .pe_init = pci_vtnet_init,
+ .pe_iow = pci_vtnet_write,
+ .pe_ior = pci_vtnet_read,
+};
+PCI_EMUL_SET(pci_de_vnet);
diff --git a/usr.sbin/bhyve/pit_8254.c b/usr.sbin/bhyve/pit_8254.c
new file mode 100644
index 000000000000..b5101616b035
--- /dev/null
+++ b/usr.sbin/bhyve/pit_8254.c
@@ -0,0 +1,196 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/time.h>
+
+#include <machine/clock.h>
+
+#include <stdio.h>
+#include <assert.h>
+
+#include "fbsdrun.h"
+#include "inout.h"
+#include "pit_8254.h"
+
+#define TIMER_SEL_MASK 0xc0
+#define TIMER_RW_MASK 0x30
+#define TIMER_MODE_MASK 0x0f
+#define TIMER_SEL_READBACK 0xc0
+
+#define TIMER_DIV(freq, hz) (((freq) + (hz) / 2) / (hz))
+
+#define PIT_8254_FREQ 1193182
+static const int nsecs_per_tick = 1000000000 / PIT_8254_FREQ;
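+
+/*
+ * e.g. TIMER_DIV(PIT_8254_FREQ, 100) == 11932 counts for a 100Hz guest,
+ * and nsecs_per_tick works out to 838.
+ */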
+
+struct counter {
+ struct timeval tv; /* uptime when counter was loaded */
+ uint16_t initial; /* initial counter value */
+ uint8_t cr[2];
+ uint8_t ol[2];
+ int crbyte;
+ int olbyte;
+};
+
+static void
+timevalfix(struct timeval *t1)
+{
+
+ if (t1->tv_usec < 0) {
+ t1->tv_sec--;
+ t1->tv_usec += 1000000;
+ }
+ if (t1->tv_usec >= 1000000) {
+ t1->tv_sec++;
+ t1->tv_usec -= 1000000;
+ }
+}
+
+static void
+timevalsub(struct timeval *t1, const struct timeval *t2)
+{
+
+ t1->tv_sec -= t2->tv_sec;
+ t1->tv_usec -= t2->tv_usec;
+ timevalfix(t1);
+}
+
+static void
+latch(struct counter *c)
+{
+ struct timeval tv2;
+ uint16_t lval;
+ uint64_t delta_nsecs, delta_ticks;
+
+ /* cannot latch a new value until the old one has been consumed */
+ if (c->olbyte != 0)
+ return;
+
+ if (c->initial == 0 || c->initial == 1) {
+ /*
+ * XXX the program that runs the VM can be stopped and
+ * restarted at any time. This means that state that was
+ * created by the guest is destroyed between invocations
+ * of the program.
+ *
+ * If the counter's initial value is not programmed we
+ * assume a value that would be set to generate 'guest_hz'
+ * interrupts per second.
+ */
+ c->initial = TIMER_DIV(PIT_8254_FREQ, guest_hz);
+ gettimeofday(&c->tv, NULL);
+ }
+
+ (void)gettimeofday(&tv2, NULL);
+ timevalsub(&tv2, &c->tv);
+ delta_nsecs = tv2.tv_sec * 1000000000 + tv2.tv_usec * 1000;
+ delta_ticks = delta_nsecs / nsecs_per_tick;
+
+ lval = c->initial - delta_ticks % c->initial;
+ c->olbyte = 2;
+ c->ol[1] = lval; /* LSB */
+ c->ol[0] = lval >> 8; /* MSB */
+}
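+
+/*
+ * Worked example: with c->initial == 11932 (a 100Hz guest) and
+ * delta_ticks == 30000, 30000 % 11932 == 6136, so the latched value is
+ * 11932 - 6136 == 5796, the count remaining in the current period.
+ */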
+
+static int
+pit_8254_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ int sel, rw, mode;
+ uint8_t val;
+ struct counter *c;
+
+ static struct counter counter[3];
+
+ if (bytes != 1)
+ return (-1);
+
+ val = *eax;
+
+ if (port == TIMER_MODE) {
+ assert(in == 0);
+ sel = val & TIMER_SEL_MASK;
+ rw = val & TIMER_RW_MASK;
+ mode = val & TIMER_MODE_MASK;
+
+ if (sel == TIMER_SEL_READBACK)
+ return (-1);
+ if (rw != TIMER_LATCH && rw != TIMER_16BIT)
+ return (-1);
+
+ if (rw != TIMER_LATCH) {
+ /*
+ * Counter mode is not affected when issuing a
+ * latch command.
+ */
+ if (mode != TIMER_RATEGEN && mode != TIMER_SQWAVE)
+ return (-1);
+ }
+
+ c = &counter[sel >> 6];
+ if (rw == TIMER_LATCH)
+ latch(c);
+ else
+ c->olbyte = 0; /* reset latch after reprogramming */
+
+ return (0);
+ }
+
+ /* counter ports */
+ assert(port >= TIMER_CNTR0 && port <= TIMER_CNTR2);
+ c = &counter[port - TIMER_CNTR0];
+
+ if (in) {
+ /*
+ * XXX
+ * The spec says that once the output latch is completely
+ * read it should revert to "following" the counter. We don't
+ * do this because it is hard and any reasonable OS should
+ * always latch the counter before trying to read it.
+ */
+ if (c->olbyte == 0)
+ c->olbyte = 2;
+ *eax = c->ol[--c->olbyte];
+ } else {
+ c->cr[c->crbyte++] = *eax;
+ if (c->crbyte == 2) {
+ c->crbyte = 0;
+ c->initial = c->cr[0] | (uint16_t)c->cr[1] << 8;
+ gettimeofday(&c->tv, NULL);
+ }
+ }
+
+ return (0);
+}
+
+INOUT_PORT(8254, TIMER_MODE, IOPORT_F_OUT, pit_8254_handler);
+INOUT_PORT(8254, TIMER_CNTR0, IOPORT_F_INOUT, pit_8254_handler);
+INOUT_PORT(8254, TIMER_CNTR1, IOPORT_F_INOUT, pit_8254_handler);
+INOUT_PORT(8254, TIMER_CNTR2, IOPORT_F_INOUT, pit_8254_handler);
diff --git a/usr.sbin/bhyve/pit_8254.h b/usr.sbin/bhyve/pit_8254.h
new file mode 100644
index 000000000000..61bd15d13b1c
--- /dev/null
+++ b/usr.sbin/bhyve/pit_8254.h
@@ -0,0 +1,45 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PIT_8254_H_
+#define _PIT_8254_H_
+
+/*
+ * Borrowed from amd64/include/timerreg.h, where these definitions are
+ * compiled only under #ifdef _KERNEL.
+ */
+
+#include <dev/ic/i8253reg.h>
+
+#define IO_TIMER1 0x40 /* 8253 Timer #1 */
+#define TIMER_CNTR0 (IO_TIMER1 + TIMER_REG_CNTR0)
+#define TIMER_CNTR1 (IO_TIMER1 + TIMER_REG_CNTR1)
+#define TIMER_CNTR2 (IO_TIMER1 + TIMER_REG_CNTR2)
+#define TIMER_MODE (IO_TIMER1 + TIMER_REG_MODE)
+
+#endif /* _PIT_8254_H_ */
diff --git a/usr.sbin/bhyve/post.c b/usr.sbin/bhyve/post.c
new file mode 100644
index 000000000000..092a551d87b3
--- /dev/null
+++ b/usr.sbin/bhyve/post.c
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <assert.h>
+
+#include "inout.h"
+
+static int
+post_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ assert(in == 1);
+
+ if (bytes != 1)
+ return (-1);
+
+ *eax = 0xff; /* return some garbage */
+ return (0);
+}
+
+INOUT_PORT(post, 0x84, IOPORT_F_IN, post_data_handler);
diff --git a/usr.sbin/bhyve/rtc.c b/usr.sbin/bhyve/rtc.c
new file mode 100644
index 000000000000..a6f44e00bcd8
--- /dev/null
+++ b/usr.sbin/bhyve/rtc.c
@@ -0,0 +1,268 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/time.h>
+
+#include <stdio.h>
+#include <time.h>
+#include <assert.h>
+
+#include "inout.h"
+
+#define IO_RTC 0x70
+
+#define RTC_SEC 0x00 /* seconds */
+#define RTC_MIN 0x02
+#define RTC_HRS 0x04
+#define RTC_WDAY 0x06
+#define RTC_DAY 0x07
+#define RTC_MONTH 0x08
+#define RTC_YEAR 0x09
+#define RTC_CENTURY 0x32 /* current century */
+
+#define RTC_STATUSA 0xA
+#define RTCSA_TUP 0x80 /* time update, don't look now */
+
+#define RTC_STATUSB 0xB
+#define RTCSB_DST 0x01
+#define RTCSB_24HR 0x02
+#define RTCSB_BIN 0x04 /* 0 = BCD, 1 = Binary */
+#define RTCSB_PINTR 0x40 /* 1 = enable periodic clock interrupt */
+#define RTCSB_HALT 0x80 /* stop clock updates */
+
+#define RTC_INTR 0x0c /* status register C (R) interrupt source */
+
+#define RTC_STATUSD 0x0d /* status register D (R) Lost Power */
+#define RTCSD_PWR 0x80 /* clock power OK */
+
+#define RTC_DIAG 0x0e
+
+#define RTC_RSTCODE 0x0f
+
+static int addr;
+
+/* XXX initialize these to default values as they would be from BIOS */
+static uint8_t status_a, status_b, rstcode;
+
+static u_char const bin2bcd_data[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99
+};
+#define bin2bcd(bin) (bin2bcd_data[bin])
+
+#define rtcout(val) ((status_b & RTCSB_BIN) ? (val) : bin2bcd((val)))
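+
+/*
+ * Unless the guest selects binary mode (RTCSB_BIN), values are
+ * presented in BCD: e.g. 59 decimal reads back as 0x59.
+ */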
+
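+/*
+ * Local copies of the kernel's timeval helpers, which <sys/time.h>
+ * only exposes under _KERNEL.
+ */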
+static void
+timevalfix(struct timeval *t1)
+{
+
+ if (t1->tv_usec < 0) {
+ t1->tv_sec--;
+ t1->tv_usec += 1000000;
+ }
+ if (t1->tv_usec >= 1000000) {
+ t1->tv_sec++;
+ t1->tv_usec -= 1000000;
+ }
+}
+
+static void
+timevalsub(struct timeval *t1, const struct timeval *t2)
+{
+
+ t1->tv_sec -= t2->tv_sec;
+ t1->tv_usec -= t2->tv_usec;
+ timevalfix(t1);
+}
+
+static int
+rtc_addr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ assert(in == 0);
+
+ if (bytes != 1)
+ return (-1);
+
+ switch (*eax) {
+ case RTC_SEC:
+ case RTC_MIN:
+ case RTC_HRS:
+ case RTC_WDAY:
+ case RTC_DAY:
+ case RTC_MONTH:
+ case RTC_YEAR:
+ case RTC_CENTURY:
+ case RTC_STATUSA:
+ case RTC_STATUSB:
+ case RTC_INTR:
+ case RTC_STATUSD:
+ case RTC_DIAG:
+ case RTC_RSTCODE:
+ break;
+ default:
+ return (-1);
+ }
+
+ addr = *eax;
+ return (0);
+}
+
+static int
+rtc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ int hour;
+ time_t t;
+ struct timeval cur, delta;
+
+ static struct timeval last;
+ static struct tm tm;
+
+ if (bytes != 1)
+ return (-1);
+
+ gettimeofday(&cur, NULL);
+
+ /*
+ * Increment the cached time only once per second so we can guarantee
+ * that the guest has at least one second to read the hour:min:sec
+ * separately and still get a coherent view of the time.
+ */
+ delta = cur;
+ timevalsub(&delta, &last);
+ if (delta.tv_sec >= 1 && (status_b & RTCSB_HALT) == 0) {
+ t = cur.tv_sec;
+ localtime_r(&t, &tm);
+ last = cur;
+ }
+
+ if (in) {
+ switch (addr) {
+ case RTC_SEC:
+ *eax = rtcout(tm.tm_sec);
+ return (0);
+ case RTC_MIN:
+ *eax = rtcout(tm.tm_min);
+ return (0);
+ case RTC_HRS:
+ if (status_b & RTCSB_24HR) {
+ hour = tm.tm_hour;
+ } else {
+ /* 12-hour mode: hours read back as 1..12 */
+ hour = tm.tm_hour % 12;
+ if (hour == 0)
+ hour = 12;
+ }
+
+ *eax = rtcout(hour);
+
+ /*
+ * If we are representing time in the 12-hour format
+ * then set the MSB to indicate PM.
+ */
+ if ((status_b & RTCSB_24HR) == 0 && tm.tm_hour >= 12)
+ *eax |= 0x80;
+
+ return (0);
+ case RTC_WDAY:
+ *eax = rtcout(tm.tm_wday + 1);
+ return (0);
+ case RTC_DAY:
+ *eax = rtcout(tm.tm_mday);
+ return (0);
+ case RTC_MONTH:
+ *eax = rtcout(tm.tm_mon + 1);
+ return (0);
+ case RTC_YEAR:
+ *eax = rtcout(tm.tm_year % 100);
+ return (0);
+ case RTC_CENTURY:
+ /* tm_year counts from 1900 */
+ *eax = rtcout((tm.tm_year + 1900) / 100);
+ return (0);
+ case RTC_STATUSA:
+ *eax = status_a;
+ return (0);
+ case RTC_INTR:
+ *eax = 0;
+ return (0);
+ case RTC_STATUSD:
+ *eax = RTCSD_PWR;
+ return (0);
+ case RTC_DIAG:
+ *eax = 0;
+ return (0);
+ case RTC_RSTCODE:
+ *eax = rstcode;
+ return (0);
+ default:
+ return (-1);
+ }
+ }
+
+ switch (addr) {
+ case RTC_STATUSA:
+ status_a = *eax & ~RTCSA_TUP;
+ break;
+ case RTC_STATUSB:
+ /* XXX not implemented yet XXX */
+ if (*eax & RTCSB_PINTR)
+ return (-1);
+ status_b = *eax;
+ break;
+ case RTC_RSTCODE:
+ rstcode = *eax;
+ break;
+ case RTC_SEC:
+ case RTC_MIN:
+ case RTC_HRS:
+ case RTC_WDAY:
+ case RTC_DAY:
+ case RTC_MONTH:
+ case RTC_YEAR:
+ case RTC_CENTURY:
+ /*
+ * Ignore writes to the time of day registers
+ */
+ break;
+ default:
+ return (-1);
+ }
+ return (0);
+}
+
+INOUT_PORT(rtc, IO_RTC, IOPORT_F_OUT, rtc_addr_handler);
+INOUT_PORT(rtc, IO_RTC + 1, IOPORT_F_INOUT, rtc_data_handler);
diff --git a/usr.sbin/bhyve/uart.c b/usr.sbin/bhyve/uart.c
new file mode 100644
index 000000000000..640f3bf0f70d
--- /dev/null
+++ b/usr.sbin/bhyve/uart.c
@@ -0,0 +1,60 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <assert.h>
+
+#include "inout.h"
+
+#define COM1 0x3F8
+#define COM2 0x2F8
+
+#define REG_IIR 2
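+
+/*
+ * Only the interrupt-identification register of each port is claimed.
+ * Reading 0xff mimics a floating ISA bus, which a guest's UART probe
+ * will typically interpret as "no device present".
+ */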
+
+static int
+com_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ assert(in);
+
+ if (bytes != 1)
+ return (-1);
+
+ /*
+ * The COM ports are not implemented, so return 0xff for all registers.
+ */
+ *eax = 0xFF;
+
+ return (0);
+}
+
+INOUT_PORT(uart, COM1 + REG_IIR, IOPORT_F_IN, com_handler);
+INOUT_PORT(uart, COM2 + REG_IIR, IOPORT_F_IN, com_handler);
diff --git a/usr.sbin/bhyve/virtio.h b/usr.sbin/bhyve/virtio.h
new file mode 100644
index 000000000000..474e244c5965
--- /dev/null
+++ b/usr.sbin/bhyve/virtio.h
@@ -0,0 +1,85 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VIRTIO_H_
+#define _VIRTIO_H_
+
+#define VRING_ALIGN 4096
+
+#define VRING_DESC_F_NEXT (1 << 0)
+#define VRING_DESC_F_WRITE (1 << 1)
+#define VRING_DESC_F_INDIRECT (1 << 2)
+
+#define VRING_AVAIL_F_NO_INTERRUPT 1
+
+struct virtio_desc {
+ uint64_t vd_addr;
+ uint32_t vd_len;
+ uint16_t vd_flags;
+ uint16_t vd_next;
+} __packed;
+
+struct virtio_used {
+ uint32_t vu_idx;
+ uint32_t vu_tlen;
+} __packed;
+
+/*
+ * PFN register shift amount
+ */
+#define VRING_PFN 12
+
+/*
+ * Virtio device types
+ */
+#define VIRTIO_TYPE_NET 1
+#define VIRTIO_TYPE_BLOCK 2
+
+/*
+ * PCI vendor/device IDs
+ */
+#define VIRTIO_VENDOR 0x1AF4
+#define VIRTIO_DEV_NET 0x1000
+#define VIRTIO_DEV_BLOCK 0x1001
+
+/*
+ * PCI config space constants
+ */
+#define VTCFG_R_HOSTCAP 0
+#define VTCFG_R_GUESTCAP 4
+#define VTCFG_R_PFN 8
+#define VTCFG_R_QNUM 12
+#define VTCFG_R_QSEL 14
+#define VTCFG_R_QNOTIFY 16
+#define VTCFG_R_STATUS 18
+#define VTCFG_R_ISR 19
+#define VTCFG_R_CFG0 20 /* No MSI-X */
+#define VTCFG_R_CFG1 24 /* With MSI-X */
+#define VTCFG_R_MSIX 20
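+
+/*
+ * A guest publishes a ring by writing its page frame number to
+ * VTCFG_R_PFN; the host recovers the guest-physical base of the
+ * (page-aligned) ring as:
+ *
+ *	ring_gpa = (uint64_t)pfn << VRING_PFN;
+ */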
+
+#endif /* _VIRTIO_H_ */
diff --git a/usr.sbin/bhyve/xmsr.c b/usr.sbin/bhyve/xmsr.c
new file mode 100644
index 000000000000..931b7d7f791c
--- /dev/null
+++ b/usr.sbin/bhyve/xmsr.c
@@ -0,0 +1,261 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <machine/apicreg.h>
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "fbsdrun.h"
+#include "xmsr.h"
+
+/*
+ * Trampoline for hypervisor direct 64-bit jump.
+ *
+ * 0 - signature for guest->host verification
+ * 8 - kernel virtual address of trampoline
+ * 16 - instruction virtual address
+ * 24 - stack pointer virtual address
+ * 32 - CR3, physical address of kernel page table
+ * 40 - 24-byte area for null/code/data GDT entries
+ */
+#define MP_V64T_SIG 0xcafebabecafebabeULL
+struct mp_v64tramp {
+ uint64_t mt_sig;
+ uint64_t mt_virt;
+ uint64_t mt_eip;
+ uint64_t mt_rsp;
+ uint64_t mt_cr3;
+ uint64_t mt_gdtr[3];
+};
+
+/*
+ * CPU 0 is considered to be the BSP and is set to the RUNNING state.
+ * All other CPUs are set up in the INIT state.
+ */
+#define BSP 0
+enum cpu_bstate {
+ CPU_S_INIT,
+ CPU_S_SIPI,
+ CPU_S_RUNNING
+} static cpu_b[VM_MAXCPU] = { [BSP] = CPU_S_RUNNING };
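+
+/*
+ * An AP is walked through INIT -> SIPI -> RUNNING: an INIT IPI resets
+ * the vcpu and moves it to the wait-for-SIPI state, and a subsequent
+ * STARTUP IPI spins it up and hands it off to the main loop.
+ */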
+
+static void spinup_ap(struct vmctx *, int, int, uint64_t *);
+static void spinup_ap_direct64(struct vmctx *, int, uintptr_t, uint64_t *);
+
+int
+emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val)
+{
+ int dest;
+ int mode;
+ int thiscpu;
+ int vec;
+ int error, retval;
+ uint64_t rip;
+
+ retval = vcpu;
+ thiscpu = 1 << vcpu;
+
+ /*
+ * The only MSR handled is the x2apic ICR (MSR 0x830)
+ */
+ if (code != 0x830) {
+ printf("Unknown WRMSR code %x, val %lx, cpu %d\n",
+ code, val, vcpu);
+ exit(1);
+ }
+
+ /*
+ * The value written to the MSR will generate an IPI to
+ * a set of CPUs. If this is a SIPI, create the initial
+ * state for the CPU and switch to it. Otherwise, inject
+ * an interrupt for the destination CPU(s), and request
+ * a switch to the next available one by returning -1
+ */
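+ /* x2apic ICR: bits 63:32 destination id, 10:8 delivery mode, 7:0 vector */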
+ dest = val >> 32;
+ vec = val & APIC_VECTOR_MASK;
+ mode = val & APIC_DELMODE_MASK;
+
+ switch (mode) {
+ case APIC_DELMODE_INIT:
+ assert(dest != 0);
+ assert(dest < guest_ncpus);
+
+ /*
+ * Ignore legacy de-assert INITs in x2apic mode
+ */
+ if ((val & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) {
+ break;
+ }
+ assert(cpu_b[dest] == CPU_S_INIT);
+
+ /*
+ * Move CPU to wait-for-SIPI state
+ */
+ error = vcpu_reset(ctx, dest);
+ assert(error == 0);
+
+ cpu_b[dest] = CPU_S_SIPI;
+ break;
+
+ case APIC_DELMODE_STARTUP:
+ assert(dest != 0);
+ assert(dest < guest_ncpus);
+ /*
+ * Ignore SIPIs in any state other than wait-for-SIPI
+ */
+ if (cpu_b[dest] != CPU_S_SIPI) {
+ break;
+ }
+
+ /*
+ * Bring up the AP and signal the main loop that it is
+ * available and to switch to it.
+ */
+ spinup_ap(ctx, dest, vec, &rip);
+ cpu_b[dest] = CPU_S_RUNNING;
+ fbsdrun_addcpu(ctx, dest, rip);
+ retval = dest;
+ break;
+
+ default:
+ printf("APIC delivery mode %lx not supported!\n",
+ val & APIC_DELMODE_MASK);
+ exit(1);
+ }
+
+ return (retval);
+}
+
+/*
+ * There are 2 startup modes possible here:
+ * - if the CPU supports 'unrestricted guest' mode, the spinup can
+ * set up the processor state in power-on 16-bit mode, with the CS:IP
+ * init'd to the specified low-mem 4K page.
+ * - if the guest has requested a 64-bit trampoline in the low-mem 4K
+ * page by placing the signature there, set up the register state
+ * from the values in the trampoline structure. Note that this
+ * requires accessing guest physical memory to read the signature,
+ * while 'unrestricted guest' mode does not.
+ */
+static void
+spinup_ap(struct vmctx *ctx, int newcpu, int vector, uint64_t *rip)
+{
+ int error;
+ uint16_t cs;
+ uint64_t desc_base;
+ uint32_t desc_limit, desc_access;
+
+ if (fbsdrun_vmexit_on_hlt()) {
+ error = vm_set_capability(ctx, newcpu, VM_CAP_HALT_EXIT, 1);
+ assert(error == 0);
+ }
+
+ if (fbsdrun_vmexit_on_pause()) {
+ error = vm_set_capability(ctx, newcpu, VM_CAP_PAUSE_EXIT, 1);
+ assert(error == 0);
+ }
+
+ error = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
+ if (error) {
+ /*
+ * If the processor does not support real-mode execution in
+ * VMX non-root operation (no 'unrestricted guest' capability)
+ * then bring up the AP directly in 64-bit mode.
+ */
+ spinup_ap_direct64(ctx, newcpu, vector << PAGE_SHIFT, rip);
+ } else {
+ /*
+ * Update the %cs and %rip of the guest so that it starts
+ * executing real mode code at 'vector << 12'.
+ */
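+ /*
+ * For example, a SIPI vector of 0x9 yields a CS base of
+ * 0x9000, a selector of 0x0900 and an %ip of 0; execution
+ * starts at guest-physical address 0x9000.
+ */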
+ *rip = 0;
+ error = vm_set_register(ctx, newcpu, VM_REG_GUEST_RIP, *rip);
+ assert(error == 0);
+
+ error = vm_get_desc(ctx, newcpu, VM_REG_GUEST_CS, &desc_base,
+ &desc_limit, &desc_access);
+ assert(error == 0);
+
+ desc_base = vector << PAGE_SHIFT;
+ error = vm_set_desc(ctx, newcpu, VM_REG_GUEST_CS,
+ desc_base, desc_limit, desc_access);
+ assert(error == 0);
+
+ cs = (vector << PAGE_SHIFT) >> 4;
+ error = vm_set_register(ctx, newcpu, VM_REG_GUEST_CS, cs);
+ assert(error == 0);
+ }
+}
+
+static void
+spinup_ap_direct64(struct vmctx *ctx, int newcpu, uintptr_t gaddr,
+ uint64_t *rip)
+{
+ struct mp_v64tramp *mvt;
+ char *errstr;
+ int error;
+ uint64_t gdtbase;
+
+ mvt = paddr_guest2host(gaddr);
+
+ assert(mvt->mt_sig == MP_V64T_SIG);
+
+ /*
+ * Set up the 3-entry GDT using memory supplied in the
+ * guest's trampoline structure.
+ */
+ vm_setup_freebsd_gdt(mvt->mt_gdtr);
+
+#define CHECK_ERROR(msg) \
+ if (error != 0) { \
+ errstr = msg; \
+ goto err_exit; \
+ }
+
+ /* entry point */
+ *rip = mvt->mt_eip;
+
+ /* Get the guest virtual address of the GDT */
+ gdtbase = mvt->mt_virt + __offsetof(struct mp_v64tramp, mt_gdtr);
+
+ error = vm_setup_freebsd_registers(ctx, newcpu, mvt->mt_eip,
+ mvt->mt_cr3, gdtbase, mvt->mt_rsp);
+ CHECK_ERROR("vm_setup_freebsd_registers");
+
+ return;
+err_exit:
+ printf("spinup_ap_direct64: machine state error: %s", errstr);
+ exit(1);
+}
diff --git a/usr.sbin/bhyve/xmsr.h b/usr.sbin/bhyve/xmsr.h
new file mode 100644
index 000000000000..8cebcea0cd55
--- /dev/null
+++ b/usr.sbin/bhyve/xmsr.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _XMSR_H_
+#define _XMSR_H_
+
+int emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val);
+
+#endif
diff --git a/usr.sbin/vmmctl/Makefile b/usr.sbin/vmmctl/Makefile
new file mode 100644
index 000000000000..1f529b561f1b
--- /dev/null
+++ b/usr.sbin/vmmctl/Makefile
@@ -0,0 +1,15 @@
+#
+# $FreeBSD$
+#
+
+PROG= vmmctl
+SRCS= vmmctl.c
+
+NO_MAN=
+
+DPADD= ${LIBVMMAPI}
+LDADD= -lvmmapi
+
+CFLAGS+= -I${.CURDIR}/../../sys/amd64/vmm
+
+.include <bsd.prog.mk>
diff --git a/usr.sbin/vmmctl/sample.sh b/usr.sbin/vmmctl/sample.sh
new file mode 100755
index 000000000000..f38d0dadb882
--- /dev/null
+++ b/usr.sbin/vmmctl/sample.sh
@@ -0,0 +1,75 @@
+#!/bin/sh
+
+# $FreeBSD$
+
+VMMCTL="sudo ./vmmctl"
+VMNAME=sample
+
+${VMMCTL} --vm=${VMNAME} --create
+${VMMCTL} --vm=${VMNAME} --set-lowmem=128 --set-highmem=256
+${VMMCTL} --vm=${VMNAME} --get-lowmem --get-highmem
+
+CR0_PE=$((1 << 0))
+CR0_PG=$((1 << 31))
+CR0=$(($CR0_PE | $CR0_PG))
+${VMMCTL} --vm=${VMNAME} --set-cr0=${CR0} --get-cr0
+
+# XXX this is bogus; the value of %cr3 should come from the loader
+CR3=0
+${VMMCTL} --vm=${VMNAME} --set-cr3=${CR3} --get-cr3
+
+CR4_PAE=$((1 << 5))
+CR4=$((${CR4_PAE}))
+${VMMCTL} --vm=${VMNAME} --set-cr4=${CR4} --get-cr4
+
+DR7=0x00000400 # Table 9-1 from Intel Architecture Manual 3A
+${VMMCTL} --vm=${VMNAME} --set-dr7=${DR7} --get-dr7
+
+#
+# XXX the values of rsp and rip are bogus and should come from the loader.
+#
+RSP=0xa5a5a5a5
+RIP=0x0000bfbfbfbf0000
+RFLAGS=0x2
+${VMMCTL} --vm=${VMNAME} --set-rsp=${RSP} --get-rsp
+${VMMCTL} --vm=${VMNAME} --set-rip=${RIP} --get-rip
+${VMMCTL} --vm=${VMNAME} --set-rflags=${RFLAGS} --get-rflags
+
+# Set "hidden" state of %cs descriptor to indicate long mode code segment.
+#
+# Note that this should match the contents of the entry pointed to by the
+# segment selector in the GDTR.
+#
+${VMMCTL} --vm=${VMNAME} --set-desc-cs --desc-access=0x00002098 --get-desc-cs
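+# (0x2098 decodes as: type 0x8 = execute-only code, S=1, DPL=0, P=1, L=1)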
+
+# Set "hidden" state of all data descriptors to indicate a usable segment.
+# The only useful fields are the "Present" and "Descriptor Type" bits.
+${VMMCTL} --vm=${VMNAME} --set-desc-ds --desc-access=0x00000090 --get-desc-ds
+${VMMCTL} --vm=${VMNAME} --set-desc-es --desc-access=0x00000090 --get-desc-es
+${VMMCTL} --vm=${VMNAME} --set-desc-fs --desc-access=0x00000090 --get-desc-fs
+${VMMCTL} --vm=${VMNAME} --set-desc-gs --desc-access=0x00000090 --get-desc-gs
+${VMMCTL} --vm=${VMNAME} --set-desc-ss --desc-access=0x00000090 --get-desc-ss
+
+#
+# Set the code segment selector to point to entry at offset 8 in the GDTR.
+#
+${VMMCTL} --vm=${VMNAME} --set-cs=0x0008 --get-cs
+
+# Set all the remaining data segment selectors to point to entry at offset
+# 16 in the GDTR.
+${VMMCTL} --vm=${VMNAME} --set-ds=0x0010 --get-ds
+${VMMCTL} --vm=${VMNAME} --set-es=0x0010 --get-es
+${VMMCTL} --vm=${VMNAME} --set-fs=0x0010 --get-fs
+${VMMCTL} --vm=${VMNAME} --set-gs=0x0010 --get-gs
+${VMMCTL} --vm=${VMNAME} --set-ss=0x0010 --get-ss
+
+# XXX the value of the GDTR should come from the loader.
+# Set the GDTR
+GDTR_BASE=0xffff0000
+GDTR_LIMIT=0x10
+${VMMCTL} --vm=${VMNAME} --set-desc-gdtr --desc-base=${GDTR_BASE} --desc-limit=${GDTR_LIMIT} --get-desc-gdtr
+
+${VMMCTL} --vm=${VMNAME} --set-pinning=0 --get-pinning
+${VMMCTL} --vm=${VMNAME} --set-pinning=-1 --get-pinning
+
+${VMMCTL} --vm=${VMNAME} --destroy
diff --git a/usr.sbin/vmmctl/vmmctl.c b/usr.sbin/vmmctl/vmmctl.c
new file mode 100644
index 000000000000..678f98b0734b
--- /dev/null
+++ b/usr.sbin/vmmctl/vmmctl.c
@@ -0,0 +1,1485 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <sys/errno.h>
+#include <sys/mman.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <libutil.h>
+#include <fcntl.h>
+#include <string.h>
+#include <getopt.h>
+#include <assert.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "intel/vmcs.h"
+
+#define MB (1UL << 20)
+#define GB (1UL << 30)
+
+#define REQ_ARG required_argument
+#define NO_ARG no_argument
+#define OPT_ARG optional_argument
+
+static const char *progname;
+
+static void
+usage(void)
+{
+
+ (void)fprintf(stderr,
+ "Usage: %s --vm=<name>\n"
+ " [--cpu=<vcpu_number>]\n"
+ " [--create]\n"
+ " [--destroy]\n"
+ " [--get-stats]\n"
+ " [--set-desc-ds]\n"
+ " [--get-desc-ds]\n"
+ " [--set-desc-es]\n"
+ " [--get-desc-es]\n"
+ " [--set-desc-gs]\n"
+ " [--get-desc-gs]\n"
+ " [--set-desc-fs]\n"
+ " [--get-desc-fs]\n"
+ " [--set-desc-cs]\n"
+ " [--get-desc-cs]\n"
+ " [--set-desc-ss]\n"
+ " [--get-desc-ss]\n"
+ " [--set-desc-tr]\n"
+ " [--get-desc-tr]\n"
+ " [--set-desc-ldtr]\n"
+ " [--get-desc-ldtr]\n"
+ " [--set-desc-gdtr]\n"
+ " [--get-desc-gdtr]\n"
+ " [--set-desc-idtr]\n"
+ " [--get-desc-idtr]\n"
+ " [--run]\n"
+ " [--capname=<capname>]\n"
+ " [--getcap]\n"
+ " [--setcap=<0|1>]\n"
+ " [--desc-base=<BASE>]\n"
+ " [--desc-limit=<LIMIT>]\n"
+ " [--desc-access=<ACCESS>]\n"
+ " [--set-cr0=<CR0>]\n"
+ " [--get-cr0]\n"
+ " [--set-cr3=<CR3>]\n"
+ " [--get-cr3]\n"
+ " [--set-cr4=<CR4>]\n"
+ " [--get-cr4]\n"
+ " [--set-dr7=<DR7>]\n"
+ " [--get-dr7]\n"
+ " [--set-rsp=<RSP>]\n"
+ " [--get-rsp]\n"
+ " [--set-rip=<RIP>]\n"
+ " [--get-rip]\n"
+ " [--get-rax]\n"
+ " [--set-rax=<RAX>]\n"
+ " [--get-rbx]\n"
+ " [--get-rcx]\n"
+ " [--get-rdx]\n"
+ " [--get-rsi]\n"
+ " [--get-rdi]\n"
+ " [--get-rbp]\n"
+ " [--get-r8]\n"
+ " [--get-r9]\n"
+ " [--get-r10]\n"
+ " [--get-r11]\n"
+ " [--get-r12]\n"
+ " [--get-r13]\n"
+ " [--get-r14]\n"
+ " [--get-r15]\n"
+ " [--set-rflags=<RFLAGS>]\n"
+ " [--get-rflags]\n"
+ " [--set-cs]\n"
+ " [--get-cs]\n"
+ " [--set-ds]\n"
+ " [--get-ds]\n"
+ " [--set-es]\n"
+ " [--get-es]\n"
+ " [--set-fs]\n"
+ " [--get-fs]\n"
+ " [--set-gs]\n"
+ " [--get-gs]\n"
+ " [--set-ss]\n"
+ " [--get-ss]\n"
+ " [--get-tr]\n"
+ " [--get-ldtr]\n"
+ " [--get-vmcs-pinbased-ctls]\n"
+ " [--get-vmcs-procbased-ctls]\n"
+ " [--get-vmcs-procbased-ctls2]\n"
+ " [--get-vmcs-entry-interruption-info]\n"
+ " [--set-vmcs-entry-interruption-info=<info>]\n"
+ " [--get-vmcs-eptp]\n"
+ " [--get-vmcs-guest-physical-address\n"
+ " [--get-vmcs-guest-linear-address\n"
+ " [--set-vmcs-exception-bitmap]\n"
+ " [--get-vmcs-exception-bitmap]\n"
+ " [--get-vmcs-io-bitmap-address]\n"
+ " [--get-vmcs-tsc-offset]\n"
+ " [--get-vmcs-guest-pat]\n"
+ " [--get-vmcs-host-pat]\n"
+ " [--get-vmcs-host-cr0]\n"
+ " [--get-vmcs-host-cr3]\n"
+ " [--get-vmcs-host-cr4]\n"
+ " [--get-vmcs-host-rip]\n"
+ " [--get-vmcs-host-rsp]\n"
+ " [--get-vmcs-cr0-mask]\n"
+ " [--get-vmcs-cr0-shadow]\n"
+ " [--get-vmcs-cr4-mask]\n"
+ " [--get-vmcs-cr4-shadow]\n"
+ " [--get-vmcs-cr3-targets]\n"
+ " [--get-vmcs-apic-access-address]\n"
+ " [--get-vmcs-virtual-apic-address]\n"
+ " [--get-vmcs-tpr-threshold]\n"
+ " [--get-vmcs-msr-bitmap]\n"
+ " [--get-vmcs-msr-bitmap-address]\n"
+ " [--get-vmcs-vpid]\n"
+ " [--get-vmcs-ple-gap]\n"
+ " [--get-vmcs-ple-window]\n"
+ " [--get-vmcs-instruction-error]\n"
+ " [--get-vmcs-exit-ctls]\n"
+ " [--get-vmcs-entry-ctls]\n"
+ " [--get-vmcs-guest-sysenter]\n"
+ " [--get-vmcs-link]\n"
+ " [--get-vmcs-exit-reason]\n"
+ " [--get-vmcs-exit-qualification]\n"
+ " [--get-vmcs-exit-interruption-info]\n"
+ " [--get-vmcs-exit-interruption-error]\n"
+ " [--get-vmcs-interruptibility]\n"
+ " [--set-pinning=<host_cpuid>]\n"
+ " [--get-pinning]\n"
+ " [--set-lowmem=<memory below 4GB in units of MB>]\n"
+ " [--get-lowmem]\n"
+ " [--set-highmem=<memory above 4GB in units of MB>]\n"
+ " [--get-highmem]\n",
+ progname);
+ exit(1);
+}
+
+static int get_stats, getcap, setcap, capval;
+static const char *capname;
+static int create, destroy, get_lowmem, get_highmem;
+static uint64_t lowmem, highmem;
+static int set_cr0, get_cr0, set_cr3, get_cr3, set_cr4, get_cr4;
+static int set_efer, get_efer;
+static int set_dr7, get_dr7;
+static int set_rsp, get_rsp, set_rip, get_rip, set_rflags, get_rflags;
+static int set_rax, get_rax;
+static int get_rbx, get_rcx, get_rdx, get_rsi, get_rdi, get_rbp;
+static int get_r8, get_r9, get_r10, get_r11, get_r12, get_r13, get_r14, get_r15;
+static int set_desc_ds, get_desc_ds;
+static int set_desc_es, get_desc_es;
+static int set_desc_fs, get_desc_fs;
+static int set_desc_gs, get_desc_gs;
+static int set_desc_cs, get_desc_cs;
+static int set_desc_ss, get_desc_ss;
+static int set_desc_gdtr, get_desc_gdtr;
+static int set_desc_idtr, get_desc_idtr;
+static int set_desc_tr, get_desc_tr;
+static int set_desc_ldtr, get_desc_ldtr;
+static int set_cs, set_ds, set_es, set_fs, set_gs, set_ss, set_tr, set_ldtr;
+static int get_cs, get_ds, get_es, get_fs, get_gs, get_ss, get_tr, get_ldtr;
+static int set_pinning, get_pinning, pincpu;
+static int run;
+
+/*
+ * VMCS-specific fields
+ */
+static int get_pinbased_ctls, get_procbased_ctls, get_procbased_ctls2;
+static int get_eptp, get_io_bitmap, get_tsc_offset;
+static int get_vmcs_entry_interruption_info, set_vmcs_entry_interruption_info;
+static int get_vmcs_interruptibility;
+uint32_t vmcs_entry_interruption_info;
+static int get_vmcs_gpa, get_vmcs_gla;
+static int get_exception_bitmap, set_exception_bitmap, exception_bitmap;
+static int get_cr0_mask, get_cr0_shadow;
+static int get_cr4_mask, get_cr4_shadow;
+static int get_cr3_targets;
+static int get_apic_access_addr, get_virtual_apic_addr, get_tpr_threshold;
+static int get_msr_bitmap, get_msr_bitmap_address;
+static int get_vpid, get_ple_gap, get_ple_window;
+static int get_inst_err, get_exit_ctls, get_entry_ctls;
+static int get_host_cr0, get_host_cr3, get_host_cr4;
+static int get_host_rip, get_host_rsp;
+static int get_guest_pat, get_host_pat;
+static int get_guest_sysenter, get_vmcs_link;
+static int get_vmcs_exit_reason, get_vmcs_exit_qualification;
+static int get_vmcs_exit_interruption_info, get_vmcs_exit_interruption_error;
+
+static uint64_t desc_base;
+static uint32_t desc_limit, desc_access;
+
+static void
+dump_vm_run_exitcode(struct vm_exit *vmexit, int vcpu)
+{
+ printf("vm exit[%d]\n", vcpu);
+ printf("\trip\t\t0x%016lx\n", vmexit->rip);
+ printf("\tinst_length\t%d\n", vmexit->inst_length);
+ switch (vmexit->exitcode) {
+ case VM_EXITCODE_INOUT:
+ printf("\treason\t\tINOUT\n");
+ printf("\tdirection\t%s\n", vmexit->u.inout.in ? "IN" : "OUT");
+ printf("\tbytes\t\t%d\n", vmexit->u.inout.bytes);
+ printf("\tflags\t\t%s%s\n",
+ vmexit->u.inout.string ? "STRING " : "",
+ vmexit->u.inout.rep ? "REP " : "");
+ printf("\tport\t\t0x%04x\n", vmexit->u.inout.port);
+ printf("\teax\t\t0x%08x\n", vmexit->u.inout.eax);
+ break;
+ case VM_EXITCODE_VMX:
+ printf("\treason\t\tVMX\n");
+ printf("\terror\t\t%d\n", vmexit->u.vmx.error);
+ printf("\texit_reason\t0x%08x (%u)\n",
+ vmexit->u.vmx.exit_reason, vmexit->u.vmx.exit_reason);
+ printf("\tqualification\t0x%016lx\n",
+ vmexit->u.vmx.exit_qualification);
+ break;
+ default:
+ printf("*** unknown vm run exitcode %d\n", vmexit->exitcode);
+ break;
+ }
+}
+
+static int
+dump_vmcs_msr_bitmap(int vcpu, u_long addr)
+{
+ int error, fd, byte, bit, readable, writeable;
+ u_int msr;
+ const char *bitmap;
+
+ error = -1;
+ bitmap = MAP_FAILED;
+
+ fd = open("/dev/mem", O_RDONLY, 0);
+ if (fd < 0)
+ goto done;
+
+ bitmap = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED, fd, addr);
+ if (bitmap == MAP_FAILED)
+ goto done;
+
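+ /*
+ * Layout of the 4KB MSR bitmap: read bitmaps for MSRs
+ * 0x00000000-0x00001FFF and 0xC0000000-0xC0001FFF occupy the
+ * first two 1KB chunks, with the corresponding write bitmaps
+ * at offset 2048.
+ */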
+ for (msr = 0; msr < 0x2000; msr++) {
+ byte = msr / 8;
+ bit = msr & 0x7;
+
+ /* Look at MSRs in the range 0x00000000 to 0x00001FFF */
+ readable = (bitmap[byte] & (1 << bit)) ? 0 : 1;
+ writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1;
+ if (readable || writeable) {
+ printf("msr 0x%08x[%d]\t\t%c%c\n", msr, vcpu,
+ readable ? 'R' : '-',
+ writeable ? 'W' : '-');
+ }
+
+ /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */
+ byte += 1024;
+ readable = (bitmap[byte] & (1 << bit)) ? 0 : 1;
+ writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1;
+ if (readable || writeable) {
+ printf("msr 0x%08x[%d]\t\t%c%c\n",
+ 0xc0000000 + msr, vcpu,
+ readable ? 'R' : '-',
+ writeable ? 'W' : '-');
+ }
+ }
+
+ error = 0;
+done:
+ if (bitmap != MAP_FAILED)
+ munmap((void *)bitmap, PAGE_SIZE);
+ if (fd >= 0)
+ close(fd);
+ return (error);
+}
+
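+/*
+ * VMCS_IDENT() tags a raw VMCS field encoding so that it can be passed
+ * through the vm_get_register()/vm_set_register() interface alongside
+ * the regular VM_REG_* identifiers.
+ */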
+static int
+vm_get_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t *ret_val)
+{
+
+ return (vm_get_register(ctx, vcpu, VMCS_IDENT(field), ret_val));
+}
+
+static int
+vm_set_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t val)
+{
+
+ return (vm_set_register(ctx, vcpu, VMCS_IDENT(field), val));
+}
+
+enum {
+ VMNAME = 1000, /* avoid collision with return values from getopt */
+ VCPU,
+ SET_LOWMEM,
+ SET_HIGHMEM,
+ SET_EFER,
+ SET_CR0,
+ SET_CR3,
+ SET_CR4,
+ SET_DR7,
+ SET_RSP,
+ SET_RIP,
+ SET_RAX,
+ SET_RFLAGS,
+ DESC_BASE,
+ DESC_LIMIT,
+ DESC_ACCESS,
+ SET_CS,
+ SET_DS,
+ SET_ES,
+ SET_FS,
+ SET_GS,
+ SET_SS,
+ SET_TR,
+ SET_LDTR,
+ SET_PINNING,
+ SET_VMCS_EXCEPTION_BITMAP,
+ SET_VMCS_ENTRY_INTERRUPTION_INFO,
+ SET_CAP,
+ CAPNAME,
+};
+
+int
+main(int argc, char *argv[])
+{
+ char *vmname = NULL;
+ int error, ch, vcpu;
+ vm_paddr_t hpa;
+ size_t len;
+ struct vm_exit vmexit;
+ uint64_t ctl, eptp, bm, tsc_off, addr, u64;
+ struct vmctx *ctx;
+
+ uint64_t cr0, cr3, cr4, dr7, rsp, rip, rflags, efer, pat;
+ uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp;
+ uint64_t r8, r9, r10, r11, r12, r13, r14, r15;
+ uint64_t cs, ds, es, fs, gs, ss, tr, ldtr;
+
+ struct option opts[] = {
+ { "vm", REQ_ARG, 0, VMNAME },
+ { "cpu", REQ_ARG, 0, VCPU },
+ { "set-lowmem", REQ_ARG, 0, SET_LOWMEM },
+ { "set-highmem",REQ_ARG, 0, SET_HIGHMEM },
+ { "set-efer", REQ_ARG, 0, SET_EFER },
+ { "set-cr0", REQ_ARG, 0, SET_CR0 },
+ { "set-cr3", REQ_ARG, 0, SET_CR3 },
+ { "set-cr4", REQ_ARG, 0, SET_CR4 },
+ { "set-dr7", REQ_ARG, 0, SET_DR7 },
+ { "set-rsp", REQ_ARG, 0, SET_RSP },
+ { "set-rip", REQ_ARG, 0, SET_RIP },
+ { "set-rax", REQ_ARG, 0, SET_RAX },
+ { "set-rflags", REQ_ARG, 0, SET_RFLAGS },
+ { "desc-base", REQ_ARG, 0, DESC_BASE },
+ { "desc-limit", REQ_ARG, 0, DESC_LIMIT },
+ { "desc-access",REQ_ARG, 0, DESC_ACCESS },
+ { "set-cs", REQ_ARG, 0, SET_CS },
+ { "set-ds", REQ_ARG, 0, SET_DS },
+ { "set-es", REQ_ARG, 0, SET_ES },
+ { "set-fs", REQ_ARG, 0, SET_FS },
+ { "set-gs", REQ_ARG, 0, SET_GS },
+ { "set-ss", REQ_ARG, 0, SET_SS },
+ { "set-tr", REQ_ARG, 0, SET_TR },
+ { "set-ldtr", REQ_ARG, 0, SET_LDTR },
+ { "set-pinning",REQ_ARG, 0, SET_PINNING },
+ { "set-vmcs-exception-bitmap",
+ REQ_ARG, 0, SET_VMCS_EXCEPTION_BITMAP },
+ { "set-vmcs-entry-interruption-info",
+ REQ_ARG, 0, SET_VMCS_ENTRY_INTERRUPTION_INFO },
+ { "capname", REQ_ARG, 0, CAPNAME },
+ { "setcap", REQ_ARG, 0, SET_CAP },
+ { "getcap", NO_ARG, &getcap, 1 },
+ { "get-stats", NO_ARG, &get_stats, 1 },
+ { "get-desc-ds",NO_ARG, &get_desc_ds, 1 },
+ { "set-desc-ds",NO_ARG, &set_desc_ds, 1 },
+ { "get-desc-es",NO_ARG, &get_desc_es, 1 },
+ { "set-desc-es",NO_ARG, &set_desc_es, 1 },
+ { "get-desc-ss",NO_ARG, &get_desc_ss, 1 },
+ { "set-desc-ss",NO_ARG, &set_desc_ss, 1 },
+ { "get-desc-cs",NO_ARG, &get_desc_cs, 1 },
+ { "set-desc-cs",NO_ARG, &set_desc_cs, 1 },
+ { "get-desc-fs",NO_ARG, &get_desc_fs, 1 },
+ { "set-desc-fs",NO_ARG, &set_desc_fs, 1 },
+ { "get-desc-gs",NO_ARG, &get_desc_gs, 1 },
+ { "set-desc-gs",NO_ARG, &set_desc_gs, 1 },
+ { "get-desc-tr",NO_ARG, &get_desc_tr, 1 },
+ { "set-desc-tr",NO_ARG, &set_desc_tr, 1 },
+ { "set-desc-ldtr", NO_ARG, &set_desc_ldtr, 1 },
+ { "get-desc-ldtr", NO_ARG, &get_desc_ldtr, 1 },
+ { "set-desc-gdtr", NO_ARG, &set_desc_gdtr, 1 },
+ { "get-desc-gdtr", NO_ARG, &get_desc_gdtr, 1 },
+ { "set-desc-idtr", NO_ARG, &set_desc_idtr, 1 },
+ { "get-desc-idtr", NO_ARG, &get_desc_idtr, 1 },
+ { "get-lowmem", NO_ARG, &get_lowmem, 1 },
+ { "get-highmem",NO_ARG, &get_highmem, 1 },
+ { "get-efer", NO_ARG, &get_efer, 1 },
+ { "get-cr0", NO_ARG, &get_cr0, 1 },
+ { "get-cr3", NO_ARG, &get_cr3, 1 },
+ { "get-cr4", NO_ARG, &get_cr4, 1 },
+ { "get-dr7", NO_ARG, &get_dr7, 1 },
+ { "get-rsp", NO_ARG, &get_rsp, 1 },
+ { "get-rip", NO_ARG, &get_rip, 1 },
+ { "get-rax", NO_ARG, &get_rax, 1 },
+ { "get-rbx", NO_ARG, &get_rbx, 1 },
+ { "get-rcx", NO_ARG, &get_rcx, 1 },
+ { "get-rdx", NO_ARG, &get_rdx, 1 },
+ { "get-rsi", NO_ARG, &get_rsi, 1 },
+ { "get-rdi", NO_ARG, &get_rdi, 1 },
+ { "get-rbp", NO_ARG, &get_rbp, 1 },
+ { "get-r8", NO_ARG, &get_r8, 1 },
+ { "get-r9", NO_ARG, &get_r9, 1 },
+ { "get-r10", NO_ARG, &get_r10, 1 },
+ { "get-r11", NO_ARG, &get_r11, 1 },
+ { "get-r12", NO_ARG, &get_r12, 1 },
+ { "get-r13", NO_ARG, &get_r13, 1 },
+ { "get-r14", NO_ARG, &get_r14, 1 },
+ { "get-r15", NO_ARG, &get_r15, 1 },
+ { "get-rflags", NO_ARG, &get_rflags, 1 },
+ { "get-cs", NO_ARG, &get_cs, 1 },
+ { "get-ds", NO_ARG, &get_ds, 1 },
+ { "get-es", NO_ARG, &get_es, 1 },
+ { "get-fs", NO_ARG, &get_fs, 1 },
+ { "get-gs", NO_ARG, &get_gs, 1 },
+ { "get-ss", NO_ARG, &get_ss, 1 },
+ { "get-tr", NO_ARG, &get_tr, 1 },
+ { "get-ldtr", NO_ARG, &get_ldtr, 1 },
+ { "get-vmcs-pinbased-ctls",
+ NO_ARG, &get_pinbased_ctls, 1 },
+ { "get-vmcs-procbased-ctls",
+ NO_ARG, &get_procbased_ctls, 1 },
+ { "get-vmcs-procbased-ctls2",
+ NO_ARG, &get_procbased_ctls2, 1 },
+ { "get-vmcs-guest-linear-address",
+ NO_ARG, &get_vmcs_gla, 1 },
+ { "get-vmcs-guest-physical-address",
+ NO_ARG, &get_vmcs_gpa, 1 },
+ { "get-vmcs-entry-interruption-info",
+ NO_ARG, &get_vmcs_entry_interruption_info, 1},
+ { "get-vmcs-eptp", NO_ARG, &get_eptp, 1 },
+ { "get-vmcs-exception-bitmap",
+ NO_ARG, &get_exception_bitmap, 1 },
+ { "get-vmcs-io-bitmap-address",
+ NO_ARG, &get_io_bitmap, 1 },
+ { "get-vmcs-tsc-offset", NO_ARG,&get_tsc_offset, 1 },
+ { "get-vmcs-cr0-mask", NO_ARG, &get_cr0_mask, 1 },
+ { "get-vmcs-cr0-shadow", NO_ARG,&get_cr0_shadow, 1 },
+ { "get-vmcs-cr4-mask", NO_ARG, &get_cr4_mask, 1 },
+ { "get-vmcs-cr4-shadow", NO_ARG,&get_cr4_shadow, 1 },
+ { "get-vmcs-cr3-targets", NO_ARG, &get_cr3_targets, 1},
+ { "get-vmcs-apic-access-address",
+ NO_ARG, &get_apic_access_addr, 1},
+ { "get-vmcs-virtual-apic-address",
+ NO_ARG, &get_virtual_apic_addr, 1},
+ { "get-vmcs-tpr-threshold",
+ NO_ARG, &get_tpr_threshold, 1 },
+ { "get-vmcs-msr-bitmap",
+ NO_ARG, &get_msr_bitmap, 1 },
+ { "get-vmcs-msr-bitmap-address",
+ NO_ARG, &get_msr_bitmap_address, 1 },
+ { "get-vmcs-vpid", NO_ARG, &get_vpid, 1 },
+ { "get-vmcs-ple-gap", NO_ARG, &get_ple_gap, 1 },
+ { "get-vmcs-ple-window", NO_ARG,&get_ple_window,1 },
+ { "get-vmcs-instruction-error",
+ NO_ARG, &get_inst_err, 1 },
+ { "get-vmcs-exit-ctls", NO_ARG, &get_exit_ctls, 1 },
+ { "get-vmcs-entry-ctls",
+ NO_ARG, &get_entry_ctls, 1 },
+ { "get-vmcs-guest-pat", NO_ARG, &get_guest_pat, 1 },
+ { "get-vmcs-host-pat", NO_ARG, &get_host_pat, 1 },
+ { "get-vmcs-host-cr0",
+ NO_ARG, &get_host_cr0, 1 },
+ { "get-vmcs-host-cr3",
+ NO_ARG, &get_host_cr3, 1 },
+ { "get-vmcs-host-cr4",
+ NO_ARG, &get_host_cr4, 1 },
+ { "get-vmcs-host-rip",
+ NO_ARG, &get_host_rip, 1 },
+ { "get-vmcs-host-rsp",
+ NO_ARG, &get_host_rsp, 1 },
+ { "get-vmcs-guest-sysenter",
+ NO_ARG, &get_guest_sysenter, 1 },
+ { "get-vmcs-link", NO_ARG, &get_vmcs_link, 1 },
+ { "get-vmcs-exit-reason",
+ NO_ARG, &get_vmcs_exit_reason, 1 },
+ { "get-vmcs-exit-qualification",
+ NO_ARG, &get_vmcs_exit_qualification, 1 },
+ { "get-vmcs-exit-interruption-info",
+ NO_ARG, &get_vmcs_exit_interruption_info, 1},
+ { "get-vmcs-exit-interruption-error",
+ NO_ARG, &get_vmcs_exit_interruption_error, 1},
+ { "get-vmcs-interruptibility",
+ NO_ARG, &get_vmcs_interruptibility, 1 },
+ { "get-pinning",NO_ARG, &get_pinning, 1 },
+ { "run", NO_ARG, &run, 1 },
+ { "create", NO_ARG, &create, 1 },
+ { "destroy", NO_ARG, &destroy, 1 },
+ { NULL, 0, NULL, 0 }
+ };
+
+ vcpu = 0;
+ progname = basename(argv[0]);
+
+ while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) {
+ switch (ch) {
+ case 0:
+ break;
+ case VMNAME:
+ vmname = optarg;
+ break;
+ case VCPU:
+ vcpu = atoi(optarg);
+ break;
+ case SET_LOWMEM:
+ lowmem = atoi(optarg) * MB;
+ lowmem = roundup(lowmem, 2 * MB);
+ break;
+ case SET_HIGHMEM:
+ highmem = atoi(optarg) * MB;
+ highmem = roundup(highmem, 2 * MB);
+ break;
+ case SET_EFER:
+ efer = strtoul(optarg, NULL, 0);
+ set_efer = 1;
+ break;
+ case SET_CR0:
+ cr0 = strtoul(optarg, NULL, 0);
+ set_cr0 = 1;
+ break;
+ case SET_CR3:
+ cr3 = strtoul(optarg, NULL, 0);
+ set_cr3 = 1;
+ break;
+ case SET_CR4:
+ cr4 = strtoul(optarg, NULL, 0);
+ set_cr4 = 1;
+ break;
+ case SET_DR7:
+ dr7 = strtoul(optarg, NULL, 0);
+ set_dr7 = 1;
+ break;
+ case SET_RSP:
+ rsp = strtoul(optarg, NULL, 0);
+ set_rsp = 1;
+ break;
+ case SET_RIP:
+ rip = strtoul(optarg, NULL, 0);
+ set_rip = 1;
+ break;
+ case SET_RAX:
+ rax = strtoul(optarg, NULL, 0);
+ set_rax = 1;
+ break;
+ case SET_RFLAGS:
+ rflags = strtoul(optarg, NULL, 0);
+ set_rflags = 1;
+ break;
+ case DESC_BASE:
+ desc_base = strtoul(optarg, NULL, 0);
+ break;
+ case DESC_LIMIT:
+ desc_limit = strtoul(optarg, NULL, 0);
+ break;
+ case DESC_ACCESS:
+ desc_access = strtoul(optarg, NULL, 0);
+ break;
+ case SET_CS:
+ cs = strtoul(optarg, NULL, 0);
+ set_cs = 1;
+ break;
+ case SET_DS:
+ ds = strtoul(optarg, NULL, 0);
+ set_ds = 1;
+ break;
+ case SET_ES:
+ es = strtoul(optarg, NULL, 0);
+ set_es = 1;
+ break;
+ case SET_FS:
+ fs = strtoul(optarg, NULL, 0);
+ set_fs = 1;
+ break;
+ case SET_GS:
+ gs = strtoul(optarg, NULL, 0);
+ set_gs = 1;
+ break;
+ case SET_SS:
+ ss = strtoul(optarg, NULL, 0);
+ set_ss = 1;
+ break;
+ case SET_TR:
+ tr = strtoul(optarg, NULL, 0);
+ set_tr = 1;
+ break;
+ case SET_LDTR:
+ ldtr = strtoul(optarg, NULL, 0);
+ set_ldtr = 1;
+ break;
+ case SET_PINNING:
+ pincpu = strtol(optarg, NULL, 0);
+ set_pinning = 1;
+ break;
+ case SET_VMCS_EXCEPTION_BITMAP:
+ exception_bitmap = strtoul(optarg, NULL, 0);
+ set_exception_bitmap = 1;
+ break;
+ case SET_VMCS_ENTRY_INTERRUPTION_INFO:
+ vmcs_entry_interruption_info = strtoul(optarg, NULL, 0);
+ set_vmcs_entry_interruption_info = 1;
+ break;
+ case SET_CAP:
+ capval = strtoul(optarg, NULL, 0);
+ setcap = 1;
+ break;
+ case CAPNAME:
+ capname = optarg;
+ break;
+ default:
+ usage();
+ }
+ }
+ argc -= optind;
+ argv += optind;
+
+ if (vmname == NULL)
+ usage();
+
+ error = 0;
+
+ if (!error && create)
+ error = vm_create(vmname);
+
+ if (!error) {
+ ctx = vm_open(vmname);
+ if (ctx == NULL)
+ error = -1;
+ }
+
+ if (!error && lowmem)
+ error = vm_setup_memory(ctx, 0, lowmem, NULL);
+
+ if (!error && highmem)
+ error = vm_setup_memory(ctx, 4 * GB, highmem, NULL);
+
+ if (!error && set_efer)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_EFER, efer);
+
+ if (!error && set_cr0)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR0, cr0);
+
+ if (!error && set_cr3)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR3, cr3);
+
+ if (!error && set_cr4)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR4, cr4);
+
+ if (!error && set_dr7)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR7, dr7);
+
+ if (!error && set_rsp)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RSP, rsp);
+
+ if (!error && set_rip)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, rip);
+
+ if (!error && set_rax)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, rax);
+
+ if (!error && set_rflags) {
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RFLAGS,
+ rflags);
+ }
+
+ if (!error && set_desc_ds) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_DS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_es) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_ES,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_ss) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_SS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_cs) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_CS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_fs) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_FS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_gs) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_tr) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_TR,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_ldtr) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_LDTR,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_gdtr) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GDTR,
+ desc_base, desc_limit, 0);
+ }
+
+ if (!error && set_desc_idtr) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_IDTR,
+ desc_base, desc_limit, 0);
+ }
+
+ if (!error && set_cs)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CS, cs);
+
+ if (!error && set_ds)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DS, ds);
+
+ if (!error && set_es)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_ES, es);
+
+ if (!error && set_fs)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_FS, fs);
+
+ if (!error && set_gs)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_GS, gs);
+
+ if (!error && set_ss)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_SS, ss);
+
+ if (!error && set_tr)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_TR, tr);
+
+ if (!error && set_ldtr)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_LDTR, ldtr);
+
+ if (!error && set_pinning)
+ error = vm_set_pinning(ctx, vcpu, pincpu);
+
+ if (!error && set_exception_bitmap) {
+ error = vm_set_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP,
+ exception_bitmap);
+ }
+
+ if (!error && set_vmcs_entry_interruption_info) {
+ error = vm_set_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,
+ vmcs_entry_interruption_info);
+ }
+
+ if (!error && get_lowmem) {
+ error = vm_get_memory_seg(ctx, 0, &hpa, &len);
+ if (error == 0)
+ printf("lowmem\t\t0x%016lx/%ld\n", hpa, len);
+ }
+
+ if (!error && get_highmem) {
+ error = vm_get_memory_seg(ctx, 4 * GB, &hpa, &len);
+ if (error == 0)
+ printf("highmem\t\t0x%016lx/%ld\n", hpa, len);
+ }
+
+ if (!error && get_efer) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_EFER, &efer);
+ if (error == 0)
+ printf("efer[%d]\t\t0x%016lx\n", vcpu, efer);
+ }
+
+ if (!error && get_cr0) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR0, &cr0);
+ if (error == 0)
+ printf("cr0[%d]\t\t0x%016lx\n", vcpu, cr0);
+ }
+
+ if (!error && get_cr3) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR3, &cr3);
+ if (error == 0)
+ printf("cr3[%d]\t\t0x%016lx\n", vcpu, cr3);
+ }
+
+ if (!error && get_cr4) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR4, &cr4);
+ if (error == 0)
+ printf("cr4[%d]\t\t0x%016lx\n", vcpu, cr4);
+ }
+
+ if (!error && get_dr7) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR7, &dr7);
+ if (error == 0)
+ printf("dr7[%d]\t\t0x%016lx\n", vcpu, dr7);
+ }
+
+ if (!error && get_rsp) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSP, &rsp);
+ if (error == 0)
+ printf("rsp[%d]\t\t0x%016lx\n", vcpu, rsp);
+ }
+
+ if (!error && get_rip) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
+ if (error == 0)
+ printf("rip[%d]\t\t0x%016lx\n", vcpu, rip);
+ }
+
+ if (!error && get_rax) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RAX, &rax);
+ if (error == 0)
+ printf("rax[%d]\t\t0x%016lx\n", vcpu, rax);
+ }
+
+ if (!error && get_rbx) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBX, &rbx);
+ if (error == 0)
+ printf("rbx[%d]\t\t0x%016lx\n", vcpu, rbx);
+ }
+
+ if (!error && get_rcx) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RCX, &rcx);
+ if (error == 0)
+ printf("rcx[%d]\t\t0x%016lx\n", vcpu, rcx);
+ }
+
+ if (!error && get_rdx) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDX, &rdx);
+ if (error == 0)
+ printf("rdx[%d]\t\t0x%016lx\n", vcpu, rdx);
+ }
+
+ if (!error && get_rsi) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSI, &rsi);
+ if (error == 0)
+ printf("rsi[%d]\t\t0x%016lx\n", vcpu, rsi);
+ }
+
+ if (!error && get_rdi) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDI, &rdi);
+ if (error == 0)
+ printf("rdi[%d]\t\t0x%016lx\n", vcpu, rdi);
+ }
+
+ if (!error && get_rbp) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBP, &rbp);
+ if (error == 0)
+ printf("rbp[%d]\t\t0x%016lx\n", vcpu, rbp);
+ }
+
+ if (!error && get_r8) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R8, &r8);
+ if (error == 0)
+ printf("r8[%d]\t\t0x%016lx\n", vcpu, r8);
+ }
+
+ if (!error && get_r9) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R9, &r9);
+ if (error == 0)
+ printf("r9[%d]\t\t0x%016lx\n", vcpu, r9);
+ }
+
+ if (!error && get_r10) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R10, &r10);
+ if (error == 0)
+ printf("r10[%d]\t\t0x%016lx\n", vcpu, r10);
+ }
+
+ if (!error && get_r11) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R11, &r11);
+ if (error == 0)
+ printf("r11[%d]\t\t0x%016lx\n", vcpu, r11);
+ }
+
+ if (!error && get_r12) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R12, &r12);
+ if (error == 0)
+ printf("r12[%d]\t\t0x%016lx\n", vcpu, r12);
+ }
+
+ if (!error && get_r13) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R13, &r13);
+ if (error == 0)
+ printf("r13[%d]\t\t0x%016lx\n", vcpu, r13);
+ }
+
+ if (!error && get_r14) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R14, &r14);
+ if (error == 0)
+ printf("r14[%d]\t\t0x%016lx\n", vcpu, r14);
+ }
+
+ if (!error && get_r15) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R15, &r15);
+ if (error == 0)
+ printf("r15[%d]\t\t0x%016lx\n", vcpu, r15);
+ }
+
+ if (!error && get_rflags) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RFLAGS,
+ &rflags);
+ if (error == 0)
+ printf("rflags[%d]\t0x%016lx\n", vcpu, rflags);
+ }
+
+ if (!error && get_stats) {
+ int i, num_stats;
+ uint64_t *stats;
+ struct timeval tv;
+ const char *desc;
+
+ stats = vm_get_stats(ctx, vcpu, &tv, &num_stats);
+ if (stats != NULL) {
+ printf("vcpu%d\n", vcpu);
+ for (i = 0; i < num_stats; i++) {
+ desc = vm_get_stat_desc(ctx, i);
+ printf("%-32s\t%ld\n", desc, stats[i]);
+ }
+ }
+ }
+
+ if (!error && get_desc_ds) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_DS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("ds desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && get_desc_es) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_ES,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("es desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && get_desc_fs) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_FS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("fs desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && get_desc_gs) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("gs desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && get_desc_ss) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("ss desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && get_desc_cs) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_CS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("cs desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && get_desc_tr) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("tr desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && get_desc_ldtr) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_LDTR,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("ldtr desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && get_desc_gdtr) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GDTR,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("gdtr[%d]\t\t0x%016lx/0x%08x\n",
+ vcpu, desc_base, desc_limit);
+ }
+ }
+
+ if (!error && get_desc_idtr) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_IDTR,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("idtr[%d]\t\t0x%016lx/0x%08x\n",
+ vcpu, desc_base, desc_limit);
+ }
+ }
+
+ if (!error && get_cs) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CS, &cs);
+ if (error == 0)
+ printf("cs[%d]\t\t0x%04lx\n", vcpu, cs);
+ }
+
+ if (!error && get_ds) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DS, &ds);
+ if (error == 0)
+ printf("ds[%d]\t\t0x%04lx\n", vcpu, ds);
+ }
+
+ if (!error && get_es) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_ES, &es);
+ if (error == 0)
+ printf("es[%d]\t\t0x%04lx\n", vcpu, es);
+ }
+
+ if (!error && get_fs) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_FS, &fs);
+ if (error == 0)
+ printf("fs[%d]\t\t0x%04lx\n", vcpu, fs);
+ }
+
+ if (!error && get_gs) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_GS, &gs);
+ if (error == 0)
+ printf("gs[%d]\t\t0x%04lx\n", vcpu, gs);
+ }
+
+ if (!error && get_ss) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_SS, &ss);
+ if (error == 0)
+ printf("ss[%d]\t\t0x%04lx\n", vcpu, ss);
+ }
+
+ if (!error && get_tr) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_TR, &tr);
+ if (error == 0)
+ printf("tr[%d]\t\t0x%04lx\n", vcpu, tr);
+ }
+
+ if (!error && get_ldtr) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_LDTR, &ldtr);
+ if (error == 0)
+ printf("ldtr[%d]\t\t0x%04lx\n", vcpu, ldtr);
+ }
+
+ if (!error && get_pinning) {
+ error = vm_get_pinning(ctx, vcpu, &pincpu);
+ if (error == 0) {
+ if (pincpu < 0)
+ printf("pincpu[%d]\tunpinned\n", vcpu);
+ else
+ printf("pincpu[%d]\t%d\n", vcpu, pincpu);
+ }
+ }
+
+ if (!error && get_pinbased_ctls) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_PIN_BASED_CTLS, &ctl);
+ if (error == 0)
+ printf("pinbased_ctls[%d]\t0x%08x\n", vcpu, ctl);
+ }
+
+ if (!error && get_procbased_ctls) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_PRI_PROC_BASED_CTLS, &ctl);
+ if (error == 0)
+ printf("procbased_ctls[%d]\t0x%08x\n", vcpu, ctl);
+ }
+
+ if (!error && get_procbased_ctls2) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_SEC_PROC_BASED_CTLS, &ctl);
+ if (error == 0)
+ printf("procbased_ctls2[%d]\t0x%08x\n", vcpu, ctl);
+ }
+
+ if (!error && get_vmcs_gla) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_LINEAR_ADDRESS, &u64);
+ if (error == 0)
+ printf("gla[%d]\t\t0x%016lx\n", vcpu, u64);
+ }
+
+ if (!error && get_vmcs_gpa) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_PHYSICAL_ADDRESS, &u64);
+ if (error == 0)
+ printf("gpa[%d]\t\t0x%016lx\n", vcpu, u64);
+ }
+
+ if (!error && get_vmcs_entry_interruption_info) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,&u64);
+ if (error == 0) {
+ printf("entry_interruption_info[%d]\t0x%08x\n",
+ vcpu, (uint32_t)u64);
+ }
+ }
+
+ if (!error && get_eptp) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EPTP, &eptp);
+ if (error == 0)
+ printf("eptp[%d]\t\t0x%016lx\n", vcpu, eptp);
+ }
+
+ if (!error && get_exception_bitmap) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP,
+ &bm);
+ if (error == 0)
+ printf("exception_bitmap[%d]\t0x%08x\n", vcpu, bm);
+ }
+
+ if (!error && get_io_bitmap) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_A, &bm);
+ if (error == 0)
+ printf("io_bitmap_a[%d]\t0x%08x\n", vcpu, bm);
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_B, &bm);
+ if (error == 0)
+ printf("io_bitmap_b[%d]\t0x%08x\n", vcpu, bm);
+ }
+
+ if (!error && get_tsc_offset) {
+ uint64_t tscoff;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_TSC_OFFSET, &tscoff);
+ if (error == 0)
+ printf("tsc_offset[%d]\t0x%016lx\n", tscoff);
+ }
+
+ if (!error && get_cr0_mask) {
+ uint64_t cr0mask;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_MASK, &cr0mask);
+ if (error == 0)
+ printf("cr0_mask[%d]\t\t0x%016lx\n", cr0mask);
+ }
+
+ if (!error && get_cr0_shadow) {
+ uint64_t cr0shadow;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_SHADOW,
+ &cr0shadow);
+ if (error == 0)
+ printf("cr0_shadow[%d]\t\t0x%016lx\n", cr0shadow);
+ }
+
+ if (!error && get_cr4_mask) {
+ uint64_t cr4mask;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_MASK, &cr4mask);
+ if (error == 0)
+ printf("cr4_mask[%d]\t\t0x%016lx\n", cr4mask);
+ }
+
+ if (!error && get_cr4_shadow) {
+ uint64_t cr4shadow;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_SHADOW,
+ &cr4shadow);
+ if (error == 0)
+ printf("cr4_shadow[%d]\t\t0x%016lx\n", cr4shadow);
+ }
+
+ if (!error && get_cr3_targets) {
+ uint64_t target_count, target_addr;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET_COUNT,
+ &target_count);
+ if (error == 0) {
+ printf("cr3_target_count[%d]\t0x%08x\n",
+ vcpu, (uint32_t)target_count);
+ }
+
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET0,
+ &target_addr);
+ if (error == 0) {
+ printf("cr3_target0[%d]\t\t0x%016lx\n",
+ vcpu, target_addr);
+ }
+
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET1,
+ &target_addr);
+ if (error == 0) {
+ printf("cr3_target1[%d]\t\t0x%016lx\n",
+ vcpu, target_addr);
+ }
+
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET2,
+ &target_addr);
+ if (error == 0) {
+ printf("cr3_target2[%d]\t\t0x%016lx\n",
+ vcpu, target_addr);
+ }
+
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET3,
+ &target_addr);
+ if (error == 0) {
+ printf("cr3_target3[%d]\t\t0x%016lx\n",
+ vcpu, target_addr);
+ }
+ }
+
+ if (!error && get_apic_access_addr) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_APIC_ACCESS, &addr);
+ if (error == 0)
+ printf("apic_access_addr[%d]\t0x%016lx\n", vcpu, addr);
+ }
+
+ if (!error && get_virtual_apic_addr) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_VIRTUAL_APIC, &addr);
+ if (error == 0)
+ printf("virtual_apic_addr[%d]\t0x%016lx\n", vcpu, addr);
+ }
+
+ if (!error && get_tpr_threshold) {
+ uint64_t threshold;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_TPR_THRESHOLD,
+ &threshold);
+ if (error == 0)
+ printf("tpr_threshold[%d]\t0x%08x\n", vcpu, threshold);
+ }
+
+ if (!error && get_msr_bitmap_address) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr);
+ if (error == 0)
+ printf("msr_bitmap[%d]\t\t0x%016lx\n", vcpu, addr);
+ }
+
+ if (!error && get_msr_bitmap) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr);
+ if (error == 0)
+ error = dump_vmcs_msr_bitmap(vcpu, addr);
+ }
+
+ if (!error && get_vpid) {
+ uint64_t vpid;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid);
+ if (error == 0)
+ printf("vpid[%d]\t\t0x%04x\n", vcpu, vpid);
+ }
+
+ if (!error && get_ple_window) {
+ uint64_t window;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_WINDOW, &window);
+ if (error == 0)
+ printf("ple_window[%d]\t\t0x%08x\n", vcpu, window);
+ }
+
+ if (!error && get_ple_gap) {
+ uint64_t gap;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_GAP, &gap);
+ if (error == 0)
+ printf("ple_gap[%d]\t\t0x%08x\n", vcpu, gap);
+ }
+
+ if (!error && get_inst_err) {
+ uint64_t insterr;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_INSTRUCTION_ERROR,
+ &insterr);
+ if (error == 0) {
+ printf("instruction_error[%d]\t0x%08x\n",
+ vcpu, (uint32_t)insterr);
+ }
+ }
+
+ if (!error && get_exit_ctls) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_CTLS, &ctl);
+ if (error == 0)
+ printf("exit_ctls[%d]\t\t0x%08x\n", vcpu, ctl);
+ }
+
+ if (!error && get_entry_ctls) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_CTLS, &ctl);
+ if (error == 0)
+ printf("entry_ctls[%d]\t\t0x%08x\n", vcpu, ctl);
+ }
+
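+ /* Host-state fields, loaded by the processor on every VM exit. */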
+ if (!error && get_host_pat) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_IA32_PAT, &pat);
+ if (error == 0)
+ printf("host_pat[%d]\t\t0x%016lx\n", vcpu, pat);
+ }
+
+ if (!error && get_guest_pat) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_PAT, &pat);
+ if (error == 0)
+ printf("guest_pat[%d]\t\t0x%016lx\n", vcpu, pat);
+ }
+
+ if (!error && get_host_cr0) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR0, &cr0);
+ if (error == 0)
+ printf("host_cr0[%d]\t\t0x%016lx\n", vcpu, cr0);
+ }
+
+ if (!error && get_host_cr3) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR3, &cr3);
+ if (error == 0)
+ printf("host_cr3[%d]\t\t0x%016lx\n", vcpu, cr3);
+ }
+
+ if (!error && get_host_cr4) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR4, &cr4);
+ if (error == 0)
+ printf("host_cr4[%d]\t\t0x%016lx\n", vcpu, cr4);
+ }
+
+ if (!error && get_host_rip) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RIP, &rip);
+ if (error == 0)
+ printf("host_rip[%d]\t\t0x%016lx\n", vcpu, rip);
+ }
+
+ if (!error && get_host_rsp) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RSP, &rsp);
+ if (error == 0)
+ printf("host_rip[%d]\t\t0x%016lx\n", vcpu, rsp);
+ }
+
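+ /* Guest SYSENTER state is spread over three VMCS fields: CS, ESP and EIP. */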
+ if (!error && get_guest_sysenter) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_IA32_SYSENTER_CS, &cs);
+ if (error == 0)
+ printf("guest_sysenter_cs[%d]\t0x%08x\n", vcpu, cs);
+
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_IA32_SYSENTER_ESP, &rsp);
+ if (error == 0)
+ printf("guest_sysenter_sp[%d]\t0x%016lx\n", vcpu, rsp);
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_IA32_SYSENTER_EIP, &rip);
+ if (error == 0)
+ printf("guest_sysenter_ip[%d]\t0x%016lx\n", vcpu, rip);
+ }
+
+ if (!error && get_vmcs_link) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_LINK_POINTER, &addr);
+ if (error == 0)
+ printf("vmcs_pointer[%d]\t0x%016lx\n", vcpu, addr);
+ }
+
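+ /*
+  * Read-only fields describing the most recent VM exit, plus the
+  * guest's current interruptibility state.
+  */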
+ if (!error && get_vmcs_exit_reason) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_REASON, &u64);
+ if (error == 0)
+ printf("vmcs_exit_reason[%d]\t0x%016lx\n", vcpu, u64);
+ }
+
+ if (!error && get_vmcs_exit_qualification) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_QUALIFICATION,
+ &u64);
+ if (error == 0)
+ printf("vmcs_exit_qualification[%d]\t0x%016lx\n",
+ vcpu, u64);
+ }
+
+ if (!error && get_vmcs_exit_interruption_info) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_EXIT_INTERRUPTION_INFO, &u64);
+ if (error == 0) {
+ printf("vmcs_exit_interruption_info[%d]\t0x%08x\n",
+ vcpu, (uint32_t)u64);
+ }
+ }
+
+ if (!error && get_vmcs_exit_interruption_error) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_EXIT_INTERRUPTION_ERROR, &u64);
+ if (error == 0) {
+ printf("vmcs_exit_interruption_error[%d]\t0x%08x\n",
+ vcpu, (uint32_t)u64);
+ }
+ }
+
+ if (!error && get_vmcs_interruptibility) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_INTERRUPTIBILITY, &u64);
+ if (error == 0) {
+ printf("vmcs_guest_interruptibility[%d]\t0x%08x\n",
+ vcpu, (uint32_t)u64);
+ }
+ }
+
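+ /*
+  * Capability names are mapped to types by vm_capability_name2type();
+  * an ENOENT errno from vm_set_capability()/vm_get_capability() means
+  * the capability is not supported.
+  */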
+ if (!error && setcap) {
+ int captype;
+ captype = vm_capability_name2type(capname);
+ error = vm_set_capability(ctx, vcpu, captype, capval);
+ if (error != 0 && errno == ENOENT)
+ printf("Capability \"%s\" is not available\n", capname);
+ }
+
+ if (!error && getcap) {
+ int captype, val;
+ captype = vm_capability_name2type(capname);
+ error = vm_get_capability(ctx, vcpu, captype, &val);
+ if (error == 0) {
+ printf("Capability \"%s\" is %s on vcpu %d\n", capname,
+ val ? "set" : "not set", vcpu);
+ } else if (errno == ENOENT) {
+ printf("Capability \"%s\" is not available\n", capname);
+ }
+ }
+
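+ /*
+  * Resume the vcpu from its current %rip and decode the reason for
+  * the resulting VM exit.
+  */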
+ if (!error && run) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
+ assert(error == 0);
+
+ error = vm_run(ctx, vcpu, rip, &vmexit);
+ if (error == 0)
+ dump_vm_run_exitcode(&vmexit, vcpu);
+ else
+ printf("vm_run error %d\n", error);
+ }
+
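+ /* Report errno for a failed request, optionally destroy the VM, and exit. */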
+ if (error)
+ printf("errno = %d\n", errno);
+
+ if (!error && destroy)
+ vm_destroy(ctx);
+
+ exit(error);
+}