From 0dd10c0047b5916f56e50ea0b0c7bbadc99c42b1 Mon Sep 17 00:00:00 2001
From: Neel Natu <neel@FreeBSD.org>
Date: Tue, 13 May 2014 16:40:27 +0000
Subject: Don't include the guest memory segments in the bhyve(8) process core
 dump. This has not added a lot of value when debugging bhyve issues while
 greatly increasing the time and space required to store the core file.

Passing the "-C" option to bhyve(8) will change the default and dump guest
memory in the core dump.

Requested by:	grehan
Reviewed by:	grehan
---
 lib/libvmmapi/vmmapi.c | 18 +++++++++++++++---
 lib/libvmmapi/vmmapi.h |  3 +++
 2 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'lib/libvmmapi')

diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
index 4a7f852c6ffc..b853ae7e273d 100644
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -57,6 +57,7 @@ struct vmctx {
 	int	fd;
 	uint32_t lowmem_limit;
 	enum vm_mmap_style vms;
+	int	memflags;
 	size_t	lowmem;
 	char	*lowmem_addr;
 	size_t	highmem;
@@ -101,6 +102,7 @@ vm_open(const char *name)
 	assert(vm != NULL);
 
 	vm->fd = -1;
+	vm->memflags = 0;
 	vm->lowmem_limit = 3 * GB;
 	vm->name = (char *)(vm + 1);
 	strcpy(vm->name, name);
@@ -180,10 +182,17 @@ vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit)
 	ctx->lowmem_limit = limit;
 }
 
+void
+vm_set_memflags(struct vmctx *ctx, int flags)
+{
+
+	ctx->memflags = flags;
+}
+
 static int
 setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char **addr)
 {
-	int error;
+	int error, mmap_flags;
 	struct vm_memory_segment seg;
 
 	/*
@@ -195,8 +204,11 @@ setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char **addr)
 	seg.len = len;
 	error = ioctl(ctx->fd, VM_MAP_MEMORY, &seg);
 	if (error == 0 && addr != NULL) {
-		*addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
-				ctx->fd, gpa);
+		mmap_flags = MAP_SHARED;
+		if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
+			mmap_flags |= MAP_NOCORE;
+		*addr = mmap(NULL, len, PROT_READ | PROT_WRITE, mmap_flags,
+		    ctx->fd, gpa);
 	}
 	return (error);
 }
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index 2a2ca6b7f9dc..c1a4b35b3628 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -42,6 +42,8 @@ enum vm_mmap_style {
 	VM_MMAP_SPARSE,		/* mappings created on-demand */
 };
 
+#define	VM_MEM_F_INCORE	0x01	/* include guest memory in core file */
+
 int	vm_create(const char *name);
 struct vmctx *vm_open(const char *name);
 void	vm_destroy(struct vmctx *ctx);
@@ -53,6 +55,7 @@ void	*vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len);
 int	vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num);
 uint32_t vm_get_lowmem_limit(struct vmctx *ctx);
 void	vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit);
+void	vm_set_memflags(struct vmctx *ctx, int flags);
 int	vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
 		    uint64_t base, uint32_t limit, uint32_t access);
 int	vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
-- 
cgit v1.3


From b3e9732a763de0001eab2b331b635be53c3f32ad Mon Sep 17 00:00:00 2001
From: John Baldwin <jhb@FreeBSD.org>
Date: Thu, 15 May 2014 14:16:55 +0000
Subject: Implement a PCI interrupt router to route PCI legacy INTx interrupts
 to the legacy 8259A PICs. - Implement an ICH-compatible PCI interrupt router
 on the lpc device with   8 steerable pins configured via config space access
 to byte-wide   registers at 0x60-63 and 0x68-6b. - For each configured PCI
 INTx interrupt, route it to both an I/O APIC   pin and a PCI interrupt router
 pin.  When a PCI INTx interrupt is   asserted, ensure that both pins are
 asserted. - Provide an initial routing of PCI interrupt router (PIRQ) pins to
   8259A pins (ISA IRQs) and initialize the interrupt line config register  
 for the corresponding PCI function with the ISA IRQ as this matches  
 existing hardware. - Add a global _PIC method for OSPM to select the desired
 interrupt routing   configuration. - Update the _PRT methods for PCI bridges
 to provide both APIC and legacy   PRT tables and return the appropriate table
 based on the configured   routing configuration.  Note that if the lpc device
 is not configured, no   routing information is provided. - When the lpc
 device is enabled, provide ACPI PCI link devices corresponding   to each PIRQ
 pin. - Add a VMM ioctl to adjust the trigger mode (edge vs level) for 8259A  
 pins via the ELCR. - Mark the power management SCI as level triggered. -
 Don't hardcode the number of elements in Packages in the source for   the
 DSDT.  iasl(8) will fill in the actual number of elements, and   this makes
 it simpler to generate a Package with a variable number of   elements.

Reviewed by:	tycho
---
 lib/libvmmapi/vmmapi.c      |  14 ++
 lib/libvmmapi/vmmapi.h      |   2 +
 sys/amd64/include/vmm.h     |   5 +
 sys/amd64/include/vmm_dev.h |   8 +
 sys/amd64/vmm/io/vatpic.c   |  37 +++++
 sys/amd64/vmm/io/vatpic.h   |   1 +
 sys/amd64/vmm/vmm_dev.c     |   6 +
 usr.sbin/bhyve/Makefile     |   1 +
 usr.sbin/bhyve/acpi.c       |   2 +-
 usr.sbin/bhyve/acpi.h       |   1 +
 usr.sbin/bhyve/bhyverun.c   |   3 +
 usr.sbin/bhyve/mptbl.c      |   3 +-
 usr.sbin/bhyve/pci_emul.c   | 177 +++++++++++++++-------
 usr.sbin/bhyve/pci_emul.h   |   6 +-
 usr.sbin/bhyve/pci_irq.c    | 349 ++++++++++++++++++++++++++++++++++++++++++++
 usr.sbin/bhyve/pci_irq.h    |  45 ++++++
 usr.sbin/bhyve/pci_lpc.c    |  68 ++++++++-
 usr.sbin/bhyve/pci_lpc.h    |   2 +
 usr.sbin/bhyve/pm.c         |  13 ++
 19 files changed, 682 insertions(+), 61 deletions(-)
 create mode 100644 usr.sbin/bhyve/pci_irq.c
 create mode 100644 usr.sbin/bhyve/pci_irq.h

(limited to 'lib/libvmmapi')

diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
index b853ae7e273d..5e630f87d248 100644
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -507,6 +507,7 @@ int
 vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq)
 {
 	struct vm_isa_irq isa_irq;
+
 	bzero(&isa_irq, sizeof(struct vm_isa_irq));
 	isa_irq.atpic_irq = atpic_irq;
 	isa_irq.ioapic_irq = ioapic_irq;
@@ -514,6 +515,19 @@ vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq)
 	return (ioctl(ctx->fd, VM_ISA_PULSE_IRQ, &isa_irq));
 }
 
+int
+vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq,
+    enum vm_intr_trigger trigger)
+{
+	struct vm_isa_irq_trigger isa_irq_trigger;
+
+	bzero(&isa_irq_trigger, sizeof(struct vm_isa_irq_trigger));
+	isa_irq_trigger.atpic_irq = atpic_irq;
+	isa_irq_trigger.trigger = trigger;
+
+	return (ioctl(ctx->fd, VM_ISA_SET_IRQ_TRIGGER, &isa_irq_trigger));
+}
+
 int
 vm_inject_nmi(struct vmctx *ctx, int vcpu)
 {
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index c1a4b35b3628..88e99475cd97 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -78,6 +78,8 @@ int	vm_ioapic_pincount(struct vmctx *ctx, int *pincount);
 int	vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq);
 int	vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq);
 int	vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq);
+int	vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq,
+	    enum vm_intr_trigger trigger);
 int	vm_inject_nmi(struct vmctx *ctx, int vcpu);
 int	vm_capability_name2type(const char *capname);
 const char *vm_capability_type2name(int type);
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index 68240b9d1317..50d879b1bb8b 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -301,6 +301,11 @@ enum x2apic_state {
 	X2APIC_STATE_LAST
 };
 
+enum vm_intr_trigger {
+	EDGE_TRIGGER,
+	LEVEL_TRIGGER
+};
+	
 /*
  * The 'access' field has the format specified in Table 21-2 of the Intel
  * Architecture Manual vol 3b.
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
index fcd437f0198f..ecafa9ca5e31 100644
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -84,6 +84,11 @@ struct vm_isa_irq {
 	int		ioapic_irq;
 };
 
+struct vm_isa_irq_trigger {
+	int		atpic_irq;
+	enum vm_intr_trigger trigger;
+};
+
 struct vm_capability {
 	int		cpuid;
 	enum vm_cap_type captype;
@@ -213,6 +218,7 @@ enum {
 	IOCNUM_ISA_ASSERT_IRQ = 80,
 	IOCNUM_ISA_DEASSERT_IRQ = 81,
 	IOCNUM_ISA_PULSE_IRQ = 82,
+	IOCNUM_ISA_SET_IRQ_TRIGGER = 83,
 };
 
 #define	VM_RUN		\
@@ -253,6 +259,8 @@ enum {
 	_IOW('v', IOCNUM_ISA_DEASSERT_IRQ, struct vm_isa_irq)
 #define	VM_ISA_PULSE_IRQ	\
 	_IOW('v', IOCNUM_ISA_PULSE_IRQ, struct vm_isa_irq)
+#define	VM_ISA_SET_IRQ_TRIGGER	\
+	_IOW('v', IOCNUM_ISA_SET_IRQ_TRIGGER, struct vm_isa_irq_trigger)
 #define	VM_SET_CAPABILITY \
 	_IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability)
 #define	VM_GET_CAPABILITY \
diff --git a/sys/amd64/vmm/io/vatpic.c b/sys/amd64/vmm/io/vatpic.c
index 66905e70bceb..298560c98e29 100644
--- a/sys/amd64/vmm/io/vatpic.c
+++ b/sys/amd64/vmm/io/vatpic.c
@@ -446,6 +446,43 @@ vatpic_pulse_irq(struct vm *vm, int irq)
 	return (vatpic_set_irqstate(vm, irq, IRQSTATE_PULSE));
 }
 
+int
+vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger)
+{
+	struct vatpic *vatpic;
+
+	if (irq < 0 || irq > 15)
+		return (EINVAL);
+
+	/*
+	 * See comment in vatpic_elc_handler.  These IRQs must be
+	 * edge triggered.
+	 */
+	if (trigger == LEVEL_TRIGGER) {
+		switch (irq) {
+		case 0:
+		case 1:
+		case 2:
+		case 8:
+		case 13:
+			return (EINVAL);
+		}
+	}
+
+	vatpic = vm_atpic(vm);
+
+	VATPIC_LOCK(vatpic);
+
+	if (trigger == LEVEL_TRIGGER)
+		vatpic->elc[irq >> 3] |=  1 << (irq & 0x7);
+	else
+		vatpic->elc[irq >> 3] &=  ~(1 << (irq & 0x7));
+
+	VATPIC_UNLOCK(vatpic);
+
+	return (0);
+}
+
 void
 vatpic_pending_intr(struct vm *vm, int *vecptr)
 {
diff --git a/sys/amd64/vmm/io/vatpic.h b/sys/amd64/vmm/io/vatpic.h
index d4d6b26cfa52..84d5651dd4cb 100644
--- a/sys/amd64/vmm/io/vatpic.h
+++ b/sys/amd64/vmm/io/vatpic.h
@@ -49,6 +49,7 @@ int vatpic_elc_handler(void *vm, int vcpuid, bool in, int port, int bytes,
 int vatpic_assert_irq(struct vm *vm, int irq);
 int vatpic_deassert_irq(struct vm *vm, int irq);
 int vatpic_pulse_irq(struct vm *vm, int irq);
+int vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger);
 
 void vatpic_pending_intr(struct vm *vm, int *vecptr);
 void vatpic_intr_accepted(struct vm *vm, int vector);
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
index 3112c52c97a8..f1d57955767d 100644
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -156,6 +156,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 	struct vm_lapic_msi *vmmsi;
 	struct vm_ioapic_irq *ioapic_irq;
 	struct vm_isa_irq *isa_irq;
+	struct vm_isa_irq_trigger *isa_irq_trigger;
 	struct vm_capability *vmcap;
 	struct vm_pptdev *pptdev;
 	struct vm_pptdev_mmio *pptmmio;
@@ -346,6 +347,11 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 		if (error == 0 && isa_irq->ioapic_irq != -1)
 			error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
 		break;
+	case VM_ISA_SET_IRQ_TRIGGER:
+		isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
+		error = vatpic_set_irq_trigger(sc->vm,
+		    isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
+		break;
 	case VM_MAP_MEMORY:
 		seg = (struct vm_memory_segment *)data;
 		error = vm_malloc(sc->vm, seg->gpa, seg->len);
diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile
index c73cb3d31e02..23e16cb7d0d3 100644
--- a/usr.sbin/bhyve/Makefile
+++ b/usr.sbin/bhyve/Makefile
@@ -23,6 +23,7 @@ SRCS=	\
 	pci_ahci.c		\
 	pci_emul.c		\
 	pci_hostbridge.c	\
+	pci_irq.c		\
 	pci_lpc.c		\
 	pci_passthru.c		\
 	pci_virtio_block.c	\
diff --git a/usr.sbin/bhyve/acpi.c b/usr.sbin/bhyve/acpi.c
index db7f0eb72c48..c4ec020bd5e1 100644
--- a/usr.sbin/bhyve/acpi.c
+++ b/usr.sbin/bhyve/acpi.c
@@ -704,7 +704,7 @@ basl_fwrite_dsdt(FILE *fp)
 	dsdt_line("DefinitionBlock (\"bhyve_dsdt.aml\", \"DSDT\", 2,"
 		 "\"BHYVE \", \"BVDSDT  \", 0x00000001)");
 	dsdt_line("{");
-	dsdt_line("  Name (_S5, Package (0x02)");
+	dsdt_line("  Name (_S5, Package ()");
 	dsdt_line("  {");
 	dsdt_line("      0x05,");
 	dsdt_line("      Zero,");
diff --git a/usr.sbin/bhyve/acpi.h b/usr.sbin/bhyve/acpi.h
index 57edc48cdc15..652164af351c 100644
--- a/usr.sbin/bhyve/acpi.h
+++ b/usr.sbin/bhyve/acpi.h
@@ -49,5 +49,6 @@ void	dsdt_fixed_irq(uint8_t irq);
 void	dsdt_fixed_mem32(uint32_t base, uint32_t length);
 void	dsdt_indent(int levels);
 void	dsdt_unindent(int levels);
+void	sci_init(struct vmctx *ctx);
 
 #endif /* _ACPI_H_ */
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
index b2354c9e22a6..d9b4418e99f0 100644
--- a/usr.sbin/bhyve/bhyverun.c
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -60,6 +60,7 @@ __FBSDID("$FreeBSD$");
 #include "mevent.h"
 #include "mptbl.h"
 #include "pci_emul.h"
+#include "pci_irq.h"
 #include "pci_lpc.h"
 #include "smbiostbl.h"
 #include "xmsr.h"
@@ -770,9 +771,11 @@ main(int argc, char *argv[])
 
 	init_mem();
 	init_inout();
+	pci_irq_init(ctx);
 	ioapic_init(ctx);
 
 	rtc_init(ctx);
+	sci_init(ctx);
 
 	/*
 	 * Exit if a device emulation finds an error in it's initilization
diff --git a/usr.sbin/bhyve/mptbl.c b/usr.sbin/bhyve/mptbl.c
index 4c2167e98c85..904d103a51c5 100644
--- a/usr.sbin/bhyve/mptbl.c
+++ b/usr.sbin/bhyve/mptbl.c
@@ -210,7 +210,8 @@ mpt_count_ioint_entries(void)
 }
 
 static void
-mpt_generate_pci_int(int bus, int slot, int pin, int ioapic_irq, void *arg)
+mpt_generate_pci_int(int bus, int slot, int pin, int pirq_pin, int ioapic_irq,
+    void *arg)
 {
 	int_entry_ptr *mpiep, mpie;
 
diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c
index 5b87da7f0604..e7f4894e1679 100644
--- a/usr.sbin/bhyve/pci_emul.c
+++ b/usr.sbin/bhyve/pci_emul.c
@@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$");
 #include "ioapic.h"
 #include "mem.h"
 #include "pci_emul.h"
+#include "pci_irq.h"
 #include "pci_lpc.h"
 
 #define CONF1_ADDR_PORT    0x0cf8
@@ -81,6 +82,7 @@ struct funcinfo {
 
 struct intxinfo {
 	int	ii_count;
+	int	ii_pirq_pin;
 	int	ii_ioapic_irq;
 };
 
@@ -113,6 +115,7 @@ static uint64_t pci_emul_membase64;
 #define	PCI_EMUL_MEMLIMIT64	0xFD00000000UL
 
 static struct pci_devemu *pci_emul_finddev(char *name);
+static void	pci_lintr_route(struct pci_devinst *pi);
 static void	pci_lintr_update(struct pci_devinst *pi);
 
 static struct mem_range pci_mem_hole;
@@ -697,6 +700,7 @@ pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int bus, int slot,
 	pthread_mutex_init(&pdi->pi_lintr.lock, NULL);
 	pdi->pi_lintr.pin = 0;
 	pdi->pi_lintr.state = IDLE;
+	pdi->pi_lintr.pirq_pin = 0;
 	pdi->pi_lintr.ioapic_irq = 0;
 	pdi->pi_d = pde;
 	snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot);
@@ -1066,6 +1070,27 @@ init_pci(struct vmctx *ctx)
 		bi->memlimit64 = pci_emul_membase64;
 	}
 
+	/*
+	 * PCI backends are initialized before routing INTx interrupts
+	 * so that LPC devices are able to reserve ISA IRQs before
+	 * routing PIRQ pins.
+	 */
+	for (bus = 0; bus < MAXBUSES; bus++) {
+		if ((bi = pci_businfo[bus]) == NULL)
+			continue;
+
+		for (slot = 0; slot < MAXSLOTS; slot++) {
+			si = &bi->slotinfo[slot];
+			for (func = 0; func < MAXFUNCS; func++) {
+				fi = &si->si_funcs[func];
+				if (fi->fi_devi == NULL)
+					continue;
+				pci_lintr_route(fi->fi_devi);
+			}
+		}
+	}
+	lpc_pirq_routed();
+
 	/*
 	 * The guest physical memory map looks like the following:
 	 * [0,		    lowmem)		guest system memory
@@ -1093,19 +1118,36 @@ init_pci(struct vmctx *ctx)
 }
 
 static void
-pci_prt_entry(int bus, int slot, int pin, int ioapic_irq, void *arg)
+pci_apic_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq,
+    void *arg)
 {
-	int *count;
 
-	count = arg;
-	dsdt_line("  Package (0x04)");
+	dsdt_line("  Package ()");
 	dsdt_line("  {");
 	dsdt_line("    0x%X,", slot << 16 | 0xffff);
 	dsdt_line("    0x%02X,", pin - 1);
 	dsdt_line("    Zero,");
 	dsdt_line("    0x%X", ioapic_irq);
-	dsdt_line("  }%s", *count == 1 ? "" : ",");
-	(*count)--;
+	dsdt_line("  },");
+}
+
+static void
+pci_pirq_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq,
+    void *arg)
+{
+	char *name;
+
+	name = lpc_pirq_name(pirq_pin);
+	if (name == NULL)
+		return;
+	dsdt_line("  Package ()");
+	dsdt_line("  {");
+	dsdt_line("    0x%X,", slot << 16 | 0xffff);
+	dsdt_line("    0x%02X,", pin - 1);
+	dsdt_line("    %s,", name);
+	dsdt_line("    0x00");
+	dsdt_line("  },");
+	free(name);
 }
 
 /*
@@ -1118,7 +1160,7 @@ pci_bus_write_dsdt(int bus)
 	struct businfo *bi;
 	struct slotinfo *si;
 	struct pci_devinst *pi;
-	int count, slot, func;
+	int count, func, slot;
 
 	/*
 	 * If there are no devices on this 'bus' then just return.
@@ -1133,9 +1175,6 @@ pci_bus_write_dsdt(int bus)
 			return;
 	}
 
-	dsdt_indent(1);
-	dsdt_line("Scope (_SB)");
-	dsdt_line("{");
 	dsdt_line("  Device (PC%02X)", bus);
 	dsdt_line("  {");
 	dsdt_line("    Name (_HID, EisaId (\"PNP0A03\"))");
@@ -1228,10 +1267,25 @@ pci_bus_write_dsdt(int bus)
 	count = pci_count_lintr(bus);
 	if (count != 0) {
 		dsdt_indent(2);
-		dsdt_line("Name (_PRT, Package (0x%02X)", count);
+		dsdt_line("Name (PPRT, Package ()");
 		dsdt_line("{");
-		pci_walk_lintr(bus, pci_prt_entry, &count);
-		dsdt_line("})");
+		pci_walk_lintr(bus, pci_pirq_prt_entry, NULL);
+ 		dsdt_line("})");
+		dsdt_line("Name (APRT, Package ()");
+		dsdt_line("{");
+		pci_walk_lintr(bus, pci_apic_prt_entry, NULL);
+ 		dsdt_line("})");
+		dsdt_line("Method (_PRT, 0, NotSerialized)");
+		dsdt_line("{");
+		dsdt_line("  If (PICM)");
+		dsdt_line("  {");
+		dsdt_line("    Return (APRT)");
+		dsdt_line("  }");
+		dsdt_line("  Else");
+		dsdt_line("  {");
+		dsdt_line("    Return (PPRT)");
+		dsdt_line("  }");
+		dsdt_line("}");
 		dsdt_unindent(2);
 	}
 
@@ -1247,8 +1301,6 @@ pci_bus_write_dsdt(int bus)
 	dsdt_unindent(2);
 done:
 	dsdt_line("  }");
-	dsdt_line("}");
-	dsdt_unindent(1);
 }
 
 void
@@ -1256,8 +1308,19 @@ pci_write_dsdt(void)
 {
 	int bus;
 
+	dsdt_indent(1);
+	dsdt_line("Name (PICM, 0x00)");
+	dsdt_line("Method (_PIC, 1, NotSerialized)");
+	dsdt_line("{");
+	dsdt_line("  Store (Arg0, PICM)");
+	dsdt_line("}");
+	dsdt_line("");
+	dsdt_line("Scope (_SB)");
+	dsdt_line("{");
 	for (bus = 0; bus < MAXBUSES; bus++)
 		pci_bus_write_dsdt(bus);
+	dsdt_line("}");
+	dsdt_unindent(1);
 }
 
 int
@@ -1330,18 +1393,19 @@ pci_lintr_permitted(struct pci_devinst *pi)
 		(cmd & PCIM_CMD_INTxDIS)));
 }
 
-int
+void
 pci_lintr_request(struct pci_devinst *pi)
 {
 	struct businfo *bi;
 	struct slotinfo *si;
-	int bestpin, bestcount, irq, pin;
+	int bestpin, bestcount, pin;
 
 	bi = pci_businfo[pi->pi_bus];
 	assert(bi != NULL);
 
 	/*
-	 * First, allocate a pin from our slot.
+	 * Just allocate a pin from our slot.  The pin will be
+	 * assigned IRQs later when interrupts are routed.
 	 */
 	si = &bi->slotinfo[pi->pi_slot];
 	bestpin = 0;
@@ -1353,26 +1417,43 @@ pci_lintr_request(struct pci_devinst *pi)
 		}
 	}
 
-	/*
-	 * Attempt to allocate an I/O APIC pin for this intpin.  If
-	 * 8259A support is added we will need a separate field to
-	 * assign the intpin to an input pin on the PCI interrupt
-	 * router.
-	 */
-	if (si->si_intpins[bestpin].ii_count == 0) {
-		irq = ioapic_pci_alloc_irq();
-		if (irq < 0)
-			return (-1);		
-		si->si_intpins[bestpin].ii_ioapic_irq = irq;
-	} else
-		irq = si->si_intpins[bestpin].ii_ioapic_irq;
 	si->si_intpins[bestpin].ii_count++;
-
 	pi->pi_lintr.pin = bestpin + 1;
-	pi->pi_lintr.ioapic_irq = irq;
-	pci_set_cfgdata8(pi, PCIR_INTLINE, irq);
 	pci_set_cfgdata8(pi, PCIR_INTPIN, bestpin + 1);
-	return (0);
+}
+
+static void
+pci_lintr_route(struct pci_devinst *pi)
+{
+	struct businfo *bi;
+	struct intxinfo *ii;
+
+	if (pi->pi_lintr.pin == 0)
+		return;
+
+	bi = pci_businfo[pi->pi_bus];
+	assert(bi != NULL);
+	ii = &bi->slotinfo[pi->pi_slot].si_intpins[pi->pi_lintr.pin - 1];
+
+	/*
+	 * Attempt to allocate an I/O APIC pin for this intpin if one
+	 * is not yet assigned.
+	 */
+	if (ii->ii_ioapic_irq == 0)
+		ii->ii_ioapic_irq = ioapic_pci_alloc_irq();
+	assert(ii->ii_ioapic_irq > 0);
+
+	/*
+	 * Attempt to allocate a PIRQ pin for this intpin if one is
+	 * not yet assigned.
+	 */
+	if (ii->ii_pirq_pin == 0)
+		ii->ii_pirq_pin = pirq_alloc_pin(pi->pi_vmctx);
+	assert(ii->ii_pirq_pin > 0);
+
+	pi->pi_lintr.ioapic_irq = ii->ii_ioapic_irq;
+	pi->pi_lintr.pirq_pin = ii->ii_pirq_pin;
+	pci_set_cfgdata8(pi, PCIR_INTLINE, pirq_irq(ii->ii_pirq_pin));
 }
 
 void
@@ -1385,8 +1466,7 @@ pci_lintr_assert(struct pci_devinst *pi)
 	if (pi->pi_lintr.state == IDLE) {
 		if (pci_lintr_permitted(pi)) {
 			pi->pi_lintr.state = ASSERTED;
-			vm_ioapic_assert_irq(pi->pi_vmctx,
-			    pi->pi_lintr.ioapic_irq);
+			pci_irq_assert(pi);
 		} else
 			pi->pi_lintr.state = PENDING;
 	}
@@ -1402,7 +1482,7 @@ pci_lintr_deassert(struct pci_devinst *pi)
 	pthread_mutex_lock(&pi->pi_lintr.lock);
 	if (pi->pi_lintr.state == ASSERTED) {
 		pi->pi_lintr.state = IDLE;
-		vm_ioapic_deassert_irq(pi->pi_vmctx, pi->pi_lintr.ioapic_irq);
+		pci_irq_deassert(pi);
 	} else if (pi->pi_lintr.state == PENDING)
 		pi->pi_lintr.state = IDLE;
 	pthread_mutex_unlock(&pi->pi_lintr.lock);
@@ -1414,11 +1494,11 @@ pci_lintr_update(struct pci_devinst *pi)
 
 	pthread_mutex_lock(&pi->pi_lintr.lock);
 	if (pi->pi_lintr.state == ASSERTED && !pci_lintr_permitted(pi)) {
-		vm_ioapic_deassert_irq(pi->pi_vmctx, pi->pi_lintr.ioapic_irq);
+		pci_irq_deassert(pi);
 		pi->pi_lintr.state = PENDING;
 	} else if (pi->pi_lintr.state == PENDING && pci_lintr_permitted(pi)) {
 		pi->pi_lintr.state = ASSERTED;
-		vm_ioapic_assert_irq(pi->pi_vmctx, pi->pi_lintr.ioapic_irq);
+		pci_irq_assert(pi);
 	}
 	pthread_mutex_unlock(&pi->pi_lintr.lock);
 }
@@ -1458,7 +1538,8 @@ pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg)
 		for (pin = 0; pin < 4; pin++) {
 			ii = &si->si_intpins[pin];
 			if (ii->ii_count != 0)
-				cb(bus, slot, pin + 1, ii->ii_ioapic_irq, arg);
+				cb(bus, slot, pin + 1, ii->ii_pirq_pin,
+				    ii->ii_ioapic_irq, arg);
 		}
 	}
 }
@@ -1755,20 +1836,6 @@ INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata);
 INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata);
 INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata);
 
-/*
- * I/O ports to configure PCI IRQ routing. We ignore all writes to it.
- */
-static int
-pci_irq_port_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
-		     uint32_t *eax, void *arg)
-{
-	assert(in == 0);
-	return (0);
-}
-INOUT_PORT(pci_irq, 0xC00, IOPORT_F_OUT, pci_irq_port_handler);
-INOUT_PORT(pci_irq, 0xC01, IOPORT_F_OUT, pci_irq_port_handler);
-SYSRES_IO(0xC00, 2);
-
 #define PCI_EMUL_TEST
 #ifdef PCI_EMUL_TEST
 /*
diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h
index e1040a8b715a..866ffc5b8224 100644
--- a/usr.sbin/bhyve/pci_emul.h
+++ b/usr.sbin/bhyve/pci_emul.h
@@ -120,6 +120,7 @@ struct pci_devinst {
 	struct {
 		int8_t    	pin;
 		enum lintr_stat	state;
+		int		pirq_pin;
 		int	  	ioapic_irq;
 		pthread_mutex_t	lock;
 	} pi_lintr;
@@ -200,7 +201,8 @@ struct pciecap {
 	uint16_t	slot_status2;
 } __packed;
 
-typedef void (*pci_lintr_cb)(int b, int s, int pin, int ioapic_irq, void *arg);
+typedef void (*pci_lintr_cb)(int b, int s, int pin, int pirq_pin,
+    int ioapic_irq, void *arg);
 
 int	init_pci(struct vmctx *ctx);
 void	msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
@@ -218,7 +220,7 @@ void	pci_generate_msi(struct pci_devinst *pi, int msgnum);
 void	pci_generate_msix(struct pci_devinst *pi, int msgnum);
 void	pci_lintr_assert(struct pci_devinst *pi);
 void	pci_lintr_deassert(struct pci_devinst *pi);
-int	pci_lintr_request(struct pci_devinst *pi);
+void	pci_lintr_request(struct pci_devinst *pi);
 int	pci_msi_enabled(struct pci_devinst *pi);
 int	pci_msix_enabled(struct pci_devinst *pi);
 int	pci_msix_table_bar(struct pci_devinst *pi);
diff --git a/usr.sbin/bhyve/pci_irq.c b/usr.sbin/bhyve/pci_irq.c
new file mode 100644
index 000000000000..653aeb0ff1f6
--- /dev/null
+++ b/usr.sbin/bhyve/pci_irq.c
@@ -0,0 +1,349 @@
+/*-
+ * Copyright (c) 2014 Advanced Computing Technologies LLC
+ * Written by: John H. Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <machine/vmm.h>
+
+#include <assert.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <vmmapi.h>
+
+#include "acpi.h"
+#include "inout.h"
+#include "pci_emul.h"
+#include "pci_irq.h"
+#include "pci_lpc.h"
+
+/*
+ * Implement an 8 pin PCI interrupt router compatible with the router
+ * present on Intel's ICH10 chip.
+ */
+
+/* Fields in each PIRQ register. */
+#define	PIRQ_DIS	0x80
+#define	PIRQ_IRQ	0x0f
+
+/* Only IRQs 3-7, 9-12, and 14-15 are permitted. */
+#define	PERMITTED_IRQS	0xdef8
+#define	IRQ_PERMITTED(irq)	(((1U << (irq)) & PERMITTED_IRQS) != 0)
+
+/* IRQ count to disable an IRQ. */
+#define	IRQ_DISABLED	0xff
+
+static struct pirq {
+	uint8_t	reg;
+	int	use_count;
+	int	active_count;
+	pthread_mutex_t lock;
+} pirqs[8];
+
+static u_char irq_counts[16];
+static int pirq_cold = 1;
+
+/*
+ * Returns true if this pin is enabled with a valid IRQ.  Setting the
+ * register to a reserved IRQ causes interrupts to not be asserted as
+ * if the pin was disabled.
+ */
+static bool
+pirq_valid_irq(int reg)
+{
+
+	if (reg & PIRQ_DIS)
+		return (false);
+	return (IRQ_PERMITTED(reg & PIRQ_IRQ));
+}
+
+uint8_t
+pirq_read(int pin)
+{
+
+	assert(pin > 0 && pin <= nitems(pirqs));
+	return (pirqs[pin - 1].reg);
+}
+
+void
+pirq_write(struct vmctx *ctx, int pin, uint8_t val)
+{
+	struct pirq *pirq;
+
+	assert(pin > 0 && pin <= nitems(pirqs));
+	pirq = &pirqs[pin - 1];
+	pthread_mutex_lock(&pirq->lock);
+	if (pirq->reg != (val & (PIRQ_DIS | PIRQ_IRQ))) {
+		if (pirq->active_count != 0 && pirq_valid_irq(pirq->reg))
+			vm_isa_deassert_irq(ctx, pirq->reg & PIRQ_IRQ, -1);
+		pirq->reg = val & (PIRQ_DIS | PIRQ_IRQ);
+		if (pirq->active_count != 0 && pirq_valid_irq(pirq->reg))
+			vm_isa_assert_irq(ctx, pirq->reg & PIRQ_IRQ, -1);
+	}
+	pthread_mutex_unlock(&pirq->lock);
+}
+
+void
+pci_irq_reserve(int irq)
+{
+
+	assert(irq < nitems(irq_counts));
+	assert(pirq_cold);
+	assert(irq_counts[irq] == 0 || irq_counts[irq] == IRQ_DISABLED);
+	irq_counts[irq] = IRQ_DISABLED;
+}
+
+void
+pci_irq_use(int irq)
+{
+
+	assert(irq < nitems(irq_counts));
+	assert(pirq_cold);
+	if (irq_counts[irq] != IRQ_DISABLED)
+		irq_counts[irq]++;
+}
+
+void
+pci_irq_init(struct vmctx *ctx)
+{
+	int i;
+
+	for (i = 0; i < nitems(pirqs); i++) {
+		pirqs[i].reg = PIRQ_DIS;
+		pirqs[i].use_count = 0;
+		pirqs[i].active_count = 0;
+		pthread_mutex_init(&pirqs[i].lock, NULL);
+	}
+	for (i = 0; i < nitems(irq_counts); i++) {
+		if (IRQ_PERMITTED(i))
+			irq_counts[i] = 0;
+		else
+			irq_counts[i] = IRQ_DISABLED;
+	}
+}
+
+void
+pci_irq_assert(struct pci_devinst *pi)
+{
+	struct pirq *pirq;
+
+	if (pi->pi_lintr.pirq_pin > 0) {
+		assert(pi->pi_lintr.pirq_pin <= nitems(pirqs));
+		pirq = &pirqs[pi->pi_lintr.pirq_pin - 1];
+		pthread_mutex_lock(&pirq->lock);
+		pirq->active_count++;
+		if (pirq->active_count == 1 && pirq_valid_irq(pirq->reg)) {
+			vm_isa_assert_irq(pi->pi_vmctx, pirq->reg & PIRQ_IRQ,
+			    pi->pi_lintr.ioapic_irq);
+			pthread_mutex_unlock(&pirq->lock);
+			return;
+		}
+		pthread_mutex_unlock(&pirq->lock);
+	}
+	vm_ioapic_assert_irq(pi->pi_vmctx, pi->pi_lintr.ioapic_irq);
+}
+
+void
+pci_irq_deassert(struct pci_devinst *pi)
+{
+	struct pirq *pirq;
+
+	if (pi->pi_lintr.pirq_pin > 0) {
+		assert(pi->pi_lintr.pirq_pin <= nitems(pirqs));
+		pirq = &pirqs[pi->pi_lintr.pirq_pin - 1];
+		pthread_mutex_lock(&pirq->lock);
+		pirq->active_count--;
+		if (pirq->active_count == 0 && pirq_valid_irq(pirq->reg)) {
+			vm_isa_deassert_irq(pi->pi_vmctx, pirq->reg & PIRQ_IRQ,
+			    pi->pi_lintr.ioapic_irq);
+			pthread_mutex_unlock(&pirq->lock);
+			return;
+		}
+		pthread_mutex_unlock(&pirq->lock);
+	}
+	vm_ioapic_deassert_irq(pi->pi_vmctx, pi->pi_lintr.ioapic_irq);
+}
+
+int
+pirq_alloc_pin(struct vmctx *ctx)
+{
+	int best_count, best_irq, best_pin, irq, pin;
+
+	pirq_cold = 0;	/* routing started: pci_irq_reserve/use now assert */
+
+	/* First, find the least-used PIRQ pin. */
+	best_pin = 0;
+	best_count = pirqs[0].use_count;
+	for (pin = 1; pin < nitems(pirqs); pin++) {
+		if (pirqs[pin].use_count < best_count) {
+			best_pin = pin;
+			best_count = pirqs[pin].use_count;
+		}
+	}
+	pirqs[best_pin].use_count++;
+
+	/* Second, route this pin to an IRQ. */
+	if (pirqs[best_pin].reg == PIRQ_DIS) {
+		best_irq = -1;
+		best_count = 0;
+		for (irq = 0; irq < nitems(irq_counts); irq++) {
+			if (irq_counts[irq] == IRQ_DISABLED)
+				continue;
+			if (best_irq == -1 || irq_counts[irq] < best_count) {
+				best_irq = irq;
+				best_count = irq_counts[irq];
+			}
+		}
+		assert(best_irq >= 0);	/* -1: every IRQ is disabled/reserved */
+		irq_counts[best_irq]++;
+		pirqs[best_pin].reg = best_irq;
+		vm_isa_set_irq_trigger(ctx, best_irq, LEVEL_TRIGGER);
+	}
+
+	return (best_pin + 1);
+}
+
+int
+pirq_irq(int pin)
+{
+
+	if (pin == -1)
+		return (255);
+	assert(pin > 0 && pin <= nitems(pirqs));
+	return (pirqs[pin - 1].reg & PIRQ_IRQ);
+}
+
+/* XXX: Generate $PIR table. */
+
+static void
+pirq_dsdt(void)
+{
+	char *irq_prs, *old;
+	int irq, pin;
+
+	irq_prs = NULL;
+	for (irq = 0; irq < nitems(irq_counts); irq++) {
+		if (!IRQ_PERMITTED(irq))
+			continue;
+		if (irq_prs == NULL)
+			asprintf(&irq_prs, "%d", irq);
+		else {
+			old = irq_prs;
+			asprintf(&irq_prs, "%s,%d", old, irq);
+			free(old);
+		}
+	}
+
+	/*
+	 * A helper method to validate a link register's value.  This
+	 * duplicates pirq_valid_irq().
+	 */
+	dsdt_line("");
+	dsdt_line("Method (PIRV, 1, NotSerialized)");
+	dsdt_line("{");
+	dsdt_line("  If (And (Arg0, 0x%02X))", PIRQ_DIS);
+	dsdt_line("  {");
+	dsdt_line("    Return (0x00)");
+	dsdt_line("  }");
+	dsdt_line("  And (Arg0, 0x%02X, Local0)", PIRQ_IRQ);
+	dsdt_line("  If (LLess (Local0, 0x03))");
+	dsdt_line("  {");
+	dsdt_line("    Return (0x00)");
+	dsdt_line("  }");
+	dsdt_line("  If (LEqual (Local0, 0x08))");
+	dsdt_line("  {");
+	dsdt_line("    Return (0x00)");
+	dsdt_line("  }");
+	dsdt_line("  If (LEqual (Local0, 0x0D))");
+	dsdt_line("  {");
+	dsdt_line("    Return (0x00)");
+	dsdt_line("  }");
+	dsdt_line("  Return (0x01)");
+	dsdt_line("}");
+
+	for (pin = 0; pin < nitems(pirqs); pin++) {
+		dsdt_line("");
+		dsdt_line("Device (LNK%c)", 'A' + pin);
+		dsdt_line("{");
+		dsdt_line("  Name (_HID, EisaId (\"PNP0C0F\"))");
+		dsdt_line("  Name (_UID, 0x%02X)", pin + 1);
+		dsdt_line("  Method (_STA, 0, NotSerialized)");
+		dsdt_line("  {");
+		dsdt_line("    If (PIRV (PIR%c))", 'A' + pin);
+		dsdt_line("    {");
+		dsdt_line("       Return (0x0B)");
+		dsdt_line("    }");
+		dsdt_line("    Else");
+		dsdt_line("    {");
+		dsdt_line("       Return (0x09)");
+		dsdt_line("    }");
+		dsdt_line("  }");
+		dsdt_line("  Name (_PRS, ResourceTemplate ()");
+		dsdt_line("  {");
+		dsdt_line("    IRQ (Level, ActiveLow, Shared, )");
+		dsdt_line("      {%s}", irq_prs);
+		dsdt_line("  })");
+		dsdt_line("  Name (CB%02X, ResourceTemplate ()", pin + 1);
+		dsdt_line("  {");
+		dsdt_line("    IRQ (Level, ActiveLow, Shared, )");
+		dsdt_line("      {}");
+		dsdt_line("  })");
+		dsdt_line("  CreateWordField (CB%02X, 0x01, CIR%c)",
+		    pin + 1, 'A' + pin);
+		dsdt_line("  Method (_CRS, 0, NotSerialized)");
+		dsdt_line("  {");
+		dsdt_line("    And (PIR%c, 0x%02X, Local0)", 'A' + pin,
+		    PIRQ_DIS | PIRQ_IRQ);
+		dsdt_line("    If (PIRV (Local0))");
+		dsdt_line("    {");
+		dsdt_line("      ShiftLeft (0x01, Local0, CIR%c)", 'A' + pin);
+		dsdt_line("    }");
+		dsdt_line("    Else");
+		dsdt_line("    {");
+		dsdt_line("      Store (0x00, CIR%c)", 'A' + pin);
+		dsdt_line("    }");
+		dsdt_line("    Return (CB%02X)", pin + 1);
+		dsdt_line("  }");
+		dsdt_line("  Method (_DIS, 0, NotSerialized)");
+		dsdt_line("  {");
+		dsdt_line("    Store (0x80, PIR%c)", 'A' + pin);
+		dsdt_line("  }");
+		dsdt_line("  Method (_SRS, 1, NotSerialized)");
+		dsdt_line("  {");
+		dsdt_line("    CreateWordField (Arg0, 0x01, SIR%c)", 'A' + pin);
+		dsdt_line("    FindSetRightBit (SIR%c, Local0)", 'A' + pin);
+		dsdt_line("    Store (Decrement (Local0), PIR%c)", 'A' + pin);
+		dsdt_line("  }");
+		dsdt_line("}");
+	}
+	free(irq_prs);
+}
+LPC_DSDT(pirq_dsdt);
diff --git a/usr.sbin/bhyve/pci_irq.h b/usr.sbin/bhyve/pci_irq.h
new file mode 100644
index 000000000000..9d331a5d6321
--- /dev/null
+++ b/usr.sbin/bhyve/pci_irq.h
@@ -0,0 +1,45 @@
+/*-
+ * Copyright (c) 2014 Advanced Computing Technologies LLC
+ * Written by: John H. Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef __PCI_IRQ_H__
+#define	__PCI_IRQ_H__
+
+struct pci_devinst;
+
+void	pci_irq_assert(struct pci_devinst *pi);
+void	pci_irq_deassert(struct pci_devinst *pi);
+void	pci_irq_init(struct vmctx *ctx);
+void	pci_irq_reserve(int irq);
+void	pci_irq_use(int irq);
+int	pirq_alloc_pin(struct vmctx *ctx);
+int	pirq_irq(int pin);
+uint8_t	pirq_read(int pin);
+void	pirq_write(struct vmctx *ctx, int pin, uint8_t val);
+
+#endif
diff --git a/usr.sbin/bhyve/pci_lpc.c b/usr.sbin/bhyve/pci_lpc.c
index f5e4a697c83a..6b61b7afd0a3 100644
--- a/usr.sbin/bhyve/pci_lpc.c
+++ b/usr.sbin/bhyve/pci_lpc.c
@@ -43,6 +43,7 @@ __FBSDID("$FreeBSD$");
 #include "acpi.h"
 #include "inout.h"
 #include "pci_emul.h"
+#include "pci_irq.h"
 #include "pci_lpc.h"
 #include "uart_emul.h"
 
@@ -173,6 +174,7 @@ lpc_init(void)
 			    "LPC device %s\n", name);
 			return (-1);
 		}
+		pci_irq_reserve(sc->irq);
 
 		sc->uart_softc = uart_init(lpc_uart_intr_assert,
 				    lpc_uart_intr_deassert, sc);
@@ -208,7 +210,21 @@ pci_lpc_write_dsdt(struct pci_devinst *pi)
 	dsdt_line("Device (ISA)");
 	dsdt_line("{");
 	dsdt_line("  Name (_ADR, 0x%04X%04X)", pi->pi_slot, pi->pi_func);
-	dsdt_line("  OperationRegion (P40C, PCI_Config, 0x60, 0x04)");
+	dsdt_line("  OperationRegion (LPCR, PCI_Config, 0x00, 0x100)");
+	dsdt_line("  Field (LPCR, AnyAcc, NoLock, Preserve)");
+	dsdt_line("  {");
+	dsdt_line("    Offset (0x60),");
+	dsdt_line("    PIRA,   8,");
+	dsdt_line("    PIRB,   8,");
+	dsdt_line("    PIRC,   8,");
+	dsdt_line("    PIRD,   8,");
+	dsdt_line("    Offset (0x68),");
+	dsdt_line("    PIRE,   8,");
+	dsdt_line("    PIRF,   8,");
+	dsdt_line("    PIRG,   8,");
+	dsdt_line("    PIRH,   8");
+	dsdt_line("  }");
+	dsdt_line("");
 
 	dsdt_indent(1);
 	SET_FOREACH(ldpp, lpc_dsdt_set) {
@@ -305,13 +321,34 @@ pci_lpc_uart_dsdt(void)
 }
 LPC_DSDT(pci_lpc_uart_dsdt);
 
+static int
+pci_lpc_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+		  int coff, int bytes, uint32_t val)
+{
+	int pirq_pin;
+
+	if (bytes == 1) {
+		pirq_pin = 0;
+		if (coff >= 0x60 && coff <= 0x63)
+			pirq_pin = coff - 0x60 + 1;
+		if (coff >= 0x68 && coff <= 0x6b)
+			pirq_pin = coff - 0x68 + 5;
+		if (pirq_pin != 0) {
+			pirq_write(ctx, pirq_pin, val);
+			pci_set_cfgdata8(pi, coff, pirq_read(pirq_pin));
+			return (0);
+		}
+	}
+	return (-1);
+}
+
 static void
 pci_lpc_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
 	       int baridx, uint64_t offset, int size, uint64_t value)
 {
 }
 
-uint64_t
+static uint64_t
 pci_lpc_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
 	      int baridx, uint64_t offset, int size)
 {
@@ -324,6 +361,7 @@ pci_lpc_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
 static int
 pci_lpc_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 {
+
 	/*
 	 * Do not allow more than one LPC bridge to be configured.
 	 */
@@ -356,10 +394,36 @@ pci_lpc_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 	return (0);
 }
 
+char *
+lpc_pirq_name(int pin)
+{
+	char *name;
+
+	if (lpc_bridge == NULL)
+		return (NULL);
+	asprintf(&name, "\\_SB.PC00.ISA.LNK%c,", 'A' + pin - 1);
+	return (name);
+}
+
+void
+lpc_pirq_routed(void)
+{
+	int pin;
+
+	if (lpc_bridge == NULL)
+		return;
+
+ 	for (pin = 0; pin < 4; pin++)
+		pci_set_cfgdata8(lpc_bridge, 0x60 + pin, pirq_read(pin + 1));
+	for (pin = 0; pin < 4; pin++)
+		pci_set_cfgdata8(lpc_bridge, 0x68 + pin, pirq_read(pin + 5));
+}
+
 struct pci_devemu pci_de_lpc = {
 	.pe_emu =	"lpc",
 	.pe_init =	pci_lpc_init,
 	.pe_write_dsdt = pci_lpc_write_dsdt,
+	.pe_cfgwrite =	pci_lpc_cfgwrite,
 	.pe_barwrite =	pci_lpc_write,
 	.pe_barread =	pci_lpc_read
 };
diff --git a/usr.sbin/bhyve/pci_lpc.h b/usr.sbin/bhyve/pci_lpc.h
index e45bcb97e4d6..55a58653f422 100644
--- a/usr.sbin/bhyve/pci_lpc.h
+++ b/usr.sbin/bhyve/pci_lpc.h
@@ -66,5 +66,7 @@ struct lpc_sysres {
 #define	SYSRES_MEM(base, length)	LPC_SYSRES(LPC_SYSRES_MEM, base, length)
 
 int	lpc_device_parse(const char *opt);
+char	*lpc_pirq_name(int pin);
+void	lpc_pirq_routed(void);
 
 #endif
diff --git a/usr.sbin/bhyve/pm.c b/usr.sbin/bhyve/pm.c
index 99087e44d8c6..67126d8765c7 100644
--- a/usr.sbin/bhyve/pm.c
+++ b/usr.sbin/bhyve/pm.c
@@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$");
 #include "acpi.h"
 #include "inout.h"
 #include "mevent.h"
+#include "pci_irq.h"
 #include "pci_lpc.h"
 
 static pthread_mutex_t pm_lock = PTHREAD_MUTEX_INITIALIZER;
@@ -289,3 +290,15 @@ smi_cmd_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
 }
 INOUT_PORT(smi_cmd, SMI_CMD, IOPORT_F_OUT, smi_cmd_handler);
 SYSRES_IO(SMI_CMD, 1);
+
+void
+sci_init(struct vmctx *ctx)
+{
+
+	/*
+	 * Mark ACPI's SCI as level-triggered and bump its use count
+	 * in the PIRQ router.
+	 */
+	pci_irq_use(SCI_INT);
+	vm_isa_set_irq_trigger(ctx, SCI_INT, LEVEL_TRIGGER);
+}
-- 
cgit v1.3


From da11f4aa1da2ed00735a3ec0361d46dd3f8a8b4f Mon Sep 17 00:00:00 2001
From: Neel Natu <neel@FreeBSD.org>
Date: Sat, 24 May 2014 23:12:30 +0000
Subject: Add libvmmapi functions vm_copyin() and vm_copyout() to copy into and
 out of the guest linear address space. These APIs in turn use a new ioctl
 'VM_GLA2GPA' to convert the guest linear address to guest physical.

Use the new copyin/copyout APIs when emulating ins/outs instructions in
bhyve(8).
---
 lib/libvmmapi/vmmapi.c      | 86 +++++++++++++++++++++++++++++++++++++++++++++
 lib/libvmmapi/vmmapi.h      |  5 +++
 sys/amd64/include/vmm.h     |  1 -
 sys/amd64/include/vmm_dev.h | 12 +++++++
 sys/amd64/vmm/vmm_dev.c     | 24 +++++++++++++
 sys/amd64/vmm/vmm_ioport.c  | 16 ++-------
 usr.sbin/bhyve/inout.c      | 47 +++++++++++++------------
 7 files changed, 154 insertions(+), 37 deletions(-)

(limited to 'lib/libvmmapi')

diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
index 5e630f87d248..45fffcf4827a 100644
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -35,6 +35,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/mman.h>
 
 #include <machine/specialreg.h>
+#include <machine/param.h>
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -937,3 +938,88 @@ vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities)
 		*capabilities = cap.capabilities;
 	return (error);
 }
+
+static int
+vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+    uint64_t gla, int prot, int *fault, uint64_t *gpa)
+{
+	struct vm_gla2gpa gg;
+	int error;
+
+	bzero(&gg, sizeof(struct vm_gla2gpa));
+	gg.vcpuid = vcpu;
+	gg.prot = prot;
+	gg.gla = gla;
+	gg.paging = *paging;
+
+	error = ioctl(ctx->fd, VM_GLA2GPA, &gg);
+	if (error == 0) {
+		*fault = gg.fault;
+		*gpa = gg.gpa;
+	}
+	return (error);
+}
+
+#ifndef min
+#define	min(a,b)	(((a) < (b)) ? (a) : (b))
+#endif
+
+int
+vm_copyin(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+    uint64_t gla, void *vp, size_t len)
+{
+	char *dst;
+	const char *src;
+	uint64_t gpa;
+	int error, fault, n, off;
+
+	dst = vp;
+	while (len) {
+		error = vm_gla2gpa(ctx, vcpu, paging, gla, PROT_READ,
+		    &fault, &gpa);
+		if (error)
+			return (-1);
+		if (fault)
+			return (1);
+
+		off = gpa & PAGE_MASK;
+		n = min(len, PAGE_SIZE - off);
+		src = vm_map_gpa(ctx, gpa, n);
+		bcopy(src, dst, n);
+
+		gla += n;
+		dst += n;
+		len -= n;
+	}
+	return (0);
+}
+
+int
+vm_copyout(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+    const void *vp, uint64_t gla, size_t len)
+{
+	uint64_t gpa;
+	char *dst;
+	const char *src;
+	int error, fault, n, off;
+
+	src = vp;
+	while (len) {
+		error = vm_gla2gpa(ctx, vcpu, paging, gla, PROT_WRITE,
+		    &fault, &gpa);
+		if (error)
+			return (-1);
+		if (fault)
+			return (1);
+
+		off = gpa & PAGE_MASK;
+		n = min(len, PAGE_SIZE - off);
+		dst = vm_map_gpa(ctx, gpa, n);
+		bcopy(src, dst, n);
+
+		gla += n;
+		src += n;
+		len -= n;
+	}
+	return (0);
+}
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index 88e99475cd97..cad41c89ba0e 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -109,6 +109,11 @@ int	vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state s);
 
 int	vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities);
 
+int	vm_copyin(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+	    uint64_t gla_src, void *dst, size_t len);
+int	vm_copyout(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+	    const void *src, uint64_t gla_dst, size_t len);
+
 /* Reset vcpu register state */
 int	vcpu_reset(struct vmctx *ctx, int vcpu);
 
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index 021efafb1ae0..28e2808e89b7 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -427,7 +427,6 @@ struct vm_inout_str {
 	enum vm_reg_name seg_name;
 	struct seg_desc seg_desc;
 	uint64_t	gla;		/* may be set to VIE_INVALID_GLA */
-	uint64_t	gpa;
 };
 
 struct vm_exit {
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
index ecafa9ca5e31..f094d519a413 100644
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -168,6 +168,15 @@ struct vm_suspend {
 	enum vm_suspend_how how;
 };
 
+struct vm_gla2gpa {
+	int		vcpuid;		/* inputs */
+	int 		prot;		/* PROT_READ or PROT_WRITE */
+	uint64_t	gla;
+	struct vm_guest_paging paging;
+	int		fault;		/* outputs */
+	uint64_t	gpa;
+};
+
 enum {
 	/* general routines */
 	IOCNUM_ABIVERS = 0,
@@ -180,6 +189,7 @@ enum {
 	IOCNUM_MAP_MEMORY = 10,
 	IOCNUM_GET_MEMORY_SEG = 11,
 	IOCNUM_GET_GPA_PMAP = 12,
+	IOCNUM_GLA2GPA = 13,
 
 	/* register/state accessors */
 	IOCNUM_SET_REGISTER = 20,
@@ -289,4 +299,6 @@ enum {
 	_IOR('v', IOCNUM_GET_HPET_CAPABILITIES, struct vm_hpet_cap)
 #define	VM_GET_GPA_PMAP \
 	_IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte)
+#define	VM_GLA2GPA	\
+	_IOWR('v', IOCNUM_GLA2GPA, struct vm_gla2gpa)
 #endif
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
index f1d57955767d..05617853641c 100644
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -48,6 +48,7 @@ __FBSDID("$FreeBSD$");
 
 #include <machine/vmparam.h>
 #include <machine/vmm.h>
+#include <machine/vmm_instruction_emul.h>
 #include <machine/vmm_dev.h>
 
 #include "vmm_lapic.h"
@@ -168,6 +169,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 	struct vm_x2apic *x2apic;
 	struct vm_gpa_pte *gpapte;
 	struct vm_suspend *vmsuspend;
+	struct vm_gla2gpa *gg;
 
 	sc = vmmdev_lookup2(cdev);
 	if (sc == NULL)
@@ -192,6 +194,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 	case VM_PPTDEV_MSI:
 	case VM_PPTDEV_MSIX:
 	case VM_SET_X2APIC_STATE:
+	case VM_GLA2GPA:
 		/*
 		 * XXX fragile, handle with care
 		 * Assumes that the first field of the ioctl data is the vcpu.
@@ -415,6 +418,27 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 	case VM_GET_HPET_CAPABILITIES:
 		error = vhpet_getcap((struct vm_hpet_cap *)data);
 		break;
+	case VM_GLA2GPA: {
+		CTASSERT(PROT_READ == VM_PROT_READ);
+		CTASSERT(PROT_WRITE == VM_PROT_WRITE);
+		CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
+		gg = (struct vm_gla2gpa *)data;
+		error = vmm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla,
+		    gg->prot, &gg->gpa);
+		KASSERT(error == 0 || error == 1 || error == -1,
+		    ("%s: vmm_gla2gpa unknown error %d", __func__, error));
+		if (error >= 0) {
+			/*
+			 * error = 0: the translation was successful
+			 * error = 1: a fault was injected into the guest
+			 */
+			gg->fault = error;
+			error = 0;
+		} else {
+			error = EFAULT;
+		}
+		break;
+	}
 	default:
 		error = ENOTTY;
 		break;
diff --git a/sys/amd64/vmm/vmm_ioport.c b/sys/amd64/vmm/vmm_ioport.c
index f9fda2d45402..96f2418a72b4 100644
--- a/sys/amd64/vmm/vmm_ioport.c
+++ b/sys/amd64/vmm/vmm_ioport.c
@@ -145,7 +145,7 @@ emulate_inout_str(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu)
 {
 	struct vm_inout_str *vis;
 	uint64_t gla, index, segbase;
-	int error, in;
+	int in;
 
 	vis = &vmexit->u.inout_str;
 	in = vis->inout.in;
@@ -197,18 +197,8 @@ emulate_inout_str(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu)
 	}
 	vis->gla = gla;
 
-	error = vmm_gla2gpa(vm, vcpuid, &vis->paging, gla,
-	    in ? VM_PROT_WRITE : VM_PROT_READ, &vis->gpa);
-	KASSERT(error == 0 || error == 1 || error == -1,
-	    ("%s: vmm_gla2gpa unexpected error %d", __func__, error));
-	if (error == -1) {
-		return (EFAULT);
-	} else if (error == 1) {
-		return (0);	/* Resume guest to handle page fault */
-	} else {
-		*retu = true;
-		return (0);	/* Return to userspace to finish emulation */
-	}
+	*retu = true;
+	return (0);	/* Return to userspace to finish emulation */
 }
 
 int
diff --git a/usr.sbin/bhyve/inout.c b/usr.sbin/bhyve/inout.c
index e7cbd98b57c4..9df3ab47420a 100644
--- a/usr.sbin/bhyve/inout.c
+++ b/usr.sbin/bhyve/inout.c
@@ -102,14 +102,12 @@ int
 emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
 {
 	int addrsize, bytes, flags, in, port, rep;
-	uint64_t gpa, gpaend;
 	uint32_t val;
 	inout_func_t handler;
 	void *arg;
-	char *gva;
 	int error, retval;
 	enum vm_reg_name idxreg;
-	uint64_t index, count;
+	uint64_t gla, index, count;
 	struct vm_inout_str *vis;
 
 	bytes = vmexit->u.inout.bytes;
@@ -149,10 +147,6 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
 		/* Count register */
 		count = vis->count & vie_size2mask(addrsize);
 
-		gpa = vis->gpa;
-		gpaend = rounddown(gpa + PAGE_SIZE, PAGE_SIZE);
-		gva = paddr_guest2host(ctx, gpa, gpaend - gpa);
-
 		if (vie_alignment_check(vis->paging.cpl, bytes, vis->cr0,
 		    vis->rflags, vis->gla)) {
 			error = vm_inject_exception2(ctx, vcpu, IDT_AC, 0);
@@ -160,26 +154,34 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
 			return (INOUT_RESTART);
 		}
 
-		while (count != 0 && gpa < gpaend) {
-			/*
-			 * XXX this may not work for unaligned accesses because
-			 * the last access on the page may spill over into the
-			 * adjacent page in the linear address space. This is a
-			 * problem because we don't have a gla2gpa() mapping of
-			 * this adjacent page.
-			 */
-			assert(gpaend - gpa >= bytes);
-
+		gla = vis->gla;
+		while (count) {
 			val = 0;
-			if (!in)
-				bcopy(gva, &val, bytes);
+			if (!in) {
+				error = vm_copyin(ctx, vcpu, &vis->paging,
+				    gla, &val, bytes);
+				assert(error == 0 || error == 1 || error == -1);
+				if (error) {
+					retval = (error == 1) ? INOUT_RESTART :
+					    INOUT_ERROR;
+					break;
+				}
+			}
 
 			retval = handler(ctx, vcpu, in, port, bytes, &val, arg);
 			if (retval != 0)
 				break;
 
-			if (in)
-				bcopy(&val, gva, bytes);
+			if (in) {
+				error = vm_copyout(ctx, vcpu, &vis->paging,
+				    &val, gla, bytes);	
+				assert(error == 0 || error == 1 || error == -1);
+				if (error) {
+					retval = (error == 1) ? INOUT_RESTART :
+					    INOUT_ERROR;
+					break;
+				}
+			}
 
 			/* Update index */
 			if (vis->rflags & PSL_D)
@@ -188,8 +190,7 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
 				index += bytes;
 
 			count--;
-			gva += bytes;
-			gpa += bytes;
+			gla += bytes;
 		}
 
 		/* Update index register */
-- 
cgit v1.3


From 6303b65d355a1d40a1b7a6de3f4988f9f8ee1723 Mon Sep 17 00:00:00 2001
From: Neel Natu <neel@FreeBSD.org>
Date: Mon, 26 May 2014 18:21:08 +0000
Subject: Fix issue with restarting an "insb/insw/insl" instruction because of
 a page fault on the destination buffer.

Prior to this change a page fault would be detected in vm_copyout(). This
was done after the I/O port access was done. If the I/O port access had
side-effects (e.g. reading the uart FIFO) then restarting the instruction
would result in incorrect behavior.

Fix this by validating the guest linear address before doing the I/O port
emulation. If the validation results in a page fault exception being injected
into the guest then the instruction can now be restarted without any
side-effects.
---
 lib/libvmmapi/vmmapi.c | 76 +++++++++++++++++++++++++++++++-------------------
 lib/libvmmapi/vmmapi.h | 16 ++++++++---
 usr.sbin/bhyve/inout.c | 37 +++++++++++-------------
 3 files changed, 76 insertions(+), 53 deletions(-)

(limited to 'lib/libvmmapi')

diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
index 45fffcf4827a..ba2904c68856 100644
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -33,6 +33,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/sysctl.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
+#include <sys/_iovec.h>
 
 #include <machine/specialreg.h>
 #include <machine/param.h>
@@ -940,7 +941,7 @@ vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities)
 }
 
 static int
-vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
     uint64_t gla, int prot, int *fault, uint64_t *gpa)
 {
 	struct vm_gla2gpa gg;
@@ -965,18 +966,20 @@ vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
 #endif
 
 int
-vm_copyin(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
-    uint64_t gla, void *vp, size_t len)
+vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt)
 {
-	char *dst;
-	const char *src;
 	uint64_t gpa;
-	int error, fault, n, off;
+	int error, fault, i, n, off;
+
+	for (i = 0; i < iovcnt; i++) {
+		iov[i].iov_base = 0;
+		iov[i].iov_len = 0;
+	}
 
-	dst = vp;
 	while (len) {
-		error = vm_gla2gpa(ctx, vcpu, paging, gla, PROT_READ,
-		    &fault, &gpa);
+		assert(iovcnt > 0);
+		error = gla2gpa(ctx, vcpu, paging, gla, prot, &fault, &gpa);
 		if (error)
 			return (-1);
 		if (fault)
@@ -984,42 +987,59 @@ vm_copyin(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
 
 		off = gpa & PAGE_MASK;
 		n = min(len, PAGE_SIZE - off);
-		src = vm_map_gpa(ctx, gpa, n);
-		bcopy(src, dst, n);
+
+		iov->iov_base = (void *)gpa;
+		iov->iov_len = n;
+		iov++;
+		iovcnt--;
 
 		gla += n;
-		dst += n;
 		len -= n;
 	}
 	return (0);
 }
 
-int
-vm_copyout(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
-    const void *vp, uint64_t gla, size_t len)
+void
+vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *iov, void *vp, size_t len)
 {
-	uint64_t gpa;
+	const char *src;
 	char *dst;
+	uint64_t gpa;
+	size_t n;
+
+	dst = vp;
+	while (len) {
+		assert(iov->iov_len);
+		gpa = (uint64_t)iov->iov_base;
+		n = min(len, iov->iov_len);
+		src = vm_map_gpa(ctx, gpa, n);
+		bcopy(src, dst, n);
+
+		iov++;
+		dst += n;
+		len -= n;
+	}
+}
+
+void
+vm_copyout(struct vmctx *ctx, int vcpu, const void *vp, struct iovec *iov,
+    size_t len)
+{
 	const char *src;
-	int error, fault, n, off;
+	char *dst;
+	uint64_t gpa;
+	size_t n;
 
 	src = vp;
 	while (len) {
-		error = vm_gla2gpa(ctx, vcpu, paging, gla, PROT_WRITE,
-		    &fault, &gpa);
-		if (error)
-			return (-1);
-		if (fault)
-			return (1);
-
-		off = gpa & PAGE_MASK;
-		n = min(len, PAGE_SIZE - off);
+		assert(iov->iov_len);
+		gpa = (uint64_t)iov->iov_base;
+		n = min(len, iov->iov_len);
 		dst = vm_map_gpa(ctx, gpa, n);
 		bcopy(src, dst, n);
 
-		gla += n;
+		iov++;
 		src += n;
 		len -= n;
 	}
-	return (0);
 }
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index cad41c89ba0e..bab41da7a1a8 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -29,6 +29,7 @@
 #ifndef _VMMAPI_H_
 #define	_VMMAPI_H_
 
+struct iovec;
 struct vmctx;
 enum x2apic_state;
 
@@ -109,10 +110,17 @@ int	vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state s);
 
 int	vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities);
 
-int	vm_copyin(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
-	    uint64_t gla_src, void *dst, size_t len);
-int	vm_copyout(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
-	    const void *src, uint64_t gla_dst, size_t len);
+/*
+ * Translate the GLA range [gla,gla+len) into GPA segments in 'iov'.
+ * The 'iovcnt' should be big enough to accommodate all GPA segments.
+ * Returns 0 on success, 1 on a guest fault condition and -1 otherwise.
+ */
+int	vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+	    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt);
+void	vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov,
+	    void *host_dst, size_t len);
+void	vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src,
+	    struct iovec *guest_iov, size_t len);
 
 /* Reset vcpu register state */
 int	vcpu_reset(struct vmctx *ctx, int vcpu);
diff --git a/usr.sbin/bhyve/inout.c b/usr.sbin/bhyve/inout.c
index 7b22b46baffe..4aaa54aff847 100644
--- a/usr.sbin/bhyve/inout.c
+++ b/usr.sbin/bhyve/inout.c
@@ -31,6 +31,8 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/linker_set.h>
+#include <sys/_iovec.h>
+#include <sys/mman.h>
 
 #include <x86/psl.h>
 #include <x86/segments.h>
@@ -109,6 +111,7 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
 	enum vm_reg_name idxreg;
 	uint64_t gla, index, count;
 	struct vm_inout_str *vis;
+	struct iovec iov[2];
 
 	bytes = vmexit->u.inout.bytes;
 	in = vmexit->u.inout.in;
@@ -157,6 +160,15 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
 				return (INOUT_RESTART);
 			}
 
+			error = vm_gla2gpa(ctx, vcpu, &vis->paging, gla, bytes,
+			    in ? PROT_WRITE : PROT_READ, iov, nitems(iov));
+			assert(error == 0 || error == 1 || error == -1);
+			if (error) {
+				retval = (error == 1) ? INOUT_RESTART :
+				    INOUT_ERROR;
+				break;
+			}
+
 			if (vie_alignment_check(vis->paging.cpl, bytes,
 			    vis->cr0, vis->rflags, gla)) {
 				error = vm_inject_exception2(ctx, vcpu,
@@ -165,33 +177,16 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
 				return (INOUT_RESTART);
 			}
 
-
 			val = 0;
-			if (!in) {
-				error = vm_copyin(ctx, vcpu, &vis->paging,
-				    gla, &val, bytes);
-				assert(error == 0 || error == 1 || error == -1);
-				if (error) {
-					retval = (error == 1) ? INOUT_RESTART :
-					    INOUT_ERROR;
-					break;
-				}
-			}
+			if (!in)
+				vm_copyin(ctx, vcpu, iov, &val, bytes);
 
 			retval = handler(ctx, vcpu, in, port, bytes, &val, arg);
 			if (retval != 0)
 				break;
 
-			if (in) {
-				error = vm_copyout(ctx, vcpu, &vis->paging,
-				    &val, gla, bytes);	
-				assert(error == 0 || error == 1 || error == -1);
-				if (error) {
-					retval = (error == 1) ? INOUT_RESTART :
-					    INOUT_ERROR;
-					break;
-				}
-			}
+			if (in)
+				vm_copyout(ctx, vcpu, &val, iov, bytes);
 
 			/* Update index */
 			if (vis->rflags & PSL_D)
-- 
cgit v1.3


From 95ebc360efc984cab758d634f1c357b73650f651 Mon Sep 17 00:00:00 2001
From: Neel Natu <neel@FreeBSD.org>
Date: Sat, 31 May 2014 23:37:34 +0000
Subject: Activate vcpus from bhyve(8) using the ioctl VM_ACTIVATE_CPU instead
 of doing it implicitly in vmm.ko.

Add ioctl VM_GET_CPUS to get the current set of 'active' and 'suspended' cpus
and display them via /usr/sbin/bhyvectl using the "--get-active-cpus" and
"--get-suspended-cpus" options.

This is in preparation for being able to reset virtual machine state without
having to destroy and recreate it.
---
 lib/libvmmapi/vmmapi.c       | 44 +++++++++++++++++++++++++++++++++++++++++++-
 lib/libvmmapi/vmmapi.h       |  7 +++++++
 sys/amd64/include/vmm.h      |  3 ++-
 sys/amd64/include/vmm_dev.h  | 20 ++++++++++++++++++++
 sys/amd64/vmm/io/vlapic.c    |  4 ----
 sys/amd64/vmm/vmm.c          | 29 ++++++++++++++++++++---------
 sys/amd64/vmm/vmm_dev.c      | 29 ++++++++++++++++++++++++++++-
 usr.sbin/bhyve/bhyverun.c    | 13 +++++++++++++
 usr.sbin/bhyve/pci_lpc.c     |  1 -
 usr.sbin/bhyvectl/bhyvectl.c | 39 ++++++++++++++++++++++++++++++++++++++-
 10 files changed, 171 insertions(+), 18 deletions(-)

(limited to 'lib/libvmmapi')

diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
index ba2904c68856..89c782520bcc 100644
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -29,11 +29,12 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include <sys/types.h>
+#include <sys/param.h>
 #include <sys/sysctl.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <sys/_iovec.h>
+#include <sys/cpuset.h>
 
 #include <machine/specialreg.h>
 #include <machine/param.h>
@@ -1043,3 +1044,44 @@ vm_copyout(struct vmctx *ctx, int vcpu, const void *vp, struct iovec *iov,
 		len -= n;
 	}
 }
+
+static int
+vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus)
+{
+	struct vm_cpuset vm_cpuset;
+	int error;
+
+	bzero(&vm_cpuset, sizeof(struct vm_cpuset));
+	vm_cpuset.which = which;
+	vm_cpuset.cpusetsize = sizeof(cpuset_t);
+	vm_cpuset.cpus = cpus;
+
+	error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset);
+	return (error);
+}
+
+int
+vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus)
+{
+
+	return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus));
+}
+
+int
+vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus)
+{
+
+	return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus));
+}
+
+int
+vm_activate_cpu(struct vmctx *ctx, int vcpu)
+{
+	struct vm_activate_cpu ac;
+	int error;
+
+	bzero(&ac, sizeof(struct vm_activate_cpu));
+	ac.vcpuid = vcpu;
+	error = ioctl(ctx->fd, VM_ACTIVATE_CPU, &ac);
+	return (error);
+}
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index bab41da7a1a8..0f2e3ae57a49 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -29,6 +29,9 @@
 #ifndef _VMMAPI_H_
 #define	_VMMAPI_H_
 
+#include <sys/param.h>
+#include <sys/cpuset.h>
+
 struct iovec;
 struct vmctx;
 enum x2apic_state;
@@ -125,6 +128,10 @@ void	vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src,
 /* Reset vcpu register state */
 int	vcpu_reset(struct vmctx *ctx, int vcpu);
 
+int	vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus);
+int	vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus);
+int	vm_activate_cpu(struct vmctx *ctx, int vcpu);
+
 /*
  * FreeBSD specific APIs
  */
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index f1902d2fd546..05df325a152a 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -140,8 +140,9 @@ int vm_set_capability(struct vm *vm, int vcpu, int type, int val);
 int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state);
 int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state);
 int vm_apicid2vcpuid(struct vm *vm, int apicid);
-void vm_activate_cpu(struct vm *vm, int vcpu);
+int vm_activate_cpu(struct vm *vm, int vcpu);
 cpuset_t vm_active_cpus(struct vm *vm);
+cpuset_t vm_suspended_cpus(struct vm *vm);
 struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid);
 void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip);
 
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
index f094d519a413..a6568dc4e2f2 100644
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -177,6 +177,18 @@ struct vm_gla2gpa {
 	uint64_t	gpa;
 };
 
+struct vm_activate_cpu {
+	int		vcpuid;
+};
+
+struct vm_cpuset {
+	int		which;
+	int		cpusetsize;
+	cpuset_t	*cpus;
+};
+#define	VM_ACTIVE_CPUS		0
+#define	VM_SUSPENDED_CPUS	1
+
 enum {
 	/* general routines */
 	IOCNUM_ABIVERS = 0,
@@ -229,6 +241,10 @@ enum {
 	IOCNUM_ISA_DEASSERT_IRQ = 81,
 	IOCNUM_ISA_PULSE_IRQ = 82,
 	IOCNUM_ISA_SET_IRQ_TRIGGER = 83,
+
+	/* vm_cpuset */
+	IOCNUM_ACTIVATE_CPU = 90,
+	IOCNUM_GET_CPUSET = 91,
 };
 
 #define	VM_RUN		\
@@ -301,4 +317,8 @@ enum {
 	_IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte)
 #define	VM_GLA2GPA	\
 	_IOWR('v', IOCNUM_GLA2GPA, struct vm_gla2gpa)
+#define	VM_ACTIVATE_CPU	\
+	_IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu)
+#define	VM_GET_CPUS	\
+	_IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset)
 #endif
diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c
index d93641c19b4d..4034d34ca756 100644
--- a/sys/amd64/vmm/io/vlapic.c
+++ b/sys/amd64/vmm/io/vlapic.c
@@ -1004,11 +1004,7 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu)
 			if (vlapic2->boot_state != BS_SIPI)
 				return (0);
 
-			/*
-			 * XXX this assumes that the startup IPI always succeeds
-			 */
 			vlapic2->boot_state = BS_RUNNING;
-			vm_activate_cpu(vlapic2->vm, dest);
 
 			*retu = true;
 			vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid);
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index 8ebdfd763801..e84359d6e696 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -342,8 +342,6 @@ vm_create(const char *name, struct vm **retvm)
 	struct vm *vm;
 	struct vmspace *vmspace;
 
-	const int BSP = 0;
-
 	/*
 	 * If vmm.ko could not be successfully initialized then don't attempt
 	 * to create the virtual machine.
@@ -373,8 +371,6 @@ vm_create(const char *name, struct vm **retvm)
 		guest_msrs_init(vm, i);
 	}
 
-	vm_activate_cpu(vm, BSP);
-
 	*retvm = vm;
 	return (0);
 }
@@ -1294,6 +1290,12 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
 	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 		return (EINVAL);
 
+	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
+		return (EINVAL);
+
+	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
+		return (EINVAL);
+
 	rptr = &vm->rendezvous_func;
 	sptr = &vm->suspend;
 	pmap = vmspace_pmap(vm->vmspace);
@@ -1708,17 +1710,19 @@ vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
 	return (state);
 }
 
-void
+int
 vm_activate_cpu(struct vm *vm, int vcpuid)
 {
 
-	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU,
-	    ("vm_activate_cpu: invalid vcpuid %d", vcpuid));
-	KASSERT(!CPU_ISSET(vcpuid, &vm->active_cpus),
-	    ("vm_activate_cpu: vcpuid %d is already active", vcpuid));
+	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+		return (EINVAL);
+
+	if (CPU_ISSET(vcpuid, &vm->active_cpus))
+		return (EBUSY);
 
 	VCPU_CTR0(vm, vcpuid, "activated");
 	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
+	return (0);
 }
 
 cpuset_t
@@ -1728,6 +1732,13 @@ vm_active_cpus(struct vm *vm)
 	return (vm->active_cpus);
 }
 
+cpuset_t
+vm_suspended_cpus(struct vm *vm)
+{
+
+	return (vm->suspended_cpus);
+}
+
 void *
 vcpu_stats(struct vm *vm, int vcpuid)
 {
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
index 05617853641c..824389f18a98 100644
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -146,7 +146,8 @@ static int
 vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 	     struct thread *td)
 {
-	int error, vcpu, state_changed;
+	int error, vcpu, state_changed, size;
+	cpuset_t *cpuset;
 	struct vmmdev_softc *sc;
 	struct vm_memory_segment *seg;
 	struct vm_register *vmreg;
@@ -170,6 +171,8 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 	struct vm_gpa_pte *gpapte;
 	struct vm_suspend *vmsuspend;
 	struct vm_gla2gpa *gg;
+	struct vm_activate_cpu *vac;
+	struct vm_cpuset *vm_cpuset;
 
 	sc = vmmdev_lookup2(cdev);
 	if (sc == NULL)
@@ -195,6 +198,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 	case VM_PPTDEV_MSIX:
 	case VM_SET_X2APIC_STATE:
 	case VM_GLA2GPA:
+	case VM_ACTIVATE_CPU:
 		/*
 		 * XXX fragile, handle with care
 		 * Assumes that the first field of the ioctl data is the vcpu.
@@ -439,6 +443,29 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 		}
 		break;
 	}
+	case VM_ACTIVATE_CPU:
+		vac = (struct vm_activate_cpu *)data;
+		error = vm_activate_cpu(sc->vm, vac->vcpuid);
+		break;
+	case VM_GET_CPUS:
+		error = 0;
+		vm_cpuset = (struct vm_cpuset *)data;
+		size = vm_cpuset->cpusetsize;
+		if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) {
+			error = ERANGE;
+			break;
+		}
+		cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
+		if (vm_cpuset->which == VM_ACTIVE_CPUS)
+			*cpuset = vm_active_cpus(sc->vm);
+		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
+			*cpuset = vm_suspended_cpus(sc->vm);
+		else
+			error = EINVAL;
+		if (error == 0)
+			error = copyout(cpuset, vm_cpuset->cpus, size);
+		free(cpuset, M_TEMP);
+		break;
 	default:
 		error = ENOTTY;
 		break;
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
index f9a67cb691fc..1e5d3b33abd2 100644
--- a/usr.sbin/bhyve/bhyverun.c
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -242,6 +242,15 @@ fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip)
 
 	assert(fromcpu == BSP);
 
+	/*
+	 * The 'newcpu' must be activated in the context of 'fromcpu'. If
+	 * vm_activate_cpu() is delayed until newcpu's pthread starts running
+	 * then vmm.ko is out-of-sync with bhyve and this can create a race
+	 * with vm_suspend().
+	 */
+	error = vm_activate_cpu(ctx, newcpu);
+	assert(error == 0);
+
 	CPU_SET_ATOMIC(newcpu, &cpumask);
 
 	/*
@@ -532,6 +541,7 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
 	int error, rc, prevcpu;
 	enum vm_exitcode exitcode;
 	enum vm_suspend_how how;
+	cpuset_t active_cpus;
 
 	if (vcpumap[vcpu] != NULL) {
 		error = pthread_setaffinity_np(pthread_self(),
@@ -539,6 +549,9 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
 		assert(error == 0);
 	}
 
+	error = vm_active_cpus(ctx, &active_cpus);
+	assert(CPU_ISSET(vcpu, &active_cpus));
+
 	while (1) {
 		error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]);
 		if (error != 0)
diff --git a/usr.sbin/bhyve/pci_lpc.c b/usr.sbin/bhyve/pci_lpc.c
index 6b61b7afd0a3..e98b1411dfcd 100644
--- a/usr.sbin/bhyve/pci_lpc.c
+++ b/usr.sbin/bhyve/pci_lpc.c
@@ -32,7 +32,6 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <machine/vmm.h>
-#include <machine/vmm_dev.h>
 
 #include <stdio.h>
 #include <stdlib.h>
diff --git a/usr.sbin/bhyvectl/bhyvectl.c b/usr.sbin/bhyvectl/bhyvectl.c
index ceee33a2aeeb..e77f0d77df6f 100644
--- a/usr.sbin/bhyvectl/bhyvectl.c
+++ b/usr.sbin/bhyvectl/bhyvectl.c
@@ -193,7 +193,9 @@ usage(void)
 	"       [--assert-lapic-lvt=<pin>]\n"
 	"       [--inject-nmi]\n"
 	"       [--force-reset]\n"
-	"       [--force-poweroff]\n",
+	"       [--force-poweroff]\n"
+	"       [--get-active-cpus]\n"
+	"       [--get-suspended-cpus]\n",
 	progname);
 	exit(1);
 }
@@ -203,6 +205,7 @@ static int inject_nmi, assert_lapic_lvt;
 static int force_reset, force_poweroff;
 static const char *capname;
 static int create, destroy, get_lowmem, get_highmem;
+static int get_active_cpus, get_suspended_cpus;
 static uint64_t memsize;
 static int set_cr0, get_cr0, set_cr3, get_cr3, set_cr4, get_cr4;
 static int set_efer, get_efer;
@@ -390,6 +393,25 @@ enum {
 	ASSERT_LAPIC_LVT,
 };
 
+static void
+print_cpus(const char *banner, const cpuset_t *cpus)
+{
+	int i, first;
+
+	first = 1;
+	printf("%s:\t", banner);
+	if (!CPU_EMPTY(cpus)) {
+		for (i = 0; i < CPU_SETSIZE; i++) {
+			if (CPU_ISSET(i, cpus)) {
+				printf("%s%d", first ? " " : ", ", i);
+				first = 0;
+			}
+		}
+	} else
+		printf(" (none)");
+	printf("\n");
+}
+
 int
 main(int argc, char *argv[])
 {
@@ -401,6 +423,7 @@ main(int argc, char *argv[])
 	uint64_t ctl, eptp, bm, addr, u64, pteval[4], *pte;
 	struct vmctx *ctx;
 	int wired;
+	cpuset_t cpus;
 
 	uint64_t cr0, cr3, cr4, dr7, rsp, rip, rflags, efer, pat;
 	uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp;
@@ -570,6 +593,8 @@ main(int argc, char *argv[])
 		{ "inject-nmi",	NO_ARG,		&inject_nmi,	1 },
 		{ "force-reset",	NO_ARG,	&force_reset,	1 },
 		{ "force-poweroff", NO_ARG,	&force_poweroff, 1 },
+		{ "get-active-cpus", NO_ARG,	&get_active_cpus, 1 },
+		{ "get-suspended-cpus", NO_ARG,	&get_suspended_cpus, 1 },
 		{ NULL,		0,		NULL,		0 }
 	};
 
@@ -1529,6 +1554,18 @@ main(int argc, char *argv[])
 		}
 	}
 
+	if (!error && (get_active_cpus || get_all)) {
+		error = vm_active_cpus(ctx, &cpus);
+		if (!error)
+			print_cpus("active cpus", &cpus);
+	}
+
+	if (!error && (get_suspended_cpus || get_all)) {
+		error = vm_suspended_cpus(ctx, &cpus);
+		if (!error)
+			print_cpus("suspended cpus", &cpus);
+	}
+
 	if (!error && run) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
 		assert(error == 0);
-- 
cgit v1.3


From 5fcf252f410e7784626d6d5d0e75042be23d4f24 Mon Sep 17 00:00:00 2001
From: Neel Natu <neel@FreeBSD.org>
Date: Sat, 7 Jun 2014 21:36:52 +0000
Subject: Add ioctl(VM_REINIT) to reinitialize the virtual machine state
 maintained by vmm.ko. This allows the virtual machine to be restarted without
 having to destroy it first.

Reviewed by:	grehan
---
 lib/libvmmapi/vmmapi.c         |   7 ++
 lib/libvmmapi/vmmapi.h         |   1 +
 sys/amd64/include/vmm.h        |   1 +
 sys/amd64/include/vmm_dev.h    |   3 +
 sys/amd64/vmm/vmm.c            | 204 ++++++++++++++++++++++++++---------------
 sys/amd64/vmm/vmm_dev.c        |   4 +
 sys/amd64/vmm/vmm_stat.c       |  16 +++-
 sys/amd64/vmm/vmm_stat.h       |   5 +-
 usr.sbin/bhyveload/bhyveload.c |  21 ++++-
 9 files changed, 177 insertions(+), 85 deletions(-)

(limited to 'lib/libvmmapi')

diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
index 89c782520bcc..5ce3d8e7ae82 100644
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -367,6 +367,13 @@ vm_suspend(struct vmctx *ctx, enum vm_suspend_how how)
 	return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend));
 }
 
+int
+vm_reinit(struct vmctx *ctx)
+{
+
+	return (ioctl(ctx->fd, VM_REINIT, 0));
+}
+
 static int
 vm_inject_exception_real(struct vmctx *ctx, int vcpu, int vector,
     int error_code, int error_code_valid)
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index 0f2e3ae57a49..4cc429065e46 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -69,6 +69,7 @@ int	vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval);
 int	vm_run(struct vmctx *ctx, int vcpu, uint64_t rip,
 	       struct vm_exit *ret_vmexit);
 int	vm_suspend(struct vmctx *ctx, enum vm_suspend_how how);
+int	vm_reinit(struct vmctx *ctx);
 int	vm_apicid2vcpu(struct vmctx *ctx, int apicid);
 int	vm_inject_exception(struct vmctx *ctx, int vcpu, int vec);
 int	vm_inject_exception2(struct vmctx *ctx, int vcpu, int vec, int errcode);
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index 05df325a152a..00e1d96afe7f 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -105,6 +105,7 @@ extern struct vmm_ops vmm_ops_amd;
 
 int vm_create(const char *name, struct vm **retvm);
 void vm_destroy(struct vm *vm);
+int vm_reinit(struct vm *vm);
 const char *vm_name(struct vm *vm);
 int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len);
 int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
index a6568dc4e2f2..9b3b00ded0ba 100644
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -196,6 +196,7 @@ enum {
 	IOCNUM_SET_CAPABILITY = 2,
 	IOCNUM_GET_CAPABILITY = 3,
 	IOCNUM_SUSPEND = 4,
+	IOCNUM_REINIT = 5,
 
 	/* memory apis */
 	IOCNUM_MAP_MEMORY = 10,
@@ -251,6 +252,8 @@ enum {
 	_IOWR('v', IOCNUM_RUN, struct vm_run)
 #define	VM_SUSPEND	\
 	_IOW('v', IOCNUM_SUSPEND, struct vm_suspend)
+#define	VM_REINIT	\
+	_IO('v', IOCNUM_REINIT)
 #define	VM_MAP_MEMORY	\
 	_IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment)
 #define	VM_GET_MEMORY_SEG \
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index e84359d6e696..435ba391e141 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -84,25 +84,31 @@ __FBSDID("$FreeBSD$");
 
 struct vlapic;
 
+/*
+ * Initialization:
+ * (a) allocated when vcpu is created
+ * (i) initialized when vcpu is created and when it is reinitialized
+ * (o) initialized the first time the vcpu is created
+ * (x) initialized before use
+ */
 struct vcpu {
-	int		flags;
-	enum vcpu_state	state;
-	struct mtx	mtx;
-	int		hostcpu;	/* host cpuid this vcpu last ran on */
-	uint64_t	guest_msrs[VMM_MSR_NUM];
-	struct vlapic	*vlapic;
-	int		 vcpuid;
-	struct savefpu	*guestfpu;	/* guest fpu state */
-	uint64_t	guest_xcr0;
-	void		*stats;
-	struct vm_exit	exitinfo;
-	enum x2apic_state x2apic_state;
-	int		nmi_pending;
-	int		extint_pending;
-	struct vm_exception exception;
-	int		exception_pending;
+	struct mtx 	mtx;		/* (o) protects 'state' and 'hostcpu' */
+	enum vcpu_state	state;		/* (o) vcpu state */
+	int		hostcpu;	/* (o) vcpu's host cpu */
+	struct vlapic	*vlapic;	/* (i) APIC device model */
+	enum x2apic_state x2apic_state;	/* (i) APIC mode */
+	int		nmi_pending;	/* (i) NMI pending */
+	int		extint_pending;	/* (i) INTR pending */
+	struct vm_exception exception;	/* (x) exception collateral */
+	int	exception_pending;	/* (i) exception pending */
+	struct savefpu	*guestfpu;	/* (a,i) guest fpu state */
+	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
+	void		*stats;		/* (a,i) statistics */
+	uint64_t guest_msrs[VMM_MSR_NUM]; /* (i) emulated MSRs */
+	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
 };
 
+#define	vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
 #define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
 #define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
 #define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
@@ -116,36 +122,33 @@ struct mem_seg {
 };
 #define	VM_MAX_MEMORY_SEGMENTS	2
 
+/*
+ * Initialization:
+ * (o) initialized the first time the VM is created
+ * (i) initialized when VM is created and when it is reinitialized
+ * (x) initialized before use
+ */
 struct vm {
-	void		*cookie;	/* processor-specific data */
-	void		*iommu;		/* iommu-specific data */
-	struct vhpet	*vhpet;		/* virtual HPET */
-	struct vioapic	*vioapic;	/* virtual ioapic */
-	struct vatpic	*vatpic;	/* virtual atpic */
-	struct vatpit	*vatpit;	/* virtual atpit */
-	struct vmspace	*vmspace;	/* guest's address space */
-	struct vcpu	vcpu[VM_MAXCPU];
-	int		num_mem_segs;
-	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
-	char		name[VM_MAX_NAMELEN];
-
-	/*
-	 * Set of active vcpus.
-	 * An active vcpu is one that has been started implicitly (BSP) or
-	 * explicitly (AP) by sending it a startup ipi.
-	 */
-	volatile cpuset_t active_cpus;
-
-	struct mtx	rendezvous_mtx;
-	cpuset_t	rendezvous_req_cpus;
-	cpuset_t	rendezvous_done_cpus;
-	void		*rendezvous_arg;
+	void		*cookie;		/* (i) cpu-specific data */
+	void		*iommu;			/* (x) iommu-specific data */
+	struct vhpet	*vhpet;			/* (i) virtual HPET */
+	struct vioapic	*vioapic;		/* (i) virtual ioapic */
+	struct vatpic	*vatpic;		/* (i) virtual atpic */
+	struct vatpit	*vatpit;		/* (i) virtual atpit */
+	volatile cpuset_t active_cpus;		/* (i) active vcpus */
+	int		suspend;		/* (i) stop VM execution */
+	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
+	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
+	cpuset_t	rendezvous_req_cpus;	/* (x) rendezvous requested */
+	cpuset_t	rendezvous_done_cpus;	/* (x) rendezvous finished */
+	void		*rendezvous_arg;	/* (x) rendezvous func/arg */
 	vm_rendezvous_func_t rendezvous_func;
-
-	int		suspend;
-	volatile cpuset_t suspended_cpus;
-
-	volatile cpuset_t halted_cpus;
+	struct mtx	rendezvous_mtx;		/* (o) rendezvous lock */
+	int		num_mem_segs;		/* (o) guest memory segments */
+	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
+	struct vmspace	*vmspace;		/* (o) guest's address space */
+	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
+	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
 };
 
 static int vmm_initialized;
@@ -206,31 +209,46 @@ SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
     "IPI vector used for vcpu notifications");
 
 static void
-vcpu_cleanup(struct vm *vm, int i)
+vcpu_cleanup(struct vm *vm, int i, bool destroy)
 {
 	struct vcpu *vcpu = &vm->vcpu[i];
 
 	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
-	vmm_stat_free(vcpu->stats);	
-	fpu_save_area_free(vcpu->guestfpu);
+	if (destroy) {
+		vmm_stat_free(vcpu->stats);	
+		fpu_save_area_free(vcpu->guestfpu);
+	}
 }
 
 static void
-vcpu_init(struct vm *vm, uint32_t vcpu_id)
+vcpu_init(struct vm *vm, int vcpu_id, bool create)
 {
 	struct vcpu *vcpu;
-	
+
+	KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU,
+	    ("vcpu_init: invalid vcpu %d", vcpu_id));
+	  
 	vcpu = &vm->vcpu[vcpu_id];
 
-	vcpu_lock_init(vcpu);
-	vcpu->hostcpu = NOCPU;
-	vcpu->vcpuid = vcpu_id;
+	if (create) {
+		KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
+		    "initialized", vcpu_id));
+		vcpu_lock_init(vcpu);
+		vcpu->state = VCPU_IDLE;
+		vcpu->hostcpu = NOCPU;
+		vcpu->guestfpu = fpu_save_area_alloc();
+		vcpu->stats = vmm_stat_alloc();
+	}
+
 	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
 	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
+	vcpu->nmi_pending = 0;
+	vcpu->extint_pending = 0;
+	vcpu->exception_pending = 0;
 	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
-	vcpu->guestfpu = fpu_save_area_alloc();
 	fpu_save_area_reset(vcpu->guestfpu);
-	vcpu->stats = vmm_stat_alloc();
+	vmm_stat_init(vcpu->stats);
+	guest_msrs_init(vm, vcpu_id);
 }
 
 struct vm_exit *
@@ -335,10 +353,30 @@ static moduledata_t vmm_kmod = {
 DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
 MODULE_VERSION(vmm, 1);
 
+static void
+vm_init(struct vm *vm, bool create)
+{
+	int i;
+
+	vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
+	vm->iommu = NULL;
+	vm->vioapic = vioapic_init(vm);
+	vm->vhpet = vhpet_init(vm);
+	vm->vatpic = vatpic_init(vm);
+	vm->vatpit = vatpit_init(vm);
+
+	CPU_ZERO(&vm->active_cpus);
+
+	vm->suspend = 0;
+	CPU_ZERO(&vm->suspended_cpus);
+
+	for (i = 0; i < VM_MAXCPU; i++)
+		vcpu_init(vm, i, create);
+}
+
 int
 vm_create(const char *name, struct vm **retvm)
 {
-	int i;
 	struct vm *vm;
 	struct vmspace *vmspace;
 
@@ -358,18 +396,11 @@ vm_create(const char *name, struct vm **retvm)
 
 	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
 	strcpy(vm->name, name);
+	vm->num_mem_segs = 0;
 	vm->vmspace = vmspace;
 	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
-	vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
-	vm->vioapic = vioapic_init(vm);
-	vm->vhpet = vhpet_init(vm);
-	vm->vatpic = vatpic_init(vm);
-	vm->vatpit = vatpit_init(vm);
 
-	for (i = 0; i < VM_MAXCPU; i++) {
-		vcpu_init(vm, i);
-		guest_msrs_init(vm, i);
-	}
+	vm_init(vm, true);
 
 	*retvm = vm;
 	return (0);
@@ -385,8 +416,8 @@ vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
 	bzero(seg, sizeof(*seg));
 }
 
-void
-vm_destroy(struct vm *vm)
+static void
+vm_cleanup(struct vm *vm, bool destroy)
 {
 	int i;
 
@@ -400,21 +431,48 @@ vm_destroy(struct vm *vm)
 	vatpic_cleanup(vm->vatpic);
 	vioapic_cleanup(vm->vioapic);
 
-	for (i = 0; i < vm->num_mem_segs; i++)
-		vm_free_mem_seg(vm, &vm->mem_segs[i]);
+	for (i = 0; i < VM_MAXCPU; i++)
+		vcpu_cleanup(vm, i, destroy);
 
-	vm->num_mem_segs = 0;
+	VMCLEANUP(vm->cookie);
 
-	for (i = 0; i < VM_MAXCPU; i++)
-		vcpu_cleanup(vm, i);
+	if (destroy) {
+		for (i = 0; i < vm->num_mem_segs; i++)
+			vm_free_mem_seg(vm, &vm->mem_segs[i]);
 
-	VMSPACE_FREE(vm->vmspace);
+		vm->num_mem_segs = 0;
 
-	VMCLEANUP(vm->cookie);
+		VMSPACE_FREE(vm->vmspace);
+		vm->vmspace = NULL;
+	}
+}
 
+void
+vm_destroy(struct vm *vm)
+{
+	vm_cleanup(vm, true);
 	free(vm, M_VM);
 }
 
+int
+vm_reinit(struct vm *vm)
+{
+	int error;
+
+	/*
+	 * A virtual machine can be reset only if all vcpus are suspended.
+	 */
+	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
+		vm_cleanup(vm, false);
+		vm_init(vm, false);
+		error = 0;
+	} else {
+		error = EBUSY;
+	}
+
+	return (error);
+}
+
 const char *
 vm_name(struct vm *vm)
 {
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
index 824389f18a98..f3e31a33df4a 100644
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -220,6 +220,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 	case VM_BIND_PPTDEV:
 	case VM_UNBIND_PPTDEV:
 	case VM_MAP_MEMORY:
+	case VM_REINIT:
 		/*
 		 * ioctls that operate on the entire virtual machine must
 		 * prevent all vcpus from running.
@@ -253,6 +254,9 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 		vmsuspend = (struct vm_suspend *)data;
 		error = vm_suspend(sc->vm, vmsuspend->how);
 		break;
+	case VM_REINIT:
+		error = vm_reinit(sc->vm);
+		break;
 	case VM_STAT_DESC: {
 		statdesc = (struct vm_stat_desc *)data;
 		error = vmm_stat_desc_copy(statdesc->index,
diff --git a/sys/amd64/vmm/vmm_stat.c b/sys/amd64/vmm/vmm_stat.c
index e3d699923c0f..ef9f41173fee 100644
--- a/sys/amd64/vmm/vmm_stat.c
+++ b/sys/amd64/vmm/vmm_stat.c
@@ -52,8 +52,10 @@ static struct vmm_stat_type *vsttab[MAX_VMM_STAT_ELEMS];
 
 static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat");
 
+#define	vst_size	((size_t)vst_num_elems * sizeof(uint64_t))
+
 void
-vmm_stat_init(void *arg)
+vmm_stat_register(void *arg)
 {
 	struct vmm_stat_type *vst = arg;
 
@@ -97,11 +99,15 @@ vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf)
 void *
 vmm_stat_alloc(void)
 {
-	u_long size;
-	
-	size = vst_num_elems * sizeof(uint64_t);
 
-	return (malloc(size, M_VMM_STAT, M_ZERO | M_WAITOK));
+	return (malloc(vst_size, M_VMM_STAT, M_WAITOK));
+}
+
+void
+vmm_stat_init(void *vp)
+{
+
+	bzero(vp, vst_size);
 }
 
 void
diff --git a/sys/amd64/vmm/vmm_stat.h b/sys/amd64/vmm/vmm_stat.h
index 9110c8f0f0ed..6e98965ac270 100644
--- a/sys/amd64/vmm/vmm_stat.h
+++ b/sys/amd64/vmm/vmm_stat.h
@@ -49,13 +49,13 @@ struct vmm_stat_type {
 	enum vmm_stat_scope scope;
 };
 
-void	vmm_stat_init(void *arg);
+void	vmm_stat_register(void *arg);
 
 #define	VMM_STAT_DEFINE(type, nelems, desc, scope)			\
 	struct vmm_stat_type type[1] = {				\
 		{ -1, nelems, desc, scope }				\
 	};								\
-	SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_init, type)
+	SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_register, type)
 
 #define	VMM_STAT_DECLARE(type)						\
 	extern struct vmm_stat_type type[1]
@@ -71,6 +71,7 @@ void	vmm_stat_init(void *arg);
 	VMM_STAT_DEFINE(type, nelems, desc, VMM_STAT_SCOPE_ANY)
 
 void	*vmm_stat_alloc(void);
+void	vmm_stat_init(void *vp);
 void 	vmm_stat_free(void *vp);
 
 /*
diff --git a/usr.sbin/bhyveload/bhyveload.c b/usr.sbin/bhyveload/bhyveload.c
index c1a54326360b..44424963fed0 100644
--- a/usr.sbin/bhyveload/bhyveload.c
+++ b/usr.sbin/bhyveload/bhyveload.c
@@ -642,7 +642,7 @@ main(int argc, char** argv)
 	void *h;
 	void (*func)(struct loader_callbacks *, void *, int, int);
 	uint64_t mem_size;
-	int opt, error;
+	int opt, error, need_reinit;
 
 	progname = basename(argv[0]);
 
@@ -691,11 +691,14 @@ main(int argc, char** argv)
 
 	vmname = argv[0];
 
+	need_reinit = 0;
 	error = vm_create(vmname);
-	if (error != 0 && errno != EEXIST) {
-		perror("vm_create");
-		exit(1);
-
+	if (error) {
+		if (errno != EEXIST) {
+			perror("vm_create");
+			exit(1);
+		}
+		need_reinit = 1;
 	}
 
 	ctx = vm_open(vmname);
@@ -704,6 +707,14 @@ main(int argc, char** argv)
 		exit(1);
 	}
 
+	if (need_reinit) {
+		error = vm_reinit(ctx);
+		if (error) {
+			perror("vm_reinit");
+			exit(1);
+		}
+	}
+
 	error = vm_setup_memory(ctx, mem_size, VM_MMAP_ALL);
 	if (error) {
 		perror("vm_setup_memory");
-- 
cgit v1.3


From be679db4cd69a64d8810c513c2cbea2e6edf0e27 Mon Sep 17 00:00:00 2001
From: Neel Natu <neel@FreeBSD.org>
Date: Tue, 24 Jun 2014 02:02:51 +0000
Subject: Provide APIs to get the 'lowmem' and 'highmem' sizes directly.

Previously the sizes were inferred indirectly based on the size of the mappings
at 0 and 4GB respectively. This works fine as long as the size of the allocation
is identical to the size of the mapping in the guest's address space. However,
if the mapping is disjoint then this assumption falls apart (e.g., due to the
legacy BIOS hole between 640KB and 1MB).
---
 lib/libvmmapi/vmmapi.c         | 14 ++++++++++++++
 lib/libvmmapi/vmmapi.h         |  2 ++
 usr.sbin/bhyve/pci_emul.c      |  3 +--
 usr.sbin/bhyve/rtc.c           | 15 +++++----------
 usr.sbin/bhyve/smbiostbl.c     |  9 ++-------
 usr.sbin/bhyveload/bhyveload.c |  4 ++--
 6 files changed, 26 insertions(+), 21 deletions(-)

(limited to 'lib/libvmmapi')

diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
index 5ce3d8e7ae82..9fb2308731e7 100644
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -274,6 +274,20 @@ vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
 	return (NULL);
 }
 
+size_t
+vm_get_lowmem_size(struct vmctx *ctx)
+{
+
+	return (ctx->lowmem);
+}
+
+size_t
+vm_get_highmem_size(struct vmctx *ctx)
+{
+
+	return (ctx->highmem);
+}
+
 int
 vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
 	    uint64_t base, uint32_t limit, uint32_t access)
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index 4cc429065e46..067eaa0aa26c 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -60,6 +60,8 @@ int	vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num);
 uint32_t vm_get_lowmem_limit(struct vmctx *ctx);
 void	vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit);
 void	vm_set_memflags(struct vmctx *ctx, int flags);
+size_t	vm_get_lowmem_size(struct vmctx *ctx);
+size_t	vm_get_highmem_size(struct vmctx *ctx);
 int	vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
 		    uint64_t base, uint32_t limit, uint32_t access);
 int	vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c
index a2c47ec0509c..458ba76480b1 100644
--- a/usr.sbin/bhyve/pci_emul.c
+++ b/usr.sbin/bhyve/pci_emul.c
@@ -1118,8 +1118,7 @@ init_pci(struct vmctx *ctx)
 	 * Accesses to memory addresses that are not allocated to system
 	 * memory or PCI devices return 0xff's.
 	 */
-	error = vm_get_memory_seg(ctx, 0, &lowmem, NULL);
-	assert(error == 0);
+	lowmem = vm_get_lowmem_size(ctx);
 
 	memset(&pci_mem_hole, 0, sizeof(struct mem_range));
 	pci_mem_hole.name = "PCI hole";
diff --git a/usr.sbin/bhyve/rtc.c b/usr.sbin/bhyve/rtc.c
index 11877479807c..b3631fc07228 100644
--- a/usr.sbin/bhyve/rtc.c
+++ b/usr.sbin/bhyve/rtc.c
@@ -343,19 +343,14 @@ rtc_init(struct vmctx *ctx)
 	 * 0x34/0x35 - 64KB chunks above 16MB, below 4GB
 	 * 0x5b/0x5c/0x5d - 64KB chunks above 4GB
 	 */
-	err = vm_get_memory_seg(ctx, 0, &lomem, NULL);
-	assert(err == 0);
-
-	lomem = (lomem - m_16MB) / m_64KB;
+	lomem = (vm_get_lowmem_size(ctx) - m_16MB) / m_64KB;
 	rtc_nvram[nvoff(RTC_LMEM_LSB)] = lomem;
 	rtc_nvram[nvoff(RTC_LMEM_MSB)] = lomem >> 8;
 
-	if (vm_get_memory_seg(ctx, m_4GB, &himem, NULL) == 0) {	  
-		himem /= m_64KB;
-		rtc_nvram[nvoff(RTC_HMEM_LSB)] = himem;
-		rtc_nvram[nvoff(RTC_HMEM_SB)]  = himem >> 8;
-		rtc_nvram[nvoff(RTC_HMEM_MSB)] = himem >> 16;
-	}
+	himem = vm_get_highmem_size(ctx) / m_64KB;
+	rtc_nvram[nvoff(RTC_HMEM_LSB)] = himem;
+	rtc_nvram[nvoff(RTC_HMEM_SB)]  = himem >> 8;
+	rtc_nvram[nvoff(RTC_HMEM_MSB)] = himem >> 16;
 }
 
 INOUT_PORT(rtc, IO_RTC, IOPORT_F_INOUT, rtc_addr_handler);
diff --git a/usr.sbin/bhyve/smbiostbl.c b/usr.sbin/bhyve/smbiostbl.c
index 9d1cfb3198be..d560f022fc2a 100644
--- a/usr.sbin/bhyve/smbiostbl.c
+++ b/usr.sbin/bhyve/smbiostbl.c
@@ -779,13 +779,8 @@ smbios_build(struct vmctx *ctx)
 	int				i;
 	int				err;
 
-	err = vm_get_memory_seg(ctx, 0, &guest_lomem, NULL);
-	if (err != 0)
-		return (err);
-
-	err = vm_get_memory_seg(ctx, 4*GB, &guest_himem, NULL);
-	if (err != 0)
-		return (err);
+	guest_lomem = vm_get_lowmem_size(ctx);
+	guest_himem = vm_get_highmem_size(ctx);
 
 	startaddr = paddr_guest2host(ctx, SMBIOS_BASE, SMBIOS_MAX_LENGTH);
 	if (startaddr == NULL) {
diff --git a/usr.sbin/bhyveload/bhyveload.c b/usr.sbin/bhyveload/bhyveload.c
index 44424963fed0..ff6b26926f3b 100644
--- a/usr.sbin/bhyveload/bhyveload.c
+++ b/usr.sbin/bhyveload/bhyveload.c
@@ -505,8 +505,8 @@ static void
 cb_getmem(void *arg, uint64_t *ret_lowmem, uint64_t *ret_highmem)
 {
 
-	vm_get_memory_seg(ctx, 0, ret_lowmem, NULL);
-	vm_get_memory_seg(ctx, 4 * GB, ret_highmem, NULL);
+	*ret_lowmem = vm_get_lowmem_size(ctx);
+	*ret_highmem = vm_get_highmem_size(ctx);
 }
 
 struct env {
-- 
cgit v1.3


From 091d453222c352732e496226ffceb33c0b165f56 Mon Sep 17 00:00:00 2001
From: Neel Natu <neel@FreeBSD.org>
Date: Sat, 19 Jul 2014 20:59:08 +0000
Subject: Handle nested exceptions in bhyve.

A nested exception condition arises when a second exception is triggered while
delivering the first exception. Most nested exceptions can be handled serially
but some are converted into a double fault. If an exception is generated during
delivery of a double fault then the virtual machine shuts down as a result of
a triple fault.

vm_exit_intinfo() is used to record that a VM-exit happened while an event was
being delivered through the IDT. If an exception is triggered while handling
the VM-exit it will be treated like a nested exception.

vm_entry_intinfo() is used by processor-specific code to get the event to be
injected into the guest on the next VM-entry. This function is responsible for
deciding the disposition of nested exceptions.
---
 lib/libvmmapi/vmmapi.c       |  29 ++++++
 lib/libvmmapi/vmmapi.h       |   3 +
 sys/amd64/include/vmm.h      |  40 ++++++--
 sys/amd64/include/vmm_dev.h  |  12 +++
 sys/amd64/vmm/intel/vmx.c    | 122 ++++++++++++++---------
 sys/amd64/vmm/vmm.c          | 225 +++++++++++++++++++++++++++++++++++++++----
 sys/amd64/vmm/vmm_dev.c      |  12 +++
 usr.sbin/bhyve/bhyverun.c    |   2 +
 usr.sbin/bhyve/task_switch.c |  10 +-
 usr.sbin/bhyvectl/bhyvectl.c |  46 ++++++++-
 10 files changed, 424 insertions(+), 77 deletions(-)

(limited to 'lib/libvmmapi')

diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
index 9fb2308731e7..483aa5199b6c 100644
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -1106,3 +1106,32 @@ vm_activate_cpu(struct vmctx *ctx, int vcpu)
 	error = ioctl(ctx->fd, VM_ACTIVATE_CPU, &ac);
 	return (error);
 }
+
+int
+vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *info1, uint64_t *info2)
+{
+	struct vm_intinfo vmii;
+	int error;
+
+	bzero(&vmii, sizeof(struct vm_intinfo));
+	vmii.vcpuid = vcpu;
+	error = ioctl(ctx->fd, VM_GET_INTINFO, &vmii);
+	if (error == 0) {
+		*info1 = vmii.info1;
+		*info2 = vmii.info2;
+	}
+	return (error);
+}
+
+int
+vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1)
+{
+	struct vm_intinfo vmii;
+	int error;
+
+	bzero(&vmii, sizeof(struct vm_intinfo));
+	vmii.vcpuid = vcpu;
+	vmii.info1 = info1;
+	error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii);
+	return (error);
+}
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index 067eaa0aa26c..2040c91e205f 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -104,6 +104,9 @@ int	vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot,
 	    int func, int idx, uint64_t addr, uint64_t msg,
 	    uint32_t vector_control);
 
+int	vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *i1, uint64_t *i2);
+int	vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t exit_intinfo);
+
 /*
  * Return a pointer to the statistics buffer. Note that this is not MT-safe.
  */
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index 9c05b894968b..6895e64037e0 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -34,6 +34,7 @@ enum vm_suspend_how {
 	VM_SUSPEND_RESET,
 	VM_SUSPEND_POWEROFF,
 	VM_SUSPEND_HALT,
+	VM_SUSPEND_TRIPLEFAULT,
 	VM_SUSPEND_LAST
 };
 
@@ -88,6 +89,16 @@ enum x2apic_state {
 	X2APIC_STATE_LAST
 };
 
+#define	VM_INTINFO_VECTOR(info)	((info) & 0xff)
+#define	VM_INTINFO_DEL_ERRCODE	0x800
+#define	VM_INTINFO_RSVD		0x7ffff000
+#define	VM_INTINFO_VALID	0x80000000
+#define	VM_INTINFO_TYPE		0x700
+#define	VM_INTINFO_HWINTR	(0 << 8)
+#define	VM_INTINFO_NMI		(2 << 8)
+#define	VM_INTINFO_HWEXCEPTION	(3 << 8)
+#define	VM_INTINFO_SWINTR	(4 << 8)
+
 #ifdef _KERNEL
 
 #define	VM_MAX_NAMELEN	32
@@ -278,14 +289,31 @@ struct vatpit *vm_atpit(struct vm *vm);
 int vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *vme);
 
 /*
- * Returns 0 if there is no exception pending for this vcpu. Returns 1 if an
- * exception is pending and also updates 'vme'. The pending exception is
- * cleared when this function returns.
+ * This function is called after a VM-exit that occurred during exception or
+ * interrupt delivery through the IDT. The format of 'intinfo' is described
+ * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2.
  *
- * This function should only be called in the context of the thread that is
- * executing this vcpu.
+ * If a VM-exit handler completes the event delivery successfully then it
+ * should call vm_exit_intinfo() to extinguish the pending event. For example,
+ * if the task switch emulation is triggered via a task gate then it should
+ * call this function with 'intinfo=0' to indicate that the external event
+ * is not pending anymore.
+ *
+ * Return value is 0 on success and non-zero on failure.
  */
-int vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *vme);
+int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo);
+
+/*
+ * This function is called before every VM-entry to retrieve a pending
+ * event that should be injected into the guest. This function combines
+ * nested events into a double or triple fault.
+ *
+ * Returns 0 if there are no events that need to be injected into the guest
+ * and non-zero otherwise.
+ */
+int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info);
+
+int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2);
 
 void vm_inject_gp(struct vm *vm, int vcpuid); /* general protection fault */
 void vm_inject_ud(struct vm *vm, int vcpuid); /* undefined instruction fault */
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
index 9b3b00ded0ba..e4d839ef6549 100644
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -189,6 +189,12 @@ struct vm_cpuset {
 #define	VM_ACTIVE_CPUS		0
 #define	VM_SUSPENDED_CPUS	1
 
+struct vm_intinfo {
+	int		vcpuid;
+	uint64_t	info1;
+	uint64_t	info2;
+};
+
 enum {
 	/* general routines */
 	IOCNUM_ABIVERS = 0,
@@ -211,6 +217,8 @@ enum {
 	IOCNUM_GET_SEGMENT_DESCRIPTOR = 23,
 
 	/* interrupt injection */
+	IOCNUM_GET_INTINFO = 28,
+	IOCNUM_SET_INTINFO = 29,
 	IOCNUM_INJECT_EXCEPTION = 30,
 	IOCNUM_LAPIC_IRQ = 31,
 	IOCNUM_INJECT_NMI = 32,
@@ -324,4 +332,8 @@ enum {
 	_IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu)
 #define	VM_GET_CPUS	\
 	_IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset)
+#define	VM_SET_INTINFO	\
+	_IOW('v', IOCNUM_SET_INTINFO, struct vm_intinfo)
+#define	VM_GET_INTINFO	\
+	_IOWR('v', IOCNUM_GET_INTINFO, struct vm_intinfo)
 #endif
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index 271f8ce173e2..22732a276e8d 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -1213,22 +1213,31 @@ vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
 {
 	struct vm_exception exc;
 	int vector, need_nmi_exiting, extint_pending;
-	uint64_t rflags;
+	uint64_t rflags, entryinfo;
 	uint32_t gi, info;
 
-	if (vm_exception_pending(vmx->vm, vcpu, &exc)) {
-		KASSERT(exc.vector >= 0 && exc.vector < 32,
-		    ("%s: invalid exception vector %d", __func__, exc.vector));
+	if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) {
+		KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
+		    "intinfo is not valid: %#lx", __func__, entryinfo));
 
 		info = vmcs_read(VMCS_ENTRY_INTR_INFO);
 		KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
 		     "pending exception %d: %#x", __func__, exc.vector, info));
 
-		info = exc.vector | VMCS_INTR_T_HWEXCEPTION | VMCS_INTR_VALID;
-		if (exc.error_code_valid) {
-			info |= VMCS_INTR_DEL_ERRCODE;
-			vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, exc.error_code);
+		info = entryinfo;
+		vector = info & 0xff;
+		if (vector == IDT_BP || vector == IDT_OF) {
+			/*
+			 * VT-x requires #BP and #OF to be injected as software
+			 * exceptions.
+			 */
+			info &= ~VMCS_INTR_T_MASK;
+			info |= VMCS_INTR_T_SWEXCEPTION;
 		}
+
+		if (info & VMCS_INTR_DEL_ERRCODE)
+			vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32);
+
 		vmcs_write(VMCS_ENTRY_INTR_INFO, info);
 	}
 
@@ -1407,6 +1416,16 @@ vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
 	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
 }
 
+static void
+vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid)
+{
+	uint32_t gi;
+
+	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
+	KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING,
+	    ("NMI blocking is not in effect %#x", gi));
+}
+
 static int
 vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 {
@@ -2050,7 +2069,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 	struct vm_task_switch *ts;
 	uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info;
 	uint32_t intr_type, reason;
-	uint64_t qual, gpa;
+	uint64_t exitintinfo, qual, gpa;
 	bool retu;
 
 	CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
@@ -2070,47 +2089,49 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 	 * be handled specially by re-injecting the event if the IDT
 	 * vectoring information field's valid bit is set.
 	 *
-	 * If the VM-exit is due to a task gate in the IDT then we don't
-	 * reinject the event because emulating the task switch also
-	 * completes the event delivery.
-	 *
 	 * See "Information for VM Exits During Event Delivery" in Intel SDM
 	 * for details.
 	 */
-	switch (reason) {
-	case EXIT_REASON_EPT_FAULT:
-	case EXIT_REASON_EPT_MISCONFIG:
-	case EXIT_REASON_APIC_ACCESS:
-	case EXIT_REASON_TASK_SWITCH:
-	case EXIT_REASON_EXCEPTION:
-		idtvec_info = vmcs_idt_vectoring_info();
-		VCPU_CTR2(vmx->vm, vcpu, "vm exit %s: idtvec_info 0x%08x",
-		    exit_reason_to_str(reason), idtvec_info);	
-		if ((idtvec_info & VMCS_IDT_VEC_VALID) &&
-		    (reason != EXIT_REASON_TASK_SWITCH)) {
-			idtvec_info &= ~(1 << 12); /* clear undefined bit */
-			vmcs_write(VMCS_ENTRY_INTR_INFO, idtvec_info);
-			if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
-				idtvec_err = vmcs_idt_vectoring_err();
-				vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR,
-				    idtvec_err);
-			}
-			/*
-			 * If 'virtual NMIs' are being used and the VM-exit
-			 * happened while injecting an NMI during the previous
-			 * VM-entry, then clear "blocking by NMI" in the Guest
-			 * Interruptibility-state.
-			 */
-			if ((idtvec_info & VMCS_INTR_T_MASK) ==
-			    VMCS_INTR_T_NMI) {
-				 vmx_clear_nmi_blocking(vmx, vcpu);
-			}
+	idtvec_info = vmcs_idt_vectoring_info();
+	if (idtvec_info & VMCS_IDT_VEC_VALID) {
+		idtvec_info &= ~(1 << 12); /* clear undefined bit */
+		exitintinfo = idtvec_info;
+		if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
+			idtvec_err = vmcs_idt_vectoring_err();
+			exitintinfo |= (uint64_t)idtvec_err << 32;
+		}
+		error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo);
+		KASSERT(error == 0, ("%s: vm_set_intinfo error %d",
+		    __func__, error));
+
+		/*
+		 * If 'virtual NMIs' are being used and the VM-exit
+		 * happened while injecting an NMI during the previous
+		 * VM-entry, then clear "blocking by NMI" in the
+		 * Guest Interruptibility-State so the NMI can be
+		 * reinjected on the subsequent VM-entry.
+		 *
+		 * However, if the NMI was being delivered through a task
+		 * gate, then the new task must start execution with NMIs
+		 * blocked so don't clear NMI blocking in this case.
+		 */
+		intr_type = idtvec_info & VMCS_INTR_T_MASK;
+		if (intr_type == VMCS_INTR_T_NMI) {
+			if (reason != EXIT_REASON_TASK_SWITCH)
+				vmx_clear_nmi_blocking(vmx, vcpu);
+			else
+				vmx_assert_nmi_blocking(vmx, vcpu);
+		}
+
+		/*
+		 * Update VM-entry instruction length if the event being
+		 * delivered was a software interrupt or software exception.
+		 */
+		if (intr_type == VMCS_INTR_T_SWINTR ||
+		    intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION ||
+		    intr_type == VMCS_INTR_T_SWEXCEPTION) {
 			vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
 		}
-		break;
-	default:
-		idtvec_info = 0;
-		break;
 	}
 
 	switch (reason) {
@@ -2136,7 +2157,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 		 */
 		if (ts->reason == TSR_IDT_GATE) {
 			KASSERT(idtvec_info & VMCS_IDT_VEC_VALID,
-			    ("invalid idtvec_info %x for IDT task switch",
+			    ("invalid idtvec_info %#x for IDT task switch",
 			    idtvec_info));
 			intr_type = idtvec_info & VMCS_INTR_T_MASK;
 			if (intr_type != VMCS_INTR_T_SWINTR &&
@@ -2302,6 +2323,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 		 * the guest.
 		 *
 		 * See "Resuming Guest Software after Handling an Exception".
+		 * See "Information for VM Exits Due to Vectored Events".
 		 */
 		if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
 		    (intr_info & 0xff) != IDT_DF &&
@@ -2519,6 +2541,13 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
 		 * pmap_invalidate_ept().
 		 */
 		disable_intr();
+		vmx_inject_interrupts(vmx, vcpu, vlapic);
+
+		/*
+		 * Check for vcpu suspension after injecting events because
+		 * vmx_inject_interrupts() can suspend the vcpu due to a
+		 * triple fault.
+		 */
 		if (vcpu_suspended(suspend_cookie)) {
 			enable_intr();
 			vm_exit_suspended(vmx->vm, vcpu, vmcs_guest_rip());
@@ -2539,7 +2568,6 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
 			break;
 		}
 
-		vmx_inject_interrupts(vmx, vcpu, vlapic);
 		vmx_run_trace(vmx, vcpu);
 		rc = vmx_enter_guest(vmxctx, vmx, launched);
 
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index d1d9d5a52a63..25042546b39e 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -97,6 +97,7 @@ struct vcpu {
 	int		hostcpu;	/* (o) vcpu's host cpu */
 	struct vlapic	*vlapic;	/* (i) APIC device model */
 	enum x2apic_state x2apic_state;	/* (i) APIC mode */
+	uint64_t	exitintinfo;	/* (i) events pending at VM exit */
 	int		nmi_pending;	/* (i) NMI pending */
 	int		extint_pending;	/* (i) INTR pending */
 	struct vm_exception exception;	/* (x) exception collateral */
@@ -241,6 +242,7 @@ vcpu_init(struct vm *vm, int vcpu_id, bool create)
 
 	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
 	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
+	vcpu->exitintinfo = 0;
 	vcpu->nmi_pending = 0;
 	vcpu->extint_pending = 0;
 	vcpu->exception_pending = 0;
@@ -1457,6 +1459,202 @@ restart:
 	return (error);
 }
 
+int
+vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
+{
+	struct vcpu *vcpu;
+	int type, vector;
+
+	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+		return (EINVAL);
+
+	vcpu = &vm->vcpu[vcpuid];
+
+	if (info & VM_INTINFO_VALID) {
+		type = info & VM_INTINFO_TYPE;
+		vector = info & 0xff;
+		if (type == VM_INTINFO_NMI && vector != IDT_NMI)
+			return (EINVAL);
+		if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
+			return (EINVAL);
+		if (info & VM_INTINFO_RSVD)
+			return (EINVAL);
+	} else {
+		info = 0;
+	}
+	VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info);
+	vcpu->exitintinfo = info;
+	return (0);
+}
+
+enum exc_class {
+	EXC_BENIGN,
+	EXC_CONTRIBUTORY,
+	EXC_PAGEFAULT
+};
+
+#define	IDT_VE	20	/* Virtualization Exception (Intel specific) */
+
+static enum exc_class
+exception_class(uint64_t info)
+{
+	int type, vector;
+
+	KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
+	type = info & VM_INTINFO_TYPE;
+	vector = info & 0xff;
+
+	/* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
+	switch (type) {
+	case VM_INTINFO_HWINTR:
+	case VM_INTINFO_SWINTR:
+	case VM_INTINFO_NMI:
+		return (EXC_BENIGN);
+	default:
+		/*
+		 * Hardware exception.
+		 *
+		 * SVM and VT-x use identical type values to represent NMI,
+		 * hardware interrupt and software interrupt.
+		 *
+		 * SVM uses type '3' for all exceptions. VT-x uses type '3'
+		 * for exceptions except #BP and #OF. #BP and #OF use a type
+		 * value of '5' or '6'. Therefore we don't check for explicit
+		 * values of 'type' to classify 'intinfo' as a hardware
+		 * exception.
+		 */
+		break;
+	}
+
+	switch (vector) {
+	case IDT_PF:
+	case IDT_VE:
+		return (EXC_PAGEFAULT);
+	case IDT_DE:
+	case IDT_TS:
+	case IDT_NP:
+	case IDT_SS:
+	case IDT_GP:
+		return (EXC_CONTRIBUTORY);
+	default:
+		return (EXC_BENIGN);
+	}
+}
+
+static int
+nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
+    uint64_t *retinfo)
+{
+	enum exc_class exc1, exc2;
+	int type1, vector1;
+
+	KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
+	KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));
+
+	/*
+	 * If an exception occurs while attempting to call the double-fault
+	 * handler the processor enters shutdown mode (aka triple fault).
+	 */
+	type1 = info1 & VM_INTINFO_TYPE;
+	vector1 = info1 & 0xff;
+	if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
+		VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)",
+		    info1, info2);
+		vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
+		*retinfo = 0;
+		return (0);
+	}
+
+	/*
+	 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
+	 */
+	exc1 = exception_class(info1);
+	exc2 = exception_class(info2);
+	if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
+	    (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
+		/* Convert nested fault into a double fault. */
+		*retinfo = IDT_DF;
+		*retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
+		*retinfo |= VM_INTINFO_DEL_ERRCODE;
+	} else {
+		/* Handle exceptions serially */
+		*retinfo = info2;
+	}
+	return (1);
+}
+
+static uint64_t
+vcpu_exception_intinfo(struct vcpu *vcpu)
+{
+	uint64_t info = 0;
+
+	if (vcpu->exception_pending) {
+		info = vcpu->exception.vector & 0xff;
+		info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
+		if (vcpu->exception.error_code_valid) {
+			info |= VM_INTINFO_DEL_ERRCODE;
+			info |= (uint64_t)vcpu->exception.error_code << 32;
+		}
+	}
+	return (info);
+}
+
+int
+vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
+{
+	struct vcpu *vcpu;
+	uint64_t info1, info2;
+	int valid;
+
+	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
+
+	vcpu = &vm->vcpu[vcpuid];
+
+	info1 = vcpu->exitintinfo;
+	vcpu->exitintinfo = 0;
+
+	info2 = 0;
+	if (vcpu->exception_pending) {
+		info2 = vcpu_exception_intinfo(vcpu);
+		vcpu->exception_pending = 0;
+		VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
+		    vcpu->exception.vector, info2);
+	}
+
+	if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
+		valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
+	} else if (info1 & VM_INTINFO_VALID) {
+		*retinfo = info1;
+		valid = 1;
+	} else if (info2 & VM_INTINFO_VALID) {
+		*retinfo = info2;
+		valid = 1;
+	} else {
+		valid = 0;
+	}
+
+	if (valid) {
+		VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), "
+		    "retinfo(%#lx)", __func__, info1, info2, *retinfo);
+	}
+
+	return (valid);
+}
+
+int
+vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
+{
+	struct vcpu *vcpu;
+
+	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+		return (EINVAL);
+
+	vcpu = &vm->vcpu[vcpuid];
+	*info1 = vcpu->exitintinfo;
+	*info2 = vcpu_exception_intinfo(vcpu);
+	return (0);
+}
+
 int
 vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
 {
@@ -1468,6 +1666,14 @@ vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
 	if (exception->vector < 0 || exception->vector >= 32)
 		return (EINVAL);
 
+	/*
+	 * A double fault exception should never be injected directly into
+	 * the guest. It is a derived exception that results from specific
+	 * combinations of nested faults.
+	 */
+	if (exception->vector == IDT_DF)
+		return (EINVAL);
+
 	vcpu = &vm->vcpu[vcpuid];
 
 	if (vcpu->exception_pending) {
@@ -1483,25 +1689,6 @@ vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
 	return (0);
 }
 
-int
-vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *exception)
-{
-	struct vcpu *vcpu;
-	int pending;
-
-	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
-
-	vcpu = &vm->vcpu[vcpuid];
-	pending = vcpu->exception_pending;
-	if (pending) {
-		vcpu->exception_pending = 0;
-		*exception = vcpu->exception;
-		VCPU_CTR1(vm, vcpuid, "Exception %d delivered",
-		    exception->vector);
-	}
-	return (pending);
-}
-
 static void
 vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception)
 {
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
index f3e31a33df4a..a85109edaa1d 100644
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -173,6 +173,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 	struct vm_gla2gpa *gg;
 	struct vm_activate_cpu *vac;
 	struct vm_cpuset *vm_cpuset;
+	struct vm_intinfo *vmii;
 
 	sc = vmmdev_lookup2(cdev);
 	if (sc == NULL)
@@ -199,6 +200,8 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 	case VM_SET_X2APIC_STATE:
 	case VM_GLA2GPA:
 	case VM_ACTIVATE_CPU:
+	case VM_SET_INTINFO:
+	case VM_GET_INTINFO:
 		/*
 		 * XXX fragile, handle with care
 		 * Assumes that the first field of the ioctl data is the vcpu.
@@ -470,6 +473,15 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 			error = copyout(cpuset, vm_cpuset->cpus, size);
 		free(cpuset, M_TEMP);
 		break;
+	case VM_SET_INTINFO:
+		vmii = (struct vm_intinfo *)data;
+		error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1);
+		break;
+	case VM_GET_INTINFO:
+		vmii = (struct vm_intinfo *)data;
+		error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
+		    &vmii->info2);
+		break;
 	default:
 		error = ENOTTY;
 		break;
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
index 457ec513a0d4..2b95d9cf880a 100644
--- a/usr.sbin/bhyve/bhyverun.c
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -534,6 +534,8 @@ vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 		exit(1);
 	case VM_SUSPEND_HALT:
 		exit(2);
+	case VM_SUSPEND_TRIPLEFAULT:
+		exit(3);
 	default:
 		fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how);
 		exit(100);
diff --git a/usr.sbin/bhyve/task_switch.c b/usr.sbin/bhyve/task_switch.c
index b2f5bedb862f..e946807aa8e5 100644
--- a/usr.sbin/bhyve/task_switch.c
+++ b/usr.sbin/bhyve/task_switch.c
@@ -904,10 +904,14 @@ vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 	 */
 
 	/*
-	 * XXX is the original task switch was triggered by a hardware
-	 * exception then do we generate a double-fault if we encounter
-	 * an exception during the task switch?
+	 * If the task switch was triggered by an event delivered through
+	 * the IDT then extinguish the pending event from the vcpu's
+	 * exitintinfo.
 	 */
+	if (task_switch->reason == TSR_IDT_GATE) {
+		error = vm_set_intinfo(ctx, vcpu, 0);
+		assert(error == 0);
+	}
 
 	/*
 	 * XXX should inject debug exception if 'T' bit is 1
diff --git a/usr.sbin/bhyvectl/bhyvectl.c b/usr.sbin/bhyvectl/bhyvectl.c
index e77f0d77df6f..b6006b72a767 100644
--- a/usr.sbin/bhyvectl/bhyvectl.c
+++ b/usr.sbin/bhyvectl/bhyvectl.c
@@ -195,7 +195,8 @@ usage(void)
 	"       [--force-reset]\n"
 	"       [--force-poweroff]\n"
 	"       [--get-active-cpus]\n"
-	"       [--get-suspended-cpus]\n",
+	"       [--get-suspended-cpus]\n"
+	"       [--get-intinfo]\n",
 	progname);
 	exit(1);
 }
@@ -205,6 +206,7 @@ static int inject_nmi, assert_lapic_lvt;
 static int force_reset, force_poweroff;
 static const char *capname;
 static int create, destroy, get_lowmem, get_highmem;
+static int get_intinfo;
 static int get_active_cpus, get_suspended_cpus;
 static uint64_t memsize;
 static int set_cr0, get_cr0, set_cr3, get_cr3, set_cr4, get_cr4;
@@ -412,6 +414,37 @@ print_cpus(const char *banner, const cpuset_t *cpus)
 	printf("\n");
 }
 
+static void
+print_intinfo(const char *banner, uint64_t info)
+{
+	int type;
+
+	printf("%s:\t", banner);
+	if (info & VM_INTINFO_VALID) {
+		type = info & VM_INTINFO_TYPE;
+		switch (type) {
+		case VM_INTINFO_HWINTR:
+			printf("extint");
+			break;
+		case VM_INTINFO_NMI:
+			printf("nmi");
+			break;
+		case VM_INTINFO_SWINTR:
+			printf("swint");
+			break;
+		default:
+			printf("exception");
+			break;
+		}
+		printf(" vector %d", (int)VM_INTINFO_VECTOR(info));
+		if (info & VM_INTINFO_DEL_ERRCODE)
+			printf(" errcode %#x", (u_int)(info >> 32));
+	} else {
+		printf("n/a");
+	}
+	printf("\n");
+}
+
 int
 main(int argc, char *argv[])
 {
@@ -420,7 +453,7 @@ main(int argc, char *argv[])
 	vm_paddr_t gpa, gpa_pmap;
 	size_t len;
 	struct vm_exit vmexit;
-	uint64_t ctl, eptp, bm, addr, u64, pteval[4], *pte;
+	uint64_t ctl, eptp, bm, addr, u64, pteval[4], *pte, info[2];
 	struct vmctx *ctx;
 	int wired;
 	cpuset_t cpus;
@@ -595,6 +628,7 @@ main(int argc, char *argv[])
 		{ "force-poweroff", NO_ARG,	&force_poweroff, 1 },
 		{ "get-active-cpus", NO_ARG,	&get_active_cpus, 1 },
 		{ "get-suspended-cpus", NO_ARG,	&get_suspended_cpus, 1 },
+		{ "get-intinfo", NO_ARG,	&get_intinfo,	1 },
 		{ NULL,		0,		NULL,		0 }
 	};
 
@@ -1566,6 +1600,14 @@ main(int argc, char *argv[])
 			print_cpus("suspended cpus", &cpus);
 	}
 
+	if (!error && (get_intinfo || get_all)) {
+		error = vm_get_intinfo(ctx, vcpu, &info[0], &info[1]);
+		if (!error) {
+			print_intinfo("pending", info[0]);
+			print_intinfo("current", info[1]);
+		}
+	}
+
 	if (!error && run) {
 		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
 		assert(error == 0);
-- 
cgit v1.3


From d665d229cef8b8617a89e94898a4e8d770aedd34 Mon Sep 17 00:00:00 2001
From: Neel Natu <neel@FreeBSD.org>
Date: Wed, 23 Jul 2014 04:28:51 +0000
Subject: Emulate instructions emitted by OpenBSD/i386 version 5.5: - CMP REG,
 r/m - MOV AX/EAX/RAX, moffset - MOV moffset, AX/EAX/RAX - PUSH r/m

---
 lib/libvmmapi/vmmapi.c                   |  40 +++-
 lib/libvmmapi/vmmapi.h                   |   9 +-
 sys/amd64/include/vmm.h                  |  32 +++
 sys/amd64/include/vmm_instruction_emul.h |   4 +-
 sys/amd64/vmm/vmm.c                      | 119 ++++++++++-
 sys/amd64/vmm/vmm_instruction_emul.c     | 356 ++++++++++++++++++++++++++-----
 usr.sbin/bhyve/bhyverun.c                |   8 +-
 usr.sbin/bhyve/inout.c                   |  12 +-
 usr.sbin/bhyve/mem.c                     |   8 +-
 usr.sbin/bhyve/mem.h                     |   3 +-
 usr.sbin/bhyve/task_switch.c             |  15 +-
 11 files changed, 516 insertions(+), 90 deletions(-)

(limited to 'lib/libvmmapi')

diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
index 483aa5199b6c..087d0b789f11 100644
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/_iovec.h>
 #include <sys/cpuset.h>
 
+#include <x86/segments.h>
 #include <machine/specialreg.h>
 #include <machine/param.h>
 
@@ -326,6 +327,16 @@ vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
 	return (error);
 }
 
+int
+vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc)
+{
+	int error;
+
+	error = vm_get_desc(ctx, vcpu, reg, &seg_desc->base, &seg_desc->limit,
+	    &seg_desc->access);
+	return (error);
+}
+
 int
 vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
 {
@@ -988,7 +999,7 @@ gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
 #endif
 
 int
-vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
     uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt)
 {
 	uint64_t gpa;
@@ -1135,3 +1146,30 @@ vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1)
 	error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii);
 	return (error);
 }
+
+void
+vm_inject_ss(struct vmctx *ctx, int vcpu, int errcode)
+{
+	int error;
+
+	error = vm_inject_exception2(ctx, vcpu, IDT_SS, errcode);
+	assert(error == 0);
+}
+
+void
+vm_inject_ac(struct vmctx *ctx, int vcpu, int errcode)
+{
+	int error;
+
+	error = vm_inject_exception2(ctx, vcpu, IDT_AC, errcode);
+	assert(error == 0);
+}
+
+void
+vm_inject_gp(struct vmctx *ctx, int vcpu, int errcode)
+{
+	int error;
+
+	error = vm_inject_exception2(ctx, vcpu, IDT_GP, errcode);
+	assert(error == 0);
+}
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index 2040c91e205f..72d75c002676 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -66,6 +66,8 @@ int	vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
 		    uint64_t base, uint32_t limit, uint32_t access);
 int	vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
 		    uint64_t *base, uint32_t *limit, uint32_t *access);
+int	vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg,
+			struct seg_desc *seg_desc);
 int	vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val);
 int	vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval);
 int	vm_run(struct vmctx *ctx, int vcpu, uint64_t rip,
@@ -124,13 +126,18 @@ int	vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities);
  * The 'iovcnt' should be big enough to accomodate all GPA segments.
  * Returns 0 on success, 1 on a guest fault condition and -1 otherwise.
  */
-int	vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+int	vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg,
 	    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt);
 void	vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov,
 	    void *host_dst, size_t len);
 void	vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src,
 	    struct iovec *guest_iov, size_t len);
 
+/* Helper functions to inject exceptions */
+void	vm_inject_ss(struct vmctx *ctx, int vcpu, int errcode);
+void	vm_inject_ac(struct vmctx *ctx, int vcpu, int errcode);
+void	vm_inject_gp(struct vmctx *ctx, int vcpu, int errcode);
+
 /* Reset vcpu register state */
 int	vcpu_reset(struct vmctx *ctx, int vcpu);
 
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index 6895e64037e0..6f476b4afa32 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -114,6 +114,7 @@ struct vioapic;
 struct vlapic;
 struct vmspace;
 struct vm_object;
+struct vm_guest_paging;
 struct pmap;
 
 typedef int	(*vmm_init_func_t)(int ipinum);
@@ -317,10 +318,41 @@ int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2);
 
 void vm_inject_gp(struct vm *vm, int vcpuid); /* general protection fault */
 void vm_inject_ud(struct vm *vm, int vcpuid); /* undefined instruction fault */
+void vm_inject_ac(struct vm *vm, int vcpuid, int errcode); /* #AC */
+void vm_inject_ss(struct vm *vm, int vcpuid, int errcode); /* #SS */
 void vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2);
 
 enum vm_reg_name vm_segment_name(int seg_encoding);
 
+struct vm_copyinfo {
+	uint64_t	gpa;
+	size_t		len;
+	void		*hva;
+	void		*cookie;
+};
+
+/*
+ * Set up 'copyinfo[]' to copy to/from guest linear address space starting
+ * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for
+ * a copyin or PROT_WRITE for a copyout.
+ *
+ * Returns 0 on success.
+ * Returns 1 if an exception was injected into the guest.
+ * Returns -1 otherwise.
+ *
+ * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if
+ * the return value is 0. The 'copyinfo[]' resources should be freed by calling
+ * 'vm_copy_teardown()' after the copy is done.
+ */
+int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
+    uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
+    int num_copyinfo);
+void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
+    int num_copyinfo);
+void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
+    void *kaddr, size_t len);
+void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
+    struct vm_copyinfo *copyinfo, size_t len);
 #endif	/* KERNEL */
 
 #define	VM_MAXCPU	16			/* maximum virtual cpus */
diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h
index 05b60fb66ddf..bbd3d88d9cf3 100644
--- a/sys/amd64/include/vmm_instruction_emul.h
+++ b/sys/amd64/include/vmm_instruction_emul.h
@@ -52,8 +52,8 @@ typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa,
  * s
  */
 int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
-			    mem_region_read_t mrr, mem_region_write_t mrw,
-			    void *mrarg);
+    struct vm_guest_paging *paging, mem_region_read_t mrr,
+    mem_region_write_t mrw, void *mrarg);
 
 int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
     uint64_t val, int size);
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index 25042546b39e..b667b4826690 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -1235,8 +1235,8 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
 		return (0);
 	}
 
-	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite,
-	    retu);
+	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
+	    mread, mwrite, retu);
 
 	return (error);
 }
@@ -1751,6 +1751,30 @@ vm_inject_ud(struct vm *vm, int vcpuid)
 	vm_inject_fault(vm, vcpuid, &udf);
 }
 
+void
+vm_inject_ac(struct vm *vm, int vcpuid, int error_code)
+{
+	struct vm_exception acf = {
+		.vector = IDT_AC,
+		.error_code_valid = 1,
+		.error_code = error_code
+	};
+
+	vm_inject_fault(vm, vcpuid, &acf);
+}
+
+void
+vm_inject_ss(struct vm *vm, int vcpuid, int error_code)
+{
+	struct vm_exception ssf = {
+		.vector = IDT_SS,
+		.error_code_valid = 1,
+		.error_code = error_code
+	};
+
+	vm_inject_fault(vm, vcpuid, &ssf);
+}
+
 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
 
 int
@@ -2182,6 +2206,97 @@ vm_segment_name(int seg)
 	return (seg_names[seg]);
 }
 
+void
+vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
+    int num_copyinfo)
+{
+	int idx;
+
+	for (idx = 0; idx < num_copyinfo; idx++) {
+		if (copyinfo[idx].cookie != NULL)
+			vm_gpa_release(copyinfo[idx].cookie);
+	}
+	bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
+}
+
+int
+vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
+    uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
+    int num_copyinfo)
+{
+	int error, idx, nused;
+	size_t n, off, remaining;
+	void *hva, *cookie;
+	uint64_t gpa;
+
+	bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);
+
+	nused = 0;
+	remaining = len;
+	while (remaining > 0) {
+		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
+		error = vmm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa);
+		if (error)
+			return (error);
+		off = gpa & PAGE_MASK;
+		n = min(remaining, PAGE_SIZE - off);
+		copyinfo[nused].gpa = gpa;
+		copyinfo[nused].len = n;
+		remaining -= n;
+		gla += n;
+		nused++;
+	}
+
+	for (idx = 0; idx < nused; idx++) {
+		hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len,
+		    prot, &cookie);
+		if (hva == NULL)
+			break;
+		copyinfo[idx].hva = hva;
+		copyinfo[idx].cookie = cookie;
+	}
+
+	if (idx != nused) {
+		vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
+		return (-1);
+	} else {
+		return (0);
+	}
+}
+
+void
+vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
+    size_t len)
+{
+	char *dst;
+	int idx;
+	
+	dst = kaddr;
+	idx = 0;
+	while (len > 0) {
+		bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
+		len -= copyinfo[idx].len;
+		dst += copyinfo[idx].len;
+		idx++;
+	}
+}
+
+void
+vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
+    struct vm_copyinfo *copyinfo, size_t len)
+{
+	const char *src;
+	int idx;
+
+	src = kaddr;
+	idx = 0;
+	while (len > 0) {
+		bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
+		len -= copyinfo[idx].len;
+		src += copyinfo[idx].len;
+		idx++;
+	}
+}
 
 /*
  * Return the amount of in-use and wired memory for the VM. Since
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
index 7e09ccbb56c0..e8a5f7bcd4eb 100644
--- a/sys/amd64/vmm/vmm_instruction_emul.c
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
 #else	/* !_KERNEL */
 #include <sys/types.h>
 #include <sys/errno.h>
+#include <sys/_iovec.h>
 
 #include <machine/vmm.h>
 
@@ -65,6 +66,8 @@ enum {
 	VIE_OP_TYPE_AND,
 	VIE_OP_TYPE_OR,
 	VIE_OP_TYPE_TWO_BYTE,
+	VIE_OP_TYPE_PUSH,
+	VIE_OP_TYPE_CMP,
 	VIE_OP_TYPE_LAST
 };
 
@@ -72,6 +75,7 @@ enum {
 #define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
 #define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
 #define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
+#define	VIE_OP_F_NO_MODRM	(1 << 3)
 
 static const struct vie_op two_byte_opcodes[256] = {
 	[0xB6] = {
@@ -89,6 +93,10 @@ static const struct vie_op one_byte_opcodes[256] = {
 		.op_byte = 0x0F,
 		.op_type = VIE_OP_TYPE_TWO_BYTE
 	},
+	[0x3B] = {
+		.op_byte = 0x3B,
+		.op_type = VIE_OP_TYPE_CMP,
+	},
 	[0x88] = {
 		.op_byte = 0x88,
 		.op_type = VIE_OP_TYPE_MOV,
@@ -105,6 +113,16 @@ static const struct vie_op one_byte_opcodes[256] = {
 		.op_byte = 0x8B,
 		.op_type = VIE_OP_TYPE_MOV,
 	},
+	[0xA1] = {
+		.op_byte = 0xA1,
+		.op_type = VIE_OP_TYPE_MOV,
+		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
+	},
+	[0xA3] = {
+		.op_byte = 0xA3,
+		.op_type = VIE_OP_TYPE_MOV,
+		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
+	},
 	[0xC6] = {
 		/* XXX Group 11 extended opcode - not just MOV */
 		.op_byte = 0xC6,
@@ -132,6 +150,11 @@ static const struct vie_op one_byte_opcodes[256] = {
 		.op_type = VIE_OP_TYPE_OR,
 		.op_flags = VIE_OP_F_IMM8,
 	},
+	[0xFF] = {
+		/* XXX Group 5 extended opcode - not just PUSH */
+		.op_byte = 0xFF,
+		.op_type = VIE_OP_TYPE_PUSH,
+	}
 };
 
 /* struct vie.mod */
@@ -284,6 +307,53 @@ vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
 	return (error);
 }
 
+/*
+ * Return the status flags that would result from doing (x - y).
+ */
+static u_long
+getcc16(uint16_t x, uint16_t y)
+{
+	u_long rflags;
+
+	__asm __volatile("sub %1,%2; pushfq; popq %0" :
+	    "=r" (rflags) : "m" (y), "r" (x));
+	return (rflags);
+}
+
+static u_long
+getcc32(uint32_t x, uint32_t y)
+{
+	u_long rflags;
+
+	__asm __volatile("sub %1,%2; pushfq; popq %0" :
+	    "=r" (rflags) : "m" (y), "r" (x));
+	return (rflags);
+}
+
+static u_long
+getcc64(uint64_t x, uint64_t y)
+{
+	u_long rflags;
+
+	__asm __volatile("sub %1,%2; pushfq; popq %0" :
+	    "=r" (rflags) : "m" (y), "r" (x));
+	return (rflags);
+}
+
+static u_long
+getcc(int opsize, uint64_t x, uint64_t y)
+{
+	KASSERT(opsize == 2 || opsize == 4 || opsize == 8,
+	    ("getcc: invalid operand size %d", opsize));
+
+	if (opsize == 2)
+		return (getcc16(x, y));
+	else if (opsize == 4)
+		return (getcc32(x, y));
+	else
+		return (getcc64(x, y));
+}
+
 static int
 emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
@@ -346,6 +416,32 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 			error = vie_update_register(vm, vcpuid, reg, val, size);
 		}
 		break;
+	case 0xA1:
+		/*
+		 * MOV from seg:moffset to AX/EAX/RAX
+		 * A1:		mov AX, moffs16
+		 * A1:		mov EAX, moffs32
+		 * REX.W + A1:	mov RAX, moffs64
+		 */
+		error = memread(vm, vcpuid, gpa, &val, size, arg);
+		if (error == 0) {
+			reg = VM_REG_GUEST_RAX;
+			error = vie_update_register(vm, vcpuid, reg, val, size);
+		}
+		break;
+	case 0xA3:
+		/*
+		 * MOV from AX/EAX/RAX to seg:moffset
+		 * A3:		mov moffs16, AX
+		 * A3:		mov moffs32, EAX 
+		 * REX.W + A3:	mov moffs64, RAX
+		 */
+		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
+		if (error == 0) {
+			val &= size2mask[size];
+			error = memwrite(vm, vcpuid, gpa, val, size, arg);
+		}
+		break;
 	case 0xC6:
 		/*
 		 * MOV from imm8 to mem (ModRM:r/m)
@@ -553,10 +649,150 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 	return (error);
 }
 
+#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
+
+static int
+emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+	int error, size;
+	uint64_t op1, op2, rflags, rflags2;
+	enum vm_reg_name reg;
+
+	size = vie->opsize;
+	switch (vie->op.op_byte) {
+	case 0x3B:
+		/*
+		 * 3B/r		CMP r16, r/m16
+		 * 3B/r		CMP r32, r/m32
+		 * REX.W + 3B/r	CMP r64, r/m64
+		 *
+		 * Compare first operand (reg) with second operand (r/m) and
+		 * set status flags in EFLAGS register. The comparison is
+		 * performed by subtracting the second operand from the first
+		 * operand and then setting the status flags.
+		 */
+
+		/* Get the first operand */
+		reg = gpr_map[vie->reg];
+		error = vie_read_register(vm, vcpuid, reg, &op1);
+		if (error)
+			return (error);
+
+		/* Get the second operand */
+		error = memread(vm, vcpuid, gpa, &op2, size, arg);
+		if (error)
+			return (error);
+
+		break;
+	default:
+		return (EINVAL);
+	}
+	rflags2 = getcc(size, op1, op2);
+	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+	if (error)
+		return (error);
+	rflags &= ~RFLAGS_STATUS_BITS;
+	rflags |= rflags2 & RFLAGS_STATUS_BITS;
+
+	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
+	return (error);
+}
+
+static int
+emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
+    struct vm_guest_paging *paging, mem_region_read_t memread,
+    mem_region_write_t memwrite, void *arg)
+{
+#ifdef _KERNEL
+	struct vm_copyinfo copyinfo[2];
+#else
+	struct iovec copyinfo[2];
+#endif
+	struct seg_desc ss_desc;
+	uint64_t cr0, rflags, rsp, stack_gla, val;
+	int error, size, stackaddrsize;
+
+	/*
+	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
+	 *
+	 * PUSH is part of the group 5 extended opcodes and is identified
+	 * by ModRM:reg = b110.
+	 */
+	if ((vie->reg & 7) != 6)
+		return (EINVAL);
+
+	size = vie->opsize;
+	/*
+	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
+	 */
+	if (paging->cpu_mode == CPU_MODE_REAL)
+		stackaddrsize = 2;
+	else if (paging->cpu_mode == CPU_MODE_64BIT)
+		stackaddrsize = 8;
+	else {
+		/*
+		 * In protected or compatibility mode the 'B' flag in the
+		 * stack-segment descriptor determines the size of the
+		 * stack pointer.
+		 */
+		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
+		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
+		    __func__, error));
+		if (SEG_DESC_DEF32(ss_desc.access))
+			stackaddrsize = 4;
+		else
+			stackaddrsize = 2;
+	}
+
+	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
+	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
+
+	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
+
+	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
+	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
+
+	rsp -= size;
+	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
+	    rsp, size, stackaddrsize, PROT_WRITE, &stack_gla)) {
+		vm_inject_ss(vm, vcpuid, 0);
+		return (0);
+	}
+
+	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
+		vm_inject_ss(vm, vcpuid, 0);
+		return (0);
+	}
+
+	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
+		vm_inject_ac(vm, vcpuid, 0);
+		return (0);
+	}
+
+	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, PROT_WRITE,
+	    copyinfo, nitems(copyinfo));
+	if (error)
+		return (error);
+
+	error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
+	if (error == 0) {
+		vm_copyout(vm, vcpuid, &val, copyinfo, size);
+		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
+		    stackaddrsize);
+		KASSERT(error == 0, ("error %d updating rsp", error));
+	}
+#ifdef _KERNEL
+	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+#endif
+	return (error);
+}
+
 int
 vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
-			mem_region_read_t memread, mem_region_write_t memwrite,
-			void *memarg)
+    struct vm_guest_paging *paging, mem_region_read_t memread,
+    mem_region_write_t memwrite, void *memarg)
 {
 	int error;
 
@@ -564,6 +800,14 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 		return (EINVAL);
 
 	switch (vie->op.op_type) {
+	case VIE_OP_TYPE_PUSH:
+		error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
+		    memwrite, memarg);
+		break;
+	case VIE_OP_TYPE_CMP:
+		error = emulate_cmp(vm, vcpuid, gpa, vie,
+				    memread, memwrite, memarg);
+		break;
 	case VIE_OP_TYPE_MOV:
 		error = emulate_mov(vm, vcpuid, gpa, vie,
 				    memread, memwrite, memarg);
@@ -970,45 +1214,24 @@ fault:
 }
 
 int
-vmm_fetch_instruction(struct vm *vm, int cpuid, struct vm_guest_paging *paging,
+vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
     uint64_t rip, int inst_length, struct vie *vie)
 {
-	int n, error, prot;
-	uint64_t gpa, off;
-	void *hpa, *cookie;
+	struct vm_copyinfo copyinfo[2];
+	int error, prot;
 
-	/*
-	 * XXX cache previously fetched instructions using 'rip' as the tag
-	 */
-
-	prot = VM_PROT_READ | VM_PROT_EXECUTE;
 	if (inst_length > VIE_INST_SIZE)
 		panic("vmm_fetch_instruction: invalid length %d", inst_length);
 
-	/* Copy the instruction into 'vie' */
-	while (vie->num_valid < inst_length) {
-		error = vmm_gla2gpa(vm, cpuid, paging, rip, prot, &gpa);
-		if (error)
-			return (error);
-
-		off = gpa & PAGE_MASK;
-		n = min(inst_length - vie->num_valid, PAGE_SIZE - off);
-
-		if ((hpa = vm_gpa_hold(vm, gpa, n, prot, &cookie)) == NULL)
-			break;
-
-		bcopy(hpa, &vie->inst[vie->num_valid], n);
-
-		vm_gpa_release(cookie);
-
-		rip += n;
-		vie->num_valid += n;
+	prot = PROT_READ | PROT_EXEC;
+	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
+	    copyinfo, nitems(copyinfo));
+	if (error == 0) {
+		vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
+		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+		vie->num_valid = inst_length;
 	}
-
-	if (vie->num_valid == inst_length)
-		return (0);
-	else
-		return (-1);
+	return (error);
 }
 
 static int
@@ -1138,6 +1361,9 @@ decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
 	if (cpu_mode == CPU_MODE_REAL)
 		return (-1);
 
+	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
+		return (0);
+
 	if (vie_peek(vie, &x))
 		return (-1);
 
@@ -1314,24 +1540,14 @@ decode_immediate(struct vie *vie)
 	int i, n;
 	uint8_t x;
 	union {
-		char	buf[8];
+		char	buf[4];
 		int8_t	signed8;
 		int16_t	signed16;
 		int32_t	signed32;
-		int64_t	signed64;
 	} u;
 
 	/* Figure out immediate operand size (if any) */
-	if (vie->op.op_flags & VIE_OP_F_MOFFSET) {
-		/*
-		 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
-		 * The memory offset size follows the address-size of the
-		 * instruction. Although this is treated as an immediate
-		 * value during instruction decoding it is interpreted as
-		 * a segment offset by the instruction emulation.
-		 */
-		vie->imm_bytes = vie->addrsize;
-	} else if (vie->op.op_flags & VIE_OP_F_IMM) {
+	if (vie->op.op_flags & VIE_OP_F_IMM) {
 		/*
 		 * Section 2.2.1.5 "Immediates", Intel SDM:
 		 * In 64-bit mode the typical size of immediate operands
@@ -1350,7 +1566,7 @@ decode_immediate(struct vie *vie)
 	if ((n = vie->imm_bytes) == 0)
 		return (0);
 
-	KASSERT(n == 1 || n == 2 || n == 4 || n == 8,
+	KASSERT(n == 1 || n == 2 || n == 4,
 	    ("%s: invalid number of immediate bytes: %d", __func__, n));
 
 	for (i = 0; i < n; i++) {
@@ -1366,20 +1582,41 @@ decode_immediate(struct vie *vie)
 		vie->immediate = u.signed8;
 	else if (n == 2)
 		vie->immediate = u.signed16;
-	else if (n == 4)
-		vie->immediate = u.signed32;
 	else
-		vie->immediate = u.signed64;
+		vie->immediate = u.signed32;
 
+	return (0);
+}
 
-	if (vie->op.op_flags & VIE_OP_F_MOFFSET) {
-		/*
-		 * If the immediate value is going to be interpreted as a
-		 * segment offset then undo the sign-extension above.
-		 */
-		vie->immediate &= size2mask[n];
-	}
+static int
+decode_moffset(struct vie *vie)
+{
+	int i, n;
+	uint8_t x;
+	union {
+		char	buf[8];
+		uint64_t u64;
+	} u;
+
+	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
+		return (0);
 
+	/*
+	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
+	 * The memory offset size follows the address-size of the instruction.
+	 */
+	n = vie->addrsize;
+	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
+
+	u.u64 = 0;
+	for (i = 0; i < n; i++) {
+		if (vie_peek(vie, &x))
+			return (-1);
+
+		u.buf[i] = x;
+		vie_advance(vie);
+	}
+	vie->displacement = u.u64;
 	return (0);
 }
 
@@ -1470,10 +1707,13 @@ vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
 
 	if (decode_displacement(vie))
 		return (-1);
-	
+
 	if (decode_immediate(vie))
 		return (-1);
 
+	if (decode_moffset(vie))
+		return (-1);
+
 	if (verify_inst_length(vie))
 		return (-1);
 
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
index 2b95d9cf880a..26c6e5378192 100644
--- a/usr.sbin/bhyve/bhyverun.c
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -347,8 +347,7 @@ vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
 		fprintf(stderr, "rdmsr to register %#x on vcpu %d\n",
 		    vme->u.msr.code, *pvcpu);
 		if (strictmsr) {
-			error = vm_inject_exception2(ctx, *pvcpu, IDT_GP, 0);
-			assert(error == 0);
+			vm_inject_gp(ctx, *pvcpu, 0);
 			return (VMEXIT_RESTART);
 		}
 	}
@@ -374,8 +373,7 @@ vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
 		fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",
 		    vme->u.msr.code, vme->u.msr.wval, *pvcpu);
 		if (strictmsr) {
-			error = vm_inject_exception2(ctx, *pvcpu, IDT_GP, 0);
-			assert(error == 0);
+			vm_inject_gp(ctx, *pvcpu, 0);
 			return (VMEXIT_RESTART);
 		}
 	}
@@ -484,7 +482,7 @@ vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 	stats.vmexit_inst_emul++;
 
 	err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa,
-			  &vmexit->u.inst_emul.vie);
+	    &vmexit->u.inst_emul.vie, &vmexit->u.inst_emul.paging);
 
 	if (err) {
 		if (err == EINVAL) {
diff --git a/usr.sbin/bhyve/inout.c b/usr.sbin/bhyve/inout.c
index fe9e0d85625b..145ac1cbcba8 100644
--- a/usr.sbin/bhyve/inout.c
+++ b/usr.sbin/bhyve/inout.c
@@ -157,15 +157,13 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
 			if (vie_calculate_gla(vis->paging.cpu_mode,
 			    vis->seg_name, &vis->seg_desc, index, bytes,
 			    addrsize, prot, &gla)) {
-				error = vm_inject_exception2(ctx, vcpu,
-				    IDT_GP, 0);
-				assert(error == 0);
+				vm_inject_gp(ctx, vcpu, 0);
 				retval = INOUT_RESTART;
 				break;
 			}
 
-			error = vm_gla2gpa(ctx, vcpu, &vis->paging, gla, bytes,
-			    prot, iov, nitems(iov));
+			error = vm_copy_setup(ctx, vcpu, &vis->paging, gla,
+			    bytes, prot, iov, nitems(iov));
 			assert(error == 0 || error == 1 || error == -1);
 			if (error) {
 				retval = (error == 1) ? INOUT_RESTART :
@@ -175,9 +173,7 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
 
 			if (vie_alignment_check(vis->paging.cpl, bytes,
 			    vis->cr0, vis->rflags, gla)) {
-				error = vm_inject_exception2(ctx, vcpu,
-				    IDT_AC, 0);
-				assert(error == 0);
+				vm_inject_ac(ctx, vcpu, 0);
 				return (INOUT_RESTART);
 			}
 
diff --git a/usr.sbin/bhyve/mem.c b/usr.sbin/bhyve/mem.c
index 7ea630f2a587..37cf055f2ccf 100644
--- a/usr.sbin/bhyve/mem.c
+++ b/usr.sbin/bhyve/mem.c
@@ -157,7 +157,9 @@ mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg)
 }
 
 int
-emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie)
+emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie,
+    struct vm_guest_paging *paging)
+
 {
 	struct mmio_rb_range *entry;
 	int err;
@@ -184,10 +186,10 @@ emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie)
 	}
 
 	assert(entry != NULL);
-	err = vmm_emulate_instruction(ctx, vcpu, paddr, vie,
+	err = vmm_emulate_instruction(ctx, vcpu, paddr, vie, paging,
 				      mem_read, mem_write, &entry->mr_param);
 	pthread_rwlock_unlock(&mmio_rwlock);
-	
+
 	return (err);
 }
 
diff --git a/usr.sbin/bhyve/mem.h b/usr.sbin/bhyve/mem.h
index 264bff9e82b0..eb648c145df6 100644
--- a/usr.sbin/bhyve/mem.h
+++ b/usr.sbin/bhyve/mem.h
@@ -50,7 +50,8 @@ struct mem_range {
 #define	MEM_F_RW		0x3
 
 void	init_mem(void);
-int     emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie);
+int     emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie,
+		    struct vm_guest_paging *paging);
 		    
 int	register_mem(struct mem_range *memp);
 int	register_mem_fallback(struct mem_range *memp);
diff --git a/usr.sbin/bhyve/task_switch.c b/usr.sbin/bhyve/task_switch.c
index e946807aa8e5..64339827ea25 100644
--- a/usr.sbin/bhyve/task_switch.c
+++ b/usr.sbin/bhyve/task_switch.c
@@ -214,7 +214,7 @@ desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
 	assert(error == 0);
 	assert(limit >= SEL_LIMIT(sel));
 
-	error = vm_gla2gpa(ctx, vcpu, paging, base + SEL_START(sel),
+	error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel),
 	    sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov));
 	if (error == 0) {
 		if (doread)
@@ -508,9 +508,7 @@ tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
 				 */
 				reserved = ~maxphyaddr | 0x1E6;
 				if (pdpte[i] & reserved) {
-					error = vm_inject_exception2(ctx, vcpu,
-					    IDT_GP, 0);
-					assert(error == 0);
+					vm_inject_gp(ctx, vcpu, 0);
 					return (VMEXIT_RESTART);
 				}
 			}
@@ -649,12 +647,11 @@ push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
 	}
 
 	if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
-		error = vm_inject_exception2(ctx, vcpu, IDT_AC, 1);
-		assert(error == 0);
+		vm_inject_ac(ctx, vcpu, 1);
 		return (VMEXIT_RESTART);
 	}
 
-	error = vm_gla2gpa(ctx, vcpu, paging, gla, bytes, PROT_WRITE,
+	error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE,
 	    iov, nitems(iov));
 	assert(error == 0 || error == 1 || error == -1);
 	if (error) {
@@ -753,7 +750,7 @@ vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 	}
 
 	/* Fetch the new TSS */
-	error = vm_gla2gpa(ctx, vcpu, &sup_paging, nt.base, minlimit + 1,
+	error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1,
 	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov));
 	if (error == 1) {
 		/* Restart vcpu execution to handle the page fault */
@@ -793,7 +790,7 @@ vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 		return (error);
 
 	/* Get the old TSS */
-	error = vm_gla2gpa(ctx, vcpu, &sup_paging, ot_base, minlimit + 1,
+	error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1,
 	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov));
 	if (error == 1) {
 		/* Restart vcpu execution to handle the page fault */
-- 
cgit v1.3


From d37f2adb383c75848d30b1eb5204a1d1d6190373 Mon Sep 17 00:00:00 2001
From: Neel Natu <neel@FreeBSD.org>
Date: Thu, 24 Jul 2014 01:38:11 +0000
Subject: Fix fault injection in bhyve.

The faulting instruction needs to be restarted when the exception handler
is done handling the fault. bhyve now does this correctly by setting
'vmexit[vcpu].inst_length' to zero so the %rip is not advanced.

A minor complication is that the fault injection APIs are used by instruction
emulation code that is shared by vmm.ko and bhyve. Thus the argument that
refers to 'struct vm *' in kernel or 'struct vmctx *' in userspace needs to
be loosely typed as a 'void *'.
---
 lib/libvmmapi/vmmapi.c       | 27 -----------------
 lib/libvmmapi/vmmapi.h       |  5 ---
 sys/amd64/include/vmm.h      | 38 +++++++++++++++++++----
 sys/amd64/vmm/vmm.c          | 72 +++++++++-----------------------------------
 usr.sbin/bhyve/bhyverun.c    | 27 +++++++++++++++--
 usr.sbin/bhyve/inout.c       |  2 +-
 usr.sbin/bhyve/task_switch.c |  7 ++---
 7 files changed, 74 insertions(+), 104 deletions(-)

(limited to 'lib/libvmmapi')

diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
index 087d0b789f11..93955c7c233e 100644
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -1146,30 +1146,3 @@ vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1)
 	error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii);
 	return (error);
 }
-
-void
-vm_inject_ss(struct vmctx *ctx, int vcpu, int errcode)
-{
-	int error;
-
-	error = vm_inject_exception2(ctx, vcpu, IDT_SS, errcode);
-	assert(error == 0);
-}
-
-void
-vm_inject_ac(struct vmctx *ctx, int vcpu, int errcode)
-{
-	int error;
-
-	error = vm_inject_exception2(ctx, vcpu, IDT_AC, errcode);
-	assert(error == 0);
-}
-
-void
-vm_inject_gp(struct vmctx *ctx, int vcpu, int errcode)
-{
-	int error;
-
-	error = vm_inject_exception2(ctx, vcpu, IDT_GP, errcode);
-	assert(error == 0);
-}
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index 72d75c002676..fbb6ddd3acfb 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -133,11 +133,6 @@ void	vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov,
 void	vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src,
 	    struct iovec *guest_iov, size_t len);
 
-/* Helper functions to inject exceptions */
-void	vm_inject_ss(struct vmctx *ctx, int vcpu, int errcode);
-void	vm_inject_ac(struct vmctx *ctx, int vcpu, int errcode);
-void	vm_inject_gp(struct vmctx *ctx, int vcpu, int errcode);
-
 /* Reset vcpu register state */
 int	vcpu_reset(struct vmctx *ctx, int vcpu);
 
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index 6f476b4afa32..62af24093b28 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -29,6 +29,8 @@
 #ifndef _VMM_H_
 #define	_VMM_H_
 
+#include <x86/segments.h>
+
 enum vm_suspend_how {
 	VM_SUSPEND_NONE,
 	VM_SUSPEND_RESET,
@@ -316,12 +318,6 @@ int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info);
 
 int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2);
 
-void vm_inject_gp(struct vm *vm, int vcpuid); /* general protection fault */
-void vm_inject_ud(struct vm *vm, int vcpuid); /* undefined instruction fault */
-void vm_inject_ac(struct vm *vm, int vcpuid, int errcode); /* #AC */
-void vm_inject_ss(struct vm *vm, int vcpuid, int errcode); /* #SS */
-void vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2);
-
 enum vm_reg_name vm_segment_name(int seg_encoding);
 
 struct vm_copyinfo {
@@ -579,4 +575,34 @@ struct vm_exit {
 	} u;
 };
 
+/* APIs to inject faults into the guest */
+void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid,
+    int errcode);
+
+static void __inline
+vm_inject_ud(void *vm, int vcpuid)
+{
+	vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
+}
+
+static void __inline
+vm_inject_gp(void *vm, int vcpuid)
+{
+	vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
+}
+
+static void __inline
+vm_inject_ac(void *vm, int vcpuid, int errcode)
+{
+	vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
+}
+
+static void __inline
+vm_inject_ss(void *vm, int vcpuid, int errcode)
+{
+	vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
+}
+
+void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2);
+
 #endif	/* _VMM_H_ */
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index b667b4826690..78aefc4b3443 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -1689,13 +1689,21 @@ vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
 	return (0);
 }
 
-static void
-vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception)
+void
+vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
+    int errcode)
 {
+	struct vm_exception exception;
 	struct vm_exit *vmexit;
+	struct vm *vm;
 	int error;
 
-	error = vm_inject_exception(vm, vcpuid, exception);
+	vm = vmarg;
+
+	exception.vector = vector;
+	exception.error_code = errcode;
+	exception.error_code_valid = errcode_valid;
+	error = vm_inject_exception(vm, vcpuid, &exception);
 	KASSERT(error == 0, ("vm_inject_exception error %d", error));
 
 	/*
@@ -1710,69 +1718,19 @@ vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception)
 }
 
 void
-vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2)
+vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)
 {
-	struct vm_exception pf = {
-		.vector = IDT_PF,
-		.error_code_valid = 1,
-		.error_code = error_code
-	};
+	struct vm *vm;
 	int error;
 
+	vm = vmarg;
 	VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
 	    error_code, cr2);
 
 	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
 	KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
 
-	vm_inject_fault(vm, vcpuid, &pf);
-}
-
-void
-vm_inject_gp(struct vm *vm, int vcpuid)
-{
-	struct vm_exception gpf = {
-		.vector = IDT_GP,
-		.error_code_valid = 1,
-		.error_code = 0
-	};
-
-	vm_inject_fault(vm, vcpuid, &gpf);
-}
-
-void
-vm_inject_ud(struct vm *vm, int vcpuid)
-{
-	struct vm_exception udf = {
-		.vector = IDT_UD,
-		.error_code_valid = 0
-	};
-
-	vm_inject_fault(vm, vcpuid, &udf);
-}
-
-void
-vm_inject_ac(struct vm *vm, int vcpuid, int error_code)
-{
-	struct vm_exception acf = {
-		.vector = IDT_AC,
-		.error_code_valid = 1,
-		.error_code = error_code
-	};
-
-	vm_inject_fault(vm, vcpuid, &acf);
-}
-
-void
-vm_inject_ss(struct vm *vm, int vcpuid, int error_code)
-{
-	struct vm_exception ssf = {
-		.vector = IDT_SS,
-		.error_code_valid = 1,
-		.error_code = error_code
-	};
-
-	vm_inject_fault(vm, vcpuid, &ssf);
+	vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
 }
 
 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
index 26c6e5378192..e3d5994dce32 100644
--- a/usr.sbin/bhyve/bhyverun.c
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -96,7 +96,7 @@ static cpuset_t cpumask;
 
 static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
 
-struct vm_exit vmexit[VM_MAXCPU];
+static struct vm_exit vmexit[VM_MAXCPU];
 
 struct bhyvestats {
         uint64_t        vmexit_bogus;
@@ -182,6 +182,27 @@ pincpu_parse(const char *opt)
 	return (0);
 }
 
+void
+vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid,
+    int errcode)
+{
+	struct vmctx *ctx;
+	int error;
+
+	ctx = arg;
+	if (errcode_valid)
+		error = vm_inject_exception2(ctx, vcpu, vector, errcode);
+	else
+		error = vm_inject_exception(ctx, vcpu, vector);
+	assert(error == 0);
+
+	/*
+	 * Set the instruction length to 0 to ensure that the instruction is
+	 * restarted when the fault handler returns.
+	 */
+	vmexit[vcpu].inst_length = 0;
+}
+
 void *
 paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
 {
@@ -347,7 +368,7 @@ vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
 		fprintf(stderr, "rdmsr to register %#x on vcpu %d\n",
 		    vme->u.msr.code, *pvcpu);
 		if (strictmsr) {
-			vm_inject_gp(ctx, *pvcpu, 0);
+			vm_inject_gp(ctx, *pvcpu);
 			return (VMEXIT_RESTART);
 		}
 	}
@@ -373,7 +394,7 @@ vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
 		fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",
 		    vme->u.msr.code, vme->u.msr.wval, *pvcpu);
 		if (strictmsr) {
-			vm_inject_gp(ctx, *pvcpu, 0);
+			vm_inject_gp(ctx, *pvcpu);
 			return (VMEXIT_RESTART);
 		}
 	}
diff --git a/usr.sbin/bhyve/inout.c b/usr.sbin/bhyve/inout.c
index 145ac1cbcba8..447f6c55fde7 100644
--- a/usr.sbin/bhyve/inout.c
+++ b/usr.sbin/bhyve/inout.c
@@ -157,7 +157,7 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
 			if (vie_calculate_gla(vis->paging.cpu_mode,
 			    vis->seg_name, &vis->seg_desc, index, bytes,
 			    addrsize, prot, &gla)) {
-				vm_inject_gp(ctx, vcpu, 0);
+				vm_inject_gp(ctx, vcpu);
 				retval = INOUT_RESTART;
 				break;
 			}
diff --git a/usr.sbin/bhyve/task_switch.c b/usr.sbin/bhyve/task_switch.c
index 64339827ea25..543c01f4536e 100644
--- a/usr.sbin/bhyve/task_switch.c
+++ b/usr.sbin/bhyve/task_switch.c
@@ -160,8 +160,6 @@ usd_to_seg_desc(struct user_segment_descriptor *usd)
 static void
 sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext)
 {
-	int error;
-
 	/*
 	 * Bit 2 from the selector is retained as-is in the error code.
 	 *
@@ -174,8 +172,7 @@ sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext)
 	sel &= ~0x3;
 	if (ext)
 		sel |= 0x1;
-	error = vm_inject_exception2(ctx, vcpu, vector, sel);
-	assert(error == 0);
+	vm_inject_fault(ctx, vcpu, vector, 1, sel);
 }
 
 static int
@@ -508,7 +505,7 @@ tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
 				 */
 				reserved = ~maxphyaddr | 0x1E6;
 				if (pdpte[i] & reserved) {
-					vm_inject_gp(ctx, vcpu, 0);
+					vm_inject_gp(ctx, vcpu);
 					return (VMEXIT_RESTART);
 				}
 			}
-- 
cgit v1.3