| author | Peter Grehan <grehan@FreeBSD.org> | 2014-08-19 01:20:24 +0000 |
|---|---|---|
| committer | Peter Grehan <grehan@FreeBSD.org> | 2014-08-19 01:20:24 +0000 |
| commit | a77e87976e47f8352dc8eddeb5db1e905b54e82a | |
| tree | 6aaf5296bf1aa8632ab9beb5fe4a61727ef7ab68 | |
| parent | 93d34b74609b771e9a048429d275715c3831e44c | |
41 files changed, 3057 insertions, 594 deletions
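The userland portion of the patch below adds vm_get_seg_desc(), renames the libvmmapi vm_gla2gpa() helper to vm_copy_setup(), and exposes the new VM_GET_INTINFO/VM_SET_INTINFO ioctls through vm_get_intinfo()/vm_set_intinfo(). The following is a minimal, hypothetical sketch (not part of the commit) of how a bhyve-style consumer might call the new entry points; the helper names dump_pending_event() and read_guest_linear(), and the iovec count of 4, are illustrative assumptions, and the vmctx is presumed to come from vm_open().

```c
#include <sys/param.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <machine/vmm.h>

#include <stdio.h>
#include <vmmapi.h>

/* Hypothetical helper: print the event state exposed by VM_GET_INTINFO. */
static void
dump_pending_event(struct vmctx *ctx, int vcpu)
{
	uint64_t info1, info2;

	if (vm_get_intinfo(ctx, vcpu, &info1, &info2) == 0)
		printf("vcpu %d: exitintinfo %#lx, pending exception %#lx\n",
		    vcpu, info1, info2);
}

/* Hypothetical helper: copy 'len' bytes from guest linear address 'gla'. */
static int
read_guest_linear(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint64_t gla, void *buf, size_t len)
{
	struct iovec iov[4];
	int error;

	error = vm_copy_setup(ctx, vcpu, paging, gla, len, PROT_READ,
	    iov, nitems(iov));
	if (error == 0)
		vm_copyin(ctx, vcpu, iov, buf, len);
	return (error);
}
```

Per the comment in vmmapi.h, vm_copy_setup() returns 0 on success, 1 when a fault was injected into the guest, and -1 otherwise, so the iovec array should only be handed to vm_copyin()/vm_copyout() in the 0 case.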
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c index 9fb2308731e7..93955c7c233e 100644 --- a/lib/libvmmapi/vmmapi.c +++ b/lib/libvmmapi/vmmapi.c @@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$");  #include <sys/_iovec.h>  #include <sys/cpuset.h> +#include <x86/segments.h>  #include <machine/specialreg.h>  #include <machine/param.h> @@ -327,6 +328,16 @@ vm_get_desc(struct vmctx *ctx, int vcpu, int reg,  }  int +vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc) +{ +	int error; + +	error = vm_get_desc(ctx, vcpu, reg, &seg_desc->base, &seg_desc->limit, +	    &seg_desc->access); +	return (error); +} + +int  vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val)  {  	int error; @@ -988,7 +999,7 @@ gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,  #endif  int -vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, +vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,      uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt)  {  	uint64_t gpa; @@ -1106,3 +1117,32 @@ vm_activate_cpu(struct vmctx *ctx, int vcpu)  	error = ioctl(ctx->fd, VM_ACTIVATE_CPU, &ac);  	return (error);  } + +int +vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *info1, uint64_t *info2) +{ +	struct vm_intinfo vmii; +	int error; + +	bzero(&vmii, sizeof(struct vm_intinfo)); +	vmii.vcpuid = vcpu; +	error = ioctl(ctx->fd, VM_GET_INTINFO, &vmii); +	if (error == 0) { +		*info1 = vmii.info1; +		*info2 = vmii.info2; +	} +	return (error); +} + +int +vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1) +{ +	struct vm_intinfo vmii; +	int error; + +	bzero(&vmii, sizeof(struct vm_intinfo)); +	vmii.vcpuid = vcpu; +	vmii.info1 = info1; +	error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii); +	return (error); +} diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h index 067eaa0aa26c..fbb6ddd3acfb 100644 --- a/lib/libvmmapi/vmmapi.h +++ b/lib/libvmmapi/vmmapi.h @@ -66,6 +66,8 @@ int	vm_set_desc(struct vmctx *ctx, int vcpu, int reg,  		    uint64_t base, uint32_t limit, uint32_t access);  int	vm_get_desc(struct vmctx *ctx, int vcpu, int reg,  		    uint64_t *base, uint32_t *limit, uint32_t *access); +int	vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, +			struct seg_desc *seg_desc);  int	vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val);  int	vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval);  int	vm_run(struct vmctx *ctx, int vcpu, uint64_t rip, @@ -104,6 +106,9 @@ int	vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot,  	    int func, int idx, uint64_t addr, uint64_t msg,  	    uint32_t vector_control); +int	vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *i1, uint64_t *i2); +int	vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t exit_intinfo); +  /*   * Return a pointer to the statistics buffer. Note that this is not MT-safe.   */ @@ -121,7 +126,7 @@ int	vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities);   * The 'iovcnt' should be big enough to accomodate all GPA segments.   * Returns 0 on success, 1 on a guest fault condition and -1 otherwise.   
*/ -int	vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, +int	vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg,  	    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt);  void	vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov,  	    void *host_dst, size_t len); diff --git a/sys/amd64/amd64/identcpu.c b/sys/amd64/amd64/identcpu.c index 957120cb9cb2..74be82cfba28 100644 --- a/sys/amd64/amd64/identcpu.c +++ b/sys/amd64/amd64/identcpu.c @@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$");  #include <machine/specialreg.h>  #include <machine/md_var.h> +#include <amd64/vmm/intel/vmx_controls.h>  #include <x86/isa/icu.h>  /* XXX - should be in header file: */ @@ -73,6 +74,7 @@ static u_int find_cpu_vendor_id(void);  static void print_AMD_info(void);  static void print_AMD_assoc(int i);  static void print_via_padlock_info(void); +static void print_vmx_info(void);  int	cpu_class;  char machine[] = "amd64"; @@ -428,6 +430,9 @@ printcpuinfo(void)  			if (via_feature_rng != 0 || via_feature_xcrypt != 0)  				print_via_padlock_info(); +			if (cpu_feature2 & CPUID2_VMX) +				print_vmx_info(); +  			if ((cpu_feature & CPUID_HTT) &&  			    cpu_vendor_id == CPU_VENDOR_AMD)  				cpu_feature &= ~CPUID_HTT; @@ -722,3 +727,197 @@ print_via_padlock_info(void)  	"\015RSA"		/* PMM */  	);  } + +static uint32_t +vmx_settable(uint64_t basic, int msr, int true_msr) +{ +	uint64_t val; + +	if (basic & (1UL << 55)) +		val = rdmsr(true_msr); +	else +		val = rdmsr(msr); + +	/* Just report the controls that can be set to 1. */ +	return (val >> 32); +} + +static void +print_vmx_info(void) +{ +	uint64_t basic, msr; +	uint32_t entry, exit, mask, pin, proc, proc2; +	int comma; + +	printf("\n  VT-x: "); +	msr = rdmsr(MSR_IA32_FEATURE_CONTROL); +	if (!(msr & IA32_FEATURE_CONTROL_VMX_EN)) +		printf("(disabled in BIOS) "); +	basic = rdmsr(MSR_VMX_BASIC); +	pin = vmx_settable(basic, MSR_VMX_PINBASED_CTLS, +	    MSR_VMX_TRUE_PINBASED_CTLS); +	proc = vmx_settable(basic, MSR_VMX_PROCBASED_CTLS, +	    MSR_VMX_TRUE_PROCBASED_CTLS); +	if (proc & PROCBASED_SECONDARY_CONTROLS) +		proc2 = vmx_settable(basic, MSR_VMX_PROCBASED_CTLS2, +		    MSR_VMX_PROCBASED_CTLS2); +	else +		proc2 = 0; +	exit = vmx_settable(basic, MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS); +	entry = vmx_settable(basic, MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS); + +	if (!bootverbose) { +		comma = 0; +		if (exit & VM_EXIT_SAVE_PAT && exit & VM_EXIT_LOAD_PAT && +		    entry & VM_ENTRY_LOAD_PAT) { +			printf("%sPAT", comma ? "," : ""); +			comma = 1; +		} +		if (proc & PROCBASED_HLT_EXITING) { +			printf("%sHLT", comma ? "," : ""); +			comma = 1; +		} +		if (proc & PROCBASED_MTF) { +			printf("%sMTF", comma ? "," : ""); +			comma = 1; +		} +		if (proc & PROCBASED_PAUSE_EXITING) { +			printf("%sPAUSE", comma ? "," : ""); +			comma = 1; +		} +		if (proc2 & PROCBASED2_ENABLE_EPT) { +			printf("%sEPT", comma ? "," : ""); +			comma = 1; +		} +		if (proc2 & PROCBASED2_UNRESTRICTED_GUEST) { +			printf("%sUG", comma ? "," : ""); +			comma = 1; +		} +		if (proc2 & PROCBASED2_ENABLE_VPID) { +			printf("%sVPID", comma ? "," : ""); +			comma = 1; +		} +		if (proc & PROCBASED_USE_TPR_SHADOW && +		    proc2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES && +		    proc2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE && +		    proc2 & PROCBASED2_APIC_REGISTER_VIRTUALIZATION && +		    proc2 & PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY) { +			printf("%sVID", comma ? 
"," : ""); +			comma = 1; +			if (pin & PINBASED_POSTED_INTERRUPT) +				printf(",PostIntr"); +		} +		return; +	} + +	mask = basic >> 32; +	printf("Basic Features=0x%b", mask, +	"\020" +	"\02132PA"		/* 32-bit physical addresses */ +	"\022SMM"		/* SMM dual-monitor */ +	"\027INS/OUTS"		/* VM-exit info for INS and OUTS */ +	"\030TRUE"		/* TRUE_CTLS MSRs */ +	); +	printf("\n        Pin-Based Controls=0x%b", pin, +	"\020" +	"\001ExtINT"		/* External-interrupt exiting */ +	"\004NMI"		/* NMI exiting */ +	"\006VNMI"		/* Virtual NMIs */ +	"\007PreTmr"		/* Activate VMX-preemption timer */ +	"\010PostIntr"		/* Process posted interrupts */ +	); +	printf("\n        Primary Processor Controls=0x%b", proc, +	"\020" +	"\003INTWIN"		/* Interrupt-window exiting */ +	"\004TSCOff"		/* Use TSC offsetting */ +	"\010HLT"		/* HLT exiting */ +	"\012INVLPG"		/* INVLPG exiting */ +	"\013MWAIT"		/* MWAIT exiting */ +	"\014RDPMC"		/* RDPMC exiting */ +	"\015RDTSC"		/* RDTSC exiting */ +	"\020CR3-LD"		/* CR3-load exiting */ +	"\021CR3-ST"		/* CR3-store exiting */ +	"\024CR8-LD"		/* CR8-load exiting */ +	"\025CR8-ST"		/* CR8-store exiting */ +	"\026TPR"		/* Use TPR shadow */ +	"\027NMIWIN"		/* NMI-window exiting */ +	"\030MOV-DR"		/* MOV-DR exiting */ +	"\031IO"		/* Unconditional I/O exiting */ +	"\032IOmap"		/* Use I/O bitmaps */ +	"\034MTF"		/* Monitor trap flag */ +	"\035MSRmap"		/* Use MSR bitmaps */ +	"\036MONITOR"		/* MONITOR exiting */ +	"\037PAUSE"		/* PAUSE exiting */ +	); +	if (proc & PROCBASED_SECONDARY_CONTROLS) +		printf("\n        Secondary Processor Controls=0x%b", proc2, +		"\020" +		"\001APIC"		/* Virtualize APIC accesses */ +		"\002EPT"		/* Enable EPT */ +		"\003DT"		/* Descriptor-table exiting */ +		"\004RDTSCP"		/* Enable RDTSCP */ +		"\005x2APIC"		/* Virtualize x2APIC mode */ +		"\006VPID"		/* Enable VPID */ +		"\007WBINVD"		/* WBINVD exiting */ +		"\010UG"		/* Unrestricted guest */ +		"\011APIC-reg"		/* APIC-register virtualization */ +		"\012VID"		/* Virtual-interrupt delivery */ +		"\013PAUSE-loop"	/* PAUSE-loop exiting */ +		"\014RDRAND"		/* RDRAND exiting */ +		"\015INVPCID"		/* Enable INVPCID */ +		"\016VMFUNC"		/* Enable VM functions */ +		"\017VMCS"		/* VMCS shadowing */ +		"\020EPT#VE"		/* EPT-violation #VE */ +		"\021XSAVES"		/* Enable XSAVES/XRSTORS */ +		); +	printf("\n        Exit Controls=0x%b", mask, +	"\020" +	"\003DR"		/* Save debug controls */ +				/* Ignore Host address-space size */ +	"\015PERF"		/* Load MSR_PERF_GLOBAL_CTRL */ +	"\020AckInt"		/* Acknowledge interrupt on exit */ +	"\023PAT-SV"		/* Save MSR_PAT */ +	"\024PAT-LD"		/* Load MSR_PAT */ +	"\025EFER-SV"		/* Save MSR_EFER */ +	"\026EFER-LD"		/* Load MSR_EFER */ +	"\027PTMR-SV"		/* Save VMX-preemption timer value */ +	); +	printf("\n        Entry Controls=0x%b", mask, +	"\020" +	"\003DR"		/* Save debug controls */ +				/* Ignore IA-32e mode guest */ +				/* Ignore Entry to SMM */ +				/* Ignore Deactivate dual-monitor treatment */ +	"\016PERF"		/* Load MSR_PERF_GLOBAL_CTRL */ +	"\017PAT"		/* Load MSR_PAT */ +	"\020EFER"		/* Load MSR_EFER */ +	); +	if (proc & PROCBASED_SECONDARY_CONTROLS && +	    (proc2 & (PROCBASED2_ENABLE_EPT | PROCBASED2_ENABLE_VPID)) != 0) { +		msr = rdmsr(MSR_VMX_EPT_VPID_CAP); +		mask = msr; +		printf("\n        EPT Features=0x%b", mask, +		"\020" +		"\001XO"		/* Execute-only translations */ +		"\007PW4"		/* Page-walk length of 4 */ +		"\011UC"		/* EPT paging-structure mem can be UC */ +		"\017WB"		/* EPT paging-structure mem can be WB */ +		"\0212M"		/* EPT PDE can map a 2-Mbyte page */ +		"\0221G"		
/* EPT PDPTE can map a 1-Gbyte page */ +		"\025INVEPT"		/* INVEPT is supported */ +		"\026AD"		/* Accessed and dirty flags for EPT */ +		"\032single"		/* INVEPT single-context type */ +		"\033all"		/* INVEPT all-context type */ +		); +		mask = msr >> 32; +		printf("\n        VPID Features=0x%b", mask, +		"\020" +		"\001INVVPID"		/* INVVPID is supported */ +		"\011individual"	/* INVVPID individual-address type */ +		"\012single"		/* INVVPID single-context type */ +		"\013all"		/* INVVPID all-context type */ +		 /* INVVPID single-context-retaining-globals type */ +		"\014single-globals"	 +		); +	} +} diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 5a359e95993c..63a9b3fdde0f 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -29,11 +29,14 @@  #ifndef _VMM_H_  #define	_VMM_H_ +#include <x86/segments.h> +  enum vm_suspend_how {  	VM_SUSPEND_NONE,  	VM_SUSPEND_RESET,  	VM_SUSPEND_POWEROFF,  	VM_SUSPEND_HALT, +	VM_SUSPEND_TRIPLEFAULT,  	VM_SUSPEND_LAST  }; @@ -75,6 +78,10 @@ enum vm_reg_name {  	VM_REG_GUEST_GDTR,  	VM_REG_GUEST_EFER,  	VM_REG_GUEST_CR2, +	VM_REG_GUEST_PDPTE0, +	VM_REG_GUEST_PDPTE1, +	VM_REG_GUEST_PDPTE2, +	VM_REG_GUEST_PDPTE3,  	VM_REG_LAST  }; @@ -84,6 +91,16 @@ enum x2apic_state {  	X2APIC_STATE_LAST  }; +#define	VM_INTINFO_VECTOR(info)	((info) & 0xff) +#define	VM_INTINFO_DEL_ERRCODE	0x800 +#define	VM_INTINFO_RSVD		0x7ffff000 +#define	VM_INTINFO_VALID	0x80000000 +#define	VM_INTINFO_TYPE		0x700 +#define	VM_INTINFO_HWINTR	(0 << 8) +#define	VM_INTINFO_NMI		(2 << 8) +#define	VM_INTINFO_HWEXCEPTION	(3 << 8) +#define	VM_INTINFO_SWINTR	(4 << 8) +  #ifdef _KERNEL  #define	VM_MAX_NAMELEN	32 @@ -99,6 +116,7 @@ struct vioapic;  struct vlapic;  struct vmspace;  struct vm_object; +struct vm_guest_paging;  struct pmap;  typedef int	(*vmm_init_func_t)(int ipinum); @@ -252,6 +270,14 @@ vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)  	return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING);  } +#ifdef _SYS_PROC_H_ +static int __inline +vcpu_should_yield(struct vm *vm, int vcpu) +{ +	return (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)); +} +#endif +  void *vcpu_stats(struct vm *vm, int vcpu);  void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr);  struct vmspace *vm_get_vmspace(struct vm *vm); @@ -274,21 +300,63 @@ struct vatpit *vm_atpit(struct vm *vm);  int vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *vme);  /* - * Returns 0 if there is no exception pending for this vcpu. Returns 1 if an - * exception is pending and also updates 'vme'. The pending exception is - * cleared when this function returns. + * This function is called after a VM-exit that occurred during exception or + * interrupt delivery through the IDT. The format of 'intinfo' is described + * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2.   * - * This function should only be called in the context of the thread that is - * executing this vcpu. + * If a VM-exit handler completes the event delivery successfully then it + * should call vm_exit_intinfo() to extinguish the pending event. For e.g., + * if the task switch emulation is triggered via a task gate then it should + * call this function with 'intinfo=0' to indicate that the external event + * is not pending anymore. + * + * Return value is 0 on success and non-zero on failure.   
*/ -int vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *vme); +int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo); -void vm_inject_gp(struct vm *vm, int vcpuid); /* general protection fault */ -void vm_inject_ud(struct vm *vm, int vcpuid); /* undefined instruction fault */ -void vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2); +/* + * This function is called before every VM-entry to retrieve a pending + * event that should be injected into the guest. This function combines + * nested events into a double or triple fault. + * + * Returns 0 if there are no events that need to be injected into the guest + * and non-zero otherwise. + */ +int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info); + +int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2);  enum vm_reg_name vm_segment_name(int seg_encoding); +struct vm_copyinfo { +	uint64_t	gpa; +	size_t		len; +	void		*hva; +	void		*cookie; +}; + +/* + * Set up 'copyinfo[]' to copy to/from guest linear address space starting + * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for + * a copyin or PROT_WRITE for a copyout.  + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. + * + * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if + * the return value is 0. The 'copyinfo[]' resources should be freed by calling + * 'vm_copy_teardown()' after the copy is done. + */ +int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, +    uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, +    int num_copyinfo); +void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, +    int num_copyinfo); +void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, +    void *kaddr, size_t len); +void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, +    struct vm_copyinfo *copyinfo, size_t len);  #endif	/* KERNEL */  #define	VM_MAXCPU	16			/* maximum virtual cpus */ @@ -322,13 +390,16 @@ struct seg_desc {  	uint32_t	limit;  	uint32_t	access;  }; -#define	SEG_DESC_TYPE(desc)		((desc)->access & 0x001f) -#define	SEG_DESC_PRESENT(desc)		((desc)->access & 0x0080) -#define	SEG_DESC_DEF32(desc)		((desc)->access & 0x4000) -#define	SEG_DESC_GRANULARITY(desc)	((desc)->access & 0x8000) -#define	SEG_DESC_UNUSABLE(desc)		((desc)->access & 0x10000) +#define	SEG_DESC_TYPE(access)		((access) & 0x001f) +#define	SEG_DESC_DPL(access)		(((access) >> 5) & 0x3) +#define	SEG_DESC_PRESENT(access)	(((access) & 0x0080) ? 1 : 0) +#define	SEG_DESC_DEF32(access)		(((access) & 0x4000) ? 1 : 0) +#define	SEG_DESC_GRANULARITY(access)	(((access) & 0x8000) ? 1 : 0) +#define	SEG_DESC_UNUSABLE(access)	(((access) & 0x10000) ? 
1 : 0)  enum vm_cpu_mode { +	CPU_MODE_REAL, +	CPU_MODE_PROTECTED,  	CPU_MODE_COMPATIBILITY,		/* IA-32E mode (CS.L = 0) */  	CPU_MODE_64BIT,			/* IA-32E mode (CS.L = 1) */  }; @@ -364,11 +435,14 @@ struct vie {  	uint8_t		num_valid;		/* size of the instruction */  	uint8_t		num_processed; +	uint8_t		addrsize:4, opsize:4;	/* address and operand sizes */  	uint8_t		rex_w:1,		/* REX prefix */  			rex_r:1,  			rex_x:1,  			rex_b:1, -			rex_present:1; +			rex_present:1, +			opsize_override:1,	/* Operand size override */ +			addrsize_override:1;	/* Address size override */  	uint8_t		mod:2,			/* ModRM byte */  			reg:4, @@ -410,6 +484,7 @@ enum vm_exitcode {  	VM_EXITCODE_IOAPIC_EOI,  	VM_EXITCODE_SUSPENDED,  	VM_EXITCODE_INOUT_STR, +	VM_EXITCODE_TASK_SWITCH,  	VM_EXITCODE_MAX  }; @@ -434,6 +509,22 @@ struct vm_inout_str {  	struct seg_desc seg_desc;  }; +enum task_switch_reason { +	TSR_CALL, +	TSR_IRET, +	TSR_JMP, +	TSR_IDT_GATE,	/* task gate in IDT */ +}; + +struct vm_task_switch { +	uint16_t	tsssel;		/* new TSS selector */ +	int		ext;		/* task switch due to external event */ +	uint32_t	errcode; +	int		errcode_valid;	/* push 'errcode' on the new stack */ +	enum task_switch_reason reason; +	struct vm_guest_paging paging; +}; +  struct vm_exit {  	enum vm_exitcode	exitcode;  	int			inst_length;	/* 0 means unknown */ @@ -448,6 +539,7 @@ struct vm_exit {  		struct {  			uint64_t	gpa;  			uint64_t	gla; +			int		cs_d;		/* CS.D */  			struct vm_guest_paging paging;  			struct vie	vie;  		} inst_emul; @@ -487,7 +579,38 @@ struct vm_exit {  		struct {  			enum vm_suspend_how how;  		} suspended; +		struct vm_task_switch task_switch;  	} u;  }; +/* APIs to inject faults into the guest */ +void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid, +    int errcode); + +static void __inline +vm_inject_ud(void *vm, int vcpuid) +{ +	vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0); +} + +static void __inline +vm_inject_gp(void *vm, int vcpuid) +{ +	vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0); +} + +static void __inline +vm_inject_ac(void *vm, int vcpuid, int errcode) +{ +	vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode); +} + +static void __inline +vm_inject_ss(void *vm, int vcpuid, int errcode) +{ +	vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode); +} + +void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2); +  #endif	/* _VMM_H_ */ diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h index 9b3b00ded0ba..e4d839ef6549 100644 --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -189,6 +189,12 @@ struct vm_cpuset {  #define	VM_ACTIVE_CPUS		0  #define	VM_SUSPENDED_CPUS	1 +struct vm_intinfo { +	int		vcpuid; +	uint64_t	info1; +	uint64_t	info2; +}; +  enum {  	/* general routines */  	IOCNUM_ABIVERS = 0, @@ -211,6 +217,8 @@ enum {  	IOCNUM_GET_SEGMENT_DESCRIPTOR = 23,  	/* interrupt injection */ +	IOCNUM_GET_INTINFO = 28, +	IOCNUM_SET_INTINFO = 29,  	IOCNUM_INJECT_EXCEPTION = 30,  	IOCNUM_LAPIC_IRQ = 31,  	IOCNUM_INJECT_NMI = 32, @@ -324,4 +332,8 @@ enum {  	_IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu)  #define	VM_GET_CPUS	\  	_IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset) +#define	VM_SET_INTINFO	\ +	_IOW('v', IOCNUM_SET_INTINFO, struct vm_intinfo) +#define	VM_GET_INTINFO	\ +	_IOWR('v', IOCNUM_GET_INTINFO, struct vm_intinfo)  #endif diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h index e4c408bf165f..bbd3d88d9cf3 100644 --- a/sys/amd64/include/vmm_instruction_emul.h +++ 
b/sys/amd64/include/vmm_instruction_emul.h @@ -52,8 +52,8 @@ typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa,   * s   */  int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie, -			    mem_region_read_t mrr, mem_region_write_t mrw, -			    void *mrarg); +    struct vm_guest_paging *paging, mem_region_read_t mrr, +    mem_region_write_t mrw, void *mrarg);  int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,      uint64_t val, int size); @@ -108,7 +108,7 @@ void vie_init(struct vie *vie);   */  #define	VIE_INVALID_GLA		(1UL << 63)	/* a non-canonical address */  int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, -			   enum vm_cpu_mode cpu_mode, struct vie *vie); +			   enum vm_cpu_mode cpu_mode, int csd, struct vie *vie);  #endif	/* _KERNEL */  #endif	/* _VMM_INSTRUCTION_EMUL_H_ */ diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c index cc97d9516613..51e5c2c06f00 100644 --- a/sys/amd64/vmm/intel/vmcs.c +++ b/sys/amd64/vmm/intel/vmcs.c @@ -103,6 +103,14 @@ vmcs_field_encoding(int ident)  		return (VMCS_GUEST_LDTR_SELECTOR);  	case VM_REG_GUEST_EFER:  		return (VMCS_GUEST_IA32_EFER); +	case VM_REG_GUEST_PDPTE0: +		return (VMCS_GUEST_PDPTE0); +	case VM_REG_GUEST_PDPTE1: +		return (VMCS_GUEST_PDPTE1); +	case VM_REG_GUEST_PDPTE2: +		return (VMCS_GUEST_PDPTE2); +	case VM_REG_GUEST_PDPTE3: +		return (VMCS_GUEST_PDPTE3);  	default:  		return (-1);  	} diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h index 657d5b0f65cb..4e9557c39b9b 100644 --- a/sys/amd64/vmm/intel/vmcs.h +++ b/sys/amd64/vmm/intel/vmcs.h @@ -346,6 +346,9 @@ vmcs_write(uint32_t encoding, uint64_t val)  #define	VMCS_INTR_T_HWINTR	(0 << 8)  #define	VMCS_INTR_T_NMI		(2 << 8)  #define	VMCS_INTR_T_HWEXCEPTION	(3 << 8) +#define	VMCS_INTR_T_SWINTR	(4 << 8) +#define	VMCS_INTR_T_PRIV_SWEXCEPTION (5 << 8) +#define	VMCS_INTR_T_SWEXCEPTION	(6 << 8)  #define	VMCS_INTR_DEL_ERRCODE	(1 << 11)  /* diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 2cbb159e9159..b2c570216a78 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -149,8 +149,6 @@ SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,  SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,  	     &cr4_zeros_mask, 0, NULL); -static int vmx_no_patmsr; -  static int vmx_initialized;  SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,  	   &vmx_initialized, 0, "Intel VMX initialized"); @@ -158,18 +156,38 @@ SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,  /*   * Optional capabilities   */ +static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL); + +static int vmx_patmsr; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, patmsr, CTLFLAG_RD, &vmx_patmsr, 0, +    "PAT MSR saved and restored in VCMS"); +  static int cap_halt_exit; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0, +    "HLT triggers a VM-exit"); +  static int cap_pause_exit; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit, +    0, "PAUSE triggers a VM-exit"); +  static int cap_unrestricted_guest; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD, +    &cap_unrestricted_guest, 0, "Unrestricted guests"); +  static int cap_monitor_trap; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD, +    &cap_monitor_trap, 0, "Monitor trap flag"); +  static int cap_invpcid; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid, +    0, 
"Guests are allowed to use INVPCID");  static int virtual_interrupt_delivery; -SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,      &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");  static int posted_interrupts; -SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD, +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD,      &posted_interrupts, 0, "APICv posted interrupt support");  static int pirvec; @@ -618,6 +636,7 @@ vmx_init(int ipinum)  	}  	/* Check support for VM-exit controls */ +	vmx_patmsr = 1;  	error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,  			       VM_EXIT_CTLS_ONE_SETTING,  			       VM_EXIT_CTLS_ZERO_SETTING, @@ -637,12 +656,12 @@ vmx_init(int ipinum)  			if (bootverbose)  				printf("vmm: PAT MSR access not supported\n");  			guest_msr_valid(MSR_PAT); -			vmx_no_patmsr = 1; +			vmx_patmsr = 0;  		}  	}  	/* Check support for VM-entry controls */ -	if (!vmx_no_patmsr) { +	if (vmx_patmsr) {  		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,  				       MSR_VMX_TRUE_ENTRY_CTLS,  				       VM_ENTRY_CTLS_ONE_SETTING, @@ -918,7 +937,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap)  	 * MSR_PAT save/restore support, leave access disabled so accesses  	 * will be trapped.  	 */ -	if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT)) +	if (vmx_patmsr && guest_msr_rw(vmx, MSR_PAT))  		panic("vmx_vminit: error setting guest pat msr access");  	vpid_alloc(vpid, VM_MAXCPU); @@ -974,7 +993,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap)  		vmx->cap[i].proc_ctls = procbased_ctls;  		vmx->cap[i].proc_ctls2 = procbased_ctls2; -		vmx->state[i].lastcpu = -1; +		vmx->state[i].lastcpu = NOCPU;  		vmx->state[i].vpid = vpid[i];  		msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count); @@ -1047,27 +1066,37 @@ vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)  }  static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); +static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done"); -static void -vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) +/* + * Invalidate guest mappings identified by its vpid from the TLB. + */ +static __inline void +vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running)  {  	struct vmxstate *vmxstate;  	struct invvpid_desc invvpid_desc;  	vmxstate = &vmx->state[vcpu]; -	if (vmxstate->lastcpu == curcpu) +	if (vmxstate->vpid == 0)  		return; -	vmxstate->lastcpu = curcpu; - -	vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); +	if (!running) { +		/* +		 * Set the 'lastcpu' to an invalid host cpu. +		 * +		 * This will invalidate TLB entries tagged with the vcpu's +		 * vpid the next time it runs via vmx_set_pcpu_defaults(). +		 */ +		vmxstate->lastcpu = NOCPU; +		return; +	} -	vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); -	vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); -	vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); +	KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside " +	    "critical section", __func__, vcpu));  	/* -	 * If we are using VPIDs then invalidate all mappings tagged with 'vpid' +	 * Invalidate all mappings tagged with 'vpid'  	 *  	 * We do this because this vcpu was executing on a different host  	 * cpu when it last ran. 
We do not track whether it invalidated @@ -1081,25 +1110,43 @@ vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)  	 * Note also that this will invalidate mappings tagged with 'vpid'  	 * for "all" EP4TAs.  	 */ -	if (vmxstate->vpid != 0) { -		if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { -			invvpid_desc._res1 = 0; -			invvpid_desc._res2 = 0; -			invvpid_desc.vpid = vmxstate->vpid; -			invvpid_desc.linear_addr = 0; -			invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); -		} else { -			/* -			 * The invvpid can be skipped if an invept is going to -			 * be performed before entering the guest. The invept -			 * will invalidate combined mappings tagged with -			 * 'vmx->eptp' for all vpids. -			 */ -			vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1); -		} +	if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { +		invvpid_desc._res1 = 0; +		invvpid_desc._res2 = 0; +		invvpid_desc.vpid = vmxstate->vpid; +		invvpid_desc.linear_addr = 0; +		invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); +		vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1); +	} else { +		/* +		 * The invvpid can be skipped if an invept is going to +		 * be performed before entering the guest. The invept +		 * will invalidate combined mappings tagged with +		 * 'vmx->eptp' for all vpids. +		 */ +		vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);  	}  } +static void +vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) +{ +	struct vmxstate *vmxstate; + +	vmxstate = &vmx->state[vcpu]; +	if (vmxstate->lastcpu == curcpu) +		return; + +	vmxstate->lastcpu = curcpu; + +	vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); + +	vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); +	vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); +	vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); +	vmx_invvpid(vmx, vcpu, pmap, 1); +} +  /*   * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.   */ @@ -1183,24 +1230,32 @@ vmx_inject_nmi(struct vmx *vmx, int vcpu)  static void  vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)  { -	struct vm_exception exc;  	int vector, need_nmi_exiting, extint_pending; -	uint64_t rflags; +	uint64_t rflags, entryinfo;  	uint32_t gi, info; -	if (vm_exception_pending(vmx->vm, vcpu, &exc)) { -		KASSERT(exc.vector >= 0 && exc.vector < 32, -		    ("%s: invalid exception vector %d", __func__, exc.vector)); +	if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { +		KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " +		    "intinfo is not valid: %#lx", __func__, entryinfo));  		info = vmcs_read(VMCS_ENTRY_INTR_INFO);  		KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " -		     "pending exception %d: %#x", __func__, exc.vector, info)); +		     "pending exception: %#lx/%#x", __func__, entryinfo, info)); -		info = exc.vector | VMCS_INTR_T_HWEXCEPTION | VMCS_INTR_VALID; -		if (exc.error_code_valid) { -			info |= VMCS_INTR_DEL_ERRCODE; -			vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, exc.error_code); +		info = entryinfo; +		vector = info & 0xff; +		if (vector == IDT_BP || vector == IDT_OF) { +			/* +			 * VT-x requires #BP and #OF to be injected as software +			 * exceptions. 
+			 */ +			info &= ~VMCS_INTR_T_MASK; +			info |= VMCS_INTR_T_SWEXCEPTION;  		} + +		if (info & VMCS_INTR_DEL_ERRCODE) +			vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); +  		vmcs_write(VMCS_ENTRY_INTR_INFO, info);  	} @@ -1379,6 +1434,16 @@ vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)  	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);  } +static void +vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid) +{ +	uint32_t gi; + +	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); +	KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING, +	    ("NMI blocking is not in effect %#x", gi)); +} +  static int  vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)  { @@ -1659,11 +1724,19 @@ vmx_cpl(void)  static enum vm_cpu_mode  vmx_cpu_mode(void)  { +	uint32_t csar; -	if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) -		return (CPU_MODE_64BIT); -	else -		return (CPU_MODE_COMPATIBILITY); +	if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) { +		csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); +		if (csar & 0x2000) +			return (CPU_MODE_64BIT);	/* CS.L = 1 */ +		else +			return (CPU_MODE_COMPATIBILITY); +	} else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) { +		return (CPU_MODE_PROTECTED); +	} else { +		return (CPU_MODE_REAL); +	}  }  static enum vm_paging_mode @@ -1757,10 +1830,25 @@ vmx_paging_info(struct vm_guest_paging *paging)  static void  vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)  { +	struct vm_guest_paging *paging; +	uint32_t csar; +	 +	paging = &vmexit->u.inst_emul.paging; +  	vmexit->exitcode = VM_EXITCODE_INST_EMUL;  	vmexit->u.inst_emul.gpa = gpa;  	vmexit->u.inst_emul.gla = gla; -	vmx_paging_info(&vmexit->u.inst_emul.paging); +	vmx_paging_info(paging); +	switch (paging->cpu_mode) { +	case CPU_MODE_PROTECTED: +	case CPU_MODE_COMPATIBILITY: +		csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); +		vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar); +		break; +	default: +		vmexit->u.inst_emul.cs_d = 0; +		break; +	}  }  static int @@ -1969,6 +2057,26 @@ vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)  	return (UNHANDLED);  } +static enum task_switch_reason +vmx_task_switch_reason(uint64_t qual) +{ +	int reason; + +	reason = (qual >> 30) & 0x3; +	switch (reason) { +	case 0: +		return (TSR_CALL); +	case 1: +		return (TSR_IRET); +	case 2: +		return (TSR_JMP); +	case 3: +		return (TSR_IDT_GATE); +	default: +		panic("%s: invalid reason %d", __func__, reason); +	} +} +  static int  vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)  { @@ -1976,9 +2084,10 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)  	struct vmxctx *vmxctx;  	struct vlapic *vlapic;  	struct vm_inout_str *vis; +	struct vm_task_switch *ts;  	uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; -	uint32_t reason; -	uint64_t qual, gpa; +	uint32_t intr_type, reason; +	uint64_t exitintinfo, qual, gpa;  	bool retu;  	CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); @@ -1994,46 +2103,99 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)  	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);  	/* -	 * VM exits that could be triggered during event injection on the -	 * previous VM entry need to be handled specially by re-injecting -	 * the event. +	 * VM exits that can be triggered during event delivery need to +	 * be handled specially by re-injecting the event if the IDT +	 * vectoring information field's valid bit is set.  	 
*  	 * See "Information for VM Exits During Event Delivery" in Intel SDM  	 * for details.  	 */ -	switch (reason) { -	case EXIT_REASON_EPT_FAULT: -	case EXIT_REASON_EPT_MISCONFIG: -	case EXIT_REASON_APIC_ACCESS: -	case EXIT_REASON_TASK_SWITCH: -	case EXIT_REASON_EXCEPTION: -		idtvec_info = vmcs_idt_vectoring_info(); -		if (idtvec_info & VMCS_IDT_VEC_VALID) { -			idtvec_info &= ~(1 << 12); /* clear undefined bit */ -			vmcs_write(VMCS_ENTRY_INTR_INFO, idtvec_info); -			if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { -				idtvec_err = vmcs_idt_vectoring_err(); -				vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, -				    idtvec_err); -			} -			/* -			 * If 'virtual NMIs' are being used and the VM-exit -			 * happened while injecting an NMI during the previous -			 * VM-entry, then clear "blocking by NMI" in the Guest -			 * Interruptibility-state. -			 */ -			if ((idtvec_info & VMCS_INTR_T_MASK) == -			    VMCS_INTR_T_NMI) { -				 vmx_clear_nmi_blocking(vmx, vcpu); -			} +	idtvec_info = vmcs_idt_vectoring_info(); +	if (idtvec_info & VMCS_IDT_VEC_VALID) { +		idtvec_info &= ~(1 << 12); /* clear undefined bit */ +		exitintinfo = idtvec_info; +		if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { +			idtvec_err = vmcs_idt_vectoring_err(); +			exitintinfo |= (uint64_t)idtvec_err << 32; +		} +		error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo); +		KASSERT(error == 0, ("%s: vm_set_intinfo error %d", +		    __func__, error)); + +		/* +		 * If 'virtual NMIs' are being used and the VM-exit +		 * happened while injecting an NMI during the previous +		 * VM-entry, then clear "blocking by NMI" in the +		 * Guest Interruptibility-State so the NMI can be +		 * reinjected on the subsequent VM-entry. +		 * +		 * However, if the NMI was being delivered through a task +		 * gate, then the new task must start execution with NMIs +		 * blocked so don't clear NMI blocking in this case. +		 */ +		intr_type = idtvec_info & VMCS_INTR_T_MASK; +		if (intr_type == VMCS_INTR_T_NMI) { +			if (reason != EXIT_REASON_TASK_SWITCH) +				vmx_clear_nmi_blocking(vmx, vcpu); +			else +				vmx_assert_nmi_blocking(vmx, vcpu); +		} + +		/* +		 * Update VM-entry instruction length if the event being +		 * delivered was a software interrupt or software exception. +		 */ +		if (intr_type == VMCS_INTR_T_SWINTR || +		    intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION || +		    intr_type == VMCS_INTR_T_SWEXCEPTION) {  			vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);  		} -	default: -		idtvec_info = 0; -		break;  	}  	switch (reason) { +	case EXIT_REASON_TASK_SWITCH: +		ts = &vmexit->u.task_switch; +		ts->tsssel = qual & 0xffff; +		ts->reason = vmx_task_switch_reason(qual); +		ts->ext = 0; +		ts->errcode_valid = 0; +		vmx_paging_info(&ts->paging); +		/* +		 * If the task switch was due to a CALL, JMP, IRET, software +		 * interrupt (INT n) or software exception (INT3, INTO), +		 * then the saved %rip references the instruction that caused +		 * the task switch. The instruction length field in the VMCS +		 * is valid in this case. +		 * +		 * In all other cases (e.g., NMI, hardware exception) the +		 * saved %rip is one that would have been saved in the old TSS +		 * had the task switch completed normally so the instruction +		 * length field is not needed in this case and is explicitly +		 * set to 0. 
+		 */ +		if (ts->reason == TSR_IDT_GATE) { +			KASSERT(idtvec_info & VMCS_IDT_VEC_VALID, +			    ("invalid idtvec_info %#x for IDT task switch", +			    idtvec_info)); +			intr_type = idtvec_info & VMCS_INTR_T_MASK; +			if (intr_type != VMCS_INTR_T_SWINTR && +			    intr_type != VMCS_INTR_T_SWEXCEPTION && +			    intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) { +				/* Task switch triggered by external event */ +				ts->ext = 1; +				vmexit->inst_length = 0; +				if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { +					ts->errcode_valid = 1; +					ts->errcode = vmcs_idt_vectoring_err(); +				} +			} +		} +		vmexit->exitcode = VM_EXITCODE_TASK_SWITCH; +		VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, " +		    "%s errcode 0x%016lx", ts->reason, ts->tsssel, +		    ts->ext ? "external" : "internal", +		    ((uint64_t)ts->errcode << 32) | ts->errcode_valid); +		break;  	case EXIT_REASON_CR_ACCESS:  		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);  		switch (qual & 0xf) { @@ -2179,6 +2341,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)  		 * the guest.  		 *  		 * See "Resuming Guest Software after Handling an Exception". +		 * See "Information for VM Exits Due to Vectored Events".  		 */  		if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&  		    (intr_info & 0xff) != IDT_DF && @@ -2396,6 +2559,13 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,  		 * pmap_invalidate_ept().  		 */  		disable_intr(); +		vmx_inject_interrupts(vmx, vcpu, vlapic); + +		/* +		 * Check for vcpu suspension after injecting events because +		 * vmx_inject_interrupts() can suspend the vcpu due to a +		 * triple fault. +		 */  		if (vcpu_suspended(suspend_cookie)) {  			enable_intr();  			vm_exit_suspended(vmx->vm, vcpu, vmcs_guest_rip()); @@ -2408,7 +2578,7 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,  			break;  		} -		if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) { +		if (vcpu_should_yield(vm, vcpu)) {  			enable_intr();  			vm_exit_astpending(vmx->vm, vcpu, vmcs_guest_rip());  			vmx_astpending_trace(vmx, vcpu, vmexit->rip); @@ -2416,7 +2586,6 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,  			break;  		} -		vmx_inject_interrupts(vmx, vcpu, vlapic);  		vmx_run_trace(vmx, vcpu);  		rc = vmx_enter_guest(vmxctx, vmx, launched); @@ -2584,6 +2753,7 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)  {  	int error, hostcpu, running, shadow;  	uint64_t ctls; +	pmap_t pmap;  	struct vmx *vmx = arg;  	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); @@ -2621,6 +2791,18 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)  			error = vmcs_setreg(&vmx->vmcs[vcpu], running,  				    VMCS_IDENT(shadow), val);  		} + +		if (reg == VM_REG_GUEST_CR3) { +			/* +			 * Invalidate the guest vcpu's TLB mappings to emulate +			 * the behavior of updating %cr3. +			 * +			 * XXX the processor retains global mappings when %cr3 +			 * is updated but vmx_invvpid() does not. 
+			 */ +			pmap = vmx->ctx[vcpu].pmap; +			vmx_invvpid(vmx, vcpu, pmap, running); +		}  	}  	return (error); diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c index 2aba63c916e6..a3428db8a87d 100644 --- a/sys/amd64/vmm/intel/vmx_msr.c +++ b/sys/amd64/vmm/intel/vmx_msr.c @@ -33,6 +33,7 @@ __FBSDID("$FreeBSD$");  #include <sys/systm.h>  #include <machine/cpufunc.h> +#include <machine/specialreg.h>  #include "vmx_msr.h" diff --git a/sys/amd64/vmm/intel/vmx_msr.h b/sys/amd64/vmm/intel/vmx_msr.h index e6379a93d155..340b0f7ab436 100644 --- a/sys/amd64/vmm/intel/vmx_msr.h +++ b/sys/amd64/vmm/intel/vmx_msr.h @@ -29,29 +29,6 @@  #ifndef _VMX_MSR_H_  #define	_VMX_MSR_H_ -#define	MSR_VMX_BASIC			0x480 -#define	MSR_VMX_EPT_VPID_CAP		0x48C - -#define	MSR_VMX_PROCBASED_CTLS		0x482 -#define	MSR_VMX_TRUE_PROCBASED_CTLS	0x48E - -#define	MSR_VMX_PINBASED_CTLS		0x481 -#define	MSR_VMX_TRUE_PINBASED_CTLS	0x48D - -#define	MSR_VMX_PROCBASED_CTLS2		0x48B - -#define	MSR_VMX_EXIT_CTLS		0x483 -#define	MSR_VMX_TRUE_EXIT_CTLS		0x48f - -#define	MSR_VMX_ENTRY_CTLS		0x484 -#define	MSR_VMX_TRUE_ENTRY_CTLS		0x490 - -#define	MSR_VMX_CR0_FIXED0		0x486 -#define	MSR_VMX_CR0_FIXED1		0x487 - -#define	MSR_VMX_CR4_FIXED0		0x488 -#define	MSR_VMX_CR4_FIXED1		0x489 -  uint32_t vmx_revision(void);  int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, diff --git a/sys/amd64/vmm/intel/vtd.c b/sys/amd64/vmm/intel/vtd.c index ca76ea82ce65..f5ef71b65d54 100644 --- a/sys/amd64/vmm/intel/vtd.c +++ b/sys/amd64/vmm/intel/vtd.c @@ -452,6 +452,11 @@ vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len,  	ptpindex = 0;  	ptpshift = 0; +	KASSERT(gpa + len > gpa, ("%s: invalid gpa range %#lx/%#lx", __func__, +	    gpa, len)); +	KASSERT(gpa + len <= dom->maxaddr, ("%s: gpa range %#lx/%#lx beyond " +	    "domain maxaddr %#lx", __func__, gpa, len, dom->maxaddr)); +  	if (gpa & PAGE_MASK)  		panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa); diff --git a/sys/amd64/vmm/io/vatpic.c b/sys/amd64/vmm/io/vatpic.c index ee6fc84f90c3..38fc458b7f73 100644 --- a/sys/amd64/vmm/io/vatpic.c +++ b/sys/amd64/vmm/io/vatpic.c @@ -195,26 +195,29 @@ vatpic_notify_intr(struct vatpic *vatpic)  		    atpic->mask, atpic->request, atpic->service);  		/* +		 * From Section 3.6.2, "Interrupt Modes", in the +		 * MPtable Specification, Version 1.4 +		 *  		 * PIC interrupts are routed to both the Local APIC  		 * and the I/O APIC to support operation in 1 of 3  		 * modes.  		 *  		 * 1. Legacy PIC Mode: the PIC effectively bypasses -		 * all APIC components.  In mode '1' the local APIC is +		 * all APIC components.  In this mode the local APIC is  		 * disabled and LINT0 is reconfigured as INTR to  		 * deliver the PIC interrupt directly to the CPU.  		 *  		 * 2. Virtual Wire Mode: the APIC is treated as a  		 * virtual wire which delivers interrupts from the PIC -		 * to the CPU.  In mode '2' LINT0 is programmed as +		 * to the CPU.  In this mode LINT0 is programmed as  		 * ExtINT to indicate that the PIC is the source of  		 * the interrupt.  		 * -		 * 3. Symmetric I/O Mode: PIC interrupts are fielded -		 * by the I/O APIC and delivered to the appropriate -		 * CPU.  In mode '3' the I/O APIC input 0 is -		 * programmed as ExtINT to indicate that the PIC is -		 * the source of the interrupt. +		 * 3. Virtual Wire Mode via I/O APIC: PIC interrupts are +		 * fielded by the I/O APIC and delivered to the appropriate +		 * CPU.  
In this mode the I/O APIC input 0 is programmed +		 * as ExtINT to indicate that the PIC is the source of the +		 * interrupt.  		 */  		atpic->intr_raised = true;  		lapic_set_local_intr(vatpic->vm, -1, APIC_LVT_LINT0); diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index c2a9fd1e117e..fa0200e84b5c 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -97,6 +97,7 @@ struct vcpu {  	int		hostcpu;	/* (o) vcpu's host cpu */  	struct vlapic	*vlapic;	/* (i) APIC device model */  	enum x2apic_state x2apic_state;	/* (i) APIC mode */ +	uint64_t	exitintinfo;	/* (i) events pending at VM exit */  	int		nmi_pending;	/* (i) NMI pending */  	int		extint_pending;	/* (i) INTR pending */  	struct vm_exception exception;	/* (x) exception collateral */ @@ -242,6 +243,7 @@ vcpu_init(struct vm *vm, int vcpu_id, bool create)  	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);  	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); +	vcpu->exitintinfo = 0;  	vcpu->nmi_pending = 0;  	vcpu->extint_pending = 0;  	vcpu->exception_pending = 0; @@ -571,6 +573,21 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)  	return (0);  } +static vm_paddr_t +vm_maxmem(struct vm *vm) +{ +	int i; +	vm_paddr_t gpa, maxmem; + +	maxmem = 0; +	for (i = 0; i < vm->num_mem_segs; i++) { +		gpa = vm->mem_segs[i].gpa + vm->mem_segs[i].len; +		if (gpa > maxmem) +			maxmem = gpa; +	} +	return (maxmem); +} +  static void  vm_gpa_unwire(struct vm *vm)  { @@ -708,7 +725,7 @@ vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)  	if (ppt_assigned_devices(vm) == 0) {  		KASSERT(vm->iommu == NULL,  		    ("vm_assign_pptdev: iommu must be NULL")); -		maxaddr = vmm_mem_maxaddr(); +		maxaddr = vm_maxmem(vm);  		vm->iommu = iommu_create_domain(maxaddr);  		error = vm_gpa_wire(vm); @@ -1104,6 +1121,10 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)  			}  		} +		/* Don't go to sleep if the vcpu thread needs to yield */ +		if (vcpu_should_yield(vm, vcpuid)) +			break; +  		/*  		 * Some Linux guests implement "halt" by having all vcpus  		 * execute HLT with interrupts disabled. 'halted_cpus' keeps @@ -1127,7 +1148,11 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)  		t = ticks;  		vcpu_require_state_locked(vcpu, VCPU_SLEEPING); -		msleep_spin(vcpu, &vcpu->mtx, wmesg, 0); +		/* +		 * XXX msleep_spin() cannot be interrupted by signals so +		 * wake up periodically to check pending signals. 
+		 */ +		msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);  		vcpu_require_state_locked(vcpu, VCPU_FROZEN);  		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);  	} @@ -1191,15 +1216,18 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)  	struct vm_guest_paging *paging;  	mem_region_read_t mread;  	mem_region_write_t mwrite; -	int error; +	enum vm_cpu_mode cpu_mode; +	int cs_d, error;  	vcpu = &vm->vcpu[vcpuid];  	vme = &vcpu->exitinfo;  	gla = vme->u.inst_emul.gla;  	gpa = vme->u.inst_emul.gpa; +	cs_d = vme->u.inst_emul.cs_d;  	vie = &vme->u.inst_emul.vie;  	paging = &vme->u.inst_emul.paging; +	cpu_mode = paging->cpu_mode;  	vie_init(vie); @@ -1213,7 +1241,7 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)  	else if (error != 0)  		panic("%s: vmm_fetch_instruction error %d", __func__, error); -	if (vmm_decode_instruction(vm, vcpuid, gla, paging->cpu_mode, vie) != 0) +	if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0)  		return (EFAULT);  	/* return to userland unless this is an in-kernel emulated device */ @@ -1231,8 +1259,8 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)  		return (0);  	} -	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite, -	    retu); +	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging, +	    mread, mwrite, retu);  	return (error);  } @@ -1456,6 +1484,202 @@ restart:  }  int +vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) +{ +	struct vcpu *vcpu; +	int type, vector; + +	if (vcpuid < 0 || vcpuid >= VM_MAXCPU) +		return (EINVAL); + +	vcpu = &vm->vcpu[vcpuid]; + +	if (info & VM_INTINFO_VALID) { +		type = info & VM_INTINFO_TYPE; +		vector = info & 0xff; +		if (type == VM_INTINFO_NMI && vector != IDT_NMI) +			return (EINVAL); +		if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) +			return (EINVAL); +		if (info & VM_INTINFO_RSVD) +			return (EINVAL); +	} else { +		info = 0; +	} +	VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info); +	vcpu->exitintinfo = info; +	return (0); +} + +enum exc_class { +	EXC_BENIGN, +	EXC_CONTRIBUTORY, +	EXC_PAGEFAULT +}; + +#define	IDT_VE	20	/* Virtualization Exception (Intel specific) */ + +static enum exc_class +exception_class(uint64_t info) +{ +	int type, vector; + +	KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info)); +	type = info & VM_INTINFO_TYPE; +	vector = info & 0xff; + +	/* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ +	switch (type) { +	case VM_INTINFO_HWINTR: +	case VM_INTINFO_SWINTR: +	case VM_INTINFO_NMI: +		return (EXC_BENIGN); +	default: +		/* +		 * Hardware exception. +		 * +		 * SVM and VT-x use identical type values to represent NMI, +		 * hardware interrupt and software interrupt. +		 * +		 * SVM uses type '3' for all exceptions. VT-x uses type '3' +		 * for exceptions except #BP and #OF. #BP and #OF use a type +		 * value of '5' or '6'. Therefore we don't check for explicit +		 * values of 'type' to classify 'intinfo' into a hardware +		 * exception. 
+		 */ +		break; +	} + +	switch (vector) { +	case IDT_PF: +	case IDT_VE: +		return (EXC_PAGEFAULT); +	case IDT_DE: +	case IDT_TS: +	case IDT_NP: +	case IDT_SS: +	case IDT_GP: +		return (EXC_CONTRIBUTORY); +	default: +		return (EXC_BENIGN); +	} +} + +static int +nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2, +    uint64_t *retinfo) +{ +	enum exc_class exc1, exc2; +	int type1, vector1; + +	KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1)); +	KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2)); + +	/* +	 * If an exception occurs while attempting to call the double-fault +	 * handler the processor enters shutdown mode (aka triple fault). +	 */ +	type1 = info1 & VM_INTINFO_TYPE; +	vector1 = info1 & 0xff; +	if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { +		VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)", +		    info1, info2); +		vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); +		*retinfo = 0; +		return (0); +	} + +	/* +	 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 +	 */ +	exc1 = exception_class(info1); +	exc2 = exception_class(info2); +	if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || +	    (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { +		/* Convert nested fault into a double fault. */ +		*retinfo = IDT_DF; +		*retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; +		*retinfo |= VM_INTINFO_DEL_ERRCODE; +	} else { +		/* Handle exceptions serially */ +		*retinfo = info2; +	} +	return (1); +} + +static uint64_t +vcpu_exception_intinfo(struct vcpu *vcpu) +{ +	uint64_t info = 0; + +	if (vcpu->exception_pending) { +		info = vcpu->exception.vector & 0xff; +		info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; +		if (vcpu->exception.error_code_valid) { +			info |= VM_INTINFO_DEL_ERRCODE; +			info |= (uint64_t)vcpu->exception.error_code << 32; +		} +	} +	return (info); +} + +int +vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) +{ +	struct vcpu *vcpu; +	uint64_t info1, info2; +	int valid; + +	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid)); + +	vcpu = &vm->vcpu[vcpuid]; + +	info1 = vcpu->exitintinfo; +	vcpu->exitintinfo = 0; + +	info2 = 0; +	if (vcpu->exception_pending) { +		info2 = vcpu_exception_intinfo(vcpu); +		vcpu->exception_pending = 0; +		VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx", +		    vcpu->exception.vector, info2); +	} + +	if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { +		valid = nested_fault(vm, vcpuid, info1, info2, retinfo); +	} else if (info1 & VM_INTINFO_VALID) { +		*retinfo = info1; +		valid = 1; +	} else if (info2 & VM_INTINFO_VALID) { +		*retinfo = info2; +		valid = 1; +	} else { +		valid = 0; +	} + +	if (valid) { +		VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), " +		    "retinfo(%#lx)", __func__, info1, info2, *retinfo); +	} + +	return (valid); +} + +int +vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) +{ +	struct vcpu *vcpu; + +	if (vcpuid < 0 || vcpuid >= VM_MAXCPU) +		return (EINVAL); + +	vcpu = &vm->vcpu[vcpuid]; +	*info1 = vcpu->exitintinfo; +	*info2 = vcpu_exception_intinfo(vcpu); +	return (0); +} + +int  vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)  {  	struct vcpu *vcpu; @@ -1466,6 +1690,14 @@ vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)  	if (exception->vector < 0 || exception->vector >= 32)  		return (EINVAL); +	/* +	 * A double fault exception should never be injected directly into +	
 * the guest. It is a derived exception that results from specific +	 * combinations of nested faults. +	 */ +	if (exception->vector == IDT_DF) +		return (EINVAL); +  	vcpu = &vm->vcpu[vcpuid];  	if (vcpu->exception_pending) { @@ -1481,32 +1713,21 @@ vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)  	return (0);  } -int -vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *exception) -{ -	struct vcpu *vcpu; -	int pending; - -	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid)); - -	vcpu = &vm->vcpu[vcpuid]; -	pending = vcpu->exception_pending; -	if (pending) { -		vcpu->exception_pending = 0; -		*exception = vcpu->exception; -		VCPU_CTR1(vm, vcpuid, "Exception %d delivered", -		    exception->vector); -	} -	return (pending); -} - -static void -vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception) +void +vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid, +    int errcode)  { +	struct vm_exception exception;  	struct vm_exit *vmexit; +	struct vm *vm;  	int error; -	error = vm_inject_exception(vm, vcpuid, exception); +	vm = vmarg; + +	exception.vector = vector; +	exception.error_code = errcode; +	exception.error_code_valid = errcode_valid; +	error = vm_inject_exception(vm, vcpuid, &exception);  	KASSERT(error == 0, ("vm_inject_exception error %d", error));  	/* @@ -1521,45 +1742,19 @@ vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception)  }  void -vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2) +vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)  { -	struct vm_exception pf = { -		.vector = IDT_PF, -		.error_code_valid = 1, -		.error_code = error_code -	}; +	struct vm *vm;  	int error; +	vm = vmarg;  	VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",  	    error_code, cr2);  	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);  	KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); -	vm_inject_fault(vm, vcpuid, &pf); -} - -void -vm_inject_gp(struct vm *vm, int vcpuid) -{ -	struct vm_exception gpf = { -		.vector = IDT_GP, -		.error_code_valid = 1, -		.error_code = 0 -	}; - -	vm_inject_fault(vm, vcpuid, &gpf); -} - -void -vm_inject_ud(struct vm *vm, int vcpuid) -{ -	struct vm_exception udf = { -		.vector = IDT_UD, -		.error_code_valid = 0 -	}; - -	vm_inject_fault(vm, vcpuid, &udf); +	vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);  }  static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); @@ -1993,6 +2188,97 @@ vm_segment_name(int seg)  	return (seg_names[seg]);  } +void +vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, +    int num_copyinfo) +{ +	int idx; + +	for (idx = 0; idx < num_copyinfo; idx++) { +		if (copyinfo[idx].cookie != NULL) +			vm_gpa_release(copyinfo[idx].cookie); +	} +	bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo)); +} + +int +vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, +    uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, +    int num_copyinfo) +{ +	int error, idx, nused; +	size_t n, off, remaining; +	void *hva, *cookie; +	uint64_t gpa; + +	bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo); + +	nused = 0; +	remaining = len; +	while (remaining > 0) { +		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); +		error = vmm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa); +		if (error) +			return (error); +		off = gpa & PAGE_MASK; +		n = min(remaining, PAGE_SIZE - 
off); +		copyinfo[nused].gpa = gpa; +		copyinfo[nused].len = n; +		remaining -= n; +		gla += n; +		nused++; +	} + +	for (idx = 0; idx < nused; idx++) { +		hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len, +		    prot, &cookie); +		if (hva == NULL) +			break; +		copyinfo[idx].hva = hva; +		copyinfo[idx].cookie = cookie; +	} + +	if (idx != nused) { +		vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); +		return (-1); +	} else { +		return (0); +	} +} + +void +vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, +    size_t len) +{ +	char *dst; +	int idx; +	 +	dst = kaddr; +	idx = 0; +	while (len > 0) { +		bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); +		len -= copyinfo[idx].len; +		dst += copyinfo[idx].len; +		idx++; +	} +} + +void +vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, +    struct vm_copyinfo *copyinfo, size_t len) +{ +	const char *src; +	int idx; + +	src = kaddr; +	idx = 0; +	while (len > 0) { +		bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); +		len -= copyinfo[idx].len; +		src += copyinfo[idx].len; +		idx++; +	} +}  /*   * Return the amount of in-use and wired memory for the VM. Since diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index f3e31a33df4a..a85109edaa1d 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -173,6 +173,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,  	struct vm_gla2gpa *gg;  	struct vm_activate_cpu *vac;  	struct vm_cpuset *vm_cpuset; +	struct vm_intinfo *vmii;  	sc = vmmdev_lookup2(cdev);  	if (sc == NULL) @@ -199,6 +200,8 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,  	case VM_SET_X2APIC_STATE:  	case VM_GLA2GPA:  	case VM_ACTIVATE_CPU: +	case VM_SET_INTINFO: +	case VM_GET_INTINFO:  		/*  		 * XXX fragile, handle with care  		 * Assumes that the first field of the ioctl data is the vcpu. 
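The next hunk wires VM_SET_INTINFO and VM_GET_INTINFO into vmmdev_ioctl(), forwarding to vm_exit_intinfo() and vm_get_intinfo(). For reference, the 64-bit event words these ioctls carry use the VM_INTINFO_* layout added to vmm.h earlier in the patch: vector in bits 7:0, type in bits 10:8, error-code-valid in bit 11, valid in bit 31, and the error code in the upper 32 bits. A small decoding sketch follows (illustrative only; decode_intinfo() is not part of the change):

```c
#include <sys/types.h>

#include <machine/vmm.h>	/* VM_INTINFO_* definitions added by this patch */

#include <stdio.h>

/* Hypothetical helper: pretty-print one VM_INTINFO_* encoded event word. */
static void
decode_intinfo(uint64_t info)
{
	int vector, type;

	if ((info & VM_INTINFO_VALID) == 0) {
		printf("no event pending\n");
		return;
	}

	vector = VM_INTINFO_VECTOR(info);	/* bits 7:0 */
	type = info & VM_INTINFO_TYPE;		/* HWINTR, NMI, HWEXCEPTION, SWINTR */

	if (info & VM_INTINFO_DEL_ERRCODE)
		printf("vector %d, type %#x, error code %#x\n",
		    vector, type, (uint32_t)(info >> 32));
	else
		printf("vector %d, type %#x\n", vector, type);
}
```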
@@ -470,6 +473,15 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,  			error = copyout(cpuset, vm_cpuset->cpus, size);  		free(cpuset, M_TEMP);  		break; +	case VM_SET_INTINFO: +		vmii = (struct vm_intinfo *)data; +		error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1); +		break; +	case VM_GET_INTINFO: +		vmii = (struct vm_intinfo *)data; +		error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1, +		    &vmii->info2); +		break;  	default:  		error = ENOTTY;  		break; diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index 921deb5ab29d..a65b1251e52b 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");  #else	/* !_KERNEL */  #include <sys/types.h>  #include <sys/errno.h> +#include <sys/_iovec.h>  #include <machine/vmm.h> @@ -65,18 +66,26 @@ enum {  	VIE_OP_TYPE_AND,  	VIE_OP_TYPE_OR,  	VIE_OP_TYPE_TWO_BYTE, +	VIE_OP_TYPE_PUSH, +	VIE_OP_TYPE_CMP,  	VIE_OP_TYPE_LAST  };  /* struct vie_op.op_flags */ -#define	VIE_OP_F_IMM		(1 << 0)	/* immediate operand present */ -#define	VIE_OP_F_IMM8		(1 << 1)	/* 8-bit immediate operand */ +#define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */ +#define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */ +#define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */ +#define	VIE_OP_F_NO_MODRM	(1 << 3)  static const struct vie_op two_byte_opcodes[256] = {  	[0xB6] = {  		.op_byte = 0xB6,  		.op_type = VIE_OP_TYPE_MOVZX,  	}, +	[0xB7] = { +		.op_byte = 0xB7, +		.op_type = VIE_OP_TYPE_MOVZX, +	},  	[0xBE] = {  		.op_byte = 0xBE,  		.op_type = VIE_OP_TYPE_MOVSX, @@ -88,6 +97,10 @@ static const struct vie_op one_byte_opcodes[256] = {  		.op_byte = 0x0F,  		.op_type = VIE_OP_TYPE_TWO_BYTE  	}, +	[0x3B] = { +		.op_byte = 0x3B, +		.op_type = VIE_OP_TYPE_CMP, +	},  	[0x88] = {  		.op_byte = 0x88,  		.op_type = VIE_OP_TYPE_MOV, @@ -104,6 +117,22 @@ static const struct vie_op one_byte_opcodes[256] = {  		.op_byte = 0x8B,  		.op_type = VIE_OP_TYPE_MOV,  	}, +	[0xA1] = { +		.op_byte = 0xA1, +		.op_type = VIE_OP_TYPE_MOV, +		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, +	}, +	[0xA3] = { +		.op_byte = 0xA3, +		.op_type = VIE_OP_TYPE_MOV, +		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, +	}, +	[0xC6] = { +		/* XXX Group 11 extended opcode - not just MOV */ +		.op_byte = 0xC6, +		.op_type = VIE_OP_TYPE_MOV, +		.op_flags = VIE_OP_F_IMM8, +	},  	[0xC7] = {  		.op_byte = 0xC7,  		.op_type = VIE_OP_TYPE_MOV, @@ -125,6 +154,11 @@ static const struct vie_op one_byte_opcodes[256] = {  		.op_type = VIE_OP_TYPE_OR,  		.op_flags = VIE_OP_F_IMM8,  	}, +	[0xFF] = { +		/* XXX Group 5 extended opcode - not just PUSH */ +		.op_byte = 0xFF, +		.op_type = VIE_OP_TYPE_PUSH, +	}  };  /* struct vie.mod */ @@ -175,18 +209,15 @@ vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)  	return (error);  } -static int -vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval) +static void +vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)  { -	uint64_t val; -	int error, rshift; -	enum vm_reg_name reg; - -	rshift = 0; -	reg = gpr_map[vie->reg]; +	*lhbr = 0; +	*reg = gpr_map[vie->reg];  	/* -	 * 64-bit mode imposes limitations on accessing legacy byte registers. +	 * 64-bit mode imposes limitations on accessing legacy high byte +	 * registers (lhbr).  	 *  	 * The legacy high-byte registers cannot be addressed if the REX  	 * prefix is present. 
In this case the values 4, 5, 6 and 7 of the @@ -198,17 +229,56 @@ vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)  	 */  	if (!vie->rex_present) {  		if (vie->reg & 0x4) { -			/* -			 * Obtain the value of %ah by reading %rax and shifting -			 * right by 8 bits (same for %bh, %ch and %dh). -			 */ -			rshift = 8; -			reg = gpr_map[vie->reg & 0x3]; +			*lhbr = 1; +			*reg = gpr_map[vie->reg & 0x3];  		}  	} +} + +static int +vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval) +{ +	uint64_t val; +	int error, lhbr; +	enum vm_reg_name reg; +	vie_calc_bytereg(vie, ®, &lhbr);  	error = vm_get_register(vm, vcpuid, reg, &val); -	*rval = val >> rshift; + +	/* +	 * To obtain the value of a legacy high byte register shift the +	 * base register right by 8 bits (%ah = %rax >> 8). +	 */ +	if (lhbr) +		*rval = val >> 8; +	else +		*rval = val; +	return (error); +} + +static int +vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte) +{ +	uint64_t origval, val, mask; +	int error, lhbr; +	enum vm_reg_name reg; + +	vie_calc_bytereg(vie, ®, &lhbr); +	error = vm_get_register(vm, vcpuid, reg, &origval); +	if (error == 0) { +		val = byte; +		mask = 0xff; +		if (lhbr) { +			/* +			 * Shift left by 8 to store 'byte' in a legacy high +			 * byte register. +			 */ +			val <<= 8; +			mask <<= 8; +		} +		val |= origval & ~mask; +		error = vm_set_register(vm, vcpuid, reg, val); +	}  	return (error);  } @@ -242,16 +312,52 @@ vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,  }  /* - * The following simplifying assumptions are made during emulation: - * - * - guest is in 64-bit mode - *   - default address size is 64-bits - *   - default operand size is 32-bits - * - * - operand size override is not supported - * - * - address size override is not supported + * Return the status flags that would result from doing (x - y).   
*/ +static u_long +getcc16(uint16_t x, uint16_t y) +{ +	u_long rflags; + +	__asm __volatile("sub %1,%2; pushfq; popq %0" : +	    "=r" (rflags) : "m" (y), "r" (x)); +	return (rflags); +} + +static u_long +getcc32(uint32_t x, uint32_t y) +{ +	u_long rflags; + +	__asm __volatile("sub %1,%2; pushfq; popq %0" : +	    "=r" (rflags) : "m" (y), "r" (x)); +	return (rflags); +} + +static u_long +getcc64(uint64_t x, uint64_t y) +{ +	u_long rflags; + +	__asm __volatile("sub %1,%2; pushfq; popq %0" : +	    "=r" (rflags) : "m" (y), "r" (x)); +	return (rflags); +} + +static u_long +getcc(int opsize, uint64_t x, uint64_t y) +{ +	KASSERT(opsize == 2 || opsize == 4 || opsize == 8, +	    ("getcc: invalid operand size %d", opsize)); + +	if (opsize == 2) +		return (getcc16(x, y)); +	else if (opsize == 4) +		return (getcc32(x, y)); +	else +		return (getcc64(x, y)); +} +  static int  emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg) @@ -261,7 +367,7 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  	uint8_t byte;  	uint64_t val; -	size = 4; +	size = vie->opsize;  	error = EINVAL;  	switch (vie->op.op_byte) { @@ -271,7 +377,7 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  		 * 88/r:	mov r/m8, r8  		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)  		 */ -		size = 1; +		size = 1;	/* override for byte operation */  		error = vie_read_bytereg(vm, vcpuid, vie, &byte);  		if (error == 0)  			error = memwrite(vm, vcpuid, gpa, byte, size, arg); @@ -279,11 +385,10 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  	case 0x89:  		/*  		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m) +		 * 89/r:	mov r/m16, r16  		 * 89/r:	mov r/m32, r32  		 * REX.W + 89/r	mov r/m64, r64  		 */ -		if (vie->rex_w) -			size = 8;  		reg = gpr_map[vie->reg];  		error = vie_read_register(vm, vcpuid, reg, &val);  		if (error == 0) { @@ -292,38 +397,72 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  		}  		break;  	case 0x8A: +		/* +		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg) +		 * 8A/r:	mov r8, r/m8 +		 * REX + 8A/r:	mov r8, r/m8 +		 */ +		size = 1;	/* override for byte operation */ +		error = memread(vm, vcpuid, gpa, &val, size, arg); +		if (error == 0) +			error = vie_write_bytereg(vm, vcpuid, vie, val); +		break;  	case 0x8B:  		/*  		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg) -		 * 8A/r:	mov r/m8, r8 -		 * REX + 8A/r:	mov r/m8, r8 +		 * 8B/r:	mov r16, r/m16  		 * 8B/r:	mov r32, r/m32  		 * REX.W 8B/r:	mov r64, r/m64  		 */ -		if (vie->op.op_byte == 0x8A) -			size = 1; -		else if (vie->rex_w) -			size = 8;  		error = memread(vm, vcpuid, gpa, &val, size, arg);  		if (error == 0) {  			reg = gpr_map[vie->reg];  			error = vie_update_register(vm, vcpuid, reg, val, size);  		}  		break; +	case 0xA1: +		/* +		 * MOV from seg:moffset to AX/EAX/RAX +		 * A1:		mov AX, moffs16 +		 * A1:		mov EAX, moffs32 +		 * REX.W + A1:	mov RAX, moffs64 +		 */ +		error = memread(vm, vcpuid, gpa, &val, size, arg); +		if (error == 0) { +			reg = VM_REG_GUEST_RAX; +			error = vie_update_register(vm, vcpuid, reg, val, size); +		} +		break; +	case 0xA3: +		/* +		 * MOV from AX/EAX/RAX to seg:moffset +		 * A3:		mov moffs16, AX +		 * A3:		mov moffs32, EAX  +		 * REX.W + A3:	mov moffs64, RAX +		 */ +		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); +		if (error == 0) { +			val &= size2mask[size]; +			error = memwrite(vm, vcpuid, gpa, val, size, arg); +		} +		break; +	
case 0xC6: +		/* +		 * MOV from imm8 to mem (ModRM:r/m) +		 * C6/0		mov r/m8, imm8 +		 * REX + C6/0	mov r/m8, imm8 +		 */ +		size = 1;	/* override for byte operation */ +		error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg); +		break;  	case 0xC7:  		/* -		 * MOV from imm32 to mem (ModRM:r/m) +		 * MOV from imm16/imm32 to mem (ModRM:r/m) +		 * C7/0		mov r/m16, imm16  		 * C7/0		mov r/m32, imm32  		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)  		 */ -		val = vie->immediate;		/* already sign-extended */ - -		if (vie->rex_w) -			size = 8; - -		if (size != 8) -			val &= size2mask[size]; - +		val = vie->immediate & size2mask[size];  		error = memwrite(vm, vcpuid, gpa, val, size, arg);  		break;  	default: @@ -333,17 +472,6 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  	return (error);  } -/* - * The following simplifying assumptions are made during emulation: - * - * - guest is in 64-bit mode - *   - default address size is 64-bits - *   - default operand size is 32-bits - * - * - operand size override is not supported - * - * - address size override is not supported - */  static int  emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  	     mem_region_read_t memread, mem_region_write_t memwrite, @@ -353,7 +481,7 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  	enum vm_reg_name reg;  	uint64_t val; -	size = 4; +	size = vie->opsize;  	error = EINVAL;  	switch (vie->op.op_byte) { @@ -362,8 +490,9 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  		 * MOV and zero extend byte from mem (ModRM:r/m) to  		 * reg (ModRM:reg).  		 * -		 * 0F B6/r		movzx r/m8, r32 -		 * REX.W + 0F B6/r	movzx r/m8, r64 +		 * 0F B6/r		movzx r16, r/m8 +		 * 0F B6/r		movzx r32, r/m8 +		 * REX.W + 0F B6/r	movzx r64, r/m8  		 */  		/* get the first operand */ @@ -374,19 +503,39 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  		/* get the second operand */  		reg = gpr_map[vie->reg]; -		if (vie->rex_w) -			size = 8; +		/* zero-extend byte */ +		val = (uint8_t)val;  		/* write the result */  		error = vie_update_register(vm, vcpuid, reg, val, size);  		break; +	case 0xB7: +		/* +		 * MOV and zero extend word from mem (ModRM:r/m) to +		 * reg (ModRM:reg). +		 * +		 * 0F B7/r		movzx r32, r/m16 +		 * REX.W + 0F B7/r	movzx r64, r/m16 +		 */ +		error = memread(vm, vcpuid, gpa, &val, 2, arg); +		if (error) +			return (error); + +		reg = gpr_map[vie->reg]; + +		/* zero-extend word */ +		val = (uint16_t)val; + +		error = vie_update_register(vm, vcpuid, reg, val, size); +		break;  	case 0xBE:  		/*  		 * MOV and sign extend byte from mem (ModRM:r/m) to  		 * reg (ModRM:reg).  		 * -		 * 0F BE/r		movsx r/m8, r32 -		 * REX.W + 0F BE/r	movsx r/m8, r64 +		 * 0F BE/r		movsx r16, r/m8 +		 * 0F BE/r		movsx r32, r/m8 +		 * REX.W + 0F BE/r	movsx r64, r/m8  		 */  		/* get the first operand */ @@ -397,9 +546,6 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  		/* get the second operand */  		reg = gpr_map[vie->reg]; -		if (vie->rex_w) -			size = 8; -  		/* sign extend byte */  		val = (int8_t)val; @@ -420,7 +566,7 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  	enum vm_reg_name reg;  	uint64_t val1, val2; -	size = 4; +	size = vie->opsize;  	error = EINVAL;  	switch (vie->op.op_byte) { @@ -429,11 +575,10 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the  		 * result in reg.  		 
* +		 * 23/r		and r16, r/m16  		 * 23/r		and r32, r/m32  		 * REX.W + 23/r	and r64, r/m64  		 */ -		if (vie->rex_w) -			size = 8;  		/* get the first operand */  		reg = gpr_map[vie->reg]; @@ -455,8 +600,9 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  		 * AND mem (ModRM:r/m) with immediate and store the  		 * result in mem.  		 * -		 * 81/          and r/m32, imm32 -		 * REX.W + 81/  and r/m64, imm32 sign-extended to 64 +		 * 81 /4		and r/m16, imm16 +		 * 81 /4		and r/m32, imm32 +		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64  		 *  		 * Currently, only the AND operation of the 0x81 opcode  		 * is implemented (ModRM:reg = b100). @@ -464,9 +610,6 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  		if ((vie->reg & 7) != 4)  			break; -		if (vie->rex_w) -			size = 8; -		  		/* get the first operand */                  error = memread(vm, vcpuid, gpa, &val1, size, arg);                  if (error) @@ -492,7 +635,7 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  	int error, size;  	uint64_t val1; -	size = 4; +	size = vie->opsize;  	error = EINVAL;  	switch (vie->op.op_byte) { @@ -501,8 +644,9 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  		 * OR mem (ModRM:r/m) with immediate and store the  		 * result in mem.  		 * -		 * 83/          OR r/m32, imm8 sign-extended to 32 -		 * REX.W + 83/  OR r/m64, imm8 sign-extended to 64 +		 * 83 /1		OR r/m16, imm8 sign-extended to 16 +		 * 83 /1		OR r/m32, imm8 sign-extended to 32 +		 * REX.W + 83/1		OR r/m64, imm8 sign-extended to 64  		 *  		 * Currently, only the OR operation of the 0x83 opcode  		 * is implemented (ModRM:reg = b001). @@ -510,9 +654,6 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  		if ((vie->reg & 7) != 1)  			break; -		if (vie->rex_w) -			size = 8; -		  		/* get the first operand */                  error = memread(vm, vcpuid, gpa, &val1, size, arg);                  if (error) @@ -531,10 +672,167 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  	return (error);  } +#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V) + +static int +emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, +	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ +	int error, size; +	uint64_t op1, op2, rflags, rflags2; +	enum vm_reg_name reg; + +	size = vie->opsize; +	switch (vie->op.op_byte) { +	case 0x3B: +		/* +		 * 3B/r		CMP r16, r/m16 +		 * 3B/r		CMP r32, r/m32 +		 * REX.W + 3B/r	CMP r64, r/m64 +		 * +		 * Compare first operand (reg) with second operand (r/m) and +		 * set status flags in EFLAGS register. The comparison is +		 * performed by subtracting the second operand from the first +		 * operand and then setting the status flags. 
+		 */ + +		/* Get the first operand */ +		reg = gpr_map[vie->reg]; +		error = vie_read_register(vm, vcpuid, reg, &op1); +		if (error) +			return (error); + +		/* Get the second operand */ +		error = memread(vm, vcpuid, gpa, &op2, size, arg); +		if (error) +			return (error); + +		break; +	default: +		return (EINVAL); +	} +	rflags2 = getcc(size, op1, op2); +	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); +	if (error) +		return (error); +	rflags &= ~RFLAGS_STATUS_BITS; +	rflags |= rflags2 & RFLAGS_STATUS_BITS; + +	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); +	return (error); +} + +static int +emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, +    struct vm_guest_paging *paging, mem_region_read_t memread, +    mem_region_write_t memwrite, void *arg) +{ +#ifdef _KERNEL +	struct vm_copyinfo copyinfo[2]; +#else +	struct iovec copyinfo[2]; +#endif +	struct seg_desc ss_desc; +	uint64_t cr0, rflags, rsp, stack_gla, val; +	int error, size, stackaddrsize; + +	/* +	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. +	 * +	 * PUSH is part of the group 5 extended opcodes and is identified +	 * by ModRM:reg = b110. +	 */ +	if ((vie->reg & 7) != 6) +		return (EINVAL); + +	size = vie->opsize; +	/* +	 * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1 +	 */ +	if (paging->cpu_mode == CPU_MODE_REAL) { +		stackaddrsize = 2; +	} else if (paging->cpu_mode == CPU_MODE_64BIT) { +		/* +		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3 +		 * - Stack pointer size is always 64-bits. +		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode. +		 * - 16-bit PUSH/POP is supported by using the operand size +		 *   override prefix (66H). +		 */ +		stackaddrsize = 8; +		size = vie->opsize_override ? 2 : 8; +	} else { +		/* +		 * In protected or compability mode the 'B' flag in the +		 * stack-segment descriptor determines the size of the +		 * stack pointer. +		 */ +		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc); +		KASSERT(error == 0, ("%s: error %d getting SS descriptor", +		    __func__, error)); +		if (SEG_DESC_DEF32(ss_desc.access)) +			stackaddrsize = 4; +		else +			stackaddrsize = 2; +	} + +	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); +	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); + +	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); +	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + +	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp); +	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error)); + +	rsp -= size; +	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc, +	    rsp, size, stackaddrsize, PROT_WRITE, &stack_gla)) { +		vm_inject_ss(vm, vcpuid, 0); +		return (0); +	} + +	if (vie_canonical_check(paging->cpu_mode, stack_gla)) { +		vm_inject_ss(vm, vcpuid, 0); +		return (0); +	} + +	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) { +		vm_inject_ac(vm, vcpuid, 0); +		return (0); +	} + +	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, PROT_WRITE, +	    copyinfo, nitems(copyinfo)); +	if (error == -1) { +		/* +		 * XXX cannot return a negative error value here because it +		 * ends up being the return value of the VM_RUN() ioctl and +		 * is interpreted as a pseudo-error (for e.g. ERESTART). 
+		 */ +		return (EFAULT); +	} else if (error == 1) { +		/* Resume guest execution to handle page fault */ +		return (0); +	} + +	error = memread(vm, vcpuid, mmio_gpa, &val, size, arg); +	if (error == 0) { +		vm_copyout(vm, vcpuid, &val, copyinfo, size); +		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp, +		    stackaddrsize); +		KASSERT(error == 0, ("error %d updating rsp", error)); +	} +#ifdef _KERNEL +	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); +#endif +	return (error); +} +  int  vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, -			mem_region_read_t memread, mem_region_write_t memwrite, -			void *memarg) +    struct vm_guest_paging *paging, mem_region_read_t memread, +    mem_region_write_t memwrite, void *memarg)  {  	int error; @@ -542,6 +840,14 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  		return (EINVAL);  	switch (vie->op.op_type) { +	case VIE_OP_TYPE_PUSH: +		error = emulate_push(vm, vcpuid, gpa, vie, paging, memread, +		    memwrite, memarg); +		break; +	case VIE_OP_TYPE_CMP: +		error = emulate_cmp(vm, vcpuid, gpa, vie, +				    memread, memwrite, memarg); +		break;  	case VIE_OP_TYPE_MOV:  		error = emulate_mov(vm, vcpuid, gpa, vie,  				    memread, memwrite, memarg); @@ -636,7 +942,7 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,  		 * then the descriptor is unusable and attempting to use  		 * it results in a #GP(0).  		 */ -		if (SEG_DESC_UNUSABLE(desc)) +		if (SEG_DESC_UNUSABLE(desc->access))  			return (-1);  		/*  @@ -645,13 +951,13 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,  		 * descriptor that is not present. If this was the case then  		 * it would have been checked before the VM-exit.  		 */ -		KASSERT(SEG_DESC_PRESENT(desc), ("segment %d not present: %#x", -		    seg, desc->access)); +		KASSERT(SEG_DESC_PRESENT(desc->access), +		    ("segment %d not present: %#x", seg, desc->access));  		/*  		 * The descriptor type must indicate a code/data segment.  		 */ -		type = SEG_DESC_TYPE(desc); +		type = SEG_DESC_TYPE(desc->access);  		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "  		    "descriptor type %#x", seg, type)); @@ -680,7 +986,8 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,  		if ((type & 0xC) == 0x4) {  			/* expand-down data segment */  			low_limit = desc->limit + 1; -			high_limit = SEG_DESC_DEF32(desc) ? 0xffffffff : 0xffff; +			high_limit = SEG_DESC_DEF32(desc->access) ? 
+			    0xffffffff : 0xffff;  		} else {  			/* code segment or expand-up data segment */  			low_limit = 0; @@ -947,45 +1254,24 @@ fault:  }  int -vmm_fetch_instruction(struct vm *vm, int cpuid, struct vm_guest_paging *paging, +vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,      uint64_t rip, int inst_length, struct vie *vie)  { -	int n, error, prot; -	uint64_t gpa, off; -	void *hpa, *cookie; - -	/* -	 * XXX cache previously fetched instructions using 'rip' as the tag -	 */ +	struct vm_copyinfo copyinfo[2]; +	int error, prot; -	prot = VM_PROT_READ | VM_PROT_EXECUTE;  	if (inst_length > VIE_INST_SIZE)  		panic("vmm_fetch_instruction: invalid length %d", inst_length); -	/* Copy the instruction into 'vie' */ -	while (vie->num_valid < inst_length) { -		error = vmm_gla2gpa(vm, cpuid, paging, rip, prot, &gpa); -		if (error) -			return (error); - -		off = gpa & PAGE_MASK; -		n = min(inst_length - vie->num_valid, PAGE_SIZE - off); - -		if ((hpa = vm_gpa_hold(vm, gpa, n, prot, &cookie)) == NULL) -			break; - -		bcopy(hpa, &vie->inst[vie->num_valid], n); - -		vm_gpa_release(cookie); - -		rip += n; -		vie->num_valid += n; +	prot = PROT_READ | PROT_EXEC; +	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot, +	    copyinfo, nitems(copyinfo)); +	if (error == 0) { +		vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length); +		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); +		vie->num_valid = inst_length;  	} - -	if (vie->num_valid == inst_length) -		return (0); -	else -		return (-1); +	return (error);  }  static int @@ -1007,24 +1293,65 @@ vie_advance(struct vie *vie)  }  static int -decode_rex(struct vie *vie) +decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)  {  	uint8_t x; -	if (vie_peek(vie, &x)) -		return (-1); +	while (1) { +		if (vie_peek(vie, &x)) +			return (-1); -	if (x >= 0x40 && x <= 0x4F) { -		vie->rex_present = 1; +		if (x == 0x66) +			vie->opsize_override = 1; +		else if (x == 0x67) +			vie->addrsize_override = 1; +		else +			break; + +		vie_advance(vie); +	} +	/* +	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2: +	 * - Only one REX prefix is allowed per instruction. +	 * - The REX prefix must immediately precede the opcode byte or the +	 *   escape opcode byte. +	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3) +	 *   the mandatory prefix must come before the REX prefix. +	 */ +	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) { +		vie->rex_present = 1;  		vie->rex_w = x & 0x8 ? 1 : 0;  		vie->rex_r = x & 0x4 ? 1 : 0;  		vie->rex_x = x & 0x2 ? 1 : 0;  		vie->rex_b = x & 0x1 ? 1 : 0; -  		vie_advance(vie);  	} +	/* +	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1 +	 */ +	if (cpu_mode == CPU_MODE_64BIT) { +		/* +		 * Default address size is 64-bits and default operand size +		 * is 32-bits. +		 */ +		vie->addrsize = vie->addrsize_override ? 4 : 8; +		if (vie->rex_w) +			vie->opsize = 8; +		else if (vie->opsize_override) +			vie->opsize = 2; +		else +			vie->opsize = 4; +	} else if (cs_d) { +		/* Default address and operand sizes are 32-bits */ +		vie->addrsize = vie->addrsize_override ? 2 : 4; +		vie->opsize = vie->opsize_override ? 2 : 4; +	} else { +		/* Default address and operand sizes are 16-bits */ +		vie->addrsize = vie->addrsize_override ? 4 : 2; +		vie->opsize = vie->opsize_override ? 
4 : 2; +	}  	return (0);  } @@ -1071,6 +1398,12 @@ decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)  {  	uint8_t x; +	if (cpu_mode == CPU_MODE_REAL) +		return (-1); + +	if (vie->op.op_flags & VIE_OP_F_NO_MODRM) +		return (0); +  	if (vie_peek(vie, &x))  		return (-1); @@ -1249,20 +1582,32 @@ decode_immediate(struct vie *vie)  	union {  		char	buf[4];  		int8_t	signed8; +		int16_t	signed16;  		int32_t	signed32;  	} u;  	/* Figure out immediate operand size (if any) */ -	if (vie->op.op_flags & VIE_OP_F_IMM) -		vie->imm_bytes = 4; -	else if (vie->op.op_flags & VIE_OP_F_IMM8) +	if (vie->op.op_flags & VIE_OP_F_IMM) { +		/* +		 * Section 2.2.1.5 "Immediates", Intel SDM: +		 * In 64-bit mode the typical size of immediate operands +		 * remains 32-bits. When the operand size if 64-bits, the +		 * processor sign-extends all immediates to 64-bits prior +		 * to their use. +		 */ +		if (vie->opsize == 4 || vie->opsize == 8) +			vie->imm_bytes = 4; +		else +			vie->imm_bytes = 2; +	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {  		vie->imm_bytes = 1; +	}  	if ((n = vie->imm_bytes) == 0)  		return (0); -	if (n != 1 && n != 4) -		panic("decode_immediate: invalid imm_bytes %d", n); +	KASSERT(n == 1 || n == 2 || n == 4, +	    ("%s: invalid number of immediate bytes: %d", __func__, n));  	for (i = 0; i < n; i++) {  		if (vie_peek(vie, &x)) @@ -1271,12 +1616,47 @@ decode_immediate(struct vie *vie)  		u.buf[i] = x;  		vie_advance(vie);  	} -	 + +	/* sign-extend the immediate value before use */  	if (n == 1) -		vie->immediate = u.signed8;		/* sign-extended */ +		vie->immediate = u.signed8; +	else if (n == 2) +		vie->immediate = u.signed16;  	else -		vie->immediate = u.signed32;		/* sign-extended */ +		vie->immediate = u.signed32; + +	return (0); +} + +static int +decode_moffset(struct vie *vie) +{ +	int i, n; +	uint8_t x; +	union { +		char	buf[8]; +		uint64_t u64; +	} u; + +	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0) +		return (0); + +	/* +	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM: +	 * The memory offset size follows the address-size of the instruction. 
+	 */ +	n = vie->addrsize; +	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n)); + +	u.u64 = 0; +	for (i = 0; i < n; i++) { +		if (vie_peek(vie, &x)) +			return (-1); +		u.buf[i] = x; +		vie_advance(vie); +	} +	vie->displacement = u.u64;  	return (0);  } @@ -1301,7 +1681,7 @@ static int  verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)  {  	int error; -	uint64_t base, idx; +	uint64_t base, idx, gla2;  	/* Skip 'gla' verification */  	if (gla == VIE_INVALID_GLA) @@ -1334,11 +1714,14 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)  		}  	} -	if (base + vie->scale * idx + vie->displacement != gla) { +	/* XXX assuming that the base address of the segment is 0 */ +	gla2 = base + vie->scale * idx + vie->displacement; +	gla2 &= size2mask[vie->addrsize]; +	if (gla != gla2) {  		printf("verify_gla mismatch: "  		       "base(0x%0lx), scale(%d), index(0x%0lx), " -		       "disp(0x%0lx), gla(0x%0lx)\n", -		       base, vie->scale, idx, vie->displacement, gla); +		       "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", +		       base, vie->scale, idx, vie->displacement, gla, gla2);  		return (-1);  	} @@ -1347,13 +1730,11 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)  int  vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, -		       enum vm_cpu_mode cpu_mode, struct vie *vie) +		       enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)  { -	if (cpu_mode == CPU_MODE_64BIT) { -		if (decode_rex(vie)) -			return (-1); -	} +	if (decode_prefixes(vie, cpu_mode, cs_d)) +		return (-1);  	if (decode_opcode(vie))  		return (-1); @@ -1366,10 +1747,13 @@ vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,  	if (decode_displacement(vie))  		return (-1); -	 +  	if (decode_immediate(vie))  		return (-1); +	if (decode_moffset(vie)) +		return (-1); +  	if (verify_inst_length(vie))  		return (-1); diff --git a/sys/x86/include/specialreg.h b/sys/x86/include/specialreg.h index c0cef2ca8717..86106845b72a 100644 --- a/sys/x86/include/specialreg.h +++ b/sys/x86/include/specialreg.h @@ -436,6 +436,25 @@  #define	MSR_MC4_MISC		0x413  /* + * VMX MSRs + */ +#define	MSR_VMX_BASIC		0x480 +#define	MSR_VMX_PINBASED_CTLS	0x481 +#define	MSR_VMX_PROCBASED_CTLS	0x482 +#define	MSR_VMX_EXIT_CTLS	0x483 +#define	MSR_VMX_ENTRY_CTLS	0x484 +#define	MSR_VMX_CR0_FIXED0	0x486 +#define	MSR_VMX_CR0_FIXED1	0x487 +#define	MSR_VMX_CR4_FIXED0	0x488 +#define	MSR_VMX_CR4_FIXED1	0x489 +#define	MSR_VMX_PROCBASED_CTLS2	0x48b +#define	MSR_VMX_EPT_VPID_CAP	0x48c +#define	MSR_VMX_TRUE_PINBASED_CTLS	0x48d +#define	MSR_VMX_TRUE_PROCBASED_CTLS	0x48e +#define	MSR_VMX_TRUE_EXIT_CTLS	0x48f +#define	MSR_VMX_TRUE_ENTRY_CTLS	0x490 + +/*   * X2APIC MSRs   */  #define	MSR_APIC_ID		0x802 diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile index 23e16cb7d0d3..1c95f77dff2b 100644 --- a/usr.sbin/bhyve/Makefile +++ b/usr.sbin/bhyve/Makefile @@ -35,6 +35,7 @@ SRCS=	\  	post.c			\  	rtc.c			\  	smbiostbl.c		\ +	task_switch.c		\  	uart_emul.c		\  	virtio.c		\  	xmsr.c			\ diff --git a/usr.sbin/bhyve/acpi.c b/usr.sbin/bhyve/acpi.c index c4ec020bd5e1..5dea3001de22 100644 --- a/usr.sbin/bhyve/acpi.c +++ b/usr.sbin/bhyve/acpi.c @@ -40,12 +40,13 @@   *  Layout   *  ------   *   RSDP  ->   0xf2400    (36 bytes fixed) - *     RSDT  ->   0xf2440    (36 bytes + 4*N table addrs, 2 used) - *     XSDT  ->   0xf2480    (36 bytes + 8*N table addrs, 2 used) + *     RSDT  ->   0xf2440    (36 bytes + 4*7 table addrs, 4 used) + *     XSDT  ->   0xf2480    (36 bytes + 8*7 table 
addrs, 4 used)   *       MADT  ->   0xf2500  (depends on #CPUs)   *       FADT  ->   0xf2600  (268 bytes)   *       HPET  ->   0xf2740  (56 bytes) - *         FACS  ->   0xf2780 (64 bytes) + *       MCFG  ->   0xf2780  (60 bytes) + *         FACS  ->   0xf27C0 (64 bytes)   *         DSDT  ->   0xf2800 (variable - can go up to 0x100000)   */ @@ -80,7 +81,8 @@ __FBSDID("$FreeBSD$");  #define MADT_OFFSET		0x100  #define FADT_OFFSET		0x200  #define	HPET_OFFSET		0x340 -#define FACS_OFFSET		0x380 +#define	MCFG_OFFSET		0x380 +#define FACS_OFFSET		0x3C0  #define DSDT_OFFSET		0x400  #define	BHYVE_ASL_TEMPLATE	"bhyve.XXXXXXX" @@ -178,6 +180,8 @@ basl_fwrite_rsdt(FILE *fp)  	    basl_acpi_base + FADT_OFFSET);  	EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : %08X\n",  	    basl_acpi_base + HPET_OFFSET); +	EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : %08X\n", +	    basl_acpi_base + MCFG_OFFSET);  	EFFLUSH(fp); @@ -216,6 +220,8 @@ basl_fwrite_xsdt(FILE *fp)  	    basl_acpi_base + FADT_OFFSET);  	EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : 00000000%08X\n",  	    basl_acpi_base + HPET_OFFSET); +	EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : 00000000%08X\n", +	    basl_acpi_base + MCFG_OFFSET);  	EFFLUSH(fp); @@ -583,6 +589,39 @@ err_exit:  }  static int +basl_fwrite_mcfg(FILE *fp) +{ +	int err = 0; + +	EFPRINTF(fp, "/*\n"); +	EFPRINTF(fp, " * bhyve MCFG template\n"); +	EFPRINTF(fp, " */\n"); +	EFPRINTF(fp, "[0004]\t\tSignature : \"MCFG\"\n"); +	EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); +	EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); +	EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); +	EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); +	EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMCFG  \"\n"); +	EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + +	/* iasl will fill in the compiler ID/revision fields */ +	EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); +	EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); +	EFPRINTF(fp, "[0008]\t\tReserved : 0\n"); +	EFPRINTF(fp, "\n"); + +	EFPRINTF(fp, "[0008]\t\tBase Address : %016lX\n", pci_ecfg_base()); +	EFPRINTF(fp, "[0002]\t\tSegment Group: 0000\n"); +	EFPRINTF(fp, "[0001]\t\tStart Bus: 00\n"); +	EFPRINTF(fp, "[0001]\t\tEnd Bus: FF\n"); +	EFPRINTF(fp, "[0004]\t\tReserved : 0\n"); +	EFFLUSH(fp); +	return (0); +err_exit: +	return (errno); +} + +static int  basl_fwrite_facs(FILE *fp)  {  	int err; @@ -921,6 +960,7 @@ static struct {  	{ basl_fwrite_madt, MADT_OFFSET },  	{ basl_fwrite_fadt, FADT_OFFSET },  	{ basl_fwrite_hpet, HPET_OFFSET }, +	{ basl_fwrite_mcfg, MCFG_OFFSET },  	{ basl_fwrite_facs, FACS_OFFSET },  	{ basl_fwrite_dsdt, DSDT_OFFSET },  	{ NULL } diff --git a/usr.sbin/bhyve/atkbdc.c b/usr.sbin/bhyve/atkbdc.c index 6e13c1910334..930b7af95f3d 100644 --- a/usr.sbin/bhyve/atkbdc.c +++ b/usr.sbin/bhyve/atkbdc.c @@ -31,6 +31,10 @@ __FBSDID("$FreeBSD$");  #include <machine/vmm.h> +#include <vmmapi.h> + +#include <assert.h> +#include <errno.h>  #include <stdio.h>  #include "inout.h" @@ -48,29 +52,30 @@ atkbdc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,      uint32_t *eax, void *arg)  {  	if (bytes != 1) -		return (INOUT_ERROR); +		return (-1);  	*eax = 0; -	return (INOUT_OK); +	return (0);  }  static int  atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port,      int bytes, uint32_t *eax, void *arg)  { -	int retval; +	int error, retval;  	if (bytes != 1) -		return (INOUT_ERROR); +		return (-1); -	retval = INOUT_OK; +	retval = 0;  	if (in) {  		*eax = KBD_SYS_FLAG;	/* system passed POST */  	} 
else {  		switch (*eax) {  		case KBDC_RESET:	/* Pulse "reset" line. */ -			retval = INOUT_RESET; +			error = vm_suspend(ctx, VM_SUSPEND_RESET); +			assert(error == 0 || errno == EALREADY);  			break;  		}  	} diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8 index aad1aef70635..80b814a4882e 100644 --- a/usr.sbin/bhyve/bhyve.8 +++ b/usr.sbin/bhyve/bhyve.8 @@ -24,7 +24,7 @@  .\"  .\" $FreeBSD$  .\" -.Dd April 2, 2014 +.Dd June 26, 2014  .Dt BHYVE 8  .Os  .Sh NAME @@ -32,12 +32,14 @@  .Nd "run a guest operating system inside a virtual machine"  .Sh SYNOPSIS  .Nm -.Op Fl aehwxACHPW +.Op Fl abehwxACHPWY  .Op Fl c Ar numcpus  .Op Fl g Ar gdbport +.Op Fl l Ar lpcdev Ns Op , Ns Ar conf +.Op Fl m Ar size Ns Op Ar K|k|M|m|G|g|T|t  .Op Fl p Ar vcpu:hostcpu  .Op Fl s Ar slot,emulation Ns Op , Ns Ar conf -.Op Fl l Ar lpcdev Ns Op , Ns Ar conf +.Op Fl U Ar uuid  .Ar vmname  .Sh DESCRIPTION  .Nm @@ -66,21 +68,49 @@ Generate ACPI tables.  Required for  .Fx Ns /amd64  guests. +.It Fl b +Enable a low-level console device supported by +.Fx kernels compiled with +.Cd "device bvmconsole" . +This option will be deprecated in a future version.  .It Fl c Ar numcpus  Number of guest virtual CPUs.  The default is 1 and the maximum is 16.  .It Fl C  Include guest memory in core file. -.It Fl H -Yield the virtual CPU thread when a HLT instruction is detected. -If this option is not specified, virtual CPUs will use 100% of a host CPU. +.It Fl e +Force +.Nm +to exit when a guest issues an access to an I/O port that is not emulated. +This is intended for debug purposes.  .It Fl g Ar gdbport  For -.Fx Ns /amd64 kernels compiled with -.Cd "option bvmdebug" , +.Fx +kernels compiled with +.Cd "device bvmdebug" ,  allow a remote kernel kgdb to be relayed to the guest kernel gdb stub  via a local IPv4 address and this port.  This option will be deprecated in a future version. +.It Fl h +Print help message and exit. +.It Fl H +Yield the virtual CPU thread when a HLT instruction is detected. +If this option is not specified, virtual CPUs will use 100% of a host CPU. +.It Fl l Ar lpcdev Ns Op , Ns Ar conf +Allow devices behind the LPC PCI-ISA bridge to be configured. +The only supported devices are the TTY-class devices, +.Li com1 +and +.Li com2 . +.It Fl m Ar size Ns Op Ar K|k|M|m|G|g|T|t +Guest physical memory size in bytes. +This must be the same size that was given to +.Xr bhyveload 8 . +.Pp +The size argument may be suffixed with one of K, M, G or T (either upper +or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes, +or terabytes. +If no suffix is given, the value is assumed to be in megabytes.  .It Fl p Ar vcpu:hostcpu  Pin guest's virtual CPU  .Em vcpu @@ -88,9 +118,6 @@ to  .Em hostcpu .  .It Fl P  Force the guest virtual CPU to exit when a PAUSE instruction is detected. -.It Fl W -Force virtio PCI device emulations to use MSI interrupts instead of MSI-X -interrupts.  .It Fl s Ar slot,emulation Ns Op , Ns Ar conf  Configure a virtual PCI slot and function.  .Pp @@ -211,34 +238,21 @@ The host device must have been reserved at boot-time using the  loader variable as described in  .Xr vmm 4 .  .El -.It Fl l Ar lpcdev Ns Op , Ns Ar conf -Allow devices behind the LPC PCI-ISA bridge to be configured. -The only supported devices are the TTY-class devices, -.Li com1 -and -.Li com2 . -.It Fl m Ar size Ns Op Ar K|k|M|m|G|g|T|t -Guest physical memory size in bytes. -This must be the same size that was given to -.Xr bhyveload 8 . 
-.Pp -The size argument may be suffixed with one of K, M, G or T (either upper -or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes, -or terabytes. -If no suffix is given, the value is assumed to be in megabytes. -.It Fl e -Force -.Nm -to exit when a guest issues an access to an I/O port that is not emulated. -This is intended for debug purposes. +.It Fl U Ar uuid +Set the universally unique identifier +.Pq UUID +in the guest's System Management BIOS System Information structure. +By default a UUID is generated from the host's hostname and +.Ar vmname .  .It Fl w  Ignore accesses to unimplemented Model Specific Registers (MSRs). This is intended for debug purposes. +.It Fl W +Force virtio PCI device emulations to use MSI interrupts instead of MSI-X +interrupts.  .It Fl x  The guest's local APIC is configured in x2APIC mode.  .It Fl Y  Disable MPtable generation. -.It Fl h -Print help message and exit.  .It Ar vmname  Alphanumeric name of the guest.  This should be the same as that created by diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c index 1e5d3b33abd2..7dcf6d016b87 100644 --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -69,16 +69,11 @@ __FBSDID("$FreeBSD$");  #define GUEST_NIO_PORT		0x488	/* guest upcalls via i/o port */ -#define	VMEXIT_CONTINUE		1	/* continue from next instruction */ -#define	VMEXIT_RESTART		2	/* restart current instruction */ -#define	VMEXIT_ABORT		3	/* abort the vm run loop */ -#define	VMEXIT_RESET		4	/* guest machine has reset */ -#define	VMEXIT_POWEROFF		5	/* guest machine has powered off */ -  #define MB		(1024UL * 1024)  #define GB		(1024UL * MB)  typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); +extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu);  char *vmname; @@ -101,7 +96,7 @@ static cpuset_t cpumask;  static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); -struct vm_exit vmexit[VM_MAXCPU]; +static struct vm_exit vmexit[VM_MAXCPU];  struct bhyvestats {          uint64_t        vmexit_bogus; @@ -112,8 +107,6 @@ struct bhyvestats {          uint64_t        vmexit_inst_emul;          uint64_t        cpu_switch_rotate;          uint64_t        cpu_switch_direct; -        int             io_reset; -	int		io_poweroff;  } stats;  struct mt_vmm_info { @@ -129,26 +122,26 @@ usage(int code)  {          fprintf(stderr, -                "Usage: %s [-aehwAHIPW] [-g <gdb port>] [-s <pci>] [-c vcpus]\n" -		"       %*s [-p vcpu:hostcpu] [-m mem] [-l <lpc>] <vm>\n" +                "Usage: %s [-abehwxACHPWY] [-c vcpus] [-g <gdb port>] [-l <lpc>]\n" +		"       %*s [-m mem] [-p vcpu:hostcpu] [-s <pci>] [-U uuid] <vm>\n"  		"       -a: local apic is in xAPIC mode (deprecated)\n" -		"       -A: create an ACPI table\n" -		"       -g: gdb port\n" +		"       -A: create ACPI tables\n"  		"       -c: # cpus (default 1)\n"  		"       -C: include guest memory in core file\n" -		"       -p: pin 'vcpu' to 'hostcpu'\n" -		"       -H: vmexit from the guest on hlt\n" -		"       -P: vmexit from the guest on pause\n" -		"       -W: force virtio to use single-vector MSI\n"  		"       -e: exit on unhandled I/O access\n" +		"       -g: gdb port\n"  		"       -h: help\n" -		"       -s: <slot,driver,configinfo> PCI slot config\n" +		"       -H: vmexit from the guest on hlt\n"  		"       -l: LPC device configuration\n"  		"       -m: memory size in MB\n" +		"       -p: pin 'vcpu' to 'hostcpu'\n" +		"       -P: vmexit from the guest on pause\n" +		"       -s: 
<slot,driver,configinfo> PCI slot config\n" +		"       -U: uuid\n"  		"       -w: ignore unimplemented MSRs\n" +		"       -W: force virtio to use single-vector MSI\n"  		"       -x: local apic is in x2APIC mode\n" -		"       -Y: disable MPtable generation\n" -		"       -U: uuid\n", +		"       -Y: disable MPtable generation\n",  		progname, (int)strlen(progname), "");  	exit(code); @@ -187,6 +180,27 @@ pincpu_parse(const char *opt)  	return (0);  } +void +vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid, +    int errcode) +{ +	struct vmctx *ctx; +	int error; + +	ctx = arg; +	if (errcode_valid) +		error = vm_inject_exception2(ctx, vcpu, vector, errcode); +	else +		error = vm_inject_exception(ctx, vcpu, vector); +	assert(error == 0); + +	/* +	 * Set the instruction length to 0 to ensure that the instruction is +	 * restarted when the fault handler returns. +	 */ +	vmexit[vcpu].inst_length = 0; +} +  void *  paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)  { @@ -315,27 +329,18 @@ vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)  	}  	error = emulate_inout(ctx, vcpu, vme, strictio); -	if (error == INOUT_OK && in && !string) { +	if (!error && in && !string) {  		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX,  		    vme->u.inout.eax); +		assert(error == 0);  	} -	switch (error) { -	case INOUT_OK: -		return (VMEXIT_CONTINUE); -	case INOUT_RESTART: -		return (VMEXIT_RESTART); -	case INOUT_RESET: -		stats.io_reset++; -		return (VMEXIT_RESET); -	case INOUT_POWEROFF: -		stats.io_poweroff++; -		return (VMEXIT_POWEROFF); -	default: -		fprintf(stderr, "Unhandled %s%c 0x%04x\n", -			in ? "in" : "out", -			bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port); +	if (error) { +		fprintf(stderr, "Unhandled %s%c 0x%04x\n", in ? "in" : "out", +		    bytes == 1 ? 'b' : (bytes == 2 ? 
'w' : 'l'), port);  		return (VMEXIT_ABORT); +	} else { +		return (VMEXIT_CONTINUE);  	}  } @@ -352,8 +357,7 @@ vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)  		fprintf(stderr, "rdmsr to register %#x on vcpu %d\n",  		    vme->u.msr.code, *pvcpu);  		if (strictmsr) { -			error = vm_inject_exception2(ctx, *pvcpu, IDT_GP, 0); -			assert(error == 0); +			vm_inject_gp(ctx, *pvcpu);  			return (VMEXIT_RESTART);  		}  	} @@ -379,8 +383,7 @@ vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)  		fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",  		    vme->u.msr.code, vme->u.msr.wval, *pvcpu);  		if (strictmsr) { -			error = vm_inject_exception2(ctx, *pvcpu, IDT_GP, 0); -			assert(error == 0); +			vm_inject_gp(ctx, *pvcpu);  			return (VMEXIT_RESTART);  		}  	} @@ -399,6 +402,16 @@ vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)  	return (retval);  } +#define	DEBUG_EPT_MISCONFIG +#ifdef DEBUG_EPT_MISCONFIG +#define	EXIT_REASON_EPT_MISCONFIG	49 +#define	VMCS_GUEST_PHYSICAL_ADDRESS	0x00002400 +#define	VMCS_IDENT(x)			((x) | 0x80000000) + +static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; +static int ept_misconfig_ptenum; +#endif +  static int  vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)  { @@ -413,7 +426,21 @@ vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)  	    vmexit->u.vmx.exit_qualification);  	fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type);  	fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); - +#ifdef DEBUG_EPT_MISCONFIG +	if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { +		vm_get_register(ctx, *pvcpu, +		    VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), +		    &ept_misconfig_gpa); +		vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, +		    &ept_misconfig_ptenum); +		fprintf(stderr, "\tEPT misconfiguration:\n"); +		fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); +		fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", +		    ept_misconfig_ptenum, ept_misconfig_pte[0], +		    ept_misconfig_pte[1], ept_misconfig_pte[2], +		    ept_misconfig_pte[3]); +	} +#endif	/* DEBUG_EPT_MISCONFIG */  	return (VMEXIT_ABORT);  } @@ -465,7 +492,7 @@ vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)  	stats.vmexit_inst_emul++;  	err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa, -			  &vmexit->u.inst_emul.vie); +	    &vmexit->u.inst_emul.vie, &vmexit->u.inst_emul.paging);  	if (err) {  		if (err == EINVAL) { @@ -515,6 +542,8 @@ vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)  		exit(1);  	case VM_SUSPEND_HALT:  		exit(2); +	case VM_SUSPEND_TRIPLEFAULT: +		exit(3);  	default:  		fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how);  		exit(100); @@ -532,7 +561,8 @@ static vmexit_handler_t handler[VM_EXITCODE_MAX] = {  	[VM_EXITCODE_MTRAP]  = vmexit_mtrap,  	[VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,  	[VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, -	[VM_EXITCODE_SUSPENDED] = vmexit_suspend +	[VM_EXITCODE_SUSPENDED] = vmexit_suspend, +	[VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch,  };  static void @@ -540,7 +570,6 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)  {  	int error, rc, prevcpu;  	enum vm_exitcode exitcode; -	enum vm_suspend_how how;  	cpuset_t active_cpus;  	if (vcpumap[vcpu] != NULL) { @@ -575,16 +604,6 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)  		case VMEXIT_RESTART:                          rip = vmexit[vcpu].rip;  			break; -		case VMEXIT_RESET: -		case 
VMEXIT_POWEROFF: -			if (rc == VMEXIT_RESET) -				how = VM_SUSPEND_RESET; -			else -				how = VM_SUSPEND_POWEROFF; -			error = vm_suspend(ctx, how); -			assert(error == 0 || errno == EALREADY); -                        rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length; -			break;  		case VMEXIT_ABORT:  			abort();  		default: diff --git a/usr.sbin/bhyve/bhyverun.h b/usr.sbin/bhyve/bhyverun.h index f18d42fc3f01..87824ef9f4f4 100644 --- a/usr.sbin/bhyve/bhyverun.h +++ b/usr.sbin/bhyve/bhyverun.h @@ -35,6 +35,10 @@  #define	__CTASSERT(x, y)	typedef char __assert ## y[(x) ? 1 : -1]  #endif +#define	VMEXIT_CONTINUE		1	/* continue from next instruction */ +#define	VMEXIT_RESTART		2	/* restart current instruction */ +#define	VMEXIT_ABORT		3	/* abort the vm run loop */ +  struct vmctx;  extern int guest_ncpus;  extern char *guest_uuid_str; diff --git a/usr.sbin/bhyve/block_if.c b/usr.sbin/bhyve/block_if.c index b29bc7856e86..1ec0344f3fca 100644 --- a/usr.sbin/bhyve/block_if.c +++ b/usr.sbin/bhyve/block_if.c @@ -390,6 +390,55 @@ blockif_close(struct blockif_ctxt *bc)  }  /* + * Return virtual C/H/S values for a given block. Use the algorithm + * outlined in the VHD specification to calculate values. + */ +void +blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) +{ +	off_t sectors;		/* total sectors of the block dev */ +	off_t hcyl;		/* cylinders times heads */ +	uint16_t secpt;		/* sectors per track */ +	uint8_t heads; + +	assert(bc->bc_magic == BLOCKIF_SIG); + +	sectors = bc->bc_size / bc->bc_sectsz; + +	/* Clamp the size to the largest possible with CHS */ +	if (sectors > 65535UL*16*255) +		sectors = 65535UL*16*255; + +	if (sectors >= 65536UL*16*63) { +		secpt = 255; +		heads = 16; +		hcyl = sectors / secpt; +	} else { +		secpt = 17; +		hcyl = sectors / secpt; +		heads = (hcyl + 1023) / 1024; + +		if (heads < 4) +			heads = 4; + +		if (hcyl >= (heads * 1024) || heads > 16) { +			secpt = 31; +			heads = 16; +			hcyl = sectors / secpt; +		} +		if (hcyl >= (heads * 1024)) { +			secpt = 63; +			heads = 16; +			hcyl = sectors / secpt; +		} +	} + +	*c = hcyl / heads; +	*h = heads; +	*s = secpt; +} + +/*   * Accessors   */  off_t diff --git a/usr.sbin/bhyve/block_if.h b/usr.sbin/bhyve/block_if.h index e0c0bb1f8c8b..c2c21f657446 100644 --- a/usr.sbin/bhyve/block_if.h +++ b/usr.sbin/bhyve/block_if.h @@ -52,6 +52,8 @@ struct blockif_req {  struct blockif_ctxt;  struct blockif_ctxt *blockif_open(const char *optstr, const char *ident);  off_t	blockif_size(struct blockif_ctxt *bc); +void	blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, +    uint8_t *s);  int	blockif_sectsz(struct blockif_ctxt *bc);  int	blockif_queuesz(struct blockif_ctxt *bc);  int	blockif_is_ro(struct blockif_ctxt *bc); diff --git a/usr.sbin/bhyve/inout.c b/usr.sbin/bhyve/inout.c index fe9e0d85625b..1041a59d2194 100644 --- a/usr.sbin/bhyve/inout.c +++ b/usr.sbin/bhyve/inout.c @@ -154,31 +154,28 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)  		/* Limit number of back-to-back in/out emulations to 16 */  		iterations = MIN(count, 16);  		while (iterations > 0) { +			assert(retval == 0);  			if (vie_calculate_gla(vis->paging.cpu_mode,  			    vis->seg_name, &vis->seg_desc, index, bytes,  			    addrsize, prot, &gla)) { -				error = vm_inject_exception2(ctx, vcpu, -				    IDT_GP, 0); -				assert(error == 0); -				retval = INOUT_RESTART; +				vm_inject_gp(ctx, vcpu);  				break;  			} -			error = vm_gla2gpa(ctx, vcpu, &vis->paging, gla, bytes, -			    prot, iov, 
nitems(iov)); -			assert(error == 0 || error == 1 || error == -1); -			if (error) { -				retval = (error == 1) ? INOUT_RESTART : -				    INOUT_ERROR; +			error = vm_copy_setup(ctx, vcpu, &vis->paging, gla, +			    bytes, prot, iov, nitems(iov)); +			if (error == -1) { +				retval = -1;  /* Unrecoverable error */ +				break; +			} else if (error == 1) { +				retval = 0;  /* Resume guest to handle fault */  				break;  			}  			if (vie_alignment_check(vis->paging.cpl, bytes,  			    vis->cr0, vis->rflags, gla)) { -				error = vm_inject_exception2(ctx, vcpu, -				    IDT_AC, 0); -				assert(error == 0); -				return (INOUT_RESTART); +				vm_inject_ac(ctx, vcpu, 0); +				break;  			}  			val = 0; @@ -217,8 +214,8 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)  		}  		/* Restart the instruction if more iterations remain */ -		if (retval == INOUT_OK && count != 0) -			retval = INOUT_RESTART; +		if (retval == 0 && count != 0) +			vmexit->inst_length = 0;  	} else {  		if (!in) {  			val = vmexit->u.inout.eax & vie_size2mask(bytes); diff --git a/usr.sbin/bhyve/inout.h b/usr.sbin/bhyve/inout.h index f15a2c87db72..7f390951d418 100644 --- a/usr.sbin/bhyve/inout.h +++ b/usr.sbin/bhyve/inout.h @@ -34,13 +34,9 @@  struct vmctx;  struct vm_exit; -/* Handler return values. */ -#define	INOUT_ERROR	-1 -#define	INOUT_OK	0 -#define	INOUT_RESTART	1 -#define	INOUT_RESET	2 -#define	INOUT_POWEROFF	3 - +/* + * inout emulation handlers return 0 on success and -1 on failure. + */  typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port,  			    int bytes, uint32_t *eax, void *arg); diff --git a/usr.sbin/bhyve/mem.c b/usr.sbin/bhyve/mem.c index 7ea630f2a587..2a9f430c8262 100644 --- a/usr.sbin/bhyve/mem.c +++ b/usr.sbin/bhyve/mem.c @@ -157,10 +157,12 @@ mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg)  }  int -emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie) +emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie, +    struct vm_guest_paging *paging) +  {  	struct mmio_rb_range *entry; -	int err; +	int err, immutable;  	pthread_rwlock_rdlock(&mmio_rwlock);  	/* @@ -184,10 +186,28 @@ emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie)  	}  	assert(entry != NULL); -	err = vmm_emulate_instruction(ctx, vcpu, paddr, vie, + +	/* +	 * An 'immutable' memory range is guaranteed to be never removed +	 * so there is no need to hold 'mmio_rwlock' while calling the +	 * handler. +	 * +	 * XXX writes to the PCIR_COMMAND register can cause register_mem() +	 * to be called. If the guest is using PCI extended config space +	 * to modify the PCIR_COMMAND register then register_mem() can +	 * deadlock on 'mmio_rwlock'. However by registering the extended +	 * config space window as 'immutable' the deadlock can be avoided. 
+	 */ +	immutable = (entry->mr_param.flags & MEM_F_IMMUTABLE); +	if (immutable) +		pthread_rwlock_unlock(&mmio_rwlock); + +	err = vmm_emulate_instruction(ctx, vcpu, paddr, vie, paging,  				      mem_read, mem_write, &entry->mr_param); -	pthread_rwlock_unlock(&mmio_rwlock); -	 + +	if (!immutable) +		pthread_rwlock_unlock(&mmio_rwlock); +  	return (err);  } @@ -244,6 +264,7 @@ unregister_mem(struct mem_range *memp)  		mr = &entry->mr_param;  		assert(mr->name == memp->name);  		assert(mr->base == memp->base && mr->size == memp->size);  +		assert((mr->flags & MEM_F_IMMUTABLE) == 0);  		RB_REMOVE(mmio_rb_tree, &mmio_rb_root, entry);  		/* flush Per-vCPU cache */	 diff --git a/usr.sbin/bhyve/mem.h b/usr.sbin/bhyve/mem.h index 264bff9e82b0..f671eaedf786 100644 --- a/usr.sbin/bhyve/mem.h +++ b/usr.sbin/bhyve/mem.h @@ -48,9 +48,11 @@ struct mem_range {  #define	MEM_F_READ		0x1  #define	MEM_F_WRITE		0x2  #define	MEM_F_RW		0x3 +#define	MEM_F_IMMUTABLE		0x4	/* mem_range cannot be unregistered */  void	init_mem(void); -int     emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie); +int     emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie, +		    struct vm_guest_paging *paging);  int	register_mem(struct mem_range *memp);  int	register_mem_fallback(struct mem_range *memp); diff --git a/usr.sbin/bhyve/pci_ahci.c b/usr.sbin/bhyve/pci_ahci.c index 9f6110730b09..214237df3757 100644 --- a/usr.sbin/bhyve/pci_ahci.c +++ b/usr.sbin/bhyve/pci_ahci.c @@ -336,8 +336,9 @@ ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd)  	fis[13] = cfis[13];  	if (fis[2] & ATA_S_ERROR)  		p->is |= AHCI_P_IX_TFE; +	else +		p->ci &= ~(1 << slot);  	p->tfd = tfd; -	p->ci &= ~(1 << slot);  	ahci_write_fis(p, FIS_TYPE_REGD2H, fis);  } @@ -598,10 +599,16 @@ handle_identify(struct ahci_port *p, int slot, uint8_t *cfis)  	} else {  		uint16_t buf[256];  		uint64_t sectors; +		uint16_t cyl; +		uint8_t sech, heads;  		sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); +		blockif_chs(p->bctx, &cyl, &heads, &sech);  		memset(buf, 0, sizeof(buf));  		buf[0] = 0x0040; +		buf[1] = cyl; +		buf[3] = heads; +		buf[6] = sech;  		/* TODO emulate different serial? 
*/  		ata_string((uint8_t *)(buf+10), "123456", 20);  		ata_string((uint8_t *)(buf+23), "001", 8); @@ -645,8 +652,8 @@ handle_identify(struct ahci_port *p, int slot, uint8_t *cfis)  		write_prdt(p, slot, cfis, (void *)buf, sizeof(buf));  		p->tfd = ATA_S_DSC | ATA_S_READY;  		p->is |= AHCI_P_IX_DP; +		p->ci &= ~(1 << slot);  	} -	p->ci &= ~(1 << slot);  	ahci_generate_intr(p->pr_sc);  } @@ -688,8 +695,8 @@ handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis)  		write_prdt(p, slot, cfis, (void *)buf, sizeof(buf));  		p->tfd = ATA_S_DSC | ATA_S_READY;  		p->is |= AHCI_P_IX_DHR; +		p->ci &= ~(1 << slot);  	} -	p->ci &= ~(1 << slot);  	ahci_generate_intr(p->pr_sc);  } @@ -1292,7 +1299,6 @@ ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis)  		if (!p->atapi) {  			p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR;  			p->is |= AHCI_P_IX_TFE; -			p->ci &= ~(1 << slot);  			ahci_generate_intr(p->pr_sc);  		} else  			handle_packet_cmd(p, slot, cfis); @@ -1301,7 +1307,6 @@ ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis)  		WPRINTF("Unsupported cmd:%02x\n", cfis[2]);  		p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR;  		p->is |= AHCI_P_IX_TFE; -		p->ci &= ~(1 << slot);  		ahci_generate_intr(p->pr_sc);  		break;  	} @@ -1369,8 +1374,11 @@ ahci_handle_port(struct ahci_port *p)  	 * are already in-flight.  	 */  	for (i = 0; (i < 32) && p->ci; i++) { -		if ((p->ci & (1 << i)) && !(p->pending & (1 << i))) +		if ((p->ci & (1 << i)) && !(p->pending & (1 << i))) { +			p->cmd &= ~AHCI_P_CMD_CCS_MASK; +			p->cmd |= i << AHCI_P_CMD_CCS_SHIFT;  			ahci_handle_slot(p, i); +		}  	}  } diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c index 458ba76480b1..6b906ede42fc 100644 --- a/usr.sbin/bhyve/pci_emul.c +++ b/usr.sbin/bhyve/pci_emul.c @@ -109,16 +109,20 @@ static uint64_t pci_emul_membase64;  #define	PCI_EMUL_IOBASE		0x2000  #define	PCI_EMUL_IOLIMIT	0x10000 -#define	PCI_EMUL_MEMLIMIT32	0xE0000000	/* 3.5GB */ +#define	PCI_EMUL_ECFG_BASE	0xE0000000		    /* 3.5GB */ +#define	PCI_EMUL_ECFG_SIZE	(MAXBUSES * 1024 * 1024)    /* 1MB per bus */ +SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE); + +#define	PCI_EMUL_MEMLIMIT32	PCI_EMUL_ECFG_BASE  #define	PCI_EMUL_MEMBASE64	0xD000000000UL  #define	PCI_EMUL_MEMLIMIT64	0xFD00000000UL  static struct pci_devemu *pci_emul_finddev(char *name); -static void	pci_lintr_route(struct pci_devinst *pi); -static void	pci_lintr_update(struct pci_devinst *pi); - -static struct mem_range pci_mem_hole; +static void pci_lintr_route(struct pci_devinst *pi); +static void pci_lintr_update(struct pci_devinst *pi); +static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, +    int func, int coff, int bytes, uint32_t *val);  /*   * I/O access @@ -1023,12 +1027,37 @@ pci_emul_fallback_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,  	return (0);  } +static int +pci_emul_ecfg_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, +    int bytes, uint64_t *val, void *arg1, long arg2) +{ +	int bus, slot, func, coff, in; + +	coff = addr & 0xfff; +	func = (addr >> 12) & 0x7; +	slot = (addr >> 15) & 0x1f; +	bus = (addr >> 20) & 0xff; +	in = (dir == MEM_F_READ); +	if (in) +		*val = ~0UL; +	pci_cfgrw(ctx, vcpu, in, bus, slot, func, coff, bytes, (uint32_t *)val); +	return (0); +} + +uint64_t +pci_ecfg_base(void) +{ + +	return (PCI_EMUL_ECFG_BASE); +} +  #define	BUSIO_ROUNDUP		32  #define	BUSMEM_ROUNDUP		(1024 * 1024)  int  init_pci(struct vmctx *ctx)  { +	struct mem_range mr;  	struct 
pci_devemu *pde;  	struct businfo *bi;  	struct slotinfo *si; @@ -1112,22 +1141,34 @@ init_pci(struct vmctx *ctx)  	 * The guest physical memory map looks like the following:  	 * [0,		    lowmem)		guest system memory  	 * [lowmem,	    lowmem_limit)	memory hole (may be absent) -	 * [lowmem_limit,   4GB)		PCI hole (32-bit BAR allocation) +	 * [lowmem_limit,   0xE0000000)		PCI hole (32-bit BAR allocation) +	 * [0xE0000000,	    0xF0000000)		PCI extended config window +	 * [0xF0000000,	    4GB)		LAPIC, IOAPIC, HPET, firmware  	 * [4GB,	    4GB + highmem) -	 * +	 */ + +	/*  	 * Accesses to memory addresses that are not allocated to system  	 * memory or PCI devices return 0xff's.  	 */  	lowmem = vm_get_lowmem_size(ctx); +	bzero(&mr, sizeof(struct mem_range)); +	mr.name = "PCI hole"; +	mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; +	mr.base = lowmem; +	mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem; +	mr.handler = pci_emul_fallback_handler; +	error = register_mem_fallback(&mr); +	assert(error == 0); -	memset(&pci_mem_hole, 0, sizeof(struct mem_range)); -	pci_mem_hole.name = "PCI hole"; -	pci_mem_hole.flags = MEM_F_RW; -	pci_mem_hole.base = lowmem; -	pci_mem_hole.size = (4ULL * 1024 * 1024 * 1024) - lowmem; -	pci_mem_hole.handler = pci_emul_fallback_handler; - -	error = register_mem_fallback(&pci_mem_hole); +	/* PCI extended config space */ +	bzero(&mr, sizeof(struct mem_range)); +	mr.name = "PCI ECFG"; +	mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; +	mr.base = PCI_EMUL_ECFG_BASE; +	mr.size = PCI_EMUL_ECFG_SIZE; +	mr.handler = pci_emul_ecfg_handler; +	error = register_mem(&mr);  	assert(error == 0);  	return (0); @@ -1612,41 +1653,6 @@ pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv)  	}  } -static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff; - -static int -pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes, -		 uint32_t *eax, void *arg) -{ -	uint32_t x; - -	if (bytes != 4) { -		if (in) -			*eax = (bytes == 2) ? 0xffff : 0xff; -		return (0); -	} - -	if (in) { -		x = (cfgbus << 16) | -		    (cfgslot << 11) | -		    (cfgfunc << 8) | -		    cfgoff; -                if (cfgenable) -			x |= CONF1_ENABLE;	        -		*eax = x; -	} else { -		x = *eax; -		cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE; -		cfgoff = x & PCI_REGMAX; -		cfgfunc = (x >> 8) & PCI_FUNCMAX; -		cfgslot = (x >> 11) & PCI_SLOTMAX; -		cfgbus = (x >> 16) & PCI_BUSMAX; -	} - -	return (0); -} -INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr); -  static uint32_t  bits_changed(uint32_t old, uint32_t new, uint32_t mask)  { @@ -1709,41 +1715,51 @@ pci_emul_cmdwrite(struct pci_devinst *pi, uint32_t new, int bytes)  	pci_lintr_update(pi);  }	 -static int -pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, -		 uint32_t *eax, void *arg) +static void +pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, +    int coff, int bytes, uint32_t *eax)  {  	struct businfo *bi;  	struct slotinfo *si;  	struct pci_devinst *pi;  	struct pci_devemu *pe; -	int coff, idx, needcfg; +	int idx, needcfg;  	uint64_t addr, bar, mask; -	assert(bytes == 1 || bytes == 2 || bytes == 4); - -	if ((bi = pci_businfo[cfgbus]) != NULL) { -		si = &bi->slotinfo[cfgslot]; -		pi = si->si_funcs[cfgfunc].fi_devi; +	if ((bi = pci_businfo[bus]) != NULL) { +		si = &bi->slotinfo[slot]; +		pi = si->si_funcs[func].fi_devi;  	} else  		pi = NULL; -	coff = cfgoff + (port - CONF1_DATA_PORT); - -#if 0 -	printf("pcicfg-%s from 0x%0x of %d bytes (%d/%d/%d)\n\r", -		in ? 
"read" : "write", coff, bytes, cfgbus, cfgslot, cfgfunc); -#endif -  	/* -	 * Just return if there is no device at this cfgslot:cfgfunc, -	 * if the guest is doing an un-aligned access, or if the config -	 * address word isn't enabled. +	 * Just return if there is no device at this slot:func or if the +	 * the guest is doing an un-aligned access.  	 */ -	if (!cfgenable || pi == NULL || (coff & (bytes - 1)) != 0) { +	if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) || +	    (coff & (bytes - 1)) != 0) {  		if (in)  			*eax = 0xffffffff; -		return (0); +		return; +	} + +	/* +	 * Ignore all writes beyond the standard config space and return all +	 * ones on reads. +	 */ +	if (coff >= PCI_REGMAX + 1) { +		if (in) { +			*eax = 0xffffffff; +			/* +			 * Extended capabilities begin at offset 256 in config +			 * space. Absence of extended capabilities is signaled +			 * with all 0s in the extended capability header at +			 * offset 256. +			 */ +			if (coff <= PCI_REGMAX + 4) +				*eax = 0x00000000; +		} +		return;  	}  	pe = pi->pi_d; @@ -1754,8 +1770,8 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,  	if (in) {  		/* Let the device emulation override the default handler */  		if (pe->pe_cfgread != NULL) { -			needcfg = pe->pe_cfgread(ctx, vcpu, pi, -						    coff, bytes, eax); +			needcfg = pe->pe_cfgread(ctx, vcpu, pi, coff, bytes, +			    eax);  		} else {  			needcfg = 1;  		} @@ -1769,12 +1785,12 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,  				*eax = pci_get_cfgdata32(pi, coff);  		} -		pci_emul_hdrtype_fixup(cfgbus, cfgslot, coff, bytes, eax); +		pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax);  	} else {  		/* Let the device emulation override the default handler */  		if (pe->pe_cfgwrite != NULL &&  		    (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0) -			return (0); +			return;  		/*  		 * Special handling for write to BAR registers @@ -1785,7 +1801,7 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,  			 * 4-byte aligned.  			 */  			if (bytes != 4 || (coff & 0x3) != 0) -				return (0); +				return;  			idx = (coff - PCIR_BAR(0)) / 4;  			mask = ~(pi->pi_bar[idx].size - 1);  			switch (pi->pi_bar[idx].type) { @@ -1843,7 +1859,57 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,  			CFGWRITE(pi, coff, *eax, bytes);  		}  	} +} + +static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff; + +static int +pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes, +		 uint32_t *eax, void *arg) +{ +	uint32_t x; + +	if (bytes != 4) { +		if (in) +			*eax = (bytes == 2) ? 
0xffff : 0xff; +		return (0); +	} + +	if (in) { +		x = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff; +		if (cfgenable) +			x |= CONF1_ENABLE; +		*eax = x; +	} else { +		x = *eax; +		cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE; +		cfgoff = x & PCI_REGMAX; +		cfgfunc = (x >> 8) & PCI_FUNCMAX; +		cfgslot = (x >> 11) & PCI_SLOTMAX; +		cfgbus = (x >> 16) & PCI_BUSMAX; +	} + +	return (0); +} +INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr); + +static int +pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, +		 uint32_t *eax, void *arg) +{ +	int coff; +	assert(bytes == 1 || bytes == 2 || bytes == 4); + +	coff = cfgoff + (port - CONF1_DATA_PORT); +	if (cfgenable) { +		pci_cfgrw(ctx, vcpu, in, cfgbus, cfgslot, cfgfunc, coff, bytes, +		    eax); +	} else { +		/* Ignore accesses to cfgdata if not enabled by cfgaddr */ +		if (in) +			*eax = 0xffffffff; +	}  	return (0);  } diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h index 866ffc5b8224..6b8c4e0fd31c 100644 --- a/usr.sbin/bhyve/pci_emul.h +++ b/usr.sbin/bhyve/pci_emul.h @@ -235,6 +235,7 @@ uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size);  int	pci_count_lintr(int bus);  void	pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg);  void	pci_write_dsdt(void); +uint64_t pci_ecfg_base(void);  int	pci_bus_configured(int bus);  static __inline void  diff --git a/usr.sbin/bhyve/pci_irq.c b/usr.sbin/bhyve/pci_irq.c index 653aeb0ff1f6..20e033f2c40e 100644 --- a/usr.sbin/bhyve/pci_irq.c +++ b/usr.sbin/bhyve/pci_irq.c @@ -115,7 +115,7 @@ void  pci_irq_reserve(int irq)  { -	assert(irq < nitems(irq_counts)); +	assert(irq >= 0 && irq < nitems(irq_counts));  	assert(pirq_cold);  	assert(irq_counts[irq] == 0 || irq_counts[irq] == IRQ_DISABLED);  	irq_counts[irq] = IRQ_DISABLED; @@ -125,10 +125,10 @@ void  pci_irq_use(int irq)  { -	assert(irq < nitems(irq_counts)); +	assert(irq >= 0 && irq < nitems(irq_counts));  	assert(pirq_cold); -	if (irq_counts[irq] != IRQ_DISABLED) -		irq_counts[irq]++; +	assert(irq_counts[irq] != IRQ_DISABLED); +	irq_counts[irq]++;  }  void @@ -197,7 +197,7 @@ pirq_alloc_pin(struct vmctx *ctx)  {  	int best_count, best_irq, best_pin, irq, pin; -	pirq_cold = 1; +	pirq_cold = 0;  	/* First, find the least-used PIRQ pin. */  	best_pin = 0; @@ -222,7 +222,7 @@ pirq_alloc_pin(struct vmctx *ctx)  				best_count = irq_counts[irq];  			}  		} -		assert(best_irq != 0); +		assert(best_irq >= 0);  		irq_counts[best_irq]++;  		pirqs[best_pin].reg = best_irq;  		vm_isa_set_irq_trigger(ctx, best_irq, LEVEL_TRIGGER); @@ -234,9 +234,6 @@ pirq_alloc_pin(struct vmctx *ctx)  int  pirq_irq(int pin)  { - -	if (pin == -1) -		return (255);  	assert(pin > 0 && pin <= nitems(pirqs));  	return (pirqs[pin - 1].reg & PIRQ_IRQ);  } diff --git a/usr.sbin/bhyve/pm.c b/usr.sbin/bhyve/pm.c index 67126d8765c7..f5a2d438be7f 100644 --- a/usr.sbin/bhyve/pm.c +++ b/usr.sbin/bhyve/pm.c @@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$");  #include <machine/vmm.h>  #include <assert.h> +#include <errno.h>  #include <pthread.h>  #include <signal.h>  #include <vmmapi.h> @@ -56,6 +57,8 @@ static int  reset_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,      uint32_t *eax, void *arg)  { +	int error; +  	static uint8_t reset_control;  	if (bytes != 1) @@ -66,8 +69,10 @@ reset_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,  		reset_control = *eax;  		/* Treat hard and soft resets the same. 
*/ -		if (reset_control & 0x4) -			return (INOUT_RESET); +		if (reset_control & 0x4) { +			error = vm_suspend(ctx, VM_SUSPEND_RESET); +			assert(error == 0 || errno == EALREADY); +		}  	}  	return (0);  } @@ -224,6 +229,7 @@ static int  pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,      uint32_t *eax, void *arg)  { +	int error;  	if (bytes != 2)  		return (-1); @@ -243,8 +249,10 @@ pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,  		 * says that '5' should be stored in SLP_TYP for S5.  		 */  		if (*eax & PM1_SLP_EN) { -			if ((pm1_control & PM1_SLP_TYP) >> 10 == 5) -				return (INOUT_POWEROFF); +			if ((pm1_control & PM1_SLP_TYP) >> 10 == 5) { +				error = vm_suspend(ctx, VM_SUSPEND_POWEROFF); +				assert(error == 0 || errno == EALREADY); +			}  		}  	}  	return (0); diff --git a/usr.sbin/bhyve/smbiostbl.c b/usr.sbin/bhyve/smbiostbl.c index d560f022fc2a..28c7eb2c74b4 100644 --- a/usr.sbin/bhyve/smbiostbl.c +++ b/usr.sbin/bhyve/smbiostbl.c @@ -321,8 +321,8 @@ struct smbios_table_type0 smbios_type0_template = {  const char *smbios_type0_strings[] = {  	"BHYVE",	/* vendor string */ -	__TIME__,	/* bios version string */ -	__DATE__,	/* bios release date string */ +	"1.00",		/* bios version string */ +	"03/14/2014",	/* bios release date string */  	NULL  }; diff --git a/usr.sbin/bhyve/task_switch.c b/usr.sbin/bhyve/task_switch.c new file mode 100644 index 000000000000..0002da8df8ef --- /dev/null +++ b/usr.sbin/bhyve/task_switch.c @@ -0,0 +1,932 @@ +/*- + * Copyright (c) 2014 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *    notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + *    notice, this list of conditions and the following disclaimer in the + *    documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/_iovec.h> +#include <sys/mman.h> + +#include <x86/psl.h> +#include <x86/segments.h> +#include <x86/specialreg.h> +#include <machine/vmm.h> +#include <machine/vmm_instruction_emul.h> + +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <errno.h> + +#include <vmmapi.h> + +#include "bhyverun.h" + +/* + * Using 'struct i386tss' is tempting but causes myriad sign extension + * issues because all of its fields are defined as signed integers. 
+ */ +struct tss32 { +	uint16_t	tss_link; +	uint16_t	rsvd1; +	uint32_t	tss_esp0; +	uint16_t	tss_ss0; +	uint16_t	rsvd2; +	uint32_t	tss_esp1; +	uint16_t	tss_ss1; +	uint16_t	rsvd3; +	uint32_t	tss_esp2; +	uint16_t	tss_ss2; +	uint16_t	rsvd4; +	uint32_t	tss_cr3; +	uint32_t	tss_eip; +	uint32_t	tss_eflags; +	uint32_t	tss_eax; +	uint32_t	tss_ecx; +	uint32_t	tss_edx; +	uint32_t	tss_ebx; +	uint32_t	tss_esp; +	uint32_t	tss_ebp; +	uint32_t	tss_esi; +	uint32_t	tss_edi; +	uint16_t	tss_es; +	uint16_t	rsvd5; +	uint16_t	tss_cs; +	uint16_t	rsvd6; +	uint16_t	tss_ss; +	uint16_t	rsvd7; +	uint16_t	tss_ds; +	uint16_t	rsvd8; +	uint16_t	tss_fs; +	uint16_t	rsvd9; +	uint16_t	tss_gs; +	uint16_t	rsvd10; +	uint16_t	tss_ldt; +	uint16_t	rsvd11; +	uint16_t	tss_trap; +	uint16_t	tss_iomap; +}; +CTASSERT(sizeof(struct tss32) == 104); + +#define	SEL_START(sel)	(((sel) & ~0x7)) +#define	SEL_LIMIT(sel)	(((sel) | 0x7)) +#define	TSS_BUSY(type)	(((type) & 0x2) != 0) + +static uint64_t +GETREG(struct vmctx *ctx, int vcpu, int reg) +{ +	uint64_t val; +	int error; + +	error = vm_get_register(ctx, vcpu, reg, &val); +	assert(error == 0); +	return (val); +} + +static void +SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val) +{ +	int error; + +	error = vm_set_register(ctx, vcpu, reg, val); +	assert(error == 0); +} + +static struct seg_desc +usd_to_seg_desc(struct user_segment_descriptor *usd) +{ +	struct seg_desc seg_desc; + +	seg_desc.base = (u_int)USD_GETBASE(usd); +	if (usd->sd_gran) +		seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff; +	else +		seg_desc.limit = (u_int)USD_GETLIMIT(usd); +	seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7; +	seg_desc.access |= usd->sd_xx << 12; +	seg_desc.access |= usd->sd_def32 << 14; +	seg_desc.access |= usd->sd_gran << 15; + +	return (seg_desc); +} + +/* + * Inject an exception with an error code that is a segment selector. + * The format of the error code is described in section 6.13, "Error Code", + * Intel SDM volume 3. + * + * Bit 0 (EXT) denotes whether the exception occurred during delivery + * of an external event like an interrupt. + * + * Bit 1 (IDT) indicates whether the selector points to a gate descriptor + * in the IDT. + * + * Bit 2(GDT/LDT) has the usual interpretation of Table Indicator (TI). + */ +static void +sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext) +{ +	/* +	 * Bit 2 from the selector is retained as-is in the error code. +	 * +	 * Bit 1 can be safely cleared because none of the selectors +	 * encountered during task switch emulation refer to a task +	 * gate in the IDT. +	 * +	 * Bit 0 is set depending on the value of 'ext'. +	 */ +	sel &= ~0x3; +	if (ext) +		sel |= 0x1; +	vm_inject_fault(ctx, vcpu, vector, 1, sel); +} + +/* + * Return 0 if the selector 'sel' in within the limits of the GDT/LDT + * and non-zero otherwise. + */ +static int +desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel) +{ +	uint64_t base; +	uint32_t limit, access; +	int error, reg; + +	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; +	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access); +	assert(error == 0); + +	if (reg == VM_REG_GUEST_LDTR) { +		if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access)) +			return (-1); +	} + +	if (limit < SEL_LIMIT(sel)) +		return (-1); +	else +		return (0); +} + +/* + * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced + * by the selector 'sel'. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. 
+ * Returns -1 otherwise. + */ +static int +desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, +    uint16_t sel, struct user_segment_descriptor *desc, bool doread) +{ +	struct iovec iov[2]; +	uint64_t base; +	uint32_t limit, access; +	int error, reg; + +	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; +	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access); +	assert(error == 0); +	assert(limit >= SEL_LIMIT(sel)); + +	error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel), +	    sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov)); +	if (error == 0) { +		if (doread) +			vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc)); +		else +			vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc)); +	} +	return (error); +} + +static int +desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, +    uint16_t sel, struct user_segment_descriptor *desc) +{ +	return (desc_table_rw(ctx, vcpu, paging, sel, desc, true)); +} + +static int +desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, +    uint16_t sel, struct user_segment_descriptor *desc) +{ +	return (desc_table_rw(ctx, vcpu, paging, sel, desc, false)); +} + +/* + * Read the TSS descriptor referenced by 'sel' into 'desc'. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. + */ +static int +read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, +    uint16_t sel, struct user_segment_descriptor *desc) +{ +	struct vm_guest_paging sup_paging; +	int error; + +	assert(!ISLDT(sel)); +	assert(IDXSEL(sel) != 0); + +	/* Fetch the new TSS descriptor */ +	if (desc_table_limit_check(ctx, vcpu, sel)) { +		if (ts->reason == TSR_IRET) +			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); +		else +			sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext); +		return (1); +	} + +	sup_paging = ts->paging; +	sup_paging.cpl = 0;		/* implicit supervisor mode */ +	error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc); +	return (error); +} + +static bool +code_desc(int sd_type) +{ +	/* code descriptor */ +	return ((sd_type & 0x18) == 0x18); +} + +static bool +stack_desc(int sd_type) +{ +	/* writable data descriptor */ +	return ((sd_type & 0x1A) == 0x12); +} + +static bool +data_desc(int sd_type) +{ +	/* data descriptor or a readable code descriptor */ +	return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A); +} + +static bool +ldt_desc(int sd_type) +{ + +	return (sd_type == SDT_SYSLDT); +} + +/* + * Validate the descriptor 'seg_desc' associated with 'segment'. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. 
+ */ +static int +validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, +    int segment, struct seg_desc *seg_desc) +{ +	struct vm_guest_paging sup_paging; +	struct user_segment_descriptor usd; +	int error, idtvec; +	int cpl, dpl, rpl; +	uint16_t sel, cs; +	bool ldtseg, codeseg, stackseg, dataseg, conforming; + +	ldtseg = codeseg = stackseg = dataseg = false; +	switch (segment) { +	case VM_REG_GUEST_LDTR: +		ldtseg = true; +		break; +	case VM_REG_GUEST_CS: +		codeseg = true; +		break; +	case VM_REG_GUEST_SS: +		stackseg = true; +		break; +	case VM_REG_GUEST_DS: +	case VM_REG_GUEST_ES: +	case VM_REG_GUEST_FS: +	case VM_REG_GUEST_GS: +		dataseg = true; +		break; +	default: +		assert(0); +	} + +	/* Get the segment selector */ +	sel = GETREG(ctx, vcpu, segment); + +	/* LDT selector must point into the GDT */ +	if (ldtseg && ISLDT(sel)) { +		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); +		return (1); +	} + +	/* Descriptor table limit check */ +	if (desc_table_limit_check(ctx, vcpu, sel)) { +		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); +		return (1); +	} + +	/* NULL selector */ +	if (IDXSEL(sel) == 0) { +		/* Code and stack segment selectors cannot be NULL */ +		if (codeseg || stackseg) { +			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); +			return (1); +		} +		seg_desc->base = 0; +		seg_desc->limit = 0; +		seg_desc->access = 0x10000;	/* unusable */ +		return (0); +	} + +	/* Read the descriptor from the GDT/LDT */ +	sup_paging = ts->paging; +	sup_paging.cpl = 0;	/* implicit supervisor mode */ +	error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd); +	if (error) +		return (error); + +	/* Verify that the descriptor type is compatible with the segment */ +	if ((ldtseg && !ldt_desc(usd.sd_type)) || +	    (codeseg && !code_desc(usd.sd_type)) || +	    (dataseg && !data_desc(usd.sd_type)) || +	    (stackseg && !stack_desc(usd.sd_type))) { +		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); +		return (1); +	} + +	/* Segment must be marked present */ +	if (!usd.sd_p) { +		if (ldtseg) +			idtvec = IDT_TS; +		else if (stackseg) +			idtvec = IDT_SS; +		else +			idtvec = IDT_NP; +		sel_exception(ctx, vcpu, idtvec, sel, ts->ext); +		return (1); +	} + +	cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS); +	cpl = cs & SEL_RPL_MASK; +	rpl = sel & SEL_RPL_MASK; +	dpl = usd.sd_dpl; + +	if (stackseg && (rpl != cpl || dpl != cpl)) { +		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); +		return (1); +	} + +	if (codeseg) { +		conforming = (usd.sd_type & 0x4) ? true : false; +		if ((conforming && (cpl < dpl)) || +		    (!conforming && (cpl != dpl))) { +			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); +			return (1); +		} +	} + +	if (dataseg) { +		/* +		 * A data segment is always non-conforming except when it's +		 * descriptor is a readable, conforming code segment. 
+		 */ +		if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0) +			conforming = true; +		else +			conforming = false; + +		if (!conforming && (rpl > dpl || cpl > dpl)) { +			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); +			return (1); +		} +	} +	*seg_desc = usd_to_seg_desc(&usd); +	return (0); +} + +static void +tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch, +    uint32_t eip, struct tss32 *tss, struct iovec *iov) +{ + +	/* General purpose registers */ +	tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX); +	tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX); +	tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX); +	tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX); +	tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP); +	tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP); +	tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI); +	tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI); + +	/* Segment selectors */ +	tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES); +	tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS); +	tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS); +	tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS); +	tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS); +	tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS); + +	/* eflags and eip */ +	tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS); +	if (task_switch->reason == TSR_IRET) +		tss->tss_eflags &= ~PSL_NT; +	tss->tss_eip = eip; + +	/* Copy updated old TSS into guest memory */ +	vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32)); +} + +static void +update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd) +{ +	int error; + +	error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access); +	assert(error == 0); +} + +/* + * Update the vcpu registers to reflect the state of the new task. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. + */ +static int +tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, +    uint16_t ot_sel, struct tss32 *tss, struct iovec *iov) +{ +	struct seg_desc seg_desc, seg_desc2; +	uint64_t *pdpte, maxphyaddr, reserved; +	uint32_t eflags; +	int error, i; +	bool nested; + +	nested = false; +	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) { +		tss->tss_link = ot_sel; +		nested = true; +	} + +	eflags = tss->tss_eflags; +	if (nested) +		eflags |= PSL_NT; + +	/* LDTR */ +	SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt); + +	/* PBDR */ +	if (ts->paging.paging_mode != PAGING_MODE_FLAT) { +		if (ts->paging.paging_mode == PAGING_MODE_PAE) { +			/* +			 * XXX Assuming 36-bit MAXPHYADDR. +			 */ +			maxphyaddr = (1UL << 36) - 1; +			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32); +			for (i = 0; i < 4; i++) { +				/* Check reserved bits if the PDPTE is valid */ +				if (!(pdpte[i] & 0x1)) +					continue; +				/* +				 * Bits 2:1, 8:5 and bits above the processor's +				 * maximum physical address are reserved. 
+				 */ +				reserved = ~maxphyaddr | 0x1E6; +				if (pdpte[i] & reserved) { +					vm_inject_gp(ctx, vcpu); +					return (1); +				} +			} +			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]); +			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]); +			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]); +			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]); +		} +		SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3); +		ts->paging.cr3 = tss->tss_cr3; +	} + +	/* eflags and eip */ +	SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags); +	SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip); + +	/* General purpose registers */ +	SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax); +	SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx); +	SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx); +	SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx); +	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp); +	SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp); +	SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi); +	SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi); + +	/* Segment selectors */ +	SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es); +	SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs); +	SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss); +	SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds); +	SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs); +	SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs); + +	/* +	 * If this is a nested task then write out the new TSS to update +	 * the previous link field. +	 */ +	if (nested) +		vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss)); + +	/* Validate segment descriptors */ +	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc); +	if (error) +		return (error); +	update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc); + +	/* +	 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3. +	 * +	 * The SS and CS attribute checks on VM-entry are inter-dependent so +	 * we need to make sure that both segments are valid before updating +	 * either of them. This ensures that the VMCS state can pass the +	 * VM-entry checks so the guest can handle any exception injected +	 * during task switch emulation. +	 */ +	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc); +	if (error) +		return (error); +	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2); +	if (error) +		return (error); +	update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc); +	update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2); +	ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK; + +	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc); +	if (error) +		return (error); +	update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc); + +	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc); +	if (error) +		return (error); +	update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc); + +	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc); +	if (error) +		return (error); +	update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc); + +	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc); +	if (error) +		return (error); +	update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc); + +	return (0); +} + +/* + * Push an error code on the stack of the new task. This is needed if the + * task switch was triggered by a hardware exception that causes an error + * code to be saved (e.g. #PF). + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. 
+ */ +static int +push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, +    int task_type, uint32_t errcode) +{ +	struct iovec iov[2]; +	struct seg_desc seg_desc; +	int stacksize, bytes, error; +	uint64_t gla, cr0, rflags; +	uint32_t esp; +	uint16_t stacksel; + +	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0); +	rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS); +	stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS); + +	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base, +	    &seg_desc.limit, &seg_desc.access); +	assert(error == 0); + +	/* +	 * Section "Error Code" in the Intel SDM vol 3: the error code is +	 * pushed on the stack as a doubleword or word (depending on the +	 * default interrupt, trap or task gate size). +	 */ +	if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS) +		bytes = 4; +	else +		bytes = 2; + +	/* +	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the +	 * stack-segment descriptor determines the size of the stack +	 * pointer outside of 64-bit mode. +	 */ +	if (SEG_DESC_DEF32(seg_desc.access)) +		stacksize = 4; +	else +		stacksize = 2; + +	esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP); +	esp -= bytes; + +	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, +	    &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) { +		sel_exception(ctx, vcpu, IDT_SS, stacksel, 1); +		return (1); +	} + +	if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) { +		vm_inject_ac(ctx, vcpu, 1); +		return (1); +	} + +	error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE, +	    iov, nitems(iov)); +	if (error) +		return (error); + +	vm_copyout(ctx, vcpu, &errcode, iov, bytes); +	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp); +	return (0); +} + +/* + * Evaluate return value from helper functions and potentially return to + * the VM run loop. + *  0: success + * +1: an exception was injected into the guest vcpu + * -1: unrecoverable/programming error + */ +#define	CHKERR(x)							\ +	do {								\ +		assert(((x) == 0) || ((x) == 1) || ((x) == -1));	\ +		if ((x) == -1)						\ +			return (VMEXIT_ABORT);				\ +		else if ((x) == 1)					\ +			return (VMEXIT_CONTINUE);			\ +	} while (0) + +int +vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ +	struct seg_desc nt; +	struct tss32 oldtss, newtss; +	struct vm_task_switch *task_switch; +	struct vm_guest_paging *paging, sup_paging; +	struct user_segment_descriptor nt_desc, ot_desc; +	struct iovec nt_iov[2], ot_iov[2]; +	uint64_t cr0, ot_base; +	uint32_t eip, ot_lim, access; +	int error, ext, minlimit, nt_type, ot_type, vcpu; +	enum task_switch_reason reason; +	uint16_t nt_sel, ot_sel; + +	task_switch = &vmexit->u.task_switch; +	nt_sel = task_switch->tsssel; +	ext = vmexit->u.task_switch.ext; +	reason = vmexit->u.task_switch.reason; +	paging = &vmexit->u.task_switch.paging; +	vcpu = *pvcpu; + +	assert(paging->cpu_mode == CPU_MODE_PROTECTED); + +	/* +	 * Section 4.6, "Access Rights" in Intel SDM Vol 3. 
+	 * The following page table accesses are implicitly supervisor mode: +	 * - accesses to GDT or LDT to load segment descriptors +	 * - accesses to the task state segment during task switch +	 */ +	sup_paging = *paging; +	sup_paging.cpl = 0;	/* implicit supervisor mode */ + +	/* Fetch the new TSS descriptor */ +	error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc); +	CHKERR(error); + +	nt = usd_to_seg_desc(&nt_desc); + +	/* Verify the type of the new TSS */ +	nt_type = SEG_DESC_TYPE(nt.access); +	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS && +	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) { +		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); +		goto done; +	} + +	/* TSS descriptor must have present bit set */ +	if (!SEG_DESC_PRESENT(nt.access)) { +		sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext); +		goto done; +	} + +	/* +	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and +	 * 44 bytes for a 16-bit TSS. +	 */ +	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS) +		minlimit = 104 - 1; +	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) +		minlimit = 44 - 1; +	else +		minlimit = 0; + +	assert(minlimit > 0); +	if (nt.limit < minlimit) { +		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); +		goto done; +	} + +	/* TSS must be busy if task switch is due to IRET */ +	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) { +		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); +		goto done; +	} + +	/* +	 * TSS must be available (not busy) if task switch reason is +	 * CALL, JMP, exception or interrupt. +	 */ +	if (reason != TSR_IRET && TSS_BUSY(nt_type)) { +		sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext); +		goto done; +	} + +	/* Fetch the new TSS */ +	error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1, +	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov)); +	CHKERR(error); +	vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1); + +	/* Get the old TSS selector from the guest's task register */ +	ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR); +	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) { +		/* +		 * This might happen if a task switch was attempted without +		 * ever loading the task register with LTR. In this case the +		 * TR would contain the values from power-on: +		 * (sel = 0, base = 0, limit = 0xffff). +		 */ +		sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext); +		goto done; +	} + +	/* Get the old TSS base and limit from the guest's task register */ +	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim, +	    &access); +	assert(error == 0); +	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access)); +	ot_type = SEG_DESC_TYPE(access); +	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY); + +	/* Fetch the old TSS descriptor */ +	error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc); +	CHKERR(error); + +	/* Get the old TSS */ +	error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1, +	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov)); +	CHKERR(error); +	vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1); + +	/* +	 * Clear the busy bit in the old TSS descriptor if the task switch +	 * due to an IRET or JMP instruction. 
+	 */ +	if (reason == TSR_IRET || reason == TSR_JMP) { +		ot_desc.sd_type &= ~0x2; +		error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel, +		    &ot_desc); +		CHKERR(error); +	} + +	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) { +		fprintf(stderr, "Task switch to 16-bit TSS not supported\n"); +		return (VMEXIT_ABORT); +	} + +	/* Save processor state in old TSS */ +	eip = vmexit->rip + vmexit->inst_length; +	tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov); + +	/* +	 * If the task switch was triggered for any reason other than IRET +	 * then set the busy bit in the new TSS descriptor. +	 */ +	if (reason != TSR_IRET) { +		nt_desc.sd_type |= 0x2; +		error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel, +		    &nt_desc); +		CHKERR(error); +	} + +	/* Update task register to point at the new TSS */ +	SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel); + +	/* Update the hidden descriptor state of the task register */ +	nt = usd_to_seg_desc(&nt_desc); +	update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt); + +	/* Set CR0.TS */ +	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0); +	SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS); + +	/* +	 * We are now committed to the task switch. Any exceptions encountered +	 * after this point will be handled in the context of the new task and +	 * the saved instruction pointer will belong to the new task. +	 */ +	vmexit->rip = newtss.tss_eip; +	vmexit->inst_length = 0; + +	/* Load processor state from new TSS */ +	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov); +	CHKERR(error); + +	/* +	 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception +	 * caused an error code to be generated, this error code is copied +	 * to the stack of the new task. +	 */ +	if (task_switch->errcode_valid) { +		assert(task_switch->ext); +		assert(task_switch->reason == TSR_IDT_GATE); +		error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type, +		    task_switch->errcode); +		CHKERR(error); +	} + +	/* +	 * Treatment of virtual-NMI blocking if NMI is delivered through +	 * a task gate. +	 * +	 * Section "Architectural State Before A VM Exit", Intel SDM, Vol3: +	 * If the virtual NMIs VM-execution control is 1, VM entry injects +	 * an NMI, and delivery of the NMI causes a task switch that causes +	 * a VM exit, virtual-NMI blocking is in effect before the VM exit +	 * commences. +	 * +	 * Thus, virtual-NMI blocking is in effect at the time of the task +	 * switch VM exit. +	 */ + +	/* +	 * Treatment of virtual-NMI unblocking on IRET from NMI handler task. +	 * +	 * Section "Changes to Instruction Behavior in VMX Non-Root Operation" +	 * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking. +	 * This unblocking of virtual-NMI occurs even if IRET causes a fault. +	 * +	 * Thus, virtual-NMI blocking is cleared at the time of the task switch +	 * VM exit. +	 */ + +	/* +	 * If the task switch was triggered by an event delivered through +	 * the IDT then extinguish the pending event from the vcpu's +	 * exitintinfo. 
+	 */ +	if (task_switch->reason == TSR_IDT_GATE) { +		error = vm_set_intinfo(ctx, vcpu, 0); +		assert(error == 0); +	} + +	/* +	 * XXX should inject debug exception if 'T' bit is 1 +	 */ +done: +	return (VMEXIT_CONTINUE); +} diff --git a/usr.sbin/bhyve/virtio.c b/usr.sbin/bhyve/virtio.c index 4e58dd62a0e7..1f2730092936 100644 --- a/usr.sbin/bhyve/virtio.c +++ b/usr.sbin/bhyve/virtio.c @@ -437,7 +437,7 @@ vq_endchains(struct vqueue_info *vq, int used_all_avail)  	if (used_all_avail &&  	    (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY))  		intr = 1; -	else if (vs->vs_flags & VIRTIO_EVENT_IDX) { +	else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) {  		event_idx = VQ_USED_EVENT_IDX(vq);  		/*  		 * This calculation is per docs and the kernel diff --git a/usr.sbin/bhyve/virtio.h b/usr.sbin/bhyve/virtio.h index 01b5f7b9112d..1f29dfa8ef73 100644 --- a/usr.sbin/bhyve/virtio.h +++ b/usr.sbin/bhyve/virtio.h @@ -352,7 +352,7 @@ struct virtio_consts {  					/* called to read config regs */  	int	(*vc_cfgwrite)(void *, int, int, uint32_t);  					/* called to write config regs */ -	uint32_t vc_hv_caps;		/* hypervisor-provided capabilities */ +	uint64_t vc_hv_caps;		/* hypervisor-provided capabilities */  };  /* diff --git a/usr.sbin/bhyvectl/bhyvectl.c b/usr.sbin/bhyvectl/bhyvectl.c index e77f0d77df6f..b6006b72a767 100644 --- a/usr.sbin/bhyvectl/bhyvectl.c +++ b/usr.sbin/bhyvectl/bhyvectl.c @@ -195,7 +195,8 @@ usage(void)  	"       [--force-reset]\n"  	"       [--force-poweroff]\n"  	"       [--get-active-cpus]\n" -	"       [--get-suspended-cpus]\n", +	"       [--get-suspended-cpus]\n" +	"       [--get-intinfo]\n",  	progname);  	exit(1);  } @@ -205,6 +206,7 @@ static int inject_nmi, assert_lapic_lvt;  static int force_reset, force_poweroff;  static const char *capname;  static int create, destroy, get_lowmem, get_highmem; +static int get_intinfo;  static int get_active_cpus, get_suspended_cpus;  static uint64_t memsize;  static int set_cr0, get_cr0, set_cr3, get_cr3, set_cr4, get_cr4; @@ -412,6 +414,37 @@ print_cpus(const char *banner, const cpuset_t *cpus)  	printf("\n");  } +static void +print_intinfo(const char *banner, uint64_t info) +{ +	int type; + +	printf("%s:\t", banner); +	if (info & VM_INTINFO_VALID) { +		type = info & VM_INTINFO_TYPE; +		switch (type) { +		case VM_INTINFO_HWINTR: +			printf("extint"); +			break; +		case VM_INTINFO_NMI: +			printf("nmi"); +			break; +		case VM_INTINFO_SWINTR: +			printf("swint"); +			break; +		default: +			printf("exception"); +			break; +		} +		printf(" vector %d", (int)VM_INTINFO_VECTOR(info)); +		if (info & VM_INTINFO_DEL_ERRCODE) +			printf(" errcode %#x", (u_int)(info >> 32)); +	} else { +		printf("n/a"); +	} +	printf("\n"); +} +  int  main(int argc, char *argv[])  { @@ -420,7 +453,7 @@ main(int argc, char *argv[])  	vm_paddr_t gpa, gpa_pmap;  	size_t len;  	struct vm_exit vmexit; -	uint64_t ctl, eptp, bm, addr, u64, pteval[4], *pte; +	uint64_t ctl, eptp, bm, addr, u64, pteval[4], *pte, info[2];  	struct vmctx *ctx;  	int wired;  	cpuset_t cpus; @@ -595,6 +628,7 @@ main(int argc, char *argv[])  		{ "force-poweroff", NO_ARG,	&force_poweroff, 1 },  		{ "get-active-cpus", NO_ARG,	&get_active_cpus, 1 },  		{ "get-suspended-cpus", NO_ARG,	&get_suspended_cpus, 1 }, +		{ "get-intinfo", NO_ARG,	&get_intinfo,	1 },  		{ NULL,		0,		NULL,		0 }  	}; @@ -1566,6 +1600,14 @@ main(int argc, char *argv[])  			print_cpus("suspended cpus", &cpus);  	} +	if (!error && (get_intinfo || get_all)) { +		error = vm_get_intinfo(ctx, vcpu, &info[0], &info[1]); +		if 
(!error) { +			print_intinfo("pending", info[0]); +			print_intinfo("current", info[1]); +		} +	} +  	if (!error && run) {  		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);  		assert(error == 0); diff --git a/usr.sbin/bhyveload/bhyveload.8 b/usr.sbin/bhyveload/bhyveload.8 index 3a300cc00f85..0bdf151eb268 100644 --- a/usr.sbin/bhyveload/bhyveload.8 +++ b/usr.sbin/bhyveload/bhyveload.8 @@ -35,11 +35,11 @@  guest inside a bhyve virtual machine  .Sh SYNOPSIS  .Nm -.Op Fl m Ar mem-size +.Op Fl c Ar cons-dev  .Op Fl d Ar disk-path -.Op Fl h Ar host-path  .Op Fl e Ar name=value -.Op Fl c Ar cons-dev +.Op Fl h Ar host-path +.Op Fl m Ar mem-size  .Ar vmname  .Sh DESCRIPTION  .Nm @@ -62,6 +62,32 @@ and will be created if it does not already exist.  .Sh OPTIONS  The following options are available:  .Bl -tag -width indent +.It Fl c Ar cons-dev +.Ar cons-dev +is a +.Xr tty 4 +device to use for +.Nm +terminal I/O. +.Pp +The text string "stdio" is also accepted and selects the use of +unbuffered standard I/O. This is the default value. +.It Fl d Ar disk-path +The +.Ar disk-path +is the pathname of the guest's boot disk image. +.It Fl e Ar name=value +Set the FreeBSD loader environment variable +.Ar name +to +.Ar value . +.Pp +The option may be used more than once to set more than one environment +variable. +.It Fl h Ar host-path +The +.Ar host-path +is the directory at the top of the guest's boot filesystem.  .It Fl m Ar mem-size Xo  .Sm off  .Op Cm K | k | M | m | G | g | T | t @@ -85,32 +111,6 @@ respectively.  The default value of  .Ar mem-size  is 256M. -.It Fl d Ar disk-path -The -.Ar disk-path -is the pathname of the guest's boot disk image. -.It Fl h Ar host-path -The -.Ar host-path -is the directory at the top of the guest's boot filesystem. -.It Fl e Ar name=value -Set the FreeBSD loader environment variable -.Ar name -to -.Ar value . -.Pp -The option may be used more than once to set more than one environment -variable. -.It Fl c Ar cons-dev -.Ar cons-dev -is a -.Xr tty 4 -device to use for -.Nm -terminal I/O. -.Pp -The text string "stdio" is also accepted and selects the use of -unbuffered standard I/O. This is the default value.  .El  .Sh EXAMPLES  To create a virtual machine named diff --git a/usr.sbin/bhyveload/bhyveload.c b/usr.sbin/bhyveload/bhyveload.c index ff6b26926f3b..eaf71a819373 100644 --- a/usr.sbin/bhyveload/bhyveload.c +++ b/usr.sbin/bhyveload/bhyveload.c @@ -629,8 +629,8 @@ usage(void)  {  	fprintf(stderr, -	    "usage: %s [-m mem-size] [-d <disk-path>] [-h <host-path>]\n" -	    "       %*s [-e <name=value>] [-c <console-device>] <vmname>\n", +	    "usage: %s [-c <console-device>] [-d <disk-path>] [-e <name=value>]\n" +	    "       %*s [-h <host-path>] [-m mem-size] <vmname>\n",  	    progname,  	    (int)strlen(progname), "");  	exit(1);  | 
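The pci_emul_ecfg_handler hunk above decodes a memory-mapped extended config access by slicing the guest physical address into register/function/slot/bus fields: 4KB of config space per function, 8 functions per slot, 32 slots per bus, 1MB per bus. A minimal standalone sketch of that decode, using a made-up example address, could look like this:

```c
#include <stdint.h>
#include <stdio.h>

#define	ECFG_BASE	0xE0000000UL	/* mirrors PCI_EMUL_ECFG_BASE above */

/* Slice an ECAM-style address the same way pci_emul_ecfg_handler does. */
static void
ecfg_decode(uint64_t addr, int *bus, int *slot, int *func, int *coff)
{
	*coff = addr & 0xfff;		/* 4KB of config space per function */
	*func = (addr >> 12) & 0x7;	/* 8 functions per slot */
	*slot = (addr >> 15) & 0x1f;	/* 32 slots per bus */
	*bus  = (addr >> 20) & 0xff;	/* 1MB of config space per bus */
}

int
main(void)
{
	int bus, slot, func, coff;
	/* hypothetical access: bus 0, slot 3, function 1, register 0x100 */
	uint64_t addr = ECFG_BASE + (3UL << 15) + (1UL << 12) + 0x100;

	ecfg_decode(addr, &bus, &slot, &func, &coff);
	printf("%d/%d/%d reg %#x\n", bus, slot, func, coff);	/* 0/3/1 reg 0x100 */
	return (0);
}
```

The arithmetic works out because the 0xE0000000 base is 256MB-aligned, so the base contributes nothing to the low eight bits of addr >> 20 and the bus number falls out directly.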
