| author | Peter Grehan <grehan@FreeBSD.org> | 2014-08-19 01:20:24 +0000 |
|---|---|---|
| committer | Peter Grehan <grehan@FreeBSD.org> | 2014-08-19 01:20:24 +0000 |
| commit | a77e87976e47f8352dc8eddeb5db1e905b54e82a | |
| tree | 6aaf5296bf1aa8632ab9beb5fe4a61727ef7ab68 | |
| parent | 93d34b74609b771e9a048429d275715c3831e44c | |
41 files changed, 3057 insertions, 594 deletions
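The userland portion of the patch below adds vm_get_seg_desc(), renames the libvmmapi vm_gla2gpa() helper to vm_copy_setup(), and exposes the new VM_GET_INTINFO/VM_SET_INTINFO ioctls through vm_get_intinfo()/vm_set_intinfo(). The following is a minimal, hypothetical sketch (not part of the commit) of how a bhyve-style consumer might call the new entry points; the helper names dump_pending_event() and read_guest_linear(), and the iovec count of 4, are illustrative assumptions, and the vmctx is presumed to come from vm_open().

```c
#include <sys/param.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <machine/vmm.h>

#include <stdio.h>
#include <vmmapi.h>

/* Hypothetical helper: print the event state exposed by VM_GET_INTINFO. */
static void
dump_pending_event(struct vmctx *ctx, int vcpu)
{
	uint64_t info1, info2;

	if (vm_get_intinfo(ctx, vcpu, &info1, &info2) == 0)
		printf("vcpu %d: exitintinfo %#lx, pending exception %#lx\n",
		    vcpu, info1, info2);
}

/* Hypothetical helper: copy 'len' bytes from guest linear address 'gla'. */
static int
read_guest_linear(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint64_t gla, void *buf, size_t len)
{
	struct iovec iov[4];
	int error;

	error = vm_copy_setup(ctx, vcpu, paging, gla, len, PROT_READ,
	    iov, nitems(iov));
	if (error == 0)
		vm_copyin(ctx, vcpu, iov, buf, len);
	return (error);
}
```

Per the comment in vmmapi.h, vm_copy_setup() returns 0 on success, 1 when a fault was injected into the guest, and -1 otherwise, so the iovec array should only be handed to vm_copyin()/vm_copyout() in the 0 case.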
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c index 9fb2308731e7..93955c7c233e 100644 --- a/lib/libvmmapi/vmmapi.c +++ b/lib/libvmmapi/vmmapi.c @@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$");  #include <sys/_iovec.h>  #include <sys/cpuset.h> +#include <x86/segments.h>  #include <machine/specialreg.h>  #include <machine/param.h> @@ -327,6 +328,16 @@ vm_get_desc(struct vmctx *ctx, int vcpu, int reg,  }  int +vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc) +{ +	int error; + +	error = vm_get_desc(ctx, vcpu, reg, &seg_desc->base, &seg_desc->limit, +	    &seg_desc->access); +	return (error); +} + +int  vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val)  {  	int error; @@ -988,7 +999,7 @@ gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,  #endif  int -vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, +vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,      uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt)  {  	uint64_t gpa; @@ -1106,3 +1117,32 @@ vm_activate_cpu(struct vmctx *ctx, int vcpu)  	error = ioctl(ctx->fd, VM_ACTIVATE_CPU, &ac);  	return (error);  } + +int +vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *info1, uint64_t *info2) +{ +	struct vm_intinfo vmii; +	int error; + +	bzero(&vmii, sizeof(struct vm_intinfo)); +	vmii.vcpuid = vcpu; +	error = ioctl(ctx->fd, VM_GET_INTINFO, &vmii); +	if (error == 0) { +		*info1 = vmii.info1; +		*info2 = vmii.info2; +	} +	return (error); +} + +int +vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1) +{ +	struct vm_intinfo vmii; +	int error; + +	bzero(&vmii, sizeof(struct vm_intinfo)); +	vmii.vcpuid = vcpu; +	vmii.info1 = info1; +	error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii); +	return (error); +} diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h index 067eaa0aa26c..fbb6ddd3acfb 100644 --- a/lib/libvmmapi/vmmapi.h +++ b/lib/libvmmapi/vmmapi.h @@ -66,6 +66,8 @@ int	vm_set_desc(struct vmctx *ctx, int vcpu, int reg,  		    uint64_t base, uint32_t limit, uint32_t access);  int	vm_get_desc(struct vmctx *ctx, int vcpu, int reg,  		    uint64_t *base, uint32_t *limit, uint32_t *access); +int	vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, +			struct seg_desc *seg_desc);  int	vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val);  int	vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval);  int	vm_run(struct vmctx *ctx, int vcpu, uint64_t rip, @@ -104,6 +106,9 @@ int	vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot,  	    int func, int idx, uint64_t addr, uint64_t msg,  	    uint32_t vector_control); +int	vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *i1, uint64_t *i2); +int	vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t exit_intinfo); +  /*   * Return a pointer to the statistics buffer. Note that this is not MT-safe.   */ @@ -121,7 +126,7 @@ int	vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities);   * The 'iovcnt' should be big enough to accomodate all GPA segments.   * Returns 0 on success, 1 on a guest fault condition and -1 otherwise.   
*/ -int	vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, +int	vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg,  	    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt);  void	vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov,  	    void *host_dst, size_t len); diff --git a/sys/amd64/amd64/identcpu.c b/sys/amd64/amd64/identcpu.c index 957120cb9cb2..74be82cfba28 100644 --- a/sys/amd64/amd64/identcpu.c +++ b/sys/amd64/amd64/identcpu.c @@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$");  #include <machine/specialreg.h>  #include <machine/md_var.h> +#include <amd64/vmm/intel/vmx_controls.h>  #include <x86/isa/icu.h>  /* XXX - should be in header file: */ @@ -73,6 +74,7 @@ static u_int find_cpu_vendor_id(void);  static void print_AMD_info(void);  static void print_AMD_assoc(int i);  static void print_via_padlock_info(void); +static void print_vmx_info(void);  int	cpu_class;  char machine[] = "amd64"; @@ -428,6 +430,9 @@ printcpuinfo(void)  			if (via_feature_rng != 0 || via_feature_xcrypt != 0)  				print_via_padlock_info(); +			if (cpu_feature2 & CPUID2_VMX) +				print_vmx_info(); +  			if ((cpu_feature & CPUID_HTT) &&  			    cpu_vendor_id == CPU_VENDOR_AMD)  				cpu_feature &= ~CPUID_HTT; @@ -722,3 +727,197 @@ print_via_padlock_info(void)  	"\015RSA"		/* PMM */  	);  } + +static uint32_t +vmx_settable(uint64_t basic, int msr, int true_msr) +{ +	uint64_t val; + +	if (basic & (1UL << 55)) +		val = rdmsr(true_msr); +	else +		val = rdmsr(msr); + +	/* Just report the controls that can be set to 1. */ +	return (val >> 32); +} + +static void +print_vmx_info(void) +{ +	uint64_t basic, msr; +	uint32_t entry, exit, mask, pin, proc, proc2; +	int comma; + +	printf("\n  VT-x: "); +	msr = rdmsr(MSR_IA32_FEATURE_CONTROL); +	if (!(msr & IA32_FEATURE_CONTROL_VMX_EN)) +		printf("(disabled in BIOS) "); +	basic = rdmsr(MSR_VMX_BASIC); +	pin = vmx_settable(basic, MSR_VMX_PINBASED_CTLS, +	    MSR_VMX_TRUE_PINBASED_CTLS); +	proc = vmx_settable(basic, MSR_VMX_PROCBASED_CTLS, +	    MSR_VMX_TRUE_PROCBASED_CTLS); +	if (proc & PROCBASED_SECONDARY_CONTROLS) +		proc2 = vmx_settable(basic, MSR_VMX_PROCBASED_CTLS2, +		    MSR_VMX_PROCBASED_CTLS2); +	else +		proc2 = 0; +	exit = vmx_settable(basic, MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS); +	entry = vmx_settable(basic, MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS); + +	if (!bootverbose) { +		comma = 0; +		if (exit & VM_EXIT_SAVE_PAT && exit & VM_EXIT_LOAD_PAT && +		    entry & VM_ENTRY_LOAD_PAT) { +			printf("%sPAT", comma ? "," : ""); +			comma = 1; +		} +		if (proc & PROCBASED_HLT_EXITING) { +			printf("%sHLT", comma ? "," : ""); +			comma = 1; +		} +		if (proc & PROCBASED_MTF) { +			printf("%sMTF", comma ? "," : ""); +			comma = 1; +		} +		if (proc & PROCBASED_PAUSE_EXITING) { +			printf("%sPAUSE", comma ? "," : ""); +			comma = 1; +		} +		if (proc2 & PROCBASED2_ENABLE_EPT) { +			printf("%sEPT", comma ? "," : ""); +			comma = 1; +		} +		if (proc2 & PROCBASED2_UNRESTRICTED_GUEST) { +			printf("%sUG", comma ? "," : ""); +			comma = 1; +		} +		if (proc2 & PROCBASED2_ENABLE_VPID) { +			printf("%sVPID", comma ? "," : ""); +			comma = 1; +		} +		if (proc & PROCBASED_USE_TPR_SHADOW && +		    proc2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES && +		    proc2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE && +		    proc2 & PROCBASED2_APIC_REGISTER_VIRTUALIZATION && +		    proc2 & PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY) { +			printf("%sVID", comma ? 
"," : ""); +			comma = 1; +			if (pin & PINBASED_POSTED_INTERRUPT) +				printf(",PostIntr"); +		} +		return; +	} + +	mask = basic >> 32; +	printf("Basic Features=0x%b", mask, +	"\020" +	"\02132PA"		/* 32-bit physical addresses */ +	"\022SMM"		/* SMM dual-monitor */ +	"\027INS/OUTS"		/* VM-exit info for INS and OUTS */ +	"\030TRUE"		/* TRUE_CTLS MSRs */ +	); +	printf("\n        Pin-Based Controls=0x%b", pin, +	"\020" +	"\001ExtINT"		/* External-interrupt exiting */ +	"\004NMI"		/* NMI exiting */ +	"\006VNMI"		/* Virtual NMIs */ +	"\007PreTmr"		/* Activate VMX-preemption timer */ +	"\010PostIntr"		/* Process posted interrupts */ +	); +	printf("\n        Primary Processor Controls=0x%b", proc, +	"\020" +	"\003INTWIN"		/* Interrupt-window exiting */ +	"\004TSCOff"		/* Use TSC offsetting */ +	"\010HLT"		/* HLT exiting */ +	"\012INVLPG"		/* INVLPG exiting */ +	"\013MWAIT"		/* MWAIT exiting */ +	"\014RDPMC"		/* RDPMC exiting */ +	"\015RDTSC"		/* RDTSC exiting */ +	"\020CR3-LD"		/* CR3-load exiting */ +	"\021CR3-ST"		/* CR3-store exiting */ +	"\024CR8-LD"		/* CR8-load exiting */ +	"\025CR8-ST"		/* CR8-store exiting */ +	"\026TPR"		/* Use TPR shadow */ +	"\027NMIWIN"		/* NMI-window exiting */ +	"\030MOV-DR"		/* MOV-DR exiting */ +	"\031IO"		/* Unconditional I/O exiting */ +	"\032IOmap"		/* Use I/O bitmaps */ +	"\034MTF"		/* Monitor trap flag */ +	"\035MSRmap"		/* Use MSR bitmaps */ +	"\036MONITOR"		/* MONITOR exiting */ +	"\037PAUSE"		/* PAUSE exiting */ +	); +	if (proc & PROCBASED_SECONDARY_CONTROLS) +		printf("\n        Secondary Processor Controls=0x%b", proc2, +		"\020" +		"\001APIC"		/* Virtualize APIC accesses */ +		"\002EPT"		/* Enable EPT */ +		"\003DT"		/* Descriptor-table exiting */ +		"\004RDTSCP"		/* Enable RDTSCP */ +		"\005x2APIC"		/* Virtualize x2APIC mode */ +		"\006VPID"		/* Enable VPID */ +		"\007WBINVD"		/* WBINVD exiting */ +		"\010UG"		/* Unrestricted guest */ +		"\011APIC-reg"		/* APIC-register virtualization */ +		"\012VID"		/* Virtual-interrupt delivery */ +		"\013PAUSE-loop"	/* PAUSE-loop exiting */ +		"\014RDRAND"		/* RDRAND exiting */ +		"\015INVPCID"		/* Enable INVPCID */ +		"\016VMFUNC"		/* Enable VM functions */ +		"\017VMCS"		/* VMCS shadowing */ +		"\020EPT#VE"		/* EPT-violation #VE */ +		"\021XSAVES"		/* Enable XSAVES/XRSTORS */ +		); +	printf("\n        Exit Controls=0x%b", mask, +	"\020" +	"\003DR"		/* Save debug controls */ +				/* Ignore Host address-space size */ +	"\015PERF"		/* Load MSR_PERF_GLOBAL_CTRL */ +	"\020AckInt"		/* Acknowledge interrupt on exit */ +	"\023PAT-SV"		/* Save MSR_PAT */ +	"\024PAT-LD"		/* Load MSR_PAT */ +	"\025EFER-SV"		/* Save MSR_EFER */ +	"\026EFER-LD"		/* Load MSR_EFER */ +	"\027PTMR-SV"		/* Save VMX-preemption timer value */ +	); +	printf("\n        Entry Controls=0x%b", mask, +	"\020" +	"\003DR"		/* Save debug controls */ +				/* Ignore IA-32e mode guest */ +				/* Ignore Entry to SMM */ +				/* Ignore Deactivate dual-monitor treatment */ +	"\016PERF"		/* Load MSR_PERF_GLOBAL_CTRL */ +	"\017PAT"		/* Load MSR_PAT */ +	"\020EFER"		/* Load MSR_EFER */ +	); +	if (proc & PROCBASED_SECONDARY_CONTROLS && +	    (proc2 & (PROCBASED2_ENABLE_EPT | PROCBASED2_ENABLE_VPID)) != 0) { +		msr = rdmsr(MSR_VMX_EPT_VPID_CAP); +		mask = msr; +		printf("\n        EPT Features=0x%b", mask, +		"\020" +		"\001XO"		/* Execute-only translations */ +		"\007PW4"		/* Page-walk length of 4 */ +		"\011UC"		/* EPT paging-structure mem can be UC */ +		"\017WB"		/* EPT paging-structure mem can be WB */ +		"\0212M"		/* EPT PDE can map a 2-Mbyte page */ +		"\0221G"		
/* EPT PDPTE can map a 1-Gbyte page */ +		"\025INVEPT"		/* INVEPT is supported */ +		"\026AD"		/* Accessed and dirty flags for EPT */ +		"\032single"		/* INVEPT single-context type */ +		"\033all"		/* INVEPT all-context type */ +		); +		mask = msr >> 32; +		printf("\n        VPID Features=0x%b", mask, +		"\020" +		"\001INVVPID"		/* INVVPID is supported */ +		"\011individual"	/* INVVPID individual-address type */ +		"\012single"		/* INVVPID single-context type */ +		"\013all"		/* INVVPID all-context type */ +		 /* INVVPID single-context-retaining-globals type */ +		"\014single-globals"	 +		); +	} +} diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 5a359e95993c..63a9b3fdde0f 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -29,11 +29,14 @@  #ifndef _VMM_H_  #define	_VMM_H_ +#include <x86/segments.h> +  enum vm_suspend_how {  	VM_SUSPEND_NONE,  	VM_SUSPEND_RESET,  	VM_SUSPEND_POWEROFF,  	VM_SUSPEND_HALT, +	VM_SUSPEND_TRIPLEFAULT,  	VM_SUSPEND_LAST  }; @@ -75,6 +78,10 @@ enum vm_reg_name {  	VM_REG_GUEST_GDTR,  	VM_REG_GUEST_EFER,  	VM_REG_GUEST_CR2, +	VM_REG_GUEST_PDPTE0, +	VM_REG_GUEST_PDPTE1, +	VM_REG_GUEST_PDPTE2, +	VM_REG_GUEST_PDPTE3,  	VM_REG_LAST  }; @@ -84,6 +91,16 @@ enum x2apic_state {  	X2APIC_STATE_LAST  }; +#define	VM_INTINFO_VECTOR(info)	((info) & 0xff) +#define	VM_INTINFO_DEL_ERRCODE	0x800 +#define	VM_INTINFO_RSVD		0x7ffff000 +#define	VM_INTINFO_VALID	0x80000000 +#define	VM_INTINFO_TYPE		0x700 +#define	VM_INTINFO_HWINTR	(0 << 8) +#define	VM_INTINFO_NMI		(2 << 8) +#define	VM_INTINFO_HWEXCEPTION	(3 << 8) +#define	VM_INTINFO_SWINTR	(4 << 8) +  #ifdef _KERNEL  #define	VM_MAX_NAMELEN	32 @@ -99,6 +116,7 @@ struct vioapic;  struct vlapic;  struct vmspace;  struct vm_object; +struct vm_guest_paging;  struct pmap;  typedef int	(*vmm_init_func_t)(int ipinum); @@ -252,6 +270,14 @@ vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)  	return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING);  } +#ifdef _SYS_PROC_H_ +static int __inline +vcpu_should_yield(struct vm *vm, int vcpu) +{ +	return (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)); +} +#endif +  void *vcpu_stats(struct vm *vm, int vcpu);  void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr);  struct vmspace *vm_get_vmspace(struct vm *vm); @@ -274,21 +300,63 @@ struct vatpit *vm_atpit(struct vm *vm);  int vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *vme);  /* - * Returns 0 if there is no exception pending for this vcpu. Returns 1 if an - * exception is pending and also updates 'vme'. The pending exception is - * cleared when this function returns. + * This function is called after a VM-exit that occurred during exception or + * interrupt delivery through the IDT. The format of 'intinfo' is described + * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2.   * - * This function should only be called in the context of the thread that is - * executing this vcpu. + * If a VM-exit handler completes the event delivery successfully then it + * should call vm_exit_intinfo() to extinguish the pending event. For e.g., + * if the task switch emulation is triggered via a task gate then it should + * call this function with 'intinfo=0' to indicate that the external event + * is not pending anymore. + * + * Return value is 0 on success and non-zero on failure.   
*/ -int vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *vme); +int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo); -void vm_inject_gp(struct vm *vm, int vcpuid); /* general protection fault */ -void vm_inject_ud(struct vm *vm, int vcpuid); /* undefined instruction fault */ -void vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2); +/* + * This function is called before every VM-entry to retrieve a pending + * event that should be injected into the guest. This function combines + * nested events into a double or triple fault. + * + * Returns 0 if there are no events that need to be injected into the guest + * and non-zero otherwise. + */ +int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info); + +int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2);  enum vm_reg_name vm_segment_name(int seg_encoding); +struct vm_copyinfo { +	uint64_t	gpa; +	size_t		len; +	void		*hva; +	void		*cookie; +}; + +/* + * Set up 'copyinfo[]' to copy to/from guest linear address space starting + * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for + * a copyin or PROT_WRITE for a copyout.  + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. + * + * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if + * the return value is 0. The 'copyinfo[]' resources should be freed by calling + * 'vm_copy_teardown()' after the copy is done. + */ +int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, +    uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, +    int num_copyinfo); +void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, +    int num_copyinfo); +void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, +    void *kaddr, size_t len); +void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, +    struct vm_copyinfo *copyinfo, size_t len);  #endif	/* KERNEL */  #define	VM_MAXCPU	16			/* maximum virtual cpus */ @@ -322,13 +390,16 @@ struct seg_desc {  	uint32_t	limit;  	uint32_t	access;  }; -#define	SEG_DESC_TYPE(desc)		((desc)->access & 0x001f) -#define	SEG_DESC_PRESENT(desc)		((desc)->access & 0x0080) -#define	SEG_DESC_DEF32(desc)		((desc)->access & 0x4000) -#define	SEG_DESC_GRANULARITY(desc)	((desc)->access & 0x8000) -#define	SEG_DESC_UNUSABLE(desc)		((desc)->access & 0x10000) +#define	SEG_DESC_TYPE(access)		((access) & 0x001f) +#define	SEG_DESC_DPL(access)		(((access) >> 5) & 0x3) +#define	SEG_DESC_PRESENT(access)	(((access) & 0x0080) ? 1 : 0) +#define	SEG_DESC_DEF32(access)		(((access) & 0x4000) ? 1 : 0) +#define	SEG_DESC_GRANULARITY(access)	(((access) & 0x8000) ? 1 : 0) +#define	SEG_DESC_UNUSABLE(access)	(((access) & 0x10000) ? 
1 : 0)  enum vm_cpu_mode { +	CPU_MODE_REAL, +	CPU_MODE_PROTECTED,  	CPU_MODE_COMPATIBILITY,		/* IA-32E mode (CS.L = 0) */  	CPU_MODE_64BIT,			/* IA-32E mode (CS.L = 1) */  }; @@ -364,11 +435,14 @@ struct vie {  	uint8_t		num_valid;		/* size of the instruction */  	uint8_t		num_processed; +	uint8_t		addrsize:4, opsize:4;	/* address and operand sizes */  	uint8_t		rex_w:1,		/* REX prefix */  			rex_r:1,  			rex_x:1,  			rex_b:1, -			rex_present:1; +			rex_present:1, +			opsize_override:1,	/* Operand size override */ +			addrsize_override:1;	/* Address size override */  	uint8_t		mod:2,			/* ModRM byte */  			reg:4, @@ -410,6 +484,7 @@ enum vm_exitcode {  	VM_EXITCODE_IOAPIC_EOI,  	VM_EXITCODE_SUSPENDED,  	VM_EXITCODE_INOUT_STR, +	VM_EXITCODE_TASK_SWITCH,  	VM_EXITCODE_MAX  }; @@ -434,6 +509,22 @@ struct vm_inout_str {  	struct seg_desc seg_desc;  }; +enum task_switch_reason { +	TSR_CALL, +	TSR_IRET, +	TSR_JMP, +	TSR_IDT_GATE,	/* task gate in IDT */ +}; + +struct vm_task_switch { +	uint16_t	tsssel;		/* new TSS selector */ +	int		ext;		/* task switch due to external event */ +	uint32_t	errcode; +	int		errcode_valid;	/* push 'errcode' on the new stack */ +	enum task_switch_reason reason; +	struct vm_guest_paging paging; +}; +  struct vm_exit {  	enum vm_exitcode	exitcode;  	int			inst_length;	/* 0 means unknown */ @@ -448,6 +539,7 @@ struct vm_exit {  		struct {  			uint64_t	gpa;  			uint64_t	gla; +			int		cs_d;		/* CS.D */  			struct vm_guest_paging paging;  			struct vie	vie;  		} inst_emul; @@ -487,7 +579,38 @@ struct vm_exit {  		struct {  			enum vm_suspend_how how;  		} suspended; +		struct vm_task_switch task_switch;  	} u;  }; +/* APIs to inject faults into the guest */ +void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid, +    int errcode); + +static void __inline +vm_inject_ud(void *vm, int vcpuid) +{ +	vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0); +} + +static void __inline +vm_inject_gp(void *vm, int vcpuid) +{ +	vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0); +} + +static void __inline +vm_inject_ac(void *vm, int vcpuid, int errcode) +{ +	vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode); +} + +static void __inline +vm_inject_ss(void *vm, int vcpuid, int errcode) +{ +	vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode); +} + +void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2); +  #endif	/* _VMM_H_ */ diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h index 9b3b00ded0ba..e4d839ef6549 100644 --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -189,6 +189,12 @@ struct vm_cpuset {  #define	VM_ACTIVE_CPUS		0  #define	VM_SUSPENDED_CPUS	1 +struct vm_intinfo { +	int		vcpuid; +	uint64_t	info1; +	uint64_t	info2; +}; +  enum {  	/* general routines */  	IOCNUM_ABIVERS = 0, @@ -211,6 +217,8 @@ enum {  	IOCNUM_GET_SEGMENT_DESCRIPTOR = 23,  	/* interrupt injection */ +	IOCNUM_GET_INTINFO = 28, +	IOCNUM_SET_INTINFO = 29,  	IOCNUM_INJECT_EXCEPTION = 30,  	IOCNUM_LAPIC_IRQ = 31,  	IOCNUM_INJECT_NMI = 32, @@ -324,4 +332,8 @@ enum {  	_IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu)  #define	VM_GET_CPUS	\  	_IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset) +#define	VM_SET_INTINFO	\ +	_IOW('v', IOCNUM_SET_INTINFO, struct vm_intinfo) +#define	VM_GET_INTINFO	\ +	_IOWR('v', IOCNUM_GET_INTINFO, struct vm_intinfo)  #endif diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h index e4c408bf165f..bbd3d88d9cf3 100644 --- a/sys/amd64/include/vmm_instruction_emul.h +++ 
b/sys/amd64/include/vmm_instruction_emul.h @@ -52,8 +52,8 @@ typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa,   * s   */  int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie, -			    mem_region_read_t mrr, mem_region_write_t mrw, -			    void *mrarg); +    struct vm_guest_paging *paging, mem_region_read_t mrr, +    mem_region_write_t mrw, void *mrarg);  int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,      uint64_t val, int size); @@ -108,7 +108,7 @@ void vie_init(struct vie *vie);   */  #define	VIE_INVALID_GLA		(1UL << 63)	/* a non-canonical address */  int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, -			   enum vm_cpu_mode cpu_mode, struct vie *vie); +			   enum vm_cpu_mode cpu_mode, int csd, struct vie *vie);  #endif	/* _KERNEL */  #endif	/* _VMM_INSTRUCTION_EMUL_H_ */ diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c index cc97d9516613..51e5c2c06f00 100644 --- a/sys/amd64/vmm/intel/vmcs.c +++ b/sys/amd64/vmm/intel/vmcs.c @@ -103,6 +103,14 @@ vmcs_field_encoding(int ident)  		return (VMCS_GUEST_LDTR_SELECTOR);  	case VM_REG_GUEST_EFER:  		return (VMCS_GUEST_IA32_EFER); +	case VM_REG_GUEST_PDPTE0: +		return (VMCS_GUEST_PDPTE0); +	case VM_REG_GUEST_PDPTE1: +		return (VMCS_GUEST_PDPTE1); +	case VM_REG_GUEST_PDPTE2: +		return (VMCS_GUEST_PDPTE2); +	case VM_REG_GUEST_PDPTE3: +		return (VMCS_GUEST_PDPTE3);  	default:  		return (-1);  	} diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h index 657d5b0f65cb..4e9557c39b9b 100644 --- a/sys/amd64/vmm/intel/vmcs.h +++ b/sys/amd64/vmm/intel/vmcs.h @@ -346,6 +346,9 @@ vmcs_write(uint32_t encoding, uint64_t val)  #define	VMCS_INTR_T_HWINTR	(0 << 8)  #define	VMCS_INTR_T_NMI		(2 << 8)  #define	VMCS_INTR_T_HWEXCEPTION	(3 << 8) +#define	VMCS_INTR_T_SWINTR	(4 << 8) +#define	VMCS_INTR_T_PRIV_SWEXCEPTION (5 << 8) +#define	VMCS_INTR_T_SWEXCEPTION	(6 << 8)  #define	VMCS_INTR_DEL_ERRCODE	(1 << 11)  /* diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 2cbb159e9159..b2c570216a78 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -149,8 +149,6 @@ SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,  SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,  	     &cr4_zeros_mask, 0, NULL); -static int vmx_no_patmsr; -  static int vmx_initialized;  SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,  	   &vmx_initialized, 0, "Intel VMX initialized"); @@ -158,18 +156,38 @@ SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,  /*   * Optional capabilities   */ +static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL); + +static int vmx_patmsr; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, patmsr, CTLFLAG_RD, &vmx_patmsr, 0, +    "PAT MSR saved and restored in VCMS"); +  static int cap_halt_exit; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0, +    "HLT triggers a VM-exit"); +  static int cap_pause_exit; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit, +    0, "PAUSE triggers a VM-exit"); +  static int cap_unrestricted_guest; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD, +    &cap_unrestricted_guest, 0, "Unrestricted guests"); +  static int cap_monitor_trap; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD, +    &cap_monitor_trap, 0, "Monitor trap flag"); +  static int cap_invpcid; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid, +    0, 
"Guests are allowed to use INVPCID");  static int virtual_interrupt_delivery; -SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,      &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");  static int posted_interrupts; -SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD, +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD,      &posted_interrupts, 0, "APICv posted interrupt support");  static int pirvec; @@ -618,6 +636,7 @@ vmx_init(int ipinum)  	}  	/* Check support for VM-exit controls */ +	vmx_patmsr = 1;  	error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,  			       VM_EXIT_CTLS_ONE_SETTING,  			       VM_EXIT_CTLS_ZERO_SETTING, @@ -637,12 +656,12 @@ vmx_init(int ipinum)  			if (bootverbose)  				printf("vmm: PAT MSR access not supported\n");  			guest_msr_valid(MSR_PAT); -			vmx_no_patmsr = 1; +			vmx_patmsr = 0;  		}  	}  	/* Check support for VM-entry controls */ -	if (!vmx_no_patmsr) { +	if (vmx_patmsr) {  		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,  				       MSR_VMX_TRUE_ENTRY_CTLS,  				       VM_ENTRY_CTLS_ONE_SETTING, @@ -918,7 +937,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap)  	 * MSR_PAT save/restore support, leave access disabled so accesses  	 * will be trapped.  	 */ -	if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT)) +	if (vmx_patmsr && guest_msr_rw(vmx, MSR_PAT))  		panic("vmx_vminit: error setting guest pat msr access");  	vpid_alloc(vpid, VM_MAXCPU); @@ -974,7 +993,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap)  		vmx->cap[i].proc_ctls = procbased_ctls;  		vmx->cap[i].proc_ctls2 = procbased_ctls2; -		vmx->state[i].lastcpu = -1; +		vmx->state[i].lastcpu = NOCPU;  		vmx->state[i].vpid = vpid[i];  		msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count); @@ -1047,27 +1066,37 @@ vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)  }  static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); +static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done"); -static void -vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) +/* + * Invalidate guest mappings identified by its vpid from the TLB. + */ +static __inline void +vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running)  {  	struct vmxstate *vmxstate;  	struct invvpid_desc invvpid_desc;  	vmxstate = &vmx->state[vcpu]; -	if (vmxstate->lastcpu == curcpu) +	if (vmxstate->vpid == 0)  		return; -	vmxstate->lastcpu = curcpu; - -	vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); +	if (!running) { +		/* +		 * Set the 'lastcpu' to an invalid host cpu. +		 * +		 * This will invalidate TLB entries tagged with the vcpu's +		 * vpid the next time it runs via vmx_set_pcpu_defaults(). +		 */ +		vmxstate->lastcpu = NOCPU; +		return; +	} -	vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); -	vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); -	vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); +	KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside " +	    "critical section", __func__, vcpu));  	/* -	 * If we are using VPIDs then invalidate all mappings tagged with 'vpid' +	 * Invalidate all mappings tagged with 'vpid'  	 *  	 * We do this because this vcpu was executing on a different host  	 * cpu when it last ran. 
We do not track whether it invalidated @@ -1081,25 +1110,43 @@ vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)  	 * Note also that this will invalidate mappings tagged with 'vpid'  	 * for "all" EP4TAs.  	 */ -	if (vmxstate->vpid != 0) { -		if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { -			invvpid_desc._res1 = 0; -			invvpid_desc._res2 = 0; -			invvpid_desc.vpid = vmxstate->vpid; -			invvpid_desc.linear_addr = 0; -			invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); -		} else { -			/* -			 * The invvpid can be skipped if an invept is going to -			 * be performed before entering the guest. The invept -			 * will invalidate combined mappings tagged with -			 * 'vmx->eptp' for all vpids. -			 */ -			vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1); -		} +	if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { +		invvpid_desc._res1 = 0; +		invvpid_desc._res2 = 0; +		invvpid_desc.vpid = vmxstate->vpid; +		invvpid_desc.linear_addr = 0; +		invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); +		vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1); +	} else { +		/* +		 * The invvpid can be skipped if an invept is going to +		 * be performed before entering the guest. The invept +		 * will invalidate combined mappings tagged with +		 * 'vmx->eptp' for all vpids. +		 */ +		vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);  	}  } +static void +vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) +{ +	struct vmxstate *vmxstate; + +	vmxstate = &vmx->state[vcpu]; +	if (vmxstate->lastcpu == curcpu) +		return; + +	vmxstate->lastcpu = curcpu; + +	vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); + +	vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); +	vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); +	vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); +	vmx_invvpid(vmx, vcpu, pmap, 1); +} +  /*   * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.   */ @@ -1183,24 +1230,32 @@ vmx_inject_nmi(struct vmx *vmx, int vcpu)  static void  vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)  { -	struct vm_exception exc;  	int vector, need_nmi_exiting, extint_pending; -	uint64_t rflags; +	uint64_t rflags, entryinfo;  	uint32_t gi, info; -	if (vm_exception_pending(vmx->vm, vcpu, &exc)) { -		KASSERT(exc.vector >= 0 && exc.vector < 32, -		    ("%s: invalid exception vector %d", __func__, exc.vector)); +	if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { +		KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " +		    "intinfo is not valid: %#lx", __func__, entryinfo));  		info = vmcs_read(VMCS_ENTRY_INTR_INFO);  		KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " -		     "pending exception %d: %#x", __func__, exc.vector, info)); +		     "pending exception: %#lx/%#x", __func__, entryinfo, info)); -		info = exc.vector | VMCS_INTR_T_HWEXCEPTION | VMCS_INTR_VALID; -		if (exc.error_code_valid) { -			info |= VMCS_INTR_DEL_ERRCODE; -			vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, exc.error_code); +		info = entryinfo; +		vector = info & 0xff; +		if (vector == IDT_BP || vector == IDT_OF) { +			/* +			 * VT-x requires #BP and #OF to be injected as software +			 * exceptions. 
+			 */ +			info &= ~VMCS_INTR_T_MASK; +			info |= VMCS_INTR_T_SWEXCEPTION;  		} + +		if (info & VMCS_INTR_DEL_ERRCODE) +			vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); +  		vmcs_write(VMCS_ENTRY_INTR_INFO, info);  	} @@ -1379,6 +1434,16 @@ vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)  	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);  } +static void +vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid) +{ +	uint32_t gi; + +	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); +	KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING, +	    ("NMI blocking is not in effect %#x", gi)); +} +  static int  vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)  { @@ -1659,11 +1724,19 @@ vmx_cpl(void)  static enum vm_cpu_mode  vmx_cpu_mode(void)  { +	uint32_t csar; -	if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) -		return (CPU_MODE_64BIT); -	else -		return (CPU_MODE_COMPATIBILITY); +	if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) { +		csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); +		if (csar & 0x2000) +			return (CPU_MODE_64BIT);	/* CS.L = 1 */ +		else +			return (CPU_MODE_COMPATIBILITY); +	} else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) { +		return (CPU_MODE_PROTECTED); +	} else { +		return (CPU_MODE_REAL); +	}  }  static enum vm_paging_mode @@ -1757,10 +1830,25 @@ vmx_paging_info(struct vm_guest_paging *paging)  static void  vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)  { +	struct vm_guest_paging *paging; +	uint32_t csar; +	 +	paging = &vmexit->u.inst_emul.paging; +  	vmexit->exitcode = VM_EXITCODE_INST_EMUL;  	vmexit->u.inst_emul.gpa = gpa;  	vmexit->u.inst_emul.gla = gla; -	vmx_paging_info(&vmexit->u.inst_emul.paging); +	vmx_paging_info(paging); +	switch (paging->cpu_mode) { +	case CPU_MODE_PROTECTED: +	case CPU_MODE_COMPATIBILITY: +		csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); +		vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar); +		break; +	default: +		vmexit->u.inst_emul.cs_d = 0; +		break; +	}  }  static int @@ -1969,6 +2057,26 @@ vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)  	return (UNHANDLED);  } +static enum task_switch_reason +vmx_task_switch_reason(uint64_t qual) +{ +	int reason; + +	reason = (qual >> 30) & 0x3; +	switch (reason) { +	case 0: +		return (TSR_CALL); +	case 1: +		return (TSR_IRET); +	case 2: +		return (TSR_JMP); +	case 3: +		return (TSR_IDT_GATE); +	default: +		panic("%s: invalid reason %d", __func__, reason); +	} +} +  static int  vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)  { @@ -1976,9 +2084,10 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)  	struct vmxctx *vmxctx;  	struct vlapic *vlapic;  	struct vm_inout_str *vis; +	struct vm_task_switch *ts;  	uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; -	uint32_t reason; -	uint64_t qual, gpa; +	uint32_t intr_type, reason; +	uint64_t exitintinfo, qual, gpa;  	bool retu;  	CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); @@ -1994,46 +2103,99 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)  	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);  	/* -	 * VM exits that could be triggered during event injection on the -	 * previous VM entry need to be handled specially by re-injecting -	 * the event. +	 * VM exits that can be triggered during event delivery need to +	 * be handled specially by re-injecting the event if the IDT +	 * vectoring information field's valid bit is set.  	 
*  	 * See "Information for VM Exits During Event Delivery" in Intel SDM  	 * for details.  	 */ -	switch (reason) { -	case EXIT_REASON_EPT_FAULT: -	case EXIT_REASON_EPT_MISCONFIG: -	case EXIT_REASON_APIC_ACCESS: -	case EXIT_REASON_TASK_SWITCH: -	case EXIT_REASON_EXCEPTION: -		idtvec_info = vmcs_idt_vectoring_info(); -		if (idtvec_info & VMCS_IDT_VEC_VALID) { -			idtvec_info &= ~(1 << 12); /* clear undefined bit */ -			vmcs_write(VMCS_ENTRY_INTR_INFO, idtvec_info); -			if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { -				idtvec_err = vmcs_idt_vectoring_err(); -				vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, -				    idtvec_err); -			} -			/* -			 * If 'virtual NMIs' are being used and the VM-exit -			 * happened while injecting an NMI during the previous -			 * VM-entry, then clear "blocking by NMI" in the Guest -			 * Interruptibility-state. -			 */ -			if ((idtvec_info & VMCS_INTR_T_MASK) == -			    VMCS_INTR_T_NMI) { -				 vmx_clear_nmi_blocking(vmx, vcpu); -			} +	idtvec_info = vmcs_idt_vectoring_info(); +	if (idtvec_info & VMCS_IDT_VEC_VALID) { +		idtvec_info &= ~(1 << 12); /* clear undefined bit */ +		exitintinfo = idtvec_info; +		if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { +			idtvec_err = vmcs_idt_vectoring_err(); +			exitintinfo |= (uint64_t)idtvec_err << 32; +		} +		error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo); +		KASSERT(error == 0, ("%s: vm_set_intinfo error %d", +		    __func__, error)); + +		/* +		 * If 'virtual NMIs' are being used and the VM-exit +		 * happened while injecting an NMI during the previous +		 * VM-entry, then clear "blocking by NMI" in the +		 * Guest Interruptibility-State so the NMI can be +		 * reinjected on the subsequent VM-entry. +		 * +		 * However, if the NMI was being delivered through a task +		 * gate, then the new task must start execution with NMIs +		 * blocked so don't clear NMI blocking in this case. +		 */ +		intr_type = idtvec_info & VMCS_INTR_T_MASK; +		if (intr_type == VMCS_INTR_T_NMI) { +			if (reason != EXIT_REASON_TASK_SWITCH) +				vmx_clear_nmi_blocking(vmx, vcpu); +			else +				vmx_assert_nmi_blocking(vmx, vcpu); +		} + +		/* +		 * Update VM-entry instruction length if the event being +		 * delivered was a software interrupt or software exception. +		 */ +		if (intr_type == VMCS_INTR_T_SWINTR || +		    intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION || +		    intr_type == VMCS_INTR_T_SWEXCEPTION) {  			vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);  		} -	default: -		idtvec_info = 0; -		break;  	}  	switch (reason) { +	case EXIT_REASON_TASK_SWITCH: +		ts = &vmexit->u.task_switch; +		ts->tsssel = qual & 0xffff; +		ts->reason = vmx_task_switch_reason(qual); +		ts->ext = 0; +		ts->errcode_valid = 0; +		vmx_paging_info(&ts->paging); +		/* +		 * If the task switch was due to a CALL, JMP, IRET, software +		 * interrupt (INT n) or software exception (INT3, INTO), +		 * then the saved %rip references the instruction that caused +		 * the task switch. The instruction length field in the VMCS +		 * is valid in this case. +		 * +		 * In all other cases (e.g., NMI, hardware exception) the +		 * saved %rip is one that would have been saved in the old TSS +		 * had the task switch completed normally so the instruction +		 * length field is not needed in this case and is explicitly +		 * set to 0. 
+		 */ +		if (ts->reason == TSR_IDT_GATE) { +			KASSERT(idtvec_info & VMCS_IDT_VEC_VALID, +			    ("invalid idtvec_info %#x for IDT task switch", +			    idtvec_info)); +			intr_type = idtvec_info & VMCS_INTR_T_MASK; +			if (intr_type != VMCS_INTR_T_SWINTR && +			    intr_type != VMCS_INTR_T_SWEXCEPTION && +			    intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) { +				/* Task switch triggered by external event */ +				ts->ext = 1; +				vmexit->inst_length = 0; +				if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { +					ts->errcode_valid = 1; +					ts->errcode = vmcs_idt_vectoring_err(); +				} +			} +		} +		vmexit->exitcode = VM_EXITCODE_TASK_SWITCH; +		VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, " +		    "%s errcode 0x%016lx", ts->reason, ts->tsssel, +		    ts->ext ? "external" : "internal", +		    ((uint64_t)ts->errcode << 32) | ts->errcode_valid); +		break;  	case EXIT_REASON_CR_ACCESS:  		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);  		switch (qual & 0xf) { @@ -2179,6 +2341,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)  		 * the guest.  		 *  		 * See "Resuming Guest Software after Handling an Exception". +		 * See "Information for VM Exits Due to Vectored Events".  		 */  		if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&  		    (intr_info & 0xff) != IDT_DF && @@ -2396,6 +2559,13 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,  		 * pmap_invalidate_ept().  		 */  		disable_intr(); +		vmx_inject_interrupts(vmx, vcpu, vlapic); + +		/* +		 * Check for vcpu suspension after injecting events because +		 * vmx_inject_interrupts() can suspend the vcpu due to a +		 * triple fault. +		 */  		if (vcpu_suspended(suspend_cookie)) {  			enable_intr();  			vm_exit_suspended(vmx->vm, vcpu, vmcs_guest_rip()); @@ -2408,7 +2578,7 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,  			break;  		} -		if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) { +		if (vcpu_should_yield(vm, vcpu)) {  			enable_intr();  			vm_exit_astpending(vmx->vm, vcpu, vmcs_guest_rip());  			vmx_astpending_trace(vmx, vcpu, vmexit->rip); @@ -2416,7 +2586,6 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,  			break;  		} -		vmx_inject_interrupts(vmx, vcpu, vlapic);  		vmx_run_trace(vmx, vcpu);  		rc = vmx_enter_guest(vmxctx, vmx, launched); @@ -2584,6 +2753,7 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)  {  	int error, hostcpu, running, shadow;  	uint64_t ctls; +	pmap_t pmap;  	struct vmx *vmx = arg;  	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); @@ -2621,6 +2791,18 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)  			error = vmcs_setreg(&vmx->vmcs[vcpu], running,  				    VMCS_IDENT(shadow), val);  		} + +		if (reg == VM_REG_GUEST_CR3) { +			/* +			 * Invalidate the guest vcpu's TLB mappings to emulate +			 * the behavior of updating %cr3. +			 * +			 * XXX the processor retains global mappings when %cr3 +			 * is updated but vmx_invvpid() does not. 
+			 */ +			pmap = vmx->ctx[vcpu].pmap; +			vmx_invvpid(vmx, vcpu, pmap, running); +		}  	}  	return (error); diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c index 2aba63c916e6..a3428db8a87d 100644 --- a/sys/amd64/vmm/intel/vmx_msr.c +++ b/sys/amd64/vmm/intel/vmx_msr.c @@ -33,6 +33,7 @@ __FBSDID("$FreeBSD$");  #include <sys/systm.h>  #include <machine/cpufunc.h> +#include <machine/specialreg.h>  #include "vmx_msr.h" diff --git a/sys/amd64/vmm/intel/vmx_msr.h b/sys/amd64/vmm/intel/vmx_msr.h index e6379a93d155..340b0f7ab436 100644 --- a/sys/amd64/vmm/intel/vmx_msr.h +++ b/sys/amd64/vmm/intel/vmx_msr.h @@ -29,29 +29,6 @@  #ifndef _VMX_MSR_H_  #define	_VMX_MSR_H_ -#define	MSR_VMX_BASIC			0x480 -#define	MSR_VMX_EPT_VPID_CAP		0x48C - -#define	MSR_VMX_PROCBASED_CTLS		0x482 -#define	MSR_VMX_TRUE_PROCBASED_CTLS	0x48E - -#define	MSR_VMX_PINBASED_CTLS		0x481 -#define	MSR_VMX_TRUE_PINBASED_CTLS	0x48D - -#define	MSR_VMX_PROCBASED_CTLS2		0x48B - -#define	MSR_VMX_EXIT_CTLS		0x483 -#define	MSR_VMX_TRUE_EXIT_CTLS		0x48f - -#define	MSR_VMX_ENTRY_CTLS		0x484 -#define	MSR_VMX_TRUE_ENTRY_CTLS		0x490 - -#define	MSR_VMX_CR0_FIXED0		0x486 -#define	MSR_VMX_CR0_FIXED1		0x487 - -#define	MSR_VMX_CR4_FIXED0		0x488 -#define	MSR_VMX_CR4_FIXED1		0x489 -  uint32_t vmx_revision(void);  int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, diff --git a/sys/amd64/vmm/intel/vtd.c b/sys/amd64/vmm/intel/vtd.c index ca76ea82ce65..f5ef71b65d54 100644 --- a/sys/amd64/vmm/intel/vtd.c +++ b/sys/amd64/vmm/intel/vtd.c @@ -452,6 +452,11 @@ vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len,  	ptpindex = 0;  	ptpshift = 0; +	KASSERT(gpa + len > gpa, ("%s: invalid gpa range %#lx/%#lx", __func__, +	    gpa, len)); +	KASSERT(gpa + len <= dom->maxaddr, ("%s: gpa range %#lx/%#lx beyond " +	    "domain maxaddr %#lx", __func__, gpa, len, dom->maxaddr)); +  	if (gpa & PAGE_MASK)  		panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa); diff --git a/sys/amd64/vmm/io/vatpic.c b/sys/amd64/vmm/io/vatpic.c index ee6fc84f90c3..38fc458b7f73 100644 --- a/sys/amd64/vmm/io/vatpic.c +++ b/sys/amd64/vmm/io/vatpic.c @@ -195,26 +195,29 @@ vatpic_notify_intr(struct vatpic *vatpic)  		    atpic->mask, atpic->request, atpic->service);  		/* +		 * From Section 3.6.2, "Interrupt Modes", in the +		 * MPtable Specification, Version 1.4 +		 *  		 * PIC interrupts are routed to both the Local APIC  		 * and the I/O APIC to support operation in 1 of 3  		 * modes.  		 *  		 * 1. Legacy PIC Mode: the PIC effectively bypasses -		 * all APIC components.  In mode '1' the local APIC is +		 * all APIC components.  In this mode the local APIC is  		 * disabled and LINT0 is reconfigured as INTR to  		 * deliver the PIC interrupt directly to the CPU.  		 *  		 * 2. Virtual Wire Mode: the APIC is treated as a  		 * virtual wire which delivers interrupts from the PIC -		 * to the CPU.  In mode '2' LINT0 is programmed as +		 * to the CPU.  In this mode LINT0 is programmed as  		 * ExtINT to indicate that the PIC is the source of  		 * the interrupt.  		 * -		 * 3. Symmetric I/O Mode: PIC interrupts are fielded -		 * by the I/O APIC and delivered to the appropriate -		 * CPU.  In mode '3' the I/O APIC input 0 is -		 * programmed as ExtINT to indicate that the PIC is -		 * the source of the interrupt. +		 * 3. Virtual Wire Mode via I/O APIC: PIC interrupts are +		 * fielded by the I/O APIC and delivered to the appropriate +		 * CPU.  
In this mode the I/O APIC input 0 is programmed +		 * as ExtINT to indicate that the PIC is the source of the +		 * interrupt.  		 */  		atpic->intr_raised = true;  		lapic_set_local_intr(vatpic->vm, -1, APIC_LVT_LINT0); diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index c2a9fd1e117e..fa0200e84b5c 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -97,6 +97,7 @@ struct vcpu {  	int		hostcpu;	/* (o) vcpu's host cpu */  	struct vlapic	*vlapic;	/* (i) APIC device model */  	enum x2apic_state x2apic_state;	/* (i) APIC mode */ +	uint64_t	exitintinfo;	/* (i) events pending at VM exit */  	int		nmi_pending;	/* (i) NMI pending */  	int		extint_pending;	/* (i) INTR pending */  	struct vm_exception exception;	/* (x) exception collateral */ @@ -242,6 +243,7 @@ vcpu_init(struct vm *vm, int vcpu_id, bool create)  	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);  	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); +	vcpu->exitintinfo = 0;  	vcpu->nmi_pending = 0;  	vcpu->extint_pending = 0;  	vcpu->exception_pending = 0; @@ -571,6 +573,21 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)  	return (0);  } +static vm_paddr_t +vm_maxmem(struct vm *vm) +{ +	int i; +	vm_paddr_t gpa, maxmem; + +	maxmem = 0; +	for (i = 0; i < vm->num_mem_segs; i++) { +		gpa = vm->mem_segs[i].gpa + vm->mem_segs[i].len; +		if (gpa > maxmem) +			maxmem = gpa; +	} +	return (maxmem); +} +  static void  vm_gpa_unwire(struct vm *vm)  { @@ -708,7 +725,7 @@ vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)  	if (ppt_assigned_devices(vm) == 0) {  		KASSERT(vm->iommu == NULL,  		    ("vm_assign_pptdev: iommu must be NULL")); -		maxaddr = vmm_mem_maxaddr(); +		maxaddr = vm_maxmem(vm);  		vm->iommu = iommu_create_domain(maxaddr);  		error = vm_gpa_wire(vm); @@ -1104,6 +1121,10 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)  			}  		} +		/* Don't go to sleep if the vcpu thread needs to yield */ +		if (vcpu_should_yield(vm, vcpuid)) +			break; +  		/*  		 * Some Linux guests implement "halt" by having all vcpus  		 * execute HLT with interrupts disabled. 'halted_cpus' keeps @@ -1127,7 +1148,11 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)  		t = ticks;  		vcpu_require_state_locked(vcpu, VCPU_SLEEPING); -		msleep_spin(vcpu, &vcpu->mtx, wmesg, 0); +		/* +		 * XXX msleep_spin() cannot be interrupted by signals so +		 * wake up periodically to check pending signals. 
+		 */ +		msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);  		vcpu_require_state_locked(vcpu, VCPU_FROZEN);  		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);  	} @@ -1191,15 +1216,18 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)  	struct vm_guest_paging *paging;  	mem_region_read_t mread;  	mem_region_write_t mwrite; -	int error; +	enum vm_cpu_mode cpu_mode; +	int cs_d, error;  	vcpu = &vm->vcpu[vcpuid];  	vme = &vcpu->exitinfo;  	gla = vme->u.inst_emul.gla;  	gpa = vme->u.inst_emul.gpa; +	cs_d = vme->u.inst_emul.cs_d;  	vie = &vme->u.inst_emul.vie;  	paging = &vme->u.inst_emul.paging; +	cpu_mode = paging->cpu_mode;  	vie_init(vie); @@ -1213,7 +1241,7 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)  	else if (error != 0)  		panic("%s: vmm_fetch_instruction error %d", __func__, error); -	if (vmm_decode_instruction(vm, vcpuid, gla, paging->cpu_mode, vie) != 0) +	if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0)  		return (EFAULT);  	/* return to userland unless this is an in-kernel emulated device */ @@ -1231,8 +1259,8 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)  		return (0);  	} -	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite, -	    retu); +	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging, +	    mread, mwrite, retu);  	return (error);  } @@ -1456,6 +1484,202 @@ restart:  }  int +vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) +{ +	struct vcpu *vcpu; +	int type, vector; + +	if (vcpuid < 0 || vcpuid >= VM_MAXCPU) +		return (EINVAL); + +	vcpu = &vm->vcpu[vcpuid]; + +	if (info & VM_INTINFO_VALID) { +		type = info & VM_INTINFO_TYPE; +		vector = info & 0xff; +		if (type == VM_INTINFO_NMI && vector != IDT_NMI) +			return (EINVAL); +		if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) +			return (EINVAL); +		if (info & VM_INTINFO_RSVD) +			return (EINVAL); +	} else { +		info = 0; +	} +	VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info); +	vcpu->exitintinfo = info; +	return (0); +} + +enum exc_class { +	EXC_BENIGN, +	EXC_CONTRIBUTORY, +	EXC_PAGEFAULT +}; + +#define	IDT_VE	20	/* Virtualization Exception (Intel specific) */ + +static enum exc_class +exception_class(uint64_t info) +{ +	int type, vector; + +	KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info)); +	type = info & VM_INTINFO_TYPE; +	vector = info & 0xff; + +	/* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ +	switch (type) { +	case VM_INTINFO_HWINTR: +	case VM_INTINFO_SWINTR: +	case VM_INTINFO_NMI: +		return (EXC_BENIGN); +	default: +		/* +		 * Hardware exception. +		 * +		 * SVM and VT-x use identical type values to represent NMI, +		 * hardware interrupt and software interrupt. +		 * +		 * SVM uses type '3' for all exceptions. VT-x uses type '3' +		 * for exceptions except #BP and #OF. #BP and #OF use a type +		 * value of '5' or '6'. Therefore we don't check for explicit +		 * values of 'type' to classify 'intinfo' into a hardware +		 * exception. 
+		 */ +		break; +	} + +	switch (vector) { +	case IDT_PF: +	case IDT_VE: +		return (EXC_PAGEFAULT); +	case IDT_DE: +	case IDT_TS: +	case IDT_NP: +	case IDT_SS: +	case IDT_GP: +		return (EXC_CONTRIBUTORY); +	default: +		return (EXC_BENIGN); +	} +} + +static int +nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2, +    uint64_t *retinfo) +{ +	enum exc_class exc1, exc2; +	int type1, vector1; + +	KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1)); +	KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2)); + +	/* +	 * If an exception occurs while attempting to call the double-fault +	 * handler the processor enters shutdown mode (aka triple fault). +	 */ +	type1 = info1 & VM_INTINFO_TYPE; +	vector1 = info1 & 0xff; +	if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { +		VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)", +		    info1, info2); +		vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); +		*retinfo = 0; +		return (0); +	} + +	/* +	 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 +	 */ +	exc1 = exception_class(info1); +	exc2 = exception_class(info2); +	if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || +	    (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { +		/* Convert nested fault into a double fault. */ +		*retinfo = IDT_DF; +		*retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; +		*retinfo |= VM_INTINFO_DEL_ERRCODE; +	} else { +		/* Handle exceptions serially */ +		*retinfo = info2; +	} +	return (1); +} + +static uint64_t +vcpu_exception_intinfo(struct vcpu *vcpu) +{ +	uint64_t info = 0; + +	if (vcpu->exception_pending) { +		info = vcpu->exception.vector & 0xff; +		info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; +		if (vcpu->exception.error_code_valid) { +			info |= VM_INTINFO_DEL_ERRCODE; +			info |= (uint64_t)vcpu->exception.error_code << 32; +		} +	} +	return (info); +} + +int +vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) +{ +	struct vcpu *vcpu; +	uint64_t info1, info2; +	int valid; + +	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid)); + +	vcpu = &vm->vcpu[vcpuid]; + +	info1 = vcpu->exitintinfo; +	vcpu->exitintinfo = 0; + +	info2 = 0; +	if (vcpu->exception_pending) { +		info2 = vcpu_exception_intinfo(vcpu); +		vcpu->exception_pending = 0; +		VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx", +		    vcpu->exception.vector, info2); +	} + +	if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { +		valid = nested_fault(vm, vcpuid, info1, info2, retinfo); +	} else if (info1 & VM_INTINFO_VALID) { +		*retinfo = info1; +		valid = 1; +	} else if (info2 & VM_INTINFO_VALID) { +		*retinfo = info2; +		valid = 1; +	} else { +		valid = 0; +	} + +	if (valid) { +		VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), " +		    "retinfo(%#lx)", __func__, info1, info2, *retinfo); +	} + +	return (valid); +} + +int +vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) +{ +	struct vcpu *vcpu; + +	if (vcpuid < 0 || vcpuid >= VM_MAXCPU) +		return (EINVAL); + +	vcpu = &vm->vcpu[vcpuid]; +	*info1 = vcpu->exitintinfo; +	*info2 = vcpu_exception_intinfo(vcpu); +	return (0); +} + +int  vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)  {  	struct vcpu *vcpu; @@ -1466,6 +1690,14 @@ vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)  	if (exception->vector < 0 || exception->vector >= 32)  		return (EINVAL); +	/* +	 * A double fault exception should never be injected directly into +	
 * the guest. It is a derived exception that results from specific +	 * combinations of nested faults. +	 */ +	if (exception->vector == IDT_DF) +		return (EINVAL); +  	vcpu = &vm->vcpu[vcpuid];  	if (vcpu->exception_pending) { @@ -1481,32 +1713,21 @@ vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)  	return (0);  } -int -vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *exception) -{ -	struct vcpu *vcpu; -	int pending; - -	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid)); - -	vcpu = &vm->vcpu[vcpuid]; -	pending = vcpu->exception_pending; -	if (pending) { -		vcpu->exception_pending = 0; -		*exception = vcpu->exception; -		VCPU_CTR1(vm, vcpuid, "Exception %d delivered", -		    exception->vector); -	} -	return (pending); -} - -static void -vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception) +void +vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid, +    int errcode)  { +	struct vm_exception exception;  	struct vm_exit *vmexit; +	struct vm *vm;  	int error; -	error = vm_inject_exception(vm, vcpuid, exception); +	vm = vmarg; + +	exception.vector = vector; +	exception.error_code = errcode; +	exception.error_code_valid = errcode_valid; +	error = vm_inject_exception(vm, vcpuid, &exception);  	KASSERT(error == 0, ("vm_inject_exception error %d", error));  	/* @@ -1521,45 +1742,19 @@ vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception)  }  void -vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2) +vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)  { -	struct vm_exception pf = { -		.vector = IDT_PF, -		.error_code_valid = 1, -		.error_code = error_code -	}; +	struct vm *vm;  	int error; +	vm = vmarg;  	VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",  	    error_code, cr2);  	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);  	KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); -	vm_inject_fault(vm, vcpuid, &pf); -} - -void -vm_inject_gp(struct vm *vm, int vcpuid) -{ -	struct vm_exception gpf = { -		.vector = IDT_GP, -		.error_code_valid = 1, -		.error_code = 0 -	}; - -	vm_inject_fault(vm, vcpuid, &gpf); -} - -void -vm_inject_ud(struct vm *vm, int vcpuid) -{ -	struct vm_exception udf = { -		.vector = IDT_UD, -		.error_code_valid = 0 -	}; - -	vm_inject_fault(vm, vcpuid, &udf); +	vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);  }  static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); @@ -1993,6 +2188,97 @@ vm_segment_name(int seg)  	return (seg_names[seg]);  } +void +vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, +    int num_copyinfo) +{ +	int idx; + +	for (idx = 0; idx < num_copyinfo; idx++) { +		if (copyinfo[idx].cookie != NULL) +			vm_gpa_release(copyinfo[idx].cookie); +	} +	bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo)); +} + +int +vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, +    uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, +    int num_copyinfo) +{ +	int error, idx, nused; +	size_t n, off, remaining; +	void *hva, *cookie; +	uint64_t gpa; + +	bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo); + +	nused = 0; +	remaining = len; +	while (remaining > 0) { +		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); +		error = vmm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa); +		if (error) +			return (error); +		off = gpa & PAGE_MASK; +		n = min(remaining, PAGE_SIZE - 
off); +		copyinfo[nused].gpa = gpa; +		copyinfo[nused].len = n; +		remaining -= n; +		gla += n; +		nused++; +	} + +	for (idx = 0; idx < nused; idx++) { +		hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len, +		    prot, &cookie); +		if (hva == NULL) +			break; +		copyinfo[idx].hva = hva; +		copyinfo[idx].cookie = cookie; +	} + +	if (idx != nused) { +		vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); +		return (-1); +	} else { +		return (0); +	} +} + +void +vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, +    size_t len) +{ +	char *dst; +	int idx; +	 +	dst = kaddr; +	idx = 0; +	while (len > 0) { +		bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); +		len -= copyinfo[idx].len; +		dst += copyinfo[idx].len; +		idx++; +	} +} + +void +vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, +    struct vm_copyinfo *copyinfo, size_t len) +{ +	const char *src; +	int idx; + +	src = kaddr; +	idx = 0; +	while (len > 0) { +		bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); +		len -= copyinfo[idx].len; +		src += copyinfo[idx].len; +		idx++; +	} +}  /*   * Return the amount of in-use and wired memory for the VM. Since diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index f3e31a33df4a..a85109edaa1d 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -173,6 +173,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,  	struct vm_gla2gpa *gg;  	struct vm_activate_cpu *vac;  	struct vm_cpuset *vm_cpuset; +	struct vm_intinfo *vmii;  	sc = vmmdev_lookup2(cdev);  	if (sc == NULL) @@ -199,6 +200,8 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,  	case VM_SET_X2APIC_STATE:  	case VM_GLA2GPA:  	case VM_ACTIVATE_CPU: +	case VM_SET_INTINFO: +	case VM_GET_INTINFO:  		/*  		 * XXX fragile, handle with care  		 * Assumes that the first field of the ioctl data is the vcpu. 
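The next hunk wires VM_SET_INTINFO and VM_GET_INTINFO into vmmdev_ioctl(), forwarding to vm_exit_intinfo() and vm_get_intinfo(). For reference, the 64-bit event words these ioctls carry use the VM_INTINFO_* layout added to vmm.h earlier in the patch: vector in bits 7:0, type in bits 10:8, error-code-valid in bit 11, valid in bit 31, and the error code in the upper 32 bits. A small decoding sketch follows (illustrative only; decode_intinfo() is not part of the change):

```c
#include <sys/types.h>

#include <machine/vmm.h>	/* VM_INTINFO_* definitions added by this patch */

#include <stdio.h>

/* Hypothetical helper: pretty-print one VM_INTINFO_* encoded event word. */
static void
decode_intinfo(uint64_t info)
{
	int vector, type;

	if ((info & VM_INTINFO_VALID) == 0) {
		printf("no event pending\n");
		return;
	}

	vector = VM_INTINFO_VECTOR(info);	/* bits 7:0 */
	type = info & VM_INTINFO_TYPE;		/* HWINTR, NMI, HWEXCEPTION, SWINTR */

	if (info & VM_INTINFO_DEL_ERRCODE)
		printf("vector %d, type %#x, error code %#x\n",
		    vector, type, (uint32_t)(info >> 32));
	else
		printf("vector %d, type %#x\n", vector, type);
}
```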
@@ -470,6 +473,15 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,  			error = copyout(cpuset, vm_cpuset->cpus, size);  		free(cpuset, M_TEMP);  		break; +	case VM_SET_INTINFO: +		vmii = (struct vm_intinfo *)data; +		error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1); +		break; +	case VM_GET_INTINFO: +		vmii = (struct vm_intinfo *)data; +		error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1, +		    &vmii->info2); +		break;  	default:  		error = ENOTTY;  		break; diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index 921deb5ab29d..a65b1251e52b 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");  #else	/* !_KERNEL */  #include <sys/types.h>  #include <sys/errno.h> +#include <sys/_iovec.h>  #include <machine/vmm.h> @@ -65,18 +66,26 @@ enum {  	VIE_OP_TYPE_AND,  	VIE_OP_TYPE_OR,  	VIE_OP_TYPE_TWO_BYTE, +	VIE_OP_TYPE_PUSH, +	VIE_OP_TYPE_CMP,  	VIE_OP_TYPE_LAST  };  /* struct vie_op.op_flags */ -#define	VIE_OP_F_IMM		(1 << 0)	/* immediate operand present */ -#define	VIE_OP_F_IMM8		(1 << 1)	/* 8-bit immediate operand */ +#define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */ +#define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */ +#define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */ +#define	VIE_OP_F_NO_MODRM	(1 << 3)  static const struct vie_op two_byte_opcodes[256] = {  	[0xB6] = {  		.op_byte = 0xB6,  		.op_type = VIE_OP_TYPE_MOVZX,  	}, +	[0xB7] = { +		.op_byte = 0xB7, +		.op_type = VIE_OP_TYPE_MOVZX, +	},  	[0xBE] = {  		.op_byte = 0xBE,  		.op_type = VIE_OP_TYPE_MOVSX, @@ -88,6 +97,10 @@ static const struct vie_op one_byte_opcodes[256] = {  		.op_byte = 0x0F,  		.op_type = VIE_OP_TYPE_TWO_BYTE  	}, +	[0x3B] = { +		.op_byte = 0x3B, +		.op_type = VIE_OP_TYPE_CMP, +	},  	[0x88] = {  		.op_byte = 0x88,  		.op_type = VIE_OP_TYPE_MOV, @@ -104,6 +117,22 @@ static const struct vie_op one_byte_opcodes[256] = {  		.op_byte = 0x8B,  		.op_type = VIE_OP_TYPE_MOV,  	}, +	[0xA1] = { +		.op_byte = 0xA1, +		.op_type = VIE_OP_TYPE_MOV, +		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, +	}, +	[0xA3] = { +		.op_byte = 0xA3, +		.op_type = VIE_OP_TYPE_MOV, +		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, +	}, +	[0xC6] = { +		/* XXX Group 11 extended opcode - not just MOV */ +		.op_byte = 0xC6, +		.op_type = VIE_OP_TYPE_MOV, +		.op_flags = VIE_OP_F_IMM8, +	},  	[0xC7] = {  		.op_byte = 0xC7,  		.op_type = VIE_OP_TYPE_MOV, @@ -125,6 +154,11 @@ static const struct vie_op one_byte_opcodes[256] = {  		.op_type = VIE_OP_TYPE_OR,  		.op_flags = VIE_OP_F_IMM8,  	}, +	[0xFF] = { +		/* XXX Group 5 extended opcode - not just PUSH */ +		.op_byte = 0xFF, +		.op_type = VIE_OP_TYPE_PUSH, +	}  };  /* struct vie.mod */ @@ -175,18 +209,15 @@ vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)  	return (error);  } -static int -vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval) +static void +vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)  { -	uint64_t val; -	int error, rshift; -	enum vm_reg_name reg; - -	rshift = 0; -	reg = gpr_map[vie->reg]; +	*lhbr = 0; +	*reg = gpr_map[vie->reg];  	/* -	 * 64-bit mode imposes limitations on accessing legacy byte registers. +	 * 64-bit mode imposes limitations on accessing legacy high byte +	 * registers (lhbr).  	 *  	 * The legacy high-byte registers cannot be addressed if the REX  	 * prefix is present. 
In this case the values 4, 5, 6 and 7 of the @@ -198,17 +229,56 @@ vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)  	 */  	if (!vie->rex_present) {  		if (vie->reg & 0x4) { -			/* -			 * Obtain the value of %ah by reading %rax and shifting -			 * right by 8 bits (same for %bh, %ch and %dh). -			 */ -			rshift = 8; -			reg = gpr_map[vie->reg & 0x3]; +			*lhbr = 1; +			*reg = gpr_map[vie->reg & 0x3];  		}  	} +} + +static int +vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval) +{ +	uint64_t val; +	int error, lhbr; +	enum vm_reg_name reg; +	vie_calc_bytereg(vie, ®, &lhbr);  	error = vm_get_register(vm, vcpuid, reg, &val); -	*rval = val >> rshift; + +	/* +	 * To obtain the value of a legacy high byte register shift the +	 * base register right by 8 bits (%ah = %rax >> 8). +	 */ +	if (lhbr) +		*rval = val >> 8; +	else +		*rval = val; +	return (error); +} + +static int +vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte) +{ +	uint64_t origval, val, mask; +	int error, lhbr; +	enum vm_reg_name reg; + +	vie_calc_bytereg(vie, ®, &lhbr); +	error = vm_get_register(vm, vcpuid, reg, &origval); +	if (error == 0) { +		val = byte; +		mask = 0xff; +		if (lhbr) { +			/* +			 * Shift left by 8 to store 'byte' in a legacy high +			 * byte register. +			 */ +			val <<= 8; +			mask <<= 8; +		} +		val |= origval & ~mask; +		error = vm_set_register(vm, vcpuid, reg, val); +	}  	return (error);  } @@ -242,16 +312,52 @@ vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,  }  /* - * The following simplifying assumptions are made during emulation: - * - * - guest is in 64-bit mode - *   - default address size is 64-bits - *   - default operand size is 32-bits - * - * - operand size override is not supported - * - * - address size override is not supported + * Return the status flags that would result from doing (x - y).   
*/ +static u_long +getcc16(uint16_t x, uint16_t y) +{ +	u_long rflags; + +	__asm __volatile("sub %1,%2; pushfq; popq %0" : +	    "=r" (rflags) : "m" (y), "r" (x)); +	return (rflags); +} + +static u_long +getcc32(uint32_t x, uint32_t y) +{ +	u_long rflags; + +	__asm __volatile("sub %1,%2; pushfq; popq %0" : +	    "=r" (rflags) : "m" (y), "r" (x)); +	return (rflags); +} + +static u_long +getcc64(uint64_t x, uint64_t y) +{ +	u_long rflags; + +	__asm __volatile("sub %1,%2; pushfq; popq %0" : +	    "=r" (rflags) : "m" (y), "r" (x)); +	return (rflags); +} + +static u_long +getcc(int opsize, uint64_t x, uint64_t y) +{ +	KASSERT(opsize == 2 || opsize == 4 || opsize == 8, +	    ("getcc: invalid operand size %d", opsize)); + +	if (opsize == 2) +		return (getcc16(x, y)); +	else if (opsize == 4) +		return (getcc32(x, y)); +	else +		return (getcc64(x, y)); +} +  static int  emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg) @@ -261,7 +367,7 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  	uint8_t byte;  	uint64_t val; -	size = 4; +	size = vie->opsize;  	error = EINVAL;  	switch (vie->op.op_byte) { @@ -271,7 +377,7 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  		 * 88/r:	mov r/m8, r8  		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)  		 */ -		size = 1; +		size = 1;	/* override for byte operation */  		error = vie_read_bytereg(vm, vcpuid, vie, &byte);  		if (error == 0)  			error = memwrite(vm, vcpuid, gpa, byte, size, arg); @@ -279,11 +385,10 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  	case 0x89:  		/*  		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m) +		 * 89/r:	mov r/m16, r16  		 * 89/r:	mov r/m32, r32  		 * REX.W + 89/r	mov r/m64, r64  		 */ -		if (vie->rex_w) -			size = 8;  		reg = gpr_map[vie->reg];  		error = vie_read_register(vm, vcpuid, reg, &val);  		if (error == 0) { @@ -292,38 +397,72 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  		}  		break;  	case 0x8A: +		/* +		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg) +		 * 8A/r:	mov r8, r/m8 +		 * REX + 8A/r:	mov r8, r/m8 +		 */ +		size = 1;	/* override for byte operation */ +		error = memread(vm, vcpuid, gpa, &val, size, arg); +		if (error == 0) +			error = vie_write_bytereg(vm, vcpuid, vie, val); +		break;  	case 0x8B:  		/*  		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg) -		 * 8A/r:	mov r/m8, r8 -		 * REX + 8A/r:	mov r/m8, r8 +		 * 8B/r:	mov r16, r/m16  		 * 8B/r:	mov r32, r/m32  		 * REX.W 8B/r:	mov r64, r/m64  		 */ -		if (vie->op.op_byte == 0x8A) -			size = 1; -		else if (vie->rex_w) -			size = 8;  		error = memread(vm, vcpuid, gpa, &val, size, arg);  		if (error == 0) {  			reg = gpr_map[vie->reg];  			error = vie_update_register(vm, vcpuid, reg, val, size);  		}  		break; +	case 0xA1: +		/* +		 * MOV from seg:moffset to AX/EAX/RAX +		 * A1:		mov AX, moffs16 +		 * A1:		mov EAX, moffs32 +		 * REX.W + A1:	mov RAX, moffs64 +		 */ +		error = memread(vm, vcpuid, gpa, &val, size, arg); +		if (error == 0) { +			reg = VM_REG_GUEST_RAX; +			error = vie_update_register(vm, vcpuid, reg, val, size); +		} +		break; +	case 0xA3: +		/* +		 * MOV from AX/EAX/RAX to seg:moffset +		 * A3:		mov moffs16, AX +		 * A3:		mov moffs32, EAX  +		 * REX.W + A3:	mov moffs64, RAX +		 */ +		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); +		if (error == 0) { +			val &= size2mask[size]; +			error = memwrite(vm, vcpuid, gpa, val, size, arg); +		} +		break; +	
case 0xC6: +		/* +		 * MOV from imm8 to mem (ModRM:r/m) +		 * C6/0		mov r/m8, imm8 +		 * REX + C6/0	mov r/m8, imm8 +		 */ +		size = 1;	/* override for byte operation */ +		error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg); +		break;  	case 0xC7:  		/* -		 * MOV from imm32 to mem (ModRM:r/m) +		 * MOV from imm16/imm32 to mem (ModRM:r/m) +		 * C7/0		mov r/m16, imm16  		 * C7/0		mov r/m32, imm32  		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)  		 */ -		val = vie->immediate;		/* already sign-extended */ - -		if (vie->rex_w) -			size = 8; - -		if (size != 8) -			val &= size2mask[size]; - +		val = vie->immediate & size2mask[size];  		error = memwrite(vm, vcpuid, gpa, val, size, arg);  		break;  	default: @@ -333,17 +472,6 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  	return (error);  } -/* - * The following simplifying assumptions are made during emulation: - * - * - guest is in 64-bit mode - *   - default address size is 64-bits - *   - default operand size is 32-bits - * - * - operand size override is not supported - * - * - address size override is not supported - */  static int  emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  	     mem_region_read_t memread, mem_region_write_t memwrite, @@ -353,7 +481,7 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  	enum vm_reg_name reg;  	uint64_t val; -	size = 4; +	size = vie->opsize;  	error = EINVAL;  	switch (vie->op.op_byte) { @@ -362,8 +490,9 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  		 * MOV and zero extend byte from mem (ModRM:r/m) to  		 * reg (ModRM:reg).  		 * -		 * 0F B6/r		movzx r/m8, r32 -		 * REX.W + 0F B6/r	movzx r/m8, r64 +		 * 0F B6/r		movzx r16, r/m8 +		 * 0F B6/r		movzx r32, r/m8 +		 * REX.W + 0F B6/r	movzx r64, r/m8  		 */  		/* get the first operand */ @@ -374,19 +503,39 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  		/* get the second operand */  		reg = gpr_map[vie->reg]; -		if (vie->rex_w) -			size = 8; +		/* zero-extend byte */ +		val = (uint8_t)val;  		/* write the result */  		error = vie_update_register(vm, vcpuid, reg, val, size);  		break; +	case 0xB7: +		/* +		 * MOV and zero extend word from mem (ModRM:r/m) to +		 * reg (ModRM:reg). +		 * +		 * 0F B7/r		movzx r32, r/m16 +		 * REX.W + 0F B7/r	movzx r64, r/m16 +		 */ +		error = memread(vm, vcpuid, gpa, &val, 2, arg); +		if (error) +			return (error); + +		reg = gpr_map[vie->reg]; + +		/* zero-extend word */ +		val = (uint16_t)val; + +		error = vie_update_register(vm, vcpuid, reg, val, size); +		break;  	case 0xBE:  		/*  		 * MOV and sign extend byte from mem (ModRM:r/m) to  		 * reg (ModRM:reg).  		 * -		 * 0F BE/r		movsx r/m8, r32 -		 * REX.W + 0F BE/r	movsx r/m8, r64 +		 * 0F BE/r		movsx r16, r/m8 +		 * 0F BE/r		movsx r32, r/m8 +		 * REX.W + 0F BE/r	movsx r64, r/m8  		 */  		/* get the first operand */ @@ -397,9 +546,6 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  		/* get the second operand */  		reg = gpr_map[vie->reg]; -		if (vie->rex_w) -			size = 8; -  		/* sign extend byte */  		val = (int8_t)val; @@ -420,7 +566,7 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  	enum vm_reg_name reg;  	uint64_t val1, val2; -	size = 4; +	size = vie->opsize;  	error = EINVAL;  	switch (vie->op.op_byte) { @@ -429,11 +575,10 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the  		 * result in reg.  		 
* +		 * 23/r		and r16, r/m16  		 * 23/r		and r32, r/m32  		 * REX.W + 23/r	and r64, r/m64  		 */ -		if (vie->rex_w) -			size = 8;  		/* get the first operand */  		reg = gpr_map[vie->reg]; @@ -455,8 +600,9 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  		 * AND mem (ModRM:r/m) with immediate and store the  		 * result in mem.  		 * -		 * 81/          and r/m32, imm32 -		 * REX.W + 81/  and r/m64, imm32 sign-extended to 64 +		 * 81 /4		and r/m16, imm16 +		 * 81 /4		and r/m32, imm32 +		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64  		 *  		 * Currently, only the AND operation of the 0x81 opcode  		 * is implemented (ModRM:reg = b100). @@ -464,9 +610,6 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  		if ((vie->reg & 7) != 4)  			break; -		if (vie->rex_w) -			size = 8; -		  		/* get the first operand */                  error = memread(vm, vcpuid, gpa, &val1, size, arg);                  if (error) @@ -492,7 +635,7 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  	int error, size;  	uint64_t val1; -	size = 4; +	size = vie->opsize;  	error = EINVAL;  	switch (vie->op.op_byte) { @@ -501,8 +644,9 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  		 * OR mem (ModRM:r/m) with immediate and store the  		 * result in mem.  		 * -		 * 83/          OR r/m32, imm8 sign-extended to 32 -		 * REX.W + 83/  OR r/m64, imm8 sign-extended to 64 +		 * 83 /1		OR r/m16, imm8 sign-extended to 16 +		 * 83 /1		OR r/m32, imm8 sign-extended to 32 +		 * REX.W + 83/1		OR r/m64, imm8 sign-extended to 64  		 *  		 * Currently, only the OR operation of the 0x83 opcode  		 * is implemented (ModRM:reg = b001). @@ -510,9 +654,6 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  		if ((vie->reg & 7) != 1)  			break; -		if (vie->rex_w) -			size = 8; -		  		/* get the first operand */                  error = memread(vm, vcpuid, gpa, &val1, size, arg);                  if (error) @@ -531,10 +672,167 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  	return (error);  } +#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V) + +static int +emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, +	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ +	int error, size; +	uint64_t op1, op2, rflags, rflags2; +	enum vm_reg_name reg; + +	size = vie->opsize; +	switch (vie->op.op_byte) { +	case 0x3B: +		/* +		 * 3B/r		CMP r16, r/m16 +		 * 3B/r		CMP r32, r/m32 +		 * REX.W + 3B/r	CMP r64, r/m64 +		 * +		 * Compare first operand (reg) with second operand (r/m) and +		 * set status flags in EFLAGS register. The comparison is +		 * performed by subtracting the second operand from the first +		 * operand and then setting the status flags. 
+		 */ + +		/* Get the first operand */ +		reg = gpr_map[vie->reg]; +		error = vie_read_register(vm, vcpuid, reg, &op1); +		if (error) +			return (error); + +		/* Get the second operand */ +		error = memread(vm, vcpuid, gpa, &op2, size, arg); +		if (error) +			return (error); + +		break; +	default: +		return (EINVAL); +	} +	rflags2 = getcc(size, op1, op2); +	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); +	if (error) +		return (error); +	rflags &= ~RFLAGS_STATUS_BITS; +	rflags |= rflags2 & RFLAGS_STATUS_BITS; + +	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); +	return (error); +} + +static int +emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, +    struct vm_guest_paging *paging, mem_region_read_t memread, +    mem_region_write_t memwrite, void *arg) +{ +#ifdef _KERNEL +	struct vm_copyinfo copyinfo[2]; +#else +	struct iovec copyinfo[2]; +#endif +	struct seg_desc ss_desc; +	uint64_t cr0, rflags, rsp, stack_gla, val; +	int error, size, stackaddrsize; + +	/* +	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. +	 * +	 * PUSH is part of the group 5 extended opcodes and is identified +	 * by ModRM:reg = b110. +	 */ +	if ((vie->reg & 7) != 6) +		return (EINVAL); + +	size = vie->opsize; +	/* +	 * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1 +	 */ +	if (paging->cpu_mode == CPU_MODE_REAL) { +		stackaddrsize = 2; +	} else if (paging->cpu_mode == CPU_MODE_64BIT) { +		/* +		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3 +		 * - Stack pointer size is always 64-bits. +		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode. +		 * - 16-bit PUSH/POP is supported by using the operand size +		 *   override prefix (66H). +		 */ +		stackaddrsize = 8; +		size = vie->opsize_override ? 2 : 8; +	} else { +		/* +		 * In protected or compability mode the 'B' flag in the +		 * stack-segment descriptor determines the size of the +		 * stack pointer. +		 */ +		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc); +		KASSERT(error == 0, ("%s: error %d getting SS descriptor", +		    __func__, error)); +		if (SEG_DESC_DEF32(ss_desc.access)) +			stackaddrsize = 4; +		else +			stackaddrsize = 2; +	} + +	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); +	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); + +	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); +	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + +	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp); +	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error)); + +	rsp -= size; +	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc, +	    rsp, size, stackaddrsize, PROT_WRITE, &stack_gla)) { +		vm_inject_ss(vm, vcpuid, 0); +		return (0); +	} + +	if (vie_canonical_check(paging->cpu_mode, stack_gla)) { +		vm_inject_ss(vm, vcpuid, 0); +		return (0); +	} + +	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) { +		vm_inject_ac(vm, vcpuid, 0); +		return (0); +	} + +	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, PROT_WRITE, +	    copyinfo, nitems(copyinfo)); +	if (error == -1) { +		/* +		 * XXX cannot return a negative error value here because it +		 * ends up being the return value of the VM_RUN() ioctl and +		 * is interpreted as a pseudo-error (for e.g. ERESTART). 
+		 */ +		return (EFAULT); +	} else if (error == 1) { +		/* Resume guest execution to handle page fault */ +		return (0); +	} + +	error = memread(vm, vcpuid, mmio_gpa, &val, size, arg); +	if (error == 0) { +		vm_copyout(vm, vcpuid, &val, copyinfo, size); +		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp, +		    stackaddrsize); +		KASSERT(error == 0, ("error %d updating rsp", error)); +	} +#ifdef _KERNEL +	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); +#endif +	return (error); +} +  int  vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, -			mem_region_read_t memread, mem_region_write_t memwrite, -			void *memarg) +    struct vm_guest_paging *paging, mem_region_read_t memread, +    mem_region_write_t memwrite, void *memarg)  {  	int error; @@ -542,6 +840,14 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,  		return (EINVAL);  	switch (vie->op.op_type) { +	case VIE_OP_TYPE_PUSH: +		error = emulate_push(vm, vcpuid, gpa, vie, paging, memread, +		    memwrite, memarg); +		break; +	case VIE_OP_TYPE_CMP: +		error = emulate_cmp(vm, vcpuid, gpa, vie, +				    memread, memwrite, memarg); +		break;  	case VIE_OP_TYPE_MOV:  		error = emulate_mov(vm, vcpuid, gpa, vie,  				    memread, memwrite, memarg); @@ -636,7 +942,7 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,  		 * then the descriptor is unusable and attempting to use  		 * it results in a #GP(0).  		 */ -		if (SEG_DESC_UNUSABLE(desc)) +		if (SEG_DESC_UNUSABLE(desc->access))  			return (-1);  		/*  @@ -645,13 +951,13 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,  		 * descriptor that is not present. If this was the case then  		 * it would have been checked before the VM-exit.  		 */ -		KASSERT(SEG_DESC_PRESENT(desc), ("segment %d not present: %#x", -		    seg, desc->access)); +		KASSERT(SEG_DESC_PRESENT(desc->access), +		    ("segment %d not present: %#x", seg, desc->access));  		/*  		 * The descriptor type must indicate a code/data segment.  		 */ -		type = SEG_DESC_TYPE(desc); +		type = SEG_DESC_TYPE(desc->access);  		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "  		    "descriptor type %#x", seg, type)); @@ -680,7 +986,8 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,  		if ((type & 0xC) == 0x4) {  			/* expand-down data segment */  			low_limit = desc->limit + 1; -			high_limit = SEG_DESC_DEF32(desc) ? 0xffffffff : 0xffff; +			high_limit = SEG_DESC_DEF32(desc->access) ? 
+			    0xffffffff : 0xffff;  		} else {  			/* code segment or expand-up data segment */  			low_limit = 0; @@ -947,45 +1254,24 @@ fault:  }  int -vmm_fetch_instruction(struct vm *vm, int cpuid, struct vm_guest_paging *paging, +vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,      uint64_t rip, int inst_length, struct vie *vie)  { -	int n, error, prot; -	uint64_t gpa, off; -	void *hpa, *cookie; - -	/* -	 * XXX cache previously fetched instructions using 'rip' as the tag -	 */ +	struct vm_copyinfo copyinfo[2]; +	int error, prot; -	prot = VM_PROT_READ | VM_PROT_EXECUTE;  	if (inst_length > VIE_INST_SIZE)  		panic("vmm_fetch_instruction: invalid length %d", inst_length); -	/* Copy the instruction into 'vie' */ -	while (vie->num_valid < inst_length) { -		error = vmm_gla2gpa(vm, cpuid, paging, rip, prot, &gpa); -		if (error) -			return (error); - -		off = gpa & PAGE_MASK; -		n = min(inst_length - vie->num_valid, PAGE_SIZE - off); - -		if ((hpa = vm_gpa_hold(vm, gpa, n, prot, &cookie)) == NULL) -			break; - -		bcopy(hpa, &vie->inst[vie->num_valid], n); - -		vm_gpa_release(cookie); - -		rip += n; -		vie->num_valid += n; +	prot = PROT_READ | PROT_EXEC; +	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot, +	    copyinfo, nitems(copyinfo)); +	if (error == 0) { +		vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length); +		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); +		vie->num_valid = inst_length;  	} - -	if (vie->num_valid == inst_length) -		return (0); -	else -		return (-1); +	return (error);  }  static int @@ -1007,24 +1293,65 @@ vie_advance(struct vie *vie)  }  static int -decode_rex(struct vie *vie) +decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)  {  	uint8_t x; -	if (vie_peek(vie, &x)) -		return (-1); +	while (1) { +		if (vie_peek(vie, &x)) +			return (-1); -	if (x >= 0x40 && x <= 0x4F) { -		vie->rex_present = 1; +		if (x == 0x66) +			vie->opsize_override = 1; +		else if (x == 0x67) +			vie->addrsize_override = 1; +		else +			break; + +		vie_advance(vie); +	} +	/* +	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2: +	 * - Only one REX prefix is allowed per instruction. +	 * - The REX prefix must immediately precede the opcode byte or the +	 *   escape opcode byte. +	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3) +	 *   the mandatory prefix must come before the REX prefix. +	 */ +	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) { +		vie->rex_present = 1;  		vie->rex_w = x & 0x8 ? 1 : 0;  		vie->rex_r = x & 0x4 ? 1 : 0;  		vie->rex_x = x & 0x2 ? 1 : 0;  		vie->rex_b = x & 0x1 ? 1 : 0; -  		vie_advance(vie);  	} +	/* +	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1 +	 */ +	if (cpu_mode == CPU_MODE_64BIT) { +		/* +		 * Default address size is 64-bits and default operand size +		 * is 32-bits. +		 */ +		vie->addrsize = vie->addrsize_override ? 4 : 8; +		if (vie->rex_w) +			vie->opsize = 8; +		else if (vie->opsize_override) +			vie->opsize = 2; +		else +			vie->opsize = 4; +	} else if (cs_d) { +		/* Default address and operand sizes are 32-bits */ +		vie->addrsize = vie->addrsize_override ? 2 : 4; +		vie->opsize = vie->opsize_override ? 2 : 4; +	} else { +		/* Default address and operand sizes are 16-bits */ +		vie->addrsize = vie->addrsize_override ? 4 : 2; +		vie->opsize = vie->opsize_override ? 
4 : 2; +	}  	return (0);  } @@ -1071,6 +1398,12 @@ decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)  {  	uint8_t x; +	if (cpu_mode == CPU_MODE_REAL) +		return (-1); + +	if (vie->op.op_flags & VIE_OP_F_NO_MODRM) +		return (0); +  	if (vie_peek(vie, &x))  		return (-1); @@ -1249,20 +1582,32 @@ decode_immediate(struct vie *vie)  	union {  		char	buf[4];  		int8_t	signed8; +		int16_t	signed16;  		int32_t	signed32;  	} u;  	/* Figure out immediate operand size (if any) */ -	if (vie->op.op_flags & VIE_OP_F_IMM) -		vie->imm_bytes = 4; -	else if (vie->op.op_flags & VIE_OP_F_IMM8) +	if (vie->op.op_flags & VIE_OP_F_IMM) { +		/* +		 * Section 2.2.1.5 "Immediates", Intel SDM: +		 * In 64-bit mode the typical size of immediate operands +		 * remains 32-bits. When the operand size if 64-bits, the +		 * processor sign-extends all immediates to 64-bits prior +		 * to their use. +		 */ +		if (vie->opsize == 4 || vie->opsize == 8) +			vie->imm_bytes = 4; +		else +			vie->imm_bytes = 2; +	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {  		vie->imm_bytes = 1; +	}  	if ((n = vie->imm_bytes) == 0)  		return (0); -	if (n != 1 && n != 4) -		panic("decode_immediate: invalid imm_bytes %d", n); +	KASSERT(n == 1 || n == 2 || n == 4, +	    ("%s: invalid number of immediate bytes: %d", __func__, n));  	for (i = 0; i < n; i++) {  		if (vie_peek(vie, &x)) @@ -1271,12 +1616,47 @@ decode_immediate(struct vie *vie)  		u.buf[i] = x;  		vie_advance(vie);  	} -	 + +	/* sign-extend the immediate value before use */  	if (n == 1) -		vie->immediate = u.signed8;		/* sign-extended */ +		vie->immediate = u.signed8; +	else if (n == 2) +		vie->immediate = u.signed16;  	else -		vie->immediate = u.signed32;		/* sign-extended */ +		vie->immediate = u.signed32; + +	return (0); +} + +static int +decode_moffset(struct vie *vie) +{ +	int i, n; +	uint8_t x; +	union { +		char	buf[8]; +		uint64_t u64; +	} u; + +	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0) +		return (0); + +	/* +	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM: +	 * The memory offset size follows the address-size of the instruction. 
+	 */ +	n = vie->addrsize; +	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n)); + +	u.u64 = 0; +	for (i = 0; i < n; i++) { +		if (vie_peek(vie, &x)) +			return (-1); +		u.buf[i] = x; +		vie_advance(vie); +	} +	vie->displacement = u.u64;  	return (0);  } @@ -1301,7 +1681,7 @@ static int  verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)  {  	int error; -	uint64_t base, idx; +	uint64_t base, idx, gla2;  	/* Skip 'gla' verification */  	if (gla == VIE_INVALID_GLA) @@ -1334,11 +1714,14 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)  		}  	} -	if (base + vie->scale * idx + vie->displacement != gla) { +	/* XXX assuming that the base address of the segment is 0 */ +	gla2 = base + vie->scale * idx + vie->displacement; +	gla2 &= size2mask[vie->addrsize]; +	if (gla != gla2) {  		printf("verify_gla mismatch: "  		       "base(0x%0lx), scale(%d), index(0x%0lx), " -		       "disp(0x%0lx), gla(0x%0lx)\n", -		       base, vie->scale, idx, vie->displacement, gla); +		       "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", +		       base, vie->scale, idx, vie->displacement, gla, gla2);  		return (-1);  	} @@ -1347,13 +1730,11 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)  int  vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, -		       enum vm_cpu_mode cpu_mode, struct vie *vie) +		       enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)  { -	if (cpu_mode == CPU_MODE_64BIT) { -		if (decode_rex(vie)) -			return (-1); -	} +	if (decode_prefixes(vie, cpu_mode, cs_d)) +		return (-1);  	if (decode_opcode(vie))  		return (-1); @@ -1366,10 +1747,13 @@ vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,  	if (decode_displacement(vie))  		return (-1); -	 +  	if (decode_immediate(vie))  		return (-1); +	if (decode_moffset(vie)) +		return (-1); +  	if (verify_inst_length(vie))  		return (-1); diff --git a/sys/x86/include/specialreg.h b/sys/x86/include/specialreg.h index c0cef2ca8717..86106845b72a 100644 --- a/sys/x86/include/specialreg.h +++ b/sys/x86/include/specialreg.h @@ -436,6 +436,25 @@  #define	MSR_MC4_MISC		0x413  /* + * VMX MSRs + */ +#define	MSR_VMX_BASIC		0x480 +#define	MSR_VMX_PINBASED_CTLS	0x481 +#define	MSR_VMX_PROCBASED_CTLS	0x482 +#define	MSR_VMX_EXIT_CTLS	0x483 +#define	MSR_VMX_ENTRY_CTLS	0x484 +#define	MSR_VMX_CR0_FIXED0	0x486 +#define	MSR_VMX_CR0_FIXED1	0x487 +#define	MSR_VMX_CR4_FIXED0	0x488 +#define	MSR_VMX_CR4_FIXED1	0x489 +#define	MSR_VMX_PROCBASED_CTLS2	0x48b +#define	MSR_VMX_EPT_VPID_CAP	0x48c +#define	MSR_VMX_TRUE_PINBASED_CTLS	0x48d +#define	MSR_VMX_TRUE_PROCBASED_CTLS	0x48e +#define	MSR_VMX_TRUE_EXIT_CTLS	0x48f +#define	MSR_VMX_TRUE_ENTRY_CTLS	0x490 + +/*   * X2APIC MSRs   */  #define	MSR_APIC_ID		0x802 diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile index 23e16cb7d0d3..1c95f77dff2b 100644 --- a/usr.sbin/bhyve/Makefile +++ b/usr.sbin/bhyve/Makefile @@ -35,6 +35,7 @@ SRCS=	\  	post.c			\  	rtc.c			\  	smbiostbl.c		\ +	task_switch.c		\  	uart_emul.c		\  	virtio.c		\  	xmsr.c			\ diff --git a/usr.sbin/bhyve/acpi.c b/usr.sbin/bhyve/acpi.c index c4ec020bd5e1..5dea3001de22 100644 --- a/usr.sbin/bhyve/acpi.c +++ b/usr.sbin/bhyve/acpi.c @@ -40,12 +40,13 @@   *  Layout   *  ------   *   RSDP  ->   0xf2400    (36 bytes fixed) - *     RSDT  ->   0xf2440    (36 bytes + 4*N table addrs, 2 used) - *     XSDT  ->   0xf2480    (36 bytes + 8*N table addrs, 2 used) + *     RSDT  ->   0xf2440    (36 bytes + 4*7 table addrs, 4 used) + *     XSDT  ->   0xf2480    (36 bytes + 8*7 table 
addrs, 4 used)   *       MADT  ->   0xf2500  (depends on #CPUs)   *       FADT  ->   0xf2600  (268 bytes)   *       HPET  ->   0xf2740  (56 bytes) - *         FACS  ->   0xf2780 (64 bytes) + *       MCFG  ->   0xf2780  (60 bytes) + *         FACS  ->   0xf27C0 (64 bytes)   *         DSDT  ->   0xf2800 (variable - can go up to 0x100000)   */ @@ -80,7 +81,8 @@ __FBSDID("$FreeBSD$");  #define MADT_OFFSET		0x100  #define FADT_OFFSET		0x200  #define	HPET_OFFSET		0x340 -#define FACS_OFFSET		0x380 +#define	MCFG_OFFSET		0x380 +#define FACS_OFFSET		0x3C0  #define DSDT_OFFSET		0x400  #define	BHYVE_ASL_TEMPLATE	"bhyve.XXXXXXX" @@ -178,6 +180,8 @@ basl_fwrite_rsdt(FILE *fp)  	    basl_acpi_base + FADT_OFFSET);  	EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : %08X\n",  	    basl_acpi_base + HPET_OFFSET); +	EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : %08X\n", +	    basl_acpi_base + MCFG_OFFSET);  	EFFLUSH(fp); @@ -216,6 +220,8 @@ basl_fwrite_xsdt(FILE *fp)  	    basl_acpi_base + FADT_OFFSET);  	EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : 00000000%08X\n",  	    basl_acpi_base + HPET_OFFSET); +	EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : 00000000%08X\n", +	    basl_acpi_base + MCFG_OFFSET);  	EFFLUSH(fp); @@ -583,6 +589,39 @@ err_exit:  }  static int +basl_fwrite_mcfg(FILE *fp) +{ +	int err = 0; + +	EFPRINTF(fp, "/*\n"); +	EFPRINTF(fp, " * bhyve MCFG template\n"); +	EFPRINTF(fp, " */\n"); +	EFPRINTF(fp, "[0004]\t\tSignature : \"MCFG\"\n"); +	EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); +	EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); +	EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); +	EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); +	EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMCFG  \"\n"); +	EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + +	/* iasl will fill in the compiler ID/revision fields */ +	EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); +	EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); +	EFPRINTF(fp, "[0008]\t\tReserved : 0\n"); +	EFPRINTF(fp, "\n"); + +	EFPRINTF(fp, "[0008]\t\tBase Address : %016lX\n", pci_ecfg_base()); +	EFPRINTF(fp, "[0002]\t\tSegment Group: 0000\n"); +	EFPRINTF(fp, "[0001]\t\tStart Bus: 00\n"); +	EFPRINTF(fp, "[0001]\t\tEnd Bus: FF\n"); +	EFPRINTF(fp, "[0004]\t\tReserved : 0\n"); +	EFFLUSH(fp); +	return (0); +err_exit: +	return (errno); +} + +static int  basl_fwrite_facs(FILE *fp)  {  	int err; @@ -921,6 +960,7 @@ static struct {  	{ basl_fwrite_madt, MADT_OFFSET },  	{ basl_fwrite_fadt, FADT_OFFSET },  	{ basl_fwrite_hpet, HPET_OFFSET }, +	{ basl_fwrite_mcfg, MCFG_OFFSET },  	{ basl_fwrite_facs, FACS_OFFSET },  	{ basl_fwrite_dsdt, DSDT_OFFSET },  	{ NULL } diff --git a/usr.sbin/bhyve/atkbdc.c b/usr.sbin/bhyve/atkbdc.c index 6e13c1910334..930b7af95f3d 100644 --- a/usr.sbin/bhyve/atkbdc.c +++ b/usr.sbin/bhyve/atkbdc.c @@ -31,6 +31,10 @@ __FBSDID("$FreeBSD$");  #include <machine/vmm.h> +#include <vmmapi.h> + +#include <assert.h> +#include <errno.h>  #include <stdio.h>  #include "inout.h" @@ -48,29 +52,30 @@ atkbdc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,      uint32_t *eax, void *arg)  {  	if (bytes != 1) -		return (INOUT_ERROR); +		return (-1);  	*eax = 0; -	return (INOUT_OK); +	return (0);  }  static int  atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port,      int bytes, uint32_t *eax, void *arg)  { -	int retval; +	int error, retval;  	if (bytes != 1) -		return (INOUT_ERROR); +		return (-1); -	retval = INOUT_OK; +	retval = 0;  	if (in) {  		*eax = KBD_SYS_FLAG;	/* system passed POST */  	} 
else {  		switch (*eax) {  		case KBDC_RESET:	/* Pulse "reset" line. */ -			retval = INOUT_RESET; +			error = vm_suspend(ctx, VM_SUSPEND_RESET); +			assert(error == 0 || errno == EALREADY);  			break;  		}  	} diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8 index aad1aef70635..80b814a4882e 100644 --- a/usr.sbin/bhyve/bhyve.8 +++ b/usr.sbin/bhyve/bhyve.8 @@ -24,7 +24,7 @@  .\"  .\" $FreeBSD$  .\" -.Dd April 2, 2014 +.Dd June 26, 2014  .Dt BHYVE 8  .Os  .Sh NAME @@ -32,12 +32,14 @@  .Nd "run a guest operating system inside a virtual machine"  .Sh SYNOPSIS  .Nm -.Op Fl aehwxACHPW +.Op Fl abehwxACHPWY  .Op Fl c Ar numcpus  .Op Fl g Ar gdbport +.Op Fl l Ar lpcdev Ns Op , Ns Ar conf +.Op Fl m Ar size Ns Op Ar K|k|M|m|G|g|T|t  .Op Fl p Ar vcpu:hostcpu  .Op Fl s Ar slot,emulation Ns Op , Ns Ar conf -.Op Fl l Ar lpcdev Ns Op , Ns Ar conf +.Op Fl U Ar uuid  .Ar vmname  .Sh DESCRIPTION  .Nm @@ -66,21 +68,49 @@ Generate ACPI tables.  Required for  .Fx Ns /amd64  guests. +.It Fl b +Enable a low-level console device supported by +.Fx kernels compiled with +.Cd "device bvmconsole" . +This option will be deprecated in a future version.  .It Fl c Ar numcpus  Number of guest virtual CPUs.  The default is 1 and the maximum is 16.  .It Fl C  Include guest memory in core file. -.It Fl H -Yield the virtual CPU thread when a HLT instruction is detected. -If this option is not specified, virtual CPUs will use 100% of a host CPU. +.It Fl e +Force +.Nm +to exit when a guest issues an access to an I/O port that is not emulated. +This is intended for debug purposes.  .It Fl g Ar gdbport  For -.Fx Ns /amd64 kernels compiled with -.Cd "option bvmdebug" , +.Fx +kernels compiled with +.Cd "device bvmdebug" ,  allow a remote kernel kgdb to be relayed to the guest kernel gdb stub  via a local IPv4 address and this port.  This option will be deprecated in a future version. +.It Fl h +Print help message and exit. +.It Fl H +Yield the virtual CPU thread when a HLT instruction is detected. +If this option is not specified, virtual CPUs will use 100% of a host CPU. +.It Fl l Ar lpcdev Ns Op , Ns Ar conf +Allow devices behind the LPC PCI-ISA bridge to be configured. +The only supported devices are the TTY-class devices, +.Li com1 +and +.Li com2 . +.It Fl m Ar size Ns Op Ar K|k|M|m|G|g|T|t +Guest physical memory size in bytes. +This must be the same size that was given to +.Xr bhyveload 8 . +.Pp +The size argument may be suffixed with one of K, M, G or T (either upper +or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes, +or terabytes. +If no suffix is given, the value is assumed to be in megabytes.  .It Fl p Ar vcpu:hostcpu  Pin guest's virtual CPU  .Em vcpu @@ -88,9 +118,6 @@ to  .Em hostcpu .  .It Fl P  Force the guest virtual CPU to exit when a PAUSE instruction is detected. -.It Fl W -Force virtio PCI device emulations to use MSI interrupts instead of MSI-X -interrupts.  .It Fl s Ar slot,emulation Ns Op , Ns Ar conf  Configure a virtual PCI slot and function.  .Pp @@ -211,34 +238,21 @@ The host device must have been reserved at boot-time using the  loader variable as described in  .Xr vmm 4 .  .El -.It Fl l Ar lpcdev Ns Op , Ns Ar conf -Allow devices behind the LPC PCI-ISA bridge to be configured. -The only supported devices are the TTY-class devices, -.Li com1 -and -.Li com2 . -.It Fl m Ar size Ns Op Ar K|k|M|m|G|g|T|t -Guest physical memory size in bytes. -This must be the same size that was given to -.Xr bhyveload 8 . 
-.Pp -The size argument may be suffixed with one of K, M, G or T (either upper -or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes, -or terabytes. -If no suffix is given, the value is assumed to be in megabytes. -.It Fl e -Force -.Nm -to exit when a guest issues an access to an I/O port that is not emulated. -This is intended for debug purposes. +.It Fl U Ar uuid +Set the universally unique identifier +.Pq UUID +in the guest's System Management BIOS System Information structure. +By default a UUID is generated from the host's hostname and +.Ar vmname .  .It Fl w  Ignore accesses to unimplemented Model Specific Registers (MSRs). This is intended for debug purposes. +.It Fl W +Force virtio PCI device emulations to use MSI interrupts instead of MSI-X +interrupts.  .It Fl x  The guest's local APIC is configured in x2APIC mode.  .It Fl Y  Disable MPtable generation. -.It Fl h -Print help message and exit.  .It Ar vmname  Alphanumeric name of the guest.  This should be the same as that created by diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c index 1e5d3b33abd2..7dcf6d016b87 100644 --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -69,16 +69,11 @@ __FBSDID("$FreeBSD$");  #define GUEST_NIO_PORT		0x488	/* guest upcalls via i/o port */ -#define	VMEXIT_CONTINUE		1	/* continue from next instruction */ -#define	VMEXIT_RESTART		2	/* restart current instruction */ -#define	VMEXIT_ABORT		3	/* abort the vm run loop */ -#define	VMEXIT_RESET		4	/* guest machine has reset */ -#define	VMEXIT_POWEROFF		5	/* guest machine has powered off */ -  #define MB		(1024UL * 1024)  #define GB		(1024UL * MB)  typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); +extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu);  char *vmname; @@ -101,7 +96,7 @@ static cpuset_t cpumask;  static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); -struct vm_exit vmexit[VM_MAXCPU]; +static struct vm_exit vmexit[VM_MAXCPU];  struct bhyvestats {          uint64_t        vmexit_bogus; @@ -112,8 +107,6 @@ struct bhyvestats {          uint64_t        vmexit_inst_emul;          uint64_t        cpu_switch_rotate;          uint64_t        cpu_switch_direct; -        int             io_reset; -	int		io_poweroff;  } stats;  struct mt_vmm_info { @@ -129,26 +122,26 @@ usage(int code)  {          fprintf(stderr, -                "Usage: %s [-aehwAHIPW] [-g <gdb port>] [-s <pci>] [-c vcpus]\n" -		"       %*s [-p vcpu:hostcpu] [-m mem] [-l <lpc>] <vm>\n" +                "Usage: %s [-abehwxACHPWY] [-c vcpus] [-g <gdb port>] [-l <lpc>]\n" +		"       %*s [-m mem] [-p vcpu:hostcpu] [-s <pci>] [-U uuid] <vm>\n"  		"       -a: local apic is in xAPIC mode (deprecated)\n" -		"       -A: create an ACPI table\n" -		"       -g: gdb port\n" +		"       -A: create ACPI tables\n"  		"       -c: # cpus (default 1)\n"  		"       -C: include guest memory in core file\n" -		"       -p: pin 'vcpu' to 'hostcpu'\n" -		"       -H: vmexit from the guest on hlt\n" -		"       -P: vmexit from the guest on pause\n" -		"       -W: force virtio to use single-vector MSI\n"  		"       -e: exit on unhandled I/O access\n" +		"       -g: gdb port\n"  		"       -h: help\n" -		"       -s: <slot,driver,configinfo> PCI slot config\n" +		"       -H: vmexit from the guest on hlt\n"  		"       -l: LPC device configuration\n"  		"       -m: memory size in MB\n" +		"       -p: pin 'vcpu' to 'hostcpu'\n" +		"       -P: vmexit from the guest on pause\n" +		"       -s: 
<slot,driver,configinfo> PCI slot config\n" +		"       -U: uuid\n"  		"       -w: ignore unimplemented MSRs\n" +		"       -W: force virtio to use single-vector MSI\n"  		"       -x: local apic is in x2APIC mode\n" -		"       -Y: disable MPtable generation\n" -		"       -U: uuid\n", +		"       -Y: disable MPtable generation\n",  		progname, (int)strlen(progname), "");  	exit(code); @@ -187,6 +180,27 @@ pincpu_parse(const char *opt)  	return (0);  } +void +vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid, +    int errcode) +{ +	struct vmctx *ctx; +	int error; + +	ctx = arg; +	if (errcode_valid) +		error = vm_inject_exception2(ctx, vcpu, vector, errcode); +	else +		error = vm_inject_exception(ctx, vcpu, vector); +	assert(error == 0); + +	/* +	 * Set the instruction length to 0 to ensure that the instruction is +	 * restarted when the fault handler returns. +	 */ +	vmexit[vcpu].inst_length = 0; +} +  void *  paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)  { @@ -315,27 +329,18 @@ vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)  	}  	error = emulate_inout(ctx, vcpu, vme, strictio); -	if (error == INOUT_OK && in && !string) { +	if (!error && in && !string) {  		error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX,  		    vme->u.inout.eax); +		assert(error == 0);  	} -	switch (error) { -	case INOUT_OK: -		return (VMEXIT_CONTINUE); -	case INOUT_RESTART: -		return (VMEXIT_RESTART); -	case INOUT_RESET: -		stats.io_reset++; -		return (VMEXIT_RESET); -	case INOUT_POWEROFF: -		stats.io_poweroff++; -		return (VMEXIT_POWEROFF); -	default: -		fprintf(stderr, "Unhandled %s%c 0x%04x\n", -			in ? "in" : "out", -			bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port); +	if (error) { +		fprintf(stderr, "Unhandled %s%c 0x%04x\n", in ? "in" : "out", +		    bytes == 1 ? 'b' : (bytes == 2 ? 
'w' : 'l'), port);  		return (VMEXIT_ABORT); +	} else { +		return (VMEXIT_CONTINUE);  	}  } @@ -352,8 +357,7 @@ vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)  		fprintf(stderr, "rdmsr to register %#x on vcpu %d\n",  		    vme->u.msr.code, *pvcpu);  		if (strictmsr) { -			error = vm_inject_exception2(ctx, *pvcpu, IDT_GP, 0); -			assert(error == 0); +			vm_inject_gp(ctx, *pvcpu);  			return (VMEXIT_RESTART);  		}  	} @@ -379,8 +383,7 @@ vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)  		fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",  		    vme->u.msr.code, vme->u.msr.wval, *pvcpu);  		if (strictmsr) { -			error = vm_inject_exception2(ctx, *pvcpu, IDT_GP, 0); -			assert(error == 0); +			vm_inject_gp(ctx, *pvcpu);  			return (VMEXIT_RESTART);  		}  	} @@ -399,6 +402,16 @@ vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)  	return (retval);  } +#define	DEBUG_EPT_MISCONFIG +#ifdef DEBUG_EPT_MISCONFIG +#define	EXIT_REASON_EPT_MISCONFIG	49 +#define	VMCS_GUEST_PHYSICAL_ADDRESS	0x00002400 +#define	VMCS_IDENT(x)			((x) | 0x80000000) + +static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; +static int ept_misconfig_ptenum; +#endif +  static int  vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)  { @@ -413,7 +426,21 @@ vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)  	    vmexit->u.vmx.exit_qualification);  	fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type);  	fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); - +#ifdef DEBUG_EPT_MISCONFIG +	if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { +		vm_get_register(ctx, *pvcpu, +		    VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), +		    &ept_misconfig_gpa); +		vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, +		    &ept_misconfig_ptenum); +		fprintf(stderr, "\tEPT misconfiguration:\n"); +		fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); +		fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", +		    ept_misconfig_ptenum, ept_misconfig_pte[0], +		    ept_misconfig_pte[1], ept_misconfig_pte[2], +		    ept_misconfig_pte[3]); +	} +#endif	/* DEBUG_EPT_MISCONFIG */  	return (VMEXIT_ABORT);  } @@ -465,7 +492,7 @@ vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)  	stats.vmexit_inst_emul++;  	err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa, -			  &vmexit->u.inst_emul.vie); +	    &vmexit->u.inst_emul.vie, &vmexit->u.inst_emul.paging);  	if (err) {  		if (err == EINVAL) { @@ -515,6 +542,8 @@ vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)  		exit(1);  	case VM_SUSPEND_HALT:  		exit(2); +	case VM_SUSPEND_TRIPLEFAULT: +		exit(3);  	default:  		fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how);  		exit(100); @@ -532,7 +561,8 @@ static vmexit_handler_t handler[VM_EXITCODE_MAX] = {  	[VM_EXITCODE_MTRAP]  = vmexit_mtrap,  	[VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,  	[VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, -	[VM_EXITCODE_SUSPENDED] = vmexit_suspend +	[VM_EXITCODE_SUSPENDED] = vmexit_suspend, +	[VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch,  };  static void @@ -540,7 +570,6 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)  {  	int error, rc, prevcpu;  	enum vm_exitcode exitcode; -	enum vm_suspend_how how;  	cpuset_t active_cpus;  	if (vcpumap[vcpu] != NULL) { @@ -575,16 +604,6 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)  		case VMEXIT_RESTART:                          rip = vmexit[vcpu].rip;  			break; -		case VMEXIT_RESET: -		case 
VMEXIT_POWEROFF: -			if (rc == VMEXIT_RESET) -				how = VM_SUSPEND_RESET; -			else -				how = VM_SUSPEND_POWEROFF; -			error = vm_suspend(ctx, how); -			assert(error == 0 || errno == EALREADY); -                        rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length; -			break;  		case VMEXIT_ABORT:  			abort();  		default: diff --git a/usr.sbin/bhyve/bhyverun.h b/usr.sbin/bhyve/bhyverun.h index f18d42fc3f01..87824ef9f4f4 100644 --- a/usr.sbin/bhyve/bhyverun.h +++ b/usr.sbin/bhyve/bhyverun.h @@ -35,6 +35,10 @@  #define	__CTASSERT(x, y)	typedef char __assert ## y[(x) ? 1 : -1]  #endif +#define	VMEXIT_CONTINUE		1	/* continue from next instruction */ +#define	VMEXIT_RESTART		2	/* restart current instruction */ +#define	VMEXIT_ABORT		3	/* abort the vm run loop */ +  struct vmctx;  extern int guest_ncpus;  extern char *guest_uuid_str; diff --git a/usr.sbin/bhyve/block_if.c b/usr.sbin/bhyve/block_if.c index b29bc7856e86..1ec0344f3fca 100644 --- a/usr.sbin/bhyve/block_if.c +++ b/usr.sbin/bhyve/block_if.c @@ -390,6 +390,55 @@ blockif_close(struct blockif_ctxt *bc)  }  /* + * Return virtual C/H/S values for a given block. Use the algorithm + * outlined in the VHD specification to calculate values. + */ +void +blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) +{ +	off_t sectors;		/* total sectors of the block dev */ +	off_t hcyl;		/* cylinders times heads */ +	uint16_t secpt;		/* sectors per track */ +	uint8_t heads; + +	assert(bc->bc_magic == BLOCKIF_SIG); + +	sectors = bc->bc_size / bc->bc_sectsz; + +	/* Clamp the size to the largest possible with CHS */ +	if (sectors > 65535UL*16*255) +		sectors = 65535UL*16*255; + +	if (sectors >= 65536UL*16*63) { +		secpt = 255; +		heads = 16; +		hcyl = sectors / secpt; +	} else { +		secpt = 17; +		hcyl = sectors / secpt; +		heads = (hcyl + 1023) / 1024; + +		if (heads < 4) +			heads = 4; + +		if (hcyl >= (heads * 1024) || heads > 16) { +			secpt = 31; +			heads = 16; +			hcyl = sectors / secpt; +		} +		if (hcyl >= (heads * 1024)) { +			secpt = 63; +			heads = 16; +			hcyl = sectors / secpt; +		} +	} + +	*c = hcyl / heads; +	*h = heads; +	*s = secpt; +} + +/*   * Accessors   */  off_t diff --git a/usr.sbin/bhyve/block_if.h b/usr.sbin/bhyve/block_if.h index e0c0bb1f8c8b..c2c21f657446 100644 --- a/usr.sbin/bhyve/block_if.h +++ b/usr.sbin/bhyve/block_if.h @@ -52,6 +52,8 @@ struct blockif_req {  struct blockif_ctxt;  struct blockif_ctxt *blockif_open(const char *optstr, const char *ident);  off_t	blockif_size(struct blockif_ctxt *bc); +void	blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, +    uint8_t *s);  int	blockif_sectsz(struct blockif_ctxt *bc);  int	blockif_queuesz(struct blockif_ctxt *bc);  int	blockif_is_ro(struct blockif_ctxt *bc); diff --git a/usr.sbin/bhyve/inout.c b/usr.sbin/bhyve/inout.c index fe9e0d85625b..1041a59d2194 100644 --- a/usr.sbin/bhyve/inout.c +++ b/usr.sbin/bhyve/inout.c @@ -154,31 +154,28 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)  		/* Limit number of back-to-back in/out emulations to 16 */  		iterations = MIN(count, 16);  		while (iterations > 0) { +			assert(retval == 0);  			if (vie_calculate_gla(vis->paging.cpu_mode,  			    vis->seg_name, &vis->seg_desc, index, bytes,  			    addrsize, prot, &gla)) { -				error = vm_inject_exception2(ctx, vcpu, -				    IDT_GP, 0); -				assert(error == 0); -				retval = INOUT_RESTART; +				vm_inject_gp(ctx, vcpu);  				break;  			} -			error = vm_gla2gpa(ctx, vcpu, &vis->paging, gla, bytes, -			    prot, iov, 
nitems(iov)); -			assert(error == 0 || error == 1 || error == -1); -			if (error) { -				retval = (error == 1) ? INOUT_RESTART : -				    INOUT_ERROR; +			error = vm_copy_setup(ctx, vcpu, &vis->paging, gla, +			    bytes, prot, iov, nitems(iov)); +			if (error == -1) { +				retval = -1;  /* Unrecoverable error */ +				break; +			} else if (error == 1) { +				retval = 0;  /* Resume guest to handle fault */  				break;  			}  			if (vie_alignment_check(vis->paging.cpl, bytes,  			    vis->cr0, vis->rflags, gla)) { -				error = vm_inject_exception2(ctx, vcpu, -				    IDT_AC, 0); -				assert(error == 0); -				return (INOUT_RESTART); +				vm_inject_ac(ctx, vcpu, 0); +				break;  			}  			val = 0; @@ -217,8 +214,8 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)  		}  		/* Restart the instruction if more iterations remain */ -		if (retval == INOUT_OK && count != 0) -			retval = INOUT_RESTART; +		if (retval == 0 && count != 0) +			vmexit->inst_length = 0;  	} else {  		if (!in) {  			val = vmexit->u.inout.eax & vie_size2mask(bytes); diff --git a/usr.sbin/bhyve/inout.h b/usr.sbin/bhyve/inout.h index f15a2c87db72..7f390951d418 100644 --- a/usr.sbin/bhyve/inout.h +++ b/usr.sbin/bhyve/inout.h @@ -34,13 +34,9 @@  struct vmctx;  struct vm_exit; -/* Handler return values. */ -#define	INOUT_ERROR	-1 -#define	INOUT_OK	0 -#define	INOUT_RESTART	1 -#define	INOUT_RESET	2 -#define	INOUT_POWEROFF	3 - +/* + * inout emulation handlers return 0 on success and -1 on failure. + */  typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port,  			    int bytes, uint32_t *eax, void *arg); diff --git a/usr.sbin/bhyve/mem.c b/usr.sbin/bhyve/mem.c index 7ea630f2a587..2a9f430c8262 100644 --- a/usr.sbin/bhyve/mem.c +++ b/usr.sbin/bhyve/mem.c @@ -157,10 +157,12 @@ mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg)  }  int -emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie) +emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie, +    struct vm_guest_paging *paging) +  {  	struct mmio_rb_range *entry; -	int err; +	int err, immutable;  	pthread_rwlock_rdlock(&mmio_rwlock);  	/* @@ -184,10 +186,28 @@ emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie)  	}  	assert(entry != NULL); -	err = vmm_emulate_instruction(ctx, vcpu, paddr, vie, + +	/* +	 * An 'immutable' memory range is guaranteed to be never removed +	 * so there is no need to hold 'mmio_rwlock' while calling the +	 * handler. +	 * +	 * XXX writes to the PCIR_COMMAND register can cause register_mem() +	 * to be called. If the guest is using PCI extended config space +	 * to modify the PCIR_COMMAND register then register_mem() can +	 * deadlock on 'mmio_rwlock'. However by registering the extended +	 * config space window as 'immutable' the deadlock can be avoided. 
+	 */ +	immutable = (entry->mr_param.flags & MEM_F_IMMUTABLE); +	if (immutable) +		pthread_rwlock_unlock(&mmio_rwlock); + +	err = vmm_emulate_instruction(ctx, vcpu, paddr, vie, paging,  				      mem_read, mem_write, &entry->mr_param); -	pthread_rwlock_unlock(&mmio_rwlock); -	 + +	if (!immutable) +		pthread_rwlock_unlock(&mmio_rwlock); +  	return (err);  } @@ -244,6 +264,7 @@ unregister_mem(struct mem_range *memp)  		mr = &entry->mr_param;  		assert(mr->name == memp->name);  		assert(mr->base == memp->base && mr->size == memp->size);  +		assert((mr->flags & MEM_F_IMMUTABLE) == 0);  		RB_REMOVE(mmio_rb_tree, &mmio_rb_root, entry);  		/* flush Per-vCPU cache */	 diff --git a/usr.sbin/bhyve/mem.h b/usr.sbin/bhyve/mem.h index 264bff9e82b0..f671eaedf786 100644 --- a/usr.sbin/bhyve/mem.h +++ b/usr.sbin/bhyve/mem.h @@ -48,9 +48,11 @@ struct mem_range {  #define	MEM_F_READ		0x1  #define	MEM_F_WRITE		0x2  #define	MEM_F_RW		0x3 +#define	MEM_F_IMMUTABLE		0x4	/* mem_range cannot be unregistered */  void	init_mem(void); -int     emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie); +int     emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie, +		    struct vm_guest_paging *paging);  int	register_mem(struct mem_range *memp);  int	register_mem_fallback(struct mem_range *memp); diff --git a/usr.sbin/bhyve/pci_ahci.c b/usr.sbin/bhyve/pci_ahci.c index 9f6110730b09..214237df3757 100644 --- a/usr.sbin/bhyve/pci_ahci.c +++ b/usr.sbin/bhyve/pci_ahci.c @@ -336,8 +336,9 @@ ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd)  	fis[13] = cfis[13];  	if (fis[2] & ATA_S_ERROR)  		p->is |= AHCI_P_IX_TFE; +	else +		p->ci &= ~(1 << slot);  	p->tfd = tfd; -	p->ci &= ~(1 << slot);  	ahci_write_fis(p, FIS_TYPE_REGD2H, fis);  } @@ -598,10 +599,16 @@ handle_identify(struct ahci_port *p, int slot, uint8_t *cfis)  	} else {  		uint16_t buf[256];  		uint64_t sectors; +		uint16_t cyl; +		uint8_t sech, heads;  		sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); +		blockif_chs(p->bctx, &cyl, &heads, &sech);  		memset(buf, 0, sizeof(buf));  		buf[0] = 0x0040; +		buf[1] = cyl; +		buf[3] = heads; +		buf[6] = sech;  		/* TODO emulate different serial? 
*/  		ata_string((uint8_t *)(buf+10), "123456", 20);  		ata_string((uint8_t *)(buf+23), "001", 8); @@ -645,8 +652,8 @@ handle_identify(struct ahci_port *p, int slot, uint8_t *cfis)  		write_prdt(p, slot, cfis, (void *)buf, sizeof(buf));  		p->tfd = ATA_S_DSC | ATA_S_READY;  		p->is |= AHCI_P_IX_DP; +		p->ci &= ~(1 << slot);  	} -	p->ci &= ~(1 << slot);  	ahci_generate_intr(p->pr_sc);  } @@ -688,8 +695,8 @@ handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis)  		write_prdt(p, slot, cfis, (void *)buf, sizeof(buf));  		p->tfd = ATA_S_DSC | ATA_S_READY;  		p->is |= AHCI_P_IX_DHR; +		p->ci &= ~(1 << slot);  	} -	p->ci &= ~(1 << slot);  	ahci_generate_intr(p->pr_sc);  } @@ -1292,7 +1299,6 @@ ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis)  		if (!p->atapi) {  			p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR;  			p->is |= AHCI_P_IX_TFE; -			p->ci &= ~(1 << slot);  			ahci_generate_intr(p->pr_sc);  		} else  			handle_packet_cmd(p, slot, cfis); @@ -1301,7 +1307,6 @@ ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis)  		WPRINTF("Unsupported cmd:%02x\n", cfis[2]);  		p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR;  		p->is |= AHCI_P_IX_TFE; -		p->ci &= ~(1 << slot);  		ahci_generate_intr(p->pr_sc);  		break;  	} @@ -1369,8 +1374,11 @@ ahci_handle_port(struct ahci_port *p)  	 * are already in-flight.  	 */  	for (i = 0; (i < 32) && p->ci; i++) { -		if ((p->ci & (1 << i)) && !(p->pending & (1 << i))) +		if ((p->ci & (1 << i)) && !(p->pending & (1 << i))) { +			p->cmd &= ~AHCI_P_CMD_CCS_MASK; +			p->cmd |= i << AHCI_P_CMD_CCS_SHIFT;  			ahci_handle_slot(p, i); +		}  	}  } diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c index 458ba76480b1..6b906ede42fc 100644 --- a/usr.sbin/bhyve/pci_emul.c +++ b/usr.sbin/bhyve/pci_emul.c @@ -109,16 +109,20 @@ static uint64_t pci_emul_membase64;  #define	PCI_EMUL_IOBASE		0x2000  #define	PCI_EMUL_IOLIMIT	0x10000 -#define	PCI_EMUL_MEMLIMIT32	0xE0000000	/* 3.5GB */ +#define	PCI_EMUL_ECFG_BASE	0xE0000000		    /* 3.5GB */ +#define	PCI_EMUL_ECFG_SIZE	(MAXBUSES * 1024 * 1024)    /* 1MB per bus */ +SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE); + +#define	PCI_EMUL_MEMLIMIT32	PCI_EMUL_ECFG_BASE  #define	PCI_EMUL_MEMBASE64	0xD000000000UL  #define	PCI_EMUL_MEMLIMIT64	0xFD00000000UL  static struct pci_devemu *pci_emul_finddev(char *name); -static void	pci_lintr_route(struct pci_devinst *pi); -static void	pci_lintr_update(struct pci_devinst *pi); - -static struct mem_range pci_mem_hole; +static void pci_lintr_route(struct pci_devinst *pi); +static void pci_lintr_update(struct pci_devinst *pi); +static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, +    int func, int coff, int bytes, uint32_t *val);  /*   * I/O access @@ -1023,12 +1027,37 @@ pci_emul_fallback_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,  	return (0);  } +static int +pci_emul_ecfg_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, +    int bytes, uint64_t *val, void *arg1, long arg2) +{ +	int bus, slot, func, coff, in; + +	coff = addr & 0xfff; +	func = (addr >> 12) & 0x7; +	slot = (addr >> 15) & 0x1f; +	bus = (addr >> 20) & 0xff; +	in = (dir == MEM_F_READ); +	if (in) +		*val = ~0UL; +	pci_cfgrw(ctx, vcpu, in, bus, slot, func, coff, bytes, (uint32_t *)val); +	return (0); +} + +uint64_t +pci_ecfg_base(void) +{ + +	return (PCI_EMUL_ECFG_BASE); +} +  #define	BUSIO_ROUNDUP		32  #define	BUSMEM_ROUNDUP		(1024 * 1024)  int  init_pci(struct vmctx *ctx)  { +	struct mem_range mr;  	struct 
pci_devemu *pde;  	struct businfo *bi;  	struct slotinfo *si; @@ -1112,22 +1141,34 @@ init_pci(struct vmctx *ctx)  	 * The guest physical memory map looks like the following:  	 * [0,		    lowmem)		guest system memory  	 * [lowmem,	    lowmem_limit)	memory hole (may be absent) -	 * [lowmem_limit,   4GB)		PCI hole (32-bit BAR allocation) +	 * [lowmem_limit,   0xE0000000)		PCI hole (32-bit BAR allocation) +	 * [0xE0000000,	    0xF0000000)		PCI extended config window +	 * [0xF0000000,	    4GB)		LAPIC, IOAPIC, HPET, firmware  	 * [4GB,	    4GB + highmem) -	 * +	 */ + +	/*  	 * Accesses to memory addresses that are not allocated to system  	 * memory or PCI devices return 0xff's.  	 */  	lowmem = vm_get_lowmem_size(ctx); +	bzero(&mr, sizeof(struct mem_range)); +	mr.name = "PCI hole"; +	mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; +	mr.base = lowmem; +	mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem; +	mr.handler = pci_emul_fallback_handler; +	error = register_mem_fallback(&mr); +	assert(error == 0); -	memset(&pci_mem_hole, 0, sizeof(struct mem_range)); -	pci_mem_hole.name = "PCI hole"; -	pci_mem_hole.flags = MEM_F_RW; -	pci_mem_hole.base = lowmem; -	pci_mem_hole.size = (4ULL * 1024 * 1024 * 1024) - lowmem; -	pci_mem_hole.handler = pci_emul_fallback_handler; - -	error = register_mem_fallback(&pci_mem_hole); +	/* PCI extended config space */ +	bzero(&mr, sizeof(struct mem_range)); +	mr.name = "PCI ECFG"; +	mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; +	mr.base = PCI_EMUL_ECFG_BASE; +	mr.size = PCI_EMUL_ECFG_SIZE; +	mr.handler = pci_emul_ecfg_handler; +	error = register_mem(&mr);  	assert(error == 0);  	return (0); @@ -1612,41 +1653,6 @@ pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv)  	}  } -static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff; - -static int -pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes, -		 uint32_t *eax, void *arg) -{ -	uint32_t x; - -	if (bytes != 4) { -		if (in) -			*eax = (bytes == 2) ? 0xffff : 0xff; -		return (0); -	} - -	if (in) { -		x = (cfgbus << 16) | -		    (cfgslot << 11) | -		    (cfgfunc << 8) | -		    cfgoff; -                if (cfgenable) -			x |= CONF1_ENABLE;	        -		*eax = x; -	} else { -		x = *eax; -		cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE; -		cfgoff = x & PCI_REGMAX; -		cfgfunc = (x >> 8) & PCI_FUNCMAX; -		cfgslot = (x >> 11) & PCI_SLOTMAX; -		cfgbus = (x >> 16) & PCI_BUSMAX; -	} - -	return (0); -} -INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr); -  static uint32_t  bits_changed(uint32_t old, uint32_t new, uint32_t mask)  { @@ -1709,41 +1715,51 @@ pci_emul_cmdwrite(struct pci_devinst *pi, uint32_t new, int bytes)  	pci_lintr_update(pi);  }	 -static int -pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, -		 uint32_t *eax, void *arg) +static void +pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, +    int coff, int bytes, uint32_t *eax)  {  	struct businfo *bi;  	struct slotinfo *si;  	struct pci_devinst *pi;  	struct pci_devemu *pe; -	int coff, idx, needcfg; +	int idx, needcfg;  	uint64_t addr, bar, mask; -	assert(bytes == 1 || bytes == 2 || bytes == 4); - -	if ((bi = pci_businfo[cfgbus]) != NULL) { -		si = &bi->slotinfo[cfgslot]; -		pi = si->si_funcs[cfgfunc].fi_devi; +	if ((bi = pci_businfo[bus]) != NULL) { +		si = &bi->slotinfo[slot]; +		pi = si->si_funcs[func].fi_devi;  	} else  		pi = NULL; -	coff = cfgoff + (port - CONF1_DATA_PORT); - -#if 0 -	printf("pcicfg-%s from 0x%0x of %d bytes (%d/%d/%d)\n\r", -		in ? 
"read" : "write", coff, bytes, cfgbus, cfgslot, cfgfunc); -#endif -  	/* -	 * Just return if there is no device at this cfgslot:cfgfunc, -	 * if the guest is doing an un-aligned access, or if the config -	 * address word isn't enabled. +	 * Just return if there is no device at this slot:func or if the +	 * the guest is doing an un-aligned access.  	 */ -	if (!cfgenable || pi == NULL || (coff & (bytes - 1)) != 0) { +	if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) || +	    (coff & (bytes - 1)) != 0) {  		if (in)  			*eax = 0xffffffff; -		return (0); +		return; +	} + +	/* +	 * Ignore all writes beyond the standard config space and return all +	 * ones on reads. +	 */ +	if (coff >= PCI_REGMAX + 1) { +		if (in) { +			*eax = 0xffffffff; +			/* +			 * Extended capabilities begin at offset 256 in config +			 * space. Absence of extended capabilities is signaled +			 * with all 0s in the extended capability header at +			 * offset 256. +			 */ +			if (coff <= PCI_REGMAX + 4) +				*eax = 0x00000000; +		} +		return;  	}  	pe = pi->pi_d; @@ -1754,8 +1770,8 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,  	if (in) {  		/* Let the device emulation override the default handler */  		if (pe->pe_cfgread != NULL) { -			needcfg = pe->pe_cfgread(ctx, vcpu, pi, -						    coff, bytes, eax); +			needcfg = pe->pe_cfgread(ctx, vcpu, pi, coff, bytes, +			    eax);  		} else {  			needcfg = 1;  		} @@ -1769,12 +1785,12 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,  				*eax = pci_get_cfgdata32(pi, coff);  		} -		pci_emul_hdrtype_fixup(cfgbus, cfgslot, coff, bytes, eax); +		pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax);  	} else {  		/* Let the device emulation override the default handler */  		if (pe->pe_cfgwrite != NULL &&  		    (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0) -			return (0); +			return;  		/*  		 * Special handling for write to BAR registers @@ -1785,7 +1801,7 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,  			 * 4-byte aligned.  			 */  			if (bytes != 4 || (coff & 0x3) != 0) -				return (0); +				return;  			idx = (coff - PCIR_BAR(0)) / 4;  			mask = ~(pi->pi_bar[idx].size - 1);  			switch (pi->pi_bar[idx].type) { @@ -1843,7 +1859,57 @@ pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,  			CFGWRITE(pi, coff, *eax, bytes);  		}  	} +} + +static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff; + +static int +pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes, +		 uint32_t *eax, void *arg) +{ +	uint32_t x; + +	if (bytes != 4) { +		if (in) +			*eax = (bytes == 2) ? 
0xffff : 0xff; +		return (0); +	} + +	if (in) { +		x = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff; +		if (cfgenable) +			x |= CONF1_ENABLE; +		*eax = x; +	} else { +		x = *eax; +		cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE; +		cfgoff = x & PCI_REGMAX; +		cfgfunc = (x >> 8) & PCI_FUNCMAX; +		cfgslot = (x >> 11) & PCI_SLOTMAX; +		cfgbus = (x >> 16) & PCI_BUSMAX; +	} + +	return (0); +} +INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr); + +static int +pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes, +		 uint32_t *eax, void *arg) +{ +	int coff; +	assert(bytes == 1 || bytes == 2 || bytes == 4); + +	coff = cfgoff + (port - CONF1_DATA_PORT); +	if (cfgenable) { +		pci_cfgrw(ctx, vcpu, in, cfgbus, cfgslot, cfgfunc, coff, bytes, +		    eax); +	} else { +		/* Ignore accesses to cfgdata if not enabled by cfgaddr */ +		if (in) +			*eax = 0xffffffff; +	}  	return (0);  } diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h index 866ffc5b8224..6b8c4e0fd31c 100644 --- a/usr.sbin/bhyve/pci_emul.h +++ b/usr.sbin/bhyve/pci_emul.h @@ -235,6 +235,7 @@ uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size);  int	pci_count_lintr(int bus);  void	pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg);  void	pci_write_dsdt(void); +uint64_t pci_ecfg_base(void);  int	pci_bus_configured(int bus);  static __inline void  diff --git a/usr.sbin/bhyve/pci_irq.c b/usr.sbin/bhyve/pci_irq.c index 653aeb0ff1f6..20e033f2c40e 100644 --- a/usr.sbin/bhyve/pci_irq.c +++ b/usr.sbin/bhyve/pci_irq.c @@ -115,7 +115,7 @@ void  pci_irq_reserve(int irq)  { -	assert(irq < nitems(irq_counts)); +	assert(irq >= 0 && irq < nitems(irq_counts));  	assert(pirq_cold);  	assert(irq_counts[irq] == 0 || irq_counts[irq] == IRQ_DISABLED);  	irq_counts[irq] = IRQ_DISABLED; @@ -125,10 +125,10 @@ void  pci_irq_use(int irq)  { -	assert(irq < nitems(irq_counts)); +	assert(irq >= 0 && irq < nitems(irq_counts));  	assert(pirq_cold); -	if (irq_counts[irq] != IRQ_DISABLED) -		irq_counts[irq]++; +	assert(irq_counts[irq] != IRQ_DISABLED); +	irq_counts[irq]++;  }  void @@ -197,7 +197,7 @@ pirq_alloc_pin(struct vmctx *ctx)  {  	int best_count, best_irq, best_pin, irq, pin; -	pirq_cold = 1; +	pirq_cold = 0;  	/* First, find the least-used PIRQ pin. */  	best_pin = 0; @@ -222,7 +222,7 @@ pirq_alloc_pin(struct vmctx *ctx)  				best_count = irq_counts[irq];  			}  		} -		assert(best_irq != 0); +		assert(best_irq >= 0);  		irq_counts[best_irq]++;  		pirqs[best_pin].reg = best_irq;  		vm_isa_set_irq_trigger(ctx, best_irq, LEVEL_TRIGGER); @@ -234,9 +234,6 @@ pirq_alloc_pin(struct vmctx *ctx)  int  pirq_irq(int pin)  { - -	if (pin == -1) -		return (255);  	assert(pin > 0 && pin <= nitems(pirqs));  	return (pirqs[pin - 1].reg & PIRQ_IRQ);  } diff --git a/usr.sbin/bhyve/pm.c b/usr.sbin/bhyve/pm.c index 67126d8765c7..f5a2d438be7f 100644 --- a/usr.sbin/bhyve/pm.c +++ b/usr.sbin/bhyve/pm.c @@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$");  #include <machine/vmm.h>  #include <assert.h> +#include <errno.h>  #include <pthread.h>  #include <signal.h>  #include <vmmapi.h> @@ -56,6 +57,8 @@ static int  reset_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,      uint32_t *eax, void *arg)  { +	int error; +  	static uint8_t reset_control;  	if (bytes != 1) @@ -66,8 +69,10 @@ reset_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,  		reset_control = *eax;  		/* Treat hard and soft resets the same. 
*/ -		if (reset_control & 0x4) -			return (INOUT_RESET); +		if (reset_control & 0x4) { +			error = vm_suspend(ctx, VM_SUSPEND_RESET); +			assert(error == 0 || errno == EALREADY); +		}  	}  	return (0);  } @@ -224,6 +229,7 @@ static int  pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,      uint32_t *eax, void *arg)  { +	int error;  	if (bytes != 2)  		return (-1); @@ -243,8 +249,10 @@ pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,  		 * says that '5' should be stored in SLP_TYP for S5.  		 */  		if (*eax & PM1_SLP_EN) { -			if ((pm1_control & PM1_SLP_TYP) >> 10 == 5) -				return (INOUT_POWEROFF); +			if ((pm1_control & PM1_SLP_TYP) >> 10 == 5) { +				error = vm_suspend(ctx, VM_SUSPEND_POWEROFF); +				assert(error == 0 || errno == EALREADY); +			}  		}  	}  	return (0); diff --git a/usr.sbin/bhyve/smbiostbl.c b/usr.sbin/bhyve/smbiostbl.c index d560f022fc2a..28c7eb2c74b4 100644 --- a/usr.sbin/bhyve/smbiostbl.c +++ b/usr.sbin/bhyve/smbiostbl.c @@ -321,8 +321,8 @@ struct smbios_table_type0 smbios_type0_template = {  const char *smbios_type0_strings[] = {  	"BHYVE",	/* vendor string */ -	__TIME__,	/* bios version string */ -	__DATE__,	/* bios release date string */ +	"1.00",		/* bios version string */ +	"03/14/2014",	/* bios release date string */  	NULL  }; diff --git a/usr.sbin/bhyve/task_switch.c b/usr.sbin/bhyve/task_switch.c new file mode 100644 index 000000000000..0002da8df8ef --- /dev/null +++ b/usr.sbin/bhyve/task_switch.c @@ -0,0 +1,932 @@ +/*- + * Copyright (c) 2014 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *    notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + *    notice, this list of conditions and the following disclaimer in the + *    documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/_iovec.h> +#include <sys/mman.h> + +#include <x86/psl.h> +#include <x86/segments.h> +#include <x86/specialreg.h> +#include <machine/vmm.h> +#include <machine/vmm_instruction_emul.h> + +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <errno.h> + +#include <vmmapi.h> + +#include "bhyverun.h" + +/* + * Using 'struct i386tss' is tempting but causes myriad sign extension + * issues because all of its fields are defined as signed integers. 
+ */ +struct tss32 { +	uint16_t	tss_link; +	uint16_t	rsvd1; +	uint32_t	tss_esp0; +	uint16_t	tss_ss0; +	uint16_t	rsvd2; +	uint32_t	tss_esp1; +	uint16_t	tss_ss1; +	uint16_t	rsvd3; +	uint32_t	tss_esp2; +	uint16_t	tss_ss2; +	uint16_t	rsvd4; +	uint32_t	tss_cr3; +	uint32_t	tss_eip; +	uint32_t	tss_eflags; +	uint32_t	tss_eax; +	uint32_t	tss_ecx; +	uint32_t	tss_edx; +	uint32_t	tss_ebx; +	uint32_t	tss_esp; +	uint32_t	tss_ebp; +	uint32_t	tss_esi; +	uint32_t	tss_edi; +	uint16_t	tss_es; +	uint16_t	rsvd5; +	uint16_t	tss_cs; +	uint16_t	rsvd6; +	uint16_t	tss_ss; +	uint16_t	rsvd7; +	uint16_t	tss_ds; +	uint16_t	rsvd8; +	uint16_t	tss_fs; +	uint16_t	rsvd9; +	uint16_t	tss_gs; +	uint16_t	rsvd10; +	uint16_t	tss_ldt; +	uint16_t	rsvd11; +	uint16_t	tss_trap; +	uint16_t	tss_iomap; +}; +CTASSERT(sizeof(struct tss32) == 104); + +#define	SEL_START(sel)	(((sel) & ~0x7)) +#define	SEL_LIMIT(sel)	(((sel) | 0x7)) +#define	TSS_BUSY(type)	(((type) & 0x2) != 0) + +static uint64_t +GETREG(struct vmctx *ctx, int vcpu, int reg) +{ +	uint64_t val; +	int error; + +	error = vm_get_register(ctx, vcpu, reg, &val); +	assert(error == 0); +	return (val); +} + +static void +SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val) +{ +	int error; + +	error = vm_set_register(ctx, vcpu, reg, val); +	assert(error == 0); +} + +static struct seg_desc +usd_to_seg_desc(struct user_segment_descriptor *usd) +{ +	struct seg_desc seg_desc; + +	seg_desc.base = (u_int)USD_GETBASE(usd); +	if (usd->sd_gran) +		seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff; +	else +		seg_desc.limit = (u_int)USD_GETLIMIT(usd); +	seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7; +	seg_desc.access |= usd->sd_xx << 12; +	seg_desc.access |= usd->sd_def32 << 14; +	seg_desc.access |= usd->sd_gran << 15; + +	return (seg_desc); +} + +/* + * Inject an exception with an error code that is a segment selector. + * The format of the error code is described in section 6.13, "Error Code", + * Intel SDM volume 3. + * + * Bit 0 (EXT) denotes whether the exception occurred during delivery + * of an external event like an interrupt. + * + * Bit 1 (IDT) indicates whether the selector points to a gate descriptor + * in the IDT. + * + * Bit 2(GDT/LDT) has the usual interpretation of Table Indicator (TI). + */ +static void +sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext) +{ +	/* +	 * Bit 2 from the selector is retained as-is in the error code. +	 * +	 * Bit 1 can be safely cleared because none of the selectors +	 * encountered during task switch emulation refer to a task +	 * gate in the IDT. +	 * +	 * Bit 0 is set depending on the value of 'ext'. +	 */ +	sel &= ~0x3; +	if (ext) +		sel |= 0x1; +	vm_inject_fault(ctx, vcpu, vector, 1, sel); +} + +/* + * Return 0 if the selector 'sel' in within the limits of the GDT/LDT + * and non-zero otherwise. + */ +static int +desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel) +{ +	uint64_t base; +	uint32_t limit, access; +	int error, reg; + +	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; +	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access); +	assert(error == 0); + +	if (reg == VM_REG_GUEST_LDTR) { +		if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access)) +			return (-1); +	} + +	if (limit < SEL_LIMIT(sel)) +		return (-1); +	else +		return (0); +} + +/* + * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced + * by the selector 'sel'. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. 
+ * Returns -1 otherwise. + */ +static int +desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, +    uint16_t sel, struct user_segment_descriptor *desc, bool doread) +{ +	struct iovec iov[2]; +	uint64_t base; +	uint32_t limit, access; +	int error, reg; + +	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; +	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access); +	assert(error == 0); +	assert(limit >= SEL_LIMIT(sel)); + +	error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel), +	    sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov)); +	if (error == 0) { +		if (doread) +			vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc)); +		else +			vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc)); +	} +	return (error); +} + +static int +desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, +    uint16_t sel, struct user_segment_descriptor *desc) +{ +	return (desc_table_rw(ctx, vcpu, paging, sel, desc, true)); +} + +static int +desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, +    uint16_t sel, struct user_segment_descriptor *desc) +{ +	return (desc_table_rw(ctx, vcpu, paging, sel, desc, false)); +} + +/* + * Read the TSS descriptor referenced by 'sel' into 'desc'. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. + */ +static int +read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, +    uint16_t sel, struct user_segment_descriptor *desc) +{ +	struct vm_guest_paging sup_paging; +	int error; + +	assert(!ISLDT(sel)); +	assert(IDXSEL(sel) != 0); + +	/* Fetch the new TSS descriptor */ +	if (desc_table_limit_check(ctx, vcpu, sel)) { +		if (ts->reason == TSR_IRET) +			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); +		else +			sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext); +		return (1); +	} + +	sup_paging = ts->paging; +	sup_paging.cpl = 0;		/* implicit supervisor mode */ +	error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc); +	return (error); +} + +static bool +code_desc(int sd_type) +{ +	/* code descriptor */ +	return ((sd_type & 0x18) == 0x18); +} + +static bool +stack_desc(int sd_type) +{ +	/* writable data descriptor */ +	return ((sd_type & 0x1A) == 0x12); +} + +static bool +data_desc(int sd_type) +{ +	/* data descriptor or a readable code descriptor */ +	return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A); +} + +static bool +ldt_desc(int sd_type) +{ + +	return (sd_type == SDT_SYSLDT); +} + +/* + * Validate the descriptor 'seg_desc' associated with 'segment'. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. 
+ */ +static int +validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, +    int segment, struct seg_desc *seg_desc) +{ +	struct vm_guest_paging sup_paging; +	struct user_segment_descriptor usd; +	int error, idtvec; +	int cpl, dpl, rpl; +	uint16_t sel, cs; +	bool ldtseg, codeseg, stackseg, dataseg, conforming; + +	ldtseg = codeseg = stackseg = dataseg = false; +	switch (segment) { +	case VM_REG_GUEST_LDTR: +		ldtseg = true; +		break; +	case VM_REG_GUEST_CS: +		codeseg = true; +		break; +	case VM_REG_GUEST_SS: +		stackseg = true; +		break; +	case VM_REG_GUEST_DS: +	case VM_REG_GUEST_ES: +	case VM_REG_GUEST_FS: +	case VM_REG_GUEST_GS: +		dataseg = true; +		break; +	default: +		assert(0); +	} + +	/* Get the segment selector */ +	sel = GETREG(ctx, vcpu, segment); + +	/* LDT selector must point into the GDT */ +	if (ldtseg && ISLDT(sel)) { +		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); +		return (1); +	} + +	/* Descriptor table limit check */ +	if (desc_table_limit_check(ctx, vcpu, sel)) { +		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); +		return (1); +	} + +	/* NULL selector */ +	if (IDXSEL(sel) == 0) { +		/* Code and stack segment selectors cannot be NULL */ +		if (codeseg || stackseg) { +			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); +			return (1); +		} +		seg_desc->base = 0; +		seg_desc->limit = 0; +		seg_desc->access = 0x10000;	/* unusable */ +		return (0); +	} + +	/* Read the descriptor from the GDT/LDT */ +	sup_paging = ts->paging; +	sup_paging.cpl = 0;	/* implicit supervisor mode */ +	error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd); +	if (error) +		return (error); + +	/* Verify that the descriptor type is compatible with the segment */ +	if ((ldtseg && !ldt_desc(usd.sd_type)) || +	    (codeseg && !code_desc(usd.sd_type)) || +	    (dataseg && !data_desc(usd.sd_type)) || +	    (stackseg && !stack_desc(usd.sd_type))) { +		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); +		return (1); +	} + +	/* Segment must be marked present */ +	if (!usd.sd_p) { +		if (ldtseg) +			idtvec = IDT_TS; +		else if (stackseg) +			idtvec = IDT_SS; +		else +			idtvec = IDT_NP; +		sel_exception(ctx, vcpu, idtvec, sel, ts->ext); +		return (1); +	} + +	cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS); +	cpl = cs & SEL_RPL_MASK; +	rpl = sel & SEL_RPL_MASK; +	dpl = usd.sd_dpl; + +	if (stackseg && (rpl != cpl || dpl != cpl)) { +		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); +		return (1); +	} + +	if (codeseg) { +		conforming = (usd.sd_type & 0x4) ? true : false; +		if ((conforming && (cpl < dpl)) || +		    (!conforming && (cpl != dpl))) { +			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); +			return (1); +		} +	} + +	if (dataseg) { +		/* +		 * A data segment is always non-conforming except when it's +		 * descriptor is a readable, conforming code segment. 
+		 */ +		if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0) +			conforming = true; +		else +			conforming = false; + +		if (!conforming && (rpl > dpl || cpl > dpl)) { +			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); +			return (1); +		} +	} +	*seg_desc = usd_to_seg_desc(&usd); +	return (0); +} + +static void +tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch, +    uint32_t eip, struct tss32 *tss, struct iovec *iov) +{ + +	/* General purpose registers */ +	tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX); +	tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX); +	tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX); +	tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX); +	tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP); +	tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP); +	tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI); +	tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI); + +	/* Segment selectors */ +	tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES); +	tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS); +	tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS); +	tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS); +	tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS); +	tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS); + +	/* eflags and eip */ +	tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS); +	if (task_switch->reason == TSR_IRET) +		tss->tss_eflags &= ~PSL_NT; +	tss->tss_eip = eip; + +	/* Copy updated old TSS into guest memory */ +	vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32)); +} + +static void +update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd) +{ +	int error; + +	error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access); +	assert(error == 0); +} + +/* + * Update the vcpu registers to reflect the state of the new task. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. + */ +static int +tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, +    uint16_t ot_sel, struct tss32 *tss, struct iovec *iov) +{ +	struct seg_desc seg_desc, seg_desc2; +	uint64_t *pdpte, maxphyaddr, reserved; +	uint32_t eflags; +	int error, i; +	bool nested; + +	nested = false; +	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) { +		tss->tss_link = ot_sel; +		nested = true; +	} + +	eflags = tss->tss_eflags; +	if (nested) +		eflags |= PSL_NT; + +	/* LDTR */ +	SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt); + +	/* PBDR */ +	if (ts->paging.paging_mode != PAGING_MODE_FLAT) { +		if (ts->paging.paging_mode == PAGING_MODE_PAE) { +			/* +			 * XXX Assuming 36-bit MAXPHYADDR. +			 */ +			maxphyaddr = (1UL << 36) - 1; +			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32); +			for (i = 0; i < 4; i++) { +				/* Check reserved bits if the PDPTE is valid */ +				if (!(pdpte[i] & 0x1)) +					continue; +				/* +				 * Bits 2:1, 8:5 and bits above the processor's +				 * maximum physical address are reserved. 
+				 */ +				reserved = ~maxphyaddr | 0x1E6; +				if (pdpte[i] & reserved) { +					vm_inject_gp(ctx, vcpu); +					return (1); +				} +			} +			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]); +			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]); +			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]); +			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]); +		} +		SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3); +		ts->paging.cr3 = tss->tss_cr3; +	} + +	/* eflags and eip */ +	SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags); +	SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip); + +	/* General purpose registers */ +	SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax); +	SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx); +	SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx); +	SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx); +	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp); +	SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp); +	SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi); +	SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi); + +	/* Segment selectors */ +	SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es); +	SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs); +	SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss); +	SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds); +	SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs); +	SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs); + +	/* +	 * If this is a nested task then write out the new TSS to update +	 * the previous link field. +	 */ +	if (nested) +		vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss)); + +	/* Validate segment descriptors */ +	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc); +	if (error) +		return (error); +	update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc); + +	/* +	 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3. +	 * +	 * The SS and CS attribute checks on VM-entry are inter-dependent so +	 * we need to make sure that both segments are valid before updating +	 * either of them. This ensures that the VMCS state can pass the +	 * VM-entry checks so the guest can handle any exception injected +	 * during task switch emulation. +	 */ +	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc); +	if (error) +		return (error); +	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2); +	if (error) +		return (error); +	update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc); +	update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2); +	ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK; + +	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc); +	if (error) +		return (error); +	update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc); + +	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc); +	if (error) +		return (error); +	update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc); + +	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc); +	if (error) +		return (error); +	update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc); + +	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc); +	if (error) +		return (error); +	update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc); + +	return (0); +} + +/* + * Push an error code on the stack of the new task. This is needed if the + * task switch was triggered by a hardware exception that causes an error + * code to be saved (e.g. #PF). + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. 
+ */ +static int +push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, +    int task_type, uint32_t errcode) +{ +	struct iovec iov[2]; +	struct seg_desc seg_desc; +	int stacksize, bytes, error; +	uint64_t gla, cr0, rflags; +	uint32_t esp; +	uint16_t stacksel; + +	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0); +	rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS); +	stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS); + +	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base, +	    &seg_desc.limit, &seg_desc.access); +	assert(error == 0); + +	/* +	 * Section "Error Code" in the Intel SDM vol 3: the error code is +	 * pushed on the stack as a doubleword or word (depending on the +	 * default interrupt, trap or task gate size). +	 */ +	if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS) +		bytes = 4; +	else +		bytes = 2; + +	/* +	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the +	 * stack-segment descriptor determines the size of the stack +	 * pointer outside of 64-bit mode. +	 */ +	if (SEG_DESC_DEF32(seg_desc.access)) +		stacksize = 4; +	else +		stacksize = 2; + +	esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP); +	esp -= bytes; + +	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, +	    &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) { +		sel_exception(ctx, vcpu, IDT_SS, stacksel, 1); +		return (1); +	} + +	if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) { +		vm_inject_ac(ctx, vcpu, 1); +		return (1); +	} + +	error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE, +	    iov, nitems(iov)); +	if (error) +		return (error); + +	vm_copyout(ctx, vcpu, &errcode, iov, bytes); +	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp); +	return (0); +} + +/* + * Evaluate return value from helper functions and potentially return to + * the VM run loop. + *  0: success + * +1: an exception was injected into the guest vcpu + * -1: unrecoverable/programming error + */ +#define	CHKERR(x)							\ +	do {								\ +		assert(((x) == 0) || ((x) == 1) || ((x) == -1));	\ +		if ((x) == -1)						\ +			return (VMEXIT_ABORT);				\ +		else if ((x) == 1)					\ +			return (VMEXIT_CONTINUE);			\ +	} while (0) + +int +vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ +	struct seg_desc nt; +	struct tss32 oldtss, newtss; +	struct vm_task_switch *task_switch; +	struct vm_guest_paging *paging, sup_paging; +	struct user_segment_descriptor nt_desc, ot_desc; +	struct iovec nt_iov[2], ot_iov[2]; +	uint64_t cr0, ot_base; +	uint32_t eip, ot_lim, access; +	int error, ext, minlimit, nt_type, ot_type, vcpu; +	enum task_switch_reason reason; +	uint16_t nt_sel, ot_sel; + +	task_switch = &vmexit->u.task_switch; +	nt_sel = task_switch->tsssel; +	ext = vmexit->u.task_switch.ext; +	reason = vmexit->u.task_switch.reason; +	paging = &vmexit->u.task_switch.paging; +	vcpu = *pvcpu; + +	assert(paging->cpu_mode == CPU_MODE_PROTECTED); + +	/* +	 * Section 4.6, "Access Rights" in Intel SDM Vol 3. 
+	 * The following page table accesses are implicitly supervisor mode: +	 * - accesses to GDT or LDT to load segment descriptors +	 * - accesses to the task state segment during task switch +	 */ +	sup_paging = *paging; +	sup_paging.cpl = 0;	/* implicit supervisor mode */ + +	/* Fetch the new TSS descriptor */ +	error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc); +	CHKERR(error); + +	nt = usd_to_seg_desc(&nt_desc); + +	/* Verify the type of the new TSS */ +	nt_type = SEG_DESC_TYPE(nt.access); +	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS && +	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) { +		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); +		goto done; +	} + +	/* TSS descriptor must have present bit set */ +	if (!SEG_DESC_PRESENT(nt.access)) { +		sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext); +		goto done; +	} + +	/* +	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and +	 * 44 bytes for a 16-bit TSS. +	 */ +	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS) +		minlimit = 104 - 1; +	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) +		minlimit = 44 - 1; +	else +		minlimit = 0; + +	assert(minlimit > 0); +	if (nt.limit < minlimit) { +		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); +		goto done; +	} + +	/* TSS must be busy if task switch is due to IRET */ +	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) { +		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); +		goto done; +	} + +	/* +	 * TSS must be available (not busy) if task switch reason is +	 * CALL, JMP, exception or interrupt. +	 */ +	if (reason != TSR_IRET && TSS_BUSY(nt_type)) { +		sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext); +		goto done; +	} + +	/* Fetch the new TSS */ +	error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1, +	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov)); +	CHKERR(error); +	vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1); + +	/* Get the old TSS selector from the guest's task register */ +	ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR); +	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) { +		/* +		 * This might happen if a task switch was attempted without +		 * ever loading the task register with LTR. In this case the +		 * TR would contain the values from power-on: +		 * (sel = 0, base = 0, limit = 0xffff). +		 */ +		sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext); +		goto done; +	} + +	/* Get the old TSS base and limit from the guest's task register */ +	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim, +	    &access); +	assert(error == 0); +	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access)); +	ot_type = SEG_DESC_TYPE(access); +	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY); + +	/* Fetch the old TSS descriptor */ +	error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc); +	CHKERR(error); + +	/* Get the old TSS */ +	error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1, +	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov)); +	CHKERR(error); +	vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1); + +	/* +	 * Clear the busy bit in the old TSS descriptor if the task switch +	 * due to an IRET or JMP instruction. 
+	 */ +	if (reason == TSR_IRET || reason == TSR_JMP) { +		ot_desc.sd_type &= ~0x2; +		error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel, +		    &ot_desc); +		CHKERR(error); +	} + +	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) { +		fprintf(stderr, "Task switch to 16-bit TSS not supported\n"); +		return (VMEXIT_ABORT); +	} + +	/* Save processor state in old TSS */ +	eip = vmexit->rip + vmexit->inst_length; +	tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov); + +	/* +	 * If the task switch was triggered for any reason other than IRET +	 * then set the busy bit in the new TSS descriptor. +	 */ +	if (reason != TSR_IRET) { +		nt_desc.sd_type |= 0x2; +		error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel, +		    &nt_desc); +		CHKERR(error); +	} + +	/* Update task register to point at the new TSS */ +	SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel); + +	/* Update the hidden descriptor state of the task register */ +	nt = usd_to_seg_desc(&nt_desc); +	update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt); + +	/* Set CR0.TS */ +	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0); +	SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS); + +	/* +	 * We are now committed to the task switch. Any exceptions encountered +	 * after this point will be handled in the context of the new task and +	 * the saved instruction pointer will belong to the new task. +	 */ +	vmexit->rip = newtss.tss_eip; +	vmexit->inst_length = 0; + +	/* Load processor state from new TSS */ +	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov); +	CHKERR(error); + +	/* +	 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception +	 * caused an error code to be generated, this error code is copied +	 * to the stack of the new task. +	 */ +	if (task_switch->errcode_valid) { +		assert(task_switch->ext); +		assert(task_switch->reason == TSR_IDT_GATE); +		error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type, +		    task_switch->errcode); +		CHKERR(error); +	} + +	/* +	 * Treatment of virtual-NMI blocking if NMI is delivered through +	 * a task gate. +	 * +	 * Section "Architectural State Before A VM Exit", Intel SDM, Vol3: +	 * If the virtual NMIs VM-execution control is 1, VM entry injects +	 * an NMI, and delivery of the NMI causes a task switch that causes +	 * a VM exit, virtual-NMI blocking is in effect before the VM exit +	 * commences. +	 * +	 * Thus, virtual-NMI blocking is in effect at the time of the task +	 * switch VM exit. +	 */ + +	/* +	 * Treatment of virtual-NMI unblocking on IRET from NMI handler task. +	 * +	 * Section "Changes to Instruction Behavior in VMX Non-Root Operation" +	 * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking. +	 * This unblocking of virtual-NMI occurs even if IRET causes a fault. +	 * +	 * Thus, virtual-NMI blocking is cleared at the time of the task switch +	 * VM exit. +	 */ + +	/* +	 * If the task switch was triggered by an event delivered through +	 * the IDT then extinguish the pending event from the vcpu's +	 * exitintinfo. 
+	 */ +	if (task_switch->reason == TSR_IDT_GATE) { +		error = vm_set_intinfo(ctx, vcpu, 0); +		assert(error == 0); +	} + +	/* +	 * XXX should inject debug exception if 'T' bit is 1 +	 */ +done: +	return (VMEXIT_CONTINUE); +} diff --git a/usr.sbin/bhyve/virtio.c b/usr.sbin/bhyve/virtio.c index 4e58dd62a0e7..1f2730092936 100644 --- a/usr.sbin/bhyve/virtio.c +++ b/usr.sbin/bhyve/virtio.c @@ -437,7 +437,7 @@ vq_endchains(struct vqueue_info *vq, int used_all_avail)  	if (used_all_avail &&  	    (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY))  		intr = 1; -	else if (vs->vs_flags & VIRTIO_EVENT_IDX) { +	else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) {  		event_idx = VQ_USED_EVENT_IDX(vq);  		/*  		 * This calculation is per docs and the kernel diff --git a/usr.sbin/bhyve/virtio.h b/usr.sbin/bhyve/virtio.h index 01b5f7b9112d..1f29dfa8ef73 100644 --- a/usr.sbin/bhyve/virtio.h +++ b/usr.sbin/bhyve/virtio.h @@ -352,7 +352,7 @@ struct virtio_consts {  					/* called to read config regs */  	int	(*vc_cfgwrite)(void *, int, int, uint32_t);  					/* called to write config regs */ -	uint32_t vc_hv_caps;		/* hypervisor-provided capabilities */ +	uint64_t vc_hv_caps;		/* hypervisor-provided capabilities */  };  /* diff --git a/usr.sbin/bhyvectl/bhyvectl.c b/usr.sbin/bhyvectl/bhyvectl.c index e77f0d77df6f..b6006b72a767 100644 --- a/usr.sbin/bhyvectl/bhyvectl.c +++ b/usr.sbin/bhyvectl/bhyvectl.c @@ -195,7 +195,8 @@ usage(void)  	"       [--force-reset]\n"  	"       [--force-poweroff]\n"  	"       [--get-active-cpus]\n" -	"       [--get-suspended-cpus]\n", +	"       [--get-suspended-cpus]\n" +	"       [--get-intinfo]\n",  	progname);  	exit(1);  } @@ -205,6 +206,7 @@ static int inject_nmi, assert_lapic_lvt;  static int force_reset, force_poweroff;  static const char *capname;  static int create, destroy, get_lowmem, get_highmem; +static int get_intinfo;  static int get_active_cpus, get_suspended_cpus;  static uint64_t memsize;  static int set_cr0, get_cr0, set_cr3, get_cr3, set_cr4, get_cr4; @@ -412,6 +414,37 @@ print_cpus(const char *banner, const cpuset_t *cpus)  	printf("\n");  } +static void +print_intinfo(const char *banner, uint64_t info) +{ +	int type; + +	printf("%s:\t", banner); +	if (info & VM_INTINFO_VALID) { +		type = info & VM_INTINFO_TYPE; +		switch (type) { +		case VM_INTINFO_HWINTR: +			printf("extint"); +			break; +		case VM_INTINFO_NMI: +			printf("nmi"); +			break; +		case VM_INTINFO_SWINTR: +			printf("swint"); +			break; +		default: +			printf("exception"); +			break; +		} +		printf(" vector %d", (int)VM_INTINFO_VECTOR(info)); +		if (info & VM_INTINFO_DEL_ERRCODE) +			printf(" errcode %#x", (u_int)(info >> 32)); +	} else { +		printf("n/a"); +	} +	printf("\n"); +} +  int  main(int argc, char *argv[])  { @@ -420,7 +453,7 @@ main(int argc, char *argv[])  	vm_paddr_t gpa, gpa_pmap;  	size_t len;  	struct vm_exit vmexit; -	uint64_t ctl, eptp, bm, addr, u64, pteval[4], *pte; +	uint64_t ctl, eptp, bm, addr, u64, pteval[4], *pte, info[2];  	struct vmctx *ctx;  	int wired;  	cpuset_t cpus; @@ -595,6 +628,7 @@ main(int argc, char *argv[])  		{ "force-poweroff", NO_ARG,	&force_poweroff, 1 },  		{ "get-active-cpus", NO_ARG,	&get_active_cpus, 1 },  		{ "get-suspended-cpus", NO_ARG,	&get_suspended_cpus, 1 }, +		{ "get-intinfo", NO_ARG,	&get_intinfo,	1 },  		{ NULL,		0,		NULL,		0 }  	}; @@ -1566,6 +1600,14 @@ main(int argc, char *argv[])  			print_cpus("suspended cpus", &cpus);  	} +	if (!error && (get_intinfo || get_all)) { +		error = vm_get_intinfo(ctx, vcpu, &info[0], &info[1]); +		if 
(!error) { +			print_intinfo("pending", info[0]); +			print_intinfo("current", info[1]); +		} +	} +  	if (!error && run) {  		error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);  		assert(error == 0); diff --git a/usr.sbin/bhyveload/bhyveload.8 b/usr.sbin/bhyveload/bhyveload.8 index 3a300cc00f85..0bdf151eb268 100644 --- a/usr.sbin/bhyveload/bhyveload.8 +++ b/usr.sbin/bhyveload/bhyveload.8 @@ -35,11 +35,11 @@  guest inside a bhyve virtual machine  .Sh SYNOPSIS  .Nm -.Op Fl m Ar mem-size +.Op Fl c Ar cons-dev  .Op Fl d Ar disk-path -.Op Fl h Ar host-path  .Op Fl e Ar name=value -.Op Fl c Ar cons-dev +.Op Fl h Ar host-path +.Op Fl m Ar mem-size  .Ar vmname  .Sh DESCRIPTION  .Nm @@ -62,6 +62,32 @@ and will be created if it does not already exist.  .Sh OPTIONS  The following options are available:  .Bl -tag -width indent +.It Fl c Ar cons-dev +.Ar cons-dev +is a +.Xr tty 4 +device to use for +.Nm +terminal I/O. +.Pp +The text string "stdio" is also accepted and selects the use of +unbuffered standard I/O. This is the default value. +.It Fl d Ar disk-path +The +.Ar disk-path +is the pathname of the guest's boot disk image. +.It Fl e Ar name=value +Set the FreeBSD loader environment variable +.Ar name +to +.Ar value . +.Pp +The option may be used more than once to set more than one environment +variable. +.It Fl h Ar host-path +The +.Ar host-path +is the directory at the top of the guest's boot filesystem.  .It Fl m Ar mem-size Xo  .Sm off  .Op Cm K | k | M | m | G | g | T | t @@ -85,32 +111,6 @@ respectively.  The default value of  .Ar mem-size  is 256M. -.It Fl d Ar disk-path -The -.Ar disk-path -is the pathname of the guest's boot disk image. -.It Fl h Ar host-path -The -.Ar host-path -is the directory at the top of the guest's boot filesystem. -.It Fl e Ar name=value -Set the FreeBSD loader environment variable -.Ar name -to -.Ar value . -.Pp -The option may be used more than once to set more than one environment -variable. -.It Fl c Ar cons-dev -.Ar cons-dev -is a -.Xr tty 4 -device to use for -.Nm -terminal I/O. -.Pp -The text string "stdio" is also accepted and selects the use of -unbuffered standard I/O. This is the default value.  .El  .Sh EXAMPLES  To create a virtual machine named diff --git a/usr.sbin/bhyveload/bhyveload.c b/usr.sbin/bhyveload/bhyveload.c index ff6b26926f3b..eaf71a819373 100644 --- a/usr.sbin/bhyveload/bhyveload.c +++ b/usr.sbin/bhyveload/bhyveload.c @@ -629,8 +629,8 @@ usage(void)  {  	fprintf(stderr, -	    "usage: %s [-m mem-size] [-d <disk-path>] [-h <host-path>]\n" -	    "       %*s [-e <name=value>] [-c <console-device>] <vmname>\n", +	    "usage: %s [-c <console-device>] [-d <disk-path>] [-e <name=value>]\n" +	    "       %*s [-h <host-path>] [-m mem-size] <vmname>\n",  	    progname,  	    (int)strlen(progname), "");  	exit(1);  | 
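The pci_emul_ecfg_handler hunk above decodes a memory-mapped extended config access by slicing the guest physical address into register/function/slot/bus fields: 4KB of config space per function, 8 functions per slot, 32 slots per bus, 1MB per bus. A minimal standalone sketch of that decode, using a made-up example address, could look like this:

```c
#include <stdint.h>
#include <stdio.h>

#define	ECFG_BASE	0xE0000000UL	/* mirrors PCI_EMUL_ECFG_BASE above */

/* Slice an ECAM-style address the same way pci_emul_ecfg_handler does. */
static void
ecfg_decode(uint64_t addr, int *bus, int *slot, int *func, int *coff)
{
	*coff = addr & 0xfff;		/* 4KB of config space per function */
	*func = (addr >> 12) & 0x7;	/* 8 functions per slot */
	*slot = (addr >> 15) & 0x1f;	/* 32 slots per bus */
	*bus  = (addr >> 20) & 0xff;	/* 1MB of config space per bus */
}

int
main(void)
{
	int bus, slot, func, coff;
	/* hypothetical access: bus 0, slot 3, function 1, register 0x100 */
	uint64_t addr = ECFG_BASE + (3UL << 15) + (1UL << 12) + 0x100;

	ecfg_decode(addr, &bus, &slot, &func, &coff);
	printf("%d/%d/%d reg %#x\n", bus, slot, func, coff);	/* 0/3/1 reg 0x100 */
	return (0);
}
```

The arithmetic works out because the 0xE0000000 base is 256MB-aligned, so the base contributes nothing to the low eight bits of addr >> 20 and the bus number falls out directly.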
