Diffstat (limited to 'sys/amd64/vmm')
66 files changed, 31499 insertions, 0 deletions
diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c new file mode 100644 index 000000000000..c3a4547afeeb --- /dev/null +++ b/sys/amd64/vmm/amd/amdv.c @@ -0,0 +1,130 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/errno.h> + +#include <machine/vmm.h> +#include "io/iommu.h" + +static int +amd_iommu_init(void) +{ + + printf("amd_iommu_init: not implemented\n"); + return (ENXIO); +} + +static void +amd_iommu_cleanup(void) +{ + + printf("amd_iommu_cleanup: not implemented\n"); +} + +static void +amd_iommu_enable(void) +{ + + printf("amd_iommu_enable: not implemented\n"); +} + +static void +amd_iommu_disable(void) +{ + + printf("amd_iommu_disable: not implemented\n"); +} + +static void * +amd_iommu_create_domain(vm_paddr_t maxaddr) +{ + + printf("amd_iommu_create_domain: not implemented\n"); + return (NULL); +} + +static void +amd_iommu_destroy_domain(void *domain) +{ + + printf("amd_iommu_destroy_domain: not implemented\n"); +} + +static uint64_t +amd_iommu_create_mapping(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, + uint64_t len) +{ + + printf("amd_iommu_create_mapping: not implemented\n"); + return (0); +} + +static uint64_t +amd_iommu_remove_mapping(void *domain, vm_paddr_t gpa, uint64_t len) +{ + + printf("amd_iommu_remove_mapping: not implemented\n"); + return (0); +} + +static void +amd_iommu_add_device(void *domain, uint16_t rid) +{ + + printf("amd_iommu_add_device: not implemented\n"); +} + +static void +amd_iommu_remove_device(void *domain, uint16_t rid) +{ + + printf("amd_iommu_remove_device: not implemented\n"); +} + +static void +amd_iommu_invalidate_tlb(void *domain) +{ + + printf("amd_iommu_invalidate_tlb: not implemented\n"); +} + +struct iommu_ops iommu_ops_amd = { + amd_iommu_init, + amd_iommu_cleanup, + amd_iommu_enable, + amd_iommu_disable, + amd_iommu_create_domain, + amd_iommu_destroy_domain, + amd_iommu_create_mapping, + amd_iommu_remove_mapping, + amd_iommu_add_device, + amd_iommu_remove_device, + amd_iommu_invalidate_tlb, +}; diff --git a/sys/amd64/vmm/amd/amdvi_hw.c b/sys/amd64/vmm/amd/amdvi_hw.c new file mode 100644 index 000000000000..831c31277570 --- 
/dev/null +++ b/sys/amd64/vmm/amd/amdvi_hw.c @@ -0,0 +1,1387 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2016, Anish Gupta (anish@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/malloc.h> +#include <sys/pcpu.h> +#include <sys/rman.h> +#include <sys/smp.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <dev/pci/pcivar.h> +#include <dev/pci/pcireg.h> + +#include <machine/resource.h> +#include <machine/vmm.h> +#include <machine/pmap.h> +#include <machine/vmparam.h> +#include <machine/pci_cfgreg.h> + +#include "ivhd_if.h" +#include "pcib_if.h" + +#include "io/iommu.h" +#include "amdvi_priv.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, amdvi, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, + NULL); + +#define MOD_INC(a, s, m) (((a) + (s)) % ((m) * (s))) +#define MOD_DEC(a, s, m) (((a) - (s)) % ((m) * (s))) + +/* Print RID or device ID in PCI string format. */ +#define RID2PCI_STR(d) PCI_RID2BUS(d), PCI_RID2SLOT(d), PCI_RID2FUNC(d) + +static void amdvi_dump_cmds(struct amdvi_softc *softc, int count); +static void amdvi_print_dev_cap(struct amdvi_softc *softc); + +MALLOC_DEFINE(M_AMDVI, "amdvi", "amdvi"); + +extern device_t *ivhd_devs; + +extern int ivhd_count; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, count, CTLFLAG_RDTUN, &ivhd_count, + 0, NULL); + +static int amdvi_enable_user = 0; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, enable, CTLFLAG_RDTUN, + &amdvi_enable_user, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi_enable", &amdvi_enable_user); + +#ifdef AMDVI_ATS_ENABLE +/* XXX: ATS is not tested. */ +static int amdvi_enable_iotlb = 1; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, iotlb_enabled, CTLFLAG_RDTUN, + &amdvi_enable_iotlb, 0, NULL); +TUNABLE_INT("hw.vmm.enable_iotlb", &amdvi_enable_iotlb); +#endif + +static int amdvi_host_ptp = 1; /* Use page tables for host. */ +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, host_ptp, CTLFLAG_RDTUN, + &amdvi_host_ptp, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi.host_ptp", &amdvi_host_ptp); + +/* Page table level used <= supported by h/w[v1=7]. 
*/ +int amdvi_ptp_level = 4; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, ptp_level, CTLFLAG_RDTUN, + &amdvi_ptp_level, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi.ptp_level", &amdvi_ptp_level); + +/* Disable fault event reporting. */ +static int amdvi_disable_io_fault = 0; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, disable_io_fault, CTLFLAG_RDTUN, + &amdvi_disable_io_fault, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi.disable_io_fault", &amdvi_disable_io_fault); + +static uint32_t amdvi_dom_id = 0; /* 0 is reserved for host. */ +SYSCTL_UINT(_hw_vmm_amdvi, OID_AUTO, domain_id, CTLFLAG_RD, + &amdvi_dom_id, 0, NULL); +/* + * Device table entry. + * Bus(256) x Dev(32) x Fun(8) x DTE(256 bits or 32 bytes). + * = 256 * 2 * PAGE_SIZE. + */ +static struct amdvi_dte amdvi_dte[PCI_NUM_DEV_MAX] __aligned(PAGE_SIZE); +CTASSERT(PCI_NUM_DEV_MAX == 0x10000); +CTASSERT(sizeof(amdvi_dte) == 0x200000); + +static SLIST_HEAD (, amdvi_domain) dom_head; + +static inline uint32_t +amdvi_pci_read(struct amdvi_softc *softc, int off) +{ + + return (pci_cfgregread(softc->pci_seg, PCI_RID2BUS(softc->pci_rid), + PCI_RID2SLOT(softc->pci_rid), PCI_RID2FUNC(softc->pci_rid), + off, 4)); +} + +#ifdef AMDVI_ATS_ENABLE +/* XXX: Should be in pci.c */ +/* + * Check if device has ATS capability and its enabled. + * If ATS is absent or disabled, return (-1), otherwise ATS + * queue length. + */ +static int +amdvi_find_ats_qlen(uint16_t devid) +{ + device_t dev; + uint32_t off, cap; + int qlen = -1; + + dev = pci_find_bsf(PCI_RID2BUS(devid), PCI_RID2SLOT(devid), + PCI_RID2FUNC(devid)); + + if (!dev) { + return (-1); + } +#define PCIM_ATS_EN BIT(31) + + if (pci_find_extcap(dev, PCIZ_ATS, &off) == 0) { + cap = pci_read_config(dev, off + 4, 4); + qlen = (cap & 0x1F); + qlen = qlen ? qlen : 32; + printf("AMD-Vi: PCI device %d.%d.%d ATS %s qlen=%d\n", + RID2PCI_STR(devid), + (cap & PCIM_ATS_EN) ? "enabled" : "Disabled", + qlen); + qlen = (cap & PCIM_ATS_EN) ? qlen : -1; + } + + return (qlen); +} + +/* + * Check if an endpoint device support device IOTLB or ATS. + */ +static inline bool +amdvi_dev_support_iotlb(struct amdvi_softc *softc, uint16_t devid) +{ + struct ivhd_dev_cfg *cfg; + int qlen, i; + bool pci_ats, ivhd_ats; + + qlen = amdvi_find_ats_qlen(devid); + if (qlen < 0) + return (false); + + KASSERT(softc, ("softc is NULL")); + cfg = softc->dev_cfg; + + ivhd_ats = false; + for (i = 0; i < softc->dev_cfg_cnt; i++) { + if ((cfg->start_id <= devid) && (cfg->end_id >= devid)) { + ivhd_ats = cfg->enable_ats; + break; + } + cfg++; + } + + pci_ats = (qlen < 0) ? false : true; + if (pci_ats != ivhd_ats) + device_printf(softc->dev, + "BIOS bug: mismatch in ATS setting for %d.%d.%d," + "ATS inv qlen = %d\n", RID2PCI_STR(devid), qlen); + + /* Ignore IVRS setting and respect PCI setting. */ + return (pci_ats); +} +#endif + +/* Enable IOTLB support for IOMMU if its supported. */ +static inline void +amdvi_hw_enable_iotlb(struct amdvi_softc *softc) +{ +#ifndef AMDVI_ATS_ENABLE + softc->iotlb = false; +#else + bool supported; + + supported = (softc->ivhd_flag & IVHD_FLAG_IOTLB) ? true : false; + + if (softc->pci_cap & AMDVI_PCI_CAP_IOTLB) { + if (!supported) + device_printf(softc->dev, "IOTLB disabled by BIOS.\n"); + + if (supported && !amdvi_enable_iotlb) { + device_printf(softc->dev, "IOTLB disabled by user.\n"); + supported = false; + } + } else + supported = false; + + softc->iotlb = supported; + +#endif +} + +static int +amdvi_init_cmd(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl = softc->ctrl; + + ctrl->cmd.len = 8; /* Use 256 command buffer entries. 
*/ + softc->cmd_max = 1 << ctrl->cmd.len; + + softc->cmd = malloc(sizeof(struct amdvi_cmd) * + softc->cmd_max, M_AMDVI, M_WAITOK | M_ZERO); + + if ((uintptr_t)softc->cmd & PAGE_MASK) + panic("AMDVi: Command buffer not aligned on page boundary."); + + ctrl->cmd.base = vtophys(softc->cmd) / PAGE_SIZE; + /* + * XXX: Reset the h/w pointers in case IOMMU is restarting, + * h/w doesn't clear these pointers based on empirical data. + */ + ctrl->cmd_tail = 0; + ctrl->cmd_head = 0; + + return (0); +} + +/* + * Note: Update tail pointer after we have written the command since tail + * pointer update cause h/w to execute new commands, see section 3.3 + * of AMD IOMMU spec ver 2.0. + */ +/* Get the command tail pointer w/o updating it. */ +static struct amdvi_cmd * +amdvi_get_cmd_tail(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_cmd *tail; + + KASSERT(softc, ("softc is NULL")); + KASSERT(softc->cmd != NULL, ("cmd is NULL")); + + ctrl = softc->ctrl; + KASSERT(ctrl != NULL, ("ctrl is NULL")); + + tail = (struct amdvi_cmd *)((uint8_t *)softc->cmd + + ctrl->cmd_tail); + + return (tail); +} + +/* + * Update the command tail pointer which will start command execution. + */ +static void +amdvi_update_cmd_tail(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + int size; + + size = sizeof(struct amdvi_cmd); + KASSERT(softc->cmd != NULL, ("cmd is NULL")); + + ctrl = softc->ctrl; + KASSERT(ctrl != NULL, ("ctrl is NULL")); + + ctrl->cmd_tail = MOD_INC(ctrl->cmd_tail, size, softc->cmd_max); + softc->total_cmd++; + +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "cmd_tail: %s Tail:0x%x, Head:0x%x.\n", + ctrl->cmd_tail, + ctrl->cmd_head); +#endif + +} + +/* + * Various commands supported by IOMMU. + */ + +/* Completion wait command. */ +static void +amdvi_cmd_cmp(struct amdvi_softc *softc, const uint64_t data) +{ + struct amdvi_cmd *cmd; + uint64_t pa; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + + pa = vtophys(&softc->cmp_data); + cmd->opcode = AMDVI_CMP_WAIT_OPCODE; + cmd->word0 = (pa & 0xFFFFFFF8) | AMDVI_CMP_WAIT_STORE; + cmd->word1 = (pa >> 32) & 0xFFFFF; + cmd->addr = data; + + amdvi_update_cmd_tail(softc); +} + +/* Invalidate device table entry. */ +static void +amdvi_cmd_inv_dte(struct amdvi_softc *softc, uint16_t devid) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + cmd->opcode = AMDVI_INVD_DTE_OPCODE; + cmd->word0 = devid; + amdvi_update_cmd_tail(softc); +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidated DTE:0x%x\n", devid); +#endif +} + +/* Invalidate IOMMU page, use for invalidation of domain. */ +static void +amdvi_cmd_inv_iommu_pages(struct amdvi_softc *softc, uint16_t domain_id, + uint64_t addr, bool guest_nested, + bool pde, bool page) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + + cmd->opcode = AMDVI_INVD_PAGE_OPCODE; + cmd->word1 = domain_id; + /* + * Invalidate all addresses for this domain. + */ + cmd->addr = addr; + cmd->addr |= pde ? AMDVI_INVD_PAGE_PDE : 0; + cmd->addr |= page ? AMDVI_INVD_PAGE_S : 0; + + amdvi_update_cmd_tail(softc); +} + +#ifdef AMDVI_ATS_ENABLE +/* Invalidate device IOTLB. 
*/ +static void +amdvi_cmd_inv_iotlb(struct amdvi_softc *softc, uint16_t devid) +{ + struct amdvi_cmd *cmd; + int qlen; + + if (!softc->iotlb) + return; + + qlen = amdvi_find_ats_qlen(devid); + if (qlen < 0) { + panic("AMDVI: Invalid ATS qlen(%d) for device %d.%d.%d\n", + qlen, RID2PCI_STR(devid)); + } + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidate IOTLB devID 0x%x" + " Qlen:%d\n", devid, qlen); +#endif + cmd->opcode = AMDVI_INVD_IOTLB_OPCODE; + cmd->word0 = devid; + cmd->word1 = qlen; + cmd->addr = AMDVI_INVD_IOTLB_ALL_ADDR | + AMDVI_INVD_IOTLB_S; + amdvi_update_cmd_tail(softc); +} +#endif + +#ifdef notyet /* For Interrupt Remap. */ +static void +amdvi_cmd_inv_intr_map(struct amdvi_softc *softc, + uint16_t devid) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + cmd->opcode = AMDVI_INVD_INTR_OPCODE; + cmd->word0 = devid; + amdvi_update_cmd_tail(softc); +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidate INTR map of devID 0x%x\n", devid); +#endif +} +#endif + +/* Invalidate domain using INVALIDATE_IOMMU_PAGES command. */ +static void +amdvi_inv_domain(struct amdvi_softc *softc, uint16_t domain_id) +{ + struct amdvi_cmd *cmd __diagused; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + + /* + * See section 3.3.3 of IOMMU spec rev 2.0, software note + * for invalidating domain. + */ + amdvi_cmd_inv_iommu_pages(softc, domain_id, AMDVI_INVD_PAGE_ALL_ADDR, + false, true, true); + +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidate domain:0x%x\n", domain_id); + +#endif +} + +static bool +amdvi_cmp_wait(struct amdvi_softc *softc) +{ +#ifdef AMDVI_DEBUG_CMD + struct amdvi_ctrl *ctrl = softc->ctrl; +#endif + const uint64_t VERIFY = 0xA5A5; + volatile uint64_t *read; + int i; + bool status; + + read = &softc->cmp_data; + *read = 0; + amdvi_cmd_cmp(softc, VERIFY); + /* Wait for h/w to update completion data. */ + for (i = 0; i < 100 && (*read != VERIFY); i++) { + DELAY(1000); /* 1 ms */ + } + status = (VERIFY == softc->cmp_data) ? true : false; + +#ifdef AMDVI_DEBUG_CMD + if (status) + device_printf(softc->dev, "CMD completion DONE Tail:0x%x, " + "Head:0x%x, loop:%d.\n", ctrl->cmd_tail, + ctrl->cmd_head, loop); +#endif + return (status); +} + +static void +amdvi_wait(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + int i; + + KASSERT(softc, ("softc is NULL")); + + ctrl = softc->ctrl; + KASSERT(ctrl != NULL, ("ctrl is NULL")); + /* Don't wait if h/w is not enabled. */ + if ((ctrl->control & AMDVI_CTRL_EN) == 0) + return; + + for (i = 0; i < 10; i++) { + if (amdvi_cmp_wait(softc)) + return; + } + + device_printf(softc->dev, "Error: completion failed" + " tail:0x%x, head:0x%x.\n", + ctrl->cmd_tail, ctrl->cmd_head); + /* Dump the last command. */ + amdvi_dump_cmds(softc, 1); +} + +static void +amdvi_dump_cmds(struct amdvi_softc *softc, int count) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_cmd *cmd; + int off, i; + + ctrl = softc->ctrl; + device_printf(softc->dev, "Dump last %d command(s):\n", count); + /* + * If h/w is stuck in completion, it is the previous command, + * start dumping from previous command onward. 
+ */ + off = MOD_DEC(ctrl->cmd_head, sizeof(struct amdvi_cmd), + softc->cmd_max); + for (i = 0; off != ctrl->cmd_tail && i < count; i++) { + cmd = (struct amdvi_cmd *)((uint8_t *)softc->cmd + off); + printf(" [CMD%d, off:0x%x] opcode= 0x%x 0x%x" + " 0x%x 0x%lx\n", i, off, cmd->opcode, + cmd->word0, cmd->word1, cmd->addr); + off = MOD_INC(off, sizeof(struct amdvi_cmd), softc->cmd_max); + } +} + +static int +amdvi_init_event(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + + ctrl = softc->ctrl; + ctrl->event.len = 8; + softc->event_max = 1 << ctrl->event.len; + softc->event = malloc(sizeof(struct amdvi_event) * + softc->event_max, M_AMDVI, M_WAITOK | M_ZERO); + if ((uintptr_t)softc->event & PAGE_MASK) { + device_printf(softc->dev, "Event buffer not aligned on page."); + return (false); + } + ctrl->event.base = vtophys(softc->event) / PAGE_SIZE; + + /* Reset the pointers. */ + ctrl->evt_head = 0; + ctrl->evt_tail = 0; + + return (0); +} + +static inline void +amdvi_decode_evt_flag(uint16_t flag) +{ + + flag &= AMDVI_EVENT_FLAG_MASK; + printf(" 0x%b]\n", flag, + "\020" + "\001GN" + "\002NX" + "\003US" + "\004I" + "\005PR" + "\006RW" + "\007PE" + "\010RZ" + "\011TR" + ); +} + +/* See section 2.5.4 of AMD IOMMU spec ver 2.62.*/ +static inline void +amdvi_decode_evt_flag_type(uint8_t type) +{ + + switch (AMDVI_EVENT_FLAG_TYPE(type)) { + case 0: + printf("RSVD\n"); + break; + case 1: + printf("Master Abort\n"); + break; + case 2: + printf("Target Abort\n"); + break; + case 3: + printf("Data Err\n"); + break; + default: + break; + } +} + +static void +amdvi_decode_inv_dte_evt(uint16_t devid, uint16_t domid, uint64_t addr, + uint16_t flag) +{ + + printf("\t[IO_PAGE_FAULT EVT: devId:0x%x DomId:0x%x" + " Addr:0x%lx", + devid, domid, addr); + amdvi_decode_evt_flag(flag); +} + +static void +amdvi_decode_pf_evt(uint16_t devid, uint16_t domid, uint64_t addr, + uint16_t flag) +{ + + printf("\t[IO_PAGE_FAULT EVT: devId:0x%x DomId:0x%x" + " Addr:0x%lx", + devid, domid, addr); + amdvi_decode_evt_flag(flag); +} + +static void +amdvi_decode_dte_hwerr_evt(uint16_t devid, uint16_t domid, + uint64_t addr, uint16_t flag) +{ + + printf("\t[DEV_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x" + " Addr:0x%lx", devid, domid, addr); + amdvi_decode_evt_flag(flag); + amdvi_decode_evt_flag_type(flag); +} + +static void +amdvi_decode_page_hwerr_evt(uint16_t devid, uint16_t domid, uint64_t addr, + uint16_t flag) +{ + + printf("\t[PAGE_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x" + " Addr:0x%lx", devid, domid, addr); + amdvi_decode_evt_flag(flag); + amdvi_decode_evt_flag_type(AMDVI_EVENT_FLAG_TYPE(flag)); +} + +static void +amdvi_decode_evt(struct amdvi_event *evt) +{ + struct amdvi_cmd *cmd; + + switch (evt->opcode) { + case AMDVI_EVENT_INVALID_DTE: + amdvi_decode_inv_dte_evt(evt->devid, evt->pasid_domid, + evt->addr, evt->flag); + break; + + case AMDVI_EVENT_PFAULT: + amdvi_decode_pf_evt(evt->devid, evt->pasid_domid, + evt->addr, evt->flag); + break; + + case AMDVI_EVENT_DTE_HW_ERROR: + amdvi_decode_dte_hwerr_evt(evt->devid, evt->pasid_domid, + evt->addr, evt->flag); + break; + + case AMDVI_EVENT_PAGE_HW_ERROR: + amdvi_decode_page_hwerr_evt(evt->devid, evt->pasid_domid, + evt->addr, evt->flag); + break; + + case AMDVI_EVENT_ILLEGAL_CMD: + /* FALL THROUGH */ + case AMDVI_EVENT_CMD_HW_ERROR: + printf("\t[%s EVT]\n", (evt->opcode == AMDVI_EVENT_ILLEGAL_CMD) ? 
+ "ILLEGAL CMD" : "CMD HW ERR"); + cmd = (struct amdvi_cmd *)PHYS_TO_DMAP(evt->addr); + printf("\tCMD opcode= 0x%x 0x%x 0x%x 0x%lx\n", + cmd->opcode, cmd->word0, cmd->word1, cmd->addr); + break; + + case AMDVI_EVENT_IOTLB_TIMEOUT: + printf("\t[IOTLB_INV_TIMEOUT devid:0x%x addr:0x%lx]\n", + evt->devid, evt->addr); + break; + + case AMDVI_EVENT_INVALID_DTE_REQ: + printf("\t[INV_DTE devid:0x%x addr:0x%lx type:0x%x tr:%d]\n", + evt->devid, evt->addr, evt->flag >> 9, + (evt->flag >> 8) & 1); + break; + + case AMDVI_EVENT_INVALID_PPR_REQ: + case AMDVI_EVENT_COUNTER_ZERO: + printf("AMD-Vi: v2 events.\n"); + break; + + default: + printf("Unsupported AMD-Vi event:%d\n", evt->opcode); + } +} + +static void +amdvi_print_events(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_event *event; + int i, size; + + ctrl = softc->ctrl; + size = sizeof(struct amdvi_event); + for (i = 0; i < softc->event_max; i++) { + event = &softc->event[ctrl->evt_head / size]; + if (!event->opcode) + break; + device_printf(softc->dev, "\t[Event%d: Head:0x%x Tail:0x%x]\n", + i, ctrl->evt_head, ctrl->evt_tail); + amdvi_decode_evt(event); + ctrl->evt_head = MOD_INC(ctrl->evt_head, size, + softc->event_max); + } +} + +static int +amdvi_init_dte(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + + ctrl = softc->ctrl; + ctrl->dte.base = vtophys(amdvi_dte) / PAGE_SIZE; + ctrl->dte.size = 0x1FF; /* 2MB device table. */ + + return (0); +} + +/* + * Not all capabilities of IOMMU are available in ACPI IVHD flag + * or EFR entry, read directly from device. + */ +static int +amdvi_print_pci_cap(device_t dev) +{ + struct amdvi_softc *softc; + uint32_t off, cap; + + softc = device_get_softc(dev); + off = softc->cap_off; + + /* + * Section 3.7.1 of IOMMU sepc rev 2.0. + * Read capability from device. + */ + cap = amdvi_pci_read(softc, off); + + /* Make sure capability type[18:16] is 3. */ + KASSERT((((cap >> 16) & 0x7) == 0x3), + ("Not a IOMMU capability 0x%x@0x%x", cap, off)); + + softc->pci_cap = cap >> 24; + device_printf(softc->dev, "PCI cap 0x%x@0x%x feature:%b\n", + cap, off, softc->pci_cap, + "\20\1IOTLB\2HT\3NPCache\4EFR\5CapExt"); + + return (0); +} + +static void +amdvi_event_intr(void *arg) +{ + struct amdvi_softc *softc; + struct amdvi_ctrl *ctrl; + + softc = (struct amdvi_softc *)arg; + ctrl = softc->ctrl; + device_printf(softc->dev, "EVT INTR %ld Status:0x%x" + " EVT Head:0x%x Tail:0x%x]\n", softc->event_intr_cnt++, + ctrl->status, ctrl->evt_head, ctrl->evt_tail); + printf(" [CMD Total 0x%lx] Tail:0x%x, Head:0x%x.\n", + softc->total_cmd, ctrl->cmd_tail, ctrl->cmd_head); + + amdvi_print_events(softc); + ctrl->status &= AMDVI_STATUS_EV_OF | AMDVI_STATUS_EV_INTR; +} + +static void +amdvi_free_evt_intr_res(device_t dev) +{ + + struct amdvi_softc *softc; + device_t mmio_dev; + + softc = device_get_softc(dev); + mmio_dev = softc->pci_dev; + + IVHD_TEARDOWN_INTR(mmio_dev); +} + +static bool +amdvi_alloc_intr_resources(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + device_t dev, mmio_dev; + int err; + + dev = softc->dev; + mmio_dev = softc->pci_dev; + + /* Clear interrupt status bits. 
*/ + ctrl = softc->ctrl; + ctrl->status &= AMDVI_STATUS_EV_OF | AMDVI_STATUS_EV_INTR; + + err = IVHD_SETUP_INTR(mmio_dev, amdvi_event_intr, softc, "fault"); + if (err) + device_printf(dev, "Interrupt setup failed on %s\n", + device_get_nameunit(mmio_dev)); + return (err); +} + +static void +amdvi_print_dev_cap(struct amdvi_softc *softc) +{ + struct ivhd_dev_cfg *cfg; + int i; + + cfg = softc->dev_cfg; + for (i = 0; i < softc->dev_cfg_cnt; i++) { + device_printf(softc->dev, "device [0x%x - 0x%x] " + "config:%b%s\n", cfg->start_id, cfg->end_id, + cfg->data, + "\020\001INIT\002ExtInt\003NMI" + "\007LINT0\010LINT1", + cfg->enable_ats ? "ATS enabled" : ""); + cfg++; + } +} + +static int +amdvi_handle_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct amdvi_softc *softc; + int result, type, error = 0; + + softc = (struct amdvi_softc *)arg1; + type = arg2; + + switch (type) { + case 0: + result = softc->ctrl->cmd_head; + error = sysctl_handle_int(oidp, &result, 0, + req); + break; + case 1: + result = softc->ctrl->cmd_tail; + error = sysctl_handle_int(oidp, &result, 0, + req); + break; + case 2: + result = softc->ctrl->evt_head; + error = sysctl_handle_int(oidp, &result, 0, + req); + break; + case 3: + result = softc->ctrl->evt_tail; + error = sysctl_handle_int(oidp, &result, 0, + req); + break; + + default: + device_printf(softc->dev, "Unknown sysctl:%d\n", type); + } + + return (error); +} + +static void +amdvi_add_sysctl(struct amdvi_softc *softc) +{ + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + device_t dev; + + dev = softc->dev; + ctx = device_get_sysctl_ctx(dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "event_intr_count", CTLFLAG_RD, + &softc->event_intr_cnt, "Event interrupt count"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "command_count", CTLFLAG_RD, + &softc->total_cmd, "Command submitted count"); + SYSCTL_ADD_U16(ctx, child, OID_AUTO, "pci_rid", CTLFLAG_RD, + &softc->pci_rid, 0, "IOMMU RID"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_head", + CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, softc, 0, + amdvi_handle_sysctl, "IU", "Command head"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_tail", + CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, softc, 1, + amdvi_handle_sysctl, "IU", "Command tail"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_head", + CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, softc, 2, + amdvi_handle_sysctl, "IU", "Command head"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_tail", + CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, softc, 3, + amdvi_handle_sysctl, "IU", "Command tail"); +} + +int +amdvi_setup_hw(struct amdvi_softc *softc) +{ + device_t dev; + int status; + + dev = softc->dev; + + amdvi_hw_enable_iotlb(softc); + + amdvi_print_dev_cap(softc); + + if ((status = amdvi_print_pci_cap(dev)) != 0) { + device_printf(dev, "PCI capability.\n"); + return (status); + } + if ((status = amdvi_init_cmd(softc)) != 0) { + device_printf(dev, "Couldn't configure command buffer.\n"); + return (status); + } + if ((status = amdvi_init_event(softc)) != 0) { + device_printf(dev, "Couldn't configure event buffer.\n"); + return (status); + } + if ((status = amdvi_init_dte(softc)) != 0) { + device_printf(dev, "Couldn't configure device table.\n"); + return (status); + } + if ((status = amdvi_alloc_intr_resources(softc)) != 0) { + return (status); + } + amdvi_add_sysctl(softc); + return (0); +} + +int +amdvi_teardown_hw(struct amdvi_softc *softc) +{ + device_t dev; + + dev = softc->dev; + + /* + * Called 
after disable, h/w is stopped by now, free all the resources. + */ + amdvi_free_evt_intr_res(dev); + + if (softc->cmd) + free(softc->cmd, M_AMDVI); + + if (softc->event) + free(softc->event, M_AMDVI); + + return (0); +} + +/*********** bhyve interfaces *********************/ +static int +amdvi_init(void) +{ + if (!ivhd_count) { + return (EIO); + } + if (!amdvi_enable_user && ivhd_count) { + printf("bhyve: Found %d AMD-Vi/IOMMU device(s), " + "use hw.vmm.amdvi.enable=1 to enable pass-through.\n", + ivhd_count); + return (EINVAL); + } + return (0); +} + +static void +amdvi_cleanup(void) +{ + /* Nothing. */ +} + +static uint16_t +amdvi_domainId(void) +{ + + /* + * If we hit maximum domain limit, rollover leaving host + * domain(0). + * XXX: make sure that this domain is not used. + */ + if (amdvi_dom_id == AMDVI_MAX_DOMAIN) + amdvi_dom_id = 1; + + return ((uint16_t)amdvi_dom_id++); +} + +static void +amdvi_do_inv_domain(uint16_t domain_id, bool create) +{ + struct amdvi_softc *softc; + int i; + + for (i = 0; i < ivhd_count; i++) { + softc = device_get_softc(ivhd_devs[i]); + KASSERT(softc, ("softc is NULL")); + /* + * If not present pages are cached, invalidate page after + * creating domain. + */ +#if 0 + if (create && ((softc->pci_cap & AMDVI_PCI_CAP_NPCACHE) == 0)) + continue; +#endif + amdvi_inv_domain(softc, domain_id); + amdvi_wait(softc); + } +} + +static void * +amdvi_create_domain(vm_paddr_t maxaddr) +{ + struct amdvi_domain *dom; + + dom = malloc(sizeof(struct amdvi_domain), M_AMDVI, M_ZERO | M_WAITOK); + dom->id = amdvi_domainId(); + //dom->maxaddr = maxaddr; +#ifdef AMDVI_DEBUG_CMD + printf("Created domain #%d\n", dom->id); +#endif + /* + * Host domain(#0) don't create translation table. + */ + if (dom->id || amdvi_host_ptp) + dom->ptp = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO); + + dom->ptp_level = amdvi_ptp_level; + + amdvi_do_inv_domain(dom->id, true); + SLIST_INSERT_HEAD(&dom_head, dom, next); + + return (dom); +} + +static void +amdvi_free_ptp(uint64_t *ptp, int level) +{ + int i; + + if (level < 1) + return; + + for (i = 0; i < NPTEPG ; i++) { + if ((ptp[i] & AMDVI_PT_PRESENT) == 0) + continue; + /* XXX: Add super-page or PTE mapping > 4KB. */ +#ifdef notyet + /* Super-page mapping. 
*/ + if (AMDVI_PD_SUPER(ptp[i])) + continue; +#endif + + amdvi_free_ptp((uint64_t *)PHYS_TO_DMAP(ptp[i] + & AMDVI_PT_MASK), level - 1); + } + + free(ptp, M_AMDVI); +} + +static void +amdvi_destroy_domain(void *arg) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; + KASSERT(domain, ("domain is NULL")); +#ifdef AMDVI_DEBUG_CMD + printf("Destroying domain %d\n", domain->id); +#endif + if (domain->ptp) + amdvi_free_ptp(domain->ptp, domain->ptp_level); + + amdvi_do_inv_domain(domain->id, false); + SLIST_REMOVE(&dom_head, domain, amdvi_domain, next); + free(domain, M_AMDVI); +} + +static uint64_t +amdvi_set_pt(uint64_t *pt, int level, vm_paddr_t gpa, + vm_paddr_t hpa, uint64_t pg_size, bool create) +{ + uint64_t *page, pa; + int shift, index; + const int PT_SHIFT = 9; + const int PT_INDEX_MASK = (1 << PT_SHIFT) - 1; /* Based on PT_SHIFT */ + + if (!pg_size) + return (0); + + if (hpa & (pg_size - 1)) { + printf("HPA is not size aligned.\n"); + return (0); + } + if (gpa & (pg_size - 1)) { + printf("HPA is not size aligned.\n"); + return (0); + } + shift = PML4SHIFT; + while ((shift > PAGE_SHIFT) && (pg_size < (1UL << shift))) { + index = (gpa >> shift) & PT_INDEX_MASK; + + if ((pt[index] == 0) && create) { + page = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO); + pa = vtophys(page); + pt[index] = pa | AMDVI_PT_PRESENT | AMDVI_PT_RW | + ((level - 1) << AMDVI_PD_LEVEL_SHIFT); + } +#ifdef AMDVI_DEBUG_PTE + if ((gpa % 0x1000000) == 0) + printf("[level%d, shift = %d]PTE:0x%lx\n", + level, shift, pt[index]); +#endif +#define PTE2PA(x) ((uint64_t)(x) & AMDVI_PT_MASK) + pa = PTE2PA(pt[index]); + pt = (uint64_t *)PHYS_TO_DMAP(pa); + shift -= PT_SHIFT; + level--; + } + + /* Leaf entry. */ + index = (gpa >> shift) & PT_INDEX_MASK; + + if (create) { + pt[index] = hpa | AMDVI_PT_RW | AMDVI_PT_PRESENT; + } else + pt[index] = 0; + +#ifdef AMDVI_DEBUG_PTE + if ((gpa % 0x1000000) == 0) + printf("[Last level%d, shift = %d]PTE:0x%lx\n", + level, shift, pt[index]); +#endif + return (1ULL << shift); +} + +static uint64_t +amdvi_update_mapping(struct amdvi_domain *domain, vm_paddr_t gpa, + vm_paddr_t hpa, uint64_t size, bool create) +{ + uint64_t mapped, *ptp, len; + int level; + + KASSERT(domain, ("domain is NULL")); + level = domain->ptp_level; + KASSERT(level, ("Page table level is 0")); + + ptp = domain->ptp; + KASSERT(ptp, ("PTP is NULL")); + mapped = 0; + while (mapped < size) { + len = amdvi_set_pt(ptp, level, gpa + mapped, hpa + mapped, + PAGE_SIZE, create); + if (!len) { + printf("Error: Couldn't map HPA:0x%lx GPA:0x%lx\n", + hpa, gpa); + return (0); + } + mapped += len; + } + + return (mapped); +} + +static int +amdvi_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, + uint64_t len, uint64_t *res_len) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; + + if (domain->id && !domain->ptp) { + printf("ptp is NULL"); + return (EINVAL); + } + + /* + * If host domain is created w/o page table, skip IOMMU page + * table set-up. + */ + if (domain->ptp) + *res_len = amdvi_update_mapping(domain, gpa, hpa, len, true); + else + *res_len = len; + return (0); +} + +static int +amdvi_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len, uint64_t *res_len) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; + /* + * If host domain is created w/o page table, skip IOMMU page + * table set-up. 
+ */ + if (domain->ptp) + *res_len = amdvi_update_mapping(domain, gpa, 0, len, false); + else + *res_len = len; + return (0); +} + +static struct amdvi_softc * +amdvi_find_iommu(uint16_t devid) +{ + struct amdvi_softc *softc; + int i, j; + + for (i = 0; i < ivhd_count; i++) { + softc = device_get_softc(ivhd_devs[i]); + for (j = 0; j < softc->dev_cfg_cnt; j++) + if ((devid >= softc->dev_cfg[j].start_id) && + (devid <= softc->dev_cfg[j].end_id)) + return (softc); + } + + return (NULL); +} + +/* + * Set-up device table entry. + * IOMMU spec Rev 2.0, section 3.2.2.2, some of the fields must + * be set concurrently, e.g. read and write bits. + */ +static void +amdvi_set_dte(struct amdvi_domain *domain, struct amdvi_softc *softc, + uint16_t devid, bool enable) +{ + struct amdvi_dte* temp; + + KASSERT(domain, ("domain is NULL for pci_rid:0x%x\n", devid)); + KASSERT(softc, ("softc is NULL for pci_rid:0x%x\n", devid)); + + temp = &amdvi_dte[devid]; + +#ifdef AMDVI_ATS_ENABLE + /* If IOMMU and device support IOTLB, enable it. */ + if (amdvi_dev_support_iotlb(softc, devid) && softc->iotlb) + temp->iotlb_enable = 1; +#endif + + /* Avoid duplicate I/O faults. */ + temp->sup_second_io_fault = 1; + temp->sup_all_io_fault = amdvi_disable_io_fault; + + temp->dt_valid = 1; + temp->domain_id = domain->id; + + if (enable) { + if (domain->ptp) { + temp->pt_base = vtophys(domain->ptp) >> 12; + temp->pt_level = amdvi_ptp_level; + } + /* + * XXX: Page table valid[TV] bit must be set even if host domain + * page tables are not enabled. + */ + temp->pt_valid = 1; + temp->read_allow = 1; + temp->write_allow = 1; + } +} + +static void +amdvi_inv_device(struct amdvi_softc *softc, uint16_t devid) +{ + KASSERT(softc, ("softc is NULL")); + + amdvi_cmd_inv_dte(softc, devid); +#ifdef AMDVI_ATS_ENABLE + if (amdvi_dev_support_iotlb(softc, devid)) + amdvi_cmd_inv_iotlb(softc, devid); +#endif + amdvi_wait(softc); +} + +static int +amdvi_add_device(void *arg, device_t dev __unused, uint16_t devid) +{ + struct amdvi_domain *domain; + struct amdvi_softc *softc; + + domain = (struct amdvi_domain *)arg; + KASSERT(domain != NULL, ("domain is NULL")); +#ifdef AMDVI_DEBUG_CMD + printf("Assigning device(%d.%d.%d) to domain:%d\n", + RID2PCI_STR(devid), domain->id); +#endif + softc = amdvi_find_iommu(devid); + if (softc == NULL) + return (ENXIO); + amdvi_set_dte(domain, softc, devid, true); + amdvi_inv_device(softc, devid); + return (0); +} + +static int +amdvi_remove_device(void *arg, device_t dev __unused, uint16_t devid) +{ + struct amdvi_domain *domain; + struct amdvi_softc *softc; + + domain = (struct amdvi_domain *)arg; +#ifdef AMDVI_DEBUG_CMD + printf("Remove device(0x%x) from domain:%d\n", + devid, domain->id); +#endif + softc = amdvi_find_iommu(devid); + if (softc == NULL) + return (ENXIO); + amdvi_set_dte(domain, softc, devid, false); + amdvi_inv_device(softc, devid); + return (0); +} + +static void +amdvi_enable(void) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_softc *softc; + uint64_t val; + int i; + + for (i = 0; i < ivhd_count; i++) { + softc = device_get_softc(ivhd_devs[i]); + KASSERT(softc, ("softc is NULL\n")); + ctrl = softc->ctrl; + KASSERT(ctrl, ("ctrl is NULL\n")); + + val = ( AMDVI_CTRL_EN | + AMDVI_CTRL_CMD | + AMDVI_CTRL_ELOG | + AMDVI_CTRL_ELOGINT | + AMDVI_CTRL_INV_TO_1S); + + if (softc->ivhd_flag & IVHD_FLAG_COH) + val |= AMDVI_CTRL_COH; + if (softc->ivhd_flag & IVHD_FLAG_HTT) + val |= AMDVI_CTRL_HTT; + if (softc->ivhd_flag & IVHD_FLAG_RPPW) + val |= AMDVI_CTRL_RPPW; + if (softc->ivhd_flag & IVHD_FLAG_PPW) + 
val |= AMDVI_CTRL_PPW; + if (softc->ivhd_flag & IVHD_FLAG_ISOC) + val |= AMDVI_CTRL_ISOC; + + ctrl->control = val; + } +} + +static void +amdvi_disable(void) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_softc *softc; + int i; + + for (i = 0; i < ivhd_count; i++) { + softc = device_get_softc(ivhd_devs[i]); + KASSERT(softc, ("softc is NULL\n")); + ctrl = softc->ctrl; + KASSERT(ctrl, ("ctrl is NULL\n")); + + ctrl->control = 0; + } +} + +static int +amdvi_invalidate_tlb(void *arg) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; + KASSERT(domain, ("domain is NULL")); + amdvi_do_inv_domain(domain->id, false); + return (0); +} + +const struct iommu_ops iommu_ops_amd = { + .init = amdvi_init, + .cleanup = amdvi_cleanup, + .enable = amdvi_enable, + .disable = amdvi_disable, + .create_domain = amdvi_create_domain, + .destroy_domain = amdvi_destroy_domain, + .create_mapping = amdvi_create_mapping, + .remove_mapping = amdvi_remove_mapping, + .add_device = amdvi_add_device, + .remove_device = amdvi_remove_device, + .invalidate_tlb = amdvi_invalidate_tlb, +}; diff --git a/sys/amd64/vmm/amd/amdvi_priv.h b/sys/amd64/vmm/amd/amdvi_priv.h new file mode 100644 index 000000000000..2a2646b6907e --- /dev/null +++ b/sys/amd64/vmm/amd/amdvi_priv.h @@ -0,0 +1,409 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2016 Anish Gupta (anish@freebsd.org) + * Copyright (c) 2021 The FreeBSD Foundation + * + * Portions of this software were developed by Ka Ho Ng + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _AMDVI_PRIV_H_ +#define _AMDVI_PRIV_H_ + +#include <contrib/dev/acpica/include/acpi.h> + +#define BIT(n) (1ULL << (n)) +/* Return value of bits[n:m] where n and (n >= ) m are bit positions. */ +#define REG_BITS(x, n, m) (((x) >> (m)) & \ + ((1 << (((n) - (m)) + 1)) - 1)) + +/* + * IOMMU PCI capability. + */ +#define AMDVI_PCI_CAP_IOTLB BIT(0) /* IOTLB is supported. */ +#define AMDVI_PCI_CAP_HT BIT(1) /* HyperTransport tunnel support. */ +#define AMDVI_PCI_CAP_NPCACHE BIT(2) /* Not present page cached. */ +#define AMDVI_PCI_CAP_EFR BIT(3) /* Extended features. */ +#define AMDVI_PCI_CAP_EXT BIT(4) /* Miscellaneous information reg. */ + +/* + * IOMMU extended features. 
+ */ +#define AMDVI_EX_FEA_PREFSUP BIT(0) /* Prefetch command support. */ +#define AMDVI_EX_FEA_PPRSUP BIT(1) /* PPR support */ +#define AMDVI_EX_FEA_XTSUP BIT(2) /* Reserved */ +#define AMDVI_EX_FEA_NXSUP BIT(3) /* No-execute. */ +#define AMDVI_EX_FEA_GTSUP BIT(4) /* Guest translation support. */ +#define AMDVI_EX_FEA_EFRW BIT(5) /* Reserved */ +#define AMDVI_EX_FEA_IASUP BIT(6) /* Invalidate all command supp. */ +#define AMDVI_EX_FEA_GASUP BIT(7) /* Guest APIC or AVIC support. */ +#define AMDVI_EX_FEA_HESUP BIT(8) /* Hardware Error. */ +#define AMDVI_EX_FEA_PCSUP BIT(9) /* Performance counters support. */ +/* XXX: add more EFER bits. */ + +/* + * Device table entry or DTE + * NOTE: Must be 256-bits/32 bytes aligned. + */ +struct amdvi_dte { + uint32_t dt_valid:1; /* Device Table valid. */ + uint32_t pt_valid:1; /* Page translation valid. */ + uint16_t :7; /* Reserved[8:2] */ + uint8_t pt_level:3; /* Paging level, 0 to disable. */ + uint64_t pt_base:40; /* Page table root pointer. */ + uint8_t :3; /* Reserved[54:52] */ + uint8_t gv_valid:1; /* Revision 2, GVA to SPA. */ + uint8_t gv_level:2; /* Revision 2, GLX level. */ + uint8_t gv_cr3_lsb:3; /* Revision 2, GCR3[14:12] */ + uint8_t read_allow:1; /* I/O read enabled. */ + uint8_t write_allow:1; /* I/O write enabled. */ + uint8_t :1; /* Reserved[63] */ + uint16_t domain_id:16; /* Domain ID */ + uint16_t gv_cr3_lsb2:16; /* Revision 2, GCR3[30:15] */ + uint8_t iotlb_enable:1; /* Device support IOTLB */ + uint8_t sup_second_io_fault:1; /* Suppress subsequent I/O faults. */ + uint8_t sup_all_io_fault:1; /* Suppress all I/O page faults. */ + uint8_t IOctl:2; /* Port I/O control. */ + uint8_t iotlb_cache_disable:1; /* IOTLB cache hints. */ + uint8_t snoop_disable:1; /* Snoop disable. */ + uint8_t allow_ex:1; /* Allow exclusion. */ + uint8_t sysmgmt:2; /* System management message.*/ + uint8_t :1; /* Reserved[106] */ + uint32_t gv_cr3_msb:21; /* Revision 2, GCR3[51:31] */ + uint8_t intmap_valid:1; /* Interrupt map valid. */ + uint8_t intmap_len:4; /* Interrupt map table length. */ + uint8_t intmap_ign:1; /* Ignore unmapped interrupts. */ + uint64_t intmap_base:46; /* IntMap base. */ + uint8_t :4; /* Reserved[183:180] */ + uint8_t init_pass:1; /* INIT pass through or PT */ + uint8_t extintr_pass:1; /* External Interrupt PT */ + uint8_t nmi_pass:1; /* NMI PT */ + uint8_t :1; /* Reserved[187] */ + uint8_t intr_ctrl:2; /* Interrupt control */ + uint8_t lint0_pass:1; /* LINT0 PT */ + uint8_t lint1_pass:1; /* LINT1 PT */ + uint64_t :64; /* Reserved[255:192] */ +} __attribute__((__packed__)); +CTASSERT(sizeof(struct amdvi_dte) == 32); + +/* + * IOMMU command entry. + */ +struct amdvi_cmd { + uint32_t word0; + uint32_t word1:28; + uint8_t opcode:4; + uint64_t addr; +} __attribute__((__packed__)); + +/* Command opcodes. */ +#define AMDVI_CMP_WAIT_OPCODE 0x1 /* Completion wait. */ +#define AMDVI_INVD_DTE_OPCODE 0x2 /* Invalidate device table entry. */ +#define AMDVI_INVD_PAGE_OPCODE 0x3 /* Invalidate pages. */ +#define AMDVI_INVD_IOTLB_OPCODE 0x4 /* Invalidate IOTLB pages. */ +#define AMDVI_INVD_INTR_OPCODE 0x5 /* Invalidate Interrupt table. */ +#define AMDVI_PREFETCH_PAGES_OPCODE 0x6 /* Prefetch IOMMU pages. */ +#define AMDVI_COMP_PPR_OPCODE 0x7 /* Complete PPR request. */ +#define AMDVI_INV_ALL_OPCODE 0x8 /* Invalidate all. */ + +/* Completion wait attributes. */ +#define AMDVI_CMP_WAIT_STORE BIT(0) /* Write back data. */ +#define AMDVI_CMP_WAIT_INTR BIT(1) /* Completion wait interrupt. */ +#define AMDVI_CMP_WAIT_FLUSH BIT(2) /* Flush queue. 
*/ + +/* Invalidate page. */ +#define AMDVI_INVD_PAGE_S BIT(0) /* Invalidation size. */ +#define AMDVI_INVD_PAGE_PDE BIT(1) /* Invalidate PDE. */ +#define AMDVI_INVD_PAGE_GN_GVA BIT(2) /* GPA or GVA. */ + +#define AMDVI_INVD_PAGE_ALL_ADDR (0x7FFFFFFFFFFFFULL << 12) + +/* Invalidate IOTLB. */ +#define AMDVI_INVD_IOTLB_S BIT(0) /* Invalidation size 4k or addr */ +#define AMDVI_INVD_IOTLB_GN_GVA BIT(2) /* GPA or GVA. */ + +#define AMDVI_INVD_IOTLB_ALL_ADDR (0x7FFFFFFFFFFFFULL << 12) +/* XXX: add more command entries. */ + +/* + * IOMMU event entry. + */ +struct amdvi_event { + uint16_t devid; + uint16_t pasid_hi; + uint16_t pasid_domid; /* PASID low or DomainID */ + uint16_t flag:12; + uint8_t opcode:4; + uint64_t addr; +} __attribute__((__packed__)); +CTASSERT(sizeof(struct amdvi_event) == 16); + +/* Various event types. */ +#define AMDVI_EVENT_INVALID_DTE 0x1 +#define AMDVI_EVENT_PFAULT 0x2 +#define AMDVI_EVENT_DTE_HW_ERROR 0x3 +#define AMDVI_EVENT_PAGE_HW_ERROR 0x4 +#define AMDVI_EVENT_ILLEGAL_CMD 0x5 +#define AMDVI_EVENT_CMD_HW_ERROR 0x6 +#define AMDVI_EVENT_IOTLB_TIMEOUT 0x7 +#define AMDVI_EVENT_INVALID_DTE_REQ 0x8 +#define AMDVI_EVENT_INVALID_PPR_REQ 0x9 +#define AMDVI_EVENT_COUNTER_ZERO 0xA + +#define AMDVI_EVENT_FLAG_MASK 0x1FF /* Mask for event flags. */ +#define AMDVI_EVENT_FLAG_TYPE(x) (((x) >> 9) & 0x3) + +/* + * IOMMU control block. + */ +struct amdvi_ctrl { + struct { + uint16_t size:9; + uint16_t :3; + uint64_t base:40; /* Devtable register base. */ + uint16_t :12; + } dte; + struct { + uint16_t :12; + uint64_t base:40; + uint8_t :4; + uint8_t len:4; + uint8_t :4; + } cmd; + struct { + uint16_t :12; + uint64_t base:40; + uint8_t :4; + uint8_t len:4; + uint8_t :4; + } event; + uint16_t control :13; + uint64_t :51; + struct { + uint8_t enable:1; + uint8_t allow:1; + uint16_t :10; + uint64_t base:40; + uint16_t :12; + uint16_t :12; + uint64_t limit:40; + uint16_t :12; + } excl; + /* + * Revision 2 only. + */ + uint64_t ex_feature; + struct { + uint16_t :12; + uint64_t base:40; + uint8_t :4; + uint8_t len:4; + uint8_t :4; + } ppr; + uint64_t first_event; + uint64_t second_event; + uint64_t event_status; + /* Revision 2 only, end. */ + uint8_t pad1[0x1FA8]; /* Padding. */ + uint32_t cmd_head:19; + uint64_t :45; + uint32_t cmd_tail:19; + uint64_t :45; + uint32_t evt_head:19; + uint64_t :45; + uint32_t evt_tail:19; + uint64_t :45; + uint32_t status:19; + uint64_t :45; + uint64_t pad2; + uint8_t :4; + uint16_t ppr_head:15; + uint64_t :45; + uint8_t :4; + uint16_t ppr_tail:15; + uint64_t :45; + uint8_t pad3[0x1FC0]; /* Padding. */ + + /* XXX: More for rev2. */ +} __attribute__((__packed__)); +CTASSERT(offsetof(struct amdvi_ctrl, pad1)== 0x58); +CTASSERT(offsetof(struct amdvi_ctrl, pad2)== 0x2028); +CTASSERT(offsetof(struct amdvi_ctrl, pad3)== 0x2040); + +#define AMDVI_MMIO_V1_SIZE (4 * PAGE_SIZE) /* v1 size */ +/* + * AMF IOMMU v2 size including event counters + */ +#define AMDVI_MMIO_V2_SIZE (8 * PAGE_SIZE) + +CTASSERT(sizeof(struct amdvi_ctrl) == 0x4000); +CTASSERT(sizeof(struct amdvi_ctrl) == AMDVI_MMIO_V1_SIZE); + +/* IVHD flag */ +#define IVHD_FLAG_HTT BIT(0) /* Hypertransport Tunnel. */ +#define IVHD_FLAG_PPW BIT(1) /* Pass posted write. */ +#define IVHD_FLAG_RPPW BIT(2) /* Response pass posted write. */ +#define IVHD_FLAG_ISOC BIT(3) /* Isoc support. */ +#define IVHD_FLAG_IOTLB BIT(4) /* IOTLB support. */ +#define IVHD_FLAG_COH BIT(5) /* Coherent control, default 1 */ +#define IVHD_FLAG_PFS BIT(6) /* Prefetch IOMMU pages. 
*/ +#define IVHD_FLAG_PPRS BIT(7) /* Peripheral page support. */ + +/* IVHD device entry data setting. */ +#define IVHD_DEV_LINT0_PASS BIT(6) /* LINT0 interrupts. */ +#define IVHD_DEV_LINT1_PASS BIT(7) /* LINT1 interrupts. */ + +/* Bit[5:4] for System Mgmt. Bit3 is reserved. */ +#define IVHD_DEV_INIT_PASS BIT(0) /* INIT */ +#define IVHD_DEV_EXTINTR_PASS BIT(1) /* ExtInt */ +#define IVHD_DEV_NMI_PASS BIT(2) /* NMI */ + +/* IVHD 8-byte extended data settings. */ +#define IVHD_DEV_EXT_ATS_DISABLE BIT(31) /* Disable ATS */ + +/* IOMMU control register. */ +#define AMDVI_CTRL_EN BIT(0) /* IOMMU enable. */ +#define AMDVI_CTRL_HTT BIT(1) /* Hypertransport tunnel enable. */ +#define AMDVI_CTRL_ELOG BIT(2) /* Event log enable. */ +#define AMDVI_CTRL_ELOGINT BIT(3) /* Event log interrupt. */ +#define AMDVI_CTRL_COMINT BIT(4) /* Completion wait interrupt. */ +#define AMDVI_CTRL_PPW BIT(8) +#define AMDVI_CTRL_RPPW BIT(9) +#define AMDVI_CTRL_COH BIT(10) +#define AMDVI_CTRL_ISOC BIT(11) +#define AMDVI_CTRL_CMD BIT(12) /* Command buffer enable. */ +#define AMDVI_CTRL_PPRLOG BIT(13) +#define AMDVI_CTRL_PPRINT BIT(14) +#define AMDVI_CTRL_PPREN BIT(15) +#define AMDVI_CTRL_GTE BIT(16) /* Guest translation enable. */ +#define AMDVI_CTRL_GAE BIT(17) /* Guest APIC enable. */ + +/* Invalidation timeout. */ +#define AMDVI_CTRL_INV_NO_TO 0 /* No timeout. */ +#define AMDVI_CTRL_INV_TO_1ms 1 /* 1 ms */ +#define AMDVI_CTRL_INV_TO_10ms 2 /* 10 ms */ +#define AMDVI_CTRL_INV_TO_100ms 3 /* 100 ms */ +#define AMDVI_CTRL_INV_TO_1S 4 /* 1 second */ +#define AMDVI_CTRL_INV_TO_10S 5 /* 10 second */ +#define AMDVI_CTRL_INV_TO_100S 6 /* 100 second */ + +/* + * Max number of PCI devices. + * 256 bus x 32 slot/devices x 8 functions. + */ +#define PCI_NUM_DEV_MAX 0x10000 + +/* Maximum number of domains supported by IOMMU. */ +#define AMDVI_MAX_DOMAIN (BIT(16) - 1) + +/* + * IOMMU Page Table attributes. + */ +#define AMDVI_PT_PRESENT BIT(0) +#define AMDVI_PT_COHERENT BIT(60) +#define AMDVI_PT_READ BIT(61) +#define AMDVI_PT_WRITE BIT(62) + +#define AMDVI_PT_RW (AMDVI_PT_READ | AMDVI_PT_WRITE) +#define AMDVI_PT_MASK 0xFFFFFFFFFF000UL /* Only [51:12] for PA */ + +#define AMDVI_PD_LEVEL_SHIFT 9 +#define AMDVI_PD_SUPER(x) (((x) >> AMDVI_PD_LEVEL_SHIFT) == 7) +/* + * IOMMU Status, offset 0x2020 + */ +#define AMDVI_STATUS_EV_OF BIT(0) /* Event overflow. */ +#define AMDVI_STATUS_EV_INTR BIT(1) /* Event interrupt. */ +/* Completion wait command completed. */ +#define AMDVI_STATUS_CMP BIT(2) + +#define IVRS_CTRL_RID 1 /* MMIO RID */ + +/* ACPI IVHD */ +struct ivhd_dev_cfg { + uint32_t start_id; + uint32_t end_id; + uint8_t data; /* Device configuration. */ + bool enable_ats; /* ATS enabled for the device. */ + int ats_qlen; /* ATS invalidation queue depth. */ +}; + +struct amdvi_domain { + uint64_t *ptp; /* Highest level page table */ + int ptp_level; /* Level of page tables */ + u_int id; /* Domain id */ + SLIST_ENTRY (amdvi_domain) next; +}; + +/* + * Different type of IVHD. + * XXX: Use AcpiIvrsType once new IVHD types are available. +*/ +enum IvrsType +{ + IVRS_TYPE_HARDWARE_LEGACY = ACPI_IVRS_TYPE_HARDWARE1, + /* Legacy without EFRi support. */ + IVRS_TYPE_HARDWARE_EFR = ACPI_IVRS_TYPE_HARDWARE2, + /* With EFR support. */ + IVRS_TYPE_HARDWARE_MIXED = 0x40, /* Mixed with EFR support. */ +}; + +/* + * AMD IOMMU softc. + */ +struct amdvi_softc { + struct amdvi_ctrl *ctrl; /* Control area. */ + device_t dev; /* IOMMU device. */ + device_t pci_dev; /* IOMMU PCI function device. */ + enum IvrsType ivhd_type; /* IOMMU IVHD type. 
*/ + bool iotlb; /* IOTLB supported by IOMMU */ + struct amdvi_cmd *cmd; /* Command descriptor area. */ + int cmd_max; /* Max number of commands. */ + uint64_t cmp_data; /* Command completion write back. */ + struct amdvi_event *event; /* Event descriptor area. */ + int event_max; /* Max number of events. */ + /* ACPI various flags. */ + uint32_t ivhd_flag; /* ACPI IVHD flag. */ + uint32_t ivhd_feature; /* ACPI v1 Reserved or v2 attribute. */ + uint64_t ext_feature; /* IVHD EFR */ + /* PCI related. */ + uint16_t cap_off; /* PCI Capability offset. */ + uint8_t pci_cap; /* PCI capability. */ + uint16_t pci_seg; /* IOMMU PCI domain/segment. */ + uint16_t pci_rid; /* PCI BDF of IOMMU */ + + /* ACPI device configuration for end points. */ + struct ivhd_dev_cfg *dev_cfg; + int dev_cfg_cnt; + int dev_cfg_cap; + + /* Software statistics. */ + uint64_t event_intr_cnt; /* Total event INTR count. */ + uint64_t total_cmd; /* Total number of commands. */ +}; + +int amdvi_setup_hw(struct amdvi_softc *softc); +int amdvi_teardown_hw(struct amdvi_softc *softc); +#endif /* _AMDVI_PRIV_H_ */ diff --git a/sys/amd64/vmm/amd/amdviiommu.c b/sys/amd64/vmm/amd/amdviiommu.c new file mode 100644 index 000000000000..5f5822a667b5 --- /dev/null +++ b/sys/amd64/vmm/amd/amdviiommu.c @@ -0,0 +1,180 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 The FreeBSD Foundation + * + * Portions of this software were developed by Ka Ho Ng + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/rman.h> + +#include <dev/pci/pcireg.h> +#include <dev/pci/pcivar.h> + +#include "amdvi_priv.h" +#include "ivhd_if.h" + +struct amdviiommu_softc { + struct resource *event_res; /* Event interrupt resource. */ + void *event_tag; /* Event interrupt tag. 
*/ + int event_rid; +}; + +static int amdviiommu_probe(device_t); +static int amdviiommu_attach(device_t); +static int amdviiommu_detach(device_t); +static int ivhd_setup_intr(device_t, driver_intr_t, void *, + const char *); +static int ivhd_teardown_intr(device_t); + +static device_method_t amdviiommu_methods[] = { + /* device interface */ + DEVMETHOD(device_probe, amdviiommu_probe), + DEVMETHOD(device_attach, amdviiommu_attach), + DEVMETHOD(device_detach, amdviiommu_detach), + DEVMETHOD(ivhd_setup_intr, ivhd_setup_intr), + DEVMETHOD(ivhd_teardown_intr, ivhd_teardown_intr), + DEVMETHOD_END +}; +static driver_t amdviiommu_driver = { + "amdviiommu", + amdviiommu_methods, + sizeof(struct amdviiommu_softc), +}; + +static int +amdviiommu_probe(device_t dev) +{ + int error; + int capoff; + + /* + * Check base class and sub-class + */ + if (pci_get_class(dev) != PCIC_BASEPERIPH || + pci_get_subclass(dev) != PCIS_BASEPERIPH_IOMMU) + return (ENXIO); + + /* + * A IOMMU capability block carries a 0Fh capid. + */ + error = pci_find_cap(dev, PCIY_SECDEV, &capoff); + if (error) + return (ENXIO); + + /* + * bit [18:16] == 011b indicates the capability block is IOMMU + * capability block. If the field is not set to 011b, bail out. + */ + if ((pci_read_config(dev, capoff + 2, 2) & 0x7) != 0x3) + return (ENXIO); + + return (BUS_PROBE_SPECIFIC); +} + +static int +amdviiommu_attach(device_t dev) +{ + + device_set_desc(dev, "AMD-Vi/IOMMU PCI function"); + return (0); +} + +static int +amdviiommu_detach(device_t dev) +{ + + return (0); +} + +static int +ivhd_setup_intr(device_t dev, driver_intr_t handler, void *arg, + const char *desc) +{ + struct amdviiommu_softc *sc; + int error, msicnt; + + sc = device_get_softc(dev); + msicnt = 1; + if (sc->event_res != NULL) + panic("%s is called without intr teardown", __func__); + sc->event_rid = 1; + + error = pci_alloc_msi(dev, &msicnt); + if (error) { + device_printf(dev, "Couldn't find event MSI IRQ resource.\n"); + return (ENOENT); + } + + sc->event_res = bus_alloc_resource_any(dev, SYS_RES_IRQ, + &sc->event_rid, RF_ACTIVE); + if (sc->event_res == NULL) { + device_printf(dev, "Unable to allocate event INTR resource.\n"); + error = ENOMEM; + goto fail; + } + + error = bus_setup_intr(dev, sc->event_res, INTR_TYPE_MISC | INTR_MPSAFE, + NULL, handler, arg, &sc->event_tag); + if (error) { + device_printf(dev, "Fail to setup event intr\n"); + goto fail; + } + + bus_describe_intr(dev, sc->event_res, sc->event_tag, "%s", desc); + return (0); + +fail: + ivhd_teardown_intr(dev); + return (error); +} + +static int +ivhd_teardown_intr(device_t dev) +{ + struct amdviiommu_softc *sc; + + sc = device_get_softc(dev); + + if (sc->event_tag != NULL) { + bus_teardown_intr(dev, sc->event_res, sc->event_tag); + sc->event_tag = NULL; + } + if (sc->event_res != NULL) { + bus_release_resource(dev, SYS_RES_IRQ, sc->event_rid, + sc->event_res); + sc->event_res = NULL; + } + pci_release_msi(dev); + return (0); +} + +/* This driver has to be loaded before ivhd */ +DRIVER_MODULE(amdviiommu, pci, amdviiommu_driver, 0, 0); +MODULE_DEPEND(amdviiommu, pci, 1, 1, 1); diff --git a/sys/amd64/vmm/amd/ivhd_if.m b/sys/amd64/vmm/amd/ivhd_if.m new file mode 100644 index 000000000000..3b37de9f4ba0 --- /dev/null +++ b/sys/amd64/vmm/amd/ivhd_if.m @@ -0,0 +1,45 @@ +#- +# Copyright (c) 2021 The FreeBSD Foundation +# +# Portions of this software were developed by Ka Ho Ng +# under sponsorship from the FreeBSD Foundation. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/bus.h> + +INTERFACE ivhd; + +METHOD int setup_intr { + device_t dev; + driver_intr_t handler; + void *arg; + const char *desc; +}; + +METHOD int teardown_intr { + device_t dev; +}; diff --git a/sys/amd64/vmm/amd/ivrs_drv.c b/sys/amd64/vmm/amd/ivrs_drv.c new file mode 100644 index 000000000000..c75e0fcc2d68 --- /dev/null +++ b/sys/amd64/vmm/amd/ivrs_drv.c @@ -0,0 +1,761 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2016, Anish Gupta (anish@freebsd.org) + * Copyright (c) 2021 The FreeBSD Foundation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +#include "opt_acpi.h" +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/malloc.h> + +#include <machine/vmparam.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <contrib/dev/acpica/include/acpi.h> +#include <contrib/dev/acpica/include/accommon.h> +#include <dev/acpica/acpivar.h> +#include <dev/pci/pcireg.h> +#include <dev/pci/pcivar.h> + +#include "io/iommu.h" +#include "amdvi_priv.h" + +device_t *ivhd_devs; /* IVHD or AMD-Vi device list. */ +int ivhd_count; /* Number of IVHD header. */ +/* + * Cached IVHD header list. + * Single entry for each IVHD, filtered the legacy one. + */ +ACPI_IVRS_HARDWARE1 **ivhd_hdrs; + +extern int amdvi_ptp_level; /* Page table levels. */ + +typedef int (*ivhd_iter_t)(ACPI_IVRS_HEADER *ptr, void *arg); +/* + * Iterate IVRS table for IVHD and IVMD device type. + */ +static void +ivrs_hdr_iterate_tbl(ivhd_iter_t iter, void *arg) +{ + ACPI_TABLE_IVRS *ivrs; + ACPI_IVRS_HEADER *ivrs_hdr, *end; + ACPI_STATUS status; + + status = AcpiGetTable(ACPI_SIG_IVRS, 1, (ACPI_TABLE_HEADER **)&ivrs); + if (ACPI_FAILURE(status)) + return; + + if (ivrs->Header.Length == 0) { + return; + } + + ivrs_hdr = (ACPI_IVRS_HEADER *)(ivrs + 1); + end = (ACPI_IVRS_HEADER *)((char *)ivrs + ivrs->Header.Length); + + while (ivrs_hdr < end) { + if ((uint8_t *)ivrs_hdr + ivrs_hdr->Length > (uint8_t *)end) { + printf("AMD-Vi:IVHD/IVMD is corrupted, length : %d\n", + ivrs_hdr->Length); + break; + } + + switch (ivrs_hdr->Type) { + case IVRS_TYPE_HARDWARE_LEGACY: /* Legacy */ + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + if (!iter(ivrs_hdr, arg)) + return; + break; + + case ACPI_IVRS_TYPE_MEMORY1: + case ACPI_IVRS_TYPE_MEMORY2: + case ACPI_IVRS_TYPE_MEMORY3: + if (!iter(ivrs_hdr, arg)) + return; + + break; + + default: + printf("AMD-Vi:Not IVHD/IVMD type(%d)", ivrs_hdr->Type); + } + + ivrs_hdr = (ACPI_IVRS_HEADER *)((uint8_t *)ivrs_hdr + + ivrs_hdr->Length); + } +} + +static bool +ivrs_is_ivhd(UINT8 type) +{ + + switch(type) { + case IVRS_TYPE_HARDWARE_LEGACY: + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + return (true); + + default: + return (false); + } +} + +/* Count the number of AMD-Vi devices in the system. 
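[Editor's note] ivrs_hdr_iterate_tbl() above walks the variable-length IVHD/IVMD sub-headers that follow the fixed IVRS header and bails out as soon as a sub-header's Length field would run past the end of the table. A minimal user-space sketch of the same bounded walk over a byte buffer; the sub-header layout (type, flags, 16-bit length) and the sample bytes are invented for the demo, and a little-endian host is assumed:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct sub_header {
	uint8_t		type;
	uint8_t		flags;
	uint16_t	length;		/* length of the whole sub-header */
};

static void
walk_table(const uint8_t *tbl, size_t tbl_len)
{
	const uint8_t *p = tbl, *end = tbl + tbl_len;
	struct sub_header hdr;

	while (p + sizeof(hdr) <= end) {
		memcpy(&hdr, p, sizeof(hdr));
		if (hdr.length < sizeof(hdr) || p + hdr.length > end) {
			printf("corrupted sub-header, length %u\n", hdr.length);
			break;
		}
		printf("sub-header type 0x%02x, length %u\n", hdr.type, hdr.length);
		p += hdr.length;
	}
}

int
main(void)
{
	/* Two fabricated sub-headers, 8 and 6 bytes long. */
	uint8_t table[] = {
		0x10, 0x00, 0x08, 0x00, 0xaa, 0xbb, 0xcc, 0xdd,
		0x21, 0x00, 0x06, 0x00, 0x01, 0x02,
	};

	walk_table(table, sizeof(table));
	return (0);
}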
*/ +static int +ivhd_count_iter(ACPI_IVRS_HEADER * ivrs_he, void *arg) +{ + int *count; + + count = (int *)arg; + if (ivrs_is_ivhd(ivrs_he->Type)) + (*count)++; + + return (1); +} + +struct find_ivrs_hdr_args { + int i; + ACPI_IVRS_HEADER *ptr; +}; + +static int +ivrs_hdr_find_iter(ACPI_IVRS_HEADER * ivrs_hdr, void *args) +{ + struct find_ivrs_hdr_args *fi; + + fi = (struct find_ivrs_hdr_args *)args; + if (ivrs_is_ivhd(ivrs_hdr->Type)) { + if (fi->i == 0) { + fi->ptr = ivrs_hdr; + return (0); + } + fi->i--; + } + + return (1); +} + +static ACPI_IVRS_HARDWARE1 * +ivhd_find_by_index(int idx) +{ + struct find_ivrs_hdr_args fi; + + fi.i = idx; + fi.ptr = NULL; + + ivrs_hdr_iterate_tbl(ivrs_hdr_find_iter, &fi); + + return ((ACPI_IVRS_HARDWARE1 *)fi.ptr); +} + +static void +ivhd_dev_add_entry(struct amdvi_softc *softc, uint32_t start_id, + uint32_t end_id, uint8_t cfg, bool ats) +{ + struct ivhd_dev_cfg *dev_cfg; + + KASSERT(softc->dev_cfg_cap >= softc->dev_cfg_cnt, + ("Impossible case: number of dev_cfg exceeding capacity")); + if (softc->dev_cfg_cap == softc->dev_cfg_cnt) { + if (softc->dev_cfg_cap == 0) + softc->dev_cfg_cap = 1; + else + softc->dev_cfg_cap <<= 2; + softc->dev_cfg = realloc(softc->dev_cfg, + sizeof(*softc->dev_cfg) * softc->dev_cfg_cap, M_DEVBUF, + M_WAITOK); + } + + dev_cfg = &softc->dev_cfg[softc->dev_cfg_cnt++]; + dev_cfg->start_id = start_id; + dev_cfg->end_id = end_id; + dev_cfg->data = cfg; + dev_cfg->enable_ats = ats; +} + +/* + * Record device attributes as suggested by BIOS. + */ +static int +ivhd_dev_parse(ACPI_IVRS_HARDWARE1 *ivhd, struct amdvi_softc *softc) +{ + ACPI_IVRS_DE_HEADER *de; + uint8_t *p, *end; + int range_start_id = -1, range_end_id = -1, i; + uint32_t *extended; + uint8_t all_data = 0, range_data = 0; + bool range_enable_ats = false, enable_ats; + + switch (ivhd->Header.Type) { + case IVRS_TYPE_HARDWARE_LEGACY: + p = (uint8_t *)ivhd + sizeof(ACPI_IVRS_HARDWARE1); + break; + + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + p = (uint8_t *)ivhd + sizeof(ACPI_IVRS_HARDWARE2); + break; + + default: + device_printf(softc->dev, + "unknown type: 0x%x\n", ivhd->Header.Type); + return (-1); + } + + end = (uint8_t *)ivhd + ivhd->Header.Length; + + while (p < end) { + de = (ACPI_IVRS_DE_HEADER *)p; + switch (de->Type) { + case ACPI_IVRS_TYPE_ALL: + all_data = de->DataSetting; + for (i = 0; i < softc->dev_cfg_cnt; i++) + softc->dev_cfg[i].data |= all_data; + break; + + case ACPI_IVRS_TYPE_SELECT: + case ACPI_IVRS_TYPE_ALIAS_SELECT: + case ACPI_IVRS_TYPE_EXT_SELECT: + enable_ats = false; + if (de->Type == ACPI_IVRS_TYPE_EXT_SELECT) { + extended = (uint32_t *)(de + 1); + enable_ats = + (*extended & IVHD_DEV_EXT_ATS_DISABLE) ? + false : true; + } + ivhd_dev_add_entry(softc, de->Id, de->Id, + de->DataSetting | all_data, enable_ats); + break; + + case ACPI_IVRS_TYPE_START: + case ACPI_IVRS_TYPE_ALIAS_START: + case ACPI_IVRS_TYPE_EXT_START: + if (range_start_id != -1) { + device_printf(softc->dev, + "Unexpected start-of-range device entry\n"); + return (EINVAL); + } + range_start_id = de->Id; + range_data = de->DataSetting; + if (de->Type == ACPI_IVRS_TYPE_EXT_START) { + extended = (uint32_t *)(de + 1); + range_enable_ats = + (*extended & IVHD_DEV_EXT_ATS_DISABLE) ? 
+ false : true; + } + break; + + case ACPI_IVRS_TYPE_END: + if (range_start_id == -1) { + device_printf(softc->dev, + "Unexpected end-of-range device entry\n"); + return (EINVAL); + } + range_end_id = de->Id; + if (range_end_id < range_start_id) { + device_printf(softc->dev, + "Device entry range going backward\n"); + return (EINVAL); + } + ivhd_dev_add_entry(softc, range_start_id, range_end_id, + range_data | all_data, range_enable_ats); + range_start_id = range_end_id = -1; + range_data = 0; + all_data = 0; + break; + + case ACPI_IVRS_TYPE_PAD4: + break; + + case ACPI_IVRS_TYPE_SPECIAL: + /* HPET or IOAPIC */ + break; + default: + if ((de->Type < 5) || + (de->Type >= ACPI_IVRS_TYPE_PAD8)) + device_printf(softc->dev, + "Unknown dev entry:0x%x\n", de->Type); + } + + if (de->Type < 0x40) + p += sizeof(ACPI_IVRS_DEVICE4); + else if (de->Type < 0x80) + p += sizeof(ACPI_IVRS_DEVICE8A); + else { + printf("Variable size IVHD type 0x%x not supported\n", + de->Type); + break; + } + } + + return (0); +} + +static bool +ivhd_is_newer(ACPI_IVRS_HEADER *old, ACPI_IVRS_HEADER *new) +{ + if (old->DeviceId == new->DeviceId) { + /* + * Newer IVRS header type take precedence. + */ + if (old->Type == IVRS_TYPE_HARDWARE_LEGACY && + ((new->Type == IVRS_TYPE_HARDWARE_EFR) || + (new->Type == IVRS_TYPE_HARDWARE_MIXED))) + return (true); + + /* + * Mixed format IVHD header type take precedence + * over fixed format IVHD header types. + */ + if (old->Type == IVRS_TYPE_HARDWARE_EFR && + new->Type == IVRS_TYPE_HARDWARE_MIXED) + return (true); + } + + return (false); +} + +static void +ivhd_identify(driver_t *driver, device_t parent) +{ + ACPI_TABLE_IVRS *ivrs; + ACPI_IVRS_HARDWARE1 *ivhd; + ACPI_STATUS status; + int i, j, count = 0; + uint32_t ivrs_ivinfo; + + if (acpi_disabled("ivhd")) + return; + + status = AcpiGetTable(ACPI_SIG_IVRS, 1, (ACPI_TABLE_HEADER **)&ivrs); + if (ACPI_FAILURE(status)) + return; + + if (ivrs->Header.Length == 0) { + return; + } + + ivrs_ivinfo = ivrs->Info; + printf("AMD-Vi: IVRS Info VAsize = %d PAsize = %d GVAsize = %d" + " flags:%b\n", + REG_BITS(ivrs_ivinfo, 21, 15), REG_BITS(ivrs_ivinfo, 14, 8), + REG_BITS(ivrs_ivinfo, 7, 5), REG_BITS(ivrs_ivinfo, 22, 22), + "\020\001EFRSup"); + + ivrs_hdr_iterate_tbl(ivhd_count_iter, &count); + if (!count) + return; + + ivhd_hdrs = malloc(sizeof(void *) * count, M_DEVBUF, + M_WAITOK | M_ZERO); + for (i = 0; i < count; i++) { + ivhd = ivhd_find_by_index(i); + KASSERT(ivhd, ("ivhd%d is NULL\n", i)); + + /* + * Scan for presence of legacy and non-legacy device type + * for same IOMMU device and override the old one. + * + * If there is no existing IVHD to the same IOMMU device, + * the IVHD header pointer is appended. + */ + for (j = 0; j < ivhd_count; j++) { + if (ivhd_is_newer(&ivhd_hdrs[j]->Header, &ivhd->Header)) + break; + } + ivhd_hdrs[j] = ivhd; + if (j == ivhd_count) + ivhd_count++; + } + + ivhd_devs = malloc(sizeof(device_t) * ivhd_count, M_DEVBUF, + M_WAITOK | M_ZERO); + for (i = 0, j = 0; i < ivhd_count; i++) { + ivhd = ivhd_hdrs[i]; + KASSERT(ivhd, ("ivhd%d is NULL\n", i)); + + /* + * Use a high order to ensure that this driver is probed after + * the Host-PCI bridge and the root PCI bus. + */ + ivhd_devs[i] = BUS_ADD_CHILD(parent, + ACPI_DEV_BASE_ORDER + 10 * 10, "ivhd", i); + + /* + * XXX: In case device was not destroyed before, add will fail. + * locate the old device instance. 
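[Editor's note] ivhd_dev_parse() above strides through the IVHD device entries using only the entry Type byte: types below 0x40 are 4-byte entries, types below 0x80 are 8-byte entries, and anything larger is a variable-length entry the driver does not handle. A toy walker showing just that stride rule (entry contents are ignored; the type bytes are hypothetical):

#include <stdint.h>
#include <stdio.h>

/* Return the stride for a fixed-format IVHD device entry, 0 if variable-length. */
static size_t
ivhd_entry_size(uint8_t type)
{
	if (type < 0x40)
		return (4);
	if (type < 0x80)
		return (8);
	return (0);		/* variable-length, not handled */
}

int
main(void)
{
	uint8_t types[] = { 0x01, 0x02, 0x03, 0x43, 0x90 };

	for (size_t i = 0; i < sizeof(types); i++) {
		size_t sz = ivhd_entry_size(types[i]);

		if (sz == 0) {
			printf("type 0x%02x: variable size, stop\n", types[i]);
			break;
		}
		printf("type 0x%02x: advance %zu bytes\n", types[i], sz);
	}
	return (0);
}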
+ */ + if (ivhd_devs[i] == NULL) { + ivhd_devs[i] = device_find_child(parent, "ivhd", i); + if (ivhd_devs[i] == NULL) { + printf("AMD-Vi: can't find ivhd%d\n", i); + break; + } + } + j++; + } + + /* + * Update device count in case failed to attach. + */ + ivhd_count = j; +} + +static int +ivhd_probe(device_t dev) +{ + ACPI_IVRS_HARDWARE1 *ivhd; + int unit; + + if (acpi_get_handle(dev) != NULL) + return (ENXIO); + + unit = device_get_unit(dev); + KASSERT((unit < ivhd_count), + ("ivhd unit %d > count %d", unit, ivhd_count)); + ivhd = ivhd_hdrs[unit]; + KASSERT(ivhd, ("ivhd is NULL")); + + switch (ivhd->Header.Type) { + case IVRS_TYPE_HARDWARE_EFR: + device_set_desc(dev, "AMD-Vi/IOMMU ivhd with EFR"); + break; + + case IVRS_TYPE_HARDWARE_MIXED: + device_set_desc(dev, "AMD-Vi/IOMMU ivhd in mixed format"); + break; + + case IVRS_TYPE_HARDWARE_LEGACY: + default: + device_set_desc(dev, "AMD-Vi/IOMMU ivhd"); + break; + } + + return (BUS_PROBE_NOWILDCARD); +} + +static void +ivhd_print_flag(device_t dev, enum IvrsType ivhd_type, uint8_t flag) +{ + /* + * IVHD lgeacy type has two extra high bits in flag which has + * been moved to EFR for non-legacy device. + */ + switch (ivhd_type) { + case IVRS_TYPE_HARDWARE_LEGACY: + device_printf(dev, "Flag:%b\n", flag, + "\020" + "\001HtTunEn" + "\002PassPW" + "\003ResPassPW" + "\004Isoc" + "\005IotlbSup" + "\006Coherent" + "\007PreFSup" + "\010PPRSup"); + break; + + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + device_printf(dev, "Flag:%b\n", flag, + "\020" + "\001HtTunEn" + "\002PassPW" + "\003ResPassPW" + "\004Isoc" + "\005IotlbSup" + "\006Coherent"); + break; + + default: + device_printf(dev, "Can't decode flag of ivhd type :0x%x\n", + ivhd_type); + break; + } +} + +/* + * Feature in legacy IVHD type(0x10) and attribute in newer type(0x11 and 0x40). + */ +static void +ivhd_print_feature(device_t dev, enum IvrsType ivhd_type, uint32_t feature) +{ + switch (ivhd_type) { + case IVRS_TYPE_HARDWARE_LEGACY: + device_printf(dev, "Features(type:0x%x) HATS = %d GATS = %d" + " MsiNumPPR = %d PNBanks= %d PNCounters= %d\n", + ivhd_type, + REG_BITS(feature, 31, 30), + REG_BITS(feature, 29, 28), + REG_BITS(feature, 27, 23), + REG_BITS(feature, 22, 17), + REG_BITS(feature, 16, 13)); + device_printf(dev, "max PASID = %d GLXSup = %d Feature:%b\n", + REG_BITS(feature, 12, 8), + REG_BITS(feature, 4, 3), + feature, + "\020" + "\002NXSup" + "\003GTSup" + "\004<b4>" + "\005IASup" + "\006GASup" + "\007HESup"); + break; + + /* Fewer features or attributes are reported in non-legacy type. */ + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + device_printf(dev, "Features(type:0x%x) MsiNumPPR = %d" + " PNBanks= %d PNCounters= %d\n", + ivhd_type, + REG_BITS(feature, 27, 23), + REG_BITS(feature, 22, 17), + REG_BITS(feature, 16, 13)); + break; + + default: /* Other ivhd type features are not decoded. */ + device_printf(dev, "Can't decode ivhd type :0x%x\n", ivhd_type); + } +} + +/* Print extended features of IOMMU. 
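[Editor's note] The identify and print routines decode packed register and table fields with a REG_BITS(value, high, low) helper whose definition lives in amdvi_priv.h and is not visible in this hunk. The sketch below uses an equivalent extraction helper and applies it to a fabricated IVRS Info word, so the VAsize/PAsize/GVAsize/EFRSup split used in ivhd_identify() is easy to see:

#include <stdint.h>
#include <stdio.h>

/* Extract bits [high:low] of a value (equivalent in spirit to REG_BITS). */
static uint64_t
bits(uint64_t val, unsigned high, unsigned low)
{
	return ((val >> low) & ((1ULL << (high - low + 1)) - 1));
}

int
main(void)
{
	/* Hypothetical IVRS Info: VAsize=48 [21:15], PAsize=48 [14:8], GVAsize=2 [7:5], EFRSup [22]. */
	uint32_t info = (48u << 15) | (48u << 8) | (2u << 5) | (1u << 22);

	printf("VAsize  = %llu\n", (unsigned long long)bits(info, 21, 15));
	printf("PAsize  = %llu\n", (unsigned long long)bits(info, 14, 8));
	printf("GVAsize = %llu\n", (unsigned long long)bits(info, 7, 5));
	printf("EFRSup  = %llu\n", (unsigned long long)bits(info, 22, 22));
	return (0);
}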
*/ +static void +ivhd_print_ext_feature(device_t dev, uint64_t ext_feature) +{ + uint32_t ext_low, ext_high; + + if (!ext_feature) + return; + + ext_low = ext_feature; + device_printf(dev, "Extended features[31:0]:%b " + "HATS = 0x%x GATS = 0x%x " + "GLXSup = 0x%x SmiFSup = 0x%x SmiFRC = 0x%x " + "GAMSup = 0x%x DualPortLogSup = 0x%x DualEventLogSup = 0x%x\n", + (int)ext_low, + "\020" + "\001PreFSup" + "\002PPRSup" + "\003<b2>" + "\004NXSup" + "\005GTSup" + "\006<b5>" + "\007IASup" + "\010GASup" + "\011HESup" + "\012PCSup", + REG_BITS(ext_low, 11, 10), + REG_BITS(ext_low, 13, 12), + REG_BITS(ext_low, 15, 14), + REG_BITS(ext_low, 17, 16), + REG_BITS(ext_low, 20, 18), + REG_BITS(ext_low, 23, 21), + REG_BITS(ext_low, 25, 24), + REG_BITS(ext_low, 29, 28)); + + ext_high = ext_feature >> 32; + device_printf(dev, "Extended features[62:32]:%b " + "Max PASID: 0x%x DevTblSegSup = 0x%x " + "MarcSup = 0x%x\n", + (int)(ext_high), + "\020" + "\006USSup" + "\011PprOvrflwEarlySup" + "\012PPRAutoRspSup" + "\015BlKStopMrkSup" + "\016PerfOptSup" + "\017MsiCapMmioSup" + "\021GIOSup" + "\022HASup" + "\023EPHSup" + "\024AttrFWSup" + "\025HDSup" + "\027InvIotlbSup", + REG_BITS(ext_high, 5, 0), + REG_BITS(ext_high, 8, 7), + REG_BITS(ext_high, 11, 10)); +} + +static int +ivhd_print_cap(struct amdvi_softc *softc, ACPI_IVRS_HARDWARE1 * ivhd) +{ + device_t dev; + int max_ptp_level; + + dev = softc->dev; + + ivhd_print_flag(dev, softc->ivhd_type, softc->ivhd_flag); + ivhd_print_feature(dev, softc->ivhd_type, softc->ivhd_feature); + ivhd_print_ext_feature(dev, softc->ext_feature); + max_ptp_level = 7; + /* Make sure device support minimum page level as requested by user. */ + if (max_ptp_level < amdvi_ptp_level) { + device_printf(dev, "insufficient PTP level:%d\n", + max_ptp_level); + return (EINVAL); + } else { + device_printf(softc->dev, "supported paging level:%d, will use only: %d\n", + max_ptp_level, amdvi_ptp_level); + } + + return (0); +} + +static int +ivhd_attach(device_t dev) +{ + ACPI_IVRS_HARDWARE1 *ivhd; + ACPI_IVRS_HARDWARE2 *ivhd_efr; + struct amdvi_softc *softc; + int status, unit; + + unit = device_get_unit(dev); + KASSERT((unit < ivhd_count), + ("ivhd unit %d > count %d", unit, ivhd_count)); + /* Make sure its same device for which attach is called. */ + KASSERT((ivhd_devs[unit] == dev), + ("Not same device old %p new %p", ivhd_devs[unit], dev)); + + softc = device_get_softc(dev); + softc->dev = dev; + ivhd = ivhd_hdrs[unit]; + KASSERT(ivhd, ("ivhd is NULL")); + softc->pci_dev = pci_find_dbsf(ivhd->PciSegmentGroup, + PCI_RID2BUS(ivhd->Header.DeviceId), + PCI_RID2SLOT(ivhd->Header.DeviceId), + PCI_RID2FUNC(ivhd->Header.DeviceId)); + + softc->ivhd_type = ivhd->Header.Type; + softc->pci_seg = ivhd->PciSegmentGroup; + softc->pci_rid = ivhd->Header.DeviceId; + softc->ivhd_flag = ivhd->Header.Flags; + /* + * On lgeacy IVHD type(0x10), it is documented as feature + * but in newer type it is attribute. + */ + softc->ivhd_feature = ivhd->FeatureReporting; + /* + * PCI capability has more capabilities that are not part of IVRS. + */ + softc->cap_off = ivhd->CapabilityOffset; + +#ifdef notyet + /* IVHD Info bit[4:0] is event MSI/X number. 
*/ + softc->event_msix = ivhd->Info & 0x1F; +#endif + switch (ivhd->Header.Type) { + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + ivhd_efr = (ACPI_IVRS_HARDWARE2 *)ivhd; + softc->ext_feature = ivhd_efr->EfrRegisterImage; + break; + } + + softc->ctrl = (struct amdvi_ctrl *) PHYS_TO_DMAP(ivhd->BaseAddress); + status = ivhd_dev_parse(ivhd, softc); + if (status != 0) { + device_printf(dev, + "endpoint device parsing error=%d\n", status); + goto fail; + } + + status = ivhd_print_cap(softc, ivhd); + if (status != 0) + goto fail; + + status = amdvi_setup_hw(softc); + if (status != 0) { + device_printf(dev, "couldn't be initialised, error=%d\n", + status); + goto fail; + } + + return (0); + +fail: + free(softc->dev_cfg, M_DEVBUF); + return (status); +} + +static int +ivhd_detach(device_t dev) +{ + struct amdvi_softc *softc; + + softc = device_get_softc(dev); + + amdvi_teardown_hw(softc); + free(softc->dev_cfg, M_DEVBUF); + + /* + * XXX: delete the device. + * don't allow detach, return EBUSY. + */ + return (0); +} + +static int +ivhd_suspend(device_t dev) +{ + + return (0); +} + +static int +ivhd_resume(device_t dev) +{ + + return (0); +} + +static device_method_t ivhd_methods[] = { + DEVMETHOD(device_identify, ivhd_identify), + DEVMETHOD(device_probe, ivhd_probe), + DEVMETHOD(device_attach, ivhd_attach), + DEVMETHOD(device_detach, ivhd_detach), + DEVMETHOD(device_suspend, ivhd_suspend), + DEVMETHOD(device_resume, ivhd_resume), + DEVMETHOD_END +}; + +static driver_t ivhd_driver = { + "ivhd", + ivhd_methods, + sizeof(struct amdvi_softc), +}; + +/* + * Load this module at the end after PCI re-probing to configure interrupt. + */ +DRIVER_MODULE_ORDERED(ivhd, acpi, ivhd_driver, 0, 0, SI_ORDER_ANY); +MODULE_DEPEND(ivhd, acpi, 1, 1, 1); +MODULE_DEPEND(ivhd, pci, 1, 1, 1); diff --git a/sys/amd64/vmm/amd/npt.c b/sys/amd64/vmm/amd/npt.c new file mode 100644 index 000000000000..6fd6628053f2 --- /dev/null +++ b/sys/amd64/vmm/amd/npt.c @@ -0,0 +1,85 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_extern.h> + +#include "npt.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, npt, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, + NULL); + +static int npt_flags; +SYSCTL_INT(_hw_vmm_npt, OID_AUTO, pmap_flags, CTLFLAG_RD, + &npt_flags, 0, NULL); + +#define NPT_IPIMASK 0xFF + +/* + * AMD nested page table init. + */ +int +svm_npt_init(int ipinum) +{ + int enable_superpage = 1; + + npt_flags = ipinum & NPT_IPIMASK; + TUNABLE_INT_FETCH("hw.vmm.npt.enable_superpage", &enable_superpage); + if (enable_superpage) + npt_flags |= PMAP_PDE_SUPERPAGE; + + return (0); +} + +static int +npt_pinit(pmap_t pmap) +{ + + return (pmap_pinit_type(pmap, PT_RVI, npt_flags)); +} + +struct vmspace * +svm_npt_alloc(vm_offset_t min, vm_offset_t max) +{ + + return (vmspace_alloc(min, max, npt_pinit)); +} + +void +svm_npt_free(struct vmspace *vmspace) +{ + + vmspace_free(vmspace); +} diff --git a/sys/amd64/vmm/amd/npt.h b/sys/amd64/vmm/amd/npt.h new file mode 100644 index 000000000000..9ab163cf9076 --- /dev/null +++ b/sys/amd64/vmm/amd/npt.h @@ -0,0 +1,36 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _SVM_NPT_H_ +#define _SVM_NPT_H_ + +int svm_npt_init(int ipinum); +struct vmspace *svm_npt_alloc(vm_offset_t min, vm_offset_t max); +void svm_npt_free(struct vmspace *vmspace); + +#endif /* _SVM_NPT_H_ */ diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c new file mode 100644 index 000000000000..2fe6a5bc3584 --- /dev/null +++ b/sys/amd64/vmm/amd/svm.c @@ -0,0 +1,2854 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +#include "opt_bhyve_snapshot.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/smp.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/pcpu.h> +#include <sys/proc.h> +#include <sys/reg.h> +#include <sys/smr.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> + +#include <machine/cpufunc.h> +#include <machine/psl.h> +#include <machine/md_var.h> +#include <machine/specialreg.h> +#include <machine/smp.h> +#include <machine/vmm.h> +#include <machine/vmm_dev.h> +#include <machine/vmm_instruction_emul.h> +#include <machine/vmm_snapshot.h> + +#include <dev/vmm/vmm_ktr.h> +#include <dev/vmm/vmm_mem.h> + +#include "vmm_lapic.h" +#include "vmm_stat.h" +#include "vmm_ioport.h" +#include "vatpic.h" +#include "vlapic.h" +#include "vlapic_priv.h" + +#include "x86.h" +#include "vmcb.h" +#include "svm.h" +#include "svm_softc.h" +#include "svm_msr.h" +#include "npt.h" +#include "io/ppt.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, + NULL); + +/* + * SVM CPUID function 0x8000_000A, edx bit decoding. + */ +#define AMD_CPUID_SVM_NP BIT(0) /* Nested paging or RVI */ +#define AMD_CPUID_SVM_LBR BIT(1) /* Last branch virtualization */ +#define AMD_CPUID_SVM_SVML BIT(2) /* SVM lock */ +#define AMD_CPUID_SVM_NRIP_SAVE BIT(3) /* Next RIP is saved */ +#define AMD_CPUID_SVM_TSC_RATE BIT(4) /* TSC rate control. */ +#define AMD_CPUID_SVM_VMCB_CLEAN BIT(5) /* VMCB state caching */ +#define AMD_CPUID_SVM_FLUSH_BY_ASID BIT(6) /* Flush by ASID */ +#define AMD_CPUID_SVM_DECODE_ASSIST BIT(7) /* Decode assist */ +#define AMD_CPUID_SVM_PAUSE_INC BIT(10) /* Pause intercept filter. */ +#define AMD_CPUID_SVM_PAUSE_FTH BIT(12) /* Pause filter threshold */ +#define AMD_CPUID_SVM_AVIC BIT(13) /* AVIC present */ + +#define VMCB_CACHE_DEFAULT (VMCB_CACHE_ASID | \ + VMCB_CACHE_IOPM | \ + VMCB_CACHE_I | \ + VMCB_CACHE_TPR | \ + VMCB_CACHE_CR2 | \ + VMCB_CACHE_CR | \ + VMCB_CACHE_DR | \ + VMCB_CACHE_DT | \ + VMCB_CACHE_SEG | \ + VMCB_CACHE_NP) + +static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT; +SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean, + 0, NULL); + +static MALLOC_DEFINE(M_SVM, "svm", "svm"); +static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic"); + +static uint32_t svm_feature = ~0U; /* AMD SVM features. 
*/ +SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RDTUN, &svm_feature, 0, + "SVM features advertised by CPUID.8000000AH:EDX"); + +static int disable_npf_assist; +SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN, + &disable_npf_assist, 0, NULL); + +/* Maximum ASIDs supported by the processor */ +static uint32_t nasid; +SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RDTUN, &nasid, 0, + "Number of ASIDs supported by this processor"); + +/* Current ASID generation for each host cpu */ +static struct asid asid[MAXCPU]; + +/* SVM host state saved area of size 4KB for each physical core. */ +static uint8_t *hsave; + +static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery"); +static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry"); +static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window"); + +static int svm_getdesc(void *vcpui, int reg, struct seg_desc *desc); +static int svm_setreg(void *vcpui, int ident, uint64_t val); +static int svm_getreg(void *vcpui, int ident, uint64_t *val); +static __inline int +flush_by_asid(void) +{ + + return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID); +} + +static __inline int +decode_assist(void) +{ + + return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST); +} + +static void +svm_disable(void *arg __unused) +{ + uint64_t efer; + + efer = rdmsr(MSR_EFER); + efer &= ~EFER_SVM; + wrmsr(MSR_EFER, efer); +} + +/* + * Disable SVM on all CPUs. + */ +static int +svm_modcleanup(void) +{ + + smp_rendezvous(NULL, svm_disable, NULL, NULL); + + if (hsave != NULL) + kmem_free(hsave, (mp_maxid + 1) * PAGE_SIZE); + + return (0); +} + +/* + * Verify that all the features required by bhyve are available. + */ +static int +check_svm_features(void) +{ + u_int regs[4]; + + /* CPUID Fn8000_000A is for SVM */ + do_cpuid(0x8000000A, regs); + svm_feature &= regs[3]; + + /* + * The number of ASIDs can be configured to be less than what is + * supported by the hardware but not more. + */ + if (nasid == 0 || nasid > regs[1]) + nasid = regs[1]; + KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid)); + + /* bhyve requires the Nested Paging feature */ + if (!(svm_feature & AMD_CPUID_SVM_NP)) { + printf("SVM: Nested Paging feature not available.\n"); + return (ENXIO); + } + + /* bhyve requires the NRIP Save feature */ + if (!(svm_feature & AMD_CPUID_SVM_NRIP_SAVE)) { + printf("SVM: NRIP Save feature not available.\n"); + return (ENXIO); + } + + return (0); +} + +static void +svm_enable(void *arg __unused) +{ + uint64_t efer; + + efer = rdmsr(MSR_EFER); + efer |= EFER_SVM; + wrmsr(MSR_EFER, efer); + + wrmsr(MSR_VM_HSAVE_PA, vtophys(&hsave[curcpu * PAGE_SIZE])); +} + +/* + * Return 1 if SVM is enabled on this processor and 0 otherwise. + */ +static int +svm_available(void) +{ + uint64_t msr; + + /* Section 15.4 Enabling SVM from APM2. */ + if ((amd_feature2 & AMDID2_SVM) == 0) { + printf("SVM: not available.\n"); + return (0); + } + + msr = rdmsr(MSR_VM_CR); + if ((msr & VM_CR_SVMDIS) != 0) { + printf("SVM: disabled by BIOS.\n"); + return (0); + } + + return (1); +} + +static int +svm_modinit(int ipinum) +{ + int error, cpu; + + if (!svm_available()) + return (ENXIO); + + error = check_svm_features(); + if (error) + return (error); + + vmcb_clean &= VMCB_CACHE_DEFAULT; + + for (cpu = 0; cpu < MAXCPU; cpu++) { + /* + * Initialize the host ASIDs to their "highest" valid values. + * + * The next ASID allocation will rollover both 'gen' and 'num' + * and start off the sequence at {1,1}. 
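[Editor's note] check_svm_features() relies on CPUID leaf 0x8000_000A: EDX carries the SVM feature flags listed above and EBX reports the number of ASIDs. CPUID is executable from user space, so the same probe can be sketched as a standalone program; the bit positions are copied from the defines above, and on a machine without SVM the leaf is simply unavailable or reports zeroes:

#include <stdio.h>
#include <cpuid.h>

#define AMD_CPUID_SVM_NP		(1u << 0)	/* nested paging */
#define AMD_CPUID_SVM_NRIP_SAVE		(1u << 3)	/* next RIP saved on #VMEXIT */
#define AMD_CPUID_SVM_FLUSH_BY_ASID	(1u << 6)
#define AMD_CPUID_SVM_DECODE_ASSIST	(1u << 7)

int
main(void)
{
	unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;

	if (!__get_cpuid(0x8000000a, &eax, &ebx, &ecx, &edx)) {
		printf("CPUID leaf 0x8000000A not available\n");
		return (1);
	}
	printf("ASIDs supported: %u\n", ebx);
	printf("Nested paging:   %s\n", (edx & AMD_CPUID_SVM_NP) ? "yes" : "no");
	printf("NRIP save:       %s\n", (edx & AMD_CPUID_SVM_NRIP_SAVE) ? "yes" : "no");
	printf("Flush by ASID:   %s\n", (edx & AMD_CPUID_SVM_FLUSH_BY_ASID) ? "yes" : "no");
	printf("Decode assist:   %s\n", (edx & AMD_CPUID_SVM_DECODE_ASSIST) ? "yes" : "no");
	return (0);
}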
+ */ + asid[cpu].gen = ~0UL; + asid[cpu].num = nasid - 1; + } + + svm_msr_init(); + svm_npt_init(ipinum); + + /* Enable SVM on all CPUs */ + hsave = kmem_malloc((mp_maxid + 1) * PAGE_SIZE, M_WAITOK | M_ZERO); + smp_rendezvous(NULL, svm_enable, NULL, NULL); + + return (0); +} + +static void +svm_modsuspend(void) +{ +} + +static void +svm_modresume(void) +{ + + svm_enable(NULL); +} + +#ifdef BHYVE_SNAPSHOT +void +svm_set_tsc_offset(struct svm_vcpu *vcpu, uint64_t offset) +{ + struct vmcb_ctrl *ctrl; + + ctrl = svm_get_vmcb_ctrl(vcpu); + ctrl->tsc_offset = offset; + + svm_set_dirty(vcpu, VMCB_CACHE_I); + SVM_CTR1(vcpu, "tsc offset changed to %#lx", offset); + + vm_set_tsc_offset(vcpu->vcpu, offset); +} +#endif + +/* Pentium compatible MSRs */ +#define MSR_PENTIUM_START 0 +#define MSR_PENTIUM_END 0x1FFF +/* AMD 6th generation and Intel compatible MSRs */ +#define MSR_AMD6TH_START 0xC0000000UL +#define MSR_AMD6TH_END 0xC0001FFFUL +/* AMD 7th and 8th generation compatible MSRs */ +#define MSR_AMD7TH_START 0xC0010000UL +#define MSR_AMD7TH_END 0xC0011FFFUL + +static void +svm_get_cs_info(struct vmcb *vmcb, struct vm_guest_paging *paging, int *cs_d, + uint64_t *base) +{ + struct vmcb_segment seg; + int error __diagused; + + error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); + KASSERT(error == 0, ("%s: vmcb_seg error %d", __func__, error)); + + switch (paging->cpu_mode) { + case CPU_MODE_REAL: + *base = seg.base; + *cs_d = 0; + break; + case CPU_MODE_PROTECTED: + case CPU_MODE_COMPATIBILITY: + *cs_d = !!(seg.attrib & VMCB_CS_ATTRIB_D); + *base = seg.base; + break; + default: + *base = 0; + *cs_d = 0; + break; + } +} + +/* + * Get the index and bit position for a MSR in permission bitmap. + * Two bits are used for each MSR: lower bit for read and higher bit for write. + */ +static int +svm_msr_index(uint64_t msr, int *index, int *bit) +{ + uint32_t base, off; + + *index = -1; + *bit = (msr % 4) * 2; + base = 0; + + if (msr >= MSR_PENTIUM_START && msr <= MSR_PENTIUM_END) { + *index = msr / 4; + return (0); + } + + base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1); + if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) { + off = (msr - MSR_AMD6TH_START); + *index = (off + base) / 4; + return (0); + } + + base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1); + if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) { + off = (msr - MSR_AMD7TH_START); + *index = (off + base) / 4; + return (0); + } + + return (EINVAL); +} + +/* + * Allow vcpu to read or write the 'msr' without trapping into the hypervisor. 
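[Editor's note] svm_msr_index() folds the three architectural MSR ranges (Pentium 0x0000_0000-0x1FFF, AMD sixth-generation 0xC000_0000-0xC000_1FFF, AMD seventh/eighth-generation 0xC001_0000-0xC001_1FFF) into one flat permission bitmap with two bits per MSR: the even bit intercepts reads, the odd bit writes. The standalone re-implementation below only reproduces the arithmetic so the byte/bit placement can be inspected for a few sample MSR numbers:

#include <stdint.h>
#include <stdio.h>

#define MSR_PENTIUM_END		0x00001FFFULL
#define MSR_AMD6TH_START	0xC0000000ULL
#define MSR_AMD6TH_END		0xC0001FFFULL
#define MSR_AMD7TH_START	0xC0010000ULL
#define MSR_AMD7TH_END		0xC0011FFFULL

static int
msr_index(uint64_t msr, int *index, int *bit)
{
	uint64_t base = 0;

	*bit = (msr % 4) * 2;
	if (msr <= MSR_PENTIUM_END) {
		*index = msr / 4;
		return (0);
	}
	base += MSR_PENTIUM_END + 1;
	if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) {
		*index = (msr - MSR_AMD6TH_START + base) / 4;
		return (0);
	}
	base += MSR_AMD6TH_END - MSR_AMD6TH_START + 1;
	if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) {
		*index = (msr - MSR_AMD7TH_START + base) / 4;
		return (0);
	}
	return (-1);
}

int
main(void)
{
	uint64_t msrs[] = { 0x174 /* SYSENTER_CS */, 0xC0000081 /* STAR */,
	    0xC0010117 /* in the 7th/8th-gen range */ };
	int index, bit;

	for (size_t i = 0; i < sizeof(msrs) / sizeof(msrs[0]); i++) {
		if (msr_index(msrs[i], &index, &bit) == 0)
			printf("MSR %#llx -> byte %d, read bit %d, write bit %d\n",
			    (unsigned long long)msrs[i], index, bit, bit + 1);
		else
			printf("MSR %#llx not covered by the bitmap\n",
			    (unsigned long long)msrs[i]);
	}
	return (0);
}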
+ */ +static void +svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write) +{ + int index, bit, error __diagused; + + error = svm_msr_index(msr, &index, &bit); + KASSERT(error == 0, ("%s: invalid msr %#lx", __func__, msr)); + KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE, + ("%s: invalid index %d for msr %#lx", __func__, index, msr)); + KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d " + "msr %#lx", __func__, bit, msr)); + + if (read) + perm_bitmap[index] &= ~(1UL << bit); + + if (write) + perm_bitmap[index] &= ~(2UL << bit); +} + +static void +svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr) +{ + + svm_msr_perm(perm_bitmap, msr, true, true); +} + +static void +svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr) +{ + + svm_msr_perm(perm_bitmap, msr, true, false); +} + +static __inline int +svm_get_intercept(struct svm_vcpu *vcpu, int idx, uint32_t bitmask) +{ + struct vmcb_ctrl *ctrl; + + KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); + + ctrl = svm_get_vmcb_ctrl(vcpu); + return (ctrl->intercept[idx] & bitmask ? 1 : 0); +} + +static __inline void +svm_set_intercept(struct svm_vcpu *vcpu, int idx, uint32_t bitmask, int enabled) +{ + struct vmcb_ctrl *ctrl; + uint32_t oldval; + + KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); + + ctrl = svm_get_vmcb_ctrl(vcpu); + oldval = ctrl->intercept[idx]; + + if (enabled) + ctrl->intercept[idx] |= bitmask; + else + ctrl->intercept[idx] &= ~bitmask; + + if (ctrl->intercept[idx] != oldval) { + svm_set_dirty(vcpu, VMCB_CACHE_I); + SVM_CTR3(vcpu, "intercept[%d] modified from %#x to %#x", idx, + oldval, ctrl->intercept[idx]); + } +} + +static __inline void +svm_disable_intercept(struct svm_vcpu *vcpu, int off, uint32_t bitmask) +{ + + svm_set_intercept(vcpu, off, bitmask, 0); +} + +static __inline void +svm_enable_intercept(struct svm_vcpu *vcpu, int off, uint32_t bitmask) +{ + + svm_set_intercept(vcpu, off, bitmask, 1); +} + +static void +vmcb_init(struct svm_softc *sc, struct svm_vcpu *vcpu, uint64_t iopm_base_pa, + uint64_t msrpm_base_pa, uint64_t np_pml4) +{ + struct vmcb_ctrl *ctrl; + struct vmcb_state *state; + uint32_t mask; + int n; + + ctrl = svm_get_vmcb_ctrl(vcpu); + state = svm_get_vmcb_state(vcpu); + + ctrl->iopm_base_pa = iopm_base_pa; + ctrl->msrpm_base_pa = msrpm_base_pa; + + /* Enable nested paging */ + ctrl->np_enable = 1; + ctrl->n_cr3 = np_pml4; + + /* + * Intercept accesses to the control registers that are not shadowed + * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8. + */ + for (n = 0; n < 16; n++) { + mask = (BIT(n) << 16) | BIT(n); + if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8) + svm_disable_intercept(vcpu, VMCB_CR_INTCPT, mask); + else + svm_enable_intercept(vcpu, VMCB_CR_INTCPT, mask); + } + + /* + * Intercept everything when tracing guest exceptions otherwise + * just intercept machine check exception. + */ + if (vcpu_trace_exceptions(vcpu->vcpu)) { + for (n = 0; n < 32; n++) { + /* + * Skip unimplemented vectors in the exception bitmap. + */ + if (n == 2 || n == 9) { + continue; + } + svm_enable_intercept(vcpu, VMCB_EXC_INTCPT, BIT(n)); + } + } else { + svm_enable_intercept(vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC)); + } + + /* Intercept various events (for e.g. 
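[Editor's note] In vmcb_init() each control register n has its read-intercept bit at position n and its write-intercept bit at position n + 16 in the CR intercept vector, hence the per-register mask (BIT(n) << 16) | BIT(n). A few lines of standalone C make the resulting masks explicit and mark the registers the code leaves un-intercepted because they are shadowed in the VMCB (CR0, CR2, CR3, CR4, CR8):

#include <stdint.h>
#include <stdio.h>

#define BIT(n)	(1u << (n))

int
main(void)
{
	for (int n = 0; n < 16; n++) {
		uint32_t mask = (BIT(n) << 16) | BIT(n);
		int shadowed = (n == 0 || n == 2 || n == 3 || n == 4 || n == 8);

		printf("CR%-2d mask %#010x %s\n", n, (unsigned)mask,
		    shadowed ? "(shadowed, not intercepted)" : "(intercepted)");
	}
	return (0);
}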
I/O, MSR and CPUID accesses) */ + svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO); + svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR); + svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID); + svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR); + svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT); + svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI); + svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI); + svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN); + svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_FERR_FREEZE); + svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVD); + svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVLPGA); + + svm_enable_intercept(vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR); + svm_enable_intercept(vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT); + + /* + * Intercept SVM instructions since AMD enables them in guests otherwise. + * Non-intercepted VMMCALL causes #UD, skip it. + */ + svm_enable_intercept(vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMLOAD); + svm_enable_intercept(vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMSAVE); + svm_enable_intercept(vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_STGI); + svm_enable_intercept(vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_CLGI); + svm_enable_intercept(vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_SKINIT); + svm_enable_intercept(vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_ICEBP); + if (vcpu_trap_wbinvd(vcpu->vcpu)) { + svm_enable_intercept(vcpu, VMCB_CTRL2_INTCPT, + VMCB_INTCPT_WBINVD); + } + + /* + * From section "Canonicalization and Consistency Checks" in APMv2 + * the VMRUN intercept bit must be set to pass the consistency check. + */ + svm_enable_intercept(vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN); + + /* + * The ASID will be set to a non-zero value just before VMRUN. + */ + ctrl->asid = 0; + + /* + * Section 15.21.1, Interrupt Masking in EFLAGS + * Section 15.21.2, Virtualizing APIC.TPR + * + * This must be set for %rflag and %cr8 isolation of guest and host. + */ + ctrl->v_intr_masking = 1; + + /* Enable Last Branch Record aka LBR for debugging */ + ctrl->lbr_virt_en = 1; + state->dbgctl = BIT(0); + + /* EFER_SVM must always be set when the guest is executing */ + state->efer = EFER_SVM; + + /* Set up the PAT to power-on state */ + state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK) | + PAT_VALUE(1, PAT_WRITE_THROUGH) | + PAT_VALUE(2, PAT_UNCACHED) | + PAT_VALUE(3, PAT_UNCACHEABLE) | + PAT_VALUE(4, PAT_WRITE_BACK) | + PAT_VALUE(5, PAT_WRITE_THROUGH) | + PAT_VALUE(6, PAT_UNCACHED) | + PAT_VALUE(7, PAT_UNCACHEABLE); + + /* Set up DR6/7 to power-on state */ + state->dr6 = DBREG_DR6_RESERVED1; + state->dr7 = DBREG_DR7_RESERVED1; +} + +/* + * Initialize a virtual machine. + */ +static void * +svm_init(struct vm *vm, pmap_t pmap) +{ + struct svm_softc *svm_sc; + + svm_sc = malloc(sizeof (*svm_sc), M_SVM, M_WAITOK | M_ZERO); + + svm_sc->msr_bitmap = contigmalloc(SVM_MSR_BITMAP_SIZE, M_SVM, + M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0); + if (svm_sc->msr_bitmap == NULL) + panic("contigmalloc of SVM MSR bitmap failed"); + svm_sc->iopm_bitmap = contigmalloc(SVM_IO_BITMAP_SIZE, M_SVM, + M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0); + if (svm_sc->iopm_bitmap == NULL) + panic("contigmalloc of SVM IO bitmap failed"); + + svm_sc->vm = vm; + svm_sc->nptp = vtophys(pmap->pm_pmltop); + + /* + * Intercept read and write accesses to all MSRs. 
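[Editor's note] The guest PAT is seeded with the architectural power-on layout (write-back, write-through, UC-, UC, repeated), where PAT_VALUE(i, type) effectively places one memory-type byte into entry i, i.e. type << (8 * i). A sketch that rebuilds the same 64-bit value with locally defined constants; the memory-type encodings follow the processor manuals and are included here only for illustration:

#include <stdint.h>
#include <stdio.h>

#define PAT_UNCACHEABLE		0x00ULL		/* UC  */
#define PAT_WRITE_THROUGH	0x04ULL		/* WT  */
#define PAT_WRITE_BACK		0x06ULL		/* WB  */
#define PAT_UNCACHED		0x07ULL		/* UC- */

#define PAT_VALUE(i, type)	((uint64_t)(type) << (8 * (i)))

int
main(void)
{
	uint64_t g_pat =
	    PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) |
	    PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) |
	    PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_THROUGH) |
	    PAT_VALUE(6, PAT_UNCACHED) | PAT_VALUE(7, PAT_UNCACHEABLE);

	printf("power-on guest PAT = %#018llx\n", (unsigned long long)g_pat);
	return (0);
}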
+ */ + memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE); + + /* + * Access to the following MSRs is redirected to the VMCB when the + * guest is executing. Therefore it is safe to allow the guest to + * read/write these MSRs directly without hypervisor involvement. + */ + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE); + + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT); + + svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC); + + /* + * Intercept writes to make sure that the EFER_SVM bit is not cleared. + */ + svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER); + + /* Intercept access to all I/O ports. */ + memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE); + + return (svm_sc); +} + +static void * +svm_vcpu_init(void *vmi, struct vcpu *vcpu1, int vcpuid) +{ + struct svm_softc *sc = vmi; + struct svm_vcpu *vcpu; + + vcpu = malloc(sizeof(*vcpu), M_SVM, M_WAITOK | M_ZERO); + vcpu->sc = sc; + vcpu->vcpu = vcpu1; + vcpu->vcpuid = vcpuid; + vcpu->vmcb = malloc_aligned(sizeof(struct vmcb), PAGE_SIZE, M_SVM, + M_WAITOK | M_ZERO); + vcpu->nextrip = ~0; + vcpu->lastcpu = NOCPU; + vcpu->vmcb_pa = vtophys(vcpu->vmcb); + vmcb_init(sc, vcpu, vtophys(sc->iopm_bitmap), vtophys(sc->msr_bitmap), + sc->nptp); + svm_msr_guest_init(sc, vcpu); + return (vcpu); +} + +/* + * Collateral for a generic SVM VM-exit. + */ +static void +vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2) +{ + + vme->exitcode = VM_EXITCODE_SVM; + vme->u.svm.exitcode = code; + vme->u.svm.exitinfo1 = info1; + vme->u.svm.exitinfo2 = info2; +} + +static int +svm_cpl(struct vmcb_state *state) +{ + + /* + * From APMv2: + * "Retrieve the CPL from the CPL field in the VMCB, not + * from any segment DPL" + */ + return (state->cpl); +} + +static enum vm_cpu_mode +svm_vcpu_mode(struct vmcb *vmcb) +{ + struct vmcb_segment seg; + struct vmcb_state *state; + int error __diagused; + + state = &vmcb->state; + + if (state->efer & EFER_LMA) { + error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); + KASSERT(error == 0, ("%s: vmcb_seg(cs) error %d", __func__, + error)); + + /* + * Section 4.8.1 for APM2, check if Code Segment has + * Long attribute set in descriptor. + */ + if (seg.attrib & VMCB_CS_ATTRIB_L) + return (CPU_MODE_64BIT); + else + return (CPU_MODE_COMPATIBILITY); + } else if (state->cr0 & CR0_PE) { + return (CPU_MODE_PROTECTED); + } else { + return (CPU_MODE_REAL); + } +} + +static enum vm_paging_mode +svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer) +{ + + if ((cr0 & CR0_PG) == 0) + return (PAGING_MODE_FLAT); + if ((cr4 & CR4_PAE) == 0) + return (PAGING_MODE_32); + if (efer & EFER_LME) + return (PAGING_MODE_64); + else + return (PAGING_MODE_PAE); +} + +/* + * ins/outs utility routines + */ +static uint64_t +svm_inout_str_index(struct svm_regctx *regs, int in) +{ + uint64_t val; + + val = in ? regs->sctx_rdi : regs->sctx_rsi; + + return (val); +} + +static uint64_t +svm_inout_str_count(struct svm_regctx *regs, int rep) +{ + uint64_t val; + + val = rep ? 
regs->sctx_rcx : 1; + + return (val); +} + +static void +svm_inout_str_seginfo(struct svm_vcpu *vcpu, int64_t info1, int in, + struct vm_inout_str *vis) +{ + int error __diagused, s; + + if (in) { + vis->seg_name = VM_REG_GUEST_ES; + } else if (decode_assist()) { + /* + * The effective segment number in EXITINFO1[12:10] is populated + * only if the processor has the DecodeAssist capability. + * + * XXX this is not specified explicitly in APMv2 but can be + * verified empirically. + */ + s = (info1 >> 10) & 0x7; + + /* The segment field has standard encoding */ + vis->seg_name = vm_segment_name(s); + } else { + /* + * The segment register need to be manually decoded by fetching + * the instructions near ip. However, we are unable to fetch it + * while the interrupts are disabled. Therefore, we leave the + * value unset until the generic ins/outs handler runs. + */ + vis->seg_name = VM_REG_LAST; + svm_get_cs_info(vcpu->vmcb, &vis->paging, &vis->cs_d, + &vis->cs_base); + return; + } + + error = svm_getdesc(vcpu, vis->seg_name, &vis->seg_desc); + KASSERT(error == 0, ("%s: svm_getdesc error %d", __func__, error)); +} + +static int +svm_inout_str_addrsize(uint64_t info1) +{ + uint32_t size; + + size = (info1 >> 7) & 0x7; + switch (size) { + case 1: + return (2); /* 16 bit */ + case 2: + return (4); /* 32 bit */ + case 4: + return (8); /* 64 bit */ + default: + panic("%s: invalid size encoding %d", __func__, size); + } +} + +static void +svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging) +{ + struct vmcb_state *state; + + state = &vmcb->state; + paging->cr3 = state->cr3; + paging->cpl = svm_cpl(state); + paging->cpu_mode = svm_vcpu_mode(vmcb); + paging->paging_mode = svm_paging_mode(state->cr0, state->cr4, + state->efer); +} + +#define UNHANDLED 0 + +/* + * Handle guest I/O intercept. + */ +static int +svm_handle_io(struct svm_vcpu *vcpu, struct vm_exit *vmexit) +{ + struct vmcb_ctrl *ctrl; + struct vmcb_state *state; + struct svm_regctx *regs; + struct vm_inout_str *vis; + uint64_t info1; + int inout_string; + + state = svm_get_vmcb_state(vcpu); + ctrl = svm_get_vmcb_ctrl(vcpu); + regs = svm_get_guest_regctx(vcpu); + + info1 = ctrl->exitinfo1; + inout_string = info1 & BIT(2) ? 1 : 0; + + vmexit->exitcode = VM_EXITCODE_INOUT; + vmexit->u.inout.in = (info1 & BIT(0)) ? 1 : 0; + vmexit->u.inout.string = inout_string; + vmexit->u.inout.rep = (info1 & BIT(3)) ? 
1 : 0; + vmexit->u.inout.bytes = (info1 >> 4) & 0x7; + vmexit->u.inout.port = (uint16_t)(info1 >> 16); + vmexit->u.inout.eax = (uint32_t)(state->rax); + + if (inout_string) { + vmexit->exitcode = VM_EXITCODE_INOUT_STR; + vis = &vmexit->u.inout_str; + svm_paging_info(svm_get_vmcb(vcpu), &vis->paging); + vis->rflags = state->rflags; + vis->cr0 = state->cr0; + vis->index = svm_inout_str_index(regs, vmexit->u.inout.in); + vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep); + vis->addrsize = svm_inout_str_addrsize(info1); + vis->cs_d = 0; + vis->cs_base = 0; + svm_inout_str_seginfo(vcpu, info1, vmexit->u.inout.in, vis); + } + + return (UNHANDLED); +} + +static int +npf_fault_type(uint64_t exitinfo1) +{ + + if (exitinfo1 & VMCB_NPF_INFO1_W) + return (VM_PROT_WRITE); + else if (exitinfo1 & VMCB_NPF_INFO1_ID) + return (VM_PROT_EXECUTE); + else + return (VM_PROT_READ); +} + +static bool +svm_npf_emul_fault(uint64_t exitinfo1) +{ + + if (exitinfo1 & VMCB_NPF_INFO1_ID) { + return (false); + } + + if (exitinfo1 & VMCB_NPF_INFO1_GPT) { + return (false); + } + + if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) { + return (false); + } + + return (true); +} + +static void +svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit) +{ + struct vm_guest_paging *paging; + struct vmcb_ctrl *ctrl; + char *inst_bytes; + int inst_len; + + ctrl = &vmcb->ctrl; + paging = &vmexit->u.inst_emul.paging; + + vmexit->exitcode = VM_EXITCODE_INST_EMUL; + vmexit->u.inst_emul.gpa = gpa; + vmexit->u.inst_emul.gla = VIE_INVALID_GLA; + svm_paging_info(vmcb, paging); + + svm_get_cs_info(vmcb, paging, &vmexit->u.inst_emul.cs_d, + &vmexit->u.inst_emul.cs_base); + + /* + * Copy the instruction bytes into 'vie' if available. + */ + if (decode_assist() && !disable_npf_assist) { + inst_len = ctrl->inst_len; + inst_bytes = ctrl->inst_bytes; + } else { + inst_len = 0; + inst_bytes = NULL; + } + vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len); +} + +#ifdef KTR +static const char * +intrtype_to_str(int intr_type) +{ + switch (intr_type) { + case VMCB_EVENTINJ_TYPE_INTR: + return ("hwintr"); + case VMCB_EVENTINJ_TYPE_NMI: + return ("nmi"); + case VMCB_EVENTINJ_TYPE_INTn: + return ("swintr"); + case VMCB_EVENTINJ_TYPE_EXCEPTION: + return ("exception"); + default: + panic("%s: unknown intr_type %d", __func__, intr_type); + } +} +#endif + +/* + * Inject an event to vcpu as described in section 15.20, "Event injection". 
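[Editor's note] svm_handle_io() unpacks the IOIO intercept's EXITINFO1 word: bit 0 is direction (IN), bit 2 flags a string instruction, bit 3 REP, bits [6:4] the operand size in bytes, bits [9:7] the address-size encoding (1/2/4 for 16/32/64-bit), and bits [31:16] the port. A standalone decoder over a fabricated EXITINFO1 value:

#include <stdint.h>
#include <stdio.h>

struct inout_decode {
	int		in;		/* 1 = IN, 0 = OUT */
	int		string;		/* INS/OUTS */
	int		rep;		/* REP prefix */
	int		bytes;		/* operand size: 1, 2 or 4 */
	int		addrsize;	/* 2, 4 or 8 byte addressing */
	uint16_t	port;
};

static void
decode_ioio(uint64_t info1, struct inout_decode *d)
{
	static const int addrsizes[] = { 0, 2, 4, 0, 8, 0, 0, 0 };

	d->in = (info1 >> 0) & 1;
	d->string = (info1 >> 2) & 1;
	d->rep = (info1 >> 3) & 1;
	d->bytes = (info1 >> 4) & 0x7;
	d->addrsize = addrsizes[(info1 >> 7) & 0x7];
	d->port = (uint16_t)(info1 >> 16);
}

int
main(void)
{
	/* Fabricated EXITINFO1: REP OUTS of 2-byte words to port 0x3F8, 32-bit addressing. */
	uint64_t info1 = (0x3F8ULL << 16) | (2ULL << 7) | (2ULL << 4) |
	    (1ULL << 3) | (1ULL << 2);
	struct inout_decode d;

	decode_ioio(info1, &d);
	printf("%s%s port %#x, %d byte(s), %d-byte addresses\n",
	    d.rep ? "rep " : "", d.in ? "ins" : "outs", d.port, d.bytes,
	    d.addrsize);
	return (0);
}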
+ */ +static void +svm_eventinject(struct svm_vcpu *vcpu, int intr_type, int vector, + uint32_t error, bool ec_valid) +{ + struct vmcb_ctrl *ctrl; + + ctrl = svm_get_vmcb_ctrl(vcpu); + + KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, + ("%s: event already pending %#lx", __func__, ctrl->eventinj)); + + KASSERT(vector >=0 && vector <= 255, ("%s: invalid vector %d", + __func__, vector)); + + switch (intr_type) { + case VMCB_EVENTINJ_TYPE_INTR: + case VMCB_EVENTINJ_TYPE_NMI: + case VMCB_EVENTINJ_TYPE_INTn: + break; + case VMCB_EVENTINJ_TYPE_EXCEPTION: + if (vector >= 0 && vector <= 31 && vector != 2) + break; + /* FALLTHROUGH */ + default: + panic("%s: invalid intr_type/vector: %d/%d", __func__, + intr_type, vector); + } + ctrl->eventinj = vector | (intr_type << 8) | VMCB_EVENTINJ_VALID; + if (ec_valid) { + ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID; + ctrl->eventinj |= (uint64_t)error << 32; + SVM_CTR3(vcpu, "Injecting %s at vector %d errcode %#x", + intrtype_to_str(intr_type), vector, error); + } else { + SVM_CTR2(vcpu, "Injecting %s at vector %d", + intrtype_to_str(intr_type), vector); + } +} + +static void +svm_update_virqinfo(struct svm_vcpu *vcpu) +{ + struct vlapic *vlapic; + struct vmcb_ctrl *ctrl; + + vlapic = vm_lapic(vcpu->vcpu); + ctrl = svm_get_vmcb_ctrl(vcpu); + + /* Update %cr8 in the emulated vlapic */ + vlapic_set_cr8(vlapic, ctrl->v_tpr); + + /* Virtual interrupt injection is not used. */ + KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid " + "v_intr_vector %d", __func__, ctrl->v_intr_vector)); +} + +static void +svm_save_intinfo(struct svm_softc *svm_sc, struct svm_vcpu *vcpu) +{ + struct vmcb_ctrl *ctrl; + uint64_t intinfo; + + ctrl = svm_get_vmcb_ctrl(vcpu); + intinfo = ctrl->exitintinfo; + if (!VMCB_EXITINTINFO_VALID(intinfo)) + return; + + /* + * From APMv2, Section "Intercepts during IDT interrupt delivery" + * + * If a #VMEXIT happened during event delivery then record the event + * that was being delivered. 
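[Editor's note] svm_eventinject() packs the pending event into the VMCB EVENTINJ field: vector in bits [7:0], type in bits [10:8], an error-code-valid flag, a valid flag, and the error code in bits [63:32]. The flag bit positions and type encodings are not visible in this hunk, so the sketch below defines them locally (EV at bit 11, V at bit 31, type values per the APM) purely as an assumed illustration of the composition:

#include <stdint.h>
#include <stdio.h>

#define EVENTINJ_TYPE_INTR	0	/* external interrupt */
#define EVENTINJ_TYPE_NMI	2
#define EVENTINJ_TYPE_EXCEPTION	3
#define EVENTINJ_TYPE_INTn	4	/* software interrupt */

#define EVENTINJ_EC_VALID	(1ULL << 11)	/* assumed EV bit position */
#define EVENTINJ_VALID		(1ULL << 31)	/* assumed V bit position */

static uint64_t
eventinj(int type, int vector, uint32_t errcode, int ec_valid)
{
	uint64_t v;

	v = (uint64_t)vector | ((uint64_t)type << 8) | EVENTINJ_VALID;
	if (ec_valid)
		v |= EVENTINJ_EC_VALID | ((uint64_t)errcode << 32);
	return (v);
}

int
main(void)
{
	/* #GP (vector 13) with error code 0. */
	printf("EVENTINJ = %#018llx\n",
	    (unsigned long long)eventinj(EVENTINJ_TYPE_EXCEPTION, 13, 0, 1));
	/* External interrupt on vector 0x20, no error code. */
	printf("EVENTINJ = %#018llx\n",
	    (unsigned long long)eventinj(EVENTINJ_TYPE_INTR, 0x20, 0, 0));
	return (0);
}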
+ */ + SVM_CTR2(vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n", intinfo, + VMCB_EXITINTINFO_VECTOR(intinfo)); + vmm_stat_incr(vcpu->vcpu, VCPU_EXITINTINFO, 1); + vm_exit_intinfo(vcpu->vcpu, intinfo); +} + +#ifdef INVARIANTS +static __inline int +vintr_intercept_enabled(struct svm_vcpu *vcpu) +{ + + return (svm_get_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR)); +} +#endif + +static __inline void +enable_intr_window_exiting(struct svm_vcpu *vcpu) +{ + struct vmcb_ctrl *ctrl; + + ctrl = svm_get_vmcb_ctrl(vcpu); + + if (ctrl->v_irq && ctrl->v_intr_vector == 0) { + KASSERT(ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__)); + KASSERT(vintr_intercept_enabled(vcpu), + ("%s: vintr intercept should be enabled", __func__)); + return; + } + + SVM_CTR0(vcpu, "Enable intr window exiting"); + ctrl->v_irq = 1; + ctrl->v_ign_tpr = 1; + ctrl->v_intr_vector = 0; + svm_set_dirty(vcpu, VMCB_CACHE_TPR); + svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); +} + +static __inline void +disable_intr_window_exiting(struct svm_vcpu *vcpu) +{ + struct vmcb_ctrl *ctrl; + + ctrl = svm_get_vmcb_ctrl(vcpu); + + if (!ctrl->v_irq && ctrl->v_intr_vector == 0) { + KASSERT(!vintr_intercept_enabled(vcpu), + ("%s: vintr intercept should be disabled", __func__)); + return; + } + + SVM_CTR0(vcpu, "Disable intr window exiting"); + ctrl->v_irq = 0; + ctrl->v_intr_vector = 0; + svm_set_dirty(vcpu, VMCB_CACHE_TPR); + svm_disable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); +} + +static int +svm_modify_intr_shadow(struct svm_vcpu *vcpu, uint64_t val) +{ + struct vmcb_ctrl *ctrl; + int oldval, newval; + + ctrl = svm_get_vmcb_ctrl(vcpu); + oldval = ctrl->intr_shadow; + newval = val ? 1 : 0; + if (newval != oldval) { + ctrl->intr_shadow = newval; + SVM_CTR1(vcpu, "Setting intr_shadow to %d", newval); + } + return (0); +} + +static int +svm_get_intr_shadow(struct svm_vcpu *vcpu, uint64_t *val) +{ + struct vmcb_ctrl *ctrl; + + ctrl = svm_get_vmcb_ctrl(vcpu); + *val = ctrl->intr_shadow; + return (0); +} + +/* + * Once an NMI is injected it blocks delivery of further NMIs until the handler + * executes an IRET. The IRET intercept is enabled when an NMI is injected to + * to track when the vcpu is done handling the NMI. + */ +static int +nmi_blocked(struct svm_vcpu *vcpu) +{ + int blocked; + + blocked = svm_get_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); + return (blocked); +} + +static void +enable_nmi_blocking(struct svm_vcpu *vcpu) +{ + + KASSERT(!nmi_blocked(vcpu), ("vNMI already blocked")); + SVM_CTR0(vcpu, "vNMI blocking enabled"); + svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); +} + +static void +clear_nmi_blocking(struct svm_vcpu *vcpu) +{ + int error __diagused; + + KASSERT(nmi_blocked(vcpu), ("vNMI already unblocked")); + SVM_CTR0(vcpu, "vNMI blocking cleared"); + /* + * When the IRET intercept is cleared the vcpu will attempt to execute + * the "iret" when it runs next. However, it is possible to inject + * another NMI into the vcpu before the "iret" has actually executed. + * + * For e.g. if the "iret" encounters a #NPF when accessing the stack + * it will trap back into the hypervisor. If an NMI is pending for + * the vcpu it will be injected into the guest. + * + * XXX this needs to be fixed + */ + svm_disable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); + + /* + * Set 'intr_shadow' to prevent an NMI from being injected on the + * immediate VMRUN. 
+ */ + error = svm_modify_intr_shadow(vcpu, 1); + KASSERT(!error, ("%s: error %d setting intr_shadow", __func__, error)); +} + +#define EFER_MBZ_BITS 0xFFFFFFFFFFFF0200UL + +static int +svm_write_efer(struct svm_softc *sc, struct svm_vcpu *vcpu, uint64_t newval, + bool *retu) +{ + struct vm_exit *vme; + struct vmcb_state *state; + uint64_t changed, lma, oldval; + int error __diagused; + + state = svm_get_vmcb_state(vcpu); + + oldval = state->efer; + SVM_CTR2(vcpu, "wrmsr(efer) %#lx/%#lx", oldval, newval); + + newval &= ~0xFE; /* clear the Read-As-Zero (RAZ) bits */ + changed = oldval ^ newval; + + if (newval & EFER_MBZ_BITS) + goto gpf; + + /* APMv2 Table 14-5 "Long-Mode Consistency Checks" */ + if (changed & EFER_LME) { + if (state->cr0 & CR0_PG) + goto gpf; + } + + /* EFER.LMA = EFER.LME & CR0.PG */ + if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0) + lma = EFER_LMA; + else + lma = 0; + + if ((newval & EFER_LMA) != lma) + goto gpf; + + if (newval & EFER_NXE) { + if (!vm_cpuid_capability(vcpu->vcpu, VCC_NO_EXECUTE)) + goto gpf; + } + + /* + * XXX bhyve does not enforce segment limits in 64-bit mode. Until + * this is fixed flag guest attempt to set EFER_LMSLE as an error. + */ + if (newval & EFER_LMSLE) { + vme = vm_exitinfo(vcpu->vcpu); + vm_exit_svm(vme, VMCB_EXIT_MSR, 1, 0); + *retu = true; + return (0); + } + + if (newval & EFER_FFXSR) { + if (!vm_cpuid_capability(vcpu->vcpu, VCC_FFXSR)) + goto gpf; + } + + if (newval & EFER_TCE) { + if (!vm_cpuid_capability(vcpu->vcpu, VCC_TCE)) + goto gpf; + } + + error = svm_setreg(vcpu, VM_REG_GUEST_EFER, newval); + KASSERT(error == 0, ("%s: error %d updating efer", __func__, error)); + return (0); +gpf: + vm_inject_gp(vcpu->vcpu); + return (0); +} + +static int +emulate_wrmsr(struct svm_softc *sc, struct svm_vcpu *vcpu, u_int num, + uint64_t val, bool *retu) +{ + int error; + + if (lapic_msr(num)) + error = lapic_wrmsr(vcpu->vcpu, num, val, retu); + else if (num == MSR_EFER) + error = svm_write_efer(sc, vcpu, val, retu); + else + error = svm_wrmsr(vcpu, num, val, retu); + + return (error); +} + +static int +emulate_rdmsr(struct svm_vcpu *vcpu, u_int num, bool *retu) +{ + struct vmcb_state *state; + struct svm_regctx *ctx; + uint64_t result; + int error; + + if (lapic_msr(num)) + error = lapic_rdmsr(vcpu->vcpu, num, &result, retu); + else + error = svm_rdmsr(vcpu, num, &result, retu); + + if (error == 0) { + state = svm_get_vmcb_state(vcpu); + ctx = svm_get_guest_regctx(vcpu); + state->rax = result & 0xffffffff; + ctx->sctx_rdx = result >> 32; + } + + return (error); +} + +#ifdef KTR +static const char * +exit_reason_to_str(uint64_t reason) +{ + int i; + static char reasonbuf[32]; + static const struct { + int reason; + const char *str; + } reasons[] = { + { .reason = VMCB_EXIT_INVALID, .str = "invalvmcb" }, + { .reason = VMCB_EXIT_SHUTDOWN, .str = "shutdown" }, + { .reason = VMCB_EXIT_NPF, .str = "nptfault" }, + { .reason = VMCB_EXIT_PAUSE, .str = "pause" }, + { .reason = VMCB_EXIT_HLT, .str = "hlt" }, + { .reason = VMCB_EXIT_CPUID, .str = "cpuid" }, + { .reason = VMCB_EXIT_IO, .str = "inout" }, + { .reason = VMCB_EXIT_MC, .str = "mchk" }, + { .reason = VMCB_EXIT_INTR, .str = "extintr" }, + { .reason = VMCB_EXIT_NMI, .str = "nmi" }, + { .reason = VMCB_EXIT_VINTR, .str = "vintr" }, + { .reason = VMCB_EXIT_MSR, .str = "msr" }, + { .reason = VMCB_EXIT_IRET, .str = "iret" }, + { .reason = VMCB_EXIT_MONITOR, .str = "monitor" }, + { .reason = VMCB_EXIT_MWAIT, .str = "mwait" }, + { .reason = VMCB_EXIT_VMRUN, .str = "vmrun" }, + { .reason = 
VMCB_EXIT_VMMCALL, .str = "vmmcall" }, + { .reason = VMCB_EXIT_VMLOAD, .str = "vmload" }, + { .reason = VMCB_EXIT_VMSAVE, .str = "vmsave" }, + { .reason = VMCB_EXIT_STGI, .str = "stgi" }, + { .reason = VMCB_EXIT_CLGI, .str = "clgi" }, + { .reason = VMCB_EXIT_SKINIT, .str = "skinit" }, + { .reason = VMCB_EXIT_ICEBP, .str = "icebp" }, + { .reason = VMCB_EXIT_INVD, .str = "invd" }, + { .reason = VMCB_EXIT_INVLPGA, .str = "invlpga" }, + { .reason = VMCB_EXIT_POPF, .str = "popf" }, + { .reason = VMCB_EXIT_PUSHF, .str = "pushf" }, + }; + + for (i = 0; i < nitems(reasons); i++) { + if (reasons[i].reason == reason) + return (reasons[i].str); + } + snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason); + return (reasonbuf); +} +#endif /* KTR */ + +/* + * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs + * that are due to instruction intercepts as well as MSR and IOIO intercepts + * and exceptions caused by INT3, INTO and BOUND instructions. + * + * Return 1 if the nRIP is valid and 0 otherwise. + */ +static int +nrip_valid(uint64_t exitcode) +{ + switch (exitcode) { + case 0x00 ... 0x0F: /* read of CR0 through CR15 */ + case 0x10 ... 0x1F: /* write of CR0 through CR15 */ + case 0x20 ... 0x2F: /* read of DR0 through DR15 */ + case 0x30 ... 0x3F: /* write of DR0 through DR15 */ + case 0x43: /* INT3 */ + case 0x44: /* INTO */ + case 0x45: /* BOUND */ + case 0x65 ... 0x7C: /* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */ + case 0x80 ... 0x8D: /* VMEXIT_VMRUN ... VMEXIT_XSETBV */ + return (1); + default: + return (0); + } +} + +static int +svm_vmexit(struct svm_softc *svm_sc, struct svm_vcpu *vcpu, + struct vm_exit *vmexit) +{ + struct vmcb *vmcb; + struct vmcb_state *state; + struct vmcb_ctrl *ctrl; + struct svm_regctx *ctx; + uint64_t code, info1, info2, val; + uint32_t eax, ecx, edx; + int error __diagused, errcode_valid, handled, idtvec, reflect; + bool retu; + + ctx = svm_get_guest_regctx(vcpu); + vmcb = svm_get_vmcb(vcpu); + state = &vmcb->state; + ctrl = &vmcb->ctrl; + + handled = 0; + code = ctrl->exitcode; + info1 = ctrl->exitinfo1; + info2 = ctrl->exitinfo2; + + vmexit->exitcode = VM_EXITCODE_BOGUS; + vmexit->rip = state->rip; + vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0; + + vmm_stat_incr(vcpu->vcpu, VMEXIT_COUNT, 1); + + /* + * #VMEXIT(INVALID) needs to be handled early because the VMCB is + * in an inconsistent state and can trigger assertions that would + * never happen otherwise. + */ + if (code == VMCB_EXIT_INVALID) { + vm_exit_svm(vmexit, code, info1, info2); + return (0); + } + + KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event " + "injection valid bit is set %#lx", __func__, ctrl->eventinj)); + + KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15, + ("invalid inst_length %d: code (%#lx), info1 (%#lx), info2 (%#lx)", + vmexit->inst_length, code, info1, info2)); + + svm_update_virqinfo(vcpu); + svm_save_intinfo(svm_sc, vcpu); + + switch (code) { + case VMCB_EXIT_IRET: + /* + * Restart execution at "iret" but with the intercept cleared. + */ + vmexit->inst_length = 0; + clear_nmi_blocking(vcpu); + handled = 1; + break; + case VMCB_EXIT_VINTR: /* interrupt window exiting */ + vmm_stat_incr(vcpu->vcpu, VMEXIT_VINTR, 1); + handled = 1; + break; + case VMCB_EXIT_INTR: /* external interrupt */ + vmm_stat_incr(vcpu->vcpu, VMEXIT_EXTINT, 1); + handled = 1; + break; + case VMCB_EXIT_NMI: /* external NMI */ + handled = 1; + break; + case 0x40 ... 
0x5F: + vmm_stat_incr(vcpu->vcpu, VMEXIT_EXCEPTION, 1); + reflect = 1; + idtvec = code - 0x40; + switch (idtvec) { + case IDT_MC: + /* + * Call the machine check handler by hand. Also don't + * reflect the machine check back into the guest. + */ + reflect = 0; + SVM_CTR0(vcpu, "Vectoring to MCE handler"); + __asm __volatile("int $18"); + break; + case IDT_PF: + error = svm_setreg(vcpu, VM_REG_GUEST_CR2, info2); + KASSERT(error == 0, ("%s: error %d updating cr2", + __func__, error)); + /* fallthru */ + case IDT_NP: + case IDT_SS: + case IDT_GP: + case IDT_AC: + case IDT_TS: + errcode_valid = 1; + break; + + case IDT_DF: + errcode_valid = 1; + info1 = 0; + break; + case IDT_DB: { + /* + * Check if we are being stepped (RFLAGS.TF) + * and bounce vmexit to userland. + */ + bool stepped = 0; + uint64_t dr6 = 0; + + svm_getreg(vcpu, VM_REG_GUEST_DR6, &dr6); + stepped = !!(dr6 & DBREG_DR6_BS); + if (stepped && (vcpu->caps & (1 << VM_CAP_RFLAGS_TF))) { + vmexit->exitcode = VM_EXITCODE_DB; + vmexit->u.dbg.trace_trap = 1; + vmexit->u.dbg.pushf_intercept = 0; + + if (vcpu->dbg.popf_sstep) { + /* + * DB# exit was caused by stepping over + * popf. + */ + uint64_t rflags; + + vcpu->dbg.popf_sstep = 0; + + /* + * Update shadowed TF bit so the next + * setcap(..., RFLAGS_SSTEP, 0) restores + * the correct value + */ + svm_getreg(vcpu, VM_REG_GUEST_RFLAGS, + &rflags); + vcpu->dbg.rflags_tf = rflags & PSL_T; + } else if (vcpu->dbg.pushf_sstep) { + /* + * DB# exit was caused by stepping over + * pushf. + */ + vcpu->dbg.pushf_sstep = 0; + + /* + * Adjusting the pushed rflags after a + * restarted pushf instruction must be + * handled outside of svm.c due to the + * critical_enter() lock being held. + */ + vmexit->u.dbg.pushf_intercept = 1; + vmexit->u.dbg.tf_shadow_val = + vcpu->dbg.rflags_tf; + svm_paging_info(svm_get_vmcb(vcpu), + &vmexit->u.dbg.paging); + } + + /* Clear DR6 "single-step" bit. */ + dr6 &= ~DBREG_DR6_BS; + error = svm_setreg(vcpu, VM_REG_GUEST_DR6, dr6); + KASSERT(error == 0, + ("%s: error %d updating DR6\r\n", __func__, + error)); + + reflect = 0; + } + break; + } + case IDT_BP: + vmexit->exitcode = VM_EXITCODE_BPT; + vmexit->u.bpt.inst_length = vmexit->inst_length; + vmexit->inst_length = 0; + + reflect = 0; + break; + case IDT_OF: + case IDT_BR: + /* + * The 'nrip' field is populated for INT3, INTO and + * BOUND exceptions and this also implies that + * 'inst_length' is non-zero. + * + * Reset 'inst_length' to zero so the guest %rip at + * event injection is identical to what it was when + * the exception originally happened. + */ + SVM_CTR2(vcpu, "Reset inst_length from %d " + "to zero before injecting exception %d", + vmexit->inst_length, idtvec); + vmexit->inst_length = 0; + /* fallthru */ + default: + errcode_valid = 0; + info1 = 0; + break; + } + + if (reflect) { + KASSERT(vmexit->inst_length == 0, + ("invalid inst_length (%d) " + "when reflecting exception %d into guest", + vmexit->inst_length, idtvec)); + /* Reflect the exception back into the guest */ + SVM_CTR2(vcpu, "Reflecting exception " + "%d/%#x into the guest", idtvec, (int)info1); + error = vm_inject_exception(vcpu->vcpu, idtvec, + errcode_valid, info1, 0); + KASSERT(error == 0, ("%s: vm_inject_exception error %d", + __func__, error)); + handled = 1; + } + break; + case VMCB_EXIT_MSR: /* MSR access. 
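exitinfo1 is 1 for WRMSR and 0 for RDMSR.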
*/ + eax = state->rax; + ecx = ctx->sctx_rcx; + edx = ctx->sctx_rdx; + retu = false; + + if (info1) { + vmm_stat_incr(vcpu->vcpu, VMEXIT_WRMSR, 1); + val = (uint64_t)edx << 32 | eax; + SVM_CTR2(vcpu, "wrmsr %#x val %#lx", ecx, val); + if (emulate_wrmsr(svm_sc, vcpu, ecx, val, &retu)) { + vmexit->exitcode = VM_EXITCODE_WRMSR; + vmexit->u.msr.code = ecx; + vmexit->u.msr.wval = val; + } else if (!retu) { + handled = 1; + } else { + KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, + ("emulate_wrmsr retu with bogus exitcode")); + } + } else { + SVM_CTR1(vcpu, "rdmsr %#x", ecx); + vmm_stat_incr(vcpu->vcpu, VMEXIT_RDMSR, 1); + if (emulate_rdmsr(vcpu, ecx, &retu)) { + vmexit->exitcode = VM_EXITCODE_RDMSR; + vmexit->u.msr.code = ecx; + } else if (!retu) { + handled = 1; + } else { + KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, + ("emulate_rdmsr retu with bogus exitcode")); + } + } + break; + case VMCB_EXIT_IO: + handled = svm_handle_io(vcpu, vmexit); + vmm_stat_incr(vcpu->vcpu, VMEXIT_INOUT, 1); + break; + case VMCB_EXIT_CPUID: + vmm_stat_incr(vcpu->vcpu, VMEXIT_CPUID, 1); + handled = x86_emulate_cpuid(vcpu->vcpu, + &state->rax, &ctx->sctx_rbx, &ctx->sctx_rcx, + &ctx->sctx_rdx); + break; + case VMCB_EXIT_HLT: + vmm_stat_incr(vcpu->vcpu, VMEXIT_HLT, 1); + vmexit->exitcode = VM_EXITCODE_HLT; + vmexit->u.hlt.rflags = state->rflags; + break; + case VMCB_EXIT_PAUSE: + vmexit->exitcode = VM_EXITCODE_PAUSE; + vmm_stat_incr(vcpu->vcpu, VMEXIT_PAUSE, 1); + break; + case VMCB_EXIT_NPF: + /* EXITINFO2 contains the faulting guest physical address */ + if (info1 & VMCB_NPF_INFO1_RSV) { + SVM_CTR2(vcpu, "nested page fault with " + "reserved bits set: info1(%#lx) info2(%#lx)", + info1, info2); + } else if (vm_mem_allocated(vcpu->vcpu, info2) || + ppt_is_mmio(svm_sc->vm, info2)) { + vmexit->exitcode = VM_EXITCODE_PAGING; + vmexit->u.paging.gpa = info2; + vmexit->u.paging.fault_type = npf_fault_type(info1); + vmm_stat_incr(vcpu->vcpu, VMEXIT_NESTED_FAULT, 1); + SVM_CTR3(vcpu, "nested page fault " + "on gpa %#lx/%#lx at rip %#lx", + info2, info1, state->rip); + } else if (svm_npf_emul_fault(info1)) { + svm_handle_inst_emul(vmcb, info2, vmexit); + vmm_stat_incr(vcpu->vcpu, VMEXIT_INST_EMUL, 1); + SVM_CTR3(vcpu, "inst_emul fault " + "for gpa %#lx/%#lx at rip %#lx", + info2, info1, state->rip); + } + break; + case VMCB_EXIT_MONITOR: + vmexit->exitcode = VM_EXITCODE_MONITOR; + break; + case VMCB_EXIT_MWAIT: + vmexit->exitcode = VM_EXITCODE_MWAIT; + break; + case VMCB_EXIT_PUSHF: { + if (vcpu->caps & (1 << VM_CAP_RFLAGS_TF)) { + uint64_t rflags; + + svm_getreg(vcpu, VM_REG_GUEST_RFLAGS, &rflags); + /* Restart this instruction. */ + vmexit->inst_length = 0; + /* Disable PUSHF intercepts - avoid a loop. */ + svm_set_intercept(vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_PUSHF, 0); + /* Trace restarted instruction. */ + svm_setreg(vcpu, VM_REG_GUEST_RFLAGS, (rflags | PSL_T)); + /* Let the IDT_DB handler know that pushf was stepped. 
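+ * The #DB taken after the restarted pushf then reports the shadowed TF + * value to userland so the pushed RFLAGS image can be adjusted there.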
+ */ + vcpu->dbg.pushf_sstep = 1; + handled = 1; + } + break; + } + case VMCB_EXIT_POPF: { + if (vcpu->caps & (1 << VM_CAP_RFLAGS_TF)) { + uint64_t rflags; + + svm_getreg(vcpu, VM_REG_GUEST_RFLAGS, &rflags); + /* Restart this instruction */ + vmexit->inst_length = 0; + /* Disable POPF intercepts - avoid a loop*/ + svm_set_intercept(vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_POPF, 0); + /* Trace restarted instruction */ + svm_setreg(vcpu, VM_REG_GUEST_RFLAGS, (rflags | PSL_T)); + vcpu->dbg.popf_sstep = 1; + handled = 1; + } + break; + } + case VMCB_EXIT_SHUTDOWN: + case VMCB_EXIT_VMRUN: + case VMCB_EXIT_VMMCALL: + case VMCB_EXIT_VMLOAD: + case VMCB_EXIT_VMSAVE: + case VMCB_EXIT_STGI: + case VMCB_EXIT_CLGI: + case VMCB_EXIT_SKINIT: + case VMCB_EXIT_ICEBP: + case VMCB_EXIT_INVLPGA: + vm_inject_ud(vcpu->vcpu); + handled = 1; + break; + case VMCB_EXIT_INVD: + case VMCB_EXIT_WBINVD: + /* ignore exit */ + handled = 1; + break; + default: + vmm_stat_incr(vcpu->vcpu, VMEXIT_UNKNOWN, 1); + break; + } + + SVM_CTR4(vcpu, "%s %s vmexit at %#lx/%d", + handled ? "handled" : "unhandled", exit_reason_to_str(code), + vmexit->rip, vmexit->inst_length); + + if (handled) { + vmexit->rip += vmexit->inst_length; + vmexit->inst_length = 0; + state->rip = vmexit->rip; + } else { + if (vmexit->exitcode == VM_EXITCODE_BOGUS) { + /* + * If this VM exit was not claimed by anybody then + * treat it as a generic SVM exit. + */ + vm_exit_svm(vmexit, code, info1, info2); + } else { + /* + * The exitcode and collateral have been populated. + * The VM exit will be processed further in userland. + */ + } + } + return (handled); +} + +static void +svm_inj_intinfo(struct svm_softc *svm_sc, struct svm_vcpu *vcpu) +{ + uint64_t intinfo; + + if (!vm_entry_intinfo(vcpu->vcpu, &intinfo)) + return; + + KASSERT(VMCB_EXITINTINFO_VALID(intinfo), ("%s: entry intinfo is not " + "valid: %#lx", __func__, intinfo)); + + svm_eventinject(vcpu, VMCB_EXITINTINFO_TYPE(intinfo), + VMCB_EXITINTINFO_VECTOR(intinfo), + VMCB_EXITINTINFO_EC(intinfo), + VMCB_EXITINTINFO_EC_VALID(intinfo)); + vmm_stat_incr(vcpu->vcpu, VCPU_INTINFO_INJECTED, 1); + SVM_CTR1(vcpu, "Injected entry intinfo: %#lx", intinfo); +} + +/* + * Inject event to virtual cpu. + */ +static void +svm_inj_interrupts(struct svm_softc *sc, struct svm_vcpu *vcpu, + struct vlapic *vlapic) +{ + struct vmcb_ctrl *ctrl; + struct vmcb_state *state; + uint8_t v_tpr; + int vector, need_intr_window; + int extint_pending; + + if (vcpu->caps & (1 << VM_CAP_MASK_HWINTR)) { + return; + } + + state = svm_get_vmcb_state(vcpu); + ctrl = svm_get_vmcb_ctrl(vcpu); + + need_intr_window = 0; + + if (vcpu->nextrip != state->rip) { + ctrl->intr_shadow = 0; + SVM_CTR2(vcpu, "Guest interrupt blocking " + "cleared due to rip change: %#lx/%#lx", + vcpu->nextrip, state->rip); + } + + /* + * Inject pending events or exceptions for this vcpu. + * + * An event might be pending because the previous #VMEXIT happened + * during event delivery (i.e. ctrl->exitintinfo). + * + * An event might also be pending because an exception was injected + * by the hypervisor (e.g. #PF during instruction emulation). + */ + svm_inj_intinfo(sc, vcpu); + + /* NMI event has priority over interrupts. */ + if (vm_nmi_pending(vcpu->vcpu)) { + if (nmi_blocked(vcpu)) { + /* + * Can't inject another NMI if the guest has not + * yet executed an "iret" after the last NMI. + */ + SVM_CTR0(vcpu, "Cannot inject NMI due " + "to NMI-blocking"); + } else if (ctrl->intr_shadow) { + /* + * Can't inject an NMI if the vcpu is in an intr_shadow. 
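+ * Request an interrupt window exit instead so the NMI can be injected + * once the shadow is gone.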
+ */ + SVM_CTR0(vcpu, "Cannot inject NMI due to " + "interrupt shadow"); + need_intr_window = 1; + goto done; + } else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { + /* + * If there is already an exception/interrupt pending + * then defer the NMI until after that. + */ + SVM_CTR1(vcpu, "Cannot inject NMI due to " + "eventinj %#lx", ctrl->eventinj); + + /* + * Use self-IPI to trigger a VM-exit as soon as + * possible after the event injection is completed. + * + * This works only if the external interrupt exiting + * is at a lower priority than the event injection. + * + * Although not explicitly specified in APMv2 the + * relative priorities were verified empirically. + */ + ipi_cpu(curcpu, IPI_AST); /* XXX vmm_ipinum? */ + } else { + vm_nmi_clear(vcpu->vcpu); + + /* Inject NMI, vector number is not used */ + svm_eventinject(vcpu, VMCB_EVENTINJ_TYPE_NMI, + IDT_NMI, 0, false); + + /* virtual NMI blocking is now in effect */ + enable_nmi_blocking(vcpu); + + SVM_CTR0(vcpu, "Injecting vNMI"); + } + } + + extint_pending = vm_extint_pending(vcpu->vcpu); + if (!extint_pending) { + if (!vlapic_pending_intr(vlapic, &vector)) + goto done; + KASSERT(vector >= 16 && vector <= 255, + ("invalid vector %d from local APIC", vector)); + } else { + /* Ask the legacy pic for a vector to inject */ + vatpic_pending_intr(sc->vm, &vector); + KASSERT(vector >= 0 && vector <= 255, + ("invalid vector %d from INTR", vector)); + } + + /* + * If the guest has disabled interrupts or is in an interrupt shadow + * then we cannot inject the pending interrupt. + */ + if ((state->rflags & PSL_I) == 0) { + SVM_CTR2(vcpu, "Cannot inject vector %d due to " + "rflags %#lx", vector, state->rflags); + need_intr_window = 1; + goto done; + } + + if (ctrl->intr_shadow) { + SVM_CTR1(vcpu, "Cannot inject vector %d due to " + "interrupt shadow", vector); + need_intr_window = 1; + goto done; + } + + if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { + SVM_CTR2(vcpu, "Cannot inject vector %d due to " + "eventinj %#lx", vector, ctrl->eventinj); + need_intr_window = 1; + goto done; + } + + svm_eventinject(vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false); + + if (!extint_pending) { + vlapic_intr_accepted(vlapic, vector); + } else { + vm_extint_clear(vcpu->vcpu); + vatpic_intr_accepted(sc->vm, vector); + } + + /* + * Force a VM-exit as soon as the vcpu is ready to accept another + * interrupt. This is done because the PIC might have another vector + * that it wants to inject. Also, if the APIC has a pending interrupt + * that was preempted by the ExtInt then it allows us to inject the + * APIC vector as soon as possible. + */ + need_intr_window = 1; +done: + /* + * The guest can modify the TPR by writing to %CR8. In guest mode + * the processor reflects this write to V_TPR without hypervisor + * intervention. + * + * The guest can also modify the TPR by writing to it via the memory + * mapped APIC page. In this case, the write will be emulated by the + * hypervisor. For this reason V_TPR must be updated before every + * VMRUN. + */ + v_tpr = vlapic_get_cr8(vlapic); + KASSERT(v_tpr <= 15, ("invalid v_tpr %#x", v_tpr)); + if (ctrl->v_tpr != v_tpr) { + SVM_CTR2(vcpu, "VMCB V_TPR changed from %#x to %#x", + ctrl->v_tpr, v_tpr); + ctrl->v_tpr = v_tpr; + svm_set_dirty(vcpu, VMCB_CACHE_TPR); + } + + if (need_intr_window) { + /* + * We use V_IRQ in conjunction with the VINTR intercept to + * trap into the hypervisor as soon as a virtual interrupt + * can be delivered. 
+ * + * Since injected events are not subject to intercept checks + * we need to ensure that the V_IRQ is not actually going to + * be delivered on VM entry. The KASSERT below enforces this. + */ + KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 || + (state->rflags & PSL_I) == 0 || ctrl->intr_shadow, + ("Bogus intr_window_exiting: eventinj (%#lx), " + "intr_shadow (%u), rflags (%#lx)", + ctrl->eventinj, ctrl->intr_shadow, state->rflags)); + enable_intr_window_exiting(vcpu); + } else { + disable_intr_window_exiting(vcpu); + } +} + +static __inline void +restore_host_tss(void) +{ + struct system_segment_descriptor *tss_sd; + + /* + * The TSS descriptor was in use prior to launching the guest so it + * has been marked busy. + * + * 'ltr' requires the descriptor to be marked available so change the + * type to "64-bit available TSS". + */ + tss_sd = PCPU_GET(tss); + tss_sd->sd_type = SDT_SYSTSS; + ltr(GSEL(GPROC0_SEL, SEL_KPL)); +} + +static void +svm_pmap_activate(struct svm_vcpu *vcpu, pmap_t pmap) +{ + struct vmcb_ctrl *ctrl; + long eptgen; + int cpu; + bool alloc_asid; + + cpu = curcpu; + CPU_SET_ATOMIC(cpu, &pmap->pm_active); + smr_enter(pmap->pm_eptsmr); + + ctrl = svm_get_vmcb_ctrl(vcpu); + + /* + * The TLB entries associated with the vcpu's ASID are not valid + * if either of the following conditions is true: + * + * 1. The vcpu's ASID generation is different than the host cpu's + * ASID generation. This happens when the vcpu migrates to a new + * host cpu. It can also happen when the number of vcpus executing + * on a host cpu is greater than the number of ASIDs available. + * + * 2. The pmap generation number is different than the value cached in + * the 'vcpustate'. This happens when the host invalidates pages + * belonging to the guest. + * + * asidgen eptgen Action + * mismatch mismatch + * 0 0 (a) + * 0 1 (b1) or (b2) + * 1 0 (c) + * 1 1 (d) + * + * (a) There is no mismatch in eptgen or ASID generation and therefore + * no further action is needed. + * + * (b1) If the cpu supports FlushByAsid then the vcpu's ASID is + * retained and the TLB entries associated with this ASID + * are flushed by VMRUN. + * + * (b2) If the cpu does not support FlushByAsid then a new ASID is + * allocated. + * + * (c) A new ASID is allocated. + * + * (d) A new ASID is allocated. + */ + + alloc_asid = false; + eptgen = atomic_load_long(&pmap->pm_eptgen); + ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING; + + if (vcpu->asid.gen != asid[cpu].gen) { + alloc_asid = true; /* (c) and (d) */ + } else if (vcpu->eptgen != eptgen) { + if (flush_by_asid()) + ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; /* (b1) */ + else + alloc_asid = true; /* (b2) */ + } else { + /* + * This is the common case (a). + */ + KASSERT(!alloc_asid, ("ASID allocation not necessary")); + KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING, + ("Invalid VMCB tlb_ctrl: %#x", ctrl->tlb_ctrl)); + } + + if (alloc_asid) { + if (++asid[cpu].num >= nasid) { + asid[cpu].num = 1; + if (++asid[cpu].gen == 0) + asid[cpu].gen = 1; + /* + * If this cpu does not support "flush-by-asid" + * then flush the entire TLB on a generation + * bump. Subsequent ASID allocation in this + * generation can be done without a TLB flush. + */ + if (!flush_by_asid()) + ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL; + } + vcpu->asid.gen = asid[cpu].gen; + vcpu->asid.num = asid[cpu].num; + + ctrl->asid = vcpu->asid.num; + svm_set_dirty(vcpu, VMCB_CACHE_ASID); + /* + * If this cpu supports "flush-by-asid" then the TLB + * was not flushed after the generation bump. 
The TLB + * is flushed selectively after every new ASID allocation. + */ + if (flush_by_asid()) + ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; + } + vcpu->eptgen = eptgen; + + KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero")); + KASSERT(ctrl->asid == vcpu->asid.num, + ("ASID mismatch: %u/%u", ctrl->asid, vcpu->asid.num)); +} + +static void +svm_pmap_deactivate(pmap_t pmap) +{ + smr_exit(pmap->pm_eptsmr); + CPU_CLR_ATOMIC(curcpu, &pmap->pm_active); +} + +static __inline void +disable_gintr(void) +{ + + __asm __volatile("clgi"); +} + +static __inline void +enable_gintr(void) +{ + + __asm __volatile("stgi"); +} + +static __inline void +svm_dr_enter_guest(struct svm_regctx *gctx) +{ + + /* Save host control debug registers. */ + gctx->host_dr7 = rdr7(); + gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); + + /* + * Disable debugging in DR7 and DEBUGCTL to avoid triggering + * exceptions in the host based on the guest DRx values. The + * guest DR6, DR7, and DEBUGCTL are saved/restored in the + * VMCB. + */ + load_dr7(0); + wrmsr(MSR_DEBUGCTLMSR, 0); + + /* Save host debug registers. */ + gctx->host_dr0 = rdr0(); + gctx->host_dr1 = rdr1(); + gctx->host_dr2 = rdr2(); + gctx->host_dr3 = rdr3(); + gctx->host_dr6 = rdr6(); + + /* Restore guest debug registers. */ + load_dr0(gctx->sctx_dr0); + load_dr1(gctx->sctx_dr1); + load_dr2(gctx->sctx_dr2); + load_dr3(gctx->sctx_dr3); +} + +static __inline void +svm_dr_leave_guest(struct svm_regctx *gctx) +{ + + /* Save guest debug registers. */ + gctx->sctx_dr0 = rdr0(); + gctx->sctx_dr1 = rdr1(); + gctx->sctx_dr2 = rdr2(); + gctx->sctx_dr3 = rdr3(); + + /* + * Restore host debug registers. Restore DR7 and DEBUGCTL + * last. + */ + load_dr0(gctx->host_dr0); + load_dr1(gctx->host_dr1); + load_dr2(gctx->host_dr2); + load_dr3(gctx->host_dr3); + load_dr6(gctx->host_dr6); + wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl); + load_dr7(gctx->host_dr7); +} + +/* + * Start vcpu with specified RIP. + */ +static int +svm_run(void *vcpui, register_t rip, pmap_t pmap, struct vm_eventinfo *evinfo) +{ + struct svm_regctx *gctx; + struct svm_softc *svm_sc; + struct svm_vcpu *vcpu; + struct vmcb_state *state; + struct vmcb_ctrl *ctrl; + struct vm_exit *vmexit; + struct vlapic *vlapic; + uint64_t vmcb_pa; + int handled; + uint16_t ldt_sel; + + vcpu = vcpui; + svm_sc = vcpu->sc; + state = svm_get_vmcb_state(vcpu); + ctrl = svm_get_vmcb_ctrl(vcpu); + vmexit = vm_exitinfo(vcpu->vcpu); + vlapic = vm_lapic(vcpu->vcpu); + + gctx = svm_get_guest_regctx(vcpu); + vmcb_pa = vcpu->vmcb_pa; + + if (vcpu->lastcpu != curcpu) { + /* + * Force new ASID allocation by invalidating the generation. + */ + vcpu->asid.gen = 0; + + /* + * Invalidate the VMCB state cache by marking all fields dirty. + */ + svm_set_dirty(vcpu, 0xffffffff); + + /* + * XXX + * Setting 'vcpu->lastcpu' here is bit premature because + * we may return from this function without actually executing + * the VMRUN instruction. This could happen if a rendezvous + * or an AST is pending on the first time through the loop. + * + * This works for now but any new side-effects of vcpu + * migration should take this case into account. + */ + vcpu->lastcpu = curcpu; + vmm_stat_incr(vcpu->vcpu, VCPU_MIGRATIONS, 1); + } + + svm_msr_guest_enter(vcpu); + + /* Update Guest RIP */ + state->rip = rip; + + do { + /* + * Disable global interrupts to guarantee atomicity during + * loading of guest state. 
This includes not only the state + * loaded by the "vmrun" instruction but also software state + * maintained by the hypervisor: suspended and rendezvous + * state, NPT generation number, vlapic interrupts etc. + */ + disable_gintr(); + + if (vcpu_suspended(evinfo)) { + enable_gintr(); + vm_exit_suspended(vcpu->vcpu, state->rip); + break; + } + + if (vcpu_rendezvous_pending(vcpu->vcpu, evinfo)) { + enable_gintr(); + vm_exit_rendezvous(vcpu->vcpu, state->rip); + break; + } + + if (vcpu_reqidle(evinfo)) { + enable_gintr(); + vm_exit_reqidle(vcpu->vcpu, state->rip); + break; + } + + /* The scheduler has asked us to yield the cpu. */ + if (vcpu_should_yield(vcpu->vcpu)) { + enable_gintr(); + vm_exit_astpending(vcpu->vcpu, state->rip); + break; + } + + if (vcpu_debugged(vcpu->vcpu)) { + enable_gintr(); + vm_exit_debug(vcpu->vcpu, state->rip); + break; + } + + /* + * #VMEXIT resumes the host with the guest LDTR, so + * save the current LDT selector so it can be restored + * after an exit. The userspace hypervisor probably + * doesn't use an LDT, but save and restore it to be + * safe. + */ + ldt_sel = sldt(); + + svm_inj_interrupts(svm_sc, vcpu, vlapic); + + /* + * Check the pmap generation and the ASID generation to + * ensure that the vcpu does not use stale TLB mappings. + */ + svm_pmap_activate(vcpu, pmap); + + ctrl->vmcb_clean = vmcb_clean & ~vcpu->dirty; + vcpu->dirty = 0; + SVM_CTR1(vcpu, "vmcb clean %#x", ctrl->vmcb_clean); + + /* Launch Virtual Machine. */ + SVM_CTR1(vcpu, "Resume execution at %#lx", state->rip); + svm_dr_enter_guest(gctx); + svm_launch(vmcb_pa, gctx, get_pcpu()); + svm_dr_leave_guest(gctx); + + svm_pmap_deactivate(pmap); + + /* + * The host GDTR and IDTR are saved by VMRUN and restored + * automatically on #VMEXIT. However, the host TSS needs + * to be restored explicitly. + */ + restore_host_tss(); + + /* Restore host LDTR. */ + lldt(ldt_sel); + + /* #VMEXIT disables interrupts so re-enable them here. */ + enable_gintr(); + + /* Update 'nextrip' */ + vcpu->nextrip = state->rip; + + /* Handle #VMEXIT and if required return to user space. 
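A non-zero return keeps the vcpu in the VMRUN loop; zero breaks out so the exit is completed in userland.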
*/ + handled = svm_vmexit(svm_sc, vcpu, vmexit); + } while (handled); + + svm_msr_guest_exit(vcpu); + + return (0); +} + +static void +svm_vcpu_cleanup(void *vcpui) +{ + struct svm_vcpu *vcpu = vcpui; + + free(vcpu->vmcb, M_SVM); + free(vcpu, M_SVM); +} + +static void +svm_cleanup(void *vmi) +{ + struct svm_softc *sc = vmi; + + free(sc->iopm_bitmap, M_SVM); + free(sc->msr_bitmap, M_SVM); + free(sc, M_SVM); +} + +static register_t * +swctx_regptr(struct svm_regctx *regctx, int reg) +{ + + switch (reg) { + case VM_REG_GUEST_RBX: + return (®ctx->sctx_rbx); + case VM_REG_GUEST_RCX: + return (®ctx->sctx_rcx); + case VM_REG_GUEST_RDX: + return (®ctx->sctx_rdx); + case VM_REG_GUEST_RDI: + return (®ctx->sctx_rdi); + case VM_REG_GUEST_RSI: + return (®ctx->sctx_rsi); + case VM_REG_GUEST_RBP: + return (®ctx->sctx_rbp); + case VM_REG_GUEST_R8: + return (®ctx->sctx_r8); + case VM_REG_GUEST_R9: + return (®ctx->sctx_r9); + case VM_REG_GUEST_R10: + return (®ctx->sctx_r10); + case VM_REG_GUEST_R11: + return (®ctx->sctx_r11); + case VM_REG_GUEST_R12: + return (®ctx->sctx_r12); + case VM_REG_GUEST_R13: + return (®ctx->sctx_r13); + case VM_REG_GUEST_R14: + return (®ctx->sctx_r14); + case VM_REG_GUEST_R15: + return (®ctx->sctx_r15); + case VM_REG_GUEST_DR0: + return (®ctx->sctx_dr0); + case VM_REG_GUEST_DR1: + return (®ctx->sctx_dr1); + case VM_REG_GUEST_DR2: + return (®ctx->sctx_dr2); + case VM_REG_GUEST_DR3: + return (®ctx->sctx_dr3); + default: + return (NULL); + } +} + +static int +svm_getreg(void *vcpui, int ident, uint64_t *val) +{ + struct svm_vcpu *vcpu; + register_t *reg; + + vcpu = vcpui; + + if (ident == VM_REG_GUEST_INTR_SHADOW) { + return (svm_get_intr_shadow(vcpu, val)); + } + + if (vmcb_read(vcpu, ident, val) == 0) { + return (0); + } + + reg = swctx_regptr(svm_get_guest_regctx(vcpu), ident); + + if (reg != NULL) { + *val = *reg; + return (0); + } + + SVM_CTR1(vcpu, "svm_getreg: unknown register %#x", ident); + return (EINVAL); +} + +static int +svm_setreg(void *vcpui, int ident, uint64_t val) +{ + struct svm_vcpu *vcpu; + register_t *reg; + + vcpu = vcpui; + + if (ident == VM_REG_GUEST_INTR_SHADOW) { + return (svm_modify_intr_shadow(vcpu, val)); + } + + /* Do not permit user write access to VMCB fields by offset. */ + if (!VMCB_ACCESS_OK(ident)) { + if (vmcb_write(vcpu, ident, val) == 0) { + return (0); + } + } + + reg = swctx_regptr(svm_get_guest_regctx(vcpu), ident); + + if (reg != NULL) { + *reg = val; + return (0); + } + + if (ident == VM_REG_GUEST_ENTRY_INST_LENGTH) { + /* Ignore. */ + return (0); + } + + /* + * XXX deal with CR3 and invalidate TLB entries tagged with the + * vcpu's ASID. This needs to be treated differently depending on + * whether 'running' is true/false. 
+ */ + + SVM_CTR1(vcpu, "svm_setreg: unknown register %#x", ident); + return (EINVAL); +} + +static int +svm_getdesc(void *vcpui, int reg, struct seg_desc *desc) +{ + return (vmcb_getdesc(vcpui, reg, desc)); +} + +static int +svm_setdesc(void *vcpui, int reg, struct seg_desc *desc) +{ + return (vmcb_setdesc(vcpui, reg, desc)); +} + +#ifdef BHYVE_SNAPSHOT +static int +svm_snapshot_reg(void *vcpui, int ident, struct vm_snapshot_meta *meta) +{ + int ret; + uint64_t val; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = svm_getreg(vcpui, ident, &val); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + + ret = svm_setreg(vcpui, ident, val); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} +#endif + +static int +svm_setcap(void *vcpui, int type, int val) +{ + struct svm_vcpu *vcpu; + struct vlapic *vlapic; + int error; + + vcpu = vcpui; + error = 0; + + switch (type) { + case VM_CAP_HALT_EXIT: + svm_set_intercept(vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_HLT, val); + break; + case VM_CAP_PAUSE_EXIT: + svm_set_intercept(vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_PAUSE, val); + break; + case VM_CAP_UNRESTRICTED_GUEST: + /* Unrestricted guest execution cannot be disabled in SVM */ + if (val == 0) + error = EINVAL; + break; + case VM_CAP_BPT_EXIT: + svm_set_intercept(vcpu, VMCB_EXC_INTCPT, BIT(IDT_BP), val); + break; + case VM_CAP_IPI_EXIT: + vlapic = vm_lapic(vcpu->vcpu); + vlapic->ipi_exit = val; + break; + case VM_CAP_MASK_HWINTR: + vcpu->caps &= ~(1 << VM_CAP_MASK_HWINTR); + vcpu->caps |= (val << VM_CAP_MASK_HWINTR); + break; + case VM_CAP_RFLAGS_TF: { + uint64_t rflags; + + /* Fetch RFLAGS. */ + if (svm_getreg(vcpu, VM_REG_GUEST_RFLAGS, &rflags)) { + error = (EINVAL); + break; + } + if (val) { + /* Save current TF bit. */ + vcpu->dbg.rflags_tf = rflags & PSL_T; + /* Trace next instruction. 
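Setting PSL_T raises #DB after the next guest instruction; the IDT_DB intercept enabled below reports it as VM_EXITCODE_DB.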
*/ + if (svm_setreg(vcpu, VM_REG_GUEST_RFLAGS, + (rflags | PSL_T))) { + error = (EINVAL); + break; + } + vcpu->caps |= (1 << VM_CAP_RFLAGS_TF); + } else { + /* + * Restore shadowed RFLAGS.TF only if vCPU was + * previously stepped + */ + if (vcpu->caps & (1 << VM_CAP_RFLAGS_TF)) { + rflags &= ~PSL_T; + rflags |= vcpu->dbg.rflags_tf; + vcpu->dbg.rflags_tf = 0; + + if (svm_setreg(vcpu, VM_REG_GUEST_RFLAGS, + rflags)) { + error = (EINVAL); + break; + } + vcpu->caps &= ~(1 << VM_CAP_RFLAGS_TF); + } + } + + svm_set_intercept(vcpu, VMCB_EXC_INTCPT, BIT(IDT_DB), val); + svm_set_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_POPF, + val); + svm_set_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_PUSHF, + val); + break; + } + default: + error = ENOENT; + break; + } + return (error); +} + +static int +svm_getcap(void *vcpui, int type, int *retval) +{ + struct svm_vcpu *vcpu; + struct vlapic *vlapic; + int error; + + vcpu = vcpui; + error = 0; + + switch (type) { + case VM_CAP_HALT_EXIT: + *retval = svm_get_intercept(vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_HLT); + break; + case VM_CAP_PAUSE_EXIT: + *retval = svm_get_intercept(vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_PAUSE); + break; + case VM_CAP_UNRESTRICTED_GUEST: + *retval = 1; /* unrestricted guest is always enabled */ + break; + case VM_CAP_BPT_EXIT: + *retval = svm_get_intercept(vcpu, VMCB_EXC_INTCPT, BIT(IDT_BP)); + break; + case VM_CAP_IPI_EXIT: + vlapic = vm_lapic(vcpu->vcpu); + *retval = vlapic->ipi_exit; + break; + case VM_CAP_RFLAGS_TF: + *retval = !!(vcpu->caps & (1 << VM_CAP_RFLAGS_TF)); + break; + case VM_CAP_MASK_HWINTR: + *retval = !!(vcpu->caps & (1 << VM_CAP_MASK_HWINTR)); + break; + default: + error = ENOENT; + break; + } + return (error); +} + +static struct vmspace * +svm_vmspace_alloc(vm_offset_t min, vm_offset_t max) +{ + return (svm_npt_alloc(min, max)); +} + +static void +svm_vmspace_free(struct vmspace *vmspace) +{ + svm_npt_free(vmspace); +} + +static struct vlapic * +svm_vlapic_init(void *vcpui) +{ + struct svm_vcpu *vcpu; + struct vlapic *vlapic; + + vcpu = vcpui; + vlapic = malloc(sizeof(struct vlapic), M_SVM_VLAPIC, M_WAITOK | M_ZERO); + vlapic->vm = vcpu->sc->vm; + vlapic->vcpu = vcpu->vcpu; + vlapic->vcpuid = vcpu->vcpuid; + vlapic->apic_page = malloc_aligned(PAGE_SIZE, PAGE_SIZE, M_SVM_VLAPIC, + M_WAITOK | M_ZERO); + + vlapic_init(vlapic); + + return (vlapic); +} + +static void +svm_vlapic_cleanup(struct vlapic *vlapic) +{ + + vlapic_cleanup(vlapic); + free(vlapic->apic_page, M_SVM_VLAPIC); + free(vlapic, M_SVM_VLAPIC); +} + +#ifdef BHYVE_SNAPSHOT +static int +svm_vcpu_snapshot(void *vcpui, struct vm_snapshot_meta *meta) +{ + struct svm_vcpu *vcpu; + int err, running, hostcpu; + + vcpu = vcpui; + err = 0; + + running = vcpu_is_running(vcpu->vcpu, &hostcpu); + if (running && hostcpu != curcpu) { + printf("%s: %s%d is running", __func__, vm_name(vcpu->sc->vm), + vcpu->vcpuid); + return (EINVAL); + } + + err += svm_snapshot_reg(vcpu, VM_REG_GUEST_CR0, meta); + err += svm_snapshot_reg(vcpu, VM_REG_GUEST_CR2, meta); + err += svm_snapshot_reg(vcpu, VM_REG_GUEST_CR3, meta); + err += svm_snapshot_reg(vcpu, VM_REG_GUEST_CR4, meta); + + err += svm_snapshot_reg(vcpu, VM_REG_GUEST_DR6, meta); + err += svm_snapshot_reg(vcpu, VM_REG_GUEST_DR7, meta); + + err += svm_snapshot_reg(vcpu, VM_REG_GUEST_RAX, meta); + + err += svm_snapshot_reg(vcpu, VM_REG_GUEST_RSP, meta); + err += svm_snapshot_reg(vcpu, VM_REG_GUEST_RIP, meta); + err += svm_snapshot_reg(vcpu, VM_REG_GUEST_RFLAGS, meta); + + /* Guest segments */ + /* ES */ + err += 
svm_snapshot_reg(vcpu, VM_REG_GUEST_ES, meta); + err += vmcb_snapshot_desc(vcpu, VM_REG_GUEST_ES, meta); + + /* CS */ + err += svm_snapshot_reg(vcpu, VM_REG_GUEST_CS, meta); + err += vmcb_snapshot_desc(vcpu, VM_REG_GUEST_CS, meta); + + /* SS */ + err += svm_snapshot_reg(vcpu, VM_REG_GUEST_SS, meta); + err += vmcb_snapshot_desc(vcpu, VM_REG_GUEST_SS, meta); + + /* DS */ + err += svm_snapshot_reg(vcpu, VM_REG_GUEST_DS, meta); + err += vmcb_snapshot_desc(vcpu, VM_REG_GUEST_DS, meta); + + /* FS */ + err += svm_snapshot_reg(vcpu, VM_REG_GUEST_FS, meta); + err += vmcb_snapshot_desc(vcpu, VM_REG_GUEST_FS, meta); + + /* GS */ + err += svm_snapshot_reg(vcpu, VM_REG_GUEST_GS, meta); + err += vmcb_snapshot_desc(vcpu, VM_REG_GUEST_GS, meta); + + /* TR */ + err += svm_snapshot_reg(vcpu, VM_REG_GUEST_TR, meta); + err += vmcb_snapshot_desc(vcpu, VM_REG_GUEST_TR, meta); + + /* LDTR */ + err += svm_snapshot_reg(vcpu, VM_REG_GUEST_LDTR, meta); + err += vmcb_snapshot_desc(vcpu, VM_REG_GUEST_LDTR, meta); + + /* EFER */ + err += svm_snapshot_reg(vcpu, VM_REG_GUEST_EFER, meta); + + /* IDTR and GDTR */ + err += vmcb_snapshot_desc(vcpu, VM_REG_GUEST_IDTR, meta); + err += vmcb_snapshot_desc(vcpu, VM_REG_GUEST_GDTR, meta); + + /* Specific AMD registers */ + err += svm_snapshot_reg(vcpu, VM_REG_GUEST_INTR_SHADOW, meta); + + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_CR_INTERCEPT, 4), meta); + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_DR_INTERCEPT, 4), meta); + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_EXC_INTERCEPT, 4), meta); + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_INST1_INTERCEPT, 4), meta); + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_INST2_INTERCEPT, 4), meta); + + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_PAUSE_FILTHRESH, 2), meta); + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_PAUSE_FILCNT, 2), meta); + + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_ASID, 4), meta); + + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_TLB_CTRL, 4), meta); + + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_VIRQ, 8), meta); + + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_EXIT_REASON, 8), meta); + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_EXITINFO1, 8), meta); + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_EXITINFO2, 8), meta); + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_EXITINTINFO, 8), meta); + + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_NP_ENABLE, 1), meta); + + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_AVIC_BAR, 8), meta); + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_AVIC_PAGE, 8), meta); + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_AVIC_LT, 8), meta); + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_AVIC_PT, 8), meta); + + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_CPL, 1), meta); + + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_STAR, 8), meta); + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_LSTAR, 8), meta); + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_CSTAR, 8), meta); + + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_SFMASK, 8), meta); + + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_KERNELGBASE, 8), meta); + + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_SYSENTER_CS, 8), meta); + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_SYSENTER_ESP, 8), meta); + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_SYSENTER_EIP, 8), meta); + + err += 
vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_GUEST_PAT, 8), meta); + + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_DBGCTL, 8), meta); + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_BR_FROM, 8), meta); + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_BR_TO, 8), meta); + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_INT_FROM, 8), meta); + err += vmcb_snapshot_any(vcpu, + VMCB_ACCESS(VMCB_OFF_INT_TO, 8), meta); + if (err != 0) + goto done; + + /* Snapshot swctx for virtual cpu */ + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rbp, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rbx, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rcx, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rdx, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rdi, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rsi, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r8, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r9, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r10, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r11, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r12, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r13, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r14, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r15, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr0, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr1, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr2, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr3, meta, err, done); + + /* Restore other svm_vcpu struct fields */ + + /* Restore NEXTRIP field */ + SNAPSHOT_VAR_OR_LEAVE(vcpu->nextrip, meta, err, done); + + /* Restore lastcpu field */ + SNAPSHOT_VAR_OR_LEAVE(vcpu->lastcpu, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->dirty, meta, err, done); + + /* Restore EPTGEN field - EPT is Extended Page Table */ + SNAPSHOT_VAR_OR_LEAVE(vcpu->eptgen, meta, err, done); + + SNAPSHOT_VAR_OR_LEAVE(vcpu->asid.gen, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->asid.num, meta, err, done); + + SNAPSHOT_BUF_OR_LEAVE(&vcpu->mtrr, sizeof(vcpu->mtrr), meta, err, done); + + /* Set all caches dirty */ + if (meta->op == VM_SNAPSHOT_RESTORE) + svm_set_dirty(vcpu, 0xffffffff); + +done: + return (err); +} + +static int +svm_restore_tsc(void *vcpui, uint64_t offset) +{ + struct svm_vcpu *vcpu = vcpui; + + svm_set_tsc_offset(vcpu, offset); + + return (0); +} +#endif + +const struct vmm_ops vmm_ops_amd = { + .modinit = svm_modinit, + .modcleanup = svm_modcleanup, + .modresume = svm_modresume, + .modsuspend = svm_modsuspend, + .init = svm_init, + .run = svm_run, + .cleanup = svm_cleanup, + .vcpu_init = svm_vcpu_init, + .vcpu_cleanup = svm_vcpu_cleanup, + .getreg = svm_getreg, + .setreg = svm_setreg, + .getdesc = svm_getdesc, + .setdesc = svm_setdesc, + .getcap = svm_getcap, + .setcap = svm_setcap, + .vmspace_alloc = svm_vmspace_alloc, + .vmspace_free = svm_vmspace_free, + .vlapic_init = svm_vlapic_init, + .vlapic_cleanup = svm_vlapic_cleanup, +#ifdef BHYVE_SNAPSHOT + .vcpu_snapshot = svm_vcpu_snapshot, + .restore_tsc = svm_restore_tsc, +#endif +}; diff --git a/sys/amd64/vmm/amd/svm.h b/sys/amd64/vmm/amd/svm.h new file mode 100644 index 000000000000..16459506832a --- /dev/null +++ b/sys/amd64/vmm/amd/svm.h @@ -0,0 +1,73 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights 
reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _SVM_H_ +#define _SVM_H_ + +struct pcpu; +struct svm_softc; +struct svm_vcpu; + +/* + * Guest register state that is saved outside the VMCB. + */ +struct svm_regctx { + register_t sctx_rbp; + register_t sctx_rbx; + register_t sctx_rcx; + register_t sctx_rdx; + register_t sctx_rdi; + register_t sctx_rsi; + register_t sctx_r8; + register_t sctx_r9; + register_t sctx_r10; + register_t sctx_r11; + register_t sctx_r12; + register_t sctx_r13; + register_t sctx_r14; + register_t sctx_r15; + register_t sctx_dr0; + register_t sctx_dr1; + register_t sctx_dr2; + register_t sctx_dr3; + + register_t host_dr0; + register_t host_dr1; + register_t host_dr2; + register_t host_dr3; + register_t host_dr6; + register_t host_dr7; + uint64_t host_debugctl; +}; + +void svm_launch(uint64_t pa, struct svm_regctx *gctx, struct pcpu *pcpu); +#ifdef BHYVE_SNAPSHOT +void svm_set_tsc_offset(struct svm_vcpu *vcpu, uint64_t offset); +#endif + +#endif /* _SVM_H_ */ diff --git a/sys/amd64/vmm/amd/svm_genassym.c b/sys/amd64/vmm/amd/svm_genassym.c new file mode 100644 index 000000000000..21d008190028 --- /dev/null +++ b/sys/amd64/vmm/amd/svm_genassym.c @@ -0,0 +1,49 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/assym.h> +#include <x86/specialreg.h> + +#include "svm.h" + +ASSYM(SCTX_RBX, offsetof(struct svm_regctx, sctx_rbx)); +ASSYM(SCTX_RCX, offsetof(struct svm_regctx, sctx_rcx)); +ASSYM(SCTX_RBP, offsetof(struct svm_regctx, sctx_rbp)); +ASSYM(SCTX_RDX, offsetof(struct svm_regctx, sctx_rdx)); +ASSYM(SCTX_RDI, offsetof(struct svm_regctx, sctx_rdi)); +ASSYM(SCTX_RSI, offsetof(struct svm_regctx, sctx_rsi)); +ASSYM(SCTX_R8, offsetof(struct svm_regctx, sctx_r8)); +ASSYM(SCTX_R9, offsetof(struct svm_regctx, sctx_r9)); +ASSYM(SCTX_R10, offsetof(struct svm_regctx, sctx_r10)); +ASSYM(SCTX_R11, offsetof(struct svm_regctx, sctx_r11)); +ASSYM(SCTX_R12, offsetof(struct svm_regctx, sctx_r12)); +ASSYM(SCTX_R13, offsetof(struct svm_regctx, sctx_r13)); +ASSYM(SCTX_R14, offsetof(struct svm_regctx, sctx_r14)); +ASSYM(SCTX_R15, offsetof(struct svm_regctx, sctx_r15)); +ASSYM(MSR_GSBASE, MSR_GSBASE); diff --git a/sys/amd64/vmm/amd/svm_msr.c b/sys/amd64/vmm/amd/svm_msr.c new file mode 100644 index 000000000000..1f7be6029e64 --- /dev/null +++ b/sys/amd64/vmm/amd/svm_msr.c @@ -0,0 +1,185 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2014, Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +#include "opt_bhyve_snapshot.h" + +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/systm.h> + +#include <machine/cpufunc.h> +#include <machine/specialreg.h> +#include <machine/vmm.h> + +#include "svm.h" +#include "vmcb.h" +#include "svm_softc.h" +#include "svm_msr.h" + +#ifndef MSR_AMDK8_IPM +#define MSR_AMDK8_IPM 0xc0010055 +#endif + +enum { + IDX_MSR_LSTAR, + IDX_MSR_CSTAR, + IDX_MSR_STAR, + IDX_MSR_SF_MASK, + HOST_MSR_NUM /* must be the last enumeration */ +}; + +static uint64_t host_msrs[HOST_MSR_NUM]; + +void +svm_msr_init(void) +{ + /* + * It is safe to cache the values of the following MSRs because they + * don't change based on curcpu, curproc or curthread. + */ + host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); +} + +void +svm_msr_guest_init(struct svm_softc *sc, struct svm_vcpu *vcpu) +{ + /* + * All the MSRs accessible to the guest are either saved/restored by + * hardware on every #VMEXIT/VMRUN (e.g., G_PAT) or are saved/restored + * by VMSAVE/VMLOAD (e.g., MSR_GSBASE). + * + * There are no guest MSRs that are saved/restored "by hand" so nothing + * more to do here. + */ + return; +} + +void +svm_msr_guest_enter(struct svm_vcpu *vcpu) +{ + /* + * Save host MSRs (if any) and restore guest MSRs (if any). + */ +} + +void +svm_msr_guest_exit(struct svm_vcpu *vcpu) +{ + /* + * Save guest MSRs (if any) and restore host MSRs. + */ + wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]); + wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]); + wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]); + wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]); + + /* MSR_KGSBASE will be restored on the way back to userspace */ +} + +int +svm_rdmsr(struct svm_vcpu *vcpu, u_int num, uint64_t *result, bool *retu) +{ + int error = 0; + + switch (num) { + case MSR_MCG_CAP: + case MSR_MCG_STATUS: + *result = 0; + break; + case MSR_MTRRcap: + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: + if (vm_rdmtrr(&vcpu->mtrr, num, result) != 0) { + vm_inject_gp(vcpu->vcpu); + } + break; + case MSR_SYSCFG: + case MSR_AMDK8_IPM: + case MSR_EXTFEATURES: + *result = 0; + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +int +svm_wrmsr(struct svm_vcpu *vcpu, u_int num, uint64_t val, bool *retu) +{ + int error = 0; + + switch (num) { + case MSR_MCG_CAP: + case MSR_MCG_STATUS: + break; /* ignore writes */ + case MSR_MTRRcap: + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: + if (vm_wrmtrr(&vcpu->mtrr, num, val) != 0) { + vm_inject_gp(vcpu->vcpu); + } + break; + case MSR_SYSCFG: + break; /* Ignore writes */ + case MSR_AMDK8_IPM: + /* + * Ignore writes to the "Interrupt Pending Message" MSR. + */ + break; + case MSR_K8_UCODE_UPDATE: + /* + * Ignore writes to microcode update register. 
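+ * The guest cannot be allowed to patch host microcode, so the write + * is silently dropped.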
+ */ + break; +#ifdef BHYVE_SNAPSHOT + case MSR_TSC: + svm_set_tsc_offset(vcpu, val - rdtsc()); + break; +#endif + case MSR_EXTFEATURES: + break; + default: + error = EINVAL; + break; + } + + return (error); +} diff --git a/sys/amd64/vmm/amd/svm_msr.h b/sys/amd64/vmm/amd/svm_msr.h new file mode 100644 index 000000000000..0242e508cd0a --- /dev/null +++ b/sys/amd64/vmm/amd/svm_msr.h @@ -0,0 +1,43 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2014 Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _SVM_MSR_H_ +#define _SVM_MSR_H_ + +struct svm_softc; +struct svm_vcpu; + +void svm_msr_init(void); +void svm_msr_guest_init(struct svm_softc *sc, struct svm_vcpu *vcpu); +void svm_msr_guest_enter(struct svm_vcpu *vcpu); +void svm_msr_guest_exit(struct svm_vcpu *vcpu); + +int svm_wrmsr(struct svm_vcpu *vcpu, u_int num, uint64_t val, bool *retu); +int svm_rdmsr(struct svm_vcpu *vcpu, u_int num, uint64_t *result, bool *retu); + +#endif /* _SVM_MSR_H_ */ diff --git a/sys/amd64/vmm/amd/svm_softc.h b/sys/amd64/vmm/amd/svm_softc.h new file mode 100644 index 000000000000..0fd2303a7242 --- /dev/null +++ b/sys/amd64/vmm/amd/svm_softc.h @@ -0,0 +1,127 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _SVM_SOFTC_H_ +#define _SVM_SOFTC_H_ + +#include "x86.h" + +#define SVM_IO_BITMAP_SIZE (3 * PAGE_SIZE) +#define SVM_MSR_BITMAP_SIZE (2 * PAGE_SIZE) + +struct svm_softc; + +struct dbg { + uint32_t rflags_tf; /* saved RFLAGS.TF value when single-stepping a vcpu */ + bool popf_sstep; /* indicates that we've stepped over popf */ + bool pushf_sstep; /* indicates that we've stepped over pushf */ +}; + +struct asid { + uint64_t gen; /* range is [1, ~0UL] */ + uint32_t num; /* range is [1, nasid - 1] */ +}; + +struct svm_vcpu { + struct svm_softc *sc; + struct vcpu *vcpu; + struct vmcb *vmcb; /* hardware saved vcpu context */ + struct svm_regctx swctx; /* software saved vcpu context */ + uint64_t vmcb_pa; /* VMCB physical address */ + uint64_t nextrip; /* next instruction to be executed by guest */ + int lastcpu; /* host cpu that the vcpu last ran on */ + uint32_t dirty; /* state cache bits that must be cleared */ + long eptgen; /* pmap->pm_eptgen when the vcpu last ran */ + struct asid asid; + struct vm_mtrr mtrr; + int vcpuid; + struct dbg dbg; + int caps; /* optional vm capabilities */ +}; + +/* + * SVM softc, one per virtual machine. + */ +struct svm_softc { + vm_paddr_t nptp; /* nested page table */ + uint8_t *iopm_bitmap; /* shared by all vcpus */ + uint8_t *msr_bitmap; /* shared by all vcpus */ + struct vm *vm; +}; + +#define SVM_CTR0(vcpu, format) \ + VCPU_CTR0((vcpu)->sc->vm, (vcpu)->vcpuid, format) + +#define SVM_CTR1(vcpu, format, p1) \ + VCPU_CTR1((vcpu)->sc->vm, (vcpu)->vcpuid, format, p1) + +#define SVM_CTR2(vcpu, format, p1, p2) \ + VCPU_CTR2((vcpu)->sc->vm, (vcpu)->vcpuid, format, p1, p2) + +#define SVM_CTR3(vcpu, format, p1, p2, p3) \ + VCPU_CTR3((vcpu)->sc->vm, (vcpu)->vcpuid, format, p1, p2, p3) + +#define SVM_CTR4(vcpu, format, p1, p2, p3, p4) \ + VCPU_CTR4((vcpu)->sc->vm, (vcpu)->vcpuid, format, p1, p2, p3, p4) + +static __inline struct vmcb * +svm_get_vmcb(struct svm_vcpu *vcpu) +{ + + return (vcpu->vmcb); +} + +static __inline struct vmcb_state * +svm_get_vmcb_state(struct svm_vcpu *vcpu) +{ + + return (&vcpu->vmcb->state); +} + +static __inline struct vmcb_ctrl * +svm_get_vmcb_ctrl(struct svm_vcpu *vcpu) +{ + + return (&vcpu->vmcb->ctrl); +} + +static __inline struct svm_regctx * +svm_get_guest_regctx(struct svm_vcpu *vcpu) +{ + + return (&vcpu->swctx); +} + +static __inline void +svm_set_dirty(struct svm_vcpu *vcpu, uint32_t dirtybits) +{ + + vcpu->dirty |= dirtybits; +} + +#endif /* _SVM_SOFTC_H_ */ diff --git a/sys/amd64/vmm/amd/svm_support.S b/sys/amd64/vmm/amd/svm_support.S new file mode 100644 index 000000000000..26bf36b98f71 --- /dev/null +++ b/sys/amd64/vmm/amd/svm_support.S @@ -0,0 +1,157 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <machine/asmacros.h> + +#include "svm_assym.h" + +/* + * Be friendly to DTrace FBT's prologue/epilogue pattern matching. + * + * They are also responsible for saving/restoring the host %rbp across VMRUN. + */ +#define VENTER push %rbp ; mov %rsp,%rbp +#define VLEAVE pop %rbp + +/* + * svm_launch(uint64_t vmcb, struct svm_regctx *gctx, struct pcpu *pcpu) + * %rdi: physical address of VMCB + * %rsi: pointer to guest context + * %rdx: pointer to the pcpu data + */ +ENTRY(svm_launch) + VENTER + + /* save pointer to the pcpu data */ + push %rdx + + /* + * Host register state saved across a VMRUN. + * + * All "callee saved registers" except: + * %rsp: because it is preserved by the processor across VMRUN. + * %rbp: because it is saved/restored by the function prologue/epilogue. + */ + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + /* Save the physical address of the VMCB in %rax */ + movq %rdi, %rax + + push %rsi /* push guest context pointer on the stack */ + + /* + * Restore guest state. + */ + movq SCTX_R8(%rsi), %r8 + movq SCTX_R9(%rsi), %r9 + movq SCTX_R10(%rsi), %r10 + movq SCTX_R11(%rsi), %r11 + movq SCTX_R12(%rsi), %r12 + movq SCTX_R13(%rsi), %r13 + movq SCTX_R14(%rsi), %r14 + movq SCTX_R15(%rsi), %r15 + movq SCTX_RBP(%rsi), %rbp + movq SCTX_RBX(%rsi), %rbx + movq SCTX_RCX(%rsi), %rcx + movq SCTX_RDX(%rsi), %rdx + movq SCTX_RDI(%rsi), %rdi + movq SCTX_RSI(%rsi), %rsi /* %rsi must be restored last */ + + vmload %rax + vmrun %rax + vmsave %rax + + pop %rax /* pop guest context pointer from the stack */ + + /* + * Save guest state. + */ + movq %r8, SCTX_R8(%rax) + movq %r9, SCTX_R9(%rax) + movq %r10, SCTX_R10(%rax) + movq %r11, SCTX_R11(%rax) + movq %r12, SCTX_R12(%rax) + movq %r13, SCTX_R13(%rax) + movq %r14, SCTX_R14(%rax) + movq %r15, SCTX_R15(%rax) + movq %rbp, SCTX_RBP(%rax) + movq %rbx, SCTX_RBX(%rax) + movq %rcx, SCTX_RCX(%rax) + movq %rdx, SCTX_RDX(%rax) + movq %rdi, SCTX_RDI(%rax) + movq %rsi, SCTX_RSI(%rax) + + /* + * To prevent malicious branch target predictions from + * affecting the host, overwrite all entries in the RSB upon + * exiting a guest. + */ + mov $16, %ecx /* 16 iterations, two calls per loop */ + mov %rsp, %rax +0: call 2f /* create an RSB entry. */ +1: pause + call 1b /* capture rogue speculation. */ +2: call 2f /* create an RSB entry. */ +1: pause + call 1b /* capture rogue speculation. 
*/ +2: sub $1, %ecx + jnz 0b + mov %rax, %rsp + + /* Restore host state */ + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + + /* Restore %GS.base to point to the host's pcpu data */ + pop %rdx + mov %edx, %eax + shr $32, %rdx + mov $MSR_GSBASE, %rcx + wrmsr + + /* + * Clobber the remaining registers with guest contents so they + * can't be misused. + */ + xor %rbp, %rbp + xor %rdi, %rdi + xor %rsi, %rsi + xor %r8, %r8 + xor %r9, %r9 + xor %r10, %r10 + xor %r11, %r11 + + VLEAVE + ret +END(svm_launch) diff --git a/sys/amd64/vmm/amd/vmcb.c b/sys/amd64/vmm/amd/vmcb.c new file mode 100644 index 000000000000..9a1008fa495c --- /dev/null +++ b/sys/amd64/vmm/amd/vmcb.c @@ -0,0 +1,561 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +#include "opt_bhyve_snapshot.h" + +#include <sys/param.h> +#include <sys/systm.h> + +#include <machine/segments.h> +#include <machine/specialreg.h> +#include <machine/vmm.h> +#include <machine/vmm_snapshot.h> + +#include <dev/vmm/vmm_ktr.h> + +#include "vlapic.h" +#include "vmcb.h" +#include "svm.h" +#include "svm_softc.h" + +/* + * The VMCB aka Virtual Machine Control Block is a 4KB aligned page + * in memory that describes the virtual machine. + * + * The VMCB contains: + * - instructions or events in the guest to intercept + * - control bits that modify execution environment of the guest + * - guest processor state (e.g. general purpose registers) + */ + +/* + * Return VMCB segment area. 
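+ *
+ * A minimal usage sketch (illustrative, not from the original source):
+ * callers pass a VM_REG_GUEST_* segment identifier, e.g. VM_REG_GUEST_CS
+ * yields &vmcb->state.cs, and must be prepared for a NULL return on
+ * anything that is not a segment:
+ *
+ *	struct vmcb_segment *seg;
+ *
+ *	seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
+ *	if (seg == NULL)
+ *		return (EINVAL);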
+ */ +static struct vmcb_segment * +vmcb_segptr(struct vmcb *vmcb, int type) +{ + struct vmcb_state *state; + struct vmcb_segment *seg; + + state = &vmcb->state; + + switch (type) { + case VM_REG_GUEST_CS: + seg = &state->cs; + break; + + case VM_REG_GUEST_DS: + seg = &state->ds; + break; + + case VM_REG_GUEST_ES: + seg = &state->es; + break; + + case VM_REG_GUEST_FS: + seg = &state->fs; + break; + + case VM_REG_GUEST_GS: + seg = &state->gs; + break; + + case VM_REG_GUEST_SS: + seg = &state->ss; + break; + + case VM_REG_GUEST_GDTR: + seg = &state->gdt; + break; + + case VM_REG_GUEST_IDTR: + seg = &state->idt; + break; + + case VM_REG_GUEST_LDTR: + seg = &state->ldt; + break; + + case VM_REG_GUEST_TR: + seg = &state->tr; + break; + + default: + seg = NULL; + break; + } + + return (seg); +} + +static int +vmcb_access(struct svm_vcpu *vcpu, int write, int ident, uint64_t *val) +{ + struct vmcb *vmcb; + int off, bytes; + char *ptr; + + vmcb = svm_get_vmcb(vcpu); + off = VMCB_ACCESS_OFFSET(ident); + bytes = VMCB_ACCESS_BYTES(ident); + + if ((off + bytes) >= sizeof (struct vmcb)) + return (EINVAL); + + ptr = (char *)vmcb; + + if (!write) + *val = 0; + + switch (bytes) { + case 8: + case 4: + case 2: + case 1: + if (write) + memcpy(ptr + off, val, bytes); + else + memcpy(val, ptr + off, bytes); + break; + default: + SVM_CTR1(vcpu, "Invalid size %d for VMCB access: %d", bytes); + return (EINVAL); + } + + /* Invalidate all VMCB state cached by h/w. */ + if (write) + svm_set_dirty(vcpu, 0xffffffff); + + return (0); +} + +/* + * Read from segment selector, control and general purpose register of VMCB. + */ +int +vmcb_read(struct svm_vcpu *vcpu, int ident, uint64_t *retval) +{ + struct vmcb *vmcb; + struct vmcb_state *state; + struct vmcb_segment *seg; + int err; + + vmcb = svm_get_vmcb(vcpu); + state = &vmcb->state; + err = 0; + + if (VMCB_ACCESS_OK(ident)) + return (vmcb_access(vcpu, 0, ident, retval)); + + switch (ident) { + case VM_REG_GUEST_CR0: + *retval = state->cr0; + break; + + case VM_REG_GUEST_CR2: + *retval = state->cr2; + break; + + case VM_REG_GUEST_CR3: + *retval = state->cr3; + break; + + case VM_REG_GUEST_CR4: + *retval = state->cr4; + break; + + case VM_REG_GUEST_DR6: + *retval = state->dr6; + break; + + case VM_REG_GUEST_DR7: + *retval = state->dr7; + break; + + case VM_REG_GUEST_EFER: + *retval = state->efer; + break; + + case VM_REG_GUEST_RAX: + *retval = state->rax; + break; + + case VM_REG_GUEST_RFLAGS: + *retval = state->rflags; + break; + + case VM_REG_GUEST_RIP: + *retval = state->rip; + break; + + case VM_REG_GUEST_RSP: + *retval = state->rsp; + break; + + case VM_REG_GUEST_CS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_SS: + case VM_REG_GUEST_LDTR: + case VM_REG_GUEST_TR: + seg = vmcb_segptr(vmcb, ident); + KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB", + __func__, ident)); + *retval = seg->selector; + break; + + case VM_REG_GUEST_FS_BASE: + case VM_REG_GUEST_GS_BASE: + seg = vmcb_segptr(vmcb, ident == VM_REG_GUEST_FS_BASE ? 
+ VM_REG_GUEST_FS : VM_REG_GUEST_GS); + KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB", + __func__, ident)); + *retval = seg->base; + break; + case VM_REG_GUEST_KGS_BASE: + *retval = state->kernelgsbase; + break; + + case VM_REG_GUEST_TPR: + *retval = vlapic_get_cr8(vm_lapic(vcpu->vcpu)); + break; + + case VM_REG_GUEST_GDTR: + case VM_REG_GUEST_IDTR: + /* GDTR and IDTR don't have segment selectors */ + err = EINVAL; + break; + default: + err = EINVAL; + break; + } + + return (err); +} + +/* + * Write to segment selector, control and general purpose register of VMCB. + */ +int +vmcb_write(struct svm_vcpu *vcpu, int ident, uint64_t val) +{ + struct vmcb *vmcb; + struct vmcb_state *state; + struct vmcb_segment *seg; + int err, dirtyseg; + + vmcb = svm_get_vmcb(vcpu); + state = &vmcb->state; + dirtyseg = 0; + err = 0; + + if (VMCB_ACCESS_OK(ident)) + return (vmcb_access(vcpu, 1, ident, &val)); + + switch (ident) { + case VM_REG_GUEST_CR0: + state->cr0 = val; + svm_set_dirty(vcpu, VMCB_CACHE_CR); + break; + + case VM_REG_GUEST_CR2: + state->cr2 = val; + svm_set_dirty(vcpu, VMCB_CACHE_CR2); + break; + + case VM_REG_GUEST_CR3: + state->cr3 = val; + svm_set_dirty(vcpu, VMCB_CACHE_CR); + break; + + case VM_REG_GUEST_CR4: + state->cr4 = val; + svm_set_dirty(vcpu, VMCB_CACHE_CR); + break; + + case VM_REG_GUEST_DR6: + state->dr6 = val; + svm_set_dirty(vcpu, VMCB_CACHE_DR); + break; + + case VM_REG_GUEST_DR7: + state->dr7 = val; + svm_set_dirty(vcpu, VMCB_CACHE_DR); + break; + + case VM_REG_GUEST_EFER: + /* EFER_SVM must always be set when the guest is executing */ + state->efer = val | EFER_SVM; + svm_set_dirty(vcpu, VMCB_CACHE_CR); + break; + + case VM_REG_GUEST_RAX: + state->rax = val; + break; + + case VM_REG_GUEST_RFLAGS: + state->rflags = val; + break; + + case VM_REG_GUEST_RIP: + state->rip = val; + break; + + case VM_REG_GUEST_RSP: + state->rsp = val; + break; + + case VM_REG_GUEST_CS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_SS: + dirtyseg = 1; /* FALLTHROUGH */ + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_LDTR: + case VM_REG_GUEST_TR: + seg = vmcb_segptr(vmcb, ident); + KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB", + __func__, ident)); + seg->selector = val; + if (dirtyseg) + svm_set_dirty(vcpu, VMCB_CACHE_SEG); + break; + + case VM_REG_GUEST_GDTR: + case VM_REG_GUEST_IDTR: + /* GDTR and IDTR don't have segment selectors */ + err = EINVAL; + break; + default: + err = EINVAL; + break; + } + + return (err); +} + +int +vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg2) +{ + struct vmcb_segment *seg; + + seg = vmcb_segptr(vmcb, ident); + if (seg != NULL) { + bcopy(seg, seg2, sizeof(struct vmcb_segment)); + return (0); + } else { + return (EINVAL); + } +} + +int +vmcb_setdesc(struct svm_vcpu *vcpu, int reg, struct seg_desc *desc) +{ + struct vmcb *vmcb; + struct vmcb_segment *seg; + uint16_t attrib; + + vmcb = svm_get_vmcb(vcpu); + + seg = vmcb_segptr(vmcb, reg); + KASSERT(seg != NULL, ("%s: invalid segment descriptor %d", + __func__, reg)); + + seg->base = desc->base; + seg->limit = desc->limit; + if (reg != VM_REG_GUEST_GDTR && reg != VM_REG_GUEST_IDTR) { + /* + * Map seg_desc access to VMCB attribute format. + * + * SVM uses the 'P' bit in the segment attributes to indicate a + * NULL segment so clear it if the segment is marked unusable. 
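+ *
+ * Worked example (illustrative, the access value is hypothetical): a
+ * present 64-bit code segment with seg_desc access rights 0x209b
+ * (type=0xb, S=1, DPL=0, P=1, L=1) maps to the 12-bit VMCB attrib
+ * 0x29b, i.e. the low byte plus VMCB_CS_ATTRIB_L in bit 9:
+ *
+ *	((0x209b & 0xf000) >> 4) | (0x209b & 0xff) = 0x200 | 0x9b = 0x29b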
+ */ + attrib = ((desc->access & 0xF000) >> 4) | (desc->access & 0xFF); + if (SEG_DESC_UNUSABLE(desc->access)) { + attrib &= ~0x80; + } + seg->attrib = attrib; + } + + SVM_CTR4(vcpu, "Setting desc %d: base (%#lx), limit (%#x), " + "attrib (%#x)", reg, seg->base, seg->limit, seg->attrib); + + switch (reg) { + case VM_REG_GUEST_CS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_SS: + svm_set_dirty(vcpu, VMCB_CACHE_SEG); + break; + case VM_REG_GUEST_GDTR: + case VM_REG_GUEST_IDTR: + svm_set_dirty(vcpu, VMCB_CACHE_DT); + break; + default: + break; + } + + return (0); +} + +int +vmcb_getdesc(struct svm_vcpu *vcpu, int reg, struct seg_desc *desc) +{ + struct vmcb *vmcb; + struct vmcb_segment *seg; + + vmcb = svm_get_vmcb(vcpu); + seg = vmcb_segptr(vmcb, reg); + KASSERT(seg != NULL, ("%s: invalid segment descriptor %d", + __func__, reg)); + + desc->base = seg->base; + desc->limit = seg->limit; + desc->access = 0; + + if (reg != VM_REG_GUEST_GDTR && reg != VM_REG_GUEST_IDTR) { + /* Map seg_desc access to VMCB attribute format */ + desc->access = ((seg->attrib & 0xF00) << 4) | + (seg->attrib & 0xFF); + + /* + * VT-x uses bit 16 to indicate a segment that has been loaded + * with a NULL selector (aka unusable). The 'desc->access' + * field is interpreted in the VT-x format by the + * processor-independent code. + * + * SVM uses the 'P' bit to convey the same information so + * convert it into the VT-x format. For more details refer to + * section "Segment State in the VMCB" in APMv2. + */ + if (reg != VM_REG_GUEST_CS && reg != VM_REG_GUEST_TR) { + if ((desc->access & 0x80) == 0) + desc->access |= 0x10000; /* Unusable segment */ + } + } + + return (0); +} + +#ifdef BHYVE_SNAPSHOT +int +vmcb_getany(struct svm_vcpu *vcpu, int ident, uint64_t *val) +{ + int error = 0; + + if (ident >= VM_REG_LAST) { + error = EINVAL; + goto err; + } + + error = vmcb_read(vcpu, ident, val); + +err: + return (error); +} + +int +vmcb_setany(struct svm_vcpu *vcpu, int ident, uint64_t val) +{ + int error = 0; + + if (ident >= VM_REG_LAST) { + error = EINVAL; + goto err; + } + + error = vmcb_write(vcpu, ident, val); + +err: + return (error); +} + +int +vmcb_snapshot_desc(struct svm_vcpu *vcpu, int reg, + struct vm_snapshot_meta *meta) +{ + int ret; + struct seg_desc desc; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = vmcb_getdesc(vcpu, reg, &desc); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done); + + ret = vmcb_setdesc(vcpu, reg, &desc); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} + +int +vmcb_snapshot_any(struct svm_vcpu *vcpu, int ident, + struct vm_snapshot_meta *meta) +{ + int ret; + uint64_t val; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = vmcb_getany(vcpu, ident, &val); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + + ret = vmcb_setany(vcpu, ident, val); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} +#endif diff --git a/sys/amd64/vmm/amd/vmcb.h b/sys/amd64/vmm/amd/vmcb.h new file mode 100644 index 
000000000000..09150fc26a72 --- /dev/null +++ b/sys/amd64/vmm/amd/vmcb.h @@ -0,0 +1,370 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VMCB_H_ +#define _VMCB_H_ + +#define BIT(n) (1ULL << n) + +/* + * Secure Virtual Machine: AMD64 Programmer's Manual Vol2, Chapter 15 + * Layout of VMCB: AMD64 Programmer's Manual Vol2, Appendix B + */ + +/* vmcb_ctrl->intercept[] array indices */ +#define VMCB_CR_INTCPT 0 +#define VMCB_DR_INTCPT 1 +#define VMCB_EXC_INTCPT 2 +#define VMCB_CTRL1_INTCPT 3 +#define VMCB_CTRL2_INTCPT 4 + +/* intercept[VMCB_CTRL1_INTCPT] fields */ +#define VMCB_INTCPT_INTR BIT(0) +#define VMCB_INTCPT_NMI BIT(1) +#define VMCB_INTCPT_SMI BIT(2) +#define VMCB_INTCPT_INIT BIT(3) +#define VMCB_INTCPT_VINTR BIT(4) +#define VMCB_INTCPT_CR0_WRITE BIT(5) +#define VMCB_INTCPT_IDTR_READ BIT(6) +#define VMCB_INTCPT_GDTR_READ BIT(7) +#define VMCB_INTCPT_LDTR_READ BIT(8) +#define VMCB_INTCPT_TR_READ BIT(9) +#define VMCB_INTCPT_IDTR_WRITE BIT(10) +#define VMCB_INTCPT_GDTR_WRITE BIT(11) +#define VMCB_INTCPT_LDTR_WRITE BIT(12) +#define VMCB_INTCPT_TR_WRITE BIT(13) +#define VMCB_INTCPT_RDTSC BIT(14) +#define VMCB_INTCPT_RDPMC BIT(15) +#define VMCB_INTCPT_PUSHF BIT(16) +#define VMCB_INTCPT_POPF BIT(17) +#define VMCB_INTCPT_CPUID BIT(18) +#define VMCB_INTCPT_RSM BIT(19) +#define VMCB_INTCPT_IRET BIT(20) +#define VMCB_INTCPT_INTn BIT(21) +#define VMCB_INTCPT_INVD BIT(22) +#define VMCB_INTCPT_PAUSE BIT(23) +#define VMCB_INTCPT_HLT BIT(24) +#define VMCB_INTCPT_INVLPG BIT(25) +#define VMCB_INTCPT_INVLPGA BIT(26) +#define VMCB_INTCPT_IO BIT(27) +#define VMCB_INTCPT_MSR BIT(28) +#define VMCB_INTCPT_TASK_SWITCH BIT(29) +#define VMCB_INTCPT_FERR_FREEZE BIT(30) +#define VMCB_INTCPT_SHUTDOWN BIT(31) + +/* intercept[VMCB_CTRL2_INTCPT] fields */ +#define VMCB_INTCPT_VMRUN BIT(0) +#define VMCB_INTCPT_VMMCALL BIT(1) +#define VMCB_INTCPT_VMLOAD BIT(2) +#define VMCB_INTCPT_VMSAVE BIT(3) +#define VMCB_INTCPT_STGI BIT(4) +#define VMCB_INTCPT_CLGI BIT(5) +#define VMCB_INTCPT_SKINIT BIT(6) +#define VMCB_INTCPT_RDTSCP BIT(7) +#define VMCB_INTCPT_ICEBP BIT(8) +#define VMCB_INTCPT_WBINVD BIT(9) +#define VMCB_INTCPT_MONITOR BIT(10) +#define VMCB_INTCPT_MWAIT BIT(11) 
+#define VMCB_INTCPT_MWAIT_ARMED BIT(12) +#define VMCB_INTCPT_XSETBV BIT(13) + +/* VMCB TLB control */ +#define VMCB_TLB_FLUSH_NOTHING 0 /* Flush nothing */ +#define VMCB_TLB_FLUSH_ALL 1 /* Flush entire TLB */ +#define VMCB_TLB_FLUSH_GUEST 3 /* Flush all guest entries */ +#define VMCB_TLB_FLUSH_GUEST_NONGLOBAL 7 /* Flush guest non-PG entries */ + +/* VMCB state caching */ +#define VMCB_CACHE_NONE 0 /* No caching */ +#define VMCB_CACHE_I BIT(0) /* Intercept, TSC off, Pause filter */ +#define VMCB_CACHE_IOPM BIT(1) /* I/O and MSR permission */ +#define VMCB_CACHE_ASID BIT(2) /* ASID */ +#define VMCB_CACHE_TPR BIT(3) /* V_TPR to V_INTR_VECTOR */ +#define VMCB_CACHE_NP BIT(4) /* Nested Paging */ +#define VMCB_CACHE_CR BIT(5) /* CR0, CR3, CR4 & EFER */ +#define VMCB_CACHE_DR BIT(6) /* Debug registers */ +#define VMCB_CACHE_DT BIT(7) /* GDT/IDT */ +#define VMCB_CACHE_SEG BIT(8) /* User segments, CPL */ +#define VMCB_CACHE_CR2 BIT(9) /* page fault address */ +#define VMCB_CACHE_LBR BIT(10) /* Last branch */ + +/* VMCB control event injection */ +#define VMCB_EVENTINJ_EC_VALID BIT(11) /* Error Code valid */ +#define VMCB_EVENTINJ_VALID BIT(31) /* Event valid */ + +/* Event types that can be injected */ +#define VMCB_EVENTINJ_TYPE_INTR 0 +#define VMCB_EVENTINJ_TYPE_NMI 2 +#define VMCB_EVENTINJ_TYPE_EXCEPTION 3 +#define VMCB_EVENTINJ_TYPE_INTn 4 + +/* VMCB exit code, APM vol2 Appendix C */ +#define VMCB_EXIT_MC 0x52 +#define VMCB_EXIT_INTR 0x60 +#define VMCB_EXIT_NMI 0x61 +#define VMCB_EXIT_VINTR 0x64 +#define VMCB_EXIT_PUSHF 0x70 +#define VMCB_EXIT_POPF 0x71 +#define VMCB_EXIT_CPUID 0x72 +#define VMCB_EXIT_IRET 0x74 +#define VMCB_EXIT_INVD 0x76 +#define VMCB_EXIT_PAUSE 0x77 +#define VMCB_EXIT_HLT 0x78 +#define VMCB_EXIT_INVLPGA 0x7A +#define VMCB_EXIT_IO 0x7B +#define VMCB_EXIT_MSR 0x7C +#define VMCB_EXIT_SHUTDOWN 0x7F +#define VMCB_EXIT_VMRUN 0x80 +#define VMCB_EXIT_VMMCALL 0x81 +#define VMCB_EXIT_VMLOAD 0x82 +#define VMCB_EXIT_VMSAVE 0x83 +#define VMCB_EXIT_STGI 0x84 +#define VMCB_EXIT_CLGI 0x85 +#define VMCB_EXIT_SKINIT 0x86 +#define VMCB_EXIT_ICEBP 0x88 +#define VMCB_EXIT_WBINVD 0x89 +#define VMCB_EXIT_MONITOR 0x8A +#define VMCB_EXIT_MWAIT 0x8B +#define VMCB_EXIT_NPF 0x400 +#define VMCB_EXIT_INVALID -1 + +/* + * Nested page fault. + * Bit definitions to decode EXITINFO1. + */ +#define VMCB_NPF_INFO1_P BIT(0) /* Nested page present. */ +#define VMCB_NPF_INFO1_W BIT(1) /* Access was write. */ +#define VMCB_NPF_INFO1_U BIT(2) /* Access was user access. */ +#define VMCB_NPF_INFO1_RSV BIT(3) /* Reserved bits present. */ +#define VMCB_NPF_INFO1_ID BIT(4) /* Code read. */ + +#define VMCB_NPF_INFO1_GPA BIT(32) /* Guest physical address. */ +#define VMCB_NPF_INFO1_GPT BIT(33) /* Guest page table. */ + +/* + * EXITINTINFO, Interrupt exit info for all intercepts. + * Section 15.7.2, Intercepts during IDT Interrupt Delivery. + */ +#define VMCB_EXITINTINFO_VECTOR(x) ((x) & 0xFF) +#define VMCB_EXITINTINFO_TYPE(x) (((x) >> 8) & 0x7) +#define VMCB_EXITINTINFO_EC_VALID(x) (((x) & BIT(11)) ? 1 : 0) +#define VMCB_EXITINTINFO_VALID(x) (((x) & BIT(31)) ? 1 : 0) +#define VMCB_EXITINTINFO_EC(x) (((x) >> 32) & 0xFFFFFFFF) + +/* Offset of various VMCB fields. 
*/ +#define VMCB_OFF_CTRL(x) (x) +#define VMCB_OFF_STATE(x) ((x) + 0x400) + +#define VMCB_OFF_CR_INTERCEPT VMCB_OFF_CTRL(0x0) +#define VMCB_OFF_DR_INTERCEPT VMCB_OFF_CTRL(0x4) +#define VMCB_OFF_EXC_INTERCEPT VMCB_OFF_CTRL(0x8) +#define VMCB_OFF_INST1_INTERCEPT VMCB_OFF_CTRL(0xC) +#define VMCB_OFF_INST2_INTERCEPT VMCB_OFF_CTRL(0x10) +#define VMCB_OFF_PAUSE_FILTHRESH VMCB_OFF_CTRL(0x3C) +#define VMCB_OFF_PAUSE_FILCNT VMCB_OFF_CTRL(0x3E) +#define VMCB_OFF_IO_PERM VMCB_OFF_CTRL(0x40) +#define VMCB_OFF_MSR_PERM VMCB_OFF_CTRL(0x48) +#define VMCB_OFF_TSC_OFFSET VMCB_OFF_CTRL(0x50) +#define VMCB_OFF_ASID VMCB_OFF_CTRL(0x58) +#define VMCB_OFF_TLB_CTRL VMCB_OFF_CTRL(0x5C) +#define VMCB_OFF_VIRQ VMCB_OFF_CTRL(0x60) +#define VMCB_OFF_EXIT_REASON VMCB_OFF_CTRL(0x70) +#define VMCB_OFF_EXITINFO1 VMCB_OFF_CTRL(0x78) +#define VMCB_OFF_EXITINFO2 VMCB_OFF_CTRL(0x80) +#define VMCB_OFF_EXITINTINFO VMCB_OFF_CTRL(0x88) +#define VMCB_OFF_NP_ENABLE VMCB_OFF_CTRL(0x90) +#define VMCB_OFF_AVIC_BAR VMCB_OFF_CTRL(0x98) +#define VMCB_OFF_NPT_BASE VMCB_OFF_CTRL(0xB0) +#define VMCB_OFF_AVIC_PAGE VMCB_OFF_CTRL(0xE0) +#define VMCB_OFF_AVIC_LT VMCB_OFF_CTRL(0xF0) +#define VMCB_OFF_AVIC_PT VMCB_OFF_CTRL(0xF8) + +#define VMCB_OFF_CPL VMCB_OFF_STATE(0xCB) +#define VMCB_OFF_STAR VMCB_OFF_STATE(0x200) +#define VMCB_OFF_LSTAR VMCB_OFF_STATE(0x208) +#define VMCB_OFF_CSTAR VMCB_OFF_STATE(0x210) +#define VMCB_OFF_SFMASK VMCB_OFF_STATE(0x218) +#define VMCB_OFF_KERNELGBASE VMCB_OFF_STATE(0x220) +#define VMCB_OFF_SYSENTER_CS VMCB_OFF_STATE(0x228) +#define VMCB_OFF_SYSENTER_ESP VMCB_OFF_STATE(0x230) +#define VMCB_OFF_SYSENTER_EIP VMCB_OFF_STATE(0x238) +#define VMCB_OFF_GUEST_PAT VMCB_OFF_STATE(0x268) +#define VMCB_OFF_DBGCTL VMCB_OFF_STATE(0x270) +#define VMCB_OFF_BR_FROM VMCB_OFF_STATE(0x278) +#define VMCB_OFF_BR_TO VMCB_OFF_STATE(0x280) +#define VMCB_OFF_INT_FROM VMCB_OFF_STATE(0x288) +#define VMCB_OFF_INT_TO VMCB_OFF_STATE(0x290) + +/* + * Encode the VMCB offset and bytes that we want to read from VMCB. + */ +#define VMCB_ACCESS(o, w) (0x80000000 | (((w) & 0xF) << 16) | \ + ((o) & 0xFFF)) +#define VMCB_ACCESS_OK(v) ((v) & 0x80000000 ) +#define VMCB_ACCESS_BYTES(v) (((v) >> 16) & 0xF) +#define VMCB_ACCESS_OFFSET(v) ((v) & 0xFFF) + +#ifdef _KERNEL + +struct svm_softc; +struct svm_vcpu; +struct vm_snapshot_meta; + +/* VMCB save state area segment format */ +struct vmcb_segment { + uint16_t selector; + uint16_t attrib; + uint32_t limit; + uint64_t base; +} __attribute__ ((__packed__)); +CTASSERT(sizeof(struct vmcb_segment) == 16); + +/* Code segment descriptor attribute in 12 bit format as saved by VMCB. */ +#define VMCB_CS_ATTRIB_L BIT(9) /* Long mode. */ +#define VMCB_CS_ATTRIB_D BIT(10) /* OPerand size bit. */ + +/* + * The VMCB is divided into two areas - the first one contains various + * control bits including the intercept vector and the second one contains + * the guest state. + */ + +/* VMCB control area - padded up to 1024 bytes */ +struct vmcb_ctrl { + uint32_t intercept[5]; /* all intercepts */ + uint8_t pad1[0x28]; /* Offsets 0x14-0x3B are reserved. */ + uint16_t pause_filthresh; /* Offset 0x3C, PAUSE filter threshold */ + uint16_t pause_filcnt; /* Offset 0x3E, PAUSE filter count */ + uint64_t iopm_base_pa; /* 0x40: IOPM_BASE_PA */ + uint64_t msrpm_base_pa; /* 0x48: MSRPM_BASE_PA */ + uint64_t tsc_offset; /* 0x50: TSC_OFFSET */ + uint32_t asid; /* 0x58: Guest ASID */ + uint8_t tlb_ctrl; /* 0x5C: TLB_CONTROL */ + uint8_t pad2[3]; /* 0x5D-0x5F: Reserved. 
*/ + uint8_t v_tpr; /* 0x60: V_TPR, guest CR8 */ + uint8_t v_irq:1; /* Is virtual interrupt pending? */ + uint8_t :7; /* Padding */ + uint8_t v_intr_prio:4; /* 0x62: Priority for virtual interrupt. */ + uint8_t v_ign_tpr:1; + uint8_t :3; + uint8_t v_intr_masking:1; /* Guest and host sharing of RFLAGS. */ + uint8_t :7; + uint8_t v_intr_vector; /* 0x64: Vector for virtual interrupt. */ + uint8_t pad3[3]; /* 0x65-0x67 Reserved. */ + uint64_t intr_shadow:1; /* 0x68: Interrupt shadow, section15.2.1 APM2 */ + uint64_t :63; + uint64_t exitcode; /* 0x70, Exitcode */ + uint64_t exitinfo1; /* 0x78, EXITINFO1 */ + uint64_t exitinfo2; /* 0x80, EXITINFO2 */ + uint64_t exitintinfo; /* 0x88, Interrupt exit value. */ + uint64_t np_enable:1; /* 0x90, Nested paging enable. */ + uint64_t :63; + uint8_t pad4[0x10]; /* 0x98-0xA7 reserved. */ + uint64_t eventinj; /* 0xA8, Event injection. */ + uint64_t n_cr3; /* B0, Nested page table. */ + uint64_t lbr_virt_en:1; /* Enable LBR virtualization. */ + uint64_t :63; + uint32_t vmcb_clean; /* 0xC0: VMCB clean bits for caching */ + uint32_t :32; /* 0xC4: Reserved */ + uint64_t nrip; /* 0xC8: Guest next nRIP. */ + uint8_t inst_len; /* 0xD0: #NPF decode assist */ + uint8_t inst_bytes[15]; + uint8_t padd6[0x320]; +} __attribute__ ((__packed__)); +CTASSERT(sizeof(struct vmcb_ctrl) == 1024); + +struct vmcb_state { + struct vmcb_segment es; + struct vmcb_segment cs; + struct vmcb_segment ss; + struct vmcb_segment ds; + struct vmcb_segment fs; + struct vmcb_segment gs; + struct vmcb_segment gdt; + struct vmcb_segment ldt; + struct vmcb_segment idt; + struct vmcb_segment tr; + uint8_t pad1[0x2b]; /* Reserved: 0xA0-0xCA */ + uint8_t cpl; + uint8_t pad2[4]; + uint64_t efer; + uint8_t pad3[0x70]; /* Reserved: 0xd8-0x147 */ + uint64_t cr4; + uint64_t cr3; /* Guest CR3 */ + uint64_t cr0; + uint64_t dr7; + uint64_t dr6; + uint64_t rflags; + uint64_t rip; + uint8_t pad4[0x58]; /* Reserved: 0x180-0x1D7 */ + uint64_t rsp; + uint8_t pad5[0x18]; /* Reserved 0x1E0-0x1F7 */ + uint64_t rax; + uint64_t star; + uint64_t lstar; + uint64_t cstar; + uint64_t sfmask; + uint64_t kernelgsbase; + uint64_t sysenter_cs; + uint64_t sysenter_esp; + uint64_t sysenter_eip; + uint64_t cr2; + uint8_t pad6[0x20]; + uint64_t g_pat; + uint64_t dbgctl; + uint64_t br_from; + uint64_t br_to; + uint64_t int_from; + uint64_t int_to; + uint8_t pad7[0x968]; /* Reserved up to end of VMCB */ +} __attribute__ ((__packed__)); +CTASSERT(sizeof(struct vmcb_state) == 0xC00); + +struct vmcb { + struct vmcb_ctrl ctrl; + struct vmcb_state state; +} __attribute__ ((__packed__)); +CTASSERT(sizeof(struct vmcb) == PAGE_SIZE); +CTASSERT(offsetof(struct vmcb, state) == 0x400); + +int vmcb_read(struct svm_vcpu *vcpu, int ident, uint64_t *retval); +int vmcb_write(struct svm_vcpu *vcpu, int ident, uint64_t val); +int vmcb_setdesc(struct svm_vcpu *vcpu, int ident, struct seg_desc *desc); +int vmcb_getdesc(struct svm_vcpu *vcpu, int ident, struct seg_desc *desc); +int vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg); +#ifdef BHYVE_SNAPSHOT +int vmcb_getany(struct svm_vcpu *vcpu, int ident, uint64_t *val); +int vmcb_setany(struct svm_vcpu *vcpu, int ident, uint64_t val); +int vmcb_snapshot_desc(struct svm_vcpu *vcpu, int reg, + struct vm_snapshot_meta *meta); +int vmcb_snapshot_any(struct svm_vcpu*vcpu, int ident, + struct vm_snapshot_meta *meta); +#endif + +#endif /* _KERNEL */ +#endif /* _VMCB_H_ */ diff --git a/sys/amd64/vmm/intel/ept.c b/sys/amd64/vmm/intel/ept.c new file mode 100644 index 000000000000..5432c7da5df7 
--- /dev/null +++ b/sys/amd64/vmm/intel/ept.c @@ -0,0 +1,203 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/smp.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_extern.h> + +#include <machine/vmm.h> + +#include "vmx_cpufunc.h" +#include "ept.h" + +#define EPT_SUPPORTS_EXEC_ONLY(cap) ((cap) & (1UL << 0)) +#define EPT_PWL4(cap) ((cap) & (1UL << 6)) +#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14)) +#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */ +#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */ +#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20)) +#define AD_BITS_SUPPORTED(cap) ((cap) & (1UL << 21)) +#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32)) + +#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL +#define INVVPID_ALL_TYPES_SUPPORTED(cap) \ + (((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK) + +#define INVEPT_ALL_TYPES_MASK 0x6000000UL +#define INVEPT_ALL_TYPES_SUPPORTED(cap) \ + (((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK) + +#define EPT_PWLEVELS 4 /* page walk levels */ +#define EPT_ENABLE_AD_BITS (1 << 6) + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, ept, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, + NULL); + +static int ept_enable_ad_bits; + +static int ept_pmap_flags; +SYSCTL_INT(_hw_vmm_ept, OID_AUTO, pmap_flags, CTLFLAG_RD, + &ept_pmap_flags, 0, NULL); + +int +ept_init(int ipinum) +{ + int use_hw_ad_bits, use_superpages, use_exec_only; + uint64_t cap; + + cap = rdmsr(MSR_VMX_EPT_VPID_CAP); + + /* + * Verify that: + * - page walk length is 4 steps + * - extended page tables can be laid out in write-back memory + * - invvpid instruction with all possible types is supported + * - invept instruction with all possible types is supported + */ + if (!EPT_PWL4(cap) || + !EPT_MEMORY_TYPE_WB(cap) || + !INVVPID_SUPPORTED(cap) || + !INVVPID_ALL_TYPES_SUPPORTED(cap) || + !INVEPT_SUPPORTED(cap) || + !INVEPT_ALL_TYPES_SUPPORTED(cap)) + return (EINVAL); + + ept_pmap_flags = ipinum & PMAP_NESTED_IPIMASK; + + use_superpages = 1; + 
TUNABLE_INT_FETCH("hw.vmm.ept.use_superpages", &use_superpages); + if (use_superpages && EPT_PDE_SUPERPAGE(cap)) + ept_pmap_flags |= PMAP_PDE_SUPERPAGE; /* 2MB superpage */ + + use_hw_ad_bits = 1; + TUNABLE_INT_FETCH("hw.vmm.ept.use_hw_ad_bits", &use_hw_ad_bits); + if (use_hw_ad_bits && AD_BITS_SUPPORTED(cap)) + ept_enable_ad_bits = 1; + else + ept_pmap_flags |= PMAP_EMULATE_AD_BITS; + + use_exec_only = 1; + TUNABLE_INT_FETCH("hw.vmm.ept.use_exec_only", &use_exec_only); + if (use_exec_only && EPT_SUPPORTS_EXEC_ONLY(cap)) + ept_pmap_flags |= PMAP_SUPPORTS_EXEC_ONLY; + + return (0); +} + +#if 0 +static void +ept_dump(uint64_t *ptp, int nlevels) +{ + int i, t, tabs; + uint64_t *ptpnext, ptpval; + + if (--nlevels < 0) + return; + + tabs = 3 - nlevels; + for (t = 0; t < tabs; t++) + printf("\t"); + printf("PTP = %p\n", ptp); + + for (i = 0; i < 512; i++) { + ptpval = ptp[i]; + + if (ptpval == 0) + continue; + + for (t = 0; t < tabs; t++) + printf("\t"); + printf("%3d 0x%016lx\n", i, ptpval); + + if (nlevels != 0 && (ptpval & EPT_PG_SUPERPAGE) == 0) { + ptpnext = (uint64_t *) + PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK); + ept_dump(ptpnext, nlevels); + } + } +} +#endif + +static void +invept_single_context(void *arg) +{ + struct invept_desc desc = *(struct invept_desc *)arg; + + invept(INVEPT_TYPE_SINGLE_CONTEXT, desc); +} + +void +ept_invalidate_mappings(u_long eptp) +{ + struct invept_desc invept_desc = { 0 }; + + invept_desc.eptp = eptp; + + smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc); +} + +static int +ept_pinit(pmap_t pmap) +{ + + return (pmap_pinit_type(pmap, PT_EPT, ept_pmap_flags)); +} + +struct vmspace * +ept_vmspace_alloc(vm_offset_t min, vm_offset_t max) +{ + + return (vmspace_alloc(min, max, ept_pinit)); +} + +void +ept_vmspace_free(struct vmspace *vmspace) +{ + + vmspace_free(vmspace); +} + +uint64_t +eptp(uint64_t pml4) +{ + uint64_t eptp_val; + + eptp_val = pml4 | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK; + if (ept_enable_ad_bits) + eptp_val |= EPT_ENABLE_AD_BITS; + + return (eptp_val); +} diff --git a/sys/amd64/vmm/intel/ept.h b/sys/amd64/vmm/intel/ept.h new file mode 100644 index 000000000000..93aa9ca3c041 --- /dev/null +++ b/sys/amd64/vmm/intel/ept.h @@ -0,0 +1,39 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _EPT_H_ +#define _EPT_H_ + +struct vmx; + +int ept_init(int ipinum); +void ept_invalidate_mappings(u_long eptp); +struct vmspace *ept_vmspace_alloc(vm_offset_t min, vm_offset_t max); +void ept_vmspace_free(struct vmspace *vmspace); +uint64_t eptp(uint64_t pml4); +#endif diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c new file mode 100644 index 000000000000..35c2ee5b6eff --- /dev/null +++ b/sys/amd64/vmm/intel/vmcs.c @@ -0,0 +1,643 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "opt_bhyve_snapshot.h" +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/sysctl.h> +#include <sys/systm.h> +#include <sys/pcpu.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/segments.h> +#include <machine/vmm.h> +#include <machine/vmm_snapshot.h> +#include "vmm_host.h" +#include "vmx_cpufunc.h" +#include "vmcs.h" +#include "ept.h" +#include "vmx.h" + +#ifdef DDB +#include <ddb/ddb.h> +#endif + +SYSCTL_DECL(_hw_vmm_vmx); + +static int no_flush_rsb; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, no_flush_rsb, CTLFLAG_RW, + &no_flush_rsb, 0, "Do not flush RSB upon vmexit"); + +static uint64_t +vmcs_fix_regval(uint32_t encoding, uint64_t val) +{ + + switch (encoding) { + case VMCS_GUEST_CR0: + val = vmx_fix_cr0(val); + break; + case VMCS_GUEST_CR4: + val = vmx_fix_cr4(val); + break; + default: + break; + } + return (val); +} + +static uint32_t +vmcs_field_encoding(int ident) +{ + switch (ident) { + case VM_REG_GUEST_CR0: + return (VMCS_GUEST_CR0); + case VM_REG_GUEST_CR3: + return (VMCS_GUEST_CR3); + case VM_REG_GUEST_CR4: + return (VMCS_GUEST_CR4); + case VM_REG_GUEST_DR7: + return (VMCS_GUEST_DR7); + case VM_REG_GUEST_RSP: + return (VMCS_GUEST_RSP); + case VM_REG_GUEST_RIP: + return (VMCS_GUEST_RIP); + case VM_REG_GUEST_RFLAGS: + return (VMCS_GUEST_RFLAGS); + case VM_REG_GUEST_ES: + return (VMCS_GUEST_ES_SELECTOR); + case VM_REG_GUEST_CS: + return (VMCS_GUEST_CS_SELECTOR); + case VM_REG_GUEST_SS: + return (VMCS_GUEST_SS_SELECTOR); + case VM_REG_GUEST_DS: + return (VMCS_GUEST_DS_SELECTOR); + case VM_REG_GUEST_FS: + return (VMCS_GUEST_FS_SELECTOR); + case VM_REG_GUEST_GS: + return (VMCS_GUEST_GS_SELECTOR); + case VM_REG_GUEST_TR: + return (VMCS_GUEST_TR_SELECTOR); + case VM_REG_GUEST_LDTR: + return (VMCS_GUEST_LDTR_SELECTOR); + case VM_REG_GUEST_EFER: + return (VMCS_GUEST_IA32_EFER); + case VM_REG_GUEST_PDPTE0: + return (VMCS_GUEST_PDPTE0); + case VM_REG_GUEST_PDPTE1: + return (VMCS_GUEST_PDPTE1); + case VM_REG_GUEST_PDPTE2: + return (VMCS_GUEST_PDPTE2); + case VM_REG_GUEST_PDPTE3: + return (VMCS_GUEST_PDPTE3); + case VM_REG_GUEST_ENTRY_INST_LENGTH: + return (VMCS_ENTRY_INST_LENGTH); + case VM_REG_GUEST_FS_BASE: + return (VMCS_GUEST_FS_BASE); + case VM_REG_GUEST_GS_BASE: + return (VMCS_GUEST_GS_BASE); + default: + return (-1); + } +} + +static int +vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc) +{ + + switch (seg) { + case VM_REG_GUEST_ES: + *base = VMCS_GUEST_ES_BASE; + *lim = VMCS_GUEST_ES_LIMIT; + *acc = VMCS_GUEST_ES_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_CS: + *base = VMCS_GUEST_CS_BASE; + *lim = VMCS_GUEST_CS_LIMIT; + *acc = VMCS_GUEST_CS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_SS: + *base = VMCS_GUEST_SS_BASE; + *lim = VMCS_GUEST_SS_LIMIT; + *acc = VMCS_GUEST_SS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_DS: + *base = VMCS_GUEST_DS_BASE; + *lim = VMCS_GUEST_DS_LIMIT; + *acc = VMCS_GUEST_DS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_FS: + *base = VMCS_GUEST_FS_BASE; + *lim = VMCS_GUEST_FS_LIMIT; + *acc = VMCS_GUEST_FS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_GS: + *base = VMCS_GUEST_GS_BASE; + *lim = VMCS_GUEST_GS_LIMIT; + *acc = VMCS_GUEST_GS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_TR: + *base = VMCS_GUEST_TR_BASE; + *lim = VMCS_GUEST_TR_LIMIT; + *acc = VMCS_GUEST_TR_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_LDTR: + *base = VMCS_GUEST_LDTR_BASE; + *lim = VMCS_GUEST_LDTR_LIMIT; + *acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_IDTR: + *base = VMCS_GUEST_IDTR_BASE; + *lim = 
VMCS_GUEST_IDTR_LIMIT; + *acc = VMCS_INVALID_ENCODING; + break; + case VM_REG_GUEST_GDTR: + *base = VMCS_GUEST_GDTR_BASE; + *lim = VMCS_GUEST_GDTR_LIMIT; + *acc = VMCS_INVALID_ENCODING; + break; + default: + return (EINVAL); + } + + return (0); +} + +int +vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *retval) +{ + int error; + uint32_t encoding; + + /* + * If we need to get at vmx-specific state in the VMCS we can bypass + * the translation of 'ident' to 'encoding' by simply setting the + * sign bit. As it so happens the upper 16 bits are reserved (i.e + * set to 0) in the encodings for the VMCS so we are free to use the + * sign bit. + */ + if (ident < 0) + encoding = ident & 0x7fffffff; + else + encoding = vmcs_field_encoding(ident); + + if (encoding == (uint32_t)-1) + return (EINVAL); + + if (!running) + VMPTRLD(vmcs); + + error = vmread(encoding, retval); + + if (!running) + VMCLEAR(vmcs); + + return (error); +} + +int +vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val) +{ + int error; + uint32_t encoding; + + if (ident < 0) + encoding = ident & 0x7fffffff; + else + encoding = vmcs_field_encoding(ident); + + if (encoding == (uint32_t)-1) + return (EINVAL); + + val = vmcs_fix_regval(encoding, val); + + if (!running) + VMPTRLD(vmcs); + + error = vmwrite(encoding, val); + + if (!running) + VMCLEAR(vmcs); + + return (error); +} + +int +vmcs_setdesc(struct vmcs *vmcs, int running, int seg, struct seg_desc *desc) +{ + int error; + uint32_t base, limit, access; + + error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); + if (error != 0) + panic("vmcs_setdesc: invalid segment register %d", seg); + + if (!running) + VMPTRLD(vmcs); + if ((error = vmwrite(base, desc->base)) != 0) + goto done; + + if ((error = vmwrite(limit, desc->limit)) != 0) + goto done; + + if (access != VMCS_INVALID_ENCODING) { + if ((error = vmwrite(access, desc->access)) != 0) + goto done; + } +done: + if (!running) + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_getdesc(struct vmcs *vmcs, int running, int seg, struct seg_desc *desc) +{ + int error; + uint32_t base, limit, access; + uint64_t u64; + + error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); + if (error != 0) + panic("vmcs_getdesc: invalid segment register %d", seg); + + if (!running) + VMPTRLD(vmcs); + if ((error = vmread(base, &u64)) != 0) + goto done; + desc->base = u64; + + if ((error = vmread(limit, &u64)) != 0) + goto done; + desc->limit = u64; + + if (access != VMCS_INVALID_ENCODING) { + if ((error = vmread(access, &u64)) != 0) + goto done; + desc->access = u64; + } +done: + if (!running) + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count) +{ + int error; + + VMPTRLD(vmcs); + + /* + * Guest MSRs are saved in the VM-exit MSR-store area. + * Guest MSRs are loaded from the VM-entry MSR-load area. + * Both areas point to the same location in memory. 
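+ *
+ * Illustrative sketch (not from the original source; the array name is
+ * hypothetical): 'g_area' is the physical address of an array of
+ * 16-byte 'struct msr_entry' records (see vmcs.h) and 'g_count' is the
+ * number of entries in it:
+ *
+ *	static struct msr_entry guest_msrs[4];
+ *
+ *	guest_msrs[0].index = MSR_LSTAR;
+ *	guest_msrs[0].val = 0;
+ *	error = vmcs_set_msr_save(vmcs, vtophys(guest_msrs), 1);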
+ */ + if ((error = vmwrite(VMCS_EXIT_MSR_STORE, g_area)) != 0) + goto done; + if ((error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count)) != 0) + goto done; + + if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area)) != 0) + goto done; + if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count)) != 0) + goto done; + + error = 0; +done: + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_init(struct vmcs *vmcs) +{ + int error, codesel, datasel, tsssel; + u_long cr0, cr4, efer; + uint64_t pat, fsbase, idtrbase; + + codesel = vmm_get_host_codesel(); + datasel = vmm_get_host_datasel(); + tsssel = vmm_get_host_tsssel(); + + /* + * Make sure we have a "current" VMCS to work with. + */ + VMPTRLD(vmcs); + + /* Host state */ + + /* Initialize host IA32_PAT MSR */ + pat = vmm_get_host_pat(); + if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0) + goto done; + + /* Load the IA32_EFER MSR */ + efer = vmm_get_host_efer(); + if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0) + goto done; + + /* Load the control registers */ + + cr0 = vmm_get_host_cr0(); + if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0) + goto done; + + cr4 = vmm_get_host_cr4() | CR4_VMXE; + if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0) + goto done; + + /* Load the segment selectors */ + if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_TR_SELECTOR, tsssel)) != 0) + goto done; + + /* + * Load the Base-Address for %fs and idtr. + * + * Note that we exclude %gs, tss and gdtr here because their base + * address is pcpu specific. 
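+ *
+ * The pcpu-specific host fields are instead programmed on the cpu that
+ * will actually run the vcpu, before VM entry.  A rough sketch of that
+ * (illustrative, using the vmm_host.h accessors) is:
+ *
+ *	vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
+ *	vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
+ *	vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());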
+ */ + fsbase = vmm_get_host_fsbase(); + if ((error = vmwrite(VMCS_HOST_FS_BASE, fsbase)) != 0) + goto done; + + idtrbase = vmm_get_host_idtrbase(); + if ((error = vmwrite(VMCS_HOST_IDTR_BASE, idtrbase)) != 0) + goto done; + + /* instruction pointer */ + if (no_flush_rsb) { + if ((error = vmwrite(VMCS_HOST_RIP, + (u_long)vmx_exit_guest)) != 0) + goto done; + } else { + if ((error = vmwrite(VMCS_HOST_RIP, + (u_long)vmx_exit_guest_flush_rsb)) != 0) + goto done; + } + + /* link pointer */ + if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0) + goto done; +done: + VMCLEAR(vmcs); + return (error); +} + +#ifdef BHYVE_SNAPSHOT +int +vmcs_getany(struct vmcs *vmcs, int running, int ident, uint64_t *val) +{ + int error; + + if (!running) + VMPTRLD(vmcs); + + error = vmread(ident, val); + + if (!running) + VMCLEAR(vmcs); + + return (error); +} + +int +vmcs_setany(struct vmcs *vmcs, int running, int ident, uint64_t val) +{ + int error; + + if (!running) + VMPTRLD(vmcs); + + error = vmwrite(ident, val); + + if (!running) + VMCLEAR(vmcs); + + return (error); +} + +int +vmcs_snapshot_reg(struct vmcs *vmcs, int running, int ident, + struct vm_snapshot_meta *meta) +{ + int ret; + uint64_t val; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = vmcs_getreg(vmcs, running, ident, &val); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + + ret = vmcs_setreg(vmcs, running, ident, val); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} + +int +vmcs_snapshot_desc(struct vmcs *vmcs, int running, int seg, + struct vm_snapshot_meta *meta) +{ + int ret; + struct seg_desc desc; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = vmcs_getdesc(vmcs, running, seg, &desc); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done); + + ret = vmcs_setdesc(vmcs, running, seg, &desc); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} + +int +vmcs_snapshot_any(struct vmcs *vmcs, int running, int ident, + struct vm_snapshot_meta *meta) +{ + int ret; + uint64_t val; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = vmcs_getany(vmcs, running, ident, &val); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + + ret = vmcs_setany(vmcs, running, ident, val); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} +#endif + +#ifdef DDB +extern int vmxon_enabled[]; + +DB_SHOW_COMMAND(vmcs, db_show_vmcs) +{ + uint64_t cur_vmcs, val; + uint32_t exit; + + if (!vmxon_enabled[curcpu]) { + db_printf("VMX not enabled\n"); + return; + } + + if (have_addr) { + db_printf("Only current VMCS supported\n"); + return; + } + + vmptrst(&cur_vmcs); + if (cur_vmcs == VMCS_INITIAL) { + db_printf("No current VM context\n"); + return; + } + db_printf("VMCS: %jx\n", cur_vmcs); + db_printf("VPID: %lu\n", vmcs_read(VMCS_VPID)); + db_printf("Activity: "); + val = vmcs_read(VMCS_GUEST_ACTIVITY); + switch (val) { + case 0: + db_printf("Active"); + 
break; + case 1: + db_printf("HLT"); + break; + case 2: + db_printf("Shutdown"); + break; + case 3: + db_printf("Wait for SIPI"); + break; + default: + db_printf("Unknown: %#lx", val); + } + db_printf("\n"); + exit = vmcs_read(VMCS_EXIT_REASON); + if (exit & 0x80000000) + db_printf("Entry Failure Reason: %u\n", exit & 0xffff); + else + db_printf("Exit Reason: %u\n", exit & 0xffff); + db_printf("Qualification: %#lx\n", vmcs_exit_qualification()); + db_printf("Guest Linear Address: %#lx\n", + vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)); + switch (exit & 0x8000ffff) { + case EXIT_REASON_EXCEPTION: + case EXIT_REASON_EXT_INTR: + val = vmcs_read(VMCS_EXIT_INTR_INFO); + db_printf("Interrupt Type: "); + switch (val >> 8 & 0x7) { + case 0: + db_printf("external"); + break; + case 2: + db_printf("NMI"); + break; + case 3: + db_printf("HW exception"); + break; + case 4: + db_printf("SW exception"); + break; + default: + db_printf("?? %lu", val >> 8 & 0x7); + break; + } + db_printf(" Vector: %lu", val & 0xff); + if (val & 0x800) + db_printf(" Error Code: %lx", + vmcs_read(VMCS_EXIT_INTR_ERRCODE)); + db_printf("\n"); + break; + case EXIT_REASON_EPT_FAULT: + case EXIT_REASON_EPT_MISCONFIG: + db_printf("Guest Physical Address: %#lx\n", + vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)); + break; + } + db_printf("VM-instruction error: %#lx\n", vmcs_instruction_error()); +} +#endif diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h new file mode 100644 index 000000000000..f247370fc60f --- /dev/null +++ b/sys/amd64/vmm/intel/vmcs.h @@ -0,0 +1,422 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _VMCS_H_ +#define _VMCS_H_ + +#ifdef _KERNEL + +struct vm_snapshot_meta; + +struct vmcs { + uint32_t identifier; + uint32_t abort_code; + char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2]; +}; +CTASSERT(sizeof(struct vmcs) == PAGE_SIZE); + +/* MSR save region is composed of an array of 'struct msr_entry' */ +struct msr_entry { + uint32_t index; + uint32_t reserved; + uint64_t val; + +}; + +int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count); +int vmcs_init(struct vmcs *vmcs); +int vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *rv); +int vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val); +int vmcs_getdesc(struct vmcs *vmcs, int running, int ident, + struct seg_desc *desc); +int vmcs_setdesc(struct vmcs *vmcs, int running, int ident, + struct seg_desc *desc); +#ifdef BHYVE_SNAPSHOT +int vmcs_getany(struct vmcs *vmcs, int running, int ident, uint64_t *val); +int vmcs_setany(struct vmcs *vmcs, int running, int ident, uint64_t val); +int vmcs_snapshot_reg(struct vmcs *vmcs, int running, int ident, + struct vm_snapshot_meta *meta); +int vmcs_snapshot_desc(struct vmcs *vmcs, int running, int seg, + struct vm_snapshot_meta *meta); +int vmcs_snapshot_any(struct vmcs *vmcs, int running, int ident, + struct vm_snapshot_meta *meta); +#endif + +/* + * Avoid header pollution caused by inline use of 'vtophys()' in vmx_cpufunc.h + */ +#ifdef _VMX_CPUFUNC_H_ +static __inline uint64_t +vmcs_read(uint32_t encoding) +{ + int error __diagused; + uint64_t val; + + error = vmread(encoding, &val); + KASSERT(error == 0, ("vmcs_read(%u) error %d", encoding, error)); + return (val); +} + +static __inline void +vmcs_write(uint32_t encoding, uint64_t val) +{ + int error __diagused; + + error = vmwrite(encoding, val); + KASSERT(error == 0, ("vmcs_write(%u) error %d", encoding, error)); +} +#endif /* _VMX_CPUFUNC_H_ */ + +#define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH) +#define vmcs_guest_rip() vmcs_read(VMCS_GUEST_RIP) +#define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR) +#define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff) +#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION) +#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3) +#define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS) +#define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS) +#define vmcs_idt_vectoring_info() vmcs_read(VMCS_IDT_VECTORING_INFO) +#define vmcs_idt_vectoring_err() vmcs_read(VMCS_IDT_VECTORING_ERROR) + +#endif /* _KERNEL */ + +#define VMCS_INITIAL 0xffffffffffffffff + +#define VMCS_IDENT(encoding) ((encoding) | 0x80000000) +/* + * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B. 
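+ *
+ * The VMCS_IDENT() wrapper above sets the sign bit so that one of these
+ * raw encodings can be passed straight through vmcs_getreg() and
+ * vmcs_setreg() without a VM_REG_* translation, for example
+ * (hypothetical caller):
+ *
+ *	uint64_t gi;
+ *
+ *	error = vmcs_getreg(vmcs, running,
+ *	    VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi);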
+ */ +#define VMCS_INVALID_ENCODING 0xffffffff + +/* 16-bit control fields */ +#define VMCS_VPID 0x00000000 +#define VMCS_PIR_VECTOR 0x00000002 + +/* 16-bit guest-state fields */ +#define VMCS_GUEST_ES_SELECTOR 0x00000800 +#define VMCS_GUEST_CS_SELECTOR 0x00000802 +#define VMCS_GUEST_SS_SELECTOR 0x00000804 +#define VMCS_GUEST_DS_SELECTOR 0x00000806 +#define VMCS_GUEST_FS_SELECTOR 0x00000808 +#define VMCS_GUEST_GS_SELECTOR 0x0000080A +#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C +#define VMCS_GUEST_TR_SELECTOR 0x0000080E +#define VMCS_GUEST_INTR_STATUS 0x00000810 + +/* 16-bit host-state fields */ +#define VMCS_HOST_ES_SELECTOR 0x00000C00 +#define VMCS_HOST_CS_SELECTOR 0x00000C02 +#define VMCS_HOST_SS_SELECTOR 0x00000C04 +#define VMCS_HOST_DS_SELECTOR 0x00000C06 +#define VMCS_HOST_FS_SELECTOR 0x00000C08 +#define VMCS_HOST_GS_SELECTOR 0x00000C0A +#define VMCS_HOST_TR_SELECTOR 0x00000C0C + +/* 64-bit control fields */ +#define VMCS_IO_BITMAP_A 0x00002000 +#define VMCS_IO_BITMAP_B 0x00002002 +#define VMCS_MSR_BITMAP 0x00002004 +#define VMCS_EXIT_MSR_STORE 0x00002006 +#define VMCS_EXIT_MSR_LOAD 0x00002008 +#define VMCS_ENTRY_MSR_LOAD 0x0000200A +#define VMCS_EXECUTIVE_VMCS 0x0000200C +#define VMCS_TSC_OFFSET 0x00002010 +#define VMCS_VIRTUAL_APIC 0x00002012 +#define VMCS_APIC_ACCESS 0x00002014 +#define VMCS_PIR_DESC 0x00002016 +#define VMCS_EPTP 0x0000201A +#define VMCS_EOI_EXIT0 0x0000201C +#define VMCS_EOI_EXIT1 0x0000201E +#define VMCS_EOI_EXIT2 0x00002020 +#define VMCS_EOI_EXIT3 0x00002022 +#define VMCS_EOI_EXIT(vector) (VMCS_EOI_EXIT0 + ((vector) / 64) * 2) + +/* 64-bit read-only fields */ +#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 + +/* 64-bit guest-state fields */ +#define VMCS_LINK_POINTER 0x00002800 +#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802 +#define VMCS_GUEST_IA32_PAT 0x00002804 +#define VMCS_GUEST_IA32_EFER 0x00002806 +#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808 +#define VMCS_GUEST_PDPTE0 0x0000280A +#define VMCS_GUEST_PDPTE1 0x0000280C +#define VMCS_GUEST_PDPTE2 0x0000280E +#define VMCS_GUEST_PDPTE3 0x00002810 + +/* 64-bit host-state fields */ +#define VMCS_HOST_IA32_PAT 0x00002C00 +#define VMCS_HOST_IA32_EFER 0x00002C02 +#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04 + +/* 32-bit control fields */ +#define VMCS_PIN_BASED_CTLS 0x00004000 +#define VMCS_PRI_PROC_BASED_CTLS 0x00004002 +#define VMCS_EXCEPTION_BITMAP 0x00004004 +#define VMCS_PF_ERROR_MASK 0x00004006 +#define VMCS_PF_ERROR_MATCH 0x00004008 +#define VMCS_CR3_TARGET_COUNT 0x0000400A +#define VMCS_EXIT_CTLS 0x0000400C +#define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E +#define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010 +#define VMCS_ENTRY_CTLS 0x00004012 +#define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014 +#define VMCS_ENTRY_INTR_INFO 0x00004016 +#define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018 +#define VMCS_ENTRY_INST_LENGTH 0x0000401A +#define VMCS_TPR_THRESHOLD 0x0000401C +#define VMCS_SEC_PROC_BASED_CTLS 0x0000401E +#define VMCS_PLE_GAP 0x00004020 +#define VMCS_PLE_WINDOW 0x00004022 + +/* 32-bit read-only data fields */ +#define VMCS_INSTRUCTION_ERROR 0x00004400 +#define VMCS_EXIT_REASON 0x00004402 +#define VMCS_EXIT_INTR_INFO 0x00004404 +#define VMCS_EXIT_INTR_ERRCODE 0x00004406 +#define VMCS_IDT_VECTORING_INFO 0x00004408 +#define VMCS_IDT_VECTORING_ERROR 0x0000440A +#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C +#define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E + +/* 32-bit guest-state fields */ +#define VMCS_GUEST_ES_LIMIT 0x00004800 +#define VMCS_GUEST_CS_LIMIT 0x00004802 +#define VMCS_GUEST_SS_LIMIT 0x00004804 +#define 
VMCS_GUEST_DS_LIMIT 0x00004806 +#define VMCS_GUEST_FS_LIMIT 0x00004808 +#define VMCS_GUEST_GS_LIMIT 0x0000480A +#define VMCS_GUEST_LDTR_LIMIT 0x0000480C +#define VMCS_GUEST_TR_LIMIT 0x0000480E +#define VMCS_GUEST_GDTR_LIMIT 0x00004810 +#define VMCS_GUEST_IDTR_LIMIT 0x00004812 +#define VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814 +#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816 +#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818 +#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A +#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C +#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E +#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820 +#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822 +#define VMCS_GUEST_INTERRUPTIBILITY 0x00004824 +#define VMCS_GUEST_ACTIVITY 0x00004826 +#define VMCS_GUEST_SMBASE 0x00004828 +#define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A +#define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E + +/* 32-bit host state fields */ +#define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00 + +/* Natural Width control fields */ +#define VMCS_CR0_MASK 0x00006000 +#define VMCS_CR4_MASK 0x00006002 +#define VMCS_CR0_SHADOW 0x00006004 +#define VMCS_CR4_SHADOW 0x00006006 +#define VMCS_CR3_TARGET0 0x00006008 +#define VMCS_CR3_TARGET1 0x0000600A +#define VMCS_CR3_TARGET2 0x0000600C +#define VMCS_CR3_TARGET3 0x0000600E + +/* Natural Width read-only fields */ +#define VMCS_EXIT_QUALIFICATION 0x00006400 +#define VMCS_IO_RCX 0x00006402 +#define VMCS_IO_RSI 0x00006404 +#define VMCS_IO_RDI 0x00006406 +#define VMCS_IO_RIP 0x00006408 +#define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A + +/* Natural Width guest-state fields */ +#define VMCS_GUEST_CR0 0x00006800 +#define VMCS_GUEST_CR3 0x00006802 +#define VMCS_GUEST_CR4 0x00006804 +#define VMCS_GUEST_ES_BASE 0x00006806 +#define VMCS_GUEST_CS_BASE 0x00006808 +#define VMCS_GUEST_SS_BASE 0x0000680A +#define VMCS_GUEST_DS_BASE 0x0000680C +#define VMCS_GUEST_FS_BASE 0x0000680E +#define VMCS_GUEST_GS_BASE 0x00006810 +#define VMCS_GUEST_LDTR_BASE 0x00006812 +#define VMCS_GUEST_TR_BASE 0x00006814 +#define VMCS_GUEST_GDTR_BASE 0x00006816 +#define VMCS_GUEST_IDTR_BASE 0x00006818 +#define VMCS_GUEST_DR7 0x0000681A +#define VMCS_GUEST_RSP 0x0000681C +#define VMCS_GUEST_RIP 0x0000681E +#define VMCS_GUEST_RFLAGS 0x00006820 +#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822 +#define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824 +#define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826 + +/* Natural Width host-state fields */ +#define VMCS_HOST_CR0 0x00006C00 +#define VMCS_HOST_CR3 0x00006C02 +#define VMCS_HOST_CR4 0x00006C04 +#define VMCS_HOST_FS_BASE 0x00006C06 +#define VMCS_HOST_GS_BASE 0x00006C08 +#define VMCS_HOST_TR_BASE 0x00006C0A +#define VMCS_HOST_GDTR_BASE 0x00006C0C +#define VMCS_HOST_IDTR_BASE 0x00006C0E +#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10 +#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12 +#define VMCS_HOST_RSP 0x00006C14 +#define VMCS_HOST_RIP 0x00006c16 + +/* + * VM instruction error numbers + */ +#define VMRESUME_WITH_NON_LAUNCHED_VMCS 5 + +/* + * VMCS exit reasons + */ +#define EXIT_REASON_EXCEPTION 0 +#define EXIT_REASON_EXT_INTR 1 +#define EXIT_REASON_TRIPLE_FAULT 2 +#define EXIT_REASON_INIT 3 +#define EXIT_REASON_SIPI 4 +#define EXIT_REASON_IO_SMI 5 +#define EXIT_REASON_SMI 6 +#define EXIT_REASON_INTR_WINDOW 7 +#define EXIT_REASON_NMI_WINDOW 8 +#define EXIT_REASON_TASK_SWITCH 9 +#define EXIT_REASON_CPUID 10 +#define EXIT_REASON_GETSEC 11 +#define EXIT_REASON_HLT 12 +#define EXIT_REASON_INVD 13 +#define EXIT_REASON_INVLPG 14 +#define EXIT_REASON_RDPMC 15 +#define EXIT_REASON_RDTSC 16 +#define 
EXIT_REASON_RSM 17 +#define EXIT_REASON_VMCALL 18 +#define EXIT_REASON_VMCLEAR 19 +#define EXIT_REASON_VMLAUNCH 20 +#define EXIT_REASON_VMPTRLD 21 +#define EXIT_REASON_VMPTRST 22 +#define EXIT_REASON_VMREAD 23 +#define EXIT_REASON_VMRESUME 24 +#define EXIT_REASON_VMWRITE 25 +#define EXIT_REASON_VMXOFF 26 +#define EXIT_REASON_VMXON 27 +#define EXIT_REASON_CR_ACCESS 28 +#define EXIT_REASON_DR_ACCESS 29 +#define EXIT_REASON_INOUT 30 +#define EXIT_REASON_RDMSR 31 +#define EXIT_REASON_WRMSR 32 +#define EXIT_REASON_INVAL_VMCS 33 +#define EXIT_REASON_INVAL_MSR 34 +#define EXIT_REASON_MWAIT 36 +#define EXIT_REASON_MTF 37 +#define EXIT_REASON_MONITOR 39 +#define EXIT_REASON_PAUSE 40 +#define EXIT_REASON_MCE_DURING_ENTRY 41 +#define EXIT_REASON_TPR 43 +#define EXIT_REASON_APIC_ACCESS 44 +#define EXIT_REASON_VIRTUALIZED_EOI 45 +#define EXIT_REASON_GDTR_IDTR 46 +#define EXIT_REASON_LDTR_TR 47 +#define EXIT_REASON_EPT_FAULT 48 +#define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_INVEPT 50 +#define EXIT_REASON_RDTSCP 51 +#define EXIT_REASON_VMX_PREEMPT 52 +#define EXIT_REASON_INVVPID 53 +#define EXIT_REASON_WBINVD 54 +#define EXIT_REASON_XSETBV 55 +#define EXIT_REASON_APIC_WRITE 56 +#define EXIT_REASON_RDRAND 57 +#define EXIT_REASON_INVPCID 58 +#define EXIT_REASON_VMFUNC 59 +#define EXIT_REASON_ENCLS 60 +#define EXIT_REASON_RDSEED 61 +#define EXIT_REASON_PM_LOG_FULL 62 +#define EXIT_REASON_XSAVES 63 +#define EXIT_REASON_XRSTORS 64 + +/* + * NMI unblocking due to IRET. + * + * Applies to VM-exits due to hardware exception or EPT fault. + */ +#define EXIT_QUAL_NMIUDTI (1 << 12) +/* + * VMCS interrupt information fields + */ +#define VMCS_INTR_VALID (1U << 31) +#define VMCS_INTR_T_MASK 0x700 /* Interruption-info type */ +#define VMCS_INTR_T_HWINTR (0 << 8) +#define VMCS_INTR_T_NMI (2 << 8) +#define VMCS_INTR_T_HWEXCEPTION (3 << 8) +#define VMCS_INTR_T_SWINTR (4 << 8) +#define VMCS_INTR_T_PRIV_SWEXCEPTION (5 << 8) +#define VMCS_INTR_T_SWEXCEPTION (6 << 8) +#define VMCS_INTR_DEL_ERRCODE (1 << 11) + +/* + * VMCS IDT-Vectoring information fields + */ +#define VMCS_IDT_VEC_VALID (1U << 31) +#define VMCS_IDT_VEC_ERRCODE_VALID (1 << 11) + +/* + * VMCS Guest interruptibility field + */ +#define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0) +#define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1 << 1) +#define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1 << 2) +#define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1 << 3) + +/* + * Exit qualification for EXIT_REASON_INVAL_VMCS + */ +#define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3 + +/* + * Exit qualification for EPT violation + */ +#define EPT_VIOLATION_DATA_READ (1UL << 0) +#define EPT_VIOLATION_DATA_WRITE (1UL << 1) +#define EPT_VIOLATION_INST_FETCH (1UL << 2) +#define EPT_VIOLATION_GPA_READABLE (1UL << 3) +#define EPT_VIOLATION_GPA_WRITEABLE (1UL << 4) +#define EPT_VIOLATION_GPA_EXECUTABLE (1UL << 5) +#define EPT_VIOLATION_GLA_VALID (1UL << 7) +#define EPT_VIOLATION_XLAT_VALID (1UL << 8) + +/* + * Exit qualification for APIC-access VM exit + */ +#define APIC_ACCESS_OFFSET(qual) ((qual) & 0xFFF) +#define APIC_ACCESS_TYPE(qual) (((qual) >> 12) & 0xF) + +/* + * Exit qualification for APIC-write VM exit + */ +#define APIC_WRITE_OFFSET(qual) ((qual) & 0xFFF) + +#endif diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c new file mode 100644 index 000000000000..842281ab862e --- /dev/null +++ b/sys/amd64/vmm/intel/vmx.c @@ -0,0 +1,4307 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * Copyright (c) 2018 Joyent, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +#include "opt_bhyve_snapshot.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/smp.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/pcpu.h> +#include <sys/proc.h> +#include <sys/reg.h> +#include <sys/smr.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> + +#include <machine/psl.h> +#include <machine/cpufunc.h> +#include <machine/md_var.h> +#include <machine/segments.h> +#include <machine/smp.h> +#include <machine/specialreg.h> +#include <machine/vmparam.h> + +#include <machine/vmm.h> +#include <machine/vmm_dev.h> +#include <machine/vmm_instruction_emul.h> +#include <machine/vmm_snapshot.h> + +#include <dev/vmm/vmm_ktr.h> +#include <dev/vmm/vmm_mem.h> + +#include "vmm_lapic.h" +#include "vmm_host.h" +#include "vmm_ioport.h" +#include "vmm_stat.h" +#include "vatpic.h" +#include "vlapic.h" +#include "vlapic_priv.h" + +#include "ept.h" +#include "vmx_cpufunc.h" +#include "vmx.h" +#include "vmx_msr.h" +#include "x86.h" +#include "vmx_controls.h" +#include "io/ppt.h" + +#define PINBASED_CTLS_ONE_SETTING \ + (PINBASED_EXTINT_EXITING | \ + PINBASED_NMI_EXITING | \ + PINBASED_VIRTUAL_NMI) +#define PINBASED_CTLS_ZERO_SETTING 0 + +#define PROCBASED_CTLS_WINDOW_SETTING \ + (PROCBASED_INT_WINDOW_EXITING | \ + PROCBASED_NMI_WINDOW_EXITING) + +#define PROCBASED_CTLS_ONE_SETTING \ + (PROCBASED_SECONDARY_CONTROLS | \ + PROCBASED_MWAIT_EXITING | \ + PROCBASED_MONITOR_EXITING | \ + PROCBASED_IO_EXITING | \ + PROCBASED_MSR_BITMAPS | \ + PROCBASED_CTLS_WINDOW_SETTING | \ + PROCBASED_CR8_LOAD_EXITING | \ + PROCBASED_CR8_STORE_EXITING) +#define PROCBASED_CTLS_ZERO_SETTING \ + (PROCBASED_CR3_LOAD_EXITING | \ + PROCBASED_CR3_STORE_EXITING | \ + PROCBASED_IO_BITMAPS) + +#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT +#define PROCBASED_CTLS2_ZERO_SETTING 0 + +#define VM_EXIT_CTLS_ONE_SETTING \ + (VM_EXIT_SAVE_DEBUG_CONTROLS | \ + VM_EXIT_HOST_LMA | \ + VM_EXIT_SAVE_EFER | \ + VM_EXIT_LOAD_EFER | \ + VM_EXIT_ACKNOWLEDGE_INTERRUPT) + +#define VM_EXIT_CTLS_ZERO_SETTING 0 + +#define VM_ENTRY_CTLS_ONE_SETTING \ + (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ + VM_ENTRY_LOAD_EFER) + +#define 
VM_ENTRY_CTLS_ZERO_SETTING \ + (VM_ENTRY_INTO_SMM | \ + VM_ENTRY_DEACTIVATE_DUAL_MONITOR) + +#define HANDLED 1 +#define UNHANDLED 0 + +static MALLOC_DEFINE(M_VMX, "vmx", "vmx"); +static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); + +bool vmx_have_msr_tsc_aux; + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, + NULL); + +int vmxon_enabled[MAXCPU]; +static uint8_t *vmxon_region; + +static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; +static uint32_t exit_ctls, entry_ctls; + +static uint64_t cr0_ones_mask, cr0_zeros_mask; +SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD, + &cr0_ones_mask, 0, NULL); +SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD, + &cr0_zeros_mask, 0, NULL); + +static uint64_t cr4_ones_mask, cr4_zeros_mask; +SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD, + &cr4_ones_mask, 0, NULL); +SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD, + &cr4_zeros_mask, 0, NULL); + +static int vmx_initialized; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, + &vmx_initialized, 0, "Intel VMX initialized"); + +/* + * Optional capabilities + */ +static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, + CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, + NULL); + +static int cap_halt_exit; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0, + "HLT triggers a VM-exit"); + +static int cap_pause_exit; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit, + 0, "PAUSE triggers a VM-exit"); + +static int cap_wbinvd_exit; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, wbinvd_exit, CTLFLAG_RD, &cap_wbinvd_exit, + 0, "WBINVD triggers a VM-exit"); + +static int cap_rdpid; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, rdpid, CTLFLAG_RD, &cap_rdpid, 0, + "Guests are allowed to use RDPID"); + +static int cap_rdtscp; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, rdtscp, CTLFLAG_RD, &cap_rdtscp, 0, + "Guests are allowed to use RDTSCP"); + +static int cap_unrestricted_guest; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD, + &cap_unrestricted_guest, 0, "Unrestricted guests"); + +static int cap_monitor_trap; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD, + &cap_monitor_trap, 0, "Monitor trap flag"); + +static int cap_invpcid; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid, + 0, "Guests are allowed to use INVPCID"); + +static int tpr_shadowing; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, tpr_shadowing, + CTLFLAG_RDTUN | CTLFLAG_NOFETCH, + &tpr_shadowing, 0, "TPR shadowing support"); + +static int virtual_interrupt_delivery; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, + CTLFLAG_RDTUN | CTLFLAG_NOFETCH, + &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support"); + +static int posted_interrupts; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, + CTLFLAG_RDTUN | CTLFLAG_NOFETCH, + &posted_interrupts, 0, "APICv posted interrupt support"); + +static int pirvec = -1; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD, + &pirvec, 0, "APICv posted interrupt vector"); + +static struct unrhdr *vpid_unr; +static u_int vpid_alloc_failed; +SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, + &vpid_alloc_failed, 0, NULL); + +int guest_l1d_flush; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, + &guest_l1d_flush, 0, NULL); +int guest_l1d_flush_sw; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush_sw, CTLFLAG_RDTUN | 
CTLFLAG_NOFETCH, + &guest_l1d_flush_sw, 0, NULL); + +static struct msr_entry msr_load_list[1] __aligned(16); + +/* + * The definitions of SDT probes for VMX. + */ + +SDT_PROBE_DEFINE3(vmm, vmx, exit, entry, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, taskswitch, + "struct vmx *", "int", "struct vm_exit *", "struct vm_task_switch *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, craccess, + "struct vmx *", "int", "struct vm_exit *", "uint64_t"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, rdmsr, + "struct vmx *", "int", "struct vm_exit *", "uint32_t"); + +SDT_PROBE_DEFINE5(vmm, vmx, exit, wrmsr, + "struct vmx *", "int", "struct vm_exit *", "uint32_t", "uint64_t"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, halt, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, mtrap, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, pause, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, intrwindow, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, interrupt, + "struct vmx *", "int", "struct vm_exit *", "uint32_t"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, nmiwindow, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, inout, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, cpuid, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE5(vmm, vmx, exit, exception, + "struct vmx *", "int", "struct vm_exit *", "uint32_t", "int"); + +SDT_PROBE_DEFINE5(vmm, vmx, exit, nestedfault, + "struct vmx *", "int", "struct vm_exit *", "uint64_t", "uint64_t"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, mmiofault, + "struct vmx *", "int", "struct vm_exit *", "uint64_t"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, eoi, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, apicaccess, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, apicwrite, + "struct vmx *", "int", "struct vm_exit *", "struct vlapic *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, xsetbv, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, vminsn, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown, + "struct vmx *", "int", "struct vm_exit *", "uint32_t"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, return, + "struct vmx *", "int", "struct vm_exit *", "int"); + +/* + * Use the last page below 4GB as the APIC access address. This address is + * occupied by the boot firmware so it is guaranteed that it will not conflict + * with a page in system memory. 
+ */ +#define APIC_ACCESS_ADDRESS 0xFFFFF000 + +static int vmx_getdesc(void *vcpui, int reg, struct seg_desc *desc); +static int vmx_getreg(void *vcpui, int reg, uint64_t *retval); +static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val); +static void vmx_inject_pir(struct vlapic *vlapic); +#ifdef BHYVE_SNAPSHOT +static int vmx_restore_tsc(void *vcpui, uint64_t now); +#endif + +static inline bool +host_has_rdpid(void) +{ + return ((cpu_stdext_feature2 & CPUID_STDEXT2_RDPID) != 0); +} + +static inline bool +host_has_rdtscp(void) +{ + return ((amd_feature & AMDID_RDTSCP) != 0); +} + +#ifdef KTR +static const char * +exit_reason_to_str(int reason) +{ + static char reasonbuf[32]; + + switch (reason) { + case EXIT_REASON_EXCEPTION: + return "exception"; + case EXIT_REASON_EXT_INTR: + return "extint"; + case EXIT_REASON_TRIPLE_FAULT: + return "triplefault"; + case EXIT_REASON_INIT: + return "init"; + case EXIT_REASON_SIPI: + return "sipi"; + case EXIT_REASON_IO_SMI: + return "iosmi"; + case EXIT_REASON_SMI: + return "smi"; + case EXIT_REASON_INTR_WINDOW: + return "intrwindow"; + case EXIT_REASON_NMI_WINDOW: + return "nmiwindow"; + case EXIT_REASON_TASK_SWITCH: + return "taskswitch"; + case EXIT_REASON_CPUID: + return "cpuid"; + case EXIT_REASON_GETSEC: + return "getsec"; + case EXIT_REASON_HLT: + return "hlt"; + case EXIT_REASON_INVD: + return "invd"; + case EXIT_REASON_INVLPG: + return "invlpg"; + case EXIT_REASON_RDPMC: + return "rdpmc"; + case EXIT_REASON_RDTSC: + return "rdtsc"; + case EXIT_REASON_RSM: + return "rsm"; + case EXIT_REASON_VMCALL: + return "vmcall"; + case EXIT_REASON_VMCLEAR: + return "vmclear"; + case EXIT_REASON_VMLAUNCH: + return "vmlaunch"; + case EXIT_REASON_VMPTRLD: + return "vmptrld"; + case EXIT_REASON_VMPTRST: + return "vmptrst"; + case EXIT_REASON_VMREAD: + return "vmread"; + case EXIT_REASON_VMRESUME: + return "vmresume"; + case EXIT_REASON_VMWRITE: + return "vmwrite"; + case EXIT_REASON_VMXOFF: + return "vmxoff"; + case EXIT_REASON_VMXON: + return "vmxon"; + case EXIT_REASON_CR_ACCESS: + return "craccess"; + case EXIT_REASON_DR_ACCESS: + return "draccess"; + case EXIT_REASON_INOUT: + return "inout"; + case EXIT_REASON_RDMSR: + return "rdmsr"; + case EXIT_REASON_WRMSR: + return "wrmsr"; + case EXIT_REASON_INVAL_VMCS: + return "invalvmcs"; + case EXIT_REASON_INVAL_MSR: + return "invalmsr"; + case EXIT_REASON_MWAIT: + return "mwait"; + case EXIT_REASON_MTF: + return "mtf"; + case EXIT_REASON_MONITOR: + return "monitor"; + case EXIT_REASON_PAUSE: + return "pause"; + case EXIT_REASON_MCE_DURING_ENTRY: + return "mce-during-entry"; + case EXIT_REASON_TPR: + return "tpr"; + case EXIT_REASON_APIC_ACCESS: + return "apic-access"; + case EXIT_REASON_GDTR_IDTR: + return "gdtridtr"; + case EXIT_REASON_LDTR_TR: + return "ldtrtr"; + case EXIT_REASON_EPT_FAULT: + return "eptfault"; + case EXIT_REASON_EPT_MISCONFIG: + return "eptmisconfig"; + case EXIT_REASON_INVEPT: + return "invept"; + case EXIT_REASON_RDTSCP: + return "rdtscp"; + case EXIT_REASON_VMX_PREEMPT: + return "vmxpreempt"; + case EXIT_REASON_INVVPID: + return "invvpid"; + case EXIT_REASON_WBINVD: + return "wbinvd"; + case EXIT_REASON_XSETBV: + return "xsetbv"; + case EXIT_REASON_APIC_WRITE: + return "apic-write"; + default: + snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); + return (reasonbuf); + } +} +#endif /* KTR */ + +static int +vmx_allow_x2apic_msrs(struct vmx *vmx) +{ + int i, error; + + error = 0; + + /* + * Allow readonly access to the following x2APIC MSRs from the guest. 
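[Editorial note, not part of this diff: the MSR_APIC_* constants used below are the x2APIC MSR numbers, which map each legacy xAPIC MMIO register into MSR space at 0x800 plus the register offset divided by 16. A one-line illustration of that mapping (the macro name here is hypothetical):]

    /* x2APIC register MSR = 0x800 + (xAPIC MMIO offset >> 4),         */
    /* e.g. TPR at offset 0x80 -> MSR 0x808, EOI at 0xB0 -> MSR 0x80B. */
    #define X2APIC_MSR(xapic_offset)    (0x800 + ((xapic_offset) >> 4))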
+ */ + error += guest_msr_ro(vmx, MSR_APIC_ID); + error += guest_msr_ro(vmx, MSR_APIC_VERSION); + error += guest_msr_ro(vmx, MSR_APIC_LDR); + error += guest_msr_ro(vmx, MSR_APIC_SVR); + + for (i = 0; i < 8; i++) + error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i); + + for (i = 0; i < 8; i++) + error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i); + + for (i = 0; i < 8; i++) + error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i); + + error += guest_msr_ro(vmx, MSR_APIC_ESR); + error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER); + error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL); + error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT); + error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0); + error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1); + error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR); + error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER); + error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER); + error += guest_msr_ro(vmx, MSR_APIC_ICR); + + /* + * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest. + * + * These registers get special treatment described in the section + * "Virtualizing MSR-Based APIC Accesses". + */ + error += guest_msr_rw(vmx, MSR_APIC_TPR); + error += guest_msr_rw(vmx, MSR_APIC_EOI); + error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI); + + return (error); +} + +u_long +vmx_fix_cr0(u_long cr0) +{ + + return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); +} + +u_long +vmx_fix_cr4(u_long cr4) +{ + + return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); +} + +static void +vpid_free(int vpid) +{ + if (vpid < 0 || vpid > 0xffff) + panic("vpid_free: invalid vpid %d", vpid); + + /* + * VPIDs [0,vm_maxcpu] are special and are not allocated from + * the unit number allocator. + */ + + if (vpid > vm_maxcpu) + free_unr(vpid_unr, vpid); +} + +static uint16_t +vpid_alloc(int vcpuid) +{ + int x; + + /* + * If the "enable vpid" execution control is not enabled then the + * VPID is required to be 0 for all vcpus. + */ + if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) + return (0); + + /* + * Try to allocate a unique VPID for each from the unit number + * allocator. + */ + x = alloc_unr(vpid_unr); + + if (x == -1) { + atomic_add_int(&vpid_alloc_failed, 1); + + /* + * If the unit number allocator does not have enough unique + * VPIDs then we need to allocate from the [1,vm_maxcpu] range. + * + * These VPIDs are not be unique across VMs but this does not + * affect correctness because the combined mappings are also + * tagged with the EP4TA which is unique for each VM. + * + * It is still sub-optimal because the invvpid will invalidate + * combined mappings for a particular VPID across all EP4TAs. + */ + return (vcpuid + 1); + } + + return (x); +} + +static void +vpid_init(void) +{ + /* + * VPID 0 is required when the "enable VPID" execution control is + * disabled. + * + * VPIDs [1,vm_maxcpu] are used as the "overflow namespace" when the + * unit number allocator does not have sufficient unique VPIDs to + * satisfy the allocation. + * + * The remaining VPIDs are managed by the unit number allocator. + */ + vpid_unr = new_unrhdr(vm_maxcpu + 1, 0xffff, NULL); +} + +static void +vmx_disable(void *arg __unused) +{ + struct invvpid_desc invvpid_desc = { 0 }; + struct invept_desc invept_desc = { 0 }; + + if (vmxon_enabled[curcpu]) { + /* + * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. + * + * VMXON or VMXOFF are not required to invalidate any TLB + * caching structures. This prevents potential retention of + * cached information in the TLB between distinct VMX episodes. 
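[Editorial note, not part of this diff: a worked example of the VPID namespace laid out by vpid_init() and vpid_alloc() above, assuming an illustrative vm_maxcpu of 16:]

    /* VPID 0            - used when the "enable VPID" control is unavailable    */
    /* VPIDs 1..16       - overflow namespace, vcpuid + 1, may repeat across VMs */
    /* VPIDs 17..65535   - unique IDs handed out by the vpid_unr unit allocator  */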
+ */ + invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); + invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); + vmxoff(); + } + load_cr4(rcr4() & ~CR4_VMXE); +} + +static int +vmx_modcleanup(void) +{ + + if (pirvec >= 0) + lapic_ipi_free(pirvec); + + if (vpid_unr != NULL) { + delete_unrhdr(vpid_unr); + vpid_unr = NULL; + } + + if (nmi_flush_l1d_sw == 1) + nmi_flush_l1d_sw = 0; + + smp_rendezvous(NULL, vmx_disable, NULL, NULL); + + if (vmxon_region != NULL) + kmem_free(vmxon_region, (mp_maxid + 1) * PAGE_SIZE); + + return (0); +} + +static void +vmx_enable(void *arg __unused) +{ + int error; + uint64_t feature_control; + + feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); + if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 || + (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { + wrmsr(MSR_IA32_FEATURE_CONTROL, + feature_control | IA32_FEATURE_CONTROL_VMX_EN | + IA32_FEATURE_CONTROL_LOCK); + } + + load_cr4(rcr4() | CR4_VMXE); + + *(uint32_t *)&vmxon_region[curcpu * PAGE_SIZE] = vmx_revision(); + error = vmxon(&vmxon_region[curcpu * PAGE_SIZE]); + if (error == 0) + vmxon_enabled[curcpu] = 1; +} + +static void +vmx_modsuspend(void) +{ + + if (vmxon_enabled[curcpu]) + vmx_disable(NULL); +} + +static void +vmx_modresume(void) +{ + + if (vmxon_enabled[curcpu]) + vmx_enable(NULL); +} + +static int +vmx_modinit(int ipinum) +{ + int error; + uint64_t basic, fixed0, fixed1, feature_control; + uint32_t tmp, procbased2_vid_bits; + + /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ + if (!(cpu_feature2 & CPUID2_VMX)) { + printf("vmx_modinit: processor does not support VMX " + "operation\n"); + return (ENXIO); + } + + /* + * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits + * are set (bits 0 and 2 respectively). + */ + feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); + if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 && + (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { + printf("vmx_modinit: VMX operation disabled by BIOS\n"); + return (ENXIO); + } + + /* + * Verify capabilities MSR_VMX_BASIC: + * - bit 54 indicates support for INS/OUTS decoding + */ + basic = rdmsr(MSR_VMX_BASIC); + if ((basic & (1UL << 54)) == 0) { + printf("vmx_modinit: processor does not support desired basic " + "capabilities\n"); + return (EINVAL); + } + + /* Check support for primary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_CTLS_ONE_SETTING, + PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); + if (error) { + printf("vmx_modinit: processor does not support desired " + "primary processor-based controls\n"); + return (error); + } + + /* Clear the processor-based ctl bits that are set on demand */ + procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; + + /* Check support for secondary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, + PROCBASED_CTLS2_ONE_SETTING, + PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); + if (error) { + printf("vmx_modinit: processor does not support desired " + "secondary processor-based controls\n"); + return (error); + } + + /* Check support for VPID */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_ENABLE_VPID, 0, &tmp); + if (error == 0) + procbased_ctls2 |= PROCBASED2_ENABLE_VPID; + + /* Check support for pin-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, + PINBASED_CTLS_ONE_SETTING, + 
PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); + if (error) { + printf("vmx_modinit: processor does not support desired " + "pin-based controls\n"); + return (error); + } + + /* Check support for VM-exit controls */ + error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, + VM_EXIT_CTLS_ONE_SETTING, + VM_EXIT_CTLS_ZERO_SETTING, + &exit_ctls); + if (error) { + printf("vmx_modinit: processor does not support desired " + "exit controls\n"); + return (error); + } + + /* Check support for VM-entry controls */ + error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, + VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, + &entry_ctls); + if (error) { + printf("vmx_modinit: processor does not support desired " + "entry controls\n"); + return (error); + } + + /* + * Check support for optional features by testing them + * as individual bits + */ + cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_HLT_EXITING, 0, + &tmp) == 0); + + cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_PROCBASED_CTLS, + PROCBASED_MTF, 0, + &tmp) == 0); + + cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_PAUSE_EXITING, 0, + &tmp) == 0); + + cap_wbinvd_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_WBINVD_EXITING, + 0, + &tmp) == 0); + + /* + * Check support for RDPID and/or RDTSCP. + * + * Support a pass-through-based implementation of these via the + * "enable RDTSCP" VM-execution control and the "RDTSC exiting" + * VM-execution control. + * + * The "enable RDTSCP" VM-execution control applies to both RDPID + * and RDTSCP (see SDM volume 3, section 25.3, "Changes to + * Instruction Behavior in VMX Non-root operation"); this is why + * only this VM-execution control needs to be enabled in order to + * enable passing through whichever of RDPID and/or RDTSCP are + * supported by the host. + * + * The "RDTSC exiting" VM-execution control applies to both RDTSC + * and RDTSCP (again, per SDM volume 3, section 25.3), and is + * already set up for RDTSC and RDTSCP pass-through by the current + * implementation of RDTSC. + * + * Although RDPID and RDTSCP are optional capabilities, since there + * does not currently seem to be a use case for enabling/disabling + * these via libvmmapi, choose not to support this and, instead, + * just statically always enable or always disable this support + * across all vCPUs on all VMs. (Note that there may be some + * complications to providing this functionality, e.g., the MSR + * bitmap is currently per-VM rather than per-vCPU while the + * capability API wants to be able to control capabilities on a + * per-vCPU basis). + */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_ENABLE_RDTSCP, 0, &tmp); + cap_rdpid = error == 0 && host_has_rdpid(); + cap_rdtscp = error == 0 && host_has_rdtscp(); + if (cap_rdpid || cap_rdtscp) { + procbased_ctls2 |= PROCBASED2_ENABLE_RDTSCP; + vmx_have_msr_tsc_aux = true; + } + + cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_UNRESTRICTED_GUEST, 0, + &tmp) == 0); + + cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, + &tmp) == 0); + + /* + * Check support for TPR shadow. 
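[Editorial note, not part of this diff: the vmx_set_ctlreg() probes in this function reduce to a test against the capability MSR's allowed-0 settings (low 32 bits: a set bit means the control must be 1) and allowed-1 settings (high 32 bits: a set bit means the control may be 1). A rough sketch of that test, not the actual vmx_msr.c implementation:]

    /* Sketch only: may 'ones' be set to 1 and 'zeros' be cleared to 0? */
    static bool
    ctl_setting_allowed(u_int ctl_msr, uint32_t ones, uint32_t zeros)
    {
            uint64_t cap = rdmsr(ctl_msr);
            uint32_t must_be_one = cap;             /* allowed-0 settings */
            uint32_t may_be_one = cap >> 32;        /* allowed-1 settings */

            return ((ones & ~may_be_one) == 0 && (zeros & must_be_one) == 0);
    }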
+ */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0, + &tmp); + if (error == 0) { + tpr_shadowing = 1; +#ifndef BURN_BRIDGES + TUNABLE_INT_FETCH("hw.vmm.vmx.use_tpr_shadowing", + &tpr_shadowing); +#endif + TUNABLE_INT_FETCH("hw.vmm.vmx.cap.tpr_shadowing", + &tpr_shadowing); + } + + if (tpr_shadowing) { + procbased_ctls |= PROCBASED_USE_TPR_SHADOW; + procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING; + procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING; + } + + /* + * Check support for virtual interrupt delivery. + */ + procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES | + PROCBASED2_VIRTUALIZE_X2APIC_MODE | + PROCBASED2_APIC_REGISTER_VIRTUALIZATION | + PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY); + + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, + procbased2_vid_bits, 0, &tmp); + if (error == 0 && tpr_shadowing) { + virtual_interrupt_delivery = 1; +#ifndef BURN_BRIDGES + TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid", + &virtual_interrupt_delivery); +#endif + TUNABLE_INT_FETCH("hw.vmm.vmx.cap.virtual_interrupt_delivery", + &virtual_interrupt_delivery); + } + + if (virtual_interrupt_delivery) { + procbased_ctls |= PROCBASED_USE_TPR_SHADOW; + procbased_ctls2 |= procbased2_vid_bits; + procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE; + + /* + * Check for Posted Interrupts only if Virtual Interrupt + * Delivery is enabled. + */ + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0, + &tmp); + if (error == 0) { + pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : + &IDTVEC(justreturn)); + if (pirvec < 0) { + if (bootverbose) { + printf("vmx_modinit: unable to " + "allocate posted interrupt " + "vector\n"); + } + } else { + posted_interrupts = 1; +#ifndef BURN_BRIDGES + TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir", + &posted_interrupts); +#endif + TUNABLE_INT_FETCH("hw.vmm.vmx.cap.posted_interrupts", + &posted_interrupts); + } + } + } + + if (posted_interrupts) + pinbased_ctls |= PINBASED_POSTED_INTERRUPT; + + /* Initialize EPT */ + error = ept_init(ipinum); + if (error) { + printf("vmx_modinit: ept initialization failed (%d)\n", error); + return (error); + } + + guest_l1d_flush = (cpu_ia32_arch_caps & + IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) == 0; +#ifndef BURN_BRIDGES + TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush); +#endif + TUNABLE_INT_FETCH("hw.vmm.vmx.l1d_flush", &guest_l1d_flush); + + /* + * L1D cache flush is enabled. Use IA32_FLUSH_CMD MSR when + * available. Otherwise fall back to the software flush + * method which loads enough data from the kernel text to + * flush existing L1D content, both on VMX entry and on NMI + * return. + */ + if (guest_l1d_flush) { + if ((cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) == 0) { + guest_l1d_flush_sw = 1; +#ifndef BURN_BRIDGES + TUNABLE_INT_FETCH("hw.vmm.l1d_flush_sw", + &guest_l1d_flush_sw); +#endif + TUNABLE_INT_FETCH("hw.vmm.vmx.l1d_flush_sw", + &guest_l1d_flush_sw); + } + if (guest_l1d_flush_sw) { + if (nmi_flush_l1d_sw <= 1) + nmi_flush_l1d_sw = 1; + } else { + msr_load_list[0].index = MSR_IA32_FLUSH_CMD; + msr_load_list[0].val = IA32_FLUSH_CMD_L1D; + } + } + + /* + * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 + */ + fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); + fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); + cr0_ones_mask = fixed0 & fixed1; + cr0_zeros_mask = ~fixed0 & ~fixed1; + + /* + * CR0_PE and CR0_PG can be set to zero in VMX non-root operation + * if unrestricted guest execution is allowed. 
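[Editorial note, not part of this diff: a worked example of the mask computation just above, using CR0_FIXED0/CR0_FIXED1 values that are typical but not guaranteed on every CPU:]

    /* CR0_FIXED0 = 0x80000021 (PG|NE|PE), CR0_FIXED1 = 0xffffffff       */
    /* cr0_ones_mask  = 0x80000021 & 0xffffffff   = 0x80000021           */
    /* cr0_zeros_mask = ~0x80000021 & ~0xffffffff = 0x00000000           */
    /* i.e. PG, NE and PE must normally be 1; the unrestricted-guest and */
    /* CD/NW adjustments below then modify both masks explicitly.        */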
+ */ + if (cap_unrestricted_guest) + cr0_ones_mask &= ~(CR0_PG | CR0_PE); + + /* + * Do not allow the guest to set CR0_NW or CR0_CD. + */ + cr0_zeros_mask |= (CR0_NW | CR0_CD); + + fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); + fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); + cr4_ones_mask = fixed0 & fixed1; + cr4_zeros_mask = ~fixed0 & ~fixed1; + + vpid_init(); + + vmx_msr_init(); + + /* enable VMX operation */ + vmxon_region = kmem_malloc((mp_maxid + 1) * PAGE_SIZE, + M_WAITOK | M_ZERO); + smp_rendezvous(NULL, vmx_enable, NULL, NULL); + + vmx_initialized = 1; + + return (0); +} + +static void +vmx_trigger_hostintr(int vector) +{ + uintptr_t func; + struct gate_descriptor *gd; + + gd = &idt[vector]; + + KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: " + "invalid vector %d", vector)); + KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present", + vector)); + KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d " + "has invalid type %d", vector, gd->gd_type)); + KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d " + "has invalid dpl %d", vector, gd->gd_dpl)); + KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor " + "for vector %d has invalid selector %d", vector, gd->gd_selector)); + KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid " + "IST %d", vector, gd->gd_ist)); + + func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset); + vmx_call_isr(func); +} + +static int +vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) +{ + int error, mask_ident, shadow_ident; + uint64_t mask_value; + + if (which != 0 && which != 4) + panic("vmx_setup_cr_shadow: unknown cr%d", which); + + if (which == 0) { + mask_ident = VMCS_CR0_MASK; + mask_value = cr0_ones_mask | cr0_zeros_mask; + shadow_ident = VMCS_CR0_SHADOW; + } else { + mask_ident = VMCS_CR4_MASK; + mask_value = cr4_ones_mask | cr4_zeros_mask; + shadow_ident = VMCS_CR4_SHADOW; + } + + error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value); + if (error) + return (error); + + error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial); + if (error) + return (error); + + return (0); +} +#define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init)) +#define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init)) + +static void * +vmx_init(struct vm *vm, pmap_t pmap) +{ + int error __diagused; + struct vmx *vmx; + + vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); + vmx->vm = vm; + + vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pmltop)); + + /* + * Clean up EPTP-tagged guest physical and combined mappings + * + * VMX transitions are not required to invalidate any guest physical + * mappings. So, it may be possible for stale guest physical mappings + * to be present in the processor TLBs. + * + * Combined mappings for this EP4TA are also invalidated for all VPIDs. + */ + ept_invalidate_mappings(vmx->eptp); + + vmx->msr_bitmap = malloc_aligned(PAGE_SIZE, PAGE_SIZE, M_VMX, + M_WAITOK | M_ZERO); + msr_bitmap_initialize(vmx->msr_bitmap); + + /* + * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. + * The guest FSBASE and GSBASE are saved and restored during + * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are + * always restored from the vmcs host state area on vm-exit. + * + * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in + * how they are saved/restored so can be directly accessed by the + * guest. 
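[Editorial note, not part of this diff: guest_msr_rw() and guest_msr_ro() below operate on the 4KB MSR bitmap installed via VMCS_MSR_BITMAP; that page holds separate read and write bitmaps for the low (0x0-0x1fff) and high (0xc0000000-0xc0001fff) MSR ranges, and a clear bit means the access does not cause a VM-exit. A rough pass-through sketch for a low-range MSR, not the actual vmx_msr.c code:]

    /* Sketch only: let the guest read and write a low-range MSR directly. */
    static void
    msr_bitmap_passthrough_low(uint8_t *bitmap, u_int msr)
    {
            KASSERT(msr <= 0x1fff, ("high-range MSRs use other offsets"));
            bitmap[msr / 8] &= ~(1 << (msr % 8));           /* read bitmap  */
            bitmap[2048 + msr / 8] &= ~(1 << (msr % 8));    /* write bitmap */
    }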
+ * + * MSR_EFER is saved and restored in the guest VMCS area on a + * VM exit and entry respectively. It is also restored from the + * host VMCS area on a VM exit. + * + * The TSC MSR is exposed read-only. Writes are disallowed as + * that will impact the host TSC. If the guest does a write + * the "use TSC offsetting" execution control is enabled and the + * difference between the host TSC and the guest TSC is written + * into the TSC offset in the VMCS. + * + * Guest TSC_AUX support is enabled if any of guest RDPID and/or + * guest RDTSCP support are enabled (since, as per Table 2-2 in SDM + * volume 4, TSC_AUX is supported if any of RDPID and/or RDTSCP are + * supported). If guest TSC_AUX support is enabled, TSC_AUX is + * exposed read-only so that the VMM can do one fewer MSR read per + * exit than if this register were exposed read-write; the guest + * restore value can be updated during guest writes (expected to be + * rare) instead of during all exits (common). + */ + if (guest_msr_rw(vmx, MSR_GSBASE) || + guest_msr_rw(vmx, MSR_FSBASE) || + guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || + guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || + guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || + guest_msr_rw(vmx, MSR_EFER) || + guest_msr_ro(vmx, MSR_TSC) || + ((cap_rdpid || cap_rdtscp) && guest_msr_ro(vmx, MSR_TSC_AUX))) + panic("vmx_init: error setting guest msr access"); + + if (virtual_interrupt_delivery) { + error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE, + APIC_ACCESS_ADDRESS); + /* XXX this should really return an error to the caller */ + KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error)); + } + + vmx->pmap = pmap; + return (vmx); +} + +static void * +vmx_vcpu_init(void *vmi, struct vcpu *vcpu1, int vcpuid) +{ + struct vmx *vmx = vmi; + struct vmcs *vmcs; + struct vmx_vcpu *vcpu; + uint32_t exc_bitmap; + uint16_t vpid; + int error; + + vpid = vpid_alloc(vcpuid); + + vcpu = malloc(sizeof(*vcpu), M_VMX, M_WAITOK | M_ZERO); + vcpu->vmx = vmx; + vcpu->vcpu = vcpu1; + vcpu->vcpuid = vcpuid; + vcpu->vmcs = malloc_aligned(sizeof(*vmcs), PAGE_SIZE, M_VMX, + M_WAITOK | M_ZERO); + vcpu->apic_page = malloc_aligned(PAGE_SIZE, PAGE_SIZE, M_VMX, + M_WAITOK | M_ZERO); + vcpu->pir_desc = malloc_aligned(sizeof(*vcpu->pir_desc), 64, M_VMX, + M_WAITOK | M_ZERO); + + vmcs = vcpu->vmcs; + vmcs->identifier = vmx_revision(); + error = vmclear(vmcs); + if (error != 0) { + panic("vmx_init: vmclear error %d on vcpu %d\n", + error, vcpuid); + } + + vmx_msr_guest_init(vmx, vcpu); + + error = vmcs_init(vmcs); + KASSERT(error == 0, ("vmcs_init error %d", error)); + + VMPTRLD(vmcs); + error = 0; + error += vmwrite(VMCS_HOST_RSP, (u_long)&vcpu->ctx); + error += vmwrite(VMCS_EPTP, vmx->eptp); + error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls); + error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls); + if (vcpu_trap_wbinvd(vcpu->vcpu)) { + KASSERT(cap_wbinvd_exit, ("WBINVD trap not available")); + procbased_ctls2 |= PROCBASED2_WBINVD_EXITING; + } + error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2); + error += vmwrite(VMCS_EXIT_CTLS, exit_ctls); + error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls); + error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap)); + error += vmwrite(VMCS_VPID, vpid); + + if (guest_l1d_flush && !guest_l1d_flush_sw) { + vmcs_write(VMCS_ENTRY_MSR_LOAD, pmap_kextract( + (vm_offset_t)&msr_load_list[0])); + vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT, + nitems(msr_load_list)); + vmcs_write(VMCS_EXIT_MSR_STORE, 0); + vmcs_write(VMCS_EXIT_MSR_STORE_COUNT, 0); + } + + /* exception bitmap 
*/ + if (vcpu_trace_exceptions(vcpu->vcpu)) + exc_bitmap = 0xffffffff; + else + exc_bitmap = 1 << IDT_MC; + error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap); + + vcpu->ctx.guest_dr6 = DBREG_DR6_RESERVED1; + error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1); + + if (tpr_shadowing) { + error += vmwrite(VMCS_VIRTUAL_APIC, vtophys(vcpu->apic_page)); + } + + if (virtual_interrupt_delivery) { + error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS); + error += vmwrite(VMCS_EOI_EXIT0, 0); + error += vmwrite(VMCS_EOI_EXIT1, 0); + error += vmwrite(VMCS_EOI_EXIT2, 0); + error += vmwrite(VMCS_EOI_EXIT3, 0); + } + if (posted_interrupts) { + error += vmwrite(VMCS_PIR_VECTOR, pirvec); + error += vmwrite(VMCS_PIR_DESC, vtophys(vcpu->pir_desc)); + } + VMCLEAR(vmcs); + KASSERT(error == 0, ("vmx_init: error customizing the vmcs")); + + vcpu->cap.set = 0; + vcpu->cap.set |= cap_rdpid != 0 ? 1 << VM_CAP_RDPID : 0; + vcpu->cap.set |= cap_rdtscp != 0 ? 1 << VM_CAP_RDTSCP : 0; + vcpu->cap.proc_ctls = procbased_ctls; + vcpu->cap.proc_ctls2 = procbased_ctls2; + vcpu->cap.exc_bitmap = exc_bitmap; + + vcpu->state.nextrip = ~0; + vcpu->state.lastcpu = NOCPU; + vcpu->state.vpid = vpid; + + /* + * Set up the CR0/4 shadows, and init the read shadow + * to the power-on register value from the Intel Sys Arch. + * CR0 - 0x60000010 + * CR4 - 0 + */ + error = vmx_setup_cr0_shadow(vmcs, 0x60000010); + if (error != 0) + panic("vmx_setup_cr0_shadow %d", error); + + error = vmx_setup_cr4_shadow(vmcs, 0); + if (error != 0) + panic("vmx_setup_cr4_shadow %d", error); + + vcpu->ctx.pmap = vmx->pmap; + + return (vcpu); +} + +static int +vmx_handle_cpuid(struct vmx_vcpu *vcpu, struct vmxctx *vmxctx) +{ + int handled; + + handled = x86_emulate_cpuid(vcpu->vcpu, (uint64_t *)&vmxctx->guest_rax, + (uint64_t *)&vmxctx->guest_rbx, (uint64_t *)&vmxctx->guest_rcx, + (uint64_t *)&vmxctx->guest_rdx); + return (handled); +} + +static __inline void +vmx_run_trace(struct vmx_vcpu *vcpu) +{ + VMX_CTR1(vcpu, "Resume execution at %#lx", vmcs_guest_rip()); +} + +static __inline void +vmx_exit_trace(struct vmx_vcpu *vcpu, uint64_t rip, uint32_t exit_reason, + int handled) +{ + VMX_CTR3(vcpu, "%s %s vmexit at 0x%0lx", + handled ? "handled" : "unhandled", + exit_reason_to_str(exit_reason), rip); +} + +static __inline void +vmx_astpending_trace(struct vmx_vcpu *vcpu, uint64_t rip) +{ + VMX_CTR1(vcpu, "astpending vmexit at 0x%0lx", rip); +} + +static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); +static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done"); + +/* + * Invalidate guest mappings identified by its vpid from the TLB. + */ +static __inline void +vmx_invvpid(struct vmx *vmx, struct vmx_vcpu *vcpu, pmap_t pmap, int running) +{ + struct vmxstate *vmxstate; + struct invvpid_desc invvpid_desc; + + vmxstate = &vcpu->state; + if (vmxstate->vpid == 0) + return; + + if (!running) { + /* + * Set the 'lastcpu' to an invalid host cpu. + * + * This will invalidate TLB entries tagged with the vcpu's + * vpid the next time it runs via vmx_set_pcpu_defaults(). + */ + vmxstate->lastcpu = NOCPU; + return; + } + + KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside " + "critical section", __func__, vcpu->vcpuid)); + + /* + * Invalidate all mappings tagged with 'vpid' + * + * We do this because this vcpu was executing on a different host + * cpu when it last ran. We do not track whether it invalidated + * mappings associated with its 'vpid' during that run. 
So we must + * assume that the mappings associated with 'vpid' on 'curcpu' are + * stale and invalidate them. + * + * Note that we incur this penalty only when the scheduler chooses to + * move the thread associated with this vcpu between host cpus. + * + * Note also that this will invalidate mappings tagged with 'vpid' + * for "all" EP4TAs. + */ + if (atomic_load_long(&pmap->pm_eptgen) == vmx->eptgen[curcpu]) { + invvpid_desc._res1 = 0; + invvpid_desc._res2 = 0; + invvpid_desc.vpid = vmxstate->vpid; + invvpid_desc.linear_addr = 0; + invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); + vmm_stat_incr(vcpu->vcpu, VCPU_INVVPID_DONE, 1); + } else { + /* + * The invvpid can be skipped if an invept is going to + * be performed before entering the guest. The invept + * will invalidate combined mappings tagged with + * 'vmx->eptp' for all vpids. + */ + vmm_stat_incr(vcpu->vcpu, VCPU_INVVPID_SAVED, 1); + } +} + +static void +vmx_set_pcpu_defaults(struct vmx *vmx, struct vmx_vcpu *vcpu, pmap_t pmap) +{ + struct vmxstate *vmxstate; + + vmxstate = &vcpu->state; + if (vmxstate->lastcpu == curcpu) + return; + + vmxstate->lastcpu = curcpu; + + vmm_stat_incr(vcpu->vcpu, VCPU_MIGRATIONS, 1); + + vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); + vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); + vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); + vmx_invvpid(vmx, vcpu, pmap, 1); +} + +/* + * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. + */ +CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); + +static void __inline +vmx_set_int_window_exiting(struct vmx_vcpu *vcpu) +{ + + if ((vcpu->cap.proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) { + vcpu->cap.proc_ctls |= PROCBASED_INT_WINDOW_EXITING; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vcpu->cap.proc_ctls); + VMX_CTR0(vcpu, "Enabling interrupt window exiting"); + } +} + +static void __inline +vmx_clear_int_window_exiting(struct vmx_vcpu *vcpu) +{ + + KASSERT((vcpu->cap.proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, + ("intr_window_exiting not set: %#x", vcpu->cap.proc_ctls)); + vcpu->cap.proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vcpu->cap.proc_ctls); + VMX_CTR0(vcpu, "Disabling interrupt window exiting"); +} + +static void __inline +vmx_set_nmi_window_exiting(struct vmx_vcpu *vcpu) +{ + + if ((vcpu->cap.proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) { + vcpu->cap.proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vcpu->cap.proc_ctls); + VMX_CTR0(vcpu, "Enabling NMI window exiting"); + } +} + +static void __inline +vmx_clear_nmi_window_exiting(struct vmx_vcpu *vcpu) +{ + + KASSERT((vcpu->cap.proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, + ("nmi_window_exiting not set %#x", vcpu->cap.proc_ctls)); + vcpu->cap.proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vcpu->cap.proc_ctls); + VMX_CTR0(vcpu, "Disabling NMI window exiting"); +} + +int +vmx_set_tsc_offset(struct vmx_vcpu *vcpu, uint64_t offset) +{ + int error; + + if ((vcpu->cap.proc_ctls & PROCBASED_TSC_OFFSET) == 0) { + vcpu->cap.proc_ctls |= PROCBASED_TSC_OFFSET; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vcpu->cap.proc_ctls); + VMX_CTR0(vcpu, "Enabling TSC offsetting"); + } + + error = vmwrite(VMCS_TSC_OFFSET, offset); +#ifdef BHYVE_SNAPSHOT + if (error == 0) + vm_set_tsc_offset(vcpu->vcpu, offset); +#endif + return (error); +} + +#define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \ + VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) 
+#define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ + VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) + +static void +vmx_inject_nmi(struct vmx_vcpu *vcpu) +{ + uint32_t gi __diagused, info; + + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " + "interruptibility-state %#x", gi)); + + info = vmcs_read(VMCS_ENTRY_INTR_INFO); + KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " + "VM-entry interruption information %#x", info)); + + /* + * Inject the virtual NMI. The vector must be the NMI IDT entry + * or the VMCS entry check will fail. + */ + info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID; + vmcs_write(VMCS_ENTRY_INTR_INFO, info); + + VMX_CTR0(vcpu, "Injecting vNMI"); + + /* Clear the request */ + vm_nmi_clear(vcpu->vcpu); +} + +static void +vmx_inject_interrupts(struct vmx_vcpu *vcpu, struct vlapic *vlapic, + uint64_t guestrip) +{ + int vector, need_nmi_exiting, extint_pending; + uint64_t rflags, entryinfo; + uint32_t gi, info; + + if (vcpu->cap.set & (1 << VM_CAP_MASK_HWINTR)) { + return; + } + + if (vcpu->state.nextrip != guestrip) { + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + if (gi & HWINTR_BLOCKING) { + VMX_CTR2(vcpu, "Guest interrupt blocking " + "cleared due to rip change: %#lx/%#lx", + vcpu->state.nextrip, guestrip); + gi &= ~HWINTR_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); + } + } + + if (vm_entry_intinfo(vcpu->vcpu, &entryinfo)) { + KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " + "intinfo is not valid: %#lx", __func__, entryinfo)); + + info = vmcs_read(VMCS_ENTRY_INTR_INFO); + KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " + "pending exception: %#lx/%#x", __func__, entryinfo, info)); + + info = entryinfo; + vector = info & 0xff; + if (vector == IDT_BP || vector == IDT_OF) { + /* + * VT-x requires #BP and #OF to be injected as software + * exceptions. + */ + info &= ~VMCS_INTR_T_MASK; + info |= VMCS_INTR_T_SWEXCEPTION; + } + + if (info & VMCS_INTR_DEL_ERRCODE) + vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); + + vmcs_write(VMCS_ENTRY_INTR_INFO, info); + } + + if (vm_nmi_pending(vcpu->vcpu)) { + /* + * If there are no conditions blocking NMI injection then + * inject it directly here otherwise enable "NMI window + * exiting" to inject it as soon as we can. + * + * We also check for STI_BLOCKING because some implementations + * don't allow NMI injection in this case. If we are running + * on a processor that doesn't have this restriction it will + * immediately exit and the NMI will be injected in the + * "NMI window exiting" handler. + */ + need_nmi_exiting = 1; + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { + info = vmcs_read(VMCS_ENTRY_INTR_INFO); + if ((info & VMCS_INTR_VALID) == 0) { + vmx_inject_nmi(vcpu); + need_nmi_exiting = 0; + } else { + VMX_CTR1(vcpu, "Cannot inject NMI " + "due to VM-entry intr info %#x", info); + } + } else { + VMX_CTR1(vcpu, "Cannot inject NMI due to " + "Guest Interruptibility-state %#x", gi); + } + + if (need_nmi_exiting) + vmx_set_nmi_window_exiting(vcpu); + } + + extint_pending = vm_extint_pending(vcpu->vcpu); + + if (!extint_pending && virtual_interrupt_delivery) { + vmx_inject_pir(vlapic); + return; + } + + /* + * If interrupt-window exiting is already in effect then don't bother + * checking for pending interrupts. This is just an optimization and + * not needed for correctness. 
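[Editorial note, not part of this diff: the info word written to VMCS_ENTRY_INTR_INFO by the injection code above follows the VM-entry interruption-information layout already reflected in vmcs.h: vector in bits 7:0, type in bits 10:8, deliver-error-code in bit 11, valid in bit 31. For instance, queueing a #GP with an error code could be composed from those same macros:]

    /* Illustrative only: queue a #GP(err) for delivery on the next VM entry. */
    vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, err);
    vmcs_write(VMCS_ENTRY_INTR_INFO, IDT_GP | VMCS_INTR_T_HWEXCEPTION |
        VMCS_INTR_DEL_ERRCODE | VMCS_INTR_VALID);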
+ */ + if ((vcpu->cap.proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) { + VMX_CTR0(vcpu, "Skip interrupt injection due to " + "pending int_window_exiting"); + return; + } + + if (!extint_pending) { + /* Ask the local apic for a vector to inject */ + if (!vlapic_pending_intr(vlapic, &vector)) + return; + + /* + * From the Intel SDM, Volume 3, Section "Maskable + * Hardware Interrupts": + * - maskable interrupt vectors [16,255] can be delivered + * through the local APIC. + */ + KASSERT(vector >= 16 && vector <= 255, + ("invalid vector %d from local APIC", vector)); + } else { + /* Ask the legacy pic for a vector to inject */ + vatpic_pending_intr(vcpu->vmx->vm, &vector); + + /* + * From the Intel SDM, Volume 3, Section "Maskable + * Hardware Interrupts": + * - maskable interrupt vectors [0,255] can be delivered + * through the INTR pin. + */ + KASSERT(vector >= 0 && vector <= 255, + ("invalid vector %d from INTR", vector)); + } + + /* Check RFLAGS.IF and the interruptibility state of the guest */ + rflags = vmcs_read(VMCS_GUEST_RFLAGS); + if ((rflags & PSL_I) == 0) { + VMX_CTR2(vcpu, "Cannot inject vector %d due to " + "rflags %#lx", vector, rflags); + goto cantinject; + } + + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + if (gi & HWINTR_BLOCKING) { + VMX_CTR2(vcpu, "Cannot inject vector %d due to " + "Guest Interruptibility-state %#x", vector, gi); + goto cantinject; + } + + info = vmcs_read(VMCS_ENTRY_INTR_INFO); + if (info & VMCS_INTR_VALID) { + /* + * This is expected and could happen for multiple reasons: + * - A vectoring VM-entry was aborted due to astpending + * - A VM-exit happened during event injection. + * - An exception was injected above. + * - An NMI was injected above or after "NMI window exiting" + */ + VMX_CTR2(vcpu, "Cannot inject vector %d due to " + "VM-entry intr info %#x", vector, info); + goto cantinject; + } + + /* Inject the interrupt */ + info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; + info |= vector; + vmcs_write(VMCS_ENTRY_INTR_INFO, info); + + if (!extint_pending) { + /* Update the Local APIC ISR */ + vlapic_intr_accepted(vlapic, vector); + } else { + vm_extint_clear(vcpu->vcpu); + vatpic_intr_accepted(vcpu->vmx->vm, vector); + + /* + * After we accepted the current ExtINT the PIC may + * have posted another one. If that is the case, set + * the Interrupt Window Exiting execution control so + * we can inject that one too. + * + * Also, interrupt window exiting allows us to inject any + * pending APIC vector that was preempted by the ExtINT + * as soon as possible. This applies both for the software + * emulated vlapic and the hardware assisted virtual APIC. + */ + vmx_set_int_window_exiting(vcpu); + } + + VMX_CTR1(vcpu, "Injecting hwintr at vector %d", vector); + + return; + +cantinject: + /* + * Set the Interrupt Window Exiting execution control so we can inject + * the interrupt as soon as blocking condition goes away. + */ + vmx_set_int_window_exiting(vcpu); +} + +/* + * If the Virtual NMIs execution control is '1' then the logical processor + * tracks virtual-NMI blocking in the Guest Interruptibility-state field of + * the VMCS. An IRET instruction in VMX non-root operation will remove any + * virtual-NMI blocking. + * + * This unblocking occurs even if the IRET causes a fault. In this case the + * hypervisor needs to restore virtual-NMI blocking before resuming the guest. 
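[Editorial note, not part of this diff: exit handlers can detect that an IRET dropped virtual-NMI blocking by testing bit 12 of the exit qualification (EXIT_QUAL_NMIUDTI in vmcs.h) and then calling the vmx_restore_nmi_blocking() helper that follows. A simplified sketch of the pattern, which omits the additional handling a full handler needs for exits taken during event delivery:]

    /* Sketch only: re-establish virtual-NMI blocking removed by a guest IRET. */
    if ((vmcs_exit_qualification() & EXIT_QUAL_NMIUDTI) != 0)
            vmx_restore_nmi_blocking(vcpu);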
+ */ +static void +vmx_restore_nmi_blocking(struct vmx_vcpu *vcpu) +{ + uint32_t gi; + + VMX_CTR0(vcpu, "Restore Virtual-NMI blocking"); + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); +} + +static void +vmx_clear_nmi_blocking(struct vmx_vcpu *vcpu) +{ + uint32_t gi; + + VMX_CTR0(vcpu, "Clear Virtual-NMI blocking"); + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); +} + +static void +vmx_assert_nmi_blocking(struct vmx_vcpu *vcpu) +{ + uint32_t gi __diagused; + + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING, + ("NMI blocking is not in effect %#x", gi)); +} + +static int +vmx_emulate_xsetbv(struct vmx *vmx, struct vmx_vcpu *vcpu, + struct vm_exit *vmexit) +{ + struct vmxctx *vmxctx; + uint64_t xcrval; + const struct xsave_limits *limits; + + vmxctx = &vcpu->ctx; + limits = vmm_get_xsave_limits(); + + /* + * Note that the processor raises a GP# fault on its own if + * xsetbv is executed for CPL != 0, so we do not have to + * emulate that fault here. + */ + + /* Only xcr0 is supported. */ + if (vmxctx->guest_rcx != 0) { + vm_inject_gp(vcpu->vcpu); + return (HANDLED); + } + + /* We only handle xcr0 if both the host and guest have XSAVE enabled. */ + if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) { + vm_inject_ud(vcpu->vcpu); + return (HANDLED); + } + + xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff); + if ((xcrval & ~limits->xcr0_allowed) != 0) { + vm_inject_gp(vcpu->vcpu); + return (HANDLED); + } + + if (!(xcrval & XFEATURE_ENABLED_X87)) { + vm_inject_gp(vcpu->vcpu); + return (HANDLED); + } + + /* AVX (YMM_Hi128) requires SSE. */ + if (xcrval & XFEATURE_ENABLED_AVX && + (xcrval & XFEATURE_AVX) != XFEATURE_AVX) { + vm_inject_gp(vcpu->vcpu); + return (HANDLED); + } + + /* + * AVX512 requires base AVX (YMM_Hi128) as well as OpMask, + * ZMM_Hi256, and Hi16_ZMM. + */ + if (xcrval & XFEATURE_AVX512 && + (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) != + (XFEATURE_AVX512 | XFEATURE_AVX)) { + vm_inject_gp(vcpu->vcpu); + return (HANDLED); + } + + /* + * Intel MPX requires both bound register state flags to be + * set. + */ + if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) != + ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) { + vm_inject_gp(vcpu->vcpu); + return (HANDLED); + } + + /* + * This runs "inside" vmrun() with the guest's FPU state, so + * modifying xcr0 directly modifies the guest's xcr0, not the + * host's. 
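[Editorial note, not part of this diff: a few concrete xcrval examples against the checks above, assuming the host's xcr0_allowed covers the features in question (bit values: X87 0x1, SSE 0x2, AVX 0x4, BNDREGS 0x8, BNDCSR 0x10):]

    /* xcrval = 0x07 (X87|SSE|AVX)     -> accepted, AVX has its SSE prerequisite */
    /* xcrval = 0x05 (X87|AVX)         -> #GP, AVX without SSE                   */
    /* xcrval = 0x02 (SSE)             -> #GP, the X87 bit must always be set    */
    /* xcrval = 0x0B (X87|SSE|BNDREGS) -> #GP, the MPX bits must be set in pairs */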
+ */ + load_xcr(0, xcrval); + return (HANDLED); +} + +static uint64_t +vmx_get_guest_reg(struct vmx_vcpu *vcpu, int ident) +{ + const struct vmxctx *vmxctx; + + vmxctx = &vcpu->ctx; + + switch (ident) { + case 0: + return (vmxctx->guest_rax); + case 1: + return (vmxctx->guest_rcx); + case 2: + return (vmxctx->guest_rdx); + case 3: + return (vmxctx->guest_rbx); + case 4: + return (vmcs_read(VMCS_GUEST_RSP)); + case 5: + return (vmxctx->guest_rbp); + case 6: + return (vmxctx->guest_rsi); + case 7: + return (vmxctx->guest_rdi); + case 8: + return (vmxctx->guest_r8); + case 9: + return (vmxctx->guest_r9); + case 10: + return (vmxctx->guest_r10); + case 11: + return (vmxctx->guest_r11); + case 12: + return (vmxctx->guest_r12); + case 13: + return (vmxctx->guest_r13); + case 14: + return (vmxctx->guest_r14); + case 15: + return (vmxctx->guest_r15); + default: + panic("invalid vmx register %d", ident); + } +} + +static void +vmx_set_guest_reg(struct vmx_vcpu *vcpu, int ident, uint64_t regval) +{ + struct vmxctx *vmxctx; + + vmxctx = &vcpu->ctx; + + switch (ident) { + case 0: + vmxctx->guest_rax = regval; + break; + case 1: + vmxctx->guest_rcx = regval; + break; + case 2: + vmxctx->guest_rdx = regval; + break; + case 3: + vmxctx->guest_rbx = regval; + break; + case 4: + vmcs_write(VMCS_GUEST_RSP, regval); + break; + case 5: + vmxctx->guest_rbp = regval; + break; + case 6: + vmxctx->guest_rsi = regval; + break; + case 7: + vmxctx->guest_rdi = regval; + break; + case 8: + vmxctx->guest_r8 = regval; + break; + case 9: + vmxctx->guest_r9 = regval; + break; + case 10: + vmxctx->guest_r10 = regval; + break; + case 11: + vmxctx->guest_r11 = regval; + break; + case 12: + vmxctx->guest_r12 = regval; + break; + case 13: + vmxctx->guest_r13 = regval; + break; + case 14: + vmxctx->guest_r14 = regval; + break; + case 15: + vmxctx->guest_r15 = regval; + break; + default: + panic("invalid vmx register %d", ident); + } +} + +static int +vmx_emulate_cr0_access(struct vmx_vcpu *vcpu, uint64_t exitqual) +{ + uint64_t crval, regval; + + /* We only handle mov to %cr0 at this time */ + if ((exitqual & 0xf0) != 0x00) + return (UNHANDLED); + + regval = vmx_get_guest_reg(vcpu, (exitqual >> 8) & 0xf); + + vmcs_write(VMCS_CR0_SHADOW, regval); + + crval = regval | cr0_ones_mask; + crval &= ~cr0_zeros_mask; + vmcs_write(VMCS_GUEST_CR0, crval); + + if (regval & CR0_PG) { + uint64_t efer, entry_ctls; + + /* + * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and + * the "IA-32e mode guest" bit in VM-entry control must be + * equal. + */ + efer = vmcs_read(VMCS_GUEST_IA32_EFER); + if (efer & EFER_LME) { + efer |= EFER_LMA; + vmcs_write(VMCS_GUEST_IA32_EFER, efer); + entry_ctls = vmcs_read(VMCS_ENTRY_CTLS); + entry_ctls |= VM_ENTRY_GUEST_LMA; + vmcs_write(VMCS_ENTRY_CTLS, entry_ctls); + } + } + + return (HANDLED); +} + +static int +vmx_emulate_cr4_access(struct vmx_vcpu *vcpu, uint64_t exitqual) +{ + uint64_t crval, regval; + + /* We only handle mov to %cr4 at this time */ + if ((exitqual & 0xf0) != 0x00) + return (UNHANDLED); + + regval = vmx_get_guest_reg(vcpu, (exitqual >> 8) & 0xf); + + vmcs_write(VMCS_CR4_SHADOW, regval); + + crval = regval | cr4_ones_mask; + crval &= ~cr4_zeros_mask; + vmcs_write(VMCS_GUEST_CR4, crval); + + return (HANDLED); +} + +static int +vmx_emulate_cr8_access(struct vmx *vmx, struct vmx_vcpu *vcpu, + uint64_t exitqual) +{ + struct vlapic *vlapic; + uint64_t cr8; + int regnum; + + /* We only handle mov %cr8 to/from a register at this time. 
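+ * The exit qualification encodes the CR number in bits 3:0, the access type + * in bits 5:4 (0 = mov to CR, 1 = mov from CR) and the GPR operand in bits + * 11:8; this is how the cr0/cr4 handlers above pick the source register.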
*/ + if ((exitqual & 0xe0) != 0x00) { + return (UNHANDLED); + } + + vlapic = vm_lapic(vcpu->vcpu); + regnum = (exitqual >> 8) & 0xf; + if (exitqual & 0x10) { + cr8 = vlapic_get_cr8(vlapic); + vmx_set_guest_reg(vcpu, regnum, cr8); + } else { + cr8 = vmx_get_guest_reg(vcpu, regnum); + vlapic_set_cr8(vlapic, cr8); + } + + return (HANDLED); +} + +/* + * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL + */ +static int +vmx_cpl(void) +{ + uint32_t ssar; + + ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS); + return ((ssar >> 5) & 0x3); +} + +static enum vm_cpu_mode +vmx_cpu_mode(void) +{ + uint32_t csar; + + if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) { + csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); + if (csar & 0x2000) + return (CPU_MODE_64BIT); /* CS.L = 1 */ + else + return (CPU_MODE_COMPATIBILITY); + } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) { + return (CPU_MODE_PROTECTED); + } else { + return (CPU_MODE_REAL); + } +} + +static enum vm_paging_mode +vmx_paging_mode(void) +{ + uint64_t cr4; + + if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG)) + return (PAGING_MODE_FLAT); + cr4 = vmcs_read(VMCS_GUEST_CR4); + if (!(cr4 & CR4_PAE)) + return (PAGING_MODE_32); + if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) { + if (!(cr4 & CR4_LA57)) + return (PAGING_MODE_64); + return (PAGING_MODE_64_LA57); + } else + return (PAGING_MODE_PAE); +} + +static uint64_t +inout_str_index(struct vmx_vcpu *vcpu, int in) +{ + uint64_t val; + int error __diagused; + enum vm_reg_name reg; + + reg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; + error = vmx_getreg(vcpu, reg, &val); + KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error)); + return (val); +} + +static uint64_t +inout_str_count(struct vmx_vcpu *vcpu, int rep) +{ + uint64_t val; + int error __diagused; + + if (rep) { + error = vmx_getreg(vcpu, VM_REG_GUEST_RCX, &val); + KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error)); + } else { + val = 1; + } + return (val); +} + +static int +inout_str_addrsize(uint32_t inst_info) +{ + uint32_t size; + + size = (inst_info >> 7) & 0x7; + switch (size) { + case 0: + return (2); /* 16 bit */ + case 1: + return (4); /* 32 bit */ + case 2: + return (8); /* 64 bit */ + default: + panic("%s: invalid size encoding %d", __func__, size); + } +} + +static void +inout_str_seginfo(struct vmx_vcpu *vcpu, uint32_t inst_info, int in, + struct vm_inout_str *vis) +{ + int error __diagused, s; + + if (in) { + vis->seg_name = VM_REG_GUEST_ES; + } else { + s = (inst_info >> 15) & 0x7; + vis->seg_name = vm_segment_name(s); + } + + error = vmx_getdesc(vcpu, vis->seg_name, &vis->seg_desc); + KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error)); +} + +static void +vmx_paging_info(struct vm_guest_paging *paging) +{ + paging->cr3 = vmcs_guest_cr3(); + paging->cpl = vmx_cpl(); + paging->cpu_mode = vmx_cpu_mode(); + paging->paging_mode = vmx_paging_mode(); +} + +static void +vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) +{ + struct vm_guest_paging *paging; + uint32_t csar; + + paging = &vmexit->u.inst_emul.paging; + + vmexit->exitcode = VM_EXITCODE_INST_EMUL; + vmexit->inst_length = 0; + vmexit->u.inst_emul.gpa = gpa; + vmexit->u.inst_emul.gla = gla; + vmx_paging_info(paging); + switch (paging->cpu_mode) { + case CPU_MODE_REAL: + vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); + vmexit->u.inst_emul.cs_d = 0; + break; + case CPU_MODE_PROTECTED: + case CPU_MODE_COMPATIBILITY: + vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); + csar = 
vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); + vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar); + break; + default: + vmexit->u.inst_emul.cs_base = 0; + vmexit->u.inst_emul.cs_d = 0; + break; + } + vie_init(&vmexit->u.inst_emul.vie, NULL, 0); +} + +static int +ept_fault_type(uint64_t ept_qual) +{ + int fault_type; + + if (ept_qual & EPT_VIOLATION_DATA_WRITE) + fault_type = VM_PROT_WRITE; + else if (ept_qual & EPT_VIOLATION_INST_FETCH) + fault_type = VM_PROT_EXECUTE; + else + fault_type= VM_PROT_READ; + + return (fault_type); +} + +static bool +ept_emulation_fault(uint64_t ept_qual) +{ + int read, write; + + /* EPT fault on an instruction fetch doesn't make sense here */ + if (ept_qual & EPT_VIOLATION_INST_FETCH) + return (false); + + /* EPT fault must be a read fault or a write fault */ + read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; + write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; + if ((read | write) == 0) + return (false); + + /* + * The EPT violation must have been caused by accessing a + * guest-physical address that is a translation of a guest-linear + * address. + */ + if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || + (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { + return (false); + } + + return (true); +} + +static __inline int +apic_access_virtualization(struct vmx_vcpu *vcpu) +{ + uint32_t proc_ctls2; + + proc_ctls2 = vcpu->cap.proc_ctls2; + return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0); +} + +static __inline int +x2apic_virtualization(struct vmx_vcpu *vcpu) +{ + uint32_t proc_ctls2; + + proc_ctls2 = vcpu->cap.proc_ctls2; + return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0); +} + +static int +vmx_handle_apic_write(struct vmx_vcpu *vcpu, struct vlapic *vlapic, + uint64_t qual) +{ + int error, handled, offset; + uint32_t *apic_regs, vector; + bool retu; + + handled = HANDLED; + offset = APIC_WRITE_OFFSET(qual); + + if (!apic_access_virtualization(vcpu)) { + /* + * In general there should not be any APIC write VM-exits + * unless APIC-access virtualization is enabled. + * + * However self-IPI virtualization can legitimately trigger + * an APIC-write VM-exit so treat it specially. + */ + if (x2apic_virtualization(vcpu) && + offset == APIC_OFFSET_SELF_IPI) { + apic_regs = (uint32_t *)(vlapic->apic_page); + vector = apic_regs[APIC_OFFSET_SELF_IPI / 4]; + vlapic_self_ipi_handler(vlapic, vector); + return (HANDLED); + } else + return (UNHANDLED); + } + + switch (offset) { + case APIC_OFFSET_ID: + vlapic_id_write_handler(vlapic); + break; + case APIC_OFFSET_LDR: + vlapic_ldr_write_handler(vlapic); + break; + case APIC_OFFSET_DFR: + vlapic_dfr_write_handler(vlapic); + break; + case APIC_OFFSET_SVR: + vlapic_svr_write_handler(vlapic); + break; + case APIC_OFFSET_ESR: + vlapic_esr_write_handler(vlapic); + break; + case APIC_OFFSET_ICR_LOW: + retu = false; + error = vlapic_icrlo_write_handler(vlapic, &retu); + if (error != 0 || retu) + handled = UNHANDLED; + break; + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_LVT ... 
APIC_OFFSET_ERROR_LVT: + vlapic_lvt_write_handler(vlapic, offset); + break; + case APIC_OFFSET_TIMER_ICR: + vlapic_icrtmr_write_handler(vlapic); + break; + case APIC_OFFSET_TIMER_DCR: + vlapic_dcr_write_handler(vlapic); + break; + default: + handled = UNHANDLED; + break; + } + return (handled); +} + +static bool +apic_access_fault(struct vmx_vcpu *vcpu, uint64_t gpa) +{ + + if (apic_access_virtualization(vcpu) && + (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE)) + return (true); + else + return (false); +} + +static int +vmx_handle_apic_access(struct vmx_vcpu *vcpu, struct vm_exit *vmexit) +{ + uint64_t qual; + int access_type, offset, allowed; + + if (!apic_access_virtualization(vcpu)) + return (UNHANDLED); + + qual = vmexit->u.vmx.exit_qualification; + access_type = APIC_ACCESS_TYPE(qual); + offset = APIC_ACCESS_OFFSET(qual); + + allowed = 0; + if (access_type == 0) { + /* + * Read data access to the following registers is expected. + */ + switch (offset) { + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_CCR: + allowed = 1; + break; + default: + break; + } + } else if (access_type == 1) { + /* + * Write data access to the following registers is expected. + */ + switch (offset) { + case APIC_OFFSET_VER: + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_CCR: + allowed = 1; + break; + default: + break; + } + } + + if (allowed) { + vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset, + VIE_INVALID_GLA); + } + + /* + * Regardless of whether the APIC-access is allowed this handler + * always returns UNHANDLED: + * - if the access is allowed then it is handled by emulating the + * instruction that caused the VM-exit (outside the critical section) + * - if the access is not allowed then it will be converted to an + * exitcode of VM_EXITCODE_VMX and will be dealt with in userland. 
+ */ + return (UNHANDLED); +} + +static enum task_switch_reason +vmx_task_switch_reason(uint64_t qual) +{ + int reason; + + reason = (qual >> 30) & 0x3; + switch (reason) { + case 0: + return (TSR_CALL); + case 1: + return (TSR_IRET); + case 2: + return (TSR_JMP); + case 3: + return (TSR_IDT_GATE); + default: + panic("%s: invalid reason %d", __func__, reason); + } +} + +static int +emulate_wrmsr(struct vmx_vcpu *vcpu, u_int num, uint64_t val, bool *retu) +{ + int error; + + if (lapic_msr(num)) + error = lapic_wrmsr(vcpu->vcpu, num, val, retu); + else + error = vmx_wrmsr(vcpu, num, val, retu); + + return (error); +} + +static int +emulate_rdmsr(struct vmx_vcpu *vcpu, u_int num, bool *retu) +{ + struct vmxctx *vmxctx; + uint64_t result; + uint32_t eax, edx; + int error; + + if (lapic_msr(num)) + error = lapic_rdmsr(vcpu->vcpu, num, &result, retu); + else + error = vmx_rdmsr(vcpu, num, &result, retu); + + if (error == 0) { + eax = result; + vmxctx = &vcpu->ctx; + error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax); + KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error)); + + edx = result >> 32; + error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx); + KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error)); + } + + return (error); +} + +static int +vmx_exit_process(struct vmx *vmx, struct vmx_vcpu *vcpu, struct vm_exit *vmexit) +{ + int error, errcode, errcode_valid, handled, in; + struct vmxctx *vmxctx; + struct vlapic *vlapic; + struct vm_inout_str *vis; + struct vm_task_switch *ts; + uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; + uint32_t intr_type, intr_vec, reason; + uint64_t exitintinfo, qual, gpa; +#ifdef KDTRACE_HOOKS + int vcpuid; +#endif + bool retu; + + CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); + CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); + + handled = UNHANDLED; + vmxctx = &vcpu->ctx; +#ifdef KDTRACE_HOOKS + vcpuid = vcpu->vcpuid; +#endif + + qual = vmexit->u.vmx.exit_qualification; + reason = vmexit->u.vmx.exit_reason; + vmexit->exitcode = VM_EXITCODE_BOGUS; + + vmm_stat_incr(vcpu->vcpu, VMEXIT_COUNT, 1); + SDT_PROBE3(vmm, vmx, exit, entry, vmx, vcpuid, vmexit); + + /* + * VM-entry failures during or after loading guest state. + * + * These VM-exits are uncommon but must be handled specially + * as most VM-exit fields are not populated as usual. + */ + if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) { + VMX_CTR0(vcpu, "Handling MCE during VM-entry"); + __asm __volatile("int $18"); + return (1); + } + + /* + * VM exits that can be triggered during event delivery need to + * be handled specially by re-injecting the event if the IDT + * vectoring information field's valid bit is set. + * + * See "Information for VM Exits During Event Delivery" in Intel SDM + * for details. + */ + idtvec_info = vmcs_idt_vectoring_info(); + if (idtvec_info & VMCS_IDT_VEC_VALID) { + idtvec_info &= ~(1 << 12); /* clear undefined bit */ + exitintinfo = idtvec_info; + if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { + idtvec_err = vmcs_idt_vectoring_err(); + exitintinfo |= (uint64_t)idtvec_err << 32; + } + error = vm_exit_intinfo(vcpu->vcpu, exitintinfo); + KASSERT(error == 0, ("%s: vm_set_intinfo error %d", + __func__, error)); + + /* + * If 'virtual NMIs' are being used and the VM-exit + * happened while injecting an NMI during the previous + * VM-entry, then clear "blocking by NMI" in the + * Guest Interruptibility-State so the NMI can be + * reinjected on the subsequent VM-entry. 
+ * + * However, if the NMI was being delivered through a task + * gate, then the new task must start execution with NMIs + * blocked so don't clear NMI blocking in this case. + */ + intr_type = idtvec_info & VMCS_INTR_T_MASK; + if (intr_type == VMCS_INTR_T_NMI) { + if (reason != EXIT_REASON_TASK_SWITCH) + vmx_clear_nmi_blocking(vcpu); + else + vmx_assert_nmi_blocking(vcpu); + } + + /* + * Update VM-entry instruction length if the event being + * delivered was a software interrupt or software exception. + */ + if (intr_type == VMCS_INTR_T_SWINTR || + intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION || + intr_type == VMCS_INTR_T_SWEXCEPTION) { + vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); + } + } + + switch (reason) { + case EXIT_REASON_TASK_SWITCH: + ts = &vmexit->u.task_switch; + ts->tsssel = qual & 0xffff; + ts->reason = vmx_task_switch_reason(qual); + ts->ext = 0; + ts->errcode_valid = 0; + vmx_paging_info(&ts->paging); + /* + * If the task switch was due to a CALL, JMP, IRET, software + * interrupt (INT n) or software exception (INT3, INTO), + * then the saved %rip references the instruction that caused + * the task switch. The instruction length field in the VMCS + * is valid in this case. + * + * In all other cases (e.g., NMI, hardware exception) the + * saved %rip is one that would have been saved in the old TSS + * had the task switch completed normally so the instruction + * length field is not needed in this case and is explicitly + * set to 0. + */ + if (ts->reason == TSR_IDT_GATE) { + KASSERT(idtvec_info & VMCS_IDT_VEC_VALID, + ("invalid idtvec_info %#x for IDT task switch", + idtvec_info)); + intr_type = idtvec_info & VMCS_INTR_T_MASK; + if (intr_type != VMCS_INTR_T_SWINTR && + intr_type != VMCS_INTR_T_SWEXCEPTION && + intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) { + /* Task switch triggered by external event */ + ts->ext = 1; + vmexit->inst_length = 0; + if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { + ts->errcode_valid = 1; + ts->errcode = vmcs_idt_vectoring_err(); + } + } + } + vmexit->exitcode = VM_EXITCODE_TASK_SWITCH; + SDT_PROBE4(vmm, vmx, exit, taskswitch, vmx, vcpuid, vmexit, ts); + VMX_CTR4(vcpu, "task switch reason %d, tss 0x%04x, " + "%s errcode 0x%016lx", ts->reason, ts->tsssel, + ts->ext ? 
"external" : "internal", + ((uint64_t)ts->errcode << 32) | ts->errcode_valid); + break; + case EXIT_REASON_CR_ACCESS: + vmm_stat_incr(vcpu->vcpu, VMEXIT_CR_ACCESS, 1); + SDT_PROBE4(vmm, vmx, exit, craccess, vmx, vcpuid, vmexit, qual); + switch (qual & 0xf) { + case 0: + handled = vmx_emulate_cr0_access(vcpu, qual); + break; + case 4: + handled = vmx_emulate_cr4_access(vcpu, qual); + break; + case 8: + handled = vmx_emulate_cr8_access(vmx, vcpu, qual); + break; + } + break; + case EXIT_REASON_RDMSR: + vmm_stat_incr(vcpu->vcpu, VMEXIT_RDMSR, 1); + retu = false; + ecx = vmxctx->guest_rcx; + VMX_CTR1(vcpu, "rdmsr 0x%08x", ecx); + SDT_PROBE4(vmm, vmx, exit, rdmsr, vmx, vcpuid, vmexit, ecx); + error = emulate_rdmsr(vcpu, ecx, &retu); + if (error) { + vmexit->exitcode = VM_EXITCODE_RDMSR; + vmexit->u.msr.code = ecx; + } else if (!retu) { + handled = HANDLED; + } else { + /* Return to userspace with a valid exitcode */ + KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, + ("emulate_rdmsr retu with bogus exitcode")); + } + break; + case EXIT_REASON_WRMSR: + vmm_stat_incr(vcpu->vcpu, VMEXIT_WRMSR, 1); + retu = false; + eax = vmxctx->guest_rax; + ecx = vmxctx->guest_rcx; + edx = vmxctx->guest_rdx; + VMX_CTR2(vcpu, "wrmsr 0x%08x value 0x%016lx", + ecx, (uint64_t)edx << 32 | eax); + SDT_PROBE5(vmm, vmx, exit, wrmsr, vmx, vmexit, vcpuid, ecx, + (uint64_t)edx << 32 | eax); + error = emulate_wrmsr(vcpu, ecx, (uint64_t)edx << 32 | eax, + &retu); + if (error) { + vmexit->exitcode = VM_EXITCODE_WRMSR; + vmexit->u.msr.code = ecx; + vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; + } else if (!retu) { + handled = HANDLED; + } else { + /* Return to userspace with a valid exitcode */ + KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, + ("emulate_wrmsr retu with bogus exitcode")); + } + break; + case EXIT_REASON_HLT: + vmm_stat_incr(vcpu->vcpu, VMEXIT_HLT, 1); + SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpuid, vmexit); + vmexit->exitcode = VM_EXITCODE_HLT; + vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); + if (virtual_interrupt_delivery) + vmexit->u.hlt.intr_status = + vmcs_read(VMCS_GUEST_INTR_STATUS); + else + vmexit->u.hlt.intr_status = 0; + break; + case EXIT_REASON_MTF: + vmm_stat_incr(vcpu->vcpu, VMEXIT_MTRAP, 1); + SDT_PROBE3(vmm, vmx, exit, mtrap, vmx, vcpuid, vmexit); + vmexit->exitcode = VM_EXITCODE_MTRAP; + vmexit->inst_length = 0; + break; + case EXIT_REASON_PAUSE: + vmm_stat_incr(vcpu->vcpu, VMEXIT_PAUSE, 1); + SDT_PROBE3(vmm, vmx, exit, pause, vmx, vcpuid, vmexit); + vmexit->exitcode = VM_EXITCODE_PAUSE; + break; + case EXIT_REASON_INTR_WINDOW: + vmm_stat_incr(vcpu->vcpu, VMEXIT_INTR_WINDOW, 1); + SDT_PROBE3(vmm, vmx, exit, intrwindow, vmx, vcpuid, vmexit); + vmx_clear_int_window_exiting(vcpu); + return (1); + case EXIT_REASON_EXT_INTR: + /* + * External interrupts serve only to cause VM exits and allow + * the host interrupt handler to run. + * + * If this external interrupt triggers a virtual interrupt + * to a VM, then that state will be recorded by the + * host interrupt handler in the VM's softc. We will inject + * this virtual interrupt during the subsequent VM enter. + */ + intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); + SDT_PROBE4(vmm, vmx, exit, interrupt, + vmx, vcpuid, vmexit, intr_info); + + /* + * XXX: Ignore this exit if VMCS_INTR_VALID is not set. + * This appears to be a bug in VMware Fusion? 
+ */ + if (!(intr_info & VMCS_INTR_VALID)) + return (1); + KASSERT((intr_info & VMCS_INTR_VALID) != 0 && + (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, + ("VM exit interruption info invalid: %#x", intr_info)); + vmx_trigger_hostintr(intr_info & 0xff); + + /* + * This is special. We want to treat this as an 'handled' + * VM-exit but not increment the instruction pointer. + */ + vmm_stat_incr(vcpu->vcpu, VMEXIT_EXTINT, 1); + return (1); + case EXIT_REASON_NMI_WINDOW: + SDT_PROBE3(vmm, vmx, exit, nmiwindow, vmx, vcpuid, vmexit); + /* Exit to allow the pending virtual NMI to be injected */ + if (vm_nmi_pending(vcpu->vcpu)) + vmx_inject_nmi(vcpu); + vmx_clear_nmi_window_exiting(vcpu); + vmm_stat_incr(vcpu->vcpu, VMEXIT_NMI_WINDOW, 1); + return (1); + case EXIT_REASON_INOUT: + vmm_stat_incr(vcpu->vcpu, VMEXIT_INOUT, 1); + vmexit->exitcode = VM_EXITCODE_INOUT; + vmexit->u.inout.bytes = (qual & 0x7) + 1; + vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0; + vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; + vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0; + vmexit->u.inout.port = (uint16_t)(qual >> 16); + vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); + if (vmexit->u.inout.string) { + inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO); + vmexit->exitcode = VM_EXITCODE_INOUT_STR; + vis = &vmexit->u.inout_str; + vmx_paging_info(&vis->paging); + vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS); + vis->cr0 = vmcs_read(VMCS_GUEST_CR0); + vis->index = inout_str_index(vcpu, in); + vis->count = inout_str_count(vcpu, vis->inout.rep); + vis->addrsize = inout_str_addrsize(inst_info); + vis->cs_d = 0; + vis->cs_base = 0; + inout_str_seginfo(vcpu, inst_info, in, vis); + } + SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpuid, vmexit); + break; + case EXIT_REASON_CPUID: + vmm_stat_incr(vcpu->vcpu, VMEXIT_CPUID, 1); + SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpuid, vmexit); + handled = vmx_handle_cpuid(vcpu, vmxctx); + break; + case EXIT_REASON_EXCEPTION: + vmm_stat_incr(vcpu->vcpu, VMEXIT_EXCEPTION, 1); + intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); + KASSERT((intr_info & VMCS_INTR_VALID) != 0, + ("VM exit interruption info invalid: %#x", intr_info)); + + intr_vec = intr_info & 0xff; + intr_type = intr_info & VMCS_INTR_T_MASK; + + /* + * If Virtual NMIs control is 1 and the VM-exit is due to a + * fault encountered during the execution of IRET then we must + * restore the state of "virtual-NMI blocking" before resuming + * the guest. + * + * See "Resuming Guest Software after Handling an Exception". + * See "Information for VM Exits Due to Vectored Events". + */ + if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && + (intr_vec != IDT_DF) && + (intr_info & EXIT_QUAL_NMIUDTI) != 0) + vmx_restore_nmi_blocking(vcpu); + + /* + * The NMI has already been handled in vmx_exit_handle_nmi(). + */ + if (intr_type == VMCS_INTR_T_NMI) + return (1); + + /* + * Call the machine check handler by hand. Also don't reflect + * the machine check back into the guest. + */ + if (intr_vec == IDT_MC) { + VMX_CTR0(vcpu, "Vectoring to MCE handler"); + __asm __volatile("int $18"); + return (1); + } + + /* + * If the hypervisor has requested user exits for + * debug exceptions, bounce them out to userland. 
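+ * This only applies to a #BP raised as a software exception (INT3), and + * only when the VM_CAP_BPT_EXIT capability has been enabled on this vcpu.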
+ */ + if (intr_type == VMCS_INTR_T_SWEXCEPTION && intr_vec == IDT_BP && + (vcpu->cap.set & (1 << VM_CAP_BPT_EXIT))) { + vmexit->exitcode = VM_EXITCODE_BPT; + vmexit->u.bpt.inst_length = vmexit->inst_length; + vmexit->inst_length = 0; + break; + } + + if (intr_vec == IDT_PF) { + error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual); + KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d", + __func__, error)); + } + + /* + * Software exceptions exhibit trap-like behavior. This in + * turn requires populating the VM-entry instruction length + * so that the %rip in the trap frame is past the INT3/INTO + * instruction. + */ + if (intr_type == VMCS_INTR_T_SWEXCEPTION) + vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); + + /* Reflect all other exceptions back into the guest */ + errcode_valid = errcode = 0; + if (intr_info & VMCS_INTR_DEL_ERRCODE) { + errcode_valid = 1; + errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE); + } + VMX_CTR2(vcpu, "Reflecting exception %d/%#x into " + "the guest", intr_vec, errcode); + SDT_PROBE5(vmm, vmx, exit, exception, + vmx, vcpuid, vmexit, intr_vec, errcode); + error = vm_inject_exception(vcpu->vcpu, intr_vec, + errcode_valid, errcode, 0); + KASSERT(error == 0, ("%s: vm_inject_exception error %d", + __func__, error)); + return (1); + + case EXIT_REASON_EPT_FAULT: + /* + * If 'gpa' lies within the address space allocated to + * memory then this must be a nested page fault otherwise + * this must be an instruction that accesses MMIO space. + */ + gpa = vmcs_gpa(); + if (vm_mem_allocated(vcpu->vcpu, gpa) || + ppt_is_mmio(vmx->vm, gpa) || apic_access_fault(vcpu, gpa)) { + vmexit->exitcode = VM_EXITCODE_PAGING; + vmexit->inst_length = 0; + vmexit->u.paging.gpa = gpa; + vmexit->u.paging.fault_type = ept_fault_type(qual); + vmm_stat_incr(vcpu->vcpu, VMEXIT_NESTED_FAULT, 1); + SDT_PROBE5(vmm, vmx, exit, nestedfault, + vmx, vcpuid, vmexit, gpa, qual); + } else if (ept_emulation_fault(qual)) { + vmexit_inst_emul(vmexit, gpa, vmcs_gla()); + vmm_stat_incr(vcpu->vcpu, VMEXIT_INST_EMUL, 1); + SDT_PROBE4(vmm, vmx, exit, mmiofault, + vmx, vcpuid, vmexit, gpa); + } + /* + * If Virtual NMIs control is 1 and the VM-exit is due to an + * EPT fault during the execution of IRET then we must restore + * the state of "virtual-NMI blocking" before resuming. + * + * See description of "NMI unblocking due to IRET" in + * "Exit Qualification for EPT Violations". + */ + if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && + (qual & EXIT_QUAL_NMIUDTI) != 0) + vmx_restore_nmi_blocking(vcpu); + break; + case EXIT_REASON_VIRTUALIZED_EOI: + vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI; + vmexit->u.ioapic_eoi.vector = qual & 0xFF; + SDT_PROBE3(vmm, vmx, exit, eoi, vmx, vcpuid, vmexit); + vmexit->inst_length = 0; /* trap-like */ + break; + case EXIT_REASON_APIC_ACCESS: + SDT_PROBE3(vmm, vmx, exit, apicaccess, vmx, vcpuid, vmexit); + handled = vmx_handle_apic_access(vcpu, vmexit); + break; + case EXIT_REASON_APIC_WRITE: + /* + * APIC-write VM exit is trap-like so the %rip is already + * pointing to the next instruction. 
+ */ + vmexit->inst_length = 0; + vlapic = vm_lapic(vcpu->vcpu); + SDT_PROBE4(vmm, vmx, exit, apicwrite, + vmx, vcpuid, vmexit, vlapic); + handled = vmx_handle_apic_write(vcpu, vlapic, qual); + break; + case EXIT_REASON_XSETBV: + SDT_PROBE3(vmm, vmx, exit, xsetbv, vmx, vcpuid, vmexit); + handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit); + break; + case EXIT_REASON_MONITOR: + SDT_PROBE3(vmm, vmx, exit, monitor, vmx, vcpuid, vmexit); + vmexit->exitcode = VM_EXITCODE_MONITOR; + break; + case EXIT_REASON_MWAIT: + SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpuid, vmexit); + vmexit->exitcode = VM_EXITCODE_MWAIT; + break; + case EXIT_REASON_TPR: + vlapic = vm_lapic(vcpu->vcpu); + vlapic_sync_tpr(vlapic); + vmexit->inst_length = 0; + handled = HANDLED; + break; + case EXIT_REASON_VMCALL: + case EXIT_REASON_VMCLEAR: + case EXIT_REASON_VMLAUNCH: + case EXIT_REASON_VMPTRLD: + case EXIT_REASON_VMPTRST: + case EXIT_REASON_VMREAD: + case EXIT_REASON_VMRESUME: + case EXIT_REASON_VMWRITE: + case EXIT_REASON_VMXOFF: + case EXIT_REASON_VMXON: + SDT_PROBE3(vmm, vmx, exit, vminsn, vmx, vcpuid, vmexit); + vmexit->exitcode = VM_EXITCODE_VMINSN; + break; + case EXIT_REASON_INVD: + case EXIT_REASON_WBINVD: + /* ignore exit */ + handled = HANDLED; + break; + default: + SDT_PROBE4(vmm, vmx, exit, unknown, + vmx, vcpuid, vmexit, reason); + vmm_stat_incr(vcpu->vcpu, VMEXIT_UNKNOWN, 1); + break; + } + + if (handled) { + /* + * It is possible that control is returned to userland + * even though we were able to handle the VM exit in the + * kernel. + * + * In such a case we want to make sure that the userland + * restarts guest execution at the instruction *after* + * the one we just processed. Therefore we update the + * guest rip in the VMCS and in 'vmexit'. + */ + vmexit->rip += vmexit->inst_length; + vmexit->inst_length = 0; + vmcs_write(VMCS_GUEST_RIP, vmexit->rip); + } else { + if (vmexit->exitcode == VM_EXITCODE_BOGUS) { + /* + * If this VM exit was not claimed by anybody then + * treat it as a generic VMX exit. + */ + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.status = VM_SUCCESS; + vmexit->u.vmx.inst_type = 0; + vmexit->u.vmx.inst_error = 0; + } else { + /* + * The exitcode and collateral have been populated. + * The VM exit will be processed further in userland. + */ + } + } + + SDT_PROBE4(vmm, vmx, exit, return, + vmx, vcpuid, vmexit, handled); + return (handled); +} + +static __inline void +vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit) +{ + + KASSERT(vmxctx->inst_fail_status != VM_SUCCESS, + ("vmx_exit_inst_error: invalid inst_fail_status %d", + vmxctx->inst_fail_status)); + + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.status = vmxctx->inst_fail_status; + vmexit->u.vmx.inst_error = vmcs_instruction_error(); + vmexit->u.vmx.exit_reason = ~0; + vmexit->u.vmx.exit_qualification = ~0; + + switch (rc) { + case VMX_VMRESUME_ERROR: + case VMX_VMLAUNCH_ERROR: + vmexit->u.vmx.inst_type = rc; + break; + default: + panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc); + } +} + +/* + * If the NMI-exiting VM execution control is set to '1' then an NMI in + * non-root operation causes a VM-exit. NMI blocking is in effect so it is + * sufficient to simply vector to the NMI handler via a software interrupt. + * However, this must be done before maskable interrupts are enabled + * otherwise the "iret" issued by an interrupt handler will incorrectly + * clear NMI blocking. 
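+ * This is why vmx_run() calls vmx_exit_handle_nmi() before enable_intr() + * on the VM-exit path.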
+ */ +static __inline void +vmx_exit_handle_nmi(struct vmx_vcpu *vcpu, struct vm_exit *vmexit) +{ + uint32_t intr_info; + + KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); + + if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION) + return; + + intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); + KASSERT((intr_info & VMCS_INTR_VALID) != 0, + ("VM exit interruption info invalid: %#x", intr_info)); + + if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) { + KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due " + "to NMI has invalid vector: %#x", intr_info)); + VMX_CTR0(vcpu, "Vectoring to NMI handler"); + __asm __volatile("int $2"); + } +} + +static __inline void +vmx_dr_enter_guest(struct vmxctx *vmxctx) +{ + register_t rflags; + + /* Save host control debug registers. */ + vmxctx->host_dr7 = rdr7(); + vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); + + /* + * Disable debugging in DR7 and DEBUGCTL to avoid triggering + * exceptions in the host based on the guest DRx values. The + * guest DR7 and DEBUGCTL are saved/restored in the VMCS. + */ + load_dr7(0); + wrmsr(MSR_DEBUGCTLMSR, 0); + + /* + * Disable single stepping the kernel to avoid corrupting the + * guest DR6. A debugger might still be able to corrupt the + * guest DR6 by setting a breakpoint after this point and then + * single stepping. + */ + rflags = read_rflags(); + vmxctx->host_tf = rflags & PSL_T; + write_rflags(rflags & ~PSL_T); + + /* Save host debug registers. */ + vmxctx->host_dr0 = rdr0(); + vmxctx->host_dr1 = rdr1(); + vmxctx->host_dr2 = rdr2(); + vmxctx->host_dr3 = rdr3(); + vmxctx->host_dr6 = rdr6(); + + /* Restore guest debug registers. */ + load_dr0(vmxctx->guest_dr0); + load_dr1(vmxctx->guest_dr1); + load_dr2(vmxctx->guest_dr2); + load_dr3(vmxctx->guest_dr3); + load_dr6(vmxctx->guest_dr6); +} + +static __inline void +vmx_dr_leave_guest(struct vmxctx *vmxctx) +{ + + /* Save guest debug registers. */ + vmxctx->guest_dr0 = rdr0(); + vmxctx->guest_dr1 = rdr1(); + vmxctx->guest_dr2 = rdr2(); + vmxctx->guest_dr3 = rdr3(); + vmxctx->guest_dr6 = rdr6(); + + /* + * Restore host debug registers. Restore DR7, DEBUGCTL, and + * PSL_T last. 
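+ * Restoring DR7, DEBUGCTL and PSL_T last ensures that no host breakpoint, + * single-step or branch-trace setting takes effect while DR0-DR6 still + * hold guest values.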
+ */ + load_dr0(vmxctx->host_dr0); + load_dr1(vmxctx->host_dr1); + load_dr2(vmxctx->host_dr2); + load_dr3(vmxctx->host_dr3); + load_dr6(vmxctx->host_dr6); + wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl); + load_dr7(vmxctx->host_dr7); + write_rflags(read_rflags() | vmxctx->host_tf); +} + +static __inline void +vmx_pmap_activate(struct vmx *vmx, pmap_t pmap) +{ + long eptgen; + int cpu; + + cpu = curcpu; + + CPU_SET_ATOMIC(cpu, &pmap->pm_active); + smr_enter(pmap->pm_eptsmr); + eptgen = atomic_load_long(&pmap->pm_eptgen); + if (eptgen != vmx->eptgen[cpu]) { + vmx->eptgen[cpu] = eptgen; + invept(INVEPT_TYPE_SINGLE_CONTEXT, + (struct invept_desc){ .eptp = vmx->eptp, ._res = 0 }); + } +} + +static __inline void +vmx_pmap_deactivate(struct vmx *vmx, pmap_t pmap) +{ + smr_exit(pmap->pm_eptsmr); + CPU_CLR_ATOMIC(curcpu, &pmap->pm_active); +} + +static int +vmx_run(void *vcpui, register_t rip, pmap_t pmap, struct vm_eventinfo *evinfo) +{ + int rc, handled, launched; + struct vmx *vmx; + struct vmx_vcpu *vcpu; + struct vmxctx *vmxctx; + struct vmcs *vmcs; + struct vm_exit *vmexit; + struct vlapic *vlapic; + uint32_t exit_reason; + struct region_descriptor gdtr, idtr; + uint16_t ldt_sel; + + vcpu = vcpui; + vmx = vcpu->vmx; + vmcs = vcpu->vmcs; + vmxctx = &vcpu->ctx; + vlapic = vm_lapic(vcpu->vcpu); + vmexit = vm_exitinfo(vcpu->vcpu); + launched = 0; + + KASSERT(vmxctx->pmap == pmap, + ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); + + vmx_msr_guest_enter(vcpu); + + VMPTRLD(vmcs); + + /* + * XXX + * We do this every time because we may setup the virtual machine + * from a different process than the one that actually runs it. + * + * If the life of a virtual machine was spent entirely in the context + * of a single process we could do this once in vmx_init(). + */ + vmcs_write(VMCS_HOST_CR3, rcr3()); + + vmcs_write(VMCS_GUEST_RIP, rip); + vmx_set_pcpu_defaults(vmx, vcpu, pmap); + do { + KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch " + "%#lx/%#lx", __func__, vmcs_guest_rip(), rip)); + + handled = UNHANDLED; + /* + * Interrupts are disabled from this point on until the + * guest starts executing. This is done for the following + * reasons: + * + * If an AST is asserted on this thread after the check below, + * then the IPI_AST notification will not be lost, because it + * will cause a VM exit due to external interrupt as soon as + * the guest state is loaded. + * + * A posted interrupt after 'vmx_inject_interrupts()' will + * not be "lost" because it will be held pending in the host + * APIC because interrupts are disabled. The pending interrupt + * will be recognized as soon as the guest state is loaded. + * + * The same reasoning applies to the IPI generated by + * pmap_invalidate_ept(). + */ + disable_intr(); + vmx_inject_interrupts(vcpu, vlapic, rip); + + /* + * Check for vcpu suspension after injecting events because + * vmx_inject_interrupts() can suspend the vcpu due to a + * triple fault. 
+ */ + if (vcpu_suspended(evinfo)) { + enable_intr(); + vm_exit_suspended(vcpu->vcpu, rip); + break; + } + + if (vcpu_rendezvous_pending(vcpu->vcpu, evinfo)) { + enable_intr(); + vm_exit_rendezvous(vcpu->vcpu, rip); + break; + } + + if (vcpu_reqidle(evinfo)) { + enable_intr(); + vm_exit_reqidle(vcpu->vcpu, rip); + break; + } + + if (vcpu_should_yield(vcpu->vcpu)) { + enable_intr(); + vm_exit_astpending(vcpu->vcpu, rip); + vmx_astpending_trace(vcpu, rip); + handled = HANDLED; + break; + } + + if (vcpu_debugged(vcpu->vcpu)) { + enable_intr(); + vm_exit_debug(vcpu->vcpu, rip); + break; + } + + /* + * If TPR Shadowing is enabled, the TPR Threshold + * must be updated right before entering the guest. + */ + if (tpr_shadowing && !virtual_interrupt_delivery) { + if ((vcpu->cap.proc_ctls & PROCBASED_USE_TPR_SHADOW) != 0) { + vmcs_write(VMCS_TPR_THRESHOLD, vlapic_get_cr8(vlapic)); + } + } + + /* + * VM exits restore the base address but not the + * limits of GDTR and IDTR. The VMCS only stores the + * base address, so VM exits set the limits to 0xffff. + * Save and restore the full GDTR and IDTR to restore + * the limits. + * + * The VMCS does not save the LDTR at all, and VM + * exits clear LDTR as if a NULL selector were loaded. + * The userspace hypervisor probably doesn't use a + * LDT, but save and restore it to be safe. + */ + sgdt(&gdtr); + sidt(&idtr); + ldt_sel = sldt(); + + /* + * The TSC_AUX MSR must be saved/restored while interrupts + * are disabled so that it is not possible for the guest + * TSC_AUX MSR value to be overwritten by the resume + * portion of the IPI_SUSPEND codepath. This is why the + * transition of this MSR is handled separately from those + * handled by vmx_msr_guest_{enter,exit}(), which are ok to + * be transitioned with preemption disabled but interrupts + * enabled. + * + * These vmx_msr_guest_{enter,exit}_tsc_aux() calls can be + * anywhere in this loop so long as they happen with + * interrupts disabled. This location is chosen for + * simplicity. + */ + vmx_msr_guest_enter_tsc_aux(vmx, vcpu); + + vmx_dr_enter_guest(vmxctx); + + /* + * Mark the EPT as active on this host CPU and invalidate + * EPTP-tagged TLB entries if required. 
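+ * vmx_pmap_activate() compares the pmap's 'pm_eptgen' generation count + * (bumped by pmap_invalidate_ept()) against the per-cpu copy in + * 'vmx->eptgen' and issues a single-context INVEPT when they differ.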
+ */ + vmx_pmap_activate(vmx, pmap); + + vmx_run_trace(vcpu); + rc = vmx_enter_guest(vmxctx, vmx, launched); + + vmx_pmap_deactivate(vmx, pmap); + vmx_dr_leave_guest(vmxctx); + vmx_msr_guest_exit_tsc_aux(vmx, vcpu); + + bare_lgdt(&gdtr); + lidt(&idtr); + lldt(ldt_sel); + + /* Collect some information for VM exit processing */ + vmexit->rip = rip = vmcs_guest_rip(); + vmexit->inst_length = vmexit_instruction_length(); + vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); + vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); + + /* Update 'nextrip' */ + vcpu->state.nextrip = rip; + + if (rc == VMX_GUEST_VMEXIT) { + vmx_exit_handle_nmi(vcpu, vmexit); + enable_intr(); + handled = vmx_exit_process(vmx, vcpu, vmexit); + } else { + enable_intr(); + vmx_exit_inst_error(vmxctx, rc, vmexit); + } + launched = 1; + vmx_exit_trace(vcpu, rip, exit_reason, handled); + rip = vmexit->rip; + } while (handled); + + /* + * If a VM exit has been handled then the exitcode must be BOGUS + * If a VM exit is not handled then the exitcode must not be BOGUS + */ + if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || + (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { + panic("Mismatch between handled (%d) and exitcode (%d)", + handled, vmexit->exitcode); + } + + VMX_CTR1(vcpu, "returning from vmx_run: exitcode %d", + vmexit->exitcode); + + VMCLEAR(vmcs); + vmx_msr_guest_exit(vcpu); + + return (0); +} + +static void +vmx_vcpu_cleanup(void *vcpui) +{ + struct vmx_vcpu *vcpu = vcpui; + + vpid_free(vcpu->state.vpid); + free(vcpu->pir_desc, M_VMX); + free(vcpu->apic_page, M_VMX); + free(vcpu->vmcs, M_VMX); + free(vcpu, M_VMX); +} + +static void +vmx_cleanup(void *vmi) +{ + struct vmx *vmx = vmi; + + if (virtual_interrupt_delivery) + vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); + + free(vmx->msr_bitmap, M_VMX); + free(vmx, M_VMX); + + return; +} + +static register_t * +vmxctx_regptr(struct vmxctx *vmxctx, int reg) +{ + + switch (reg) { + case VM_REG_GUEST_RAX: + return (&vmxctx->guest_rax); + case VM_REG_GUEST_RBX: + return (&vmxctx->guest_rbx); + case VM_REG_GUEST_RCX: + return (&vmxctx->guest_rcx); + case VM_REG_GUEST_RDX: + return (&vmxctx->guest_rdx); + case VM_REG_GUEST_RSI: + return (&vmxctx->guest_rsi); + case VM_REG_GUEST_RDI: + return (&vmxctx->guest_rdi); + case VM_REG_GUEST_RBP: + return (&vmxctx->guest_rbp); + case VM_REG_GUEST_R8: + return (&vmxctx->guest_r8); + case VM_REG_GUEST_R9: + return (&vmxctx->guest_r9); + case VM_REG_GUEST_R10: + return (&vmxctx->guest_r10); + case VM_REG_GUEST_R11: + return (&vmxctx->guest_r11); + case VM_REG_GUEST_R12: + return (&vmxctx->guest_r12); + case VM_REG_GUEST_R13: + return (&vmxctx->guest_r13); + case VM_REG_GUEST_R14: + return (&vmxctx->guest_r14); + case VM_REG_GUEST_R15: + return (&vmxctx->guest_r15); + case VM_REG_GUEST_CR2: + return (&vmxctx->guest_cr2); + case VM_REG_GUEST_DR0: + return (&vmxctx->guest_dr0); + case VM_REG_GUEST_DR1: + return (&vmxctx->guest_dr1); + case VM_REG_GUEST_DR2: + return (&vmxctx->guest_dr2); + case VM_REG_GUEST_DR3: + return (&vmxctx->guest_dr3); + case VM_REG_GUEST_DR6: + return (&vmxctx->guest_dr6); + default: + break; + } + return (NULL); +} + +static int +vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) +{ + register_t *regp; + + if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { + *retval = *regp; + return (0); + } else + return (EINVAL); +} + +static int +vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) +{ + register_t *regp; + + if ((regp = vmxctx_regptr(vmxctx, reg)) 
!= NULL) { + *regp = val; + return (0); + } else + return (EINVAL); +} + +static int +vmx_get_intr_shadow(struct vmx_vcpu *vcpu, int running, uint64_t *retval) +{ + uint64_t gi; + int error; + + error = vmcs_getreg(vcpu->vmcs, running, + VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi); + *retval = (gi & HWINTR_BLOCKING) ? 1 : 0; + return (error); +} + +static int +vmx_modify_intr_shadow(struct vmx_vcpu *vcpu, int running, uint64_t val) +{ + struct vmcs *vmcs; + uint64_t gi; + int error, ident; + + /* + * Forcing the vcpu into an interrupt shadow is not supported. + */ + if (val) { + error = EINVAL; + goto done; + } + + vmcs = vcpu->vmcs; + ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY); + error = vmcs_getreg(vmcs, running, ident, &gi); + if (error == 0) { + gi &= ~HWINTR_BLOCKING; + error = vmcs_setreg(vmcs, running, ident, gi); + } +done: + VMX_CTR2(vcpu, "Setting intr_shadow to %#lx %s", val, + error ? "failed" : "succeeded"); + return (error); +} + +static int +vmx_shadow_reg(int reg) +{ + int shreg; + + shreg = -1; + + switch (reg) { + case VM_REG_GUEST_CR0: + shreg = VMCS_CR0_SHADOW; + break; + case VM_REG_GUEST_CR4: + shreg = VMCS_CR4_SHADOW; + break; + default: + break; + } + + return (shreg); +} + +static int +vmx_getreg(void *vcpui, int reg, uint64_t *retval) +{ + int running, hostcpu; + struct vmx_vcpu *vcpu = vcpui; + struct vmx *vmx = vcpu->vmx; + + running = vcpu_is_running(vcpu->vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), + vcpu->vcpuid); + + switch (reg) { + case VM_REG_GUEST_INTR_SHADOW: + return (vmx_get_intr_shadow(vcpu, running, retval)); + case VM_REG_GUEST_KGS_BASE: + *retval = vcpu->guest_msrs[IDX_MSR_KGSBASE]; + return (0); + case VM_REG_GUEST_TPR: + *retval = vlapic_get_cr8(vm_lapic(vcpu->vcpu)); + return (0); + } + + if (vmxctx_getreg(&vcpu->ctx, reg, retval) == 0) + return (0); + + return (vmcs_getreg(vcpu->vmcs, running, reg, retval)); +} + +static int +vmx_setreg(void *vcpui, int reg, uint64_t val) +{ + int error, hostcpu, running, shadow; + uint64_t ctls; + pmap_t pmap; + struct vmx_vcpu *vcpu = vcpui; + struct vmx *vmx = vcpu->vmx; + + running = vcpu_is_running(vcpu->vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), + vcpu->vcpuid); + + if (reg == VM_REG_GUEST_INTR_SHADOW) + return (vmx_modify_intr_shadow(vcpu, running, val)); + + if (vmxctx_setreg(&vcpu->ctx, reg, val) == 0) + return (0); + + /* Do not permit user write access to VMCS fields by offset. */ + if (reg < 0) + return (EINVAL); + + error = vmcs_setreg(vcpu->vmcs, running, reg, val); + + if (error == 0) { + /* + * If the "load EFER" VM-entry control is 1 then the + * value of EFER.LMA must be identical to "IA-32e mode guest" + * bit in the VM-entry control. + */ + if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && + (reg == VM_REG_GUEST_EFER)) { + vmcs_getreg(vcpu->vmcs, running, + VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); + if (val & EFER_LMA) + ctls |= VM_ENTRY_GUEST_LMA; + else + ctls &= ~VM_ENTRY_GUEST_LMA; + vmcs_setreg(vcpu->vmcs, running, + VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); + } + + shadow = vmx_shadow_reg(reg); + if (shadow > 0) { + /* + * Store the unmodified value in the shadow + */ + error = vmcs_setreg(vcpu->vmcs, running, + VMCS_IDENT(shadow), val); + } + + if (reg == VM_REG_GUEST_CR3) { + /* + * Invalidate the guest vcpu's TLB mappings to emulate + * the behavior of updating %cr3. 
+ * + * XXX the processor retains global mappings when %cr3 + * is updated but vmx_invvpid() does not. + */ + pmap = vcpu->ctx.pmap; + vmx_invvpid(vmx, vcpu, pmap, running); + } + } + + return (error); +} + +static int +vmx_getdesc(void *vcpui, int reg, struct seg_desc *desc) +{ + int hostcpu, running; + struct vmx_vcpu *vcpu = vcpui; + struct vmx *vmx = vcpu->vmx; + + running = vcpu_is_running(vcpu->vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), + vcpu->vcpuid); + + return (vmcs_getdesc(vcpu->vmcs, running, reg, desc)); +} + +static int +vmx_setdesc(void *vcpui, int reg, struct seg_desc *desc) +{ + int hostcpu, running; + struct vmx_vcpu *vcpu = vcpui; + struct vmx *vmx = vcpu->vmx; + + running = vcpu_is_running(vcpu->vcpu, &hostcpu); + if (running && hostcpu != curcpu) + panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), + vcpu->vcpuid); + + return (vmcs_setdesc(vcpu->vmcs, running, reg, desc)); +} + +static int +vmx_getcap(void *vcpui, int type, int *retval) +{ + struct vmx_vcpu *vcpu = vcpui; + int vcap; + int ret; + + ret = ENOENT; + + vcap = vcpu->cap.set; + + switch (type) { + case VM_CAP_HALT_EXIT: + if (cap_halt_exit) + ret = 0; + break; + case VM_CAP_PAUSE_EXIT: + if (cap_pause_exit) + ret = 0; + break; + case VM_CAP_MTRAP_EXIT: + if (cap_monitor_trap) + ret = 0; + break; + case VM_CAP_RDPID: + if (cap_rdpid) + ret = 0; + break; + case VM_CAP_RDTSCP: + if (cap_rdtscp) + ret = 0; + break; + case VM_CAP_UNRESTRICTED_GUEST: + if (cap_unrestricted_guest) + ret = 0; + break; + case VM_CAP_ENABLE_INVPCID: + if (cap_invpcid) + ret = 0; + break; + case VM_CAP_BPT_EXIT: + case VM_CAP_IPI_EXIT: + ret = 0; + break; + default: + break; + } + + if (ret == 0) + *retval = (vcap & (1 << type)) ? 1 : 0; + + return (ret); +} + +static int +vmx_setcap(void *vcpui, int type, int val) +{ + struct vmx_vcpu *vcpu = vcpui; + struct vmcs *vmcs = vcpu->vmcs; + struct vlapic *vlapic; + uint32_t baseval; + uint32_t *pptr; + int error; + int flag; + int reg; + int retval; + + retval = ENOENT; + pptr = NULL; + + switch (type) { + case VM_CAP_HALT_EXIT: + if (cap_halt_exit) { + retval = 0; + pptr = &vcpu->cap.proc_ctls; + baseval = *pptr; + flag = PROCBASED_HLT_EXITING; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_MTRAP_EXIT: + if (cap_monitor_trap) { + retval = 0; + pptr = &vcpu->cap.proc_ctls; + baseval = *pptr; + flag = PROCBASED_MTF; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_PAUSE_EXIT: + if (cap_pause_exit) { + retval = 0; + pptr = &vcpu->cap.proc_ctls; + baseval = *pptr; + flag = PROCBASED_PAUSE_EXITING; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_RDPID: + case VM_CAP_RDTSCP: + if (cap_rdpid || cap_rdtscp) + /* + * Choose not to support enabling/disabling + * RDPID/RDTSCP via libvmmapi since, as per the + * discussion in vmx_modinit(), RDPID/RDTSCP are + * either always enabled or always disabled. + */ + error = EOPNOTSUPP; + break; + case VM_CAP_UNRESTRICTED_GUEST: + if (cap_unrestricted_guest) { + retval = 0; + pptr = &vcpu->cap.proc_ctls2; + baseval = *pptr; + flag = PROCBASED2_UNRESTRICTED_GUEST; + reg = VMCS_SEC_PROC_BASED_CTLS; + } + break; + case VM_CAP_ENABLE_INVPCID: + if (cap_invpcid) { + retval = 0; + pptr = &vcpu->cap.proc_ctls2; + baseval = *pptr; + flag = PROCBASED2_ENABLE_INVPCID; + reg = VMCS_SEC_PROC_BASED_CTLS; + } + break; + case VM_CAP_BPT_EXIT: + retval = 0; + + /* Don't change the bitmap if we are tracing all exceptions. 
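+ * An exception bitmap of 0xffffffff already forces a VM-exit for every + * vector, so toggling the #BP bit would be redundant.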
*/ + if (vcpu->cap.exc_bitmap != 0xffffffff) { + pptr = &vcpu->cap.exc_bitmap; + baseval = *pptr; + flag = (1 << IDT_BP); + reg = VMCS_EXCEPTION_BITMAP; + } + break; + case VM_CAP_IPI_EXIT: + retval = 0; + + vlapic = vm_lapic(vcpu->vcpu); + vlapic->ipi_exit = val; + break; + case VM_CAP_MASK_HWINTR: + retval = 0; + break; + default: + break; + } + + if (retval) + return (retval); + + if (pptr != NULL) { + if (val) { + baseval |= flag; + } else { + baseval &= ~flag; + } + VMPTRLD(vmcs); + error = vmwrite(reg, baseval); + VMCLEAR(vmcs); + + if (error) + return (error); + + /* + * Update optional stored flags, and record + * setting + */ + *pptr = baseval; + } + + if (val) { + vcpu->cap.set |= (1 << type); + } else { + vcpu->cap.set &= ~(1 << type); + } + + return (0); +} + +static struct vmspace * +vmx_vmspace_alloc(vm_offset_t min, vm_offset_t max) +{ + return (ept_vmspace_alloc(min, max)); +} + +static void +vmx_vmspace_free(struct vmspace *vmspace) +{ + ept_vmspace_free(vmspace); +} + +struct vlapic_vtx { + struct vlapic vlapic; + struct pir_desc *pir_desc; + struct vmx_vcpu *vcpu; + u_int pending_prio; +}; + +#define VPR_PRIO_BIT(vpr) (1 << ((vpr) >> 4)) + +#define VMX_CTR_PIR(vlapic, pir_desc, notify, vector, level, msg) \ +do { \ + VLAPIC_CTR2(vlapic, msg " assert %s-triggered vector %d", \ + level ? "level" : "edge", vector); \ + VLAPIC_CTR1(vlapic, msg " pir0 0x%016lx", pir_desc->pir[0]); \ + VLAPIC_CTR1(vlapic, msg " pir1 0x%016lx", pir_desc->pir[1]); \ + VLAPIC_CTR1(vlapic, msg " pir2 0x%016lx", pir_desc->pir[2]); \ + VLAPIC_CTR1(vlapic, msg " pir3 0x%016lx", pir_desc->pir[3]); \ + VLAPIC_CTR1(vlapic, msg " notify: %s", notify ? "yes" : "no"); \ +} while (0) + +/* + * vlapic->ops handlers that utilize the APICv hardware assist described in + * Chapter 29 of the Intel SDM. + */ +static int +vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) +{ + struct vlapic_vtx *vlapic_vtx; + struct pir_desc *pir_desc; + uint64_t mask; + int idx, notify = 0; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + pir_desc = vlapic_vtx->pir_desc; + + /* + * Keep track of interrupt requests in the PIR descriptor. This is + * because the virtual APIC page pointed to by the VMCS cannot be + * modified if the vcpu is running. + */ + idx = vector / 64; + mask = 1UL << (vector % 64); + atomic_set_long(&pir_desc->pir[idx], mask); + + /* + * A notification is required whenever the 'pending' bit makes a + * transition from 0->1. + * + * Even if the 'pending' bit is already asserted, notification about + * the incoming interrupt may still be necessary. For example, if a + * vCPU is HLTed with a high PPR, a low priority interrupt would cause + * the 0->1 'pending' transition with a notification, but the vCPU + * would ignore the interrupt for the time being. The same vCPU would + * need to then be notified if a high-priority interrupt arrived which + * satisfied the PPR. + * + * The priorities of interrupts injected while 'pending' is asserted + * are tracked in a custom bitfield 'pending_prio'. Should the + * to-be-injected interrupt exceed the priorities already present, the + * notification is sent. The priorities recorded in 'pending_prio' are + * cleared whenever the 'pending' bit makes another 0->1 transition. 
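+ * For example: after a 0->1 'pending' transition a class-2 vector is + * recorded and triggers a notification, a subsequent class-1 vector does + * not, and a later class-5 vector exceeds the recorded classes and + * notifies again.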
+ */ + if (atomic_cmpset_long(&pir_desc->pending, 0, 1) != 0) { + notify = 1; + vlapic_vtx->pending_prio = 0; + } else { + const u_int old_prio = vlapic_vtx->pending_prio; + const u_int prio_bit = VPR_PRIO_BIT(vector & APIC_TPR_INT); + + if ((old_prio & prio_bit) == 0 && prio_bit > old_prio) { + atomic_set_int(&vlapic_vtx->pending_prio, prio_bit); + notify = 1; + } + } + + VMX_CTR_PIR(vlapic, pir_desc, notify, vector, level, + "vmx_set_intr_ready"); + return (notify); +} + +static int +vmx_pending_intr(struct vlapic *vlapic, int *vecptr) +{ + struct vlapic_vtx *vlapic_vtx; + struct pir_desc *pir_desc; + struct LAPIC *lapic; + uint64_t pending, pirval; + uint8_t ppr, vpr, rvi; + struct vm_exit *vmexit; + int i; + + /* + * This function is only expected to be called from the 'HLT' exit + * handler which does not care about the vector that is pending. + */ + KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL")); + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + pir_desc = vlapic_vtx->pir_desc; + lapic = vlapic->apic_page; + + /* + * While a virtual interrupt may have already been + * processed, its actual delivery may still be pending on the + * interruptibility of the guest. Recognize a pending + * interrupt by reevaluating virtual interrupts + * following Section 30.2.1 in the Intel SDM Volume 3. + */ + vmexit = vm_exitinfo(vlapic->vcpu); + KASSERT(vmexit->exitcode == VM_EXITCODE_HLT, + ("vmx_pending_intr: exitcode not 'HLT'")); + rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT; + ppr = lapic->ppr & APIC_TPR_INT; + if (rvi > ppr) + return (1); + + pending = atomic_load_acq_long(&pir_desc->pending); + if (!pending) + return (0); + + /* + * If there is an interrupt pending then it will be recognized only + * if its priority is greater than the processor priority. + * + * Special case: if the processor priority is zero then any pending + * interrupt will be recognized. + */ + if (ppr == 0) + return (1); + + VLAPIC_CTR1(vlapic, "HLT with non-zero PPR %d", lapic->ppr); + + vpr = 0; + for (i = 3; i >= 0; i--) { + pirval = pir_desc->pir[i]; + if (pirval != 0) { + vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT; + break; + } + } + + /* + * If the highest-priority pending interrupt falls short of the + * processor priority of this vCPU, ensure that 'pending_prio' does not + * have any stale bits which would preclude a higher-priority interrupt + * from incurring a notification later. 
+ */ + if (vpr <= ppr) { + const u_int prio_bit = VPR_PRIO_BIT(vpr); + const u_int old = vlapic_vtx->pending_prio; + + if (old > prio_bit && (old & prio_bit) == 0) { + vlapic_vtx->pending_prio = prio_bit; + } + return (0); + } + return (1); +} + +static void +vmx_intr_accepted(struct vlapic *vlapic, int vector) +{ + + panic("vmx_intr_accepted: not expected to be called"); +} + +static void +vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) +{ + struct vlapic_vtx *vlapic_vtx; + struct vmcs *vmcs; + uint64_t mask, val; + + KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); + KASSERT(!vcpu_is_running(vlapic->vcpu, NULL), + ("vmx_set_tmr: vcpu cannot be running")); + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + vmcs = vlapic_vtx->vcpu->vmcs; + mask = 1UL << (vector % 64); + + VMPTRLD(vmcs); + val = vmcs_read(VMCS_EOI_EXIT(vector)); + if (level) + val |= mask; + else + val &= ~mask; + vmcs_write(VMCS_EOI_EXIT(vector), val); + VMCLEAR(vmcs); +} + +static void +vmx_enable_x2apic_mode_ts(struct vlapic *vlapic) +{ + struct vlapic_vtx *vlapic_vtx; + struct vmx_vcpu *vcpu; + struct vmcs *vmcs; + uint32_t proc_ctls; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + vcpu = vlapic_vtx->vcpu; + vmcs = vcpu->vmcs; + + proc_ctls = vcpu->cap.proc_ctls; + proc_ctls &= ~PROCBASED_USE_TPR_SHADOW; + proc_ctls |= PROCBASED_CR8_LOAD_EXITING; + proc_ctls |= PROCBASED_CR8_STORE_EXITING; + vcpu->cap.proc_ctls = proc_ctls; + + VMPTRLD(vmcs); + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, proc_ctls); + VMCLEAR(vmcs); +} + +static void +vmx_enable_x2apic_mode_vid(struct vlapic *vlapic) +{ + struct vlapic_vtx *vlapic_vtx; + struct vmx *vmx; + struct vmx_vcpu *vcpu; + struct vmcs *vmcs; + uint32_t proc_ctls2; + int error __diagused; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + vcpu = vlapic_vtx->vcpu; + vmx = vcpu->vmx; + vmcs = vcpu->vmcs; + + proc_ctls2 = vcpu->cap.proc_ctls2; + KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0, + ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2)); + + proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES; + proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE; + vcpu->cap.proc_ctls2 = proc_ctls2; + + VMPTRLD(vmcs); + vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2); + VMCLEAR(vmcs); + + if (vlapic->vcpuid == 0) { + /* + * The nested page table mappings are shared by all vcpus + * so unmap the APIC access page just once. + */ + error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); + KASSERT(error == 0, ("%s: vm_unmap_mmio error %d", + __func__, error)); + + /* + * The MSR bitmap is shared by all vcpus so modify it only + * once in the context of vcpu 0. + */ + error = vmx_allow_x2apic_msrs(vmx); + KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d", + __func__, error)); + } +} + +static void +vmx_post_intr(struct vlapic *vlapic, int hostcpu) +{ + + ipi_cpu(hostcpu, pirvec); +} + +/* + * Transfer the pending interrupts in the PIR descriptor to the IRR + * in the virtual APIC page. 
+ */ +static void +vmx_inject_pir(struct vlapic *vlapic) +{ + struct vlapic_vtx *vlapic_vtx; + struct pir_desc *pir_desc; + struct LAPIC *lapic; + uint64_t val, pirval; + int rvi, pirbase = -1; + uint16_t intr_status_old, intr_status_new; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + pir_desc = vlapic_vtx->pir_desc; + if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) { + VLAPIC_CTR0(vlapic, "vmx_inject_pir: " + "no posted interrupt pending"); + return; + } + + pirval = 0; + pirbase = -1; + lapic = vlapic->apic_page; + + val = atomic_readandclear_long(&pir_desc->pir[0]); + if (val != 0) { + lapic->irr0 |= val; + lapic->irr1 |= val >> 32; + pirbase = 0; + pirval = val; + } + + val = atomic_readandclear_long(&pir_desc->pir[1]); + if (val != 0) { + lapic->irr2 |= val; + lapic->irr3 |= val >> 32; + pirbase = 64; + pirval = val; + } + + val = atomic_readandclear_long(&pir_desc->pir[2]); + if (val != 0) { + lapic->irr4 |= val; + lapic->irr5 |= val >> 32; + pirbase = 128; + pirval = val; + } + + val = atomic_readandclear_long(&pir_desc->pir[3]); + if (val != 0) { + lapic->irr6 |= val; + lapic->irr7 |= val >> 32; + pirbase = 192; + pirval = val; + } + + VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir"); + + /* + * Update RVI so the processor can evaluate pending virtual + * interrupts on VM-entry. + * + * It is possible for pirval to be 0 here, even though the + * pending bit has been set. The scenario is: + * CPU-Y is sending a posted interrupt to CPU-X, which + * is running a guest and processing posted interrupts in h/w. + * CPU-X will eventually exit and the state seen in s/w is + * the pending bit set, but no PIR bits set. + * + * CPU-X CPU-Y + * (vm running) (host running) + * rx posted interrupt + * CLEAR pending bit + * SET PIR bit + * READ/CLEAR PIR bits + * SET pending bit + * (vm exit) + * pending bit set, PIR 0 + */ + if (pirval != 0) { + rvi = pirbase + flsl(pirval) - 1; + intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS); + intr_status_new = (intr_status_old & 0xFF00) | rvi; + if (intr_status_new > intr_status_old) { + vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new); + VLAPIC_CTR2(vlapic, "vmx_inject_pir: " + "guest_intr_status changed from 0x%04x to 0x%04x", + intr_status_old, intr_status_new); + } + } +} + +static struct vlapic * +vmx_vlapic_init(void *vcpui) +{ + struct vmx *vmx; + struct vmx_vcpu *vcpu; + struct vlapic *vlapic; + struct vlapic_vtx *vlapic_vtx; + + vcpu = vcpui; + vmx = vcpu->vmx; + + vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO); + vlapic->vm = vmx->vm; + vlapic->vcpu = vcpu->vcpu; + vlapic->vcpuid = vcpu->vcpuid; + vlapic->apic_page = (struct LAPIC *)vcpu->apic_page; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + vlapic_vtx->pir_desc = vcpu->pir_desc; + vlapic_vtx->vcpu = vcpu; + + if (tpr_shadowing) { + vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_ts; + } + + if (virtual_interrupt_delivery) { + vlapic->ops.set_intr_ready = vmx_set_intr_ready; + vlapic->ops.pending_intr = vmx_pending_intr; + vlapic->ops.intr_accepted = vmx_intr_accepted; + vlapic->ops.set_tmr = vmx_set_tmr; + vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_vid; + } + + if (posted_interrupts) + vlapic->ops.post_intr = vmx_post_intr; + + vlapic_init(vlapic); + + return (vlapic); +} + +static void +vmx_vlapic_cleanup(struct vlapic *vlapic) +{ + + vlapic_cleanup(vlapic); + free(vlapic, M_VLAPIC); +} + +#ifdef BHYVE_SNAPSHOT +static int +vmx_vcpu_snapshot(void *vcpui, struct vm_snapshot_meta *meta) +{ + struct vmcs *vmcs; + struct vmx *vmx; + struct 
vmx_vcpu *vcpu; + struct vmxctx *vmxctx; + int err, run, hostcpu; + + err = 0; + vcpu = vcpui; + vmx = vcpu->vmx; + vmcs = vcpu->vmcs; + + run = vcpu_is_running(vcpu->vcpu, &hostcpu); + if (run && hostcpu != curcpu) { + printf("%s: %s%d is running", __func__, vm_name(vmx->vm), + vcpu->vcpuid); + return (EINVAL); + } + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CR0, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CR3, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CR4, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_DR7, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_RSP, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_RIP, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_RFLAGS, meta); + + /* Guest segments */ + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_ES, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_ES, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CS, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_CS, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_SS, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_SS, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_DS, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_DS, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_FS, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_FS, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_GS, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_GS, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_TR, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_TR, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_LDTR, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_LDTR, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_EFER, meta); + + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_IDTR, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_GDTR, meta); + + /* Guest page tables */ + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE0, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE1, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE2, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE3, meta); + + /* Other guest state */ + err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_IA32_SYSENTER_CS, meta); + err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_IA32_SYSENTER_ESP, meta); + err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_IA32_SYSENTER_EIP, meta); + err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_INTERRUPTIBILITY, meta); + err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_ACTIVITY, meta); + err += vmcs_snapshot_any(vmcs, run, VMCS_ENTRY_CTLS, meta); + err += vmcs_snapshot_any(vmcs, run, VMCS_EXIT_CTLS, meta); + if (err != 0) + goto done; + + SNAPSHOT_BUF_OR_LEAVE(vcpu->guest_msrs, + sizeof(vcpu->guest_msrs), meta, err, done); + + SNAPSHOT_BUF_OR_LEAVE(vcpu->pir_desc, + sizeof(*vcpu->pir_desc), meta, err, done); + + SNAPSHOT_BUF_OR_LEAVE(&vcpu->mtrr, + sizeof(vcpu->mtrr), meta, err, done); + + vmxctx = &vcpu->ctx; + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rdi, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rsi, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rdx, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rcx, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r8, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r9, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rax, meta, err, done); + 
SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rbx, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rbp, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r10, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r11, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r12, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r13, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r14, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r15, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_cr2, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr0, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr1, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr2, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr3, meta, err, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr6, meta, err, done); + +done: + return (err); +} + +static int +vmx_restore_tsc(void *vcpui, uint64_t offset) +{ + struct vmx_vcpu *vcpu = vcpui; + struct vmcs *vmcs; + struct vmx *vmx; + int error, running, hostcpu; + + vmx = vcpu->vmx; + vmcs = vcpu->vmcs; + + running = vcpu_is_running(vcpu->vcpu, &hostcpu); + if (running && hostcpu != curcpu) { + printf("%s: %s%d is running", __func__, vm_name(vmx->vm), + vcpu->vcpuid); + return (EINVAL); + } + + if (!running) + VMPTRLD(vmcs); + + error = vmx_set_tsc_offset(vcpu, offset); + + if (!running) + VMCLEAR(vmcs); + return (error); +} +#endif + +const struct vmm_ops vmm_ops_intel = { + .modinit = vmx_modinit, + .modcleanup = vmx_modcleanup, + .modsuspend = vmx_modsuspend, + .modresume = vmx_modresume, + .init = vmx_init, + .run = vmx_run, + .cleanup = vmx_cleanup, + .vcpu_init = vmx_vcpu_init, + .vcpu_cleanup = vmx_vcpu_cleanup, + .getreg = vmx_getreg, + .setreg = vmx_setreg, + .getdesc = vmx_getdesc, + .setdesc = vmx_setdesc, + .getcap = vmx_getcap, + .setcap = vmx_setcap, + .vmspace_alloc = vmx_vmspace_alloc, + .vmspace_free = vmx_vmspace_free, + .vlapic_init = vmx_vlapic_init, + .vlapic_cleanup = vmx_vlapic_cleanup, +#ifdef BHYVE_SNAPSHOT + .vcpu_snapshot = vmx_vcpu_snapshot, + .restore_tsc = vmx_restore_tsc, +#endif +}; diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h new file mode 100644 index 000000000000..af4437d1eda4 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx.h @@ -0,0 +1,181 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMX_H_ +#define _VMX_H_ + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include "vmcs.h" +#include "x86.h" + +struct pmap; +struct vmx; + +struct vmxctx { + register_t guest_rdi; /* Guest state */ + register_t guest_rsi; + register_t guest_rdx; + register_t guest_rcx; + register_t guest_r8; + register_t guest_r9; + register_t guest_rax; + register_t guest_rbx; + register_t guest_rbp; + register_t guest_r10; + register_t guest_r11; + register_t guest_r12; + register_t guest_r13; + register_t guest_r14; + register_t guest_r15; + register_t guest_cr2; + register_t guest_dr0; + register_t guest_dr1; + register_t guest_dr2; + register_t guest_dr3; + register_t guest_dr6; + + register_t host_r15; /* Host state */ + register_t host_r14; + register_t host_r13; + register_t host_r12; + register_t host_rbp; + register_t host_rsp; + register_t host_rbx; + register_t host_dr0; + register_t host_dr1; + register_t host_dr2; + register_t host_dr3; + register_t host_dr6; + register_t host_dr7; + uint64_t host_debugctl; + int host_tf; + + int inst_fail_status; + + /* + * The pmap needs to be deactivated in vmx_enter_guest() + * so keep a copy of the 'pmap' in each vmxctx. + */ + struct pmap *pmap; +}; + +struct vmxcap { + int set; + uint32_t proc_ctls; + uint32_t proc_ctls2; + uint32_t exc_bitmap; +}; + +struct vmxstate { + uint64_t nextrip; /* next instruction to be executed by guest */ + int lastcpu; /* host cpu that this 'vcpu' last ran on */ + uint16_t vpid; +}; + +struct apic_page { + uint32_t reg[PAGE_SIZE / 4]; +}; +CTASSERT(sizeof(struct apic_page) == PAGE_SIZE); + +/* Posted Interrupt Descriptor (described in section 29.6 of the Intel SDM) */ +struct pir_desc { + uint64_t pir[4]; + uint64_t pending; + uint64_t unused[3]; +} __aligned(64); +CTASSERT(sizeof(struct pir_desc) == 64); + +/* Index into the 'guest_msrs[]' array */ +enum { + IDX_MSR_LSTAR, + IDX_MSR_CSTAR, + IDX_MSR_STAR, + IDX_MSR_SF_MASK, + IDX_MSR_KGSBASE, + IDX_MSR_PAT, + IDX_MSR_TSC_AUX, + GUEST_MSR_NUM /* must be the last enumeration */ +}; + +struct vmx_vcpu { + struct vmx *vmx; + struct vcpu *vcpu; + struct vmcs *vmcs; + struct apic_page *apic_page; + struct pir_desc *pir_desc; + uint64_t guest_msrs[GUEST_MSR_NUM]; + struct vmxctx ctx; + struct vmxcap cap; + struct vmxstate state; + struct vm_mtrr mtrr; + int vcpuid; +}; + +/* virtual machine softc */ +struct vmx { + struct vm *vm; + char *msr_bitmap; + uint64_t eptp; + long eptgen[MAXCPU]; /* cached pmap->pm_eptgen */ + pmap_t pmap; +}; + +extern bool vmx_have_msr_tsc_aux; + +#define VMX_CTR0(vcpu, format) \ + VCPU_CTR0((vcpu)->vmx->vm, (vcpu)->vcpuid, format) + +#define VMX_CTR1(vcpu, format, p1) \ + VCPU_CTR1((vcpu)->vmx->vm, (vcpu)->vcpuid, format, p1) + +#define VMX_CTR2(vcpu, format, p1, p2) \ + VCPU_CTR2((vcpu)->vmx->vm, (vcpu)->vcpuid, format, p1, p2) + +#define VMX_CTR3(vcpu, format, p1, p2, p3) \ + VCPU_CTR3((vcpu)->vmx->vm, (vcpu)->vcpuid, format, p1, p2, p3) + +#define VMX_CTR4(vcpu, format, p1, p2, p3, p4) \ + 
VCPU_CTR4((vcpu)->vmx->vm, (vcpu)->vcpuid, format, p1, p2, p3, p4) + +#define VMX_GUEST_VMEXIT 0 +#define VMX_VMRESUME_ERROR 1 +#define VMX_VMLAUNCH_ERROR 2 +int vmx_enter_guest(struct vmxctx *ctx, struct vmx *vmx, int launched); +void vmx_call_isr(uintptr_t entry); + +u_long vmx_fix_cr0(u_long cr0); +u_long vmx_fix_cr4(u_long cr4); + +int vmx_set_tsc_offset(struct vmx_vcpu *vcpu, uint64_t offset); + +extern char vmx_exit_guest[]; +extern char vmx_exit_guest_flush_rsb[]; + +#endif diff --git a/sys/amd64/vmm/intel/vmx_controls.h b/sys/amd64/vmm/intel/vmx_controls.h new file mode 100644 index 000000000000..2e4e7cc8a028 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_controls.h @@ -0,0 +1,96 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _VMX_CONTROLS_H_ +#define _VMX_CONTROLS_H_ + +/* Pin-Based VM-Execution Controls */ +#define PINBASED_EXTINT_EXITING (1 << 0) +#define PINBASED_NMI_EXITING (1 << 3) +#define PINBASED_VIRTUAL_NMI (1 << 5) +#define PINBASED_PREMPTION_TIMER (1 << 6) +#define PINBASED_POSTED_INTERRUPT (1 << 7) + +/* Primary Processor-Based VM-Execution Controls */ +#define PROCBASED_INT_WINDOW_EXITING (1 << 2) +#define PROCBASED_TSC_OFFSET (1 << 3) +#define PROCBASED_HLT_EXITING (1 << 7) +#define PROCBASED_INVLPG_EXITING (1 << 9) +#define PROCBASED_MWAIT_EXITING (1 << 10) +#define PROCBASED_RDPMC_EXITING (1 << 11) +#define PROCBASED_RDTSC_EXITING (1 << 12) +#define PROCBASED_CR3_LOAD_EXITING (1 << 15) +#define PROCBASED_CR3_STORE_EXITING (1 << 16) +#define PROCBASED_CR8_LOAD_EXITING (1 << 19) +#define PROCBASED_CR8_STORE_EXITING (1 << 20) +#define PROCBASED_USE_TPR_SHADOW (1 << 21) +#define PROCBASED_NMI_WINDOW_EXITING (1 << 22) +#define PROCBASED_MOV_DR_EXITING (1 << 23) +#define PROCBASED_IO_EXITING (1 << 24) +#define PROCBASED_IO_BITMAPS (1 << 25) +#define PROCBASED_MTF (1 << 27) +#define PROCBASED_MSR_BITMAPS (1 << 28) +#define PROCBASED_MONITOR_EXITING (1 << 29) +#define PROCBASED_PAUSE_EXITING (1 << 30) +#define PROCBASED_SECONDARY_CONTROLS (1U << 31) + +/* Secondary Processor-Based VM-Execution Controls */ +#define PROCBASED2_VIRTUALIZE_APIC_ACCESSES (1 << 0) +#define PROCBASED2_ENABLE_EPT (1 << 1) +#define PROCBASED2_DESC_TABLE_EXITING (1 << 2) +#define PROCBASED2_ENABLE_RDTSCP (1 << 3) +#define PROCBASED2_VIRTUALIZE_X2APIC_MODE (1 << 4) +#define PROCBASED2_ENABLE_VPID (1 << 5) +#define PROCBASED2_WBINVD_EXITING (1 << 6) +#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7) +#define PROCBASED2_APIC_REGISTER_VIRTUALIZATION (1 << 8) +#define PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY (1 << 9) +#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10) +#define PROCBASED2_ENABLE_INVPCID (1 << 12) + +/* VM Exit Controls */ +#define VM_EXIT_SAVE_DEBUG_CONTROLS (1 << 2) +#define VM_EXIT_HOST_LMA (1 << 9) +#define VM_EXIT_LOAD_PERF_GLOBAL_CTRL (1 << 12) +#define VM_EXIT_ACKNOWLEDGE_INTERRUPT (1 << 15) +#define VM_EXIT_SAVE_PAT (1 << 18) +#define VM_EXIT_LOAD_PAT (1 << 19) +#define VM_EXIT_SAVE_EFER (1 << 20) +#define VM_EXIT_LOAD_EFER (1 << 21) +#define VM_EXIT_SAVE_PREEMPTION_TIMER (1 << 22) + +/* VM Entry Controls */ +#define VM_ENTRY_LOAD_DEBUG_CONTROLS (1 << 2) +#define VM_ENTRY_GUEST_LMA (1 << 9) +#define VM_ENTRY_INTO_SMM (1 << 10) +#define VM_ENTRY_DEACTIVATE_DUAL_MONITOR (1 << 11) +#define VM_ENTRY_LOAD_PERF_GLOBAL_CTRL (1 << 13) +#define VM_ENTRY_LOAD_PAT (1 << 14) +#define VM_ENTRY_LOAD_EFER (1 << 15) + +#endif diff --git a/sys/amd64/vmm/intel/vmx_cpufunc.h b/sys/amd64/vmm/intel/vmx_cpufunc.h new file mode 100644 index 000000000000..26ef54472436 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_cpufunc.h @@ -0,0 +1,217 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMX_CPUFUNC_H_ +#define _VMX_CPUFUNC_H_ + +struct vmcs; + +/* + * Section 5.2 "Conventions" from Intel Architecture Manual 2B. + * + * error + * VMsucceed 0 + * VMFailInvalid 1 + * VMFailValid 2 see also VMCS VM-Instruction Error Field + */ +#define VM_SUCCESS 0 +#define VM_FAIL_INVALID 1 +#define VM_FAIL_VALID 2 +#define VMX_SET_ERROR_CODE \ + " jnc 1f;" \ + " mov $1, %[error];" /* CF: error = 1 */ \ + " jmp 3f;" \ + "1: jnz 2f;" \ + " mov $2, %[error];" /* ZF: error = 2 */ \ + " jmp 3f;" \ + "2: mov $0, %[error];" \ + "3:" + +/* returns 0 on success and non-zero on failure */ +static __inline int +vmxon(char *region) +{ + int error; + uint64_t addr; + + addr = vtophys(region); + __asm __volatile("vmxon %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + + return (error); +} + +/* returns 0 on success and non-zero on failure */ +static __inline int +vmclear(struct vmcs *vmcs) +{ + int error; + uint64_t addr; + + addr = vtophys(vmcs); + __asm __volatile("vmclear %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + return (error); +} + +static __inline void +vmxoff(void) +{ + + __asm __volatile("vmxoff"); +} + +static __inline void +vmptrst(uint64_t *addr) +{ + + __asm __volatile("vmptrst %[addr]" :: [addr]"m" (*addr) : "memory"); +} + +static __inline int +vmptrld(struct vmcs *vmcs) +{ + int error; + uint64_t addr; + + addr = vtophys(vmcs); + __asm __volatile("vmptrld %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + return (error); +} + +static __inline int +vmwrite(uint64_t reg, uint64_t val) +{ + int error; + + __asm __volatile("vmwrite %[val], %[reg];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [val] "r" (val), [reg] "r" (reg) + : "memory"); + + return (error); +} + +static __inline int +vmread(uint64_t r, uint64_t *addr) +{ + int error; + + __asm __volatile("vmread %[r], %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error), [addr] "=m" (*addr) + : [r] "r" (r) + : "memory"); + return (error); +} + +static void __inline +VMCLEAR(struct vmcs *vmcs) +{ + int err; + + err = vmclear(vmcs); + if (err != 0) + panic("%s: vmclear(%p) error %d", __func__, vmcs, err); + + critical_exit(); +} + +static void __inline +VMPTRLD(struct vmcs *vmcs) +{ + int err; + + critical_enter(); + + err = vmptrld(vmcs); + if (err != 0) + panic("%s: vmptrld(%p) error %d", __func__, vmcs, err); +} + +#define INVVPID_TYPE_ADDRESS 0UL +#define INVVPID_TYPE_SINGLE_CONTEXT 1UL +#define INVVPID_TYPE_ALL_CONTEXTS 2UL + +struct invvpid_desc { + uint16_t vpid; + uint16_t _res1; + uint32_t _res2; + uint64_t linear_addr; +}; +CTASSERT(sizeof(struct invvpid_desc) == 16); + 
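/*
 * Editor's note: the sketch below is an illustrative example, not part of
 * the original commit.  It shows one way the VM_SUCCESS / VM_FAIL_INVALID /
 * VM_FAIL_VALID convention produced by VMX_SET_ERROR_CODE could be consumed
 * when driving a VMCS with the raw wrappers defined above.  The 'encoding'
 * and 'val' arguments are hypothetical caller-supplied values; the critical
 * section mirrors the VMPTRLD()/VMCLEAR() helpers so the current-VMCS
 * binding cannot migrate between CPUs mid-sequence.
 */
static __inline int
vmcs_example_write_field(struct vmcs *vmcs, uint64_t encoding, uint64_t val)
{
	int error;

	critical_enter();			/* pin the current-VMCS binding to this cpu */
	error = vmptrld(vmcs);
	if (error != VM_SUCCESS) {		/* VM_FAIL_INVALID: bad VMCS pointer */
		critical_exit();
		return (error);
	}

	error = vmwrite(encoding, val);		/* VM_FAIL_VALID: consult the VM-instruction error field */

	(void)vmclear(vmcs);			/* make the VMCS inactive again */
	critical_exit();
	return (error);
}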
+static void __inline +invvpid(uint64_t type, struct invvpid_desc desc) +{ + int error; + + __asm __volatile("invvpid %[desc], %[type];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [desc] "m" (desc), [type] "r" (type) + : "memory"); + + if (error) + panic("invvpid error %d", error); +} + +#define INVEPT_TYPE_SINGLE_CONTEXT 1UL +#define INVEPT_TYPE_ALL_CONTEXTS 2UL +struct invept_desc { + uint64_t eptp; + uint64_t _res; +}; +CTASSERT(sizeof(struct invept_desc) == 16); + +static void __inline +invept(uint64_t type, struct invept_desc desc) +{ + int error; + + __asm __volatile("invept %[desc], %[type];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [desc] "m" (desc), [type] "r" (type) + : "memory"); + + if (error) + panic("invept error %d", error); +} +#endif diff --git a/sys/amd64/vmm/intel/vmx_genassym.c b/sys/amd64/vmm/intel/vmx_genassym.c new file mode 100644 index 000000000000..06d6b494103a --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_genassym.c @@ -0,0 +1,84 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/assym.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_param.h> + +#include <machine/vmm.h> +#include "vmx_cpufunc.h" +#include "vmx.h" + +ASSYM(VMXCTX_GUEST_RDI, offsetof(struct vmxctx, guest_rdi)); +ASSYM(VMXCTX_GUEST_RSI, offsetof(struct vmxctx, guest_rsi)); +ASSYM(VMXCTX_GUEST_RDX, offsetof(struct vmxctx, guest_rdx)); +ASSYM(VMXCTX_GUEST_RCX, offsetof(struct vmxctx, guest_rcx)); +ASSYM(VMXCTX_GUEST_R8, offsetof(struct vmxctx, guest_r8)); +ASSYM(VMXCTX_GUEST_R9, offsetof(struct vmxctx, guest_r9)); +ASSYM(VMXCTX_GUEST_RAX, offsetof(struct vmxctx, guest_rax)); +ASSYM(VMXCTX_GUEST_RBX, offsetof(struct vmxctx, guest_rbx)); +ASSYM(VMXCTX_GUEST_RBP, offsetof(struct vmxctx, guest_rbp)); +ASSYM(VMXCTX_GUEST_R10, offsetof(struct vmxctx, guest_r10)); +ASSYM(VMXCTX_GUEST_R11, offsetof(struct vmxctx, guest_r11)); +ASSYM(VMXCTX_GUEST_R12, offsetof(struct vmxctx, guest_r12)); +ASSYM(VMXCTX_GUEST_R13, offsetof(struct vmxctx, guest_r13)); +ASSYM(VMXCTX_GUEST_R14, offsetof(struct vmxctx, guest_r14)); +ASSYM(VMXCTX_GUEST_R15, offsetof(struct vmxctx, guest_r15)); +ASSYM(VMXCTX_GUEST_CR2, offsetof(struct vmxctx, guest_cr2)); + +ASSYM(VMXCTX_HOST_R15, offsetof(struct vmxctx, host_r15)); +ASSYM(VMXCTX_HOST_R14, offsetof(struct vmxctx, host_r14)); +ASSYM(VMXCTX_HOST_R13, offsetof(struct vmxctx, host_r13)); +ASSYM(VMXCTX_HOST_R12, offsetof(struct vmxctx, host_r12)); +ASSYM(VMXCTX_HOST_RBP, offsetof(struct vmxctx, host_rbp)); +ASSYM(VMXCTX_HOST_RSP, offsetof(struct vmxctx, host_rsp)); +ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx)); + +ASSYM(VMXCTX_INST_FAIL_STATUS, offsetof(struct vmxctx, inst_fail_status)); + +ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID); +ASSYM(VM_FAIL_VALID, VM_FAIL_VALID); +ASSYM(VMX_GUEST_VMEXIT, VMX_GUEST_VMEXIT); +ASSYM(VMX_VMRESUME_ERROR, VMX_VMRESUME_ERROR); +ASSYM(VMX_VMLAUNCH_ERROR, VMX_VMLAUNCH_ERROR); + +ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid)); + +ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); +ASSYM(PM_EPTGEN, offsetof(struct pmap, pm_eptgen)); + +ASSYM(KERNEL_SS, GSEL(GDATA_SEL, SEL_KPL)); +ASSYM(KERNEL_CS, GSEL(GCODE_SEL, SEL_KPL)); + +ASSYM(PAGE_SIZE, PAGE_SIZE); +ASSYM(KERNBASE, KERNBASE); diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c new file mode 100644 index 000000000000..40dbec290f2d --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_msr.c @@ -0,0 +1,511 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> + +#include <machine/clock.h> +#include <machine/cpufunc.h> +#include <machine/md_var.h> +#include <machine/pcb.h> +#include <machine/specialreg.h> +#include <machine/vmm.h> + +#include "vmx.h" +#include "vmx_msr.h" +#include "x86.h" + +static bool +vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos) +{ + + return ((msr_val & (1UL << (bitpos + 32))) != 0); +} + +static bool +vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos) +{ + + return ((msr_val & (1UL << bitpos)) == 0); +} + +uint32_t +vmx_revision(void) +{ + + return (rdmsr(MSR_VMX_BASIC) & 0xffffffff); +} + +/* + * Generate a bitmask to be used for the VMCS execution control fields. + * + * The caller specifies what bits should be set to one in 'ones_mask' + * and what bits should be set to zero in 'zeros_mask'. The don't-care + * bits are set to the default value. The default values are obtained + * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining + * VMX Capabilities". + * + * Returns zero on success and non-zero on error. + */ +int +vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, + uint32_t zeros_mask, uint32_t *retval) +{ + int i; + uint64_t val, trueval; + bool true_ctls_avail, one_allowed, zero_allowed; + + /* We cannot ask the same bit to be set to both '1' and '0' */ + if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask)) + return (EINVAL); + + true_ctls_avail = (rdmsr(MSR_VMX_BASIC) & (1UL << 55)) != 0; + + val = rdmsr(ctl_reg); + if (true_ctls_avail) + trueval = rdmsr(true_ctl_reg); /* step c */ + else + trueval = val; /* step a */ + + for (i = 0; i < 32; i++) { + one_allowed = vmx_ctl_allows_one_setting(trueval, i); + zero_allowed = vmx_ctl_allows_zero_setting(trueval, i); + + KASSERT(one_allowed || zero_allowed, + ("invalid zero/one setting for bit %d of ctl 0x%0x, " + "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg)); + + if (zero_allowed && !one_allowed) { /* b(i),c(i) */ + if (ones_mask & (1 << i)) + return (EINVAL); + *retval &= ~(1 << i); + } else if (one_allowed && !zero_allowed) { /* b(i),c(i) */ + if (zeros_mask & (1 << i)) + return (EINVAL); + *retval |= 1 << i; + } else { + if (zeros_mask & (1 << i)) /* b(ii),c(ii) */ + *retval &= ~(1 << i); + else if (ones_mask & (1 << i)) /* b(ii), c(ii) */ + *retval |= 1 << i; + else if (!true_ctls_avail) + *retval &= ~(1 << i); /* b(iii) */ + else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/ + *retval &= ~(1 << i); + else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */ + *retval |= 1 << i; + else { + panic("vmx_set_ctlreg: unable to determine " + "correct value of ctl bit %d for msr " + "0x%0x and true msr 0x%0x", i, ctl_reg, + true_ctl_reg); + } + } + } + + return (0); +} + +void +msr_bitmap_initialize(char *bitmap) +{ + + memset(bitmap, 0xff, PAGE_SIZE); +} + +int +msr_bitmap_change_access(char *bitmap, u_int msr, int access) +{ + int byte, bit; + + if (msr <= 0x00001FFF) + byte = msr / 8; + else if (msr >= 
0xC0000000 && msr <= 0xC0001FFF) + byte = 1024 + (msr - 0xC0000000) / 8; + else + return (EINVAL); + + bit = msr & 0x7; + + if (access & MSR_BITMAP_ACCESS_READ) + bitmap[byte] &= ~(1 << bit); + else + bitmap[byte] |= 1 << bit; + + byte += 2048; + if (access & MSR_BITMAP_ACCESS_WRITE) + bitmap[byte] &= ~(1 << bit); + else + bitmap[byte] |= 1 << bit; + + return (0); +} + +static uint64_t misc_enable; +static uint64_t platform_info; +static uint64_t turbo_ratio_limit; +static uint64_t host_msrs[GUEST_MSR_NUM]; + +static bool +nehalem_cpu(void) +{ + u_int family, model; + + /* + * The family:model numbers belonging to the Nehalem microarchitecture + * are documented in Section 35.5, Intel SDM dated Feb 2014. + */ + family = CPUID_TO_FAMILY(cpu_id); + model = CPUID_TO_MODEL(cpu_id); + if (family == 0x6) { + switch (model) { + case 0x1A: + case 0x1E: + case 0x1F: + case 0x2E: + return (true); + default: + break; + } + } + return (false); +} + +static bool +westmere_cpu(void) +{ + u_int family, model; + + /* + * The family:model numbers belonging to the Westmere microarchitecture + * are documented in Section 35.6, Intel SDM dated Feb 2014. + */ + family = CPUID_TO_FAMILY(cpu_id); + model = CPUID_TO_MODEL(cpu_id); + if (family == 0x6) { + switch (model) { + case 0x25: + case 0x2C: + return (true); + default: + break; + } + } + return (false); +} + +static bool +pat_valid(uint64_t val) +{ + int i, pa; + + /* + * From Intel SDM: Table "Memory Types That Can Be Encoded With PAT" + * + * Extract PA0 through PA7 and validate that each one encodes a + * valid memory type. + */ + for (i = 0; i < 8; i++) { + pa = (val >> (i * 8)) & 0xff; + if (pa == 2 || pa == 3 || pa >= 8) + return (false); + } + return (true); +} + +void +vmx_msr_init(void) +{ + uint64_t bus_freq, ratio; + int i; + + /* + * It is safe to cache the values of the following MSRs because + * they don't change based on curcpu, curproc or curthread. + */ + host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); + + /* + * Initialize emulated MSRs + */ + misc_enable = rdmsr(MSR_IA32_MISC_ENABLE); + /* + * Set mandatory bits + * 11: branch trace disabled + * 12: PEBS unavailable + * Clear unsupported features + * 16: SpeedStep enable + * 18: enable MONITOR FSM + */ + misc_enable |= (1 << 12) | (1 << 11); + misc_enable &= ~((1 << 18) | (1 << 16)); + + if (nehalem_cpu() || westmere_cpu()) + bus_freq = 133330000; /* 133Mhz */ + else + bus_freq = 100000000; /* 100Mhz */ + + /* + * XXXtime + * The ratio should really be based on the virtual TSC frequency as + * opposed to the host TSC. + */ + ratio = (tsc_freq / bus_freq) & 0xff; + + /* + * The register definition is based on the micro-architecture + * but the following bits are always the same: + * [15:8] Maximum Non-Turbo Ratio + * [28] Programmable Ratio Limit for Turbo Mode + * [29] Programmable TDC-TDP Limit for Turbo Mode + * [47:40] Maximum Efficiency Ratio + * + * The other bits can be safely set to 0 on all + * micro-architectures up to Haswell. + */ + platform_info = (ratio << 8) | (ratio << 40); + + /* + * The number of valid bits in the MSR_TURBO_RATIO_LIMITx register is + * dependent on the maximum cores per package supported by the micro- + * architecture. For e.g., Westmere supports 6 cores per package and + * uses the low 48 bits. Sandybridge support 8 cores per package and + * uses up all 64 bits. 
+ * + * However, the unused bits are reserved so we pretend that all bits + * in this MSR are valid. + */ + for (i = 0; i < 8; i++) + turbo_ratio_limit = (turbo_ratio_limit << 8) | ratio; +} + +void +vmx_msr_guest_init(struct vmx *vmx, struct vmx_vcpu *vcpu) +{ + /* + * The permissions bitmap is shared between all vcpus so initialize it + * once when initializing the vBSP. + */ + if (vcpu->vcpuid == 0) { + guest_msr_rw(vmx, MSR_LSTAR); + guest_msr_rw(vmx, MSR_CSTAR); + guest_msr_rw(vmx, MSR_STAR); + guest_msr_rw(vmx, MSR_SF_MASK); + guest_msr_rw(vmx, MSR_KGSBASE); + } + + /* + * Initialize guest IA32_PAT MSR with default value after reset. + */ + vcpu->guest_msrs[IDX_MSR_PAT] = PAT_VALUE(0, PAT_WRITE_BACK) | + PAT_VALUE(1, PAT_WRITE_THROUGH) | + PAT_VALUE(2, PAT_UNCACHED) | + PAT_VALUE(3, PAT_UNCACHEABLE) | + PAT_VALUE(4, PAT_WRITE_BACK) | + PAT_VALUE(5, PAT_WRITE_THROUGH) | + PAT_VALUE(6, PAT_UNCACHED) | + PAT_VALUE(7, PAT_UNCACHEABLE); + + return; +} + +void +vmx_msr_guest_enter(struct vmx_vcpu *vcpu) +{ + + /* Save host MSRs (in particular, KGSBASE) and restore guest MSRs */ + update_pcb_bases(curpcb); + wrmsr(MSR_LSTAR, vcpu->guest_msrs[IDX_MSR_LSTAR]); + wrmsr(MSR_CSTAR, vcpu->guest_msrs[IDX_MSR_CSTAR]); + wrmsr(MSR_STAR, vcpu->guest_msrs[IDX_MSR_STAR]); + wrmsr(MSR_SF_MASK, vcpu->guest_msrs[IDX_MSR_SF_MASK]); + wrmsr(MSR_KGSBASE, vcpu->guest_msrs[IDX_MSR_KGSBASE]); +} + +void +vmx_msr_guest_enter_tsc_aux(struct vmx *vmx, struct vmx_vcpu *vcpu) +{ + uint64_t guest_tsc_aux = vcpu->guest_msrs[IDX_MSR_TSC_AUX]; + uint32_t host_aux = cpu_auxmsr(); + + if (vmx_have_msr_tsc_aux && guest_tsc_aux != host_aux) + wrmsr(MSR_TSC_AUX, guest_tsc_aux); +} + +void +vmx_msr_guest_exit(struct vmx_vcpu *vcpu) +{ + + /* Save guest MSRs */ + vcpu->guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + vcpu->guest_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + vcpu->guest_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + vcpu->guest_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); + vcpu->guest_msrs[IDX_MSR_KGSBASE] = rdmsr(MSR_KGSBASE); + + /* Restore host MSRs */ + wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]); + wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]); + wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]); + wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]); + + /* MSR_KGSBASE will be restored on the way back to userspace */ +} + +void +vmx_msr_guest_exit_tsc_aux(struct vmx *vmx, struct vmx_vcpu *vcpu) +{ + uint64_t guest_tsc_aux = vcpu->guest_msrs[IDX_MSR_TSC_AUX]; + uint32_t host_aux = cpu_auxmsr(); + + if (vmx_have_msr_tsc_aux && guest_tsc_aux != host_aux) + /* + * Note that it is not necessary to save the guest value + * here; vcpu->guest_msrs[IDX_MSR_TSC_AUX] always + * contains the current value since it is updated whenever + * the guest writes to it (which is expected to be very + * rare). + */ + wrmsr(MSR_TSC_AUX, host_aux); +} + +int +vmx_rdmsr(struct vmx_vcpu *vcpu, u_int num, uint64_t *val, bool *retu) +{ + int error; + + error = 0; + + switch (num) { + case MSR_MCG_CAP: + case MSR_MCG_STATUS: + *val = 0; + break; + case MSR_MTRRcap: + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + case MSR_MTRRVarBase ... 
MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: + if (vm_rdmtrr(&vcpu->mtrr, num, val) != 0) { + vm_inject_gp(vcpu->vcpu); + } + break; + case MSR_IA32_MISC_ENABLE: + *val = misc_enable; + break; + case MSR_PLATFORM_INFO: + *val = platform_info; + break; + case MSR_TURBO_RATIO_LIMIT: + case MSR_TURBO_RATIO_LIMIT1: + *val = turbo_ratio_limit; + break; + case MSR_PAT: + *val = vcpu->guest_msrs[IDX_MSR_PAT]; + break; + default: + error = EINVAL; + break; + } + return (error); +} + +int +vmx_wrmsr(struct vmx_vcpu *vcpu, u_int num, uint64_t val, bool *retu) +{ + uint64_t changed; + int error; + + error = 0; + + switch (num) { + case MSR_MCG_CAP: + case MSR_MCG_STATUS: + break; /* ignore writes */ + case MSR_MTRRcap: + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: + if (vm_wrmtrr(&vcpu->mtrr, num, val) != 0) { + vm_inject_gp(vcpu->vcpu); + } + break; + case MSR_IA32_MISC_ENABLE: + changed = val ^ misc_enable; + /* + * If the host has disabled the NX feature then the guest + * also cannot use it. However, a Linux guest will try to + * enable the NX feature by writing to the MISC_ENABLE MSR. + * + * This can be safely ignored because the memory management + * code looks at CPUID.80000001H:EDX.NX to check if the + * functionality is actually enabled. + */ + changed &= ~(1UL << 34); + + /* + * Punt to userspace if any other bits are being modified. + */ + if (changed) + error = EINVAL; + + break; + case MSR_PAT: + if (pat_valid(val)) + vcpu->guest_msrs[IDX_MSR_PAT] = val; + else + vm_inject_gp(vcpu->vcpu); + break; + case MSR_TSC: + error = vmx_set_tsc_offset(vcpu, val - rdtsc()); + break; + case MSR_TSC_AUX: + if (vmx_have_msr_tsc_aux) + /* + * vmx_msr_guest_enter_tsc_aux() will apply this + * value when it is called immediately before guest + * entry. + */ + vcpu->guest_msrs[IDX_MSR_TSC_AUX] = val; + else + vm_inject_gp(vcpu->vcpu); + break; + default: + error = EINVAL; + break; + } + + return (error); +} diff --git a/sys/amd64/vmm/intel/vmx_msr.h b/sys/amd64/vmm/intel/vmx_msr.h new file mode 100644 index 000000000000..f88f37bd9163 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_msr.h @@ -0,0 +1,72 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMX_MSR_H_ +#define _VMX_MSR_H_ + +struct vmx; + +void vmx_msr_init(void); +void vmx_msr_guest_init(struct vmx *vmx, struct vmx_vcpu *vcpu); +void vmx_msr_guest_enter_tsc_aux(struct vmx *vmx, struct vmx_vcpu *vcpu); +void vmx_msr_guest_enter(struct vmx_vcpu *vcpu); +void vmx_msr_guest_exit(struct vmx_vcpu *vcpu); +void vmx_msr_guest_exit_tsc_aux(struct vmx *vmx, struct vmx_vcpu *vcpu); +int vmx_rdmsr(struct vmx_vcpu *vcpu, u_int num, uint64_t *val, bool *retu); +int vmx_wrmsr(struct vmx_vcpu *vcpu, u_int num, uint64_t val, bool *retu); + +uint32_t vmx_revision(void); + +int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, + uint32_t zeros_mask, uint32_t *retval); + +/* + * According to Section 21.10.4 "Software Access to Related Structures", + * changes to data structures pointed to by the VMCS must be made only when + * there is no logical processor with a current VMCS that points to the + * data structure. + * + * This pretty much limits us to configuring the MSR bitmap before VMCS + * initialization for SMP VMs. Unless of course we do it the hard way - which + * would involve some form of synchronization between the vcpus to vmclear + * all VMCSs' that point to the bitmap. + */ +#define MSR_BITMAP_ACCESS_NONE 0x0 +#define MSR_BITMAP_ACCESS_READ 0x1 +#define MSR_BITMAP_ACCESS_WRITE 0x2 +#define MSR_BITMAP_ACCESS_RW (MSR_BITMAP_ACCESS_READ|MSR_BITMAP_ACCESS_WRITE) +void msr_bitmap_initialize(char *bitmap); +int msr_bitmap_change_access(char *bitmap, u_int msr, int access); + +#define guest_msr_rw(vmx, msr) \ + msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW) + +#define guest_msr_ro(vmx, msr) \ + msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_READ) + +#endif diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S new file mode 100644 index 000000000000..877e377f892d --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_support.S @@ -0,0 +1,270 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <machine/asmacros.h> +#include <machine/specialreg.h> + +#include "vmx_assym.h" + +/* Be friendly to DTrace FBT's prologue/epilogue pattern matching */ +#define VENTER push %rbp ; mov %rsp,%rbp +#define VLEAVE pop %rbp + +/* + * Save the guest context. + */ +#define VMX_GUEST_SAVE \ + movq %rdi,VMXCTX_GUEST_RDI(%rsp); \ + movq %rsi,VMXCTX_GUEST_RSI(%rsp); \ + movq %rdx,VMXCTX_GUEST_RDX(%rsp); \ + movq %rcx,VMXCTX_GUEST_RCX(%rsp); \ + movq %r8,VMXCTX_GUEST_R8(%rsp); \ + movq %r9,VMXCTX_GUEST_R9(%rsp); \ + movq %rax,VMXCTX_GUEST_RAX(%rsp); \ + movq %rbx,VMXCTX_GUEST_RBX(%rsp); \ + movq %rbp,VMXCTX_GUEST_RBP(%rsp); \ + movq %r10,VMXCTX_GUEST_R10(%rsp); \ + movq %r11,VMXCTX_GUEST_R11(%rsp); \ + movq %r12,VMXCTX_GUEST_R12(%rsp); \ + movq %r13,VMXCTX_GUEST_R13(%rsp); \ + movq %r14,VMXCTX_GUEST_R14(%rsp); \ + movq %r15,VMXCTX_GUEST_R15(%rsp); \ + movq %cr2,%rdi; \ + movq %rdi,VMXCTX_GUEST_CR2(%rsp); \ + movq %rsp,%rdi; + +/* + * Assumes that %rdi holds a pointer to the 'vmxctx'. + * + * On "return" all registers are updated to reflect guest state. The two + * exceptions are %rip and %rsp. These registers are atomically switched + * by hardware from the guest area of the vmcs. + * + * We modify %rsp to point to the 'vmxctx' so we can use it to restore + * host context in case of an error with 'vmlaunch' or 'vmresume'. + */ +#define VMX_GUEST_RESTORE \ + movq %rdi,%rsp; \ + movq VMXCTX_GUEST_CR2(%rdi),%rsi; \ + movq %rsi,%cr2; \ + movq VMXCTX_GUEST_RSI(%rdi),%rsi; \ + movq VMXCTX_GUEST_RDX(%rdi),%rdx; \ + movq VMXCTX_GUEST_RCX(%rdi),%rcx; \ + movq VMXCTX_GUEST_R8(%rdi),%r8; \ + movq VMXCTX_GUEST_R9(%rdi),%r9; \ + movq VMXCTX_GUEST_RAX(%rdi),%rax; \ + movq VMXCTX_GUEST_RBX(%rdi),%rbx; \ + movq VMXCTX_GUEST_RBP(%rdi),%rbp; \ + movq VMXCTX_GUEST_R10(%rdi),%r10; \ + movq VMXCTX_GUEST_R11(%rdi),%r11; \ + movq VMXCTX_GUEST_R12(%rdi),%r12; \ + movq VMXCTX_GUEST_R13(%rdi),%r13; \ + movq VMXCTX_GUEST_R14(%rdi),%r14; \ + movq VMXCTX_GUEST_R15(%rdi),%r15; \ + movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */ + +/* + * Clobber the remaining registers with guest contents so they can't + * be misused. + */ +#define VMX_GUEST_CLOBBER \ + xor %rax, %rax; \ + xor %rcx, %rcx; \ + xor %rdx, %rdx; \ + xor %rsi, %rsi; \ + xor %r8, %r8; \ + xor %r9, %r9; \ + xor %r10, %r10; \ + xor %r11, %r11; + +/* + * Save and restore the host context. + * + * Assumes that %rdi holds a pointer to the 'vmxctx'. 
+ */ +#define VMX_HOST_SAVE \ + movq %r15, VMXCTX_HOST_R15(%rdi); \ + movq %r14, VMXCTX_HOST_R14(%rdi); \ + movq %r13, VMXCTX_HOST_R13(%rdi); \ + movq %r12, VMXCTX_HOST_R12(%rdi); \ + movq %rbp, VMXCTX_HOST_RBP(%rdi); \ + movq %rsp, VMXCTX_HOST_RSP(%rdi); \ + movq %rbx, VMXCTX_HOST_RBX(%rdi); \ + +#define VMX_HOST_RESTORE \ + movq VMXCTX_HOST_R15(%rdi), %r15; \ + movq VMXCTX_HOST_R14(%rdi), %r14; \ + movq VMXCTX_HOST_R13(%rdi), %r13; \ + movq VMXCTX_HOST_R12(%rdi), %r12; \ + movq VMXCTX_HOST_RBP(%rdi), %rbp; \ + movq VMXCTX_HOST_RSP(%rdi), %rsp; \ + movq VMXCTX_HOST_RBX(%rdi), %rbx; \ + +/* + * vmx_enter_guest(struct vmxctx *vmxctx, int launched) + * %rdi: pointer to the 'vmxctx' + * %rsi: pointer to the 'vmx' + * %edx: launch state of the VMCS + * Interrupts must be disabled on entry. + */ +ENTRY(vmx_enter_guest) + VENTER + /* + * Save host state before doing anything else. + */ + VMX_HOST_SAVE + +guest_restore: + movl %edx, %r8d + cmpb $0, guest_l1d_flush_sw(%rip) + je after_l1d + call flush_l1d_sw +after_l1d: + cmpl $0, %r8d + je do_launch + VMX_GUEST_RESTORE + vmresume + /* + * In the common case 'vmresume' returns back to the host through + * 'vmx_exit_guest' with %rsp pointing to 'vmxctx'. + * + * If there is an error we return VMX_VMRESUME_ERROR to the caller. + */ + movq %rsp, %rdi /* point %rdi back to 'vmxctx' */ + movl $VMX_VMRESUME_ERROR, %eax + jmp decode_inst_error + +do_launch: + VMX_GUEST_RESTORE + vmlaunch + /* + * In the common case 'vmlaunch' returns back to the host through + * 'vmx_exit_guest' with %rsp pointing to 'vmxctx'. + * + * If there is an error we return VMX_VMLAUNCH_ERROR to the caller. + */ + movq %rsp, %rdi /* point %rdi back to 'vmxctx' */ + movl $VMX_VMLAUNCH_ERROR, %eax + /* FALLTHROUGH */ +decode_inst_error: + movl $VM_FAIL_VALID, %r11d + movl $VM_FAIL_INVALID, %esi + cmovnzl %esi, %r11d + movl %r11d, VMXCTX_INST_FAIL_STATUS(%rdi) + + /* + * The return value is already populated in %eax so we cannot use + * it as a scratch register beyond this point. + */ + + VMX_HOST_RESTORE + VLEAVE + ret + +/* + * Non-error VM-exit from the guest. Make this a label so it can + * be used by C code when setting up the VMCS. + * The VMCS-restored %rsp points to the struct vmxctx + */ + ALIGN_TEXT + .globl vmx_exit_guest_flush_rsb +vmx_exit_guest_flush_rsb: + /* + * Save guest state that is not automatically saved in the vmcs. + */ + VMX_GUEST_SAVE + + VMX_HOST_RESTORE + + VMX_GUEST_CLOBBER + + /* + * To prevent malicious branch target predictions from + * affecting the host, overwrite all entries in the RSB upon + * exiting a guest. + */ + mov $16, %ecx /* 16 iterations, two calls per loop */ + mov %rsp, %rax +0: call 2f /* create an RSB entry. */ +1: pause + call 1b /* capture rogue speculation. */ +2: call 2f /* create an RSB entry. */ +1: pause + call 1b /* capture rogue speculation. */ +2: sub $1, %ecx + jnz 0b + mov %rax, %rsp + + /* + * This will return to the caller of 'vmx_enter_guest()' with a return + * value of VMX_GUEST_VMEXIT. + */ + movl $VMX_GUEST_VMEXIT, %eax + VLEAVE + ret + + .globl vmx_exit_guest +vmx_exit_guest: + /* + * Save guest state that is not automatically saved in the vmcs. + */ + VMX_GUEST_SAVE + + VMX_HOST_RESTORE + + VMX_GUEST_CLOBBER + + /* + * This will return to the caller of 'vmx_enter_guest()' with a return + * value of VMX_GUEST_VMEXIT. 
+ */ + movl $VMX_GUEST_VMEXIT, %eax + VLEAVE + ret +END(vmx_enter_guest) + +/* + * %rdi = interrupt handler entry point + * + * Calling sequence described in the "Instruction Set Reference" for the "INT" + * instruction in Intel SDM, Vol 2. + */ +ENTRY(vmx_call_isr) + VENTER + mov %rsp, %r11 /* save %rsp */ + and $~0xf, %rsp /* align on 16-byte boundary */ + pushq $KERNEL_SS /* %ss */ + pushq %r11 /* %rsp */ + pushfq /* %rflags */ + pushq $KERNEL_CS /* %cs */ + cli /* disable interrupts */ + callq *%rdi /* push %rip and call isr */ + VLEAVE + ret +END(vmx_call_isr) diff --git a/sys/amd64/vmm/intel/vtd.c b/sys/amd64/vmm/intel/vtd.c new file mode 100644 index 000000000000..b56541290a9d --- /dev/null +++ b/sys/amd64/vmm/intel/vtd.c @@ -0,0 +1,779 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <dev/pci/pcireg.h> + +#include <machine/vmparam.h> +#include <contrib/dev/acpica/include/acpi.h> + +#include "io/iommu.h" + +/* + * Documented in the "Intel Virtualization Technology for Directed I/O", + * Architecture Spec, September 2008. 
+ */ + +#define VTD_DRHD_INCLUDE_PCI_ALL(Flags) (((Flags) >> 0) & 0x1) + +/* Section 10.4 "Register Descriptions" */ +struct vtdmap { + volatile uint32_t version; + volatile uint32_t res0; + volatile uint64_t cap; + volatile uint64_t ext_cap; + volatile uint32_t gcr; + volatile uint32_t gsr; + volatile uint64_t rta; + volatile uint64_t ccr; +}; + +#define VTD_CAP_SAGAW(cap) (((cap) >> 8) & 0x1F) +#define VTD_CAP_ND(cap) ((cap) & 0x7) +#define VTD_CAP_CM(cap) (((cap) >> 7) & 0x1) +#define VTD_CAP_SPS(cap) (((cap) >> 34) & 0xF) +#define VTD_CAP_RWBF(cap) (((cap) >> 4) & 0x1) + +#define VTD_ECAP_DI(ecap) (((ecap) >> 2) & 0x1) +#define VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1) +#define VTD_ECAP_IRO(ecap) (((ecap) >> 8) & 0x3FF) + +#define VTD_GCR_WBF (1 << 27) +#define VTD_GCR_SRTP (1 << 30) +#define VTD_GCR_TE (1U << 31) + +#define VTD_GSR_WBFS (1 << 27) +#define VTD_GSR_RTPS (1 << 30) +#define VTD_GSR_TES (1U << 31) + +#define VTD_CCR_ICC (1UL << 63) /* invalidate context cache */ +#define VTD_CCR_CIRG_GLOBAL (1UL << 61) /* global invalidation */ + +#define VTD_IIR_IVT (1UL << 63) /* invalidation IOTLB */ +#define VTD_IIR_IIRG_GLOBAL (1ULL << 60) /* global IOTLB invalidation */ +#define VTD_IIR_IIRG_DOMAIN (2ULL << 60) /* domain IOTLB invalidation */ +#define VTD_IIR_IIRG_PAGE (3ULL << 60) /* page IOTLB invalidation */ +#define VTD_IIR_DRAIN_READS (1ULL << 49) /* drain pending DMA reads */ +#define VTD_IIR_DRAIN_WRITES (1ULL << 48) /* drain pending DMA writes */ +#define VTD_IIR_DOMAIN_P 32 + +#define VTD_ROOT_PRESENT 0x1 +#define VTD_CTX_PRESENT 0x1 +#define VTD_CTX_TT_ALL (1UL << 2) + +#define VTD_PTE_RD (1UL << 0) +#define VTD_PTE_WR (1UL << 1) +#define VTD_PTE_SUPERPAGE (1UL << 7) +#define VTD_PTE_ADDR_M (0x000FFFFFFFFFF000UL) + +#define VTD_RID2IDX(rid) (((rid) & 0xff) * 2) + +struct domain { + uint64_t *ptp; /* first level page table page */ + int pt_levels; /* number of page table levels */ + int addrwidth; /* 'AW' field in context entry */ + int spsmask; /* supported super page sizes */ + u_int id; /* domain id */ + vm_paddr_t maxaddr; /* highest address to be mapped */ + SLIST_ENTRY(domain) next; +}; + +static SLIST_HEAD(, domain) domhead; + +#define DRHD_MAX_UNITS 16 +static ACPI_DMAR_HARDWARE_UNIT *drhds[DRHD_MAX_UNITS]; +static int drhd_num; +static struct vtdmap *vtdmaps[DRHD_MAX_UNITS]; +static int max_domains; +typedef int (*drhd_ident_func_t)(void); + +static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); +static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); + +static MALLOC_DEFINE(M_VTD, "vtd", "vtd"); + +static int +vtd_max_domains(struct vtdmap *vtdmap) +{ + int nd; + + nd = VTD_CAP_ND(vtdmap->cap); + + switch (nd) { + case 0: + return (16); + case 1: + return (64); + case 2: + return (256); + case 3: + return (1024); + case 4: + return (4 * 1024); + case 5: + return (16 * 1024); + case 6: + return (64 * 1024); + default: + panic("vtd_max_domains: invalid value of nd (0x%0x)", nd); + } +} + +static u_int +domain_id(void) +{ + u_int id; + struct domain *dom; + + /* Skip domain id 0 - it is reserved when Caching Mode field is set */ + for (id = 1; id < max_domains; id++) { + SLIST_FOREACH(dom, &domhead, next) { + if (dom->id == id) + break; + } + if (dom == NULL) + break; /* found it */ + } + + if (id >= max_domains) + panic("domain ids exhausted"); + + return (id); +} + +static struct vtdmap * +vtd_device_scope(uint16_t rid) +{ + int i, remaining, pathremaining; + char *end, *pathend; + struct vtdmap *vtdmap; + 
ACPI_DMAR_HARDWARE_UNIT *drhd; + ACPI_DMAR_DEVICE_SCOPE *device_scope; + ACPI_DMAR_PCI_PATH *path; + + for (i = 0; i < drhd_num; i++) { + drhd = drhds[i]; + + if (VTD_DRHD_INCLUDE_PCI_ALL(drhd->Flags)) { + /* + * From Intel VT-d arch spec, version 3.0: + * If a DRHD structure with INCLUDE_PCI_ALL flag Set is reported + * for a Segment, it must be enumerated by BIOS after all other + * DRHD structures for the same Segment. + */ + vtdmap = vtdmaps[i]; + return(vtdmap); + } + + end = (char *)drhd + drhd->Header.Length; + remaining = drhd->Header.Length - sizeof(ACPI_DMAR_HARDWARE_UNIT); + while (remaining > sizeof(ACPI_DMAR_DEVICE_SCOPE)) { + device_scope = (ACPI_DMAR_DEVICE_SCOPE *)(end - remaining); + remaining -= device_scope->Length; + + switch (device_scope->EntryType){ + /* 0x01 and 0x02 are PCI device entries */ + case 0x01: + case 0x02: + break; + default: + continue; + } + + if (PCI_RID2BUS(rid) != device_scope->Bus) + continue; + + pathend = (char *)device_scope + device_scope->Length; + pathremaining = device_scope->Length - sizeof(ACPI_DMAR_DEVICE_SCOPE); + while (pathremaining >= sizeof(ACPI_DMAR_PCI_PATH)) { + path = (ACPI_DMAR_PCI_PATH *)(pathend - pathremaining); + pathremaining -= sizeof(ACPI_DMAR_PCI_PATH); + + if (PCI_RID2SLOT(rid) != path->Device) + continue; + if (PCI_RID2FUNC(rid) != path->Function) + continue; + + vtdmap = vtdmaps[i]; + return (vtdmap); + } + } + } + + /* No matching scope */ + return (NULL); +} + +static void +vtd_wbflush(struct vtdmap *vtdmap) +{ + + if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0) + pmap_invalidate_cache(); + + if (VTD_CAP_RWBF(vtdmap->cap)) { + vtdmap->gcr = VTD_GCR_WBF; + while ((vtdmap->gsr & VTD_GSR_WBFS) != 0) + ; + } +} + +static void +vtd_ctx_global_invalidate(struct vtdmap *vtdmap) +{ + + vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL; + while ((vtdmap->ccr & VTD_CCR_ICC) != 0) + ; +} + +static void +vtd_iotlb_global_invalidate(struct vtdmap *vtdmap) +{ + int offset; + volatile uint64_t *iotlb_reg, val; + + vtd_wbflush(vtdmap); + + offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16; + iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8); + + *iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL | + VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES; + + while (1) { + val = *iotlb_reg; + if ((val & VTD_IIR_IVT) == 0) + break; + } +} + +static void +vtd_translation_enable(struct vtdmap *vtdmap) +{ + + vtdmap->gcr = VTD_GCR_TE; + while ((vtdmap->gsr & VTD_GSR_TES) == 0) + ; +} + +static void +vtd_translation_disable(struct vtdmap *vtdmap) +{ + + vtdmap->gcr = 0; + while ((vtdmap->gsr & VTD_GSR_TES) != 0) + ; +} + +static int +vtd_init(void) +{ + int i, units, remaining, tmp; + struct vtdmap *vtdmap; + vm_paddr_t ctx_paddr; + char *end, envname[32]; + unsigned long mapaddr; + ACPI_STATUS status; + ACPI_TABLE_DMAR *dmar; + ACPI_DMAR_HEADER *hdr; + ACPI_DMAR_HARDWARE_UNIT *drhd; + + /* + * Allow the user to override the ACPI DMAR table by specifying the + * physical address of each remapping unit. + * + * The following example specifies two remapping units at + * physical addresses 0xfed90000 and 0xfeda0000 respectively. + * set vtd.regmap.0.addr=0xfed90000 + * set vtd.regmap.1.addr=0xfeda0000 + */ + for (units = 0; units < DRHD_MAX_UNITS; units++) { + snprintf(envname, sizeof(envname), "vtd.regmap.%d.addr", units); + if (getenv_ulong(envname, &mapaddr) == 0) + break; + vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(mapaddr); + } + + if (units > 0) + goto skip_dmar; + + /* Search for DMAR table. 
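The DMAR table consists of the ACPI header followed by a list of remapping structures, each beginning with an ACPI_DMAR_HEADER giving its Type and Length; the loop below walks that list and records the DRHD entries.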
*/ + status = AcpiGetTable(ACPI_SIG_DMAR, 0, (ACPI_TABLE_HEADER **)&dmar); + if (ACPI_FAILURE(status)) + return (ENXIO); + + end = (char *)dmar + dmar->Header.Length; + remaining = dmar->Header.Length - sizeof(ACPI_TABLE_DMAR); + while (remaining > sizeof(ACPI_DMAR_HEADER)) { + hdr = (ACPI_DMAR_HEADER *)(end - remaining); + if (hdr->Length > remaining) + break; + /* + * From Intel VT-d arch spec, version 1.3: + * BIOS implementations must report mapping structures + * in numerical order, i.e. All remapping structures of + * type 0 (DRHD) enumerated before remapping structures of + * type 1 (RMRR) and so forth. + */ + if (hdr->Type != ACPI_DMAR_TYPE_HARDWARE_UNIT) + break; + + drhd = (ACPI_DMAR_HARDWARE_UNIT *)hdr; + drhds[units] = drhd; + vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(drhd->Address); + if (++units >= DRHD_MAX_UNITS) + break; + remaining -= hdr->Length; + } + + if (units <= 0) + return (ENXIO); + +skip_dmar: + drhd_num = units; + + max_domains = 64 * 1024; /* maximum valid value */ + for (i = 0; i < drhd_num; i++){ + vtdmap = vtdmaps[i]; + + if (VTD_CAP_CM(vtdmap->cap) != 0) + panic("vtd_init: invalid caching mode"); + + /* take most compatible (minimum) value */ + if ((tmp = vtd_max_domains(vtdmap)) < max_domains) + max_domains = tmp; + } + + /* + * Set up the root-table to point to the context-entry tables + */ + for (i = 0; i < 256; i++) { + ctx_paddr = vtophys(ctx_tables[i]); + if (ctx_paddr & PAGE_MASK) + panic("ctx table (0x%0lx) not page aligned", ctx_paddr); + + root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT; + } + + return (0); +} + +static void +vtd_cleanup(void) +{ +} + +static void +vtd_enable(void) +{ + int i; + struct vtdmap *vtdmap; + + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_wbflush(vtdmap); + + /* Update the root table address */ + vtdmap->rta = vtophys(root_table); + vtdmap->gcr = VTD_GCR_SRTP; + while ((vtdmap->gsr & VTD_GSR_RTPS) == 0) + ; + + vtd_ctx_global_invalidate(vtdmap); + vtd_iotlb_global_invalidate(vtdmap); + + vtd_translation_enable(vtdmap); + } +} + +static void +vtd_disable(void) +{ + int i; + struct vtdmap *vtdmap; + + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_translation_disable(vtdmap); + } +} + +static int +vtd_add_device(void *arg, device_t dev __unused, uint16_t rid) +{ + int idx; + uint64_t *ctxp; + struct domain *dom = arg; + vm_paddr_t pt_paddr; + struct vtdmap *vtdmap; + uint8_t bus; + + KASSERT(dom != NULL, ("domain is NULL")); + + bus = PCI_RID2BUS(rid); + ctxp = ctx_tables[bus]; + pt_paddr = vtophys(dom->ptp); + idx = VTD_RID2IDX(rid); + + if (ctxp[idx] & VTD_CTX_PRESENT) { + panic("vtd_add_device: device %x is already owned by " + "domain %d", rid, + (uint16_t)(ctxp[idx + 1] >> 8)); + } + + if ((vtdmap = vtd_device_scope(rid)) == NULL) + panic("vtd_add_device: device %x is not in scope for " + "any DMA remapping unit", rid); + + /* + * Order is important. The 'present' bit is set only after all fields + * of the context pointer are initialized. + */ + ctxp[idx + 1] = dom->addrwidth | (dom->id << 8); + + if (VTD_ECAP_DI(vtdmap->ext_cap)) + ctxp[idx] = VTD_CTX_TT_ALL; + else + ctxp[idx] = 0; + + ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT; + + /* + * 'Not Present' entries are not cached in either the Context Cache + * or in the IOTLB, so there is no need to invalidate either of them. 
+ */ + return (0); +} + +static int +vtd_remove_device(void *arg, device_t dev __unused, uint16_t rid) +{ + int i, idx; + uint64_t *ctxp; + struct vtdmap *vtdmap; + uint8_t bus; + + bus = PCI_RID2BUS(rid); + ctxp = ctx_tables[bus]; + idx = VTD_RID2IDX(rid); + + /* + * Order is important. The 'present' bit is must be cleared first. + */ + ctxp[idx] = 0; + ctxp[idx + 1] = 0; + + /* + * Invalidate the Context Cache and the IOTLB. + * + * XXX use device-selective invalidation for Context Cache + * XXX use domain-selective invalidation for IOTLB + */ + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_ctx_global_invalidate(vtdmap); + vtd_iotlb_global_invalidate(vtdmap); + } + return (0); +} + +#define CREATE_MAPPING 0 +#define REMOVE_MAPPING 1 + +static uint64_t +vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len, + int remove) +{ + struct domain *dom; + int i, spshift, ptpshift, ptpindex, nlevels; + uint64_t spsize, *ptp; + + dom = arg; + ptpindex = 0; + ptpshift = 0; + + KASSERT(gpa + len > gpa, ("%s: invalid gpa range %#lx/%#lx", __func__, + gpa, len)); + KASSERT(gpa + len <= dom->maxaddr, ("%s: gpa range %#lx/%#lx beyond " + "domain maxaddr %#lx", __func__, gpa, len, dom->maxaddr)); + + if (gpa & PAGE_MASK) + panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa); + + if (hpa & PAGE_MASK) + panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa); + + if (len & PAGE_MASK) + panic("vtd_create_mapping: unaligned len 0x%0lx", len); + + /* + * Compute the size of the mapping that we can accommodate. + * + * This is based on three factors: + * - supported super page size + * - alignment of the region starting at 'gpa' and 'hpa' + * - length of the region 'len' + */ + spshift = 48; + for (i = 3; i >= 0; i--) { + spsize = 1UL << spshift; + if ((dom->spsmask & (1 << i)) != 0 && + (gpa & (spsize - 1)) == 0 && + (hpa & (spsize - 1)) == 0 && + (len >= spsize)) { + break; + } + spshift -= 9; + } + + ptp = dom->ptp; + nlevels = dom->pt_levels; + while (--nlevels >= 0) { + ptpshift = 12 + nlevels * 9; + ptpindex = (gpa >> ptpshift) & 0x1FF; + + /* We have reached the leaf mapping */ + if (spshift >= ptpshift) { + break; + } + + /* + * We are working on a non-leaf page table page. + * + * Create a downstream page table page if necessary and point + * to it from the current page table. + */ + if (ptp[ptpindex] == 0) { + void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO); + ptp[ptpindex] = vtophys(nlp)| VTD_PTE_RD | VTD_PTE_WR; + } + + ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M); + } + + if ((gpa & ((1UL << ptpshift) - 1)) != 0) + panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift); + + /* + * Update the 'gpa' -> 'hpa' mapping + */ + if (remove) { + ptp[ptpindex] = 0; + } else { + ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR; + + if (nlevels > 0) + ptp[ptpindex] |= VTD_PTE_SUPERPAGE; + } + + return (1UL << ptpshift); +} + +static int +vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len, + uint64_t *res_len) +{ + + *res_len = vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING); + return (0); +} + +static int +vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len, uint64_t *res_len) +{ + + *res_len = vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING); + return (0); +} + +static int +vtd_invalidate_tlb(void *dom) +{ + int i; + struct vtdmap *vtdmap; + + /* + * Invalidate the IOTLB. 
+ * XXX use domain-selective invalidation for IOTLB + */ + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_iotlb_global_invalidate(vtdmap); + } + return (0); +} + +static void * +vtd_create_domain(vm_paddr_t maxaddr) +{ + struct domain *dom; + vm_paddr_t addr; + int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth; + struct vtdmap *vtdmap; + + if (drhd_num <= 0) + panic("vtd_create_domain: no dma remapping hardware available"); + + /* + * Calculate AGAW. + * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec. + */ + addr = 0; + for (gaw = 0; addr < maxaddr; gaw++) + addr = 1ULL << gaw; + + res = (gaw - 12) % 9; + if (res == 0) + agaw = gaw; + else + agaw = gaw + 9 - res; + + if (agaw > 64) + agaw = 64; + + /* + * Select the smallest Supported AGAW and the corresponding number + * of page table levels. + */ + pt_levels = 2; + sagaw = 30; + addrwidth = 0; + + tmp = ~0; + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + /* take most compatible value */ + tmp &= VTD_CAP_SAGAW(vtdmap->cap); + } + + for (i = 0; i < 5; i++) { + if ((tmp & (1 << i)) != 0 && sagaw >= agaw) + break; + pt_levels++; + addrwidth++; + sagaw += 9; + if (sagaw > 64) + sagaw = 64; + } + + if (i >= 5) { + panic("vtd_create_domain: SAGAW 0x%x does not support AGAW %d", + tmp, agaw); + } + + dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK); + dom->pt_levels = pt_levels; + dom->addrwidth = addrwidth; + dom->id = domain_id(); + dom->maxaddr = maxaddr; + dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK); + if ((uintptr_t)dom->ptp & PAGE_MASK) + panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp); + +#ifdef notyet + /* + * XXX superpage mappings for the iommu do not work correctly. + * + * By default all physical memory is mapped into the host_domain. + * When a VM is allocated wired memory the pages belonging to it + * are removed from the host_domain and added to the vm's domain. + * + * If the page being removed was mapped using a superpage mapping + * in the host_domain then we need to demote the mapping before + * removing the page. + * + * There is not any code to deal with the demotion at the moment + * so we disable superpage mappings altogether. 
+ */ + dom->spsmask = ~0; + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + /* take most compatible value */ + dom->spsmask &= VTD_CAP_SPS(vtdmap->cap); + } +#endif + + SLIST_INSERT_HEAD(&domhead, dom, next); + + return (dom); +} + +static void +vtd_free_ptp(uint64_t *ptp, int level) +{ + int i; + uint64_t *nlp; + + if (level > 1) { + for (i = 0; i < 512; i++) { + if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0) + continue; + if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0) + continue; + nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M); + vtd_free_ptp(nlp, level - 1); + } + } + + bzero(ptp, PAGE_SIZE); + free(ptp, M_VTD); +} + +static void +vtd_destroy_domain(void *arg) +{ + struct domain *dom; + + dom = arg; + + SLIST_REMOVE(&domhead, dom, domain, next); + vtd_free_ptp(dom->ptp, dom->pt_levels); + free(dom, M_VTD); +} + +const struct iommu_ops iommu_ops_intel = { + .init = vtd_init, + .cleanup = vtd_cleanup, + .enable = vtd_enable, + .disable = vtd_disable, + .create_domain = vtd_create_domain, + .destroy_domain = vtd_destroy_domain, + .create_mapping = vtd_create_mapping, + .remove_mapping = vtd_remove_mapping, + .add_device = vtd_add_device, + .remove_device = vtd_remove_device, + .invalidate_tlb = vtd_invalidate_tlb, +}; diff --git a/sys/amd64/vmm/io/iommu.c b/sys/amd64/vmm/io/iommu.c new file mode 100644 index 000000000000..9fc612244699 --- /dev/null +++ b/sys/amd64/vmm/io/iommu.c @@ -0,0 +1,363 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/eventhandler.h> +#include <sys/sysctl.h> +#include <sys/systm.h> + +#include <dev/pci/pcivar.h> +#include <dev/pci/pcireg.h> + +#include <machine/cpu.h> +#include <machine/md_var.h> + +#include "vmm_util.h" +#include "vmm_mem.h" +#include "iommu.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, iommu, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "bhyve iommu parameters"); + +static int iommu_avail; +SYSCTL_INT(_hw_vmm_iommu, OID_AUTO, initialized, CTLFLAG_RD, &iommu_avail, + 0, "bhyve iommu initialized?"); + +static int iommu_enable = 1; +SYSCTL_INT(_hw_vmm_iommu, OID_AUTO, enable, CTLFLAG_RDTUN, &iommu_enable, 0, + "Enable use of I/O MMU (required for PCI passthrough)."); + +static const struct iommu_ops *ops; +static void *host_domain; +static eventhandler_tag add_tag, delete_tag; + +static void iommu_cleanup_int(bool iommu_disable); + +static __inline int +IOMMU_INIT(void) +{ + if (ops != NULL) + return ((*ops->init)()); + else + return (ENXIO); +} + +static __inline void +IOMMU_CLEANUP(void) +{ + if (ops != NULL && iommu_avail) + (*ops->cleanup)(); +} + +static __inline void * +IOMMU_CREATE_DOMAIN(vm_paddr_t maxaddr) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->create_domain)(maxaddr)); + else + return (NULL); +} + +static __inline void +IOMMU_DESTROY_DOMAIN(void *dom) +{ + + if (ops != NULL && iommu_avail) + (*ops->destroy_domain)(dom); +} + +static __inline int +IOMMU_CREATE_MAPPING(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, + uint64_t len, uint64_t *res_len) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->create_mapping)(domain, gpa, hpa, len, res_len)); + return (EOPNOTSUPP); +} + +static __inline uint64_t +IOMMU_REMOVE_MAPPING(void *domain, vm_paddr_t gpa, uint64_t len, + uint64_t *res_len) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->remove_mapping)(domain, gpa, len, res_len)); + return (EOPNOTSUPP); +} + +static __inline int +IOMMU_ADD_DEVICE(void *domain, device_t dev, uint16_t rid) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->add_device)(domain, dev, rid)); + return (EOPNOTSUPP); +} + +static __inline int +IOMMU_REMOVE_DEVICE(void *domain, device_t dev, uint16_t rid) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->remove_device)(domain, dev, rid)); + return (0); /* To allow ppt_attach() to succeed. */ +} + +static __inline int +IOMMU_INVALIDATE_TLB(void *domain) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->invalidate_tlb)(domain)); + return (0); +} + +static __inline void +IOMMU_ENABLE(void) +{ + + if (ops != NULL && iommu_avail) + (*ops->enable)(); +} + +static __inline void +IOMMU_DISABLE(void) +{ + + if (ops != NULL && iommu_avail) + (*ops->disable)(); +} + +static void +iommu_pci_add(void *arg, device_t dev) +{ + + /* Add new devices to the host domain. 
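This keeps DMA working for devices that stay with the host, since the host domain is mapped 1:1 from 0 to vmm_mem_maxaddr() in iommu_init().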
*/ + iommu_add_device(host_domain, dev, pci_get_rid(dev)); +} + +static void +iommu_pci_delete(void *arg, device_t dev) +{ + + iommu_remove_device(host_domain, dev, pci_get_rid(dev)); +} + +static void +iommu_init(void) +{ + int error, bus, slot, func; + vm_paddr_t maxaddr; + devclass_t dc; + device_t dev; + + if (!iommu_enable) + return; + + if (vmm_is_intel()) + ops = &iommu_ops_intel; + else if (vmm_is_svm()) + ops = &iommu_ops_amd; + else + ops = NULL; + + error = IOMMU_INIT(); + if (error) + return; + + iommu_avail = 1; + + /* + * Create a domain for the devices owned by the host + */ + maxaddr = vmm_mem_maxaddr(); + host_domain = IOMMU_CREATE_DOMAIN(maxaddr); + if (host_domain == NULL) { + printf("iommu_init: unable to create a host domain"); + IOMMU_CLEANUP(); + ops = NULL; + iommu_avail = 0; + return; + } + + /* + * Create 1:1 mappings from '0' to 'maxaddr' for devices assigned to + * the host + */ + iommu_create_mapping(host_domain, 0, 0, maxaddr); + + add_tag = EVENTHANDLER_REGISTER(pci_add_device, iommu_pci_add, NULL, 0); + delete_tag = EVENTHANDLER_REGISTER(pci_delete_device, iommu_pci_delete, + NULL, 0); + dc = devclass_find("ppt"); + for (bus = 0; bus <= PCI_BUSMAX; bus++) { + for (slot = 0; slot <= PCI_SLOTMAX; slot++) { + for (func = 0; func <= PCI_FUNCMAX; func++) { + dev = pci_find_dbsf(0, bus, slot, func); + if (dev == NULL) + continue; + + /* Skip passthrough devices. */ + if (dc != NULL && + device_get_devclass(dev) == dc) + continue; + + /* + * Everything else belongs to the host + * domain. + */ + error = iommu_add_device(host_domain, dev, + pci_get_rid(dev)); + if (error != 0 && error != ENXIO) { + printf( + "iommu_add_device(%s rid %#x) failed, error %d\n", + device_get_name(dev), + pci_get_rid(dev), error); + iommu_cleanup_int(false); + return; + } + } + } + } + IOMMU_ENABLE(); +} + +static void +iommu_cleanup_int(bool iommu_disable) +{ + + if (add_tag != NULL) { + EVENTHANDLER_DEREGISTER(pci_add_device, add_tag); + add_tag = NULL; + } + if (delete_tag != NULL) { + EVENTHANDLER_DEREGISTER(pci_delete_device, delete_tag); + delete_tag = NULL; + } + if (iommu_disable) + IOMMU_DISABLE(); + IOMMU_DESTROY_DOMAIN(host_domain); + host_domain = NULL; + IOMMU_CLEANUP(); +} + +void +iommu_cleanup(void) +{ + iommu_cleanup_int(true); +} + +void * +iommu_create_domain(vm_paddr_t maxaddr) +{ + static volatile int iommu_initted; + + if (iommu_initted < 2) { + if (atomic_cmpset_int(&iommu_initted, 0, 1)) { + iommu_init(); + atomic_store_rel_int(&iommu_initted, 2); + } else + while (iommu_initted == 1) + cpu_spinwait(); + } + return (IOMMU_CREATE_DOMAIN(maxaddr)); +} + +void +iommu_destroy_domain(void *dom) +{ + + IOMMU_DESTROY_DOMAIN(dom); +} + +int +iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len) +{ + uint64_t mapped, remaining; + int error; + + for (remaining = len; remaining > 0; gpa += mapped, hpa += mapped, + remaining -= mapped) { + error = IOMMU_CREATE_MAPPING(dom, gpa, hpa, remaining, + &mapped); + if (error != 0) { + /* XXXKIB rollback */ + return (error); + } + } + return (0); +} + +int +iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len) +{ + uint64_t unmapped, remaining; + int error; + + for (remaining = len; remaining > 0; gpa += unmapped, + remaining -= unmapped) { + error = IOMMU_REMOVE_MAPPING(dom, gpa, remaining, &unmapped); + if (error != 0) { + /* XXXKIB ? 
*/ + return (error); + } + } + return (0); +} + +void * +iommu_host_domain(void) +{ + + return (host_domain); +} + +int +iommu_add_device(void *dom, device_t dev, uint16_t rid) +{ + + return (IOMMU_ADD_DEVICE(dom, dev, rid)); +} + +int +iommu_remove_device(void *dom, device_t dev, uint16_t rid) +{ + + return (IOMMU_REMOVE_DEVICE(dom, dev, rid)); +} + +int +iommu_invalidate_tlb(void *domain) +{ + + return (IOMMU_INVALIDATE_TLB(domain)); +} diff --git a/sys/amd64/vmm/io/iommu.h b/sys/amd64/vmm/io/iommu.h new file mode 100644 index 000000000000..5294a9d92a6b --- /dev/null +++ b/sys/amd64/vmm/io/iommu.h @@ -0,0 +1,74 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _IO_IOMMU_H_ +#define _IO_IOMMU_H_ + +typedef int (*iommu_init_func_t)(void); +typedef void (*iommu_cleanup_func_t)(void); +typedef void (*iommu_enable_func_t)(void); +typedef void (*iommu_disable_func_t)(void); +typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr); +typedef void (*iommu_destroy_domain_t)(void *domain); +typedef int (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa, + vm_paddr_t hpa, uint64_t len, uint64_t *res_len); +typedef int (*iommu_remove_mapping_t)(void *domain, vm_paddr_t gpa, + uint64_t len, uint64_t *res_len); +typedef int (*iommu_add_device_t)(void *domain, device_t dev, uint16_t rid); +typedef int (*iommu_remove_device_t)(void *dom, device_t dev, uint16_t rid); +typedef int (*iommu_invalidate_tlb_t)(void *dom); + +struct iommu_ops { + iommu_init_func_t init; /* module wide */ + iommu_cleanup_func_t cleanup; + iommu_enable_func_t enable; + iommu_disable_func_t disable; + + iommu_create_domain_t create_domain; /* domain-specific */ + iommu_destroy_domain_t destroy_domain; + iommu_create_mapping_t create_mapping; + iommu_remove_mapping_t remove_mapping; + iommu_add_device_t add_device; + iommu_remove_device_t remove_device; + iommu_invalidate_tlb_t invalidate_tlb; +}; + +extern const struct iommu_ops iommu_ops_intel; +extern const struct iommu_ops iommu_ops_amd; + +void iommu_cleanup(void); +void *iommu_host_domain(void); +void *iommu_create_domain(vm_paddr_t maxaddr); +void iommu_destroy_domain(void *dom); +int iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, + size_t len); +int iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len); +int iommu_add_device(void *dom, device_t dev, uint16_t rid); +int iommu_remove_device(void *dom, device_t dev, uint16_t rid); +int iommu_invalidate_tlb(void *domain); +#endif diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c new file mode 100644 index 000000000000..2cb459fb848f --- /dev/null +++ b/sys/amd64/vmm/io/ppt.c @@ -0,0 +1,804 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/bus.h> +#include <sys/pciio.h> +#include <sys/rman.h> +#include <sys/smp.h> +#include <sys/sysctl.h> + +#include <dev/pci/pcivar.h> +#include <dev/pci/pcireg.h> + +#include <machine/resource.h> +#include <machine/vmm.h> +#include <machine/vmm_dev.h> + +#include <dev/vmm/vmm_ktr.h> + +#include "vmm_lapic.h" + +#include "iommu.h" +#include "ppt.h" + +/* XXX locking */ + +#define MAX_MSIMSGS 32 + +/* + * If the MSI-X table is located in the middle of a BAR then that MMIO + * region gets split into two segments - one segment above the MSI-X table + * and the other segment below the MSI-X table - with a hole in place of + * the MSI-X table so accesses to it can be trapped and emulated. + * + * So, allocate a MMIO segment for each BAR register + 1 additional segment. + */ +#define MAX_MMIOSEGS ((PCIR_MAX_BAR_0 + 1) + 1) + +MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources"); + +struct pptintr_arg { /* pptintr(pptintr_arg) */ + struct pptdev *pptdev; + uint64_t addr; + uint64_t msg_data; +}; + +struct pptseg { + vm_paddr_t gpa; + size_t len; + int wired; +}; + +struct pptdev { + device_t dev; + struct vm *vm; /* owner of this device */ + TAILQ_ENTRY(pptdev) next; + struct pptseg mmio[MAX_MMIOSEGS]; + struct { + int num_msgs; /* guest state */ + + int startrid; /* host state */ + struct resource *res[MAX_MSIMSGS]; + void *cookie[MAX_MSIMSGS]; + struct pptintr_arg arg[MAX_MSIMSGS]; + } msi; + + struct { + int num_msgs; + int startrid; + int msix_table_rid; + int msix_pba_rid; + struct resource *msix_table_res; + struct resource *msix_pba_res; + struct resource **res; + void **cookie; + struct pptintr_arg *arg; + } msix; +}; + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, ppt, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "bhyve passthru devices"); + +static int num_pptdevs; +SYSCTL_INT(_hw_vmm_ppt, OID_AUTO, devices, CTLFLAG_RD, &num_pptdevs, 0, + "number of pci passthru devices"); + +static TAILQ_HEAD(, pptdev) pptdev_list = TAILQ_HEAD_INITIALIZER(pptdev_list); + +static int +ppt_probe(device_t dev) +{ + int bus, slot, func; + struct pci_devinfo *dinfo; + + dinfo = (struct pci_devinfo *)device_get_ivars(dev); + + bus = pci_get_bus(dev); + slot = pci_get_slot(dev); + func = pci_get_function(dev); + + /* + * To qualify as a pci passthrough device a device must: + * - be allowed by administrator to be used in this role + * - be an endpoint device + */ + if ((dinfo->cfg.hdrtype & PCIM_HDRTYPE) != PCIM_HDRTYPE_NORMAL) + return (ENXIO); + else if (vmm_is_pptdev(bus, slot, func)) + return (0); + else + /* + * Returning BUS_PROBE_NOWILDCARD here matches devices that the + * SR-IOV infrastructure specified as "ppt" passthrough devices. + * All normal devices that did not have "ppt" specified as their + * driver will not be matched by this. 
+ */ + return (BUS_PROBE_NOWILDCARD); +} + +static int +ppt_attach(device_t dev) +{ + struct pptdev *ppt; + uint16_t cmd, cmd1; + int error; + + ppt = device_get_softc(dev); + + cmd1 = cmd = pci_read_config(dev, PCIR_COMMAND, 2); + cmd &= ~(PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN); + pci_write_config(dev, PCIR_COMMAND, cmd, 2); + error = iommu_remove_device(iommu_host_domain(), dev, pci_get_rid(dev)); + if (error != 0) { + pci_write_config(dev, PCIR_COMMAND, cmd1, 2); + return (error); + } + num_pptdevs++; + TAILQ_INSERT_TAIL(&pptdev_list, ppt, next); + ppt->dev = dev; + + if (bootverbose) + device_printf(dev, "attached\n"); + + return (0); +} + +static int +ppt_detach(device_t dev) +{ + struct pptdev *ppt; + int error; + + ppt = device_get_softc(dev); + + if (ppt->vm != NULL) + return (EBUSY); + if (iommu_host_domain() != NULL) { + error = iommu_add_device(iommu_host_domain(), dev, + pci_get_rid(dev)); + } else { + error = 0; + } + if (error != 0) + return (error); + num_pptdevs--; + TAILQ_REMOVE(&pptdev_list, ppt, next); + + return (0); +} + +static device_method_t ppt_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, ppt_probe), + DEVMETHOD(device_attach, ppt_attach), + DEVMETHOD(device_detach, ppt_detach), + {0, 0} +}; + +DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, sizeof(struct pptdev)); +DRIVER_MODULE(ppt, pci, ppt_driver, NULL, NULL); + +static int +ppt_find(struct vm *vm, int bus, int slot, int func, struct pptdev **pptp) +{ + device_t dev; + struct pptdev *ppt; + int b, s, f; + + TAILQ_FOREACH(ppt, &pptdev_list, next) { + dev = ppt->dev; + b = pci_get_bus(dev); + s = pci_get_slot(dev); + f = pci_get_function(dev); + if (bus == b && slot == s && func == f) + break; + } + + if (ppt == NULL) + return (ENOENT); + if (ppt->vm != vm) /* Make sure we own this device */ + return (EBUSY); + *pptp = ppt; + return (0); +} + +static void +ppt_unmap_all_mmio(struct vm *vm, struct pptdev *ppt) +{ + int i; + struct pptseg *seg; + + for (i = 0; i < MAX_MMIOSEGS; i++) { + seg = &ppt->mmio[i]; + if (seg->len == 0) + continue; + (void)vm_unmap_mmio(vm, seg->gpa, seg->len); + bzero(seg, sizeof(struct pptseg)); + } +} + +static void +ppt_teardown_msi(struct pptdev *ppt) +{ + int i, rid; + void *cookie; + struct resource *res; + + if (ppt->msi.num_msgs == 0) + return; + + for (i = 0; i < ppt->msi.num_msgs; i++) { + rid = ppt->msi.startrid + i; + res = ppt->msi.res[i]; + cookie = ppt->msi.cookie[i]; + + if (cookie != NULL) + bus_teardown_intr(ppt->dev, res, cookie); + + if (res != NULL) + bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res); + + ppt->msi.res[i] = NULL; + ppt->msi.cookie[i] = NULL; + } + + if (ppt->msi.startrid == 1) + pci_release_msi(ppt->dev); + + ppt->msi.num_msgs = 0; +} + +static void +ppt_teardown_msix_intr(struct pptdev *ppt, int idx) +{ + int rid; + struct resource *res; + void *cookie; + + rid = ppt->msix.startrid + idx; + res = ppt->msix.res[idx]; + cookie = ppt->msix.cookie[idx]; + + if (cookie != NULL) + bus_teardown_intr(ppt->dev, res, cookie); + + if (res != NULL) + bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res); + + ppt->msix.res[idx] = NULL; + ppt->msix.cookie[idx] = NULL; +} + +static void +ppt_teardown_msix(struct pptdev *ppt) +{ + int i; + + if (ppt->msix.num_msgs == 0) + return; + + for (i = 0; i < ppt->msix.num_msgs; i++) + ppt_teardown_msix_intr(ppt, i); + + free(ppt->msix.res, M_PPTMSIX); + free(ppt->msix.cookie, M_PPTMSIX); + free(ppt->msix.arg, M_PPTMSIX); + + pci_release_msi(ppt->dev); + + if (ppt->msix.msix_table_res) { + 
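/* Release the BAR segment that maps the MSI-X table. */ +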
bus_release_resource(ppt->dev, SYS_RES_MEMORY, + ppt->msix.msix_table_rid, + ppt->msix.msix_table_res); + ppt->msix.msix_table_res = NULL; + ppt->msix.msix_table_rid = 0; + } + if (ppt->msix.msix_pba_res) { + bus_release_resource(ppt->dev, SYS_RES_MEMORY, + ppt->msix.msix_pba_rid, + ppt->msix.msix_pba_res); + ppt->msix.msix_pba_res = NULL; + ppt->msix.msix_pba_rid = 0; + } + + ppt->msix.num_msgs = 0; +} + +int +ppt_avail_devices(void) +{ + + return (num_pptdevs); +} + +int +ppt_assigned_devices(struct vm *vm) +{ + struct pptdev *ppt; + int num; + + num = 0; + TAILQ_FOREACH(ppt, &pptdev_list, next) { + if (ppt->vm == vm) + num++; + } + return (num); +} + +bool +ppt_is_mmio(struct vm *vm, vm_paddr_t gpa) +{ + int i; + struct pptdev *ppt; + struct pptseg *seg; + + TAILQ_FOREACH(ppt, &pptdev_list, next) { + if (ppt->vm != vm) + continue; + + for (i = 0; i < MAX_MMIOSEGS; i++) { + seg = &ppt->mmio[i]; + if (seg->len == 0) + continue; + if (gpa >= seg->gpa && gpa < seg->gpa + seg->len) + return (true); + } + } + + return (false); +} + +static void +ppt_pci_reset(device_t dev) +{ + + if (pcie_flr(dev, + max(pcie_get_max_completion_timeout(dev) / 1000, 10), true)) + return; + + pci_power_reset(dev); +} + +static uint16_t +ppt_bar_enables(struct pptdev *ppt) +{ + struct pci_map *pm; + uint16_t cmd; + + cmd = 0; + for (pm = pci_first_bar(ppt->dev); pm != NULL; pm = pci_next_bar(pm)) { + if (PCI_BAR_IO(pm->pm_value)) + cmd |= PCIM_CMD_PORTEN; + if (PCI_BAR_MEM(pm->pm_value)) + cmd |= PCIM_CMD_MEMEN; + } + return (cmd); +} + +int +ppt_assign_device(struct vm *vm, int bus, int slot, int func) +{ + struct pptdev *ppt; + int error; + uint16_t cmd; + + /* Passing NULL requires the device to be unowned. */ + error = ppt_find(NULL, bus, slot, func, &ppt); + if (error) + return (error); + + pci_save_state(ppt->dev); + ppt_pci_reset(ppt->dev); + pci_restore_state(ppt->dev); + error = iommu_add_device(vm_iommu_domain(vm), ppt->dev, + pci_get_rid(ppt->dev)); + if (error != 0) + return (error); + ppt->vm = vm; + cmd = pci_read_config(ppt->dev, PCIR_COMMAND, 2); + cmd |= PCIM_CMD_BUSMASTEREN | ppt_bar_enables(ppt); + pci_write_config(ppt->dev, PCIR_COMMAND, cmd, 2); + return (0); +} + +int +ppt_unassign_device(struct vm *vm, int bus, int slot, int func) +{ + struct pptdev *ppt; + int error; + uint16_t cmd; + + error = ppt_find(vm, bus, slot, func, &ppt); + if (error) + return (error); + + cmd = pci_read_config(ppt->dev, PCIR_COMMAND, 2); + cmd &= ~(PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN); + pci_write_config(ppt->dev, PCIR_COMMAND, cmd, 2); + pci_save_state(ppt->dev); + ppt_pci_reset(ppt->dev); + pci_restore_state(ppt->dev); + ppt_unmap_all_mmio(vm, ppt); + ppt_teardown_msi(ppt); + ppt_teardown_msix(ppt); + error = iommu_remove_device(vm_iommu_domain(vm), ppt->dev, + pci_get_rid(ppt->dev)); + ppt->vm = NULL; + return (error); +} + +int +ppt_unassign_all(struct vm *vm) +{ + struct pptdev *ppt; + int bus, slot, func; + device_t dev; + + TAILQ_FOREACH(ppt, &pptdev_list, next) { + if (ppt->vm == vm) { + dev = ppt->dev; + bus = pci_get_bus(dev); + slot = pci_get_slot(dev); + func = pci_get_function(dev); + vm_unassign_pptdev(vm, bus, slot, func); + } + } + + return (0); +} + +static bool +ppt_valid_bar_mapping(struct pptdev *ppt, vm_paddr_t hpa, size_t len) +{ + struct pci_map *pm; + pci_addr_t base, size; + + for (pm = pci_first_bar(ppt->dev); pm != NULL; pm = pci_next_bar(pm)) { + if (!PCI_BAR_MEM(pm->pm_value)) + continue; + base = pm->pm_value & PCIM_BAR_MEM_BASE; + size = (pci_addr_t)1 << 
pm->pm_size; + if (hpa >= base && hpa + len <= base + size) + return (true); + } + return (false); +} + +int +ppt_map_mmio(struct vm *vm, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa) +{ + int i, error; + struct pptseg *seg; + struct pptdev *ppt; + + if (len % PAGE_SIZE != 0 || len == 0 || gpa % PAGE_SIZE != 0 || + hpa % PAGE_SIZE != 0 || gpa + len < gpa || hpa + len < hpa) + return (EINVAL); + + error = ppt_find(vm, bus, slot, func, &ppt); + if (error) + return (error); + + if (!ppt_valid_bar_mapping(ppt, hpa, len)) + return (EINVAL); + + for (i = 0; i < MAX_MMIOSEGS; i++) { + seg = &ppt->mmio[i]; + if (seg->len == 0) { + error = vm_map_mmio(vm, gpa, len, hpa); + if (error == 0) { + seg->gpa = gpa; + seg->len = len; + } + return (error); + } + } + return (ENOSPC); +} + +int +ppt_unmap_mmio(struct vm *vm, int bus, int slot, int func, + vm_paddr_t gpa, size_t len) +{ + int i, error; + struct pptseg *seg; + struct pptdev *ppt; + + error = ppt_find(vm, bus, slot, func, &ppt); + if (error) + return (error); + + for (i = 0; i < MAX_MMIOSEGS; i++) { + seg = &ppt->mmio[i]; + if (seg->gpa == gpa && seg->len == len) { + error = vm_unmap_mmio(vm, seg->gpa, seg->len); + if (error == 0) { + seg->gpa = 0; + seg->len = 0; + } + return (error); + } + } + return (ENOENT); +} + +static int +pptintr(void *arg) +{ + struct pptdev *ppt; + struct pptintr_arg *pptarg; + + pptarg = arg; + ppt = pptarg->pptdev; + + if (ppt->vm != NULL) + lapic_intr_msi(ppt->vm, pptarg->addr, pptarg->msg_data); + else { + /* + * XXX + * This is not expected to happen - panic? + */ + } + + /* + * For legacy interrupts give other filters a chance in case + * the interrupt was not generated by the passthrough device. + */ + if (ppt->msi.startrid == 0) + return (FILTER_STRAY); + else + return (FILTER_HANDLED); +} + +int +ppt_setup_msi(struct vm *vm, int bus, int slot, int func, + uint64_t addr, uint64_t msg, int numvec) +{ + int i, rid, flags; + int msi_count, startrid, error, tmp; + struct pptdev *ppt; + + if (numvec < 0 || numvec > MAX_MSIMSGS) + return (EINVAL); + + error = ppt_find(vm, bus, slot, func, &ppt); + if (error) + return (error); + + /* Reject attempts to enable MSI while MSI-X is active. */ + if (ppt->msix.num_msgs != 0 && numvec != 0) + return (EBUSY); + + /* Free any allocated resources */ + ppt_teardown_msi(ppt); + + if (numvec == 0) /* nothing more to do */ + return (0); + + flags = RF_ACTIVE; + msi_count = pci_msi_count(ppt->dev); + if (msi_count == 0) { + startrid = 0; /* legacy interrupt */ + msi_count = 1; + flags |= RF_SHAREABLE; + } else + startrid = 1; /* MSI */ + + /* + * The device must be capable of supporting the number of vectors + * the guest wants to allocate. + */ + if (numvec > msi_count) + return (EINVAL); + + /* + * Make sure that we can allocate all the MSI vectors that are needed + * by the guest. + */ + if (startrid == 1) { + tmp = numvec; + error = pci_alloc_msi(ppt->dev, &tmp); + if (error) + return (error); + else if (tmp != numvec) { + pci_release_msi(ppt->dev); + return (ENOSPC); + } else { + /* success */ + } + } + + ppt->msi.startrid = startrid; + + /* + * Allocate the irq resource and attach it to the interrupt handler. 
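+ * Vector 'i' uses interrupt rid 'startrid + i' and MSI data 'msg + i', + * matching the consecutive data values used by multi-message MSI.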
+ */ + for (i = 0; i < numvec; i++) { + ppt->msi.num_msgs = i + 1; + ppt->msi.cookie[i] = NULL; + + rid = startrid + i; + ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ, + &rid, flags); + if (ppt->msi.res[i] == NULL) + break; + + ppt->msi.arg[i].pptdev = ppt; + ppt->msi.arg[i].addr = addr; + ppt->msi.arg[i].msg_data = msg + i; + + error = bus_setup_intr(ppt->dev, ppt->msi.res[i], + INTR_TYPE_NET | INTR_MPSAFE, + pptintr, NULL, &ppt->msi.arg[i], + &ppt->msi.cookie[i]); + if (error != 0) + break; + } + + if (i < numvec) { + ppt_teardown_msi(ppt); + return (ENXIO); + } + + return (0); +} + +int +ppt_setup_msix(struct vm *vm, int bus, int slot, int func, + int idx, uint64_t addr, uint64_t msg, uint32_t vector_control) +{ + struct pptdev *ppt; + struct pci_devinfo *dinfo; + int numvec, alloced, rid, error; + size_t res_size, cookie_size, arg_size; + + error = ppt_find(vm, bus, slot, func, &ppt); + if (error) + return (error); + + /* Reject attempts to enable MSI-X while MSI is active. */ + if (ppt->msi.num_msgs != 0) + return (EBUSY); + + dinfo = device_get_ivars(ppt->dev); + if (!dinfo) + return (ENXIO); + + /* + * First-time configuration: + * Allocate the MSI-X table + * Allocate the IRQ resources + * Set up some variables in ppt->msix + */ + if (ppt->msix.num_msgs == 0) { + numvec = pci_msix_count(ppt->dev); + if (numvec <= 0) + return (EINVAL); + + ppt->msix.startrid = 1; + ppt->msix.num_msgs = numvec; + + res_size = numvec * sizeof(ppt->msix.res[0]); + cookie_size = numvec * sizeof(ppt->msix.cookie[0]); + arg_size = numvec * sizeof(ppt->msix.arg[0]); + + ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK | M_ZERO); + ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX, + M_WAITOK | M_ZERO); + ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK | M_ZERO); + + rid = dinfo->cfg.msix.msix_table_bar; + ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev, + SYS_RES_MEMORY, &rid, RF_ACTIVE); + + if (ppt->msix.msix_table_res == NULL) { + ppt_teardown_msix(ppt); + return (ENOSPC); + } + ppt->msix.msix_table_rid = rid; + + if (dinfo->cfg.msix.msix_table_bar != + dinfo->cfg.msix.msix_pba_bar) { + rid = dinfo->cfg.msix.msix_pba_bar; + ppt->msix.msix_pba_res = bus_alloc_resource_any( + ppt->dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); + + if (ppt->msix.msix_pba_res == NULL) { + ppt_teardown_msix(ppt); + return (ENOSPC); + } + ppt->msix.msix_pba_rid = rid; + } + + alloced = numvec; + error = pci_alloc_msix(ppt->dev, &alloced); + if (error || alloced != numvec) { + ppt_teardown_msix(ppt); + return (error == 0 ? 
ENOSPC: error); + } + } + + if (idx >= ppt->msix.num_msgs) + return (EINVAL); + + if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { + /* Tear down the IRQ if it's already set up */ + ppt_teardown_msix_intr(ppt, idx); + + /* Allocate the IRQ resource */ + ppt->msix.cookie[idx] = NULL; + rid = ppt->msix.startrid + idx; + ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ, + &rid, RF_ACTIVE); + if (ppt->msix.res[idx] == NULL) + return (ENXIO); + + ppt->msix.arg[idx].pptdev = ppt; + ppt->msix.arg[idx].addr = addr; + ppt->msix.arg[idx].msg_data = msg; + + /* Setup the MSI-X interrupt */ + error = bus_setup_intr(ppt->dev, ppt->msix.res[idx], + INTR_TYPE_NET | INTR_MPSAFE, + pptintr, NULL, &ppt->msix.arg[idx], + &ppt->msix.cookie[idx]); + + if (error != 0) { + bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]); + ppt->msix.cookie[idx] = NULL; + ppt->msix.res[idx] = NULL; + return (ENXIO); + } + } else { + /* Masked, tear it down if it's already been set up */ + ppt_teardown_msix_intr(ppt, idx); + } + + return (0); +} + +int +ppt_disable_msix(struct vm *vm, int bus, int slot, int func) +{ + struct pptdev *ppt; + int error; + + error = ppt_find(vm, bus, slot, func, &ppt); + if (error) + return (error); + + ppt_teardown_msix(ppt); + return (0); +} diff --git a/sys/amd64/vmm/io/ppt.h b/sys/amd64/vmm/io/ppt.h new file mode 100644 index 000000000000..f97c399564d7 --- /dev/null +++ b/sys/amd64/vmm/io/ppt.h @@ -0,0 +1,57 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _IO_PPT_H_ +#define _IO_PPT_H_ + +int ppt_unassign_all(struct vm *vm); +int ppt_map_mmio(struct vm *vm, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +int ppt_unmap_mmio(struct vm *vm, int bus, int slot, int func, + vm_paddr_t gpa, size_t len); +int ppt_setup_msi(struct vm *vm, int bus, int slot, int func, + uint64_t addr, uint64_t msg, int numvec); +int ppt_setup_msix(struct vm *vm, int bus, int slot, int func, + int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); +int ppt_disable_msix(struct vm *vm, int bus, int slot, int func); +int ppt_assigned_devices(struct vm *vm); +bool ppt_is_mmio(struct vm *vm, vm_paddr_t gpa); + +/* + * Returns the number of devices sequestered by the ppt driver for assignment + * to virtual machines. + */ +int ppt_avail_devices(void); + +/* + * The following functions should never be called directly. + * Use 'vm_assign_pptdev()' and 'vm_unassign_pptdev()' instead. + */ +int ppt_assign_device(struct vm *vm, int bus, int slot, int func); +int ppt_unassign_device(struct vm *vm, int bus, int slot, int func); +#endif diff --git a/sys/amd64/vmm/io/vatpic.c b/sys/amd64/vmm/io/vatpic.c new file mode 100644 index 000000000000..a003cd7e8c07 --- /dev/null +++ b/sys/amd64/vmm/io/vatpic.c @@ -0,0 +1,851 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +#include "opt_bhyve_snapshot.h" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/queue.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/systm.h> + +#include <x86/apicreg.h> +#include <machine/vmm.h> +#include <machine/vmm_snapshot.h> + +#include <dev/ic/i8259.h> +#include <dev/vmm/vmm_ktr.h> + +#include "vmm_lapic.h" +#include "vioapic.h" +#include "vatpic.h" + +static MALLOC_DEFINE(M_VATPIC, "atpic", "bhyve virtual atpic (8259)"); + +#define VATPIC_LOCK(vatpic) mtx_lock_spin(&((vatpic)->mtx)) +#define VATPIC_UNLOCK(vatpic) mtx_unlock_spin(&((vatpic)->mtx)) +#define VATPIC_LOCKED(vatpic) mtx_owned(&((vatpic)->mtx)) + +enum irqstate { + IRQSTATE_ASSERT, + IRQSTATE_DEASSERT, + IRQSTATE_PULSE +}; + +struct atpic { + bool ready; + int icw_num; + int rd_cmd_reg; + + bool aeoi; + bool poll; + bool rotate; + bool sfn; /* special fully-nested mode */ + + int irq_base; + uint8_t request; /* Interrupt Request Register (IIR) */ + uint8_t service; /* Interrupt Service (ISR) */ + uint8_t mask; /* Interrupt Mask Register (IMR) */ + uint8_t smm; /* special mask mode */ + + int acnt[8]; /* sum of pin asserts and deasserts */ + int lowprio; /* lowest priority irq */ + + bool intr_raised; +}; + +struct vatpic { + struct vm *vm; + struct mtx mtx; + struct atpic atpic[2]; + uint8_t elc[2]; +}; + +#define VATPIC_CTR0(vatpic, fmt) \ + VM_CTR0((vatpic)->vm, fmt) + +#define VATPIC_CTR1(vatpic, fmt, a1) \ + VM_CTR1((vatpic)->vm, fmt, a1) + +#define VATPIC_CTR2(vatpic, fmt, a1, a2) \ + VM_CTR2((vatpic)->vm, fmt, a1, a2) + +#define VATPIC_CTR3(vatpic, fmt, a1, a2, a3) \ + VM_CTR3((vatpic)->vm, fmt, a1, a2, a3) + +#define VATPIC_CTR4(vatpic, fmt, a1, a2, a3, a4) \ + VM_CTR4((vatpic)->vm, fmt, a1, a2, a3, a4) + +/* + * Loop over all the pins in priority order from highest to lowest. + */ +#define ATPIC_PIN_FOREACH(pinvar, atpic, tmpvar) \ + for (tmpvar = 0, pinvar = (atpic->lowprio + 1) & 0x7; \ + tmpvar < 8; \ + tmpvar++, pinvar = (pinvar + 1) & 0x7) + +static void vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate); + +static __inline bool +master_atpic(struct vatpic *vatpic, struct atpic *atpic) +{ + + if (atpic == &vatpic->atpic[0]) + return (true); + else + return (false); +} + +static __inline int +vatpic_get_highest_isrpin(struct atpic *atpic) +{ + int bit, pin; + int i; + + ATPIC_PIN_FOREACH(pin, atpic, i) { + bit = (1 << pin); + + if (atpic->service & bit) { + /* + * An IS bit that is masked by an IMR bit will not be + * cleared by a non-specific EOI in Special Mask Mode. + */ + if (atpic->smm && (atpic->mask & bit) != 0) + continue; + else + return (pin); + } + } + + return (-1); +} + +static __inline int +vatpic_get_highest_irrpin(struct atpic *atpic) +{ + int serviced; + int bit, pin, tmp; + + /* + * In 'Special Fully-Nested Mode' when an interrupt request from + * a slave is in service, the slave is not locked out from the + * master's priority logic. + */ + serviced = atpic->service; + if (atpic->sfn) + serviced &= ~(1 << 2); + + /* + * In 'Special Mask Mode', when a mask bit is set in OCW1 it inhibits + * further interrupts at that level and enables interrupts from all + * other levels that are not masked. In other words the ISR has no + * bearing on the levels that can generate interrupts. 
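+ * The code below models this by ignoring the in-service bits entirely + * while special mask mode is in effect.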
+ */ + if (atpic->smm) + serviced = 0; + + ATPIC_PIN_FOREACH(pin, atpic, tmp) { + bit = 1 << pin; + + /* + * If there is already an interrupt in service at the same + * or higher priority then bail. + */ + if ((serviced & bit) != 0) + break; + + /* + * If an interrupt is asserted and not masked then return + * the corresponding 'pin' to the caller. + */ + if ((atpic->request & bit) != 0 && (atpic->mask & bit) == 0) + return (pin); + } + + return (-1); +} + +static void +vatpic_notify_intr(struct vatpic *vatpic) +{ + struct atpic *atpic; + int pin; + + KASSERT(VATPIC_LOCKED(vatpic), ("vatpic_notify_intr not locked")); + + /* + * First check the slave. + */ + atpic = &vatpic->atpic[1]; + if (!atpic->intr_raised && + (pin = vatpic_get_highest_irrpin(atpic)) != -1) { + VATPIC_CTR4(vatpic, "atpic slave notify pin = %d " + "(imr 0x%x irr 0x%x isr 0x%x)", pin, + atpic->mask, atpic->request, atpic->service); + + /* + * Cascade the request from the slave to the master. + */ + atpic->intr_raised = true; + vatpic_set_pinstate(vatpic, 2, true); + vatpic_set_pinstate(vatpic, 2, false); + } else { + VATPIC_CTR3(vatpic, "atpic slave no eligible interrupts " + "(imr 0x%x irr 0x%x isr 0x%x)", + atpic->mask, atpic->request, atpic->service); + } + + /* + * Then check the master. + */ + atpic = &vatpic->atpic[0]; + if (!atpic->intr_raised && + (pin = vatpic_get_highest_irrpin(atpic)) != -1) { + VATPIC_CTR4(vatpic, "atpic master notify pin = %d " + "(imr 0x%x irr 0x%x isr 0x%x)", pin, + atpic->mask, atpic->request, atpic->service); + + /* + * From Section 3.6.2, "Interrupt Modes", in the + * MPtable Specification, Version 1.4 + * + * PIC interrupts are routed to both the Local APIC + * and the I/O APIC to support operation in 1 of 3 + * modes. + * + * 1. Legacy PIC Mode: the PIC effectively bypasses + * all APIC components. In this mode the local APIC is + * disabled and LINT0 is reconfigured as INTR to + * deliver the PIC interrupt directly to the CPU. + * + * 2. Virtual Wire Mode: the APIC is treated as a + * virtual wire which delivers interrupts from the PIC + * to the CPU. In this mode LINT0 is programmed as + * ExtINT to indicate that the PIC is the source of + * the interrupt. + * + * 3. Virtual Wire Mode via I/O APIC: PIC interrupts are + * fielded by the I/O APIC and delivered to the appropriate + * CPU. In this mode the I/O APIC input 0 is programmed + * as ExtINT to indicate that the PIC is the source of the + * interrupt. 
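+ * + * The notification below therefore goes out on both paths: LINT0 of the + * local APIC and pin 0 of the I/O APIC, so the guest sees the interrupt + * in whichever of the modes above it has configured.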
+ */ + atpic->intr_raised = true; + lapic_set_local_intr(vatpic->vm, NULL, APIC_LVT_LINT0); + vioapic_pulse_irq(vatpic->vm, 0); + } else { + VATPIC_CTR3(vatpic, "atpic master no eligible interrupts " + "(imr 0x%x irr 0x%x isr 0x%x)", + atpic->mask, atpic->request, atpic->service); + } +} + +static int +vatpic_icw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic icw1 0x%x", val); + + atpic->ready = false; + + atpic->icw_num = 1; + atpic->request = 0; + atpic->mask = 0; + atpic->lowprio = 7; + atpic->rd_cmd_reg = 0; + atpic->poll = 0; + atpic->smm = 0; + + if ((val & ICW1_SNGL) != 0) { + VATPIC_CTR0(vatpic, "vatpic cascade mode required"); + return (-1); + } + + if ((val & ICW1_IC4) == 0) { + VATPIC_CTR0(vatpic, "vatpic icw4 required"); + return (-1); + } + + atpic->icw_num++; + + return (0); +} + +static int +vatpic_icw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic icw2 0x%x", val); + + atpic->irq_base = val & 0xf8; + + atpic->icw_num++; + + return (0); +} + +static int +vatpic_icw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic icw3 0x%x", val); + + atpic->icw_num++; + + return (0); +} + +static int +vatpic_icw4(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic icw4 0x%x", val); + + if ((val & ICW4_8086) == 0) { + VATPIC_CTR0(vatpic, "vatpic microprocessor mode required"); + return (-1); + } + + if ((val & ICW4_AEOI) != 0) + atpic->aeoi = true; + + if ((val & ICW4_SFNM) != 0) { + if (master_atpic(vatpic, atpic)) { + atpic->sfn = true; + } else { + VATPIC_CTR1(vatpic, "Ignoring special fully nested " + "mode on slave atpic: %#x", val); + } + } + + atpic->icw_num = 0; + atpic->ready = true; + + return (0); +} + +static int +vatpic_ocw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic ocw1 0x%x", val); + + atpic->mask = val & 0xff; + + return (0); +} + +static int +vatpic_ocw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic ocw2 0x%x", val); + + atpic->rotate = ((val & OCW2_R) != 0); + + if ((val & OCW2_EOI) != 0) { + int isr_bit; + + if ((val & OCW2_SL) != 0) { + /* specific EOI */ + isr_bit = val & 0x7; + } else { + /* non-specific EOI */ + isr_bit = vatpic_get_highest_isrpin(atpic); + } + + if (isr_bit != -1) { + atpic->service &= ~(1 << isr_bit); + + if (atpic->rotate) + atpic->lowprio = isr_bit; + } + } else if ((val & OCW2_SL) != 0 && atpic->rotate == true) { + /* specific priority */ + atpic->lowprio = val & 0x7; + } + + return (0); +} + +static int +vatpic_ocw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic ocw3 0x%x", val); + + if (val & OCW3_ESMM) { + atpic->smm = val & OCW3_SMM ? 1 : 0; + VATPIC_CTR2(vatpic, "%s atpic special mask mode %s", + master_atpic(vatpic, atpic) ? "master" : "slave", + atpic->smm ? 
"enabled" : "disabled"); + } + + if (val & OCW3_RR) { + /* read register command */ + atpic->rd_cmd_reg = val & OCW3_RIS; + + /* Polling mode */ + atpic->poll = ((val & OCW3_P) != 0); + } + + return (0); +} + +static void +vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate) +{ + struct atpic *atpic; + int oldcnt, newcnt; + bool level; + + KASSERT(pin >= 0 && pin < 16, + ("vatpic_set_pinstate: invalid pin number %d", pin)); + KASSERT(VATPIC_LOCKED(vatpic), + ("vatpic_set_pinstate: vatpic is not locked")); + + atpic = &vatpic->atpic[pin >> 3]; + + oldcnt = atpic->acnt[pin & 0x7]; + if (newstate) + atpic->acnt[pin & 0x7]++; + else + atpic->acnt[pin & 0x7]--; + newcnt = atpic->acnt[pin & 0x7]; + + if (newcnt < 0) { + VATPIC_CTR2(vatpic, "atpic pin%d: bad acnt %d", pin, newcnt); + } + + level = ((vatpic->elc[pin >> 3] & (1 << (pin & 0x7))) != 0); + + if ((oldcnt == 0 && newcnt == 1) || (newcnt > 0 && level == true)) { + /* rising edge or level */ + VATPIC_CTR1(vatpic, "atpic pin%d: asserted", pin); + atpic->request |= (1 << (pin & 0x7)); + } else if (oldcnt == 1 && newcnt == 0) { + /* falling edge */ + VATPIC_CTR1(vatpic, "atpic pin%d: deasserted", pin); + if (level) + atpic->request &= ~(1 << (pin & 0x7)); + } else { + VATPIC_CTR3(vatpic, "atpic pin%d: %s, ignored, acnt %d", + pin, newstate ? "asserted" : "deasserted", newcnt); + } + + vatpic_notify_intr(vatpic); +} + +static int +vatpic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate) +{ + struct vatpic *vatpic; + struct atpic *atpic; + + if (irq < 0 || irq > 15) + return (EINVAL); + + vatpic = vm_atpic(vm); + atpic = &vatpic->atpic[irq >> 3]; + + if (atpic->ready == false) + return (0); + + VATPIC_LOCK(vatpic); + switch (irqstate) { + case IRQSTATE_ASSERT: + vatpic_set_pinstate(vatpic, irq, true); + break; + case IRQSTATE_DEASSERT: + vatpic_set_pinstate(vatpic, irq, false); + break; + case IRQSTATE_PULSE: + vatpic_set_pinstate(vatpic, irq, true); + vatpic_set_pinstate(vatpic, irq, false); + break; + default: + panic("vatpic_set_irqstate: invalid irqstate %d", irqstate); + } + VATPIC_UNLOCK(vatpic); + + return (0); +} + +int +vatpic_assert_irq(struct vm *vm, int irq) +{ + return (vatpic_set_irqstate(vm, irq, IRQSTATE_ASSERT)); +} + +int +vatpic_deassert_irq(struct vm *vm, int irq) +{ + return (vatpic_set_irqstate(vm, irq, IRQSTATE_DEASSERT)); +} + +int +vatpic_pulse_irq(struct vm *vm, int irq) +{ + return (vatpic_set_irqstate(vm, irq, IRQSTATE_PULSE)); +} + +int +vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger) +{ + struct vatpic *vatpic; + + if (irq < 0 || irq > 15) + return (EINVAL); + + /* + * See comment in vatpic_elc_handler. These IRQs must be + * edge triggered. + */ + if (trigger == LEVEL_TRIGGER) { + switch (irq) { + case 0: + case 1: + case 2: + case 8: + case 13: + return (EINVAL); + } + } + + vatpic = vm_atpic(vm); + + VATPIC_LOCK(vatpic); + + if (trigger == LEVEL_TRIGGER) + vatpic->elc[irq >> 3] |= 1 << (irq & 0x7); + else + vatpic->elc[irq >> 3] &= ~(1 << (irq & 0x7)); + + VATPIC_UNLOCK(vatpic); + + return (0); +} + +void +vatpic_pending_intr(struct vm *vm, int *vecptr) +{ + struct vatpic *vatpic; + struct atpic *atpic; + int pin; + + vatpic = vm_atpic(vm); + + atpic = &vatpic->atpic[0]; + + VATPIC_LOCK(vatpic); + + pin = vatpic_get_highest_irrpin(atpic); + if (pin == 2) { + atpic = &vatpic->atpic[1]; + pin = vatpic_get_highest_irrpin(atpic); + } + + /* + * If there are no pins active at this moment then return the spurious + * interrupt vector instead. 
+ */ + if (pin == -1) + pin = 7; + + KASSERT(pin >= 0 && pin <= 7, ("%s: invalid pin %d", __func__, pin)); + *vecptr = atpic->irq_base + pin; + + VATPIC_UNLOCK(vatpic); +} + +static void +vatpic_pin_accepted(struct atpic *atpic, int pin) +{ + atpic->intr_raised = false; + + if (atpic->acnt[pin] == 0) + atpic->request &= ~(1 << pin); + + if (atpic->aeoi == true) { + if (atpic->rotate == true) + atpic->lowprio = pin; + } else { + atpic->service |= (1 << pin); + } +} + +void +vatpic_intr_accepted(struct vm *vm, int vector) +{ + struct vatpic *vatpic; + int pin; + + vatpic = vm_atpic(vm); + + VATPIC_LOCK(vatpic); + + pin = vector & 0x7; + + if ((vector & ~0x7) == vatpic->atpic[1].irq_base) { + vatpic_pin_accepted(&vatpic->atpic[1], pin); + /* + * If this vector originated from the slave, + * accept the cascaded interrupt too. + */ + vatpic_pin_accepted(&vatpic->atpic[0], 2); + } else { + vatpic_pin_accepted(&vatpic->atpic[0], pin); + } + + vatpic_notify_intr(vatpic); + + VATPIC_UNLOCK(vatpic); +} + +static int +vatpic_read(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, + int bytes, uint32_t *eax) +{ + int pin; + + VATPIC_LOCK(vatpic); + + if (atpic->poll) { + atpic->poll = 0; + pin = vatpic_get_highest_irrpin(atpic); + if (pin >= 0) { + vatpic_pin_accepted(atpic, pin); + *eax = 0x80 | pin; + } else { + *eax = 0; + } + } else { + if (port & ICU_IMR_OFFSET) { + /* read interrupt mask register */ + *eax = atpic->mask; + } else { + if (atpic->rd_cmd_reg == OCW3_RIS) { + /* read interrupt service register */ + *eax = atpic->service; + } else { + /* read interrupt request register */ + *eax = atpic->request; + } + } + } + + VATPIC_UNLOCK(vatpic); + + return (0); + +} + +static int +vatpic_write(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, + int bytes, uint32_t *eax) +{ + int error; + uint8_t val; + + error = 0; + val = *eax; + + VATPIC_LOCK(vatpic); + + if (port & ICU_IMR_OFFSET) { + switch (atpic->icw_num) { + case 2: + error = vatpic_icw2(vatpic, atpic, val); + break; + case 3: + error = vatpic_icw3(vatpic, atpic, val); + break; + case 4: + error = vatpic_icw4(vatpic, atpic, val); + break; + default: + error = vatpic_ocw1(vatpic, atpic, val); + break; + } + } else { + if (val & (1 << 4)) + error = vatpic_icw1(vatpic, atpic, val); + + if (atpic->ready) { + if (val & (1 << 3)) + error = vatpic_ocw3(vatpic, atpic, val); + else + error = vatpic_ocw2(vatpic, atpic, val); + } + } + + if (atpic->ready) + vatpic_notify_intr(vatpic); + + VATPIC_UNLOCK(vatpic); + + return (error); +} + +int +vatpic_master_handler(struct vm *vm, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpic *vatpic; + struct atpic *atpic; + + vatpic = vm_atpic(vm); + atpic = &vatpic->atpic[0]; + + if (bytes != 1) + return (-1); + + if (in) { + return (vatpic_read(vatpic, atpic, in, port, bytes, eax)); + } + + return (vatpic_write(vatpic, atpic, in, port, bytes, eax)); +} + +int +vatpic_slave_handler(struct vm *vm, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpic *vatpic; + struct atpic *atpic; + + vatpic = vm_atpic(vm); + atpic = &vatpic->atpic[1]; + + if (bytes != 1) + return (-1); + + if (in) { + return (vatpic_read(vatpic, atpic, in, port, bytes, eax)); + } + + return (vatpic_write(vatpic, atpic, in, port, bytes, eax)); +} + +int +vatpic_elc_handler(struct vm *vm, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpic *vatpic; + bool is_master; + + vatpic = vm_atpic(vm); + is_master = (port == IO_ELCR1); + + if (bytes != 1) + return (-1); + + 
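/*
+ * Each ELCR (edge/level control register) carries one bit per
+ * IRQ: a set bit marks the IRQ as level triggered, a clear bit
+ * as edge triggered.  Port 0x4d0 (IO_ELCR1) covers IRQs 0-7 on
+ * the master and port 0x4d1 (IO_ELCR2) covers IRQs 8-15 on the
+ * slave.
+ */
+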
VATPIC_LOCK(vatpic); + + if (in) { + if (is_master) + *eax = vatpic->elc[0]; + else + *eax = vatpic->elc[1]; + } else { + /* + * For the master PIC the cascade channel (IRQ2), the + * heart beat timer (IRQ0), and the keyboard + * controller (IRQ1) cannot be programmed for level + * mode. + * + * For the slave PIC the real time clock (IRQ8) and + * the floating point error interrupt (IRQ13) cannot + * be programmed for level mode. + */ + if (is_master) + vatpic->elc[0] = (*eax & 0xf8); + else + vatpic->elc[1] = (*eax & 0xde); + } + + VATPIC_UNLOCK(vatpic); + + return (0); +} + +struct vatpic * +vatpic_init(struct vm *vm) +{ + struct vatpic *vatpic; + + vatpic = malloc(sizeof(struct vatpic), M_VATPIC, M_WAITOK | M_ZERO); + vatpic->vm = vm; + + mtx_init(&vatpic->mtx, "vatpic lock", NULL, MTX_SPIN); + + return (vatpic); +} + +void +vatpic_cleanup(struct vatpic *vatpic) +{ + mtx_destroy(&vatpic->mtx); + free(vatpic, M_VATPIC); +} + +#ifdef BHYVE_SNAPSHOT +int +vatpic_snapshot(struct vatpic *vatpic, struct vm_snapshot_meta *meta) +{ + int ret; + int i; + struct atpic *atpic; + + for (i = 0; i < nitems(vatpic->atpic); i++) { + atpic = &vatpic->atpic[i]; + + SNAPSHOT_VAR_OR_LEAVE(atpic->ready, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->icw_num, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->rd_cmd_reg, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(atpic->aeoi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->poll, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->rotate, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->sfn, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->irq_base, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->request, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->service, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->mask, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->smm, meta, ret, done); + + SNAPSHOT_BUF_OR_LEAVE(atpic->acnt, sizeof(atpic->acnt), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->lowprio, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->intr_raised, meta, ret, done); + } + + SNAPSHOT_BUF_OR_LEAVE(vatpic->elc, sizeof(vatpic->elc), + meta, ret, done); + +done: + return (ret); +} +#endif diff --git a/sys/amd64/vmm/io/vatpic.h b/sys/amd64/vmm/io/vatpic.h new file mode 100644 index 000000000000..352c55a3089d --- /dev/null +++ b/sys/amd64/vmm/io/vatpic.h @@ -0,0 +1,61 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VATPIC_H_ +#define _VATPIC_H_ + +#include <isa/isareg.h> + +#define ICU_IMR_OFFSET 1 + +#define IO_ELCR1 0x4d0 +#define IO_ELCR2 0x4d1 + +struct vm_snapshot_meta; + +struct vatpic *vatpic_init(struct vm *vm); +void vatpic_cleanup(struct vatpic *vatpic); + +int vatpic_master_handler(struct vm *vm, bool in, int port, int bytes, + uint32_t *eax); +int vatpic_slave_handler(struct vm *vm, bool in, int port, int bytes, + uint32_t *eax); +int vatpic_elc_handler(struct vm *vm, bool in, int port, int bytes, + uint32_t *eax); + +int vatpic_assert_irq(struct vm *vm, int irq); +int vatpic_deassert_irq(struct vm *vm, int irq); +int vatpic_pulse_irq(struct vm *vm, int irq); +int vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger); + +void vatpic_pending_intr(struct vm *vm, int *vecptr); +void vatpic_intr_accepted(struct vm *vm, int vector); + +#ifdef BHYVE_SNAPSHOT +int vatpic_snapshot(struct vatpic *vatpic, struct vm_snapshot_meta *meta); +#endif + +#endif /* _VATPIC_H_ */ diff --git a/sys/amd64/vmm/io/vatpit.c b/sys/amd64/vmm/io/vatpit.c new file mode 100644 index 000000000000..31b6c2ad0f89 --- /dev/null +++ b/sys/amd64/vmm/io/vatpit.c @@ -0,0 +1,513 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * Copyright (c) 2018 Joyent, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +#include "opt_bhyve_snapshot.h" + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/queue.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/systm.h> + +#include <machine/vmm.h> +#include <machine/vmm_snapshot.h> + +#include <dev/vmm/vmm_ktr.h> + +#include "vatpic.h" +#include "vioapic.h" +#include "vatpit.h" + +static MALLOC_DEFINE(M_VATPIT, "atpit", "bhyve virtual atpit (8254)"); + +#define VATPIT_LOCK(vatpit) mtx_lock_spin(&((vatpit)->mtx)) +#define VATPIT_UNLOCK(vatpit) mtx_unlock_spin(&((vatpit)->mtx)) +#define VATPIT_LOCKED(vatpit) mtx_owned(&((vatpit)->mtx)) + +#define TIMER_SEL_MASK 0xc0 +#define TIMER_RW_MASK 0x30 +#define TIMER_MODE_MASK 0x0f +#define TIMER_SEL_READBACK 0xc0 + +#define TIMER_STS_OUT 0x80 +#define TIMER_STS_NULLCNT 0x40 + +#define TIMER_RB_LCTR 0x20 +#define TIMER_RB_LSTATUS 0x10 +#define TIMER_RB_CTR_2 0x08 +#define TIMER_RB_CTR_1 0x04 +#define TIMER_RB_CTR_0 0x02 + +#define TMR2_OUT_STS 0x20 + +#define PIT_8254_FREQ 1193182 +#define TIMER_DIV(freq, hz) (((freq) + (hz) / 2) / (hz)) + +struct vatpit_callout_arg { + struct vatpit *vatpit; + int channel_num; +}; + +struct channel { + int mode; + uint16_t initial; /* initial counter value */ + struct bintime now_bt; /* uptime when counter was loaded */ + uint8_t cr[2]; + uint8_t ol[2]; + bool slatched; /* status latched */ + uint8_t status; + int crbyte; + int olbyte; + int frbyte; + struct callout callout; + struct bintime callout_bt; /* target time */ + struct vatpit_callout_arg callout_arg; +}; + +struct vatpit { + struct vm *vm; + struct mtx mtx; + + struct bintime freq_bt; + + struct channel channel[3]; +}; + +static void pit_timer_start_cntr0(struct vatpit *vatpit); + +static uint64_t +vatpit_delta_ticks(struct vatpit *vatpit, struct channel *c) +{ + struct bintime delta; + uint64_t result; + + binuptime(&delta); + bintime_sub(&delta, &c->now_bt); + + result = delta.sec * PIT_8254_FREQ; + result += delta.frac / vatpit->freq_bt.frac; + + return (result); +} + +static int +vatpit_get_out(struct vatpit *vatpit, int channel) +{ + struct channel *c; + uint64_t delta_ticks; + int out; + + c = &vatpit->channel[channel]; + + switch (c->mode) { + case TIMER_INTTC: + delta_ticks = vatpit_delta_ticks(vatpit, c); + out = (delta_ticks >= c->initial); + break; + default: + out = 0; + break; + } + + return (out); +} + +static void +vatpit_callout_handler(void *a) +{ + struct vatpit_callout_arg *arg = a; + struct vatpit *vatpit; + struct callout *callout; + struct channel *c; + + vatpit = arg->vatpit; + c = &vatpit->channel[arg->channel_num]; + callout = &c->callout; + + VM_CTR1(vatpit->vm, "atpit t%d fired", arg->channel_num); + + VATPIT_LOCK(vatpit); + + if (callout_pending(callout)) /* callout was reset */ + goto done; + + if (!callout_active(callout)) /* callout was stopped */ + goto done; + + callout_deactivate(callout); + + if (c->mode == TIMER_RATEGEN) { + pit_timer_start_cntr0(vatpit); + } + + vatpic_pulse_irq(vatpit->vm, 0); + vioapic_pulse_irq(vatpit->vm, 2); + +done: + VATPIT_UNLOCK(vatpit); + return; +} + +static void +pit_timer_start_cntr0(struct vatpit *vatpit) +{ + struct channel *c; + struct bintime now, delta; + sbintime_t precision; + + c = &vatpit->channel[0]; + if (c->initial != 0) { + delta.sec = 0; + delta.frac = vatpit->freq_bt.frac * c->initial; + bintime_add(&c->callout_bt, &delta); + precision = bttosbt(delta) >> tc_precexp; + + /* + * Reset 'callout_bt' if the time that the callout + * was 
supposed to fire is more than 'c->initial' + * ticks in the past. + */ + binuptime(&now); + if (bintime_cmp(&c->callout_bt, &now, <)) { + c->callout_bt = now; + bintime_add(&c->callout_bt, &delta); + } + + callout_reset_sbt(&c->callout, bttosbt(c->callout_bt), + precision, vatpit_callout_handler, &c->callout_arg, + C_ABSOLUTE); + } +} + +static uint16_t +pit_update_counter(struct vatpit *vatpit, struct channel *c, bool latch) +{ + uint16_t lval; + uint64_t delta_ticks; + + /* cannot latch a new value until the old one has been consumed */ + if (latch && c->olbyte != 0) + return (0); + + if (c->initial == 0) { + /* + * This is possibly an o/s bug - reading the value of + * the timer without having set up the initial value. + * + * The original user-space version of this code set + * the timer to 100hz in this condition; do the same + * here. + */ + c->initial = TIMER_DIV(PIT_8254_FREQ, 100); + binuptime(&c->now_bt); + c->status &= ~TIMER_STS_NULLCNT; + } + + delta_ticks = vatpit_delta_ticks(vatpit, c); + lval = c->initial - delta_ticks % c->initial; + + if (latch) { + c->olbyte = 2; + c->ol[1] = lval; /* LSB */ + c->ol[0] = lval >> 8; /* MSB */ + } + + return (lval); +} + +static int +pit_readback1(struct vatpit *vatpit, int channel, uint8_t cmd) +{ + struct channel *c; + + c = &vatpit->channel[channel]; + + /* + * Latch the count/status of the timer if not already latched. + * N.B. that the count/status latch-select bits are active-low. + */ + if (!(cmd & TIMER_RB_LCTR) && !c->olbyte) { + (void) pit_update_counter(vatpit, c, true); + } + + if (!(cmd & TIMER_RB_LSTATUS) && !c->slatched) { + c->slatched = true; + /* + * For mode 0, see if the elapsed time is greater + * than the initial value - this results in the + * output pin being set to 1 in the status byte. + */ + if (c->mode == TIMER_INTTC && vatpit_get_out(vatpit, channel)) + c->status |= TIMER_STS_OUT; + else + c->status &= ~TIMER_STS_OUT; + } + + return (0); +} + +static int +pit_readback(struct vatpit *vatpit, uint8_t cmd) +{ + int error; + + /* + * The readback command can apply to all timers. + */ + error = 0; + if (cmd & TIMER_RB_CTR_0) + error = pit_readback1(vatpit, 0, cmd); + if (!error && cmd & TIMER_RB_CTR_1) + error = pit_readback1(vatpit, 1, cmd); + if (!error && cmd & TIMER_RB_CTR_2) + error = pit_readback1(vatpit, 2, cmd); + + return (error); +} + +static int +vatpit_update_mode(struct vatpit *vatpit, uint8_t val) +{ + struct channel *c; + int sel, rw, mode; + + sel = val & TIMER_SEL_MASK; + rw = val & TIMER_RW_MASK; + mode = val & TIMER_MODE_MASK; + + if (sel == TIMER_SEL_READBACK) + return (pit_readback(vatpit, val)); + + if (rw != TIMER_LATCH && rw != TIMER_16BIT) + return (-1); + + if (rw != TIMER_LATCH) { + /* + * Counter mode is not affected when issuing a + * latch command. 
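+ * A latch command merely snapshots the current count into
+ * the output latch so it can be read consistently; the
+ * programmed mode and initial count remain in effect.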
+ */ + if (mode != TIMER_INTTC && + mode != TIMER_RATEGEN && + mode != TIMER_SQWAVE && + mode != TIMER_SWSTROBE) + return (-1); + } + + c = &vatpit->channel[sel >> 6]; + if (rw == TIMER_LATCH) + pit_update_counter(vatpit, c, true); + else { + c->mode = mode; + c->olbyte = 0; /* reset latch after reprogramming */ + c->status |= TIMER_STS_NULLCNT; + } + + return (0); +} + +int +vatpit_handler(struct vm *vm, bool in, int port, int bytes, uint32_t *eax) +{ + struct vatpit *vatpit; + struct channel *c; + uint8_t val; + int error; + + vatpit = vm_atpit(vm); + + if (bytes != 1) + return (-1); + + val = *eax; + + if (port == TIMER_MODE) { + if (in) { + VM_CTR0(vatpit->vm, "vatpit attempt to read mode"); + return (-1); + } + + VATPIT_LOCK(vatpit); + error = vatpit_update_mode(vatpit, val); + VATPIT_UNLOCK(vatpit); + + return (error); + } + + /* counter ports */ + KASSERT(port >= TIMER_CNTR0 && port <= TIMER_CNTR2, + ("invalid port 0x%x", port)); + c = &vatpit->channel[port - TIMER_CNTR0]; + + VATPIT_LOCK(vatpit); + if (in && c->slatched) { + /* + * Return the status byte if latched + */ + *eax = c->status; + c->slatched = false; + c->status = 0; + } else if (in) { + /* + * The spec says that once the output latch is completely + * read it should revert to "following" the counter. Use + * the free running counter for this case (i.e. Linux + * TSC calibration). Assuming the access mode is 16-bit, + * toggle the MSB/LSB bit on each read. + */ + if (c->olbyte == 0) { + uint16_t tmp; + + tmp = pit_update_counter(vatpit, c, false); + if (c->frbyte) + tmp >>= 8; + tmp &= 0xff; + *eax = tmp; + c->frbyte ^= 1; + } else + *eax = c->ol[--c->olbyte]; + } else { + c->cr[c->crbyte++] = *eax; + if (c->crbyte == 2) { + c->status &= ~TIMER_STS_NULLCNT; + c->frbyte = 0; + c->crbyte = 0; + c->initial = c->cr[0] | (uint16_t)c->cr[1] << 8; + binuptime(&c->now_bt); + /* Start an interval timer for channel 0 */ + if (port == TIMER_CNTR0) { + c->callout_bt = c->now_bt; + pit_timer_start_cntr0(vatpit); + } + if (c->initial == 0) + c->initial = 0xffff; + } + } + VATPIT_UNLOCK(vatpit); + + return (0); +} + +int +vatpit_nmisc_handler(struct vm *vm, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpit *vatpit; + + vatpit = vm_atpit(vm); + + if (in) { + VATPIT_LOCK(vatpit); + if (vatpit_get_out(vatpit, 2)) + *eax = TMR2_OUT_STS; + else + *eax = 0; + + VATPIT_UNLOCK(vatpit); + } + + return (0); +} + +struct vatpit * +vatpit_init(struct vm *vm) +{ + struct vatpit *vatpit; + struct vatpit_callout_arg *arg; + int i; + + vatpit = malloc(sizeof(struct vatpit), M_VATPIT, M_WAITOK | M_ZERO); + vatpit->vm = vm; + + mtx_init(&vatpit->mtx, "vatpit lock", NULL, MTX_SPIN); + + FREQ2BT(PIT_8254_FREQ, &vatpit->freq_bt); + + for (i = 0; i < 3; i++) { + callout_init(&vatpit->channel[i].callout, 1); + arg = &vatpit->channel[i].callout_arg; + arg->vatpit = vatpit; + arg->channel_num = i; + } + + return (vatpit); +} + +void +vatpit_cleanup(struct vatpit *vatpit) +{ + int i; + + for (i = 0; i < 3; i++) + callout_drain(&vatpit->channel[i].callout); + + mtx_destroy(&vatpit->mtx); + free(vatpit, M_VATPIT); +} + +#ifdef BHYVE_SNAPSHOT +int +vatpit_snapshot(struct vatpit *vatpit, struct vm_snapshot_meta *meta) +{ + int ret; + int i; + struct channel *channel; + + SNAPSHOT_VAR_OR_LEAVE(vatpit->freq_bt.sec, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vatpit->freq_bt.frac, meta, ret, done); + + /* properly restore timers; they will NOT work currently */ + printf("%s: snapshot restore does not reset timers!\r\n", __func__); + + for (i = 0; i < 
nitems(vatpit->channel); i++) { + channel = &vatpit->channel[i]; + + SNAPSHOT_VAR_OR_LEAVE(channel->mode, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->initial, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->now_bt.sec, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->now_bt.frac, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(channel->cr, sizeof(channel->cr), + meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(channel->ol, sizeof(channel->ol), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->slatched, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->status, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->crbyte, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->frbyte, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->callout_bt.sec, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->callout_bt.frac, meta, ret, + done); + } + +done: + return (ret); +} +#endif diff --git a/sys/amd64/vmm/io/vatpit.h b/sys/amd64/vmm/io/vatpit.h new file mode 100644 index 000000000000..c18071069d3c --- /dev/null +++ b/sys/amd64/vmm/io/vatpit.h @@ -0,0 +1,49 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VATPIT_H_ +#define _VATPIT_H_ + +#include <x86/timerreg.h> + +#define NMISC_PORT 0x61 + +struct vm_snapshot_meta; + +struct vatpit *vatpit_init(struct vm *vm); +void vatpit_cleanup(struct vatpit *vatpit); + +int vatpit_handler(struct vm *vm, bool in, int port, int bytes, uint32_t *eax); +int vatpit_nmisc_handler(struct vm *vm, bool in, int port, int bytes, + uint32_t *eax); +#ifdef BHYVE_SNAPSHOT +int vatpit_snapshot(struct vatpit *vatpit, struct vm_snapshot_meta *meta); +#endif + +#endif /* _VATPIT_H_ */ diff --git a/sys/amd64/vmm/io/vhpet.c b/sys/amd64/vmm/io/vhpet.c new file mode 100644 index 000000000000..88063f2952e5 --- /dev/null +++ b/sys/amd64/vmm/io/vhpet.c @@ -0,0 +1,808 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +#include "opt_bhyve_snapshot.h" + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/systm.h> + +#include <machine/vmm.h> +#include <machine/vmm_dev.h> +#include <machine/vmm_snapshot.h> + +#include <dev/acpica/acpi_hpet.h> +#include <dev/vmm/vmm_ktr.h> + +#include "vmm_lapic.h" +#include "vatpic.h" +#include "vioapic.h" +#include "vhpet.h" + +static MALLOC_DEFINE(M_VHPET, "vhpet", "bhyve virtual hpet"); + +#define HPET_FREQ 16777216 /* 16.7 (2^24) Mhz */ +#define FS_PER_S 1000000000000000ul + +/* Timer N Configuration and Capabilities Register */ +#define HPET_TCAP_RO_MASK (HPET_TCAP_INT_ROUTE | \ + HPET_TCAP_FSB_INT_DEL | \ + HPET_TCAP_SIZE | \ + HPET_TCAP_PER_INT) +/* + * HPET requires at least 3 timers and up to 32 timers per block. 
+ */ +#define VHPET_NUM_TIMERS 8 +CTASSERT(VHPET_NUM_TIMERS >= 3 && VHPET_NUM_TIMERS <= 32); + +struct vhpet_callout_arg { + struct vhpet *vhpet; + int timer_num; +}; + +struct vhpet { + struct vm *vm; + struct mtx mtx; + sbintime_t freq_sbt; + + uint64_t config; /* Configuration */ + uint64_t isr; /* Interrupt Status */ + uint32_t countbase; /* HPET counter base value */ + sbintime_t countbase_sbt; /* uptime corresponding to base value */ + + struct { + uint64_t cap_config; /* Configuration */ + uint64_t msireg; /* FSB interrupt routing */ + uint32_t compval; /* Comparator */ + uint32_t comprate; + struct callout callout; + sbintime_t callout_sbt; /* time when counter==compval */ + struct vhpet_callout_arg arg; + } timer[VHPET_NUM_TIMERS]; +}; + +#define VHPET_LOCK(vhp) mtx_lock(&((vhp)->mtx)) +#define VHPET_UNLOCK(vhp) mtx_unlock(&((vhp)->mtx)) + +static void vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter, + sbintime_t now); + +static uint64_t +vhpet_capabilities(void) +{ + uint64_t cap = 0; + + cap |= 0x8086 << 16; /* vendor id */ + cap |= (VHPET_NUM_TIMERS - 1) << 8; /* number of timers */ + cap |= 1; /* revision */ + cap &= ~HPET_CAP_COUNT_SIZE; /* 32-bit timer */ + + cap &= 0xffffffff; + cap |= (FS_PER_S / HPET_FREQ) << 32; /* tick period in fs */ + + return (cap); +} + +static __inline bool +vhpet_counter_enabled(struct vhpet *vhpet) +{ + + return ((vhpet->config & HPET_CNF_ENABLE) ? true : false); +} + +static __inline bool +vhpet_timer_msi_enabled(struct vhpet *vhpet, int n) +{ + const uint64_t msi_enable = HPET_TCAP_FSB_INT_DEL | HPET_TCNF_FSB_EN; + + if ((vhpet->timer[n].cap_config & msi_enable) == msi_enable) + return (true); + else + return (false); +} + +static __inline int +vhpet_timer_ioapic_pin(struct vhpet *vhpet, int n) +{ + /* + * If the timer is configured to use MSI then treat it as if the + * timer is not connected to the ioapic. + */ + if (vhpet_timer_msi_enabled(vhpet, n)) + return (0); + + return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ROUTE) >> 9); +} + +static uint32_t +vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr) +{ + uint32_t val; + sbintime_t now, delta; + + val = vhpet->countbase; + if (vhpet_counter_enabled(vhpet)) { + now = sbinuptime(); + delta = now - vhpet->countbase_sbt; + KASSERT(delta >= 0, ("vhpet_counter: uptime went backwards: " + "%#lx to %#lx", vhpet->countbase_sbt, now)); + val += delta / vhpet->freq_sbt; + if (nowptr != NULL) + *nowptr = now; + } else { + /* + * The sbinuptime corresponding to the 'countbase' is + * meaningless when the counter is disabled. Make sure + * that the caller doesn't want to use it. 
+ */ + KASSERT(nowptr == NULL, ("vhpet_counter: nowptr must be NULL")); + } + return (val); +} + +static void +vhpet_timer_clear_isr(struct vhpet *vhpet, int n) +{ + int pin; + + if (vhpet->isr & (1 << n)) { + pin = vhpet_timer_ioapic_pin(vhpet, n); + KASSERT(pin != 0, ("vhpet timer %d irq incorrectly routed", n)); + vioapic_deassert_irq(vhpet->vm, pin); + vhpet->isr &= ~(1 << n); + } +} + +static __inline bool +vhpet_periodic_timer(struct vhpet *vhpet, int n) +{ + + return ((vhpet->timer[n].cap_config & HPET_TCNF_TYPE) != 0); +} + +static __inline bool +vhpet_timer_interrupt_enabled(struct vhpet *vhpet, int n) +{ + + return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ENB) != 0); +} + +static __inline bool +vhpet_timer_edge_trig(struct vhpet *vhpet, int n) +{ + + KASSERT(!vhpet_timer_msi_enabled(vhpet, n), ("vhpet_timer_edge_trig: " + "timer %d is using MSI", n)); + + if ((vhpet->timer[n].cap_config & HPET_TCNF_INT_TYPE) == 0) + return (true); + else + return (false); +} + +static void +vhpet_timer_interrupt(struct vhpet *vhpet, int n) +{ + int pin; + + /* If interrupts are not enabled for this timer then just return. */ + if (!vhpet_timer_interrupt_enabled(vhpet, n)) + return; + + /* + * If a level triggered interrupt is already asserted then just return. + */ + if ((vhpet->isr & (1 << n)) != 0) { + VM_CTR1(vhpet->vm, "hpet t%d intr is already asserted", n); + return; + } + + if (vhpet_timer_msi_enabled(vhpet, n)) { + lapic_intr_msi(vhpet->vm, vhpet->timer[n].msireg >> 32, + vhpet->timer[n].msireg & 0xffffffff); + return; + } + + pin = vhpet_timer_ioapic_pin(vhpet, n); + if (pin == 0) { + VM_CTR1(vhpet->vm, "hpet t%d intr is not routed to ioapic", n); + return; + } + + if (vhpet_timer_edge_trig(vhpet, n)) { + vioapic_pulse_irq(vhpet->vm, pin); + } else { + vhpet->isr |= 1 << n; + vioapic_assert_irq(vhpet->vm, pin); + } +} + +static void +vhpet_adjust_compval(struct vhpet *vhpet, int n, uint32_t counter) +{ + uint32_t compval, comprate, compnext; + + KASSERT(vhpet->timer[n].comprate != 0, ("hpet t%d is not periodic", n)); + + compval = vhpet->timer[n].compval; + comprate = vhpet->timer[n].comprate; + + /* + * Calculate the comparator value to be used for the next periodic + * interrupt. + * + * This function is commonly called from the callout handler. + * In this scenario the 'counter' is ahead of 'compval'. To find + * the next value to program into the accumulator we divide the + * number space between 'compval' and 'counter' into 'comprate' + * sized units. The 'compval' is rounded up such that it is "ahead" + * of 'counter'. 
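+ *
+ * For example, with compval = 100, comprate = 50 and a counter
+ * of 260: compnext = 100 + ((260 - 100) / 50 + 1) * 50 = 300,
+ * the first comparator value past 'counter' that is a whole
+ * number of 'comprate' periods beyond the original 'compval'.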
+ */ + compnext = compval + ((counter - compval) / comprate + 1) * comprate; + + vhpet->timer[n].compval = compnext; +} + +static void +vhpet_handler(void *a) +{ + int n; + uint32_t counter; + sbintime_t now; + struct vhpet *vhpet; + struct callout *callout; + struct vhpet_callout_arg *arg; + + arg = a; + vhpet = arg->vhpet; + n = arg->timer_num; + callout = &vhpet->timer[n].callout; + + VM_CTR1(vhpet->vm, "hpet t%d fired", n); + + VHPET_LOCK(vhpet); + + if (callout_pending(callout)) /* callout was reset */ + goto done; + + if (!callout_active(callout)) /* callout was stopped */ + goto done; + + callout_deactivate(callout); + + if (!vhpet_counter_enabled(vhpet)) + panic("vhpet(%p) callout with counter disabled", vhpet); + + counter = vhpet_counter(vhpet, &now); + vhpet_start_timer(vhpet, n, counter, now); + vhpet_timer_interrupt(vhpet, n); +done: + VHPET_UNLOCK(vhpet); + return; +} + +static void +vhpet_stop_timer(struct vhpet *vhpet, int n, sbintime_t now) +{ + + VM_CTR1(vhpet->vm, "hpet t%d stopped", n); + callout_stop(&vhpet->timer[n].callout); + + /* + * If the callout was scheduled to expire in the past but hasn't + * had a chance to execute yet then trigger the timer interrupt + * here. Failing to do so will result in a missed timer interrupt + * in the guest. This is especially bad in one-shot mode because + * the next interrupt has to wait for the counter to wrap around. + */ + if (vhpet->timer[n].callout_sbt < now) { + VM_CTR1(vhpet->vm, "hpet t%d interrupt triggered after " + "stopping timer", n); + vhpet_timer_interrupt(vhpet, n); + } +} + +static void +vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter, sbintime_t now) +{ + sbintime_t delta, precision; + + if (vhpet->timer[n].comprate != 0) + vhpet_adjust_compval(vhpet, n, counter); + else { + /* + * In one-shot mode it is the guest's responsibility to make + * sure that the comparator value is not in the "past". The + * hardware doesn't have any belt-and-suspenders to deal with + * this so we don't either. + */ + } + + delta = (vhpet->timer[n].compval - counter) * vhpet->freq_sbt; + precision = delta >> tc_precexp; + vhpet->timer[n].callout_sbt = now + delta; + callout_reset_sbt(&vhpet->timer[n].callout, vhpet->timer[n].callout_sbt, + precision, vhpet_handler, &vhpet->timer[n].arg, C_ABSOLUTE); +} + +static void +vhpet_start_counting(struct vhpet *vhpet) +{ + int i; + + vhpet->countbase_sbt = sbinuptime(); + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + /* + * Restart the timers based on the value of the main counter + * when it stopped counting. 
+ */ + vhpet_start_timer(vhpet, i, vhpet->countbase, + vhpet->countbase_sbt); + } +} + +static void +vhpet_stop_counting(struct vhpet *vhpet, uint32_t counter, sbintime_t now) +{ + int i; + + vhpet->countbase = counter; + for (i = 0; i < VHPET_NUM_TIMERS; i++) + vhpet_stop_timer(vhpet, i, now); +} + +static __inline void +update_register(uint64_t *regptr, uint64_t data, uint64_t mask) +{ + + *regptr &= ~mask; + *regptr |= (data & mask); +} + +static void +vhpet_timer_update_config(struct vhpet *vhpet, int n, uint64_t data, + uint64_t mask) +{ + bool clear_isr; + int old_pin, new_pin; + uint32_t allowed_irqs; + uint64_t oldval, newval; + + if (vhpet_timer_msi_enabled(vhpet, n) || + vhpet_timer_edge_trig(vhpet, n)) { + if (vhpet->isr & (1 << n)) + panic("vhpet timer %d isr should not be asserted", n); + } + old_pin = vhpet_timer_ioapic_pin(vhpet, n); + oldval = vhpet->timer[n].cap_config; + + newval = oldval; + update_register(&newval, data, mask); + newval &= ~(HPET_TCAP_RO_MASK | HPET_TCNF_32MODE); + newval |= oldval & HPET_TCAP_RO_MASK; + + if (newval == oldval) + return; + + vhpet->timer[n].cap_config = newval; + VM_CTR2(vhpet->vm, "hpet t%d cap_config set to 0x%016x", n, newval); + + /* + * Validate the interrupt routing in the HPET_TCNF_INT_ROUTE field. + * If it does not match the bits set in HPET_TCAP_INT_ROUTE then set + * it to the default value of 0. + */ + allowed_irqs = vhpet->timer[n].cap_config >> 32; + new_pin = vhpet_timer_ioapic_pin(vhpet, n); + if (new_pin != 0 && (allowed_irqs & (1 << new_pin)) == 0) { + VM_CTR3(vhpet->vm, "hpet t%d configured invalid irq %d, " + "allowed_irqs 0x%08x", n, new_pin, allowed_irqs); + new_pin = 0; + vhpet->timer[n].cap_config &= ~HPET_TCNF_INT_ROUTE; + } + + if (!vhpet_periodic_timer(vhpet, n)) + vhpet->timer[n].comprate = 0; + + /* + * If the timer's ISR bit is set then clear it in the following cases: + * - interrupt is disabled + * - interrupt type is changed from level to edge or fsb. + * - interrupt routing is changed + * + * This is to ensure that this timer's level triggered interrupt does + * not remain asserted forever. 
+ */ + if (vhpet->isr & (1 << n)) { + KASSERT(old_pin != 0, ("timer %d isr asserted to ioapic pin %d", + n, old_pin)); + if (!vhpet_timer_interrupt_enabled(vhpet, n)) + clear_isr = true; + else if (vhpet_timer_msi_enabled(vhpet, n)) + clear_isr = true; + else if (vhpet_timer_edge_trig(vhpet, n)) + clear_isr = true; + else if (vhpet_timer_ioapic_pin(vhpet, n) != old_pin) + clear_isr = true; + else + clear_isr = false; + + if (clear_isr) { + VM_CTR1(vhpet->vm, "hpet t%d isr cleared due to " + "configuration change", n); + vioapic_deassert_irq(vhpet->vm, old_pin); + vhpet->isr &= ~(1 << n); + } + } +} + +int +vhpet_mmio_write(struct vcpu *vcpu, uint64_t gpa, uint64_t val, int size, + void *arg) +{ + struct vhpet *vhpet; + uint64_t data, mask, oldval, val64; + uint32_t isr_clear_mask, old_compval, old_comprate, counter; + sbintime_t now, *nowptr; + int i, offset; + + vhpet = vm_hpet(vcpu_vm(vcpu)); + offset = gpa - VHPET_BASE; + + VHPET_LOCK(vhpet); + + /* Accesses to the HPET should be 4 or 8 bytes wide */ + switch (size) { + case 8: + mask = 0xffffffffffffffff; + data = val; + break; + case 4: + mask = 0xffffffff; + data = val; + if ((offset & 0x4) != 0) { + mask <<= 32; + data <<= 32; + } + break; + default: + VM_CTR2(vhpet->vm, "hpet invalid mmio write: " + "offset 0x%08x, size %d", offset, size); + goto done; + } + + /* Access to the HPET should be naturally aligned to its width */ + if (offset & (size - 1)) { + VM_CTR2(vhpet->vm, "hpet invalid mmio write: " + "offset 0x%08x, size %d", offset, size); + goto done; + } + + if (offset == HPET_CONFIG || offset == HPET_CONFIG + 4) { + /* + * Get the most recent value of the counter before updating + * the 'config' register. If the HPET is going to be disabled + * then we need to update 'countbase' with the value right + * before it is disabled. + */ + nowptr = vhpet_counter_enabled(vhpet) ? &now : NULL; + counter = vhpet_counter(vhpet, nowptr); + oldval = vhpet->config; + update_register(&vhpet->config, data, mask); + + /* + * LegacyReplacement Routing is not supported so clear the + * bit explicitly. 
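+ * (When enabled on real hardware, LegacyReplacement routing
+ * forces timer 0 onto the legacy PIT interrupt (IRQ0) and
+ * timer 1 onto the RTC interrupt (IRQ8); since it is not
+ * emulated here, guests must use the per-timer routing
+ * advertised in the capability registers.)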
+ */ + vhpet->config &= ~HPET_CNF_LEG_RT; + + if ((oldval ^ vhpet->config) & HPET_CNF_ENABLE) { + if (vhpet_counter_enabled(vhpet)) { + vhpet_start_counting(vhpet); + VM_CTR0(vhpet->vm, "hpet enabled"); + } else { + vhpet_stop_counting(vhpet, counter, now); + VM_CTR0(vhpet->vm, "hpet disabled"); + } + } + goto done; + } + + if (offset == HPET_ISR || offset == HPET_ISR + 4) { + isr_clear_mask = vhpet->isr & data; + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + if ((isr_clear_mask & (1 << i)) != 0) { + VM_CTR1(vhpet->vm, "hpet t%d isr cleared", i); + vhpet_timer_clear_isr(vhpet, i); + } + } + goto done; + } + + if (offset == HPET_MAIN_COUNTER || offset == HPET_MAIN_COUNTER + 4) { + /* Zero-extend the counter to 64-bits before updating it */ + val64 = vhpet_counter(vhpet, NULL); + update_register(&val64, data, mask); + vhpet->countbase = val64; + if (vhpet_counter_enabled(vhpet)) + vhpet_start_counting(vhpet); + goto done; + } + + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + if (offset == HPET_TIMER_CAP_CNF(i) || + offset == HPET_TIMER_CAP_CNF(i) + 4) { + vhpet_timer_update_config(vhpet, i, data, mask); + break; + } + + if (offset == HPET_TIMER_COMPARATOR(i) || + offset == HPET_TIMER_COMPARATOR(i) + 4) { + old_compval = vhpet->timer[i].compval; + old_comprate = vhpet->timer[i].comprate; + if (vhpet_periodic_timer(vhpet, i)) { + /* + * In periodic mode writes to the comparator + * change the 'compval' register only if the + * HPET_TCNF_VAL_SET bit is set in the config + * register. + */ + val64 = vhpet->timer[i].comprate; + update_register(&val64, data, mask); + vhpet->timer[i].comprate = val64; + if ((vhpet->timer[i].cap_config & + HPET_TCNF_VAL_SET) != 0) { + vhpet->timer[i].compval = val64; + } + } else { + KASSERT(vhpet->timer[i].comprate == 0, + ("vhpet one-shot timer %d has invalid " + "rate %u", i, vhpet->timer[i].comprate)); + val64 = vhpet->timer[i].compval; + update_register(&val64, data, mask); + vhpet->timer[i].compval = val64; + } + vhpet->timer[i].cap_config &= ~HPET_TCNF_VAL_SET; + + if (vhpet->timer[i].compval != old_compval || + vhpet->timer[i].comprate != old_comprate) { + if (vhpet_counter_enabled(vhpet)) { + counter = vhpet_counter(vhpet, &now); + vhpet_start_timer(vhpet, i, counter, + now); + } + } + break; + } + + if (offset == HPET_TIMER_FSB_VAL(i) || + offset == HPET_TIMER_FSB_ADDR(i)) { + update_register(&vhpet->timer[i].msireg, data, mask); + break; + } + } +done: + VHPET_UNLOCK(vhpet); + return (0); +} + +int +vhpet_mmio_read(struct vcpu *vcpu, uint64_t gpa, uint64_t *rval, int size, + void *arg) +{ + int i, offset; + struct vhpet *vhpet; + uint64_t data; + + vhpet = vm_hpet(vcpu_vm(vcpu)); + offset = gpa - VHPET_BASE; + + VHPET_LOCK(vhpet); + + /* Accesses to the HPET should be 4 or 8 bytes wide */ + if (size != 4 && size != 8) { + VM_CTR2(vhpet->vm, "hpet invalid mmio read: " + "offset 0x%08x, size %d", offset, size); + data = 0; + goto done; + } + + /* Access to the HPET should be naturally aligned to its width */ + if (offset & (size - 1)) { + VM_CTR2(vhpet->vm, "hpet invalid mmio read: " + "offset 0x%08x, size %d", offset, size); + data = 0; + goto done; + } + + if (offset == HPET_CAPABILITIES || offset == HPET_CAPABILITIES + 4) { + data = vhpet_capabilities(); + goto done; + } + + if (offset == HPET_CONFIG || offset == HPET_CONFIG + 4) { + data = vhpet->config; + goto done; + } + + if (offset == HPET_ISR || offset == HPET_ISR + 4) { + data = vhpet->isr; + goto done; + } + + if (offset == HPET_MAIN_COUNTER || offset == HPET_MAIN_COUNTER + 4) { + data = 
vhpet_counter(vhpet, NULL); + goto done; + } + + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + if (offset == HPET_TIMER_CAP_CNF(i) || + offset == HPET_TIMER_CAP_CNF(i) + 4) { + data = vhpet->timer[i].cap_config; + break; + } + + if (offset == HPET_TIMER_COMPARATOR(i) || + offset == HPET_TIMER_COMPARATOR(i) + 4) { + data = vhpet->timer[i].compval; + break; + } + + if (offset == HPET_TIMER_FSB_VAL(i) || + offset == HPET_TIMER_FSB_ADDR(i)) { + data = vhpet->timer[i].msireg; + break; + } + } + + if (i >= VHPET_NUM_TIMERS) + data = 0; +done: + VHPET_UNLOCK(vhpet); + + if (size == 4) { + if (offset & 0x4) + data >>= 32; + } + *rval = data; + return (0); +} + +struct vhpet * +vhpet_init(struct vm *vm) +{ + int i, pincount; + struct vhpet *vhpet; + uint64_t allowed_irqs; + struct vhpet_callout_arg *arg; + struct bintime bt; + + vhpet = malloc(sizeof(struct vhpet), M_VHPET, M_WAITOK | M_ZERO); + vhpet->vm = vm; + mtx_init(&vhpet->mtx, "vhpet lock", NULL, MTX_DEF); + + FREQ2BT(HPET_FREQ, &bt); + vhpet->freq_sbt = bttosbt(bt); + + pincount = vioapic_pincount(vm); + if (pincount >= 32) + allowed_irqs = 0xff000000; /* irqs 24-31 */ + else if (pincount >= 20) + allowed_irqs = 0xf << (pincount - 4); /* 4 upper irqs */ + else + allowed_irqs = 0; + + /* + * Initialize HPET timer hardware state. + */ + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + vhpet->timer[i].cap_config = allowed_irqs << 32; + vhpet->timer[i].cap_config |= HPET_TCAP_PER_INT; + vhpet->timer[i].cap_config |= HPET_TCAP_FSB_INT_DEL; + + vhpet->timer[i].compval = 0xffffffff; + callout_init(&vhpet->timer[i].callout, 1); + + arg = &vhpet->timer[i].arg; + arg->vhpet = vhpet; + arg->timer_num = i; + } + + return (vhpet); +} + +void +vhpet_cleanup(struct vhpet *vhpet) +{ + int i; + + for (i = 0; i < VHPET_NUM_TIMERS; i++) + callout_drain(&vhpet->timer[i].callout); + + mtx_destroy(&vhpet->mtx); + free(vhpet, M_VHPET); +} + +int +vhpet_getcap(struct vm_hpet_cap *cap) +{ + + cap->capabilities = vhpet_capabilities(); + return (0); +} + +#ifdef BHYVE_SNAPSHOT +int +vhpet_snapshot(struct vhpet *vhpet, struct vm_snapshot_meta *meta) +{ + int i, ret; + uint32_t countbase; + + SNAPSHOT_VAR_OR_LEAVE(vhpet->freq_sbt, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vhpet->config, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vhpet->isr, meta, ret, done); + + /* at restore time the countbase should have the value it had when the + * snapshot was created; since the value is not directly kept in + * vhpet->countbase, but rather computed relative to the current system + * uptime using countbase_sbt, save the value returned by vhpet_counter + */ + if (meta->op == VM_SNAPSHOT_SAVE) + countbase = vhpet_counter(vhpet, NULL); + SNAPSHOT_VAR_OR_LEAVE(countbase, meta, ret, done); + if (meta->op == VM_SNAPSHOT_RESTORE) + vhpet->countbase = countbase; + + for (i = 0; i < nitems(vhpet->timer); i++) { + SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].cap_config, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].msireg, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].compval, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].comprate, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].callout_sbt, + meta, ret, done); + } + +done: + return (ret); +} + +int +vhpet_restore_time(struct vhpet *vhpet) +{ + if (vhpet_counter_enabled(vhpet)) + vhpet_start_counting(vhpet); + + return (0); +} +#endif diff --git a/sys/amd64/vmm/io/vhpet.h b/sys/amd64/vmm/io/vhpet.h new file mode 100644 index 000000000000..4cc000ec70cf --- /dev/null +++ b/sys/amd64/vmm/io/vhpet.h @@ -0,0 +1,53 
@@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VHPET_H_ +#define _VHPET_H_ + +#define VHPET_BASE 0xfed00000 +#define VHPET_SIZE 1024 + +#ifdef _KERNEL +struct vm_snapshot_meta; + +struct vhpet *vhpet_init(struct vm *vm); +void vhpet_cleanup(struct vhpet *vhpet); +int vhpet_mmio_write(struct vcpu *vcpu, uint64_t gpa, uint64_t val, + int size, void *arg); +int vhpet_mmio_read(struct vcpu *vcpu, uint64_t gpa, uint64_t *val, + int size, void *arg); +int vhpet_getcap(struct vm_hpet_cap *cap); +#ifdef BHYVE_SNAPSHOT +int vhpet_snapshot(struct vhpet *vhpet, struct vm_snapshot_meta *meta); +int vhpet_restore_time(struct vhpet *vhpet); +#endif + +#endif /* _KERNEL */ + +#endif /* _VHPET_H_ */ diff --git a/sys/amd64/vmm/io/vioapic.c b/sys/amd64/vmm/io/vioapic.c new file mode 100644 index 000000000000..7df6193d6dc0 --- /dev/null +++ b/sys/amd64/vmm/io/vioapic.c @@ -0,0 +1,545 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +#include "opt_bhyve_snapshot.h" + +#include <sys/param.h> +#include <sys/queue.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> + +#include <x86/apicreg.h> +#include <machine/vmm.h> +#include <machine/vmm_snapshot.h> + +#include <dev/vmm/vmm_ktr.h> + +#include "vmm_lapic.h" +#include "vlapic.h" +#include "vioapic.h" + +#define IOREGSEL 0x00 +#define IOWIN 0x10 + +#define REDIR_ENTRIES 32 +#define RTBL_RO_BITS ((uint64_t)(IOART_REM_IRR | IOART_DELIVS)) + +struct vioapic { + struct vm *vm; + struct mtx mtx; + uint32_t id; + uint32_t ioregsel; + struct { + uint64_t reg; + int acnt; /* sum of pin asserts (+1) and deasserts (-1) */ + } rtbl[REDIR_ENTRIES]; +}; + +#define VIOAPIC_LOCK(vioapic) mtx_lock_spin(&((vioapic)->mtx)) +#define VIOAPIC_UNLOCK(vioapic) mtx_unlock_spin(&((vioapic)->mtx)) +#define VIOAPIC_LOCKED(vioapic) mtx_owned(&((vioapic)->mtx)) + +static MALLOC_DEFINE(M_VIOAPIC, "vioapic", "bhyve virtual ioapic"); + +#define VIOAPIC_CTR1(vioapic, fmt, a1) \ + VM_CTR1((vioapic)->vm, fmt, a1) + +#define VIOAPIC_CTR2(vioapic, fmt, a1, a2) \ + VM_CTR2((vioapic)->vm, fmt, a1, a2) + +#define VIOAPIC_CTR3(vioapic, fmt, a1, a2, a3) \ + VM_CTR3((vioapic)->vm, fmt, a1, a2, a3) + +#define VIOAPIC_CTR4(vioapic, fmt, a1, a2, a3, a4) \ + VM_CTR4((vioapic)->vm, fmt, a1, a2, a3, a4) + +#ifdef KTR +static const char * +pinstate_str(bool asserted) +{ + + if (asserted) + return ("asserted"); + else + return ("deasserted"); +} +#endif + +static void +vioapic_send_intr(struct vioapic *vioapic, int pin) +{ + int vector, delmode; + uint32_t low, high, dest; + bool level, phys; + + KASSERT(pin >= 0 && pin < REDIR_ENTRIES, + ("vioapic_set_pinstate: invalid pin number %d", pin)); + + KASSERT(VIOAPIC_LOCKED(vioapic), + ("vioapic_set_pinstate: vioapic is not locked")); + + low = vioapic->rtbl[pin].reg; + high = vioapic->rtbl[pin].reg >> 32; + + if ((low & IOART_INTMASK) == IOART_INTMSET) { + VIOAPIC_CTR1(vioapic, "ioapic pin%d: masked", pin); + return; + } + + phys = ((low & IOART_DESTMOD) == IOART_DESTPHY); + delmode = low & IOART_DELMOD; + level = low & IOART_TRGRLVL ? true : false; + if (level) { + if ((low & IOART_REM_IRR) != 0) { + VIOAPIC_CTR1(vioapic, "ioapic pin%d: irr pending", + pin); + return; + } + vioapic->rtbl[pin].reg |= IOART_REM_IRR; + } + + vector = low & IOART_INTVEC; + dest = high >> APIC_ID_SHIFT; + /* + * Ideally we'd just call lapic_intr_msi() here with the + * constructed MSI instead of interpreting it for ourselves. + * But until/unless we support emulated IOMMUs with interrupt + * remapping, interpretation is simple. 
We just need to mask + * in the Extended Destination ID bits for the 15-bit + * enlightenment (http://david.woodhou.se/ExtDestId.pdf) + */ + dest |= ((high & APIC_EXT_ID_MASK) >> APIC_EXT_ID_SHIFT) << 8; + vlapic_deliver_intr(vioapic->vm, level, dest, phys, delmode, vector); +} + +static void +vioapic_set_pinstate(struct vioapic *vioapic, int pin, bool newstate) +{ + int oldcnt, newcnt; + bool needintr; + + KASSERT(pin >= 0 && pin < REDIR_ENTRIES, + ("vioapic_set_pinstate: invalid pin number %d", pin)); + + KASSERT(VIOAPIC_LOCKED(vioapic), + ("vioapic_set_pinstate: vioapic is not locked")); + + oldcnt = vioapic->rtbl[pin].acnt; + if (newstate) + vioapic->rtbl[pin].acnt++; + else + vioapic->rtbl[pin].acnt--; + newcnt = vioapic->rtbl[pin].acnt; + + if (newcnt < 0) { + VIOAPIC_CTR2(vioapic, "ioapic pin%d: bad acnt %d", + pin, newcnt); + } + + needintr = false; + if (oldcnt == 0 && newcnt == 1) { + needintr = true; + VIOAPIC_CTR1(vioapic, "ioapic pin%d: asserted", pin); + } else if (oldcnt == 1 && newcnt == 0) { + VIOAPIC_CTR1(vioapic, "ioapic pin%d: deasserted", pin); + } else { + VIOAPIC_CTR3(vioapic, "ioapic pin%d: %s, ignored, acnt %d", + pin, pinstate_str(newstate), newcnt); + } + + if (needintr) + vioapic_send_intr(vioapic, pin); +} + +enum irqstate { + IRQSTATE_ASSERT, + IRQSTATE_DEASSERT, + IRQSTATE_PULSE +}; + +static int +vioapic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate) +{ + struct vioapic *vioapic; + + if (irq < 0 || irq >= REDIR_ENTRIES) + return (EINVAL); + + vioapic = vm_ioapic(vm); + + VIOAPIC_LOCK(vioapic); + switch (irqstate) { + case IRQSTATE_ASSERT: + vioapic_set_pinstate(vioapic, irq, true); + break; + case IRQSTATE_DEASSERT: + vioapic_set_pinstate(vioapic, irq, false); + break; + case IRQSTATE_PULSE: + vioapic_set_pinstate(vioapic, irq, true); + vioapic_set_pinstate(vioapic, irq, false); + break; + default: + panic("vioapic_set_irqstate: invalid irqstate %d", irqstate); + } + VIOAPIC_UNLOCK(vioapic); + + return (0); +} + +int +vioapic_assert_irq(struct vm *vm, int irq) +{ + + return (vioapic_set_irqstate(vm, irq, IRQSTATE_ASSERT)); +} + +int +vioapic_deassert_irq(struct vm *vm, int irq) +{ + + return (vioapic_set_irqstate(vm, irq, IRQSTATE_DEASSERT)); +} + +int +vioapic_pulse_irq(struct vm *vm, int irq) +{ + + return (vioapic_set_irqstate(vm, irq, IRQSTATE_PULSE)); +} + +/* + * Reset the vlapic's trigger-mode register to reflect the ioapic pin + * configuration. + */ +static void +vioapic_update_tmr(struct vcpu *vcpu, void *arg) +{ + struct vioapic *vioapic; + struct vlapic *vlapic; + uint32_t low, high, dest; + int delmode, pin, vector; + bool level, phys; + + vlapic = vm_lapic(vcpu); + vioapic = vm_ioapic(vcpu_vm(vcpu)); + + VIOAPIC_LOCK(vioapic); + /* + * Reset all vectors to be edge-triggered. + */ + vlapic_reset_tmr(vlapic); + for (pin = 0; pin < REDIR_ENTRIES; pin++) { + low = vioapic->rtbl[pin].reg; + high = vioapic->rtbl[pin].reg >> 32; + + level = low & IOART_TRGRLVL ? true : false; + if (!level) + continue; + + /* + * For a level-triggered 'pin' let the vlapic figure out if + * an assertion on this 'pin' would result in an interrupt + * being delivered to it. If yes, then it will modify the + * TMR bit associated with this vector to level-triggered. 
+ */ + phys = ((low & IOART_DESTMOD) == IOART_DESTPHY); + delmode = low & IOART_DELMOD; + vector = low & IOART_INTVEC; + dest = high >> APIC_ID_SHIFT; + vlapic_set_tmr_level(vlapic, dest, phys, delmode, vector); + } + VIOAPIC_UNLOCK(vioapic); +} + +static uint32_t +vioapic_read(struct vioapic *vioapic, struct vcpu *vcpu, uint32_t addr) +{ + int regnum, pin, rshift; + + regnum = addr & 0xff; + switch (regnum) { + case IOAPIC_ID: + return (vioapic->id); + break; + case IOAPIC_VER: + return (((REDIR_ENTRIES - 1) << MAXREDIRSHIFT) | 0x11); + break; + case IOAPIC_ARB: + return (vioapic->id); + break; + default: + break; + } + + /* redirection table entries */ + if (regnum >= IOAPIC_REDTBL && + regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) { + pin = (regnum - IOAPIC_REDTBL) / 2; + if ((regnum - IOAPIC_REDTBL) % 2) + rshift = 32; + else + rshift = 0; + + return (vioapic->rtbl[pin].reg >> rshift); + } + + return (0); +} + +static void +vioapic_write(struct vioapic *vioapic, struct vcpu *vcpu, uint32_t addr, + uint32_t data) +{ + uint64_t data64, mask64; + uint64_t last, changed; + int regnum, pin, lshift; + cpuset_t allvcpus; + + regnum = addr & 0xff; + switch (regnum) { + case IOAPIC_ID: + vioapic->id = data & APIC_ID_MASK; + break; + case IOAPIC_VER: + case IOAPIC_ARB: + /* readonly */ + break; + default: + break; + } + + /* redirection table entries */ + if (regnum >= IOAPIC_REDTBL && + regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) { + pin = (regnum - IOAPIC_REDTBL) / 2; + if ((regnum - IOAPIC_REDTBL) % 2) + lshift = 32; + else + lshift = 0; + + last = vioapic->rtbl[pin].reg; + + data64 = (uint64_t)data << lshift; + mask64 = (uint64_t)0xffffffff << lshift; + vioapic->rtbl[pin].reg &= ~mask64 | RTBL_RO_BITS; + vioapic->rtbl[pin].reg |= data64 & ~RTBL_RO_BITS; + + /* + * Switching from level to edge triggering will clear the IRR + * bit. This is what FreeBSD will do in order to EOI an + * interrupt when the IO-APIC doesn't support targeted EOI (see + * _ioapic_eoi_source). + */ + if ((vioapic->rtbl[pin].reg & IOART_TRGRMOD) == IOART_TRGREDG && + (vioapic->rtbl[pin].reg & IOART_REM_IRR) != 0) + vioapic->rtbl[pin].reg &= ~IOART_REM_IRR; + + VIOAPIC_CTR2(vioapic, "ioapic pin%d: redir table entry %#lx", + pin, vioapic->rtbl[pin].reg); + + /* + * If any fields in the redirection table entry (except mask + * or polarity) have changed then rendezvous all the vcpus + * to update their vlapic trigger-mode registers. + */ + changed = last ^ vioapic->rtbl[pin].reg; + if (changed & ~(IOART_INTMASK | IOART_INTPOL)) { + VIOAPIC_CTR1(vioapic, "ioapic pin%d: recalculate " + "vlapic trigger-mode register", pin); + VIOAPIC_UNLOCK(vioapic); + allvcpus = vm_active_cpus(vioapic->vm); + (void)vm_smp_rendezvous(vcpu, allvcpus, + vioapic_update_tmr, NULL); + VIOAPIC_LOCK(vioapic); + } + + /* + * Generate an interrupt if the following conditions are met: + * - pin trigger mode is level + * - pin level is asserted + */ + if ((vioapic->rtbl[pin].reg & IOART_TRGRMOD) == IOART_TRGRLVL && + (vioapic->rtbl[pin].acnt > 0)) { + VIOAPIC_CTR2(vioapic, "ioapic pin%d: asserted at rtbl " + "write, acnt %d", pin, vioapic->rtbl[pin].acnt); + vioapic_send_intr(vioapic, pin); + } + } +} + +static int +vioapic_mmio_rw(struct vioapic *vioapic, struct vcpu *vcpu, uint64_t gpa, + uint64_t *data, int size, bool doread) +{ + uint64_t offset; + + offset = gpa - VIOAPIC_BASE; + + /* + * The IOAPIC specification allows 32-bit wide accesses to the + * IOREGSEL (offset 0) and IOWIN (offset 16) registers. 
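 *
 * For example, to read the low half of pin 2's redirection entry the
 * guest writes the register index IOAPIC_REDTBL + 2 * 2 (0x14) to
 * IOREGSEL and then reads IOWIN; the vioapic_read() call below turns
 * that into the low 32 bits of rtbl[2].reg.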
+ */ + if (size != 4 || (offset != IOREGSEL && offset != IOWIN)) { + if (doread) + *data = 0; + return (0); + } + + VIOAPIC_LOCK(vioapic); + if (offset == IOREGSEL) { + if (doread) + *data = vioapic->ioregsel; + else + vioapic->ioregsel = *data; + } else { + if (doread) { + *data = vioapic_read(vioapic, vcpu, + vioapic->ioregsel); + } else { + vioapic_write(vioapic, vcpu, vioapic->ioregsel, + *data); + } + } + VIOAPIC_UNLOCK(vioapic); + + return (0); +} + +int +vioapic_mmio_read(struct vcpu *vcpu, uint64_t gpa, uint64_t *rval, + int size, void *arg) +{ + int error; + struct vioapic *vioapic; + + vioapic = vm_ioapic(vcpu_vm(vcpu)); + error = vioapic_mmio_rw(vioapic, vcpu, gpa, rval, size, true); + return (error); +} + +int +vioapic_mmio_write(struct vcpu *vcpu, uint64_t gpa, uint64_t wval, + int size, void *arg) +{ + int error; + struct vioapic *vioapic; + + vioapic = vm_ioapic(vcpu_vm(vcpu)); + error = vioapic_mmio_rw(vioapic, vcpu, gpa, &wval, size, false); + return (error); +} + +void +vioapic_process_eoi(struct vm *vm, int vector) +{ + struct vioapic *vioapic; + int pin; + + KASSERT(vector >= 0 && vector < 256, + ("vioapic_process_eoi: invalid vector %d", vector)); + + vioapic = vm_ioapic(vm); + VIOAPIC_CTR1(vioapic, "ioapic processing eoi for vector %d", vector); + + /* + * XXX keep track of the pins associated with this vector instead + * of iterating on every single pin each time. + */ + VIOAPIC_LOCK(vioapic); + for (pin = 0; pin < REDIR_ENTRIES; pin++) { + if ((vioapic->rtbl[pin].reg & IOART_REM_IRR) == 0) + continue; + if ((vioapic->rtbl[pin].reg & IOART_INTVEC) != vector) + continue; + vioapic->rtbl[pin].reg &= ~IOART_REM_IRR; + if (vioapic->rtbl[pin].acnt > 0) { + VIOAPIC_CTR2(vioapic, "ioapic pin%d: asserted at eoi, " + "acnt %d", pin, vioapic->rtbl[pin].acnt); + vioapic_send_intr(vioapic, pin); + } + } + VIOAPIC_UNLOCK(vioapic); +} + +struct vioapic * +vioapic_init(struct vm *vm) +{ + int i; + struct vioapic *vioapic; + + vioapic = malloc(sizeof(struct vioapic), M_VIOAPIC, M_WAITOK | M_ZERO); + + vioapic->vm = vm; + mtx_init(&vioapic->mtx, "vioapic lock", NULL, MTX_SPIN); + + /* Initialize all redirection entries to mask all interrupts */ + for (i = 0; i < REDIR_ENTRIES; i++) + vioapic->rtbl[i].reg = 0x0001000000010000UL; + + return (vioapic); +} + +void +vioapic_cleanup(struct vioapic *vioapic) +{ + + mtx_destroy(&vioapic->mtx); + free(vioapic, M_VIOAPIC); +} + +int +vioapic_pincount(struct vm *vm) +{ + + return (REDIR_ENTRIES); +} + +#ifdef BHYVE_SNAPSHOT +int +vioapic_snapshot(struct vioapic *vioapic, struct vm_snapshot_meta *meta) +{ + int ret; + int i; + + SNAPSHOT_VAR_OR_LEAVE(vioapic->ioregsel, meta, ret, done); + + for (i = 0; i < nitems(vioapic->rtbl); i++) { + SNAPSHOT_VAR_OR_LEAVE(vioapic->rtbl[i].reg, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vioapic->rtbl[i].acnt, meta, ret, done); + } + +done: + return (ret); +} +#endif diff --git a/sys/amd64/vmm/io/vioapic.h b/sys/amd64/vmm/io/vioapic.h new file mode 100644 index 000000000000..ac05d95357ff --- /dev/null +++ b/sys/amd64/vmm/io/vioapic.h @@ -0,0 +1,60 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VIOAPIC_H_ +#define _VIOAPIC_H_ + +#define VIOAPIC_BASE 0xFEC00000 +#define VIOAPIC_SIZE 4096 + +#ifdef _KERNEL +struct vm_snapshot_meta; + +struct vioapic *vioapic_init(struct vm *vm); +void vioapic_cleanup(struct vioapic *vioapic); + +int vioapic_assert_irq(struct vm *vm, int irq); +int vioapic_deassert_irq(struct vm *vm, int irq); +int vioapic_pulse_irq(struct vm *vm, int irq); + +int vioapic_mmio_write(struct vcpu *vcpu, uint64_t gpa, + uint64_t wval, int size, void *arg); +int vioapic_mmio_read(struct vcpu *vcpu, uint64_t gpa, + uint64_t *rval, int size, void *arg); + +int vioapic_pincount(struct vm *vm); +void vioapic_process_eoi(struct vm *vm, int vector); +#ifdef BHYVE_SNAPSHOT +int vioapic_snapshot(struct vioapic *vioapic, + struct vm_snapshot_meta *meta); +#endif + +#endif /* _KERNEL */ + +#endif /* _VIOAPIC_H_ */ diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c new file mode 100644 index 000000000000..9879dfa164a4 --- /dev/null +++ b/sys/amd64/vmm/io/vlapic.c @@ -0,0 +1,1907 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * Copyright (c) 2019 Joyent, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +#include "opt_bhyve_snapshot.h" + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/systm.h> +#include <sys/smp.h> + +#include <x86/specialreg.h> +#include <x86/apicreg.h> + +#include <machine/clock.h> +#include <machine/smp.h> +#include <machine/vmm.h> +#include <machine/vmm_snapshot.h> + +#include <dev/vmm/vmm_ktr.h> + +#include "vmm_lapic.h" +#include "vmm_stat.h" + +#include "vlapic.h" +#include "vlapic_priv.h" +#include "vioapic.h" + +#define PRIO(x) ((x) >> 4) + +#define VLAPIC_VERSION (0x14) + +#define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0) + +/* + * The 'vlapic->timer_mtx' is used to provide mutual exclusion between the + * vlapic_callout_handler() and vcpu accesses to: + * - timer_freq_bt, timer_period_bt, timer_fire_bt + * - timer LVT register + */ +#define VLAPIC_TIMER_LOCK(vlapic) mtx_lock_spin(&((vlapic)->timer_mtx)) +#define VLAPIC_TIMER_UNLOCK(vlapic) mtx_unlock_spin(&((vlapic)->timer_mtx)) +#define VLAPIC_TIMER_LOCKED(vlapic) mtx_owned(&((vlapic)->timer_mtx)) + +/* + * APIC timer frequency: + * - arbitrary but chosen to be in the ballpark of contemporary hardware. + * - power-of-two to avoid loss of precision when converted to a bintime. 
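 *
 * For example, 128 * 1024 * 1024 Hz is 2^27 Hz, so FREQ2BT() produces a
 * per-tick fraction of exactly 2^64 / 2^27 = 2^37, and the repeated
 * bintime_mul()/bintime_add() operations on the timer period accumulate
 * no rounding error.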
+ */ +#define VLAPIC_BUS_FREQ (128 * 1024 * 1024) + +static void vlapic_set_error(struct vlapic *, uint32_t, bool); +static void vlapic_callout_handler(void *arg); +static void vlapic_reset(struct vlapic *vlapic); + +static __inline uint32_t +vlapic_get_id(struct vlapic *vlapic) +{ + + if (x2apic(vlapic)) + return (vlapic->vcpuid); + else + return (vlapic->vcpuid << 24); +} + +static uint32_t +x2apic_ldr(struct vlapic *vlapic) +{ + int apicid; + uint32_t ldr; + + apicid = vlapic_get_id(vlapic); + ldr = 1 << (apicid & 0xf); + ldr |= (apicid & 0xffff0) << 12; + return (ldr); +} + +void +vlapic_dfr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + if (x2apic(vlapic)) { + VM_CTR1(vlapic->vm, "ignoring write to DFR in x2apic mode: %#x", + lapic->dfr); + lapic->dfr = 0; + return; + } + + lapic->dfr &= APIC_DFR_MODEL_MASK; + lapic->dfr |= APIC_DFR_RESERVED; + + if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT) + VLAPIC_CTR0(vlapic, "vlapic DFR in Flat Model"); + else if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER) + VLAPIC_CTR0(vlapic, "vlapic DFR in Cluster Model"); + else + VLAPIC_CTR1(vlapic, "DFR in Unknown Model %#x", lapic->dfr); +} + +void +vlapic_ldr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + + /* LDR is read-only in x2apic mode */ + if (x2apic(vlapic)) { + VLAPIC_CTR1(vlapic, "ignoring write to LDR in x2apic mode: %#x", + lapic->ldr); + lapic->ldr = x2apic_ldr(vlapic); + } else { + lapic->ldr &= ~APIC_LDR_RESERVED; + VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr); + } +} + +void +vlapic_id_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + /* + * We don't allow the ID register to be modified so reset it back to + * its default value. + */ + lapic = vlapic->apic_page; + lapic->id = vlapic_get_id(vlapic); +} + +static int +vlapic_timer_divisor(uint32_t dcr) +{ + switch (dcr & 0xB) { + case APIC_TDCR_1: + return (1); + case APIC_TDCR_2: + return (2); + case APIC_TDCR_4: + return (4); + case APIC_TDCR_8: + return (8); + case APIC_TDCR_16: + return (16); + case APIC_TDCR_32: + return (32); + case APIC_TDCR_64: + return (64); + case APIC_TDCR_128: + return (128); + default: + panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr); + } +} + +#if 0 +static inline void +vlapic_dump_lvt(uint32_t offset, uint32_t *lvt) +{ + printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset, + *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS, + *lvt & APIC_LVTT_M); +} +#endif + +static uint32_t +vlapic_get_ccr(struct vlapic *vlapic) +{ + struct bintime bt_now, bt_rem; + struct LAPIC *lapic __diagused; + uint32_t ccr; + + ccr = 0; + lapic = vlapic->apic_page; + + VLAPIC_TIMER_LOCK(vlapic); + if (callout_active(&vlapic->callout)) { + /* + * If the timer is scheduled to expire in the future then + * compute the value of 'ccr' based on the remaining time. 
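 *
 * In other words 'ccr' is the remaining time multiplied by the timer
 * frequency; e.g. with the divide-by-1 setting (timer frequency
 * VLAPIC_BUS_FREQ) and 1.5 seconds left, the guest reads back roughly
 * 1.5 * 128M = 0xc000000 ticks.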
+ */ + binuptime(&bt_now); + if (bintime_cmp(&vlapic->timer_fire_bt, &bt_now, >)) { + bt_rem = vlapic->timer_fire_bt; + bintime_sub(&bt_rem, &bt_now); + ccr += bt_rem.sec * BT2FREQ(&vlapic->timer_freq_bt); + ccr += bt_rem.frac / vlapic->timer_freq_bt.frac; + } + } + KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %#x, " + "icr_timer is %#x", ccr, lapic->icr_timer)); + VLAPIC_CTR2(vlapic, "vlapic ccr_timer = %#x, icr_timer = %#x", + ccr, lapic->icr_timer); + VLAPIC_TIMER_UNLOCK(vlapic); + return (ccr); +} + +void +vlapic_dcr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + int divisor; + + lapic = vlapic->apic_page; + VLAPIC_TIMER_LOCK(vlapic); + + divisor = vlapic_timer_divisor(lapic->dcr_timer); + VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d", + lapic->dcr_timer, divisor); + + /* + * Update the timer frequency and the timer period. + * + * XXX changes to the frequency divider will not take effect until + * the timer is reloaded. + */ + FREQ2BT(VLAPIC_BUS_FREQ / divisor, &vlapic->timer_freq_bt); + vlapic->timer_period_bt = vlapic->timer_freq_bt; + bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer); + + VLAPIC_TIMER_UNLOCK(vlapic); +} + +void +vlapic_esr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + lapic->esr = vlapic->esr_pending; + vlapic->esr_pending = 0; +} + +int +vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level) +{ + struct LAPIC *lapic; + uint32_t *irrptr, *tmrptr, mask; + int idx; + + KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector)); + + lapic = vlapic->apic_page; + if (!(lapic->svr & APIC_SVR_ENABLE)) { + VLAPIC_CTR1(vlapic, "vlapic is software disabled, ignoring " + "interrupt %d", vector); + return (0); + } + + if (vector < 16) { + vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR, + false); + VLAPIC_CTR1(vlapic, "vlapic ignoring interrupt to vector %d", + vector); + return (1); + } + + if (vlapic->ops.set_intr_ready) + return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level)); + + idx = (vector / 32) * 4; + mask = 1 << (vector % 32); + + irrptr = &lapic->irr0; + atomic_set_int(&irrptr[idx], mask); + + /* + * Verify that the trigger-mode of the interrupt matches with + * the vlapic TMR registers. + */ + tmrptr = &lapic->tmr0; + if ((tmrptr[idx] & mask) != (level ? mask : 0)) { + VLAPIC_CTR3(vlapic, "vlapic TMR[%d] is 0x%08x but " + "interrupt is %s-triggered", idx / 4, tmrptr[idx], + level ? "level" : "edge"); + } + + VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready"); + return (1); +} + +static __inline uint32_t * +vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset) +{ + struct LAPIC *lapic = vlapic->apic_page; + int i; + + switch (offset) { + case APIC_OFFSET_CMCI_LVT: + return (&lapic->lvt_cmci); + case APIC_OFFSET_TIMER_LVT ... 
APIC_OFFSET_ERROR_LVT: + i = (offset - APIC_OFFSET_TIMER_LVT) >> 2; + return ((&lapic->lvt_timer) + i); + default: + panic("vlapic_get_lvt: invalid LVT\n"); + } +} + +static __inline int +lvt_off_to_idx(uint32_t offset) +{ + int index; + + switch (offset) { + case APIC_OFFSET_CMCI_LVT: + index = APIC_LVT_CMCI; + break; + case APIC_OFFSET_TIMER_LVT: + index = APIC_LVT_TIMER; + break; + case APIC_OFFSET_THERM_LVT: + index = APIC_LVT_THERMAL; + break; + case APIC_OFFSET_PERF_LVT: + index = APIC_LVT_PMC; + break; + case APIC_OFFSET_LINT0_LVT: + index = APIC_LVT_LINT0; + break; + case APIC_OFFSET_LINT1_LVT: + index = APIC_LVT_LINT1; + break; + case APIC_OFFSET_ERROR_LVT: + index = APIC_LVT_ERROR; + break; + default: + index = -1; + break; + } + KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: " + "invalid lvt index %d for offset %#x", index, offset)); + + return (index); +} + +static __inline uint32_t +vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset) +{ + int idx; + uint32_t val; + + idx = lvt_off_to_idx(offset); + val = atomic_load_acq_32(&vlapic->lvt_last[idx]); + return (val); +} + +void +vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset) +{ + uint32_t *lvtptr, mask, val; + struct LAPIC *lapic; + int idx; + + lapic = vlapic->apic_page; + lvtptr = vlapic_get_lvtptr(vlapic, offset); + val = *lvtptr; + idx = lvt_off_to_idx(offset); + + if (!(lapic->svr & APIC_SVR_ENABLE)) + val |= APIC_LVT_M; + mask = APIC_LVT_M | APIC_LVT_DS | APIC_LVT_VECTOR; + switch (offset) { + case APIC_OFFSET_TIMER_LVT: + mask |= APIC_LVTT_TM; + break; + case APIC_OFFSET_ERROR_LVT: + break; + case APIC_OFFSET_LINT0_LVT: + case APIC_OFFSET_LINT1_LVT: + mask |= APIC_LVT_TM | APIC_LVT_RIRR | APIC_LVT_IIPP; + /* FALLTHROUGH */ + default: + mask |= APIC_LVT_DM; + break; + } + val &= mask; + *lvtptr = val; + atomic_store_rel_32(&vlapic->lvt_last[idx], val); +} + +static void +vlapic_mask_lvts(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + + lapic->lvt_cmci |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_CMCI_LVT); + + lapic->lvt_timer |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_TIMER_LVT); + + lapic->lvt_thermal |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_THERM_LVT); + + lapic->lvt_pcint |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_PERF_LVT); + + lapic->lvt_lint0 |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT0_LVT); + + lapic->lvt_lint1 |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT1_LVT); + + lapic->lvt_error |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_ERROR_LVT); +} + +static int +vlapic_fire_lvt(struct vlapic *vlapic, u_int lvt) +{ + uint32_t mode, reg, vec; + + reg = atomic_load_acq_32(&vlapic->lvt_last[lvt]); + + if (reg & APIC_LVT_M) + return (0); + vec = reg & APIC_LVT_VECTOR; + mode = reg & APIC_LVT_DM; + + switch (mode) { + case APIC_LVT_DM_FIXED: + if (vec < 16) { + vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, + lvt == APIC_LVT_ERROR); + return (0); + } + if (vlapic_set_intr_ready(vlapic, vec, false)) + vcpu_notify_event(vlapic->vcpu, true); + break; + case APIC_LVT_DM_NMI: + vm_inject_nmi(vlapic->vcpu); + break; + case APIC_LVT_DM_EXTINT: + vm_inject_extint(vlapic->vcpu); + break; + default: + // Other modes ignored + return (0); + } + return (1); +} + +#if 1 +static void +dump_isrvec_stk(struct vlapic *vlapic) +{ + int i; + uint32_t *isrptr; + + isrptr = &vlapic->apic_page->isr0; + for (i = 0; i < 8; i++) + 
printf("ISR%d 0x%08x\n", i, isrptr[i * 4]); + + for (i = 0; i <= vlapic->isrvec_stk_top; i++) + printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]); +} +#endif + +/* + * Algorithm adopted from section "Interrupt, Task and Processor Priority" + * in Intel Architecture Manual Vol 3a. + */ +static void +vlapic_update_ppr(struct vlapic *vlapic) +{ + int isrvec, tpr, ppr; + + /* + * Note that the value on the stack at index 0 is always 0. + * + * This is a placeholder for the value of ISRV when none of the + * bits is set in the ISRx registers. + */ + isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top]; + tpr = vlapic->apic_page->tpr; + +#if 1 + { + int i, lastprio, curprio, vector, idx; + uint32_t *isrptr; + + if (vlapic->isrvec_stk_top == 0 && isrvec != 0) + panic("isrvec_stk is corrupted: %d", isrvec); + + /* + * Make sure that the priority of the nested interrupts is + * always increasing. + */ + lastprio = -1; + for (i = 1; i <= vlapic->isrvec_stk_top; i++) { + curprio = PRIO(vlapic->isrvec_stk[i]); + if (curprio <= lastprio) { + dump_isrvec_stk(vlapic); + panic("isrvec_stk does not satisfy invariant"); + } + lastprio = curprio; + } + + /* + * Make sure that each bit set in the ISRx registers has a + * corresponding entry on the isrvec stack. + */ + i = 1; + isrptr = &vlapic->apic_page->isr0; + for (vector = 0; vector < 256; vector++) { + idx = (vector / 32) * 4; + if (isrptr[idx] & (1 << (vector % 32))) { + if (i > vlapic->isrvec_stk_top || + vlapic->isrvec_stk[i] != vector) { + dump_isrvec_stk(vlapic); + panic("ISR and isrvec_stk out of sync"); + } + i++; + } + } + } +#endif + + if (PRIO(tpr) >= PRIO(isrvec)) + ppr = tpr; + else + ppr = isrvec & 0xf0; + + vlapic->apic_page->ppr = ppr; + VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); +} + +void +vlapic_sync_tpr(struct vlapic *vlapic) +{ + vlapic_update_ppr(vlapic); +} + +static VMM_STAT(VLAPIC_GRATUITOUS_EOI, "EOI without any in-service interrupt"); + +static void +vlapic_process_eoi(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *isrptr, *tmrptr; + int i, idx, bitpos, vector; + + isrptr = &lapic->isr0; + tmrptr = &lapic->tmr0; + + for (i = 7; i >= 0; i--) { + idx = i * 4; + bitpos = fls(isrptr[idx]); + if (bitpos-- != 0) { + if (vlapic->isrvec_stk_top <= 0) { + panic("invalid vlapic isrvec_stk_top %d", + vlapic->isrvec_stk_top); + } + isrptr[idx] &= ~(1 << bitpos); + vector = i * 32 + bitpos; + VLAPIC_CTR1(vlapic, "EOI vector %d", vector); + VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi"); + vlapic->isrvec_stk_top--; + vlapic_update_ppr(vlapic); + if ((tmrptr[idx] & (1 << bitpos)) != 0) { + vioapic_process_eoi(vlapic->vm, vector); + } + return; + } + } + VLAPIC_CTR0(vlapic, "Gratuitous EOI"); + vmm_stat_incr(vlapic->vcpu, VLAPIC_GRATUITOUS_EOI, 1); +} + +static __inline int +vlapic_get_lvt_field(uint32_t lvt, uint32_t mask) +{ + + return (lvt & mask); +} + +static __inline int +vlapic_periodic_timer(struct vlapic *vlapic) +{ + uint32_t lvt; + + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); + + return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC)); +} + +static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic"); + +static void +vlapic_set_error(struct vlapic *vlapic, uint32_t mask, bool lvt_error) +{ + + vlapic->esr_pending |= mask; + + /* + * Avoid infinite recursion if the error LVT itself is configured with + * an illegal vector. 
+ */ + if (lvt_error) + return; + + if (vlapic_fire_lvt(vlapic, APIC_LVT_ERROR)) { + vmm_stat_incr(vlapic->vcpu, VLAPIC_INTR_ERROR, 1); + } +} + +static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic"); + +static void +vlapic_fire_timer(struct vlapic *vlapic) +{ + + KASSERT(VLAPIC_TIMER_LOCKED(vlapic), ("vlapic_fire_timer not locked")); + + if (vlapic_fire_lvt(vlapic, APIC_LVT_TIMER)) { + VLAPIC_CTR0(vlapic, "vlapic timer fired"); + vmm_stat_incr(vlapic->vcpu, VLAPIC_INTR_TIMER, 1); + } +} + +static VMM_STAT(VLAPIC_INTR_CMC, + "corrected machine check interrupts generated by vlapic"); + +void +vlapic_fire_cmci(struct vlapic *vlapic) +{ + + if (vlapic_fire_lvt(vlapic, APIC_LVT_CMCI)) { + vmm_stat_incr(vlapic->vcpu, VLAPIC_INTR_CMC, 1); + } +} + +static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_INDEX + 1, + "lvts triggered"); + +int +vlapic_trigger_lvt(struct vlapic *vlapic, int vector) +{ + + if (vlapic_enabled(vlapic) == false) { + /* + * When the local APIC is global/hardware disabled, + * LINT[1:0] pins are configured as INTR and NMI pins, + * respectively. + */ + switch (vector) { + case APIC_LVT_LINT0: + vm_inject_extint(vlapic->vcpu); + break; + case APIC_LVT_LINT1: + vm_inject_nmi(vlapic->vcpu); + break; + default: + break; + } + return (0); + } + + switch (vector) { + case APIC_LVT_LINT0: + case APIC_LVT_LINT1: + case APIC_LVT_TIMER: + case APIC_LVT_ERROR: + case APIC_LVT_PMC: + case APIC_LVT_THERMAL: + case APIC_LVT_CMCI: + if (vlapic_fire_lvt(vlapic, vector)) { + vmm_stat_array_incr(vlapic->vcpu, LVTS_TRIGGERRED, + vector, 1); + } + break; + default: + return (EINVAL); + } + return (0); +} + +static void +vlapic_callout_reset(struct vlapic *vlapic, sbintime_t t) +{ + callout_reset_sbt_curcpu(&vlapic->callout, t, 0, + vlapic_callout_handler, vlapic, 0); +} + +static void +vlapic_callout_handler(void *arg) +{ + struct vlapic *vlapic; + struct bintime bt, btnow; + sbintime_t rem_sbt; + + vlapic = arg; + + VLAPIC_TIMER_LOCK(vlapic); + if (callout_pending(&vlapic->callout)) /* callout was reset */ + goto done; + + if (!callout_active(&vlapic->callout)) /* callout was stopped */ + goto done; + + callout_deactivate(&vlapic->callout); + + vlapic_fire_timer(vlapic); + + if (vlapic_periodic_timer(vlapic)) { + binuptime(&btnow); + KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=), + ("vlapic callout at %#lx.%#lx, expected at %#lx.#%lx", + btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec, + vlapic->timer_fire_bt.frac)); + + /* + * Compute the delta between when the timer was supposed to + * fire and the present time. + */ + bt = btnow; + bintime_sub(&bt, &vlapic->timer_fire_bt); + + rem_sbt = bttosbt(vlapic->timer_period_bt); + if (bintime_cmp(&bt, &vlapic->timer_period_bt, <)) { + /* + * Adjust the time until the next countdown downward + * to account for the lost time. + */ + rem_sbt -= bttosbt(bt); + } else { + /* + * If the delta is greater than the timer period then + * just reset our time base instead of trying to catch + * up. 
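 *
 * For example, with a 10ms period a callout that fires 3ms late is
 * rescheduled for the remaining 7ms so the original phase is kept,
 * whereas one that fires 12ms late simply restarts a full period from
 * the current time.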
+ */ + vlapic->timer_fire_bt = btnow; + VLAPIC_CTR2(vlapic, "vlapic timer lagging by %lu " + "usecs, period is %lu usecs - resetting time base", + bttosbt(bt) / SBT_1US, + bttosbt(vlapic->timer_period_bt) / SBT_1US); + } + + bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); + vlapic_callout_reset(vlapic, rem_sbt); + } +done: + VLAPIC_TIMER_UNLOCK(vlapic); +} + +void +vlapic_icrtmr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + sbintime_t sbt; + uint32_t icr_timer; + + VLAPIC_TIMER_LOCK(vlapic); + + lapic = vlapic->apic_page; + icr_timer = lapic->icr_timer; + + vlapic->timer_period_bt = vlapic->timer_freq_bt; + bintime_mul(&vlapic->timer_period_bt, icr_timer); + + if (icr_timer != 0) { + binuptime(&vlapic->timer_fire_bt); + bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); + + sbt = bttosbt(vlapic->timer_period_bt); + vlapic_callout_reset(vlapic, sbt); + } else + callout_stop(&vlapic->callout); + + VLAPIC_TIMER_UNLOCK(vlapic); +} + +/* + * This function populates 'dmask' with the set of vcpus that match the + * addressing specified by the (dest, phys, lowprio) tuple. + * + * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit) + * or xAPIC (8-bit) destination field. + */ +static void +vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, + bool lowprio, bool x2apic_dest) +{ + struct vlapic *vlapic; + uint32_t dfr, ldr, ldest, cluster; + uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id; + cpuset_t amask; + int vcpuid; + + if ((x2apic_dest && dest == 0xffffffff) || + (!x2apic_dest && dest == 0xff)) { + /* + * Broadcast in both logical and physical modes. + */ + *dmask = vm_active_cpus(vm); + return; + } + + if (phys) { + /* + * Physical mode: destination is APIC ID. + */ + CPU_ZERO(dmask); + vcpuid = vm_apicid2vcpuid(vm, dest); + amask = vm_active_cpus(vm); + if (vcpuid < vm_get_maxcpus(vm) && CPU_ISSET(vcpuid, &amask)) + CPU_SET(vcpuid, dmask); + } else { + /* + * In the "Flat Model" the MDA is interpreted as an 8-bit wide + * bitmask. This model is only available in the xAPIC mode. + */ + mda_flat_ldest = dest & 0xff; + + /* + * In the "Cluster Model" the MDA is used to identify a + * specific cluster and a set of APICs in that cluster. + */ + if (x2apic_dest) { + mda_cluster_id = dest >> 16; + mda_cluster_ldest = dest & 0xffff; + } else { + mda_cluster_id = (dest >> 4) & 0xf; + mda_cluster_ldest = dest & 0xf; + } + + /* + * Logical mode: match each APIC that has a bit set + * in its LDR that matches a bit in the ldest. + */ + CPU_ZERO(dmask); + amask = vm_active_cpus(vm); + CPU_FOREACH_ISSET(vcpuid, &amask) { + vlapic = vm_lapic(vm_vcpu(vm, vcpuid)); + dfr = vlapic->apic_page->dfr; + ldr = vlapic->apic_page->ldr; + + if ((dfr & APIC_DFR_MODEL_MASK) == + APIC_DFR_MODEL_FLAT) { + ldest = ldr >> 24; + mda_ldest = mda_flat_ldest; + } else if ((dfr & APIC_DFR_MODEL_MASK) == + APIC_DFR_MODEL_CLUSTER) { + if (x2apic(vlapic)) { + cluster = ldr >> 16; + ldest = ldr & 0xffff; + } else { + cluster = ldr >> 28; + ldest = (ldr >> 24) & 0xf; + } + if (cluster != mda_cluster_id) + continue; + mda_ldest = mda_cluster_ldest; + } else { + /* + * Guest has configured a bad logical + * model for this vcpu - skip it. 
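 *
 * To make the cluster match above concrete: in xAPIC mode an MDA of
 * 0x21 selects cluster 2, member bit 0x1, and a vcpu whose LDR is
 * 0x23000000 (cluster 2, members 0x3) is included because
 * (0x1 & 0x3) != 0.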
+ */ + VLAPIC_CTR1(vlapic, "vlapic has bad logical " + "model %x - cannot deliver interrupt", dfr); + continue; + } + + if ((mda_ldest & ldest) != 0) { + CPU_SET(vcpuid, dmask); + if (lowprio) + break; + } + } + } +} + +static VMM_STAT(VLAPIC_IPI_SEND, "ipis sent from vcpu"); +static VMM_STAT(VLAPIC_IPI_RECV, "ipis received by vcpu"); + +static void +vlapic_set_tpr(struct vlapic *vlapic, uint8_t val) +{ + struct LAPIC *lapic = vlapic->apic_page; + + if (lapic->tpr != val) { + VLAPIC_CTR2(vlapic, "vlapic TPR changed from %#x to %#x", + lapic->tpr, val); + lapic->tpr = val; + vlapic_update_ppr(vlapic); + } +} + +static uint8_t +vlapic_get_tpr(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + + return (lapic->tpr); +} + +void +vlapic_set_cr8(struct vlapic *vlapic, uint64_t val) +{ + uint8_t tpr; + + if (val & ~0xf) { + vm_inject_gp(vlapic->vcpu); + return; + } + + tpr = val << 4; + vlapic_set_tpr(vlapic, tpr); +} + +uint64_t +vlapic_get_cr8(struct vlapic *vlapic) +{ + uint8_t tpr; + + tpr = vlapic_get_tpr(vlapic); + return (tpr >> 4); +} + +static bool +vlapic_is_icr_valid(uint64_t icrval) +{ + uint32_t mode = icrval & APIC_DELMODE_MASK; + uint32_t level = icrval & APIC_LEVEL_MASK; + uint32_t trigger = icrval & APIC_TRIGMOD_MASK; + uint32_t shorthand = icrval & APIC_DEST_MASK; + + switch (mode) { + case APIC_DELMODE_FIXED: + if (trigger == APIC_TRIGMOD_EDGE) + return (true); + /* + * AMD allows a level assert IPI and Intel converts a level + * assert IPI into an edge IPI. + */ + if (trigger == APIC_TRIGMOD_LEVEL && level == APIC_LEVEL_ASSERT) + return (true); + break; + case APIC_DELMODE_LOWPRIO: + case APIC_DELMODE_SMI: + case APIC_DELMODE_NMI: + case APIC_DELMODE_INIT: + if (trigger == APIC_TRIGMOD_EDGE && + (shorthand == APIC_DEST_DESTFLD || + shorthand == APIC_DEST_ALLESELF)) + return (true); + /* + * AMD allows a level assert IPI and Intel converts a level + * assert IPI into an edge IPI. + */ + if (trigger == APIC_TRIGMOD_LEVEL && + level == APIC_LEVEL_ASSERT && + (shorthand == APIC_DEST_DESTFLD || + shorthand == APIC_DEST_ALLESELF)) + return (true); + /* + * An level triggered deassert INIT is defined in the Intel + * Multiprocessor Specification and the Intel Software Developer + * Manual. Due to the MPS it's required to send a level assert + * INIT to a cpu and then a level deassert INIT. Some operating + * systems e.g. FreeBSD or Linux use that algorithm. According + * to the SDM a level deassert INIT is only supported by Pentium + * and P6 processors. It's always send to all cpus regardless of + * the destination or shorthand field. It resets the arbitration + * id register. This register is not software accessible and + * only required for the APIC bus arbitration. So, the level + * deassert INIT doesn't need any emulation and we should ignore + * it. The SDM also defines that newer processors don't support + * the level deassert INIT and it's not valid any more. As it's + * defined for older systems, it can't be invalid per se. + * Otherwise, backward compatibility would be broken. However, + * when returning false here, it'll be ignored which is the + * desired behaviour. + */ + if (mode == APIC_DELMODE_INIT && + trigger == APIC_TRIGMOD_LEVEL && + level == APIC_LEVEL_DEASSERT) + return (false); + break; + case APIC_DELMODE_STARTUP: + if (shorthand == APIC_DEST_DESTFLD || + shorthand == APIC_DEST_ALLESELF) + return (true); + break; + case APIC_DELMODE_RR: + /* Only available on AMD! 
*/ + if (trigger == APIC_TRIGMOD_EDGE && + shorthand == APIC_DEST_DESTFLD) + return (true); + break; + case APIC_DELMODE_RESV: + return (false); + default: + __assert_unreachable(); + } + + return (false); +} + +int +vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) +{ + int i; + bool phys; + cpuset_t dmask, ipimask; + uint64_t icrval; + uint32_t dest, vec, mode, shorthand; + struct vcpu *vcpu; + struct vm_exit *vmexit; + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + lapic->icr_lo &= ~APIC_DELSTAT_PEND; + icrval = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo; + + if (x2apic(vlapic)) + dest = icrval >> 32; + else + dest = icrval >> (32 + 24); + vec = icrval & APIC_VECTOR_MASK; + mode = icrval & APIC_DELMODE_MASK; + phys = (icrval & APIC_DESTMODE_LOG) == 0; + shorthand = icrval & APIC_DEST_MASK; + + VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec); + + switch (shorthand) { + case APIC_DEST_DESTFLD: + vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false, x2apic(vlapic)); + break; + case APIC_DEST_SELF: + CPU_SETOF(vlapic->vcpuid, &dmask); + break; + case APIC_DEST_ALLISELF: + dmask = vm_active_cpus(vlapic->vm); + break; + case APIC_DEST_ALLESELF: + dmask = vm_active_cpus(vlapic->vm); + CPU_CLR(vlapic->vcpuid, &dmask); + break; + default: + __assert_unreachable(); + } + + /* + * Ignore invalid combinations of the icr. + */ + if (!vlapic_is_icr_valid(icrval)) { + VLAPIC_CTR1(vlapic, "Ignoring invalid ICR %016lx", icrval); + return (0); + } + + /* + * ipimask is a set of vCPUs needing userland handling of the current + * IPI. + */ + CPU_ZERO(&ipimask); + + switch (mode) { + case APIC_DELMODE_FIXED: + if (vec < 16) { + vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, + false); + VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec); + return (0); + } + + CPU_FOREACH_ISSET(i, &dmask) { + vcpu = vm_vcpu(vlapic->vm, i); + lapic_intr_edge(vcpu, vec); + vmm_stat_incr(vlapic->vcpu, VLAPIC_IPI_SEND, 1); + vmm_stat_incr(vcpu, VLAPIC_IPI_RECV, 1); + VLAPIC_CTR2(vlapic, + "vlapic sending ipi %d to vcpuid %d", vec, i); + } + + break; + case APIC_DELMODE_NMI: + CPU_FOREACH_ISSET(i, &dmask) { + vcpu = vm_vcpu(vlapic->vm, i); + vm_inject_nmi(vcpu); + VLAPIC_CTR1(vlapic, + "vlapic sending ipi nmi to vcpuid %d", i); + } + + break; + case APIC_DELMODE_INIT: + case APIC_DELMODE_STARTUP: + if (!vlapic->ipi_exit) { + if (!phys) + break; + + i = vm_apicid2vcpuid(vlapic->vm, dest); + if (i >= vm_get_maxcpus(vlapic->vm) || + i == vlapic->vcpuid) + break; + + CPU_SETOF(i, &ipimask); + + break; + } + + CPU_COPY(&dmask, &ipimask); + break; + default: + return (1); + } + + if (!CPU_EMPTY(&ipimask)) { + vmexit = vm_exitinfo(vlapic->vcpu); + vmexit->exitcode = VM_EXITCODE_IPI; + vmexit->u.ipi.mode = mode; + vmexit->u.ipi.vector = vec; + *vm_exitinfo_cpuset(vlapic->vcpu) = ipimask; + + *retu = true; + } + + return (0); +} + +static void +vlapic_handle_init(struct vcpu *vcpu, void *arg) +{ + struct vlapic *vlapic = vm_lapic(vcpu); + + vlapic_reset(vlapic); +} + +int +vm_handle_ipi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu) +{ + struct vlapic *vlapic = vm_lapic(vcpu); + cpuset_t *dmask = vm_exitinfo_cpuset(vcpu); + uint8_t vec = vme->u.ipi.vector; + + *retu = true; + switch (vme->u.ipi.mode) { + case APIC_DELMODE_INIT: { + cpuset_t active, reinit; + + active = vm_active_cpus(vcpu_vm(vcpu)); + CPU_AND(&reinit, &active, dmask); + if (!CPU_EMPTY(&reinit)) { + vm_smp_rendezvous(vcpu, reinit, vlapic_handle_init, + NULL); + } + vm_await_start(vcpu_vm(vcpu), dmask); + + if 
(!vlapic->ipi_exit) + *retu = false; + + break; + } + case APIC_DELMODE_STARTUP: + /* + * Ignore SIPIs in any state other than wait-for-SIPI + */ + *dmask = vm_start_cpus(vcpu_vm(vcpu), dmask); + + if (CPU_EMPTY(dmask)) { + *retu = false; + break; + } + + /* + * Old bhyve versions don't support the IPI + * exit. Translate it into the old style. + */ + if (!vlapic->ipi_exit) { + vme->exitcode = VM_EXITCODE_SPINUP_AP; + vme->u.spinup_ap.vcpu = CPU_FFS(dmask) - 1; + vme->u.spinup_ap.rip = vec << PAGE_SHIFT; + } + + break; + default: + __assert_unreachable(); + } + + return (0); +} + +void +vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val) +{ + int vec; + + KASSERT(x2apic(vlapic), ("SELF_IPI does not exist in xAPIC mode")); + + vec = val & 0xff; + lapic_intr_edge(vlapic->vcpu, vec); + vmm_stat_incr(vlapic->vcpu, VLAPIC_IPI_SEND, 1); + vmm_stat_incr(vlapic->vcpu, VLAPIC_IPI_RECV, 1); + VLAPIC_CTR1(vlapic, "vlapic self-ipi %d", vec); +} + +int +vlapic_pending_intr(struct vlapic *vlapic, int *vecptr) +{ + struct LAPIC *lapic = vlapic->apic_page; + int idx, i, bitpos, vector; + uint32_t *irrptr, val; + + vlapic_update_ppr(vlapic); + + if (vlapic->ops.pending_intr) + return ((*vlapic->ops.pending_intr)(vlapic, vecptr)); + + irrptr = &lapic->irr0; + + for (i = 7; i >= 0; i--) { + idx = i * 4; + val = atomic_load_acq_int(&irrptr[idx]); + bitpos = fls(val); + if (bitpos != 0) { + vector = i * 32 + (bitpos - 1); + if (PRIO(vector) > PRIO(lapic->ppr)) { + VLAPIC_CTR1(vlapic, "pending intr %d", vector); + if (vecptr != NULL) + *vecptr = vector; + return (1); + } else + break; + } + } + return (0); +} + +void +vlapic_intr_accepted(struct vlapic *vlapic, int vector) +{ + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *irrptr, *isrptr; + int idx, stk_top; + + if (vlapic->ops.intr_accepted) + return ((*vlapic->ops.intr_accepted)(vlapic, vector)); + + /* + * clear the ready bit for vector being accepted in irr + * and set the vector as in service in isr. + */ + idx = (vector / 32) * 4; + + irrptr = &lapic->irr0; + atomic_clear_int(&irrptr[idx], 1 << (vector % 32)); + VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted"); + + isrptr = &lapic->isr0; + isrptr[idx] |= 1 << (vector % 32); + VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted"); + + /* + * Update the PPR + */ + vlapic->isrvec_stk_top++; + + stk_top = vlapic->isrvec_stk_top; + if (stk_top >= ISRVEC_STK_SIZE) + panic("isrvec_stk_top overflow %d", stk_top); + + vlapic->isrvec_stk[stk_top] = vector; +} + +void +vlapic_svr_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + uint32_t old, new, changed; + + lapic = vlapic->apic_page; + + new = lapic->svr; + old = vlapic->svr_last; + vlapic->svr_last = new; + + changed = old ^ new; + if ((changed & APIC_SVR_ENABLE) != 0) { + if ((new & APIC_SVR_ENABLE) == 0) { + /* + * The apic is now disabled so stop the apic timer + * and mask all the LVT entries. + */ + VLAPIC_CTR0(vlapic, "vlapic is software-disabled"); + VLAPIC_TIMER_LOCK(vlapic); + callout_stop(&vlapic->callout); + VLAPIC_TIMER_UNLOCK(vlapic); + vlapic_mask_lvts(vlapic); + } else { + /* + * The apic is now enabled so restart the apic timer + * if it is configured in periodic mode. 
+ */ + VLAPIC_CTR0(vlapic, "vlapic is software-enabled"); + if (vlapic_periodic_timer(vlapic)) + vlapic_icrtmr_write_handler(vlapic); + } + } +} + +int +vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t *data, bool *retu) +{ + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *reg; + int i; + + /* Ignore MMIO accesses in x2APIC mode */ + if (x2apic(vlapic) && mmio_access) { + VLAPIC_CTR1(vlapic, "MMIO read from offset %#lx in x2APIC mode", + offset); + *data = 0; + goto done; + } + + if (!x2apic(vlapic) && !mmio_access) { + /* + * XXX Generate GP fault for MSR accesses in xAPIC mode + */ + VLAPIC_CTR1(vlapic, "x2APIC MSR read from offset %#lx in " + "xAPIC mode", offset); + *data = 0; + goto done; + } + + if (offset > sizeof(*lapic)) { + *data = 0; + goto done; + } + + offset &= ~3; + switch(offset) + { + case APIC_OFFSET_ID: + *data = lapic->id; + break; + case APIC_OFFSET_VER: + *data = lapic->version; + break; + case APIC_OFFSET_TPR: + *data = vlapic_get_tpr(vlapic); + break; + case APIC_OFFSET_APR: + *data = lapic->apr; + break; + case APIC_OFFSET_PPR: + *data = lapic->ppr; + break; + case APIC_OFFSET_EOI: + *data = lapic->eoi; + break; + case APIC_OFFSET_LDR: + *data = lapic->ldr; + break; + case APIC_OFFSET_DFR: + *data = lapic->dfr; + break; + case APIC_OFFSET_SVR: + *data = lapic->svr; + break; + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + i = (offset - APIC_OFFSET_ISR0) >> 2; + reg = &lapic->isr0; + *data = *(reg + i); + break; + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + i = (offset - APIC_OFFSET_TMR0) >> 2; + reg = &lapic->tmr0; + *data = *(reg + i); + break; + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + i = (offset - APIC_OFFSET_IRR0) >> 2; + reg = &lapic->irr0; + *data = atomic_load_acq_int(reg + i); + break; + case APIC_OFFSET_ESR: + *data = lapic->esr; + break; + case APIC_OFFSET_ICR_LOW: + *data = lapic->icr_lo; + if (x2apic(vlapic)) + *data |= (uint64_t)lapic->icr_hi << 32; + break; + case APIC_OFFSET_ICR_HI: + *data = lapic->icr_hi; + break; + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_LVT ... 
APIC_OFFSET_ERROR_LVT: + *data = vlapic_get_lvt(vlapic, offset); +#ifdef INVARIANTS + reg = vlapic_get_lvtptr(vlapic, offset); + KASSERT(*data == *reg, ("inconsistent lvt value at " + "offset %#lx: %#lx/%#x", offset, *data, *reg)); +#endif + break; + case APIC_OFFSET_TIMER_ICR: + *data = lapic->icr_timer; + break; + case APIC_OFFSET_TIMER_CCR: + *data = vlapic_get_ccr(vlapic); + break; + case APIC_OFFSET_TIMER_DCR: + *data = lapic->dcr_timer; + break; + case APIC_OFFSET_SELF_IPI: + /* + * XXX generate a GP fault if vlapic is in x2apic mode + */ + *data = 0; + break; + case APIC_OFFSET_RRR: + default: + *data = 0; + break; + } +done: + VLAPIC_CTR2(vlapic, "vlapic read offset %#x, data %#lx", offset, *data); + return 0; +} + +int +vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t data, bool *retu) +{ + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *regptr; + int retval; + + KASSERT((offset & 0xf) == 0 && offset < PAGE_SIZE, + ("vlapic_write: invalid offset %#lx", offset)); + + VLAPIC_CTR2(vlapic, "vlapic write offset %#lx, data %#lx", + offset, data); + + if (offset > sizeof(*lapic)) + return (0); + + /* Ignore MMIO accesses in x2APIC mode */ + if (x2apic(vlapic) && mmio_access) { + VLAPIC_CTR2(vlapic, "MMIO write of %#lx to offset %#lx " + "in x2APIC mode", data, offset); + return (0); + } + + /* + * XXX Generate GP fault for MSR accesses in xAPIC mode + */ + if (!x2apic(vlapic) && !mmio_access) { + VLAPIC_CTR2(vlapic, "x2APIC MSR write of %#lx to offset %#lx " + "in xAPIC mode", data, offset); + return (0); + } + + retval = 0; + switch(offset) + { + case APIC_OFFSET_ID: + lapic->id = data; + vlapic_id_write_handler(vlapic); + break; + case APIC_OFFSET_TPR: + vlapic_set_tpr(vlapic, data & 0xff); + break; + case APIC_OFFSET_EOI: + vlapic_process_eoi(vlapic); + break; + case APIC_OFFSET_LDR: + lapic->ldr = data; + vlapic_ldr_write_handler(vlapic); + break; + case APIC_OFFSET_DFR: + lapic->dfr = data; + vlapic_dfr_write_handler(vlapic); + break; + case APIC_OFFSET_SVR: + lapic->svr = data; + vlapic_svr_write_handler(vlapic); + break; + case APIC_OFFSET_ICR_LOW: + lapic->icr_lo = data; + if (x2apic(vlapic)) + lapic->icr_hi = data >> 32; + retval = vlapic_icrlo_write_handler(vlapic, retu); + break; + case APIC_OFFSET_ICR_HI: + lapic->icr_hi = data; + break; + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: + regptr = vlapic_get_lvtptr(vlapic, offset); + *regptr = data; + vlapic_lvt_write_handler(vlapic, offset); + break; + case APIC_OFFSET_TIMER_ICR: + lapic->icr_timer = data; + vlapic_icrtmr_write_handler(vlapic); + break; + + case APIC_OFFSET_TIMER_DCR: + lapic->dcr_timer = data; + vlapic_dcr_write_handler(vlapic); + break; + + case APIC_OFFSET_ESR: + vlapic_esr_write_handler(vlapic); + break; + + case APIC_OFFSET_SELF_IPI: + if (x2apic(vlapic)) + vlapic_self_ipi_handler(vlapic, data); + break; + + case APIC_OFFSET_VER: + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + case APIC_OFFSET_TIMER_CCR: + default: + // Read only. 
+ break; + } + + return (retval); +} + +static void +vlapic_reset(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + bzero(lapic, sizeof(struct LAPIC)); + + lapic->id = vlapic_get_id(vlapic); + lapic->version = VLAPIC_VERSION; + lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT); + lapic->dfr = 0xffffffff; + lapic->svr = APIC_SVR_VECTOR; + vlapic_mask_lvts(vlapic); + vlapic_reset_tmr(vlapic); + + lapic->dcr_timer = 0; + vlapic_dcr_write_handler(vlapic); + + vlapic->svr_last = lapic->svr; +} + +void +vlapic_init(struct vlapic *vlapic) +{ + KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized")); + KASSERT(vlapic->vcpuid >= 0 && + vlapic->vcpuid < vm_get_maxcpus(vlapic->vm), + ("vlapic_init: vcpuid is not initialized")); + KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not " + "initialized")); + + /* + * If the vlapic is configured in x2apic mode then it will be + * accessed in the critical section via the MSR emulation code. + * + * Therefore the timer mutex must be a spinlock because blockable + * mutexes cannot be acquired in a critical section. + */ + mtx_init(&vlapic->timer_mtx, "vlapic timer mtx", NULL, MTX_SPIN); + callout_init(&vlapic->callout, 1); + + vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED; + + if (vlapic->vcpuid == 0) + vlapic->msr_apicbase |= APICBASE_BSP; + + vlapic->ipi_exit = false; + + vlapic_reset(vlapic); +} + +void +vlapic_cleanup(struct vlapic *vlapic) +{ + + callout_drain(&vlapic->callout); + mtx_destroy(&vlapic->timer_mtx); +} + +uint64_t +vlapic_get_apicbase(struct vlapic *vlapic) +{ + + return (vlapic->msr_apicbase); +} + +int +vlapic_set_apicbase(struct vlapic *vlapic, uint64_t new) +{ + + if (vlapic->msr_apicbase != new) { + VLAPIC_CTR2(vlapic, "Changing APIC_BASE MSR from %#lx to %#lx " + "not supported", vlapic->msr_apicbase, new); + return (-1); + } + + return (0); +} + +void +vlapic_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state) +{ + struct vlapic *vlapic; + struct LAPIC *lapic; + + vlapic = vm_lapic(vcpu); + + if (state == X2APIC_DISABLED) + vlapic->msr_apicbase &= ~APICBASE_X2APIC; + else + vlapic->msr_apicbase |= APICBASE_X2APIC; + + /* + * Reset the local APIC registers whose values are mode-dependent. + * + * XXX this works because the APIC mode can be changed only at vcpu + * initialization time. + */ + lapic = vlapic->apic_page; + lapic->id = vlapic_get_id(vlapic); + if (x2apic(vlapic)) { + lapic->ldr = x2apic_ldr(vlapic); + lapic->dfr = 0; + } else { + lapic->ldr = 0; + lapic->dfr = 0xffffffff; + } + + if (state == X2APIC_ENABLED) { + if (vlapic->ops.enable_x2apic_mode) + (*vlapic->ops.enable_x2apic_mode)(vlapic); + } +} + +void +vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, + int delmode, int vec) +{ + struct vcpu *vcpu; + bool lowprio; + int vcpuid; + cpuset_t dmask; + + if (delmode != IOART_DELFIXED && + delmode != IOART_DELLOPRI && + delmode != IOART_DELEXINT) { + VM_CTR1(vm, "vlapic intr invalid delmode %#x", delmode); + return; + } + lowprio = (delmode == IOART_DELLOPRI); + + /* + * We don't provide any virtual interrupt redirection hardware so + * all interrupts originating from the ioapic or MSI specify the + * 'dest' in the legacy xAPIC format. 
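 *
 * Consequently vlapic_calcdest() is invoked with x2apic_dest == false:
 * 'dest' is an 8-bit APIC ID or logical MDA and 0xff is the broadcast
 * address.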
+ */ + vlapic_calcdest(vm, &dmask, dest, phys, lowprio, false); + + CPU_FOREACH_ISSET(vcpuid, &dmask) { + vcpu = vm_vcpu(vm, vcpuid); + if (delmode == IOART_DELEXINT) { + vm_inject_extint(vcpu); + } else { + lapic_set_intr(vcpu, vec, level); + } + } +} + +void +vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum) +{ + /* + * Post an interrupt to the vcpu currently running on 'hostcpu'. + * + * This is done by leveraging features like Posted Interrupts (Intel) + * Doorbell MSR (AMD AVIC) that avoid a VM exit. + * + * If neither of these features are available then fallback to + * sending an IPI to 'hostcpu'. + */ + if (vlapic->ops.post_intr) + (*vlapic->ops.post_intr)(vlapic, hostcpu); + else + ipi_cpu(hostcpu, ipinum); +} + +bool +vlapic_enabled(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + + if ((vlapic->msr_apicbase & APICBASE_ENABLED) != 0 && + (lapic->svr & APIC_SVR_ENABLE) != 0) + return (true); + else + return (false); +} + +static void +vlapic_set_tmr(struct vlapic *vlapic, int vector, bool level) +{ + struct LAPIC *lapic; + uint32_t *tmrptr, mask; + int idx; + + lapic = vlapic->apic_page; + tmrptr = &lapic->tmr0; + idx = (vector / 32) * 4; + mask = 1 << (vector % 32); + if (level) + tmrptr[idx] |= mask; + else + tmrptr[idx] &= ~mask; + + if (vlapic->ops.set_tmr != NULL) + (*vlapic->ops.set_tmr)(vlapic, vector, level); +} + +void +vlapic_reset_tmr(struct vlapic *vlapic) +{ + int vector; + + VLAPIC_CTR0(vlapic, "vlapic resetting all vectors to edge-triggered"); + + for (vector = 0; vector <= 255; vector++) + vlapic_set_tmr(vlapic, vector, false); +} + +void +vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, + int delmode, int vector) +{ + cpuset_t dmask; + bool lowprio; + + KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); + + /* + * A level trigger is valid only for fixed and lowprio delivery modes. 
+ */ + if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) { + VLAPIC_CTR1(vlapic, "Ignoring level trigger-mode for " + "delivery-mode %d", delmode); + return; + } + + lowprio = (delmode == APIC_DELMODE_LOWPRIO); + vlapic_calcdest(vlapic->vm, &dmask, dest, phys, lowprio, false); + + if (!CPU_ISSET(vlapic->vcpuid, &dmask)) + return; + + VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector); + vlapic_set_tmr(vlapic, vector, true); +} + +#ifdef BHYVE_SNAPSHOT +static void +vlapic_reset_callout(struct vlapic *vlapic, uint32_t ccr) +{ + /* The implementation is similar to the one in the + * `vlapic_icrtmr_write_handler` function + */ + sbintime_t sbt; + struct bintime bt; + + VLAPIC_TIMER_LOCK(vlapic); + + bt = vlapic->timer_freq_bt; + bintime_mul(&bt, ccr); + + if (ccr != 0) { + binuptime(&vlapic->timer_fire_bt); + bintime_add(&vlapic->timer_fire_bt, &bt); + + sbt = bttosbt(bt); + vlapic_callout_reset(vlapic, sbt); + } else { + /* even if the CCR was 0, periodic timers should be reset */ + if (vlapic_periodic_timer(vlapic)) { + binuptime(&vlapic->timer_fire_bt); + bintime_add(&vlapic->timer_fire_bt, + &vlapic->timer_period_bt); + sbt = bttosbt(vlapic->timer_period_bt); + + callout_stop(&vlapic->callout); + vlapic_callout_reset(vlapic, sbt); + } + } + + VLAPIC_TIMER_UNLOCK(vlapic); +} + +int +vlapic_snapshot(struct vm *vm, struct vm_snapshot_meta *meta) +{ + int ret; + struct vcpu *vcpu; + struct vlapic *vlapic; + struct LAPIC *lapic; + uint32_t ccr; + uint16_t i, maxcpus; + + KASSERT(vm != NULL, ("%s: arg was NULL", __func__)); + + ret = 0; + + maxcpus = vm_get_maxcpus(vm); + for (i = 0; i < maxcpus; i++) { + vcpu = vm_vcpu(vm, i); + if (vcpu == NULL) + continue; + vlapic = vm_lapic(vcpu); + + /* snapshot the page first; timer period depends on icr_timer */ + lapic = vlapic->apic_page; + SNAPSHOT_BUF_OR_LEAVE(lapic, PAGE_SIZE, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vlapic->esr_pending, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vlapic->timer_freq_bt.sec, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vlapic->timer_freq_bt.frac, + meta, ret, done); + + /* + * Timer period is equal to 'icr_timer' ticks at a frequency of + * 'timer_freq_bt'. + */ + if (meta->op == VM_SNAPSHOT_RESTORE) { + vlapic->timer_period_bt = vlapic->timer_freq_bt; + bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer); + } + + SNAPSHOT_BUF_OR_LEAVE(vlapic->isrvec_stk, + sizeof(vlapic->isrvec_stk), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vlapic->isrvec_stk_top, meta, ret, done); + + SNAPSHOT_BUF_OR_LEAVE(vlapic->lvt_last, + sizeof(vlapic->lvt_last), + meta, ret, done); + + if (meta->op == VM_SNAPSHOT_SAVE) + ccr = vlapic_get_ccr(vlapic); + + SNAPSHOT_VAR_OR_LEAVE(ccr, meta, ret, done); + + if (meta->op == VM_SNAPSHOT_RESTORE && + vlapic_enabled(vlapic) && lapic->icr_timer != 0) { + /* Reset the value of the 'timer_fire_bt' and the vlapic + * callout based on the value of the current count + * register saved when the VM snapshot was created. + * If initial count register is 0, timer is not used. + * Look at "10.5.4 APIC Timer" in Software Developer Manual. + */ + vlapic_reset_callout(vlapic, ccr); + } + } + +done: + return (ret); +} +#endif diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h new file mode 100644 index 000000000000..569dce6b272f --- /dev/null +++ b/sys/amd64/vmm/io/vlapic.h @@ -0,0 +1,118 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VLAPIC_H_ +#define _VLAPIC_H_ + +struct vm; +struct vm_snapshot_meta; +enum x2apic_state; + +int vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t data, bool *retu); +int vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t *data, bool *retu); + +/* + * Returns 0 if there is no eligible vector that can be delivered to the + * guest at this time and non-zero otherwise. + * + * If an eligible vector number is found and 'vecptr' is not NULL then it will + * be stored in the location pointed to by 'vecptr'. + * + * Note that the vector does not automatically transition to the ISR as a + * result of calling this function. + */ +int vlapic_pending_intr(struct vlapic *vlapic, int *vecptr); + +/* + * Transition 'vector' from IRR to ISR. This function is called with the + * vector returned by 'vlapic_pending_intr()' when the guest is able to + * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that + * block interrupt delivery). + */ +void vlapic_intr_accepted(struct vlapic *vlapic, int vector); + +/* + * Returns 1 if the vcpu needs to be notified of the interrupt and 0 otherwise. + */ +int vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level); + +/* + * Post an interrupt to the vcpu running on 'hostcpu'. This will use a + * hardware assist if available (e.g. Posted Interrupt) or fall back to + * sending an 'ipinum' to interrupt the 'hostcpu'. 
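 *
 * Note that the vector itself must already have been queued with
 * vlapic_set_intr_ready(); this call only prods the running vcpu so
 * that it notices the pending interrupt.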
+ */ +void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum); + +void vlapic_fire_cmci(struct vlapic *vlapic); +int vlapic_trigger_lvt(struct vlapic *vlapic, int vector); + +void vlapic_sync_tpr(struct vlapic *vlapic); + +uint64_t vlapic_get_apicbase(struct vlapic *vlapic); +int vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val); +void vlapic_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state s); +bool vlapic_enabled(struct vlapic *vlapic); + +void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, + int delmode, int vec); + +/* Reset the trigger-mode bits for all vectors to be edge-triggered */ +void vlapic_reset_tmr(struct vlapic *vlapic); + +/* + * Set the trigger-mode bit associated with 'vector' to level-triggered if + * the (dest,phys,delmode) tuple resolves to an interrupt being delivered to + * this 'vlapic'. + */ +void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, + int delmode, int vector); + +void vlapic_set_cr8(struct vlapic *vlapic, uint64_t val); +uint64_t vlapic_get_cr8(struct vlapic *vlapic); + +/* APIC write handlers */ +void vlapic_id_write_handler(struct vlapic *vlapic); +void vlapic_ldr_write_handler(struct vlapic *vlapic); +void vlapic_dfr_write_handler(struct vlapic *vlapic); +void vlapic_svr_write_handler(struct vlapic *vlapic); +void vlapic_esr_write_handler(struct vlapic *vlapic); +int vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu); +void vlapic_icrtmr_write_handler(struct vlapic *vlapic); +void vlapic_dcr_write_handler(struct vlapic *vlapic); +void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset); +void vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val); + +#ifdef BHYVE_SNAPSHOT +int vlapic_snapshot(struct vm *vm, struct vm_snapshot_meta *meta); +#endif + +int vm_handle_ipi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu); + +#endif /* _VLAPIC_H_ */ diff --git a/sys/amd64/vmm/io/vlapic_priv.h b/sys/amd64/vmm/io/vlapic_priv.h new file mode 100644 index 000000000000..1dce593b9444 --- /dev/null +++ b/sys/amd64/vmm/io/vlapic_priv.h @@ -0,0 +1,186 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _VLAPIC_PRIV_H_ +#define _VLAPIC_PRIV_H_ + +#include <x86/apicreg.h> + +/* + * APIC Register: Offset Description + */ +#define APIC_OFFSET_ID 0x20 /* Local APIC ID */ +#define APIC_OFFSET_VER 0x30 /* Local APIC Version */ +#define APIC_OFFSET_TPR 0x80 /* Task Priority Register */ +#define APIC_OFFSET_APR 0x90 /* Arbitration Priority */ +#define APIC_OFFSET_PPR 0xA0 /* Processor Priority Register */ +#define APIC_OFFSET_EOI 0xB0 /* EOI Register */ +#define APIC_OFFSET_RRR 0xC0 /* Remote read */ +#define APIC_OFFSET_LDR 0xD0 /* Logical Destination */ +#define APIC_OFFSET_DFR 0xE0 /* Destination Format Register */ +#define APIC_OFFSET_SVR 0xF0 /* Spurious Vector Register */ +#define APIC_OFFSET_ISR0 0x100 /* In Service Register */ +#define APIC_OFFSET_ISR1 0x110 +#define APIC_OFFSET_ISR2 0x120 +#define APIC_OFFSET_ISR3 0x130 +#define APIC_OFFSET_ISR4 0x140 +#define APIC_OFFSET_ISR5 0x150 +#define APIC_OFFSET_ISR6 0x160 +#define APIC_OFFSET_ISR7 0x170 +#define APIC_OFFSET_TMR0 0x180 /* Trigger Mode Register */ +#define APIC_OFFSET_TMR1 0x190 +#define APIC_OFFSET_TMR2 0x1A0 +#define APIC_OFFSET_TMR3 0x1B0 +#define APIC_OFFSET_TMR4 0x1C0 +#define APIC_OFFSET_TMR5 0x1D0 +#define APIC_OFFSET_TMR6 0x1E0 +#define APIC_OFFSET_TMR7 0x1F0 +#define APIC_OFFSET_IRR0 0x200 /* Interrupt Request Register */ +#define APIC_OFFSET_IRR1 0x210 +#define APIC_OFFSET_IRR2 0x220 +#define APIC_OFFSET_IRR3 0x230 +#define APIC_OFFSET_IRR4 0x240 +#define APIC_OFFSET_IRR5 0x250 +#define APIC_OFFSET_IRR6 0x260 +#define APIC_OFFSET_IRR7 0x270 +#define APIC_OFFSET_ESR 0x280 /* Error Status Register */ +#define APIC_OFFSET_CMCI_LVT 0x2F0 /* Local Vector Table (CMCI) */ +#define APIC_OFFSET_ICR_LOW 0x300 /* Interrupt Command Register */ +#define APIC_OFFSET_ICR_HI 0x310 +#define APIC_OFFSET_TIMER_LVT 0x320 /* Local Vector Table (Timer) */ +#define APIC_OFFSET_THERM_LVT 0x330 /* Local Vector Table (Thermal) */ +#define APIC_OFFSET_PERF_LVT 0x340 /* Local Vector Table (PMC) */ +#define APIC_OFFSET_LINT0_LVT 0x350 /* Local Vector Table (LINT0) */ +#define APIC_OFFSET_LINT1_LVT 0x360 /* Local Vector Table (LINT1) */ +#define APIC_OFFSET_ERROR_LVT 0x370 /* Local Vector Table (ERROR) */ +#define APIC_OFFSET_TIMER_ICR 0x380 /* Timer's Initial Count */ +#define APIC_OFFSET_TIMER_CCR 0x390 /* Timer's Current Count */ +#define APIC_OFFSET_TIMER_DCR 0x3E0 /* Timer's Divide Configuration */ +#define APIC_OFFSET_SELF_IPI 0x3F0 /* Self IPI register */ + +#define VLAPIC_CTR0(vlapic, format) \ + VCPU_CTR0((vlapic)->vm, (vlapic)->vcpuid, format) + +#define VLAPIC_CTR1(vlapic, format, p1) \ + VCPU_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1) + +#define VLAPIC_CTR2(vlapic, format, p1, p2) \ + VCPU_CTR2((vlapic)->vm, (vlapic)->vcpuid, format, p1, p2) + +#define VLAPIC_CTR3(vlapic, format, p1, p2, p3) \ + VCPU_CTR3((vlapic)->vm, (vlapic)->vcpuid, format, p1, p2, p3) + +#define VLAPIC_CTR_IRR(vlapic, msg) \ +do { \ + uint32_t *irrptr = &(vlapic)->apic_page->irr0; \ + irrptr[0] = irrptr[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", irrptr[7 << 2]); \ +} while (0) + 
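+/* + * The IRR and ISR are each eight 32-bit registers spaced 16 bytes apart in + * the APIC page, which is why the macros above and below index a uint32_t + * pointer with (i << 2). + */ +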
+#define VLAPIC_CTR_ISR(vlapic, msg) \ +do { \ + uint32_t *isrptr = &(vlapic)->apic_page->isr0; \ + isrptr[0] = isrptr[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \ +} while (0) + +/* + * 16 priority levels with at most one vector injected per level. + */ +#define ISRVEC_STK_SIZE (16 + 1) + +#define VLAPIC_MAXLVT_INDEX APIC_LVT_CMCI + +struct vcpu; +struct vlapic; + +struct vlapic_ops { + int (*set_intr_ready)(struct vlapic *vlapic, int vector, bool level); + int (*pending_intr)(struct vlapic *vlapic, int *vecptr); + void (*intr_accepted)(struct vlapic *vlapic, int vector); + void (*post_intr)(struct vlapic *vlapic, int hostcpu); + void (*set_tmr)(struct vlapic *vlapic, int vector, bool level); + void (*enable_x2apic_mode)(struct vlapic *vlapic); +}; + +struct vlapic { + struct vm *vm; + struct vcpu *vcpu; + int vcpuid; + struct LAPIC *apic_page; + struct vlapic_ops ops; + + uint32_t esr_pending; + + struct callout callout; /* vlapic timer */ + struct bintime timer_fire_bt; /* callout expiry time */ + struct bintime timer_freq_bt; /* timer frequency */ + struct bintime timer_period_bt; /* timer period */ + struct mtx timer_mtx; + + /* + * The 'isrvec_stk' is a stack of vectors injected by the local apic. + * A vector is popped from the stack when the processor does an EOI. + * The vector on the top of the stack is used to compute the + * Processor Priority in conjunction with the TPR. + */ + uint8_t isrvec_stk[ISRVEC_STK_SIZE]; + int isrvec_stk_top; + + uint64_t msr_apicbase; + + /* + * Copies of some registers in the virtual APIC page. We do this for + * a couple of different reasons: + * - to be able to detect what changed (e.g. svr_last) + * - to maintain a coherent snapshot of the register (e.g. lvt_last) + */ + uint32_t svr_last; + uint32_t lvt_last[VLAPIC_MAXLVT_INDEX + 1]; + + bool ipi_exit; +}; + +void vlapic_init(struct vlapic *vlapic); +void vlapic_cleanup(struct vlapic *vlapic); + +#endif /* _VLAPIC_PRIV_H_ */ diff --git a/sys/amd64/vmm/io/vpmtmr.c b/sys/amd64/vmm/io/vpmtmr.c new file mode 100644 index 000000000000..fb0775f12aab --- /dev/null +++ b/sys/amd64/vmm/io/vpmtmr.c @@ -0,0 +1,118 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2014, Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +#include "opt_bhyve_snapshot.h" + +#include <sys/param.h> +#include <sys/queue.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/systm.h> + +#include <machine/vmm.h> +#include <machine/vmm_snapshot.h> + +#include "vpmtmr.h" + +/* + * The ACPI Power Management timer is a free-running 24- or 32-bit + * timer with a frequency of 3.579545MHz + * + * This implementation will be 32-bits + */ + +#define PMTMR_FREQ 3579545 /* 3.579545MHz */ + +struct vpmtmr { + sbintime_t freq_sbt; + sbintime_t baseuptime; + uint32_t baseval; +}; + +static MALLOC_DEFINE(M_VPMTMR, "vpmtmr", "bhyve virtual acpi timer"); + +struct vpmtmr * +vpmtmr_init(struct vm *vm) +{ + struct vpmtmr *vpmtmr; + struct bintime bt; + + vpmtmr = malloc(sizeof(struct vpmtmr), M_VPMTMR, M_WAITOK | M_ZERO); + vpmtmr->baseuptime = sbinuptime(); + vpmtmr->baseval = 0; + + FREQ2BT(PMTMR_FREQ, &bt); + vpmtmr->freq_sbt = bttosbt(bt); + + return (vpmtmr); +} + +void +vpmtmr_cleanup(struct vpmtmr *vpmtmr) +{ + + free(vpmtmr, M_VPMTMR); +} + +int +vpmtmr_handler(struct vm *vm, bool in, int port, int bytes, uint32_t *val) +{ + struct vpmtmr *vpmtmr; + sbintime_t now, delta; + + if (!in || bytes != 4) + return (-1); + + vpmtmr = vm_pmtmr(vm); + + /* + * No locking needed because 'baseuptime' and 'baseval' are + * written only during initialization. + */ + now = sbinuptime(); + delta = now - vpmtmr->baseuptime; + KASSERT(delta >= 0, ("vpmtmr_handler: uptime went backwards: " + "%#lx to %#lx", vpmtmr->baseuptime, now)); + *val = vpmtmr->baseval + delta / vpmtmr->freq_sbt; + + return (0); +} + +#ifdef BHYVE_SNAPSHOT +int +vpmtmr_snapshot(struct vpmtmr *vpmtmr, struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_OR_LEAVE(vpmtmr->baseval, meta, ret, done); + +done: + return (ret); +} +#endif diff --git a/sys/amd64/vmm/io/vpmtmr.h b/sys/amd64/vmm/io/vpmtmr.h new file mode 100644 index 000000000000..cdb87b9d77f6 --- /dev/null +++ b/sys/amd64/vmm/io/vpmtmr.h @@ -0,0 +1,46 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2014 Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VPMTMR_H_ +#define _VPMTMR_H_ + +#define IO_PMTMR 0x408 + +struct vpmtmr; +struct vm_snapshot_meta; + +struct vpmtmr *vpmtmr_init(struct vm *vm); +void vpmtmr_cleanup(struct vpmtmr *pmtmr); + +int vpmtmr_handler(struct vm *vm, bool in, int port, int bytes, uint32_t *val); + +#ifdef BHYVE_SNAPSHOT +int vpmtmr_snapshot(struct vpmtmr *vpmtmr, struct vm_snapshot_meta *meta); +#endif + +#endif diff --git a/sys/amd64/vmm/io/vrtc.c b/sys/amd64/vmm/io/vrtc.c new file mode 100644 index 000000000000..a56c77b7bf73 --- /dev/null +++ b/sys/amd64/vmm/io/vrtc.c @@ -0,0 +1,1064 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2014, Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +#include "opt_bhyve_snapshot.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/queue.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/clock.h> +#include <sys/sysctl.h> + +#include <machine/vmm.h> +#include <machine/vmm_snapshot.h> + +#include <isa/rtc.h> + +#include <dev/vmm/vmm_ktr.h> + +#include "vatpic.h" +#include "vioapic.h" +#include "vrtc.h" + +/* Register layout of the RTC */ +struct rtcdev { + uint8_t sec; + uint8_t alarm_sec; + uint8_t min; + uint8_t alarm_min; + uint8_t hour; + uint8_t alarm_hour; + uint8_t day_of_week; + uint8_t day_of_month; + uint8_t month; + uint8_t year; + uint8_t reg_a; + uint8_t reg_b; + uint8_t reg_c; + uint8_t reg_d; + uint8_t nvram[36]; + uint8_t century; + uint8_t nvram2[128 - 51]; +} __packed; +CTASSERT(sizeof(struct rtcdev) == 128); +CTASSERT(offsetof(struct rtcdev, century) == RTC_CENTURY); + +struct vrtc { + struct vm *vm; + struct mtx mtx; + struct callout callout; + u_int addr; /* RTC register to read or write */ + sbintime_t base_uptime; + time_t base_rtctime; + struct rtcdev rtcdev; +}; + +#define VRTC_LOCK(vrtc) mtx_lock(&((vrtc)->mtx)) +#define VRTC_UNLOCK(vrtc) mtx_unlock(&((vrtc)->mtx)) +#define VRTC_LOCKED(vrtc) mtx_owned(&((vrtc)->mtx)) + +/* + * RTC time is considered "broken" if: + * - RTC updates are halted by the guest + * - RTC date/time fields have invalid values + */ +#define VRTC_BROKEN_TIME ((time_t)-1) + +#define RTC_IRQ 8 +#define RTCSB_BIN 0x04 +#define RTCSB_ALL_INTRS (RTCSB_UINTR | RTCSB_AINTR | RTCSB_PINTR) +#define rtc_halted(vrtc) ((vrtc->rtcdev.reg_b & RTCSB_HALT) != 0) +#define aintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_AINTR) != 0) +#define pintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_PINTR) != 0) +#define uintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_UINTR) != 0) + +static void vrtc_callout_handler(void *arg); +static void vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval); + +static MALLOC_DEFINE(M_VRTC, "vrtc", "bhyve virtual rtc"); + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, vrtc, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, + NULL); + +static int rtc_flag_broken_time = 1; +SYSCTL_INT(_hw_vmm_vrtc, OID_AUTO, flag_broken_time, CTLFLAG_RDTUN, + &rtc_flag_broken_time, 0, "Stop guest when invalid RTC time is detected"); + +static __inline bool +divider_enabled(int reg_a) +{ + /* + * The RTC is counting only when dividers are not held in reset. 
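+ * Divider bits DV2:DV0 (<6:4>) of register A must be 010, i.e. the normal + * operating mode with the 32.768 kHz time base selected.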
+ */ + return ((reg_a & 0x70) == 0x20); +} + +static __inline bool +update_enabled(struct vrtc *vrtc) +{ + /* + * RTC date/time can be updated only if: + * - divider is not held in reset + * - guest has not disabled updates + * - the date/time fields have valid contents + */ + if (!divider_enabled(vrtc->rtcdev.reg_a)) + return (false); + + if (rtc_halted(vrtc)) + return (false); + + if (vrtc->base_rtctime == VRTC_BROKEN_TIME) + return (false); + + return (true); +} + +static time_t +vrtc_curtime(struct vrtc *vrtc, sbintime_t *basetime) +{ + sbintime_t now, delta; + time_t t, secs; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + t = vrtc->base_rtctime; + *basetime = vrtc->base_uptime; + if (update_enabled(vrtc)) { + now = sbinuptime(); + delta = now - vrtc->base_uptime; + KASSERT(delta >= 0, ("vrtc_curtime: uptime went backwards: " + "%#lx to %#lx", vrtc->base_uptime, now)); + secs = delta / SBT_1S; + t += secs; + *basetime += secs * SBT_1S; + } + return (t); +} + +static __inline uint8_t +rtcset(struct rtcdev *rtc, int val) +{ + + KASSERT(val >= 0 && val < 100, ("%s: invalid bin2bcd index %d", + __func__, val)); + + return ((rtc->reg_b & RTCSB_BIN) ? val : bin2bcd_data[val]); +} + +static void +secs_to_rtc(time_t rtctime, struct vrtc *vrtc, int force_update) +{ + struct clocktime ct; + struct timespec ts; + struct rtcdev *rtc; + int hour; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + if (rtctime < 0) { + KASSERT(rtctime == VRTC_BROKEN_TIME, + ("%s: invalid vrtc time %#lx", __func__, rtctime)); + return; + } + + /* + * If the RTC is halted then the guest has "ownership" of the + * date/time fields. Don't update the RTC date/time fields in + * this case (unless forced). + */ + if (rtc_halted(vrtc) && !force_update) + return; + + ts.tv_sec = rtctime; + ts.tv_nsec = 0; + clock_ts_to_ct(&ts, &ct); + + KASSERT(ct.sec >= 0 && ct.sec <= 59, ("invalid clocktime sec %d", + ct.sec)); + KASSERT(ct.min >= 0 && ct.min <= 59, ("invalid clocktime min %d", + ct.min)); + KASSERT(ct.hour >= 0 && ct.hour <= 23, ("invalid clocktime hour %d", + ct.hour)); + KASSERT(ct.dow >= 0 && ct.dow <= 6, ("invalid clocktime wday %d", + ct.dow)); + KASSERT(ct.day >= 1 && ct.day <= 31, ("invalid clocktime mday %d", + ct.day)); + KASSERT(ct.mon >= 1 && ct.mon <= 12, ("invalid clocktime month %d", + ct.mon)); + KASSERT(ct.year >= POSIX_BASE_YEAR, ("invalid clocktime year %d", + ct.year)); + + rtc = &vrtc->rtcdev; + rtc->sec = rtcset(rtc, ct.sec); + rtc->min = rtcset(rtc, ct.min); + + if (rtc->reg_b & RTCSB_24HR) { + hour = ct.hour; + } else { + /* + * Convert to the 12-hour format. 
+ */ + switch (ct.hour) { + case 0: /* 12 AM */ + case 12: /* 12 PM */ + hour = 12; + break; + default: + /* + * The remaining 'ct.hour' values are interpreted as: + * [1 - 11] -> 1 - 11 AM + * [13 - 23] -> 1 - 11 PM + */ + hour = ct.hour % 12; + break; + } + } + + rtc->hour = rtcset(rtc, hour); + + if ((rtc->reg_b & RTCSB_24HR) == 0 && ct.hour >= 12) + rtc->hour |= 0x80; /* set MSB to indicate PM */ + + rtc->day_of_week = rtcset(rtc, ct.dow + 1); + rtc->day_of_month = rtcset(rtc, ct.day); + rtc->month = rtcset(rtc, ct.mon); + rtc->year = rtcset(rtc, ct.year % 100); + rtc->century = rtcset(rtc, ct.year / 100); +} + +static int +rtcget(struct rtcdev *rtc, int val, int *retval) +{ + uint8_t upper, lower; + + if (rtc->reg_b & RTCSB_BIN) { + *retval = val; + return (0); + } + + lower = val & 0xf; + upper = (val >> 4) & 0xf; + + if (lower > 9 || upper > 9) + return (-1); + + *retval = upper * 10 + lower; + return (0); +} + +static time_t +rtc_to_secs(struct vrtc *vrtc) +{ + struct clocktime ct; + struct timespec ts; + struct rtcdev *rtc; +#ifdef KTR + struct vm *vm = vrtc->vm; +#endif + int century, error, hour, pm, year; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + rtc = &vrtc->rtcdev; + + bzero(&ct, sizeof(struct clocktime)); + + error = rtcget(rtc, rtc->sec, &ct.sec); + if (error || ct.sec < 0 || ct.sec > 59) { + VM_CTR2(vm, "Invalid RTC sec %#x/%d", rtc->sec, ct.sec); + goto fail; + } + + error = rtcget(rtc, rtc->min, &ct.min); + if (error || ct.min < 0 || ct.min > 59) { + VM_CTR2(vm, "Invalid RTC min %#x/%d", rtc->min, ct.min); + goto fail; + } + + pm = 0; + hour = rtc->hour; + if ((rtc->reg_b & RTCSB_24HR) == 0) { + if (hour & 0x80) { + hour &= ~0x80; + pm = 1; + } + } + error = rtcget(rtc, hour, &ct.hour); + if ((rtc->reg_b & RTCSB_24HR) == 0) { + if (ct.hour >= 1 && ct.hour <= 12) { + /* + * Convert from 12-hour format to internal 24-hour + * representation as follows: + * + * 12-hour format ct.hour + * 12 AM 0 + * 1 - 11 AM 1 - 11 + * 12 PM 12 + * 1 - 11 PM 13 - 23 + */ + if (ct.hour == 12) + ct.hour = 0; + if (pm) + ct.hour += 12; + } else { + VM_CTR2(vm, "Invalid RTC 12-hour format %#x/%d", + rtc->hour, ct.hour); + goto fail; + } + } + + if (error || ct.hour < 0 || ct.hour > 23) { + VM_CTR2(vm, "Invalid RTC hour %#x/%d", rtc->hour, ct.hour); + goto fail; + } + + /* + * Ignore 'rtc->dow' because some guests like Linux don't bother + * setting it at all while others like OpenBSD/i386 set it incorrectly. + * + * clock_ct_to_ts() does not depend on 'ct.dow' anyways so ignore it. 
+ */ + ct.dow = -1; + + error = rtcget(rtc, rtc->day_of_month, &ct.day); + if (error || ct.day < 1 || ct.day > 31) { + VM_CTR2(vm, "Invalid RTC mday %#x/%d", rtc->day_of_month, + ct.day); + goto fail; + } + + error = rtcget(rtc, rtc->month, &ct.mon); + if (error || ct.mon < 1 || ct.mon > 12) { + VM_CTR2(vm, "Invalid RTC month %#x/%d", rtc->month, ct.mon); + goto fail; + } + + error = rtcget(rtc, rtc->year, &year); + if (error || year < 0 || year > 99) { + VM_CTR2(vm, "Invalid RTC year %#x/%d", rtc->year, year); + goto fail; + } + + error = rtcget(rtc, rtc->century, &century); + ct.year = century * 100 + year; + if (error || ct.year < POSIX_BASE_YEAR) { + VM_CTR2(vm, "Invalid RTC century %#x/%d", rtc->century, + ct.year); + goto fail; + } + + error = clock_ct_to_ts(&ct, &ts); + if (error || ts.tv_sec < 0) { + VM_CTR3(vm, "Invalid RTC clocktime.date %04d-%02d-%02d", + ct.year, ct.mon, ct.day); + VM_CTR3(vm, "Invalid RTC clocktime.time %02d:%02d:%02d", + ct.hour, ct.min, ct.sec); + goto fail; + } + return (ts.tv_sec); /* success */ +fail: + /* + * Stop updating the RTC if the date/time fields programmed by + * the guest are invalid. + */ + VM_CTR0(vrtc->vm, "Invalid RTC date/time programming detected"); + return (VRTC_BROKEN_TIME); +} + +static int +vrtc_time_update(struct vrtc *vrtc, time_t newtime, sbintime_t newbase) +{ + struct rtcdev *rtc; + time_t oldtime; + uint8_t alarm_sec, alarm_min, alarm_hour; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + rtc = &vrtc->rtcdev; + alarm_sec = rtc->alarm_sec; + alarm_min = rtc->alarm_min; + alarm_hour = rtc->alarm_hour; + + oldtime = vrtc->base_rtctime; + VM_CTR2(vrtc->vm, "Updating RTC secs from %#lx to %#lx", + oldtime, newtime); + + VM_CTR2(vrtc->vm, "Updating RTC base uptime from %#lx to %#lx", + vrtc->base_uptime, newbase); + vrtc->base_uptime = newbase; + + if (newtime == oldtime) + return (0); + + /* + * If 'newtime' indicates that RTC updates are disabled then just + * record that and return. There is no need to do alarm interrupt + * processing in this case. + */ + if (newtime == VRTC_BROKEN_TIME) { + vrtc->base_rtctime = VRTC_BROKEN_TIME; + return (0); + } + + /* + * Return an error if RTC updates are halted by the guest. + */ + if (rtc_halted(vrtc)) { + VM_CTR0(vrtc->vm, "RTC update halted by guest"); + return (EBUSY); + } + + do { + /* + * If the alarm interrupt is enabled and 'oldtime' is valid + * then visit all the seconds between 'oldtime' and 'newtime' + * to check for the alarm condition. + * + * Otherwise move the RTC time forward directly to 'newtime'. + */ + if (aintr_enabled(vrtc) && oldtime != VRTC_BROKEN_TIME) + vrtc->base_rtctime++; + else + vrtc->base_rtctime = newtime; + + if (aintr_enabled(vrtc)) { + /* + * Update the RTC date/time fields before checking + * if the alarm conditions are satisfied.
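+ * An alarm byte in the range 0xc0 - 0xff is a "don't care" value that + * matches any value of the corresponding time field.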
+ */ + secs_to_rtc(vrtc->base_rtctime, vrtc, 0); + + if ((alarm_sec >= 0xC0 || alarm_sec == rtc->sec) && + (alarm_min >= 0xC0 || alarm_min == rtc->min) && + (alarm_hour >= 0xC0 || alarm_hour == rtc->hour)) { + vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_ALARM); + } + } + } while (vrtc->base_rtctime != newtime); + + if (uintr_enabled(vrtc)) + vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_UPDATE); + + return (0); +} + +static sbintime_t +vrtc_freq(struct vrtc *vrtc) +{ + int ratesel; + + static sbintime_t pf[16] = { + 0, + SBT_1S / 256, + SBT_1S / 128, + SBT_1S / 8192, + SBT_1S / 4096, + SBT_1S / 2048, + SBT_1S / 1024, + SBT_1S / 512, + SBT_1S / 256, + SBT_1S / 128, + SBT_1S / 64, + SBT_1S / 32, + SBT_1S / 16, + SBT_1S / 8, + SBT_1S / 4, + SBT_1S / 2, + }; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + /* + * If both periodic and alarm interrupts are enabled then use the + * periodic frequency to drive the callout. The minimum periodic + * frequency (2 Hz) is higher than the alarm frequency (1 Hz) so + * piggyback the alarm on top of it. The same argument applies to + * the update interrupt. + */ + if (pintr_enabled(vrtc) && divider_enabled(vrtc->rtcdev.reg_a)) { + ratesel = vrtc->rtcdev.reg_a & 0xf; + return (pf[ratesel]); + } else if (aintr_enabled(vrtc) && update_enabled(vrtc)) { + return (SBT_1S); + } else if (uintr_enabled(vrtc) && update_enabled(vrtc)) { + return (SBT_1S); + } else { + return (0); + } +} + +static void +vrtc_callout_reset(struct vrtc *vrtc, sbintime_t freqsbt) +{ + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + if (freqsbt == 0) { + if (callout_active(&vrtc->callout)) { + VM_CTR0(vrtc->vm, "RTC callout stopped"); + callout_stop(&vrtc->callout); + } + return; + } + VM_CTR1(vrtc->vm, "RTC callout frequency %d hz", SBT_1S / freqsbt); + callout_reset_sbt(&vrtc->callout, freqsbt, 0, vrtc_callout_handler, + vrtc, 0); +} + +static void +vrtc_callout_handler(void *arg) +{ + struct vrtc *vrtc = arg; + sbintime_t freqsbt, basetime; + time_t rtctime; + int error __diagused; + + VM_CTR0(vrtc->vm, "vrtc callout fired"); + + VRTC_LOCK(vrtc); + if (callout_pending(&vrtc->callout)) /* callout was reset */ + goto done; + + if (!callout_active(&vrtc->callout)) /* callout was stopped */ + goto done; + + callout_deactivate(&vrtc->callout); + + KASSERT((vrtc->rtcdev.reg_b & RTCSB_ALL_INTRS) != 0, + ("gratuitous vrtc callout")); + + if (pintr_enabled(vrtc)) + vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c | RTCIR_PERIOD); + + if (aintr_enabled(vrtc) || uintr_enabled(vrtc)) { + rtctime = vrtc_curtime(vrtc, &basetime); + error = vrtc_time_update(vrtc, rtctime, basetime); + KASSERT(error == 0, ("%s: vrtc_time_update error %d", + __func__, error)); + } + + freqsbt = vrtc_freq(vrtc); + KASSERT(freqsbt != 0, ("%s: vrtc frequency cannot be zero", __func__)); + vrtc_callout_reset(vrtc, freqsbt); +done: + VRTC_UNLOCK(vrtc); +} + +static __inline void +vrtc_callout_check(struct vrtc *vrtc, sbintime_t freq) +{ + int active __diagused; + + active = callout_active(&vrtc->callout) ? 1 : 0; + KASSERT((freq == 0 && !active) || (freq != 0 && active), + ("vrtc callout %s with frequency %#lx", + active ? 
"active" : "inactive", freq)); +} + +static void +vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval) +{ + struct rtcdev *rtc; + int oldirqf, newirqf; + uint8_t oldval, changed; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + rtc = &vrtc->rtcdev; + newval &= RTCIR_ALARM | RTCIR_PERIOD | RTCIR_UPDATE; + + oldirqf = rtc->reg_c & RTCIR_INT; + if ((aintr_enabled(vrtc) && (newval & RTCIR_ALARM) != 0) || + (pintr_enabled(vrtc) && (newval & RTCIR_PERIOD) != 0) || + (uintr_enabled(vrtc) && (newval & RTCIR_UPDATE) != 0)) { + newirqf = RTCIR_INT; + } else { + newirqf = 0; + } + + oldval = rtc->reg_c; + rtc->reg_c = newirqf | newval; + changed = oldval ^ rtc->reg_c; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_c changed from %#x to %#x", + oldval, rtc->reg_c); + } + + if (!oldirqf && newirqf) { + VM_CTR1(vrtc->vm, "RTC irq %d asserted", RTC_IRQ); + vatpic_pulse_irq(vrtc->vm, RTC_IRQ); + vioapic_pulse_irq(vrtc->vm, RTC_IRQ); + } else if (oldirqf && !newirqf) { + VM_CTR1(vrtc->vm, "RTC irq %d deasserted", RTC_IRQ); + } +} + +static int +vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval) +{ + struct rtcdev *rtc; + sbintime_t oldfreq, newfreq, basetime; + time_t curtime, rtctime; + int error __diagused; + uint8_t oldval, changed; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + rtc = &vrtc->rtcdev; + oldval = rtc->reg_b; + oldfreq = vrtc_freq(vrtc); + + rtc->reg_b = newval; + changed = oldval ^ newval; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_b changed from %#x to %#x", + oldval, newval); + } + + if (changed & RTCSB_HALT) { + if ((newval & RTCSB_HALT) == 0) { + rtctime = rtc_to_secs(vrtc); + basetime = sbinuptime(); + if (rtctime == VRTC_BROKEN_TIME) { + if (rtc_flag_broken_time) + return (-1); + } + } else { + curtime = vrtc_curtime(vrtc, &basetime); + KASSERT(curtime == vrtc->base_rtctime, ("%s: mismatch " + "between vrtc basetime (%#lx) and curtime (%#lx)", + __func__, vrtc->base_rtctime, curtime)); + + /* + * Force a refresh of the RTC date/time fields so + * they reflect the time right before the guest set + * the HALT bit. + */ + secs_to_rtc(curtime, vrtc, 1); + + /* + * Updates are halted so mark 'base_rtctime' to denote + * that the RTC date/time is in flux. + */ + rtctime = VRTC_BROKEN_TIME; + rtc->reg_b &= ~RTCSB_UINTR; + } + error = vrtc_time_update(vrtc, rtctime, basetime); + KASSERT(error == 0, ("vrtc_time_update error %d", error)); + } + + /* + * Side effect of changes to the interrupt enable bits. + */ + if (changed & RTCSB_ALL_INTRS) + vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c); + + /* + * Change the callout frequency if it has changed. + */ + newfreq = vrtc_freq(vrtc); + if (newfreq != oldfreq) + vrtc_callout_reset(vrtc, newfreq); + else + vrtc_callout_check(vrtc, newfreq); + + /* + * The side effect of bits that control the RTC date/time format + * is handled lazily when those fields are actually read. + */ + return (0); +} + +static void +vrtc_set_reg_a(struct vrtc *vrtc, uint8_t newval) +{ + sbintime_t oldfreq, newfreq; + uint8_t oldval, changed; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + newval &= ~RTCSA_TUP; + oldval = vrtc->rtcdev.reg_a; + oldfreq = vrtc_freq(vrtc); + + if (divider_enabled(oldval) && !divider_enabled(newval)) { + VM_CTR2(vrtc->vm, "RTC divider held in reset at %#lx/%#lx", + vrtc->base_rtctime, vrtc->base_uptime); + } else if (!divider_enabled(oldval) && divider_enabled(newval)) { + /* + * If the dividers are coming out of reset then update + * 'base_uptime' before this happens. 
This is done to + * maintain the illusion that the RTC date/time was frozen + * while the dividers were disabled. + */ + vrtc->base_uptime = sbinuptime(); + VM_CTR2(vrtc->vm, "RTC divider out of reset at %#lx/%#lx", + vrtc->base_rtctime, vrtc->base_uptime); + } else { + /* NOTHING */ + } + + vrtc->rtcdev.reg_a = newval; + changed = oldval ^ newval; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_a changed from %#x to %#x", + oldval, newval); + } + + /* + * Side effect of changes to rate select and divider enable bits. + */ + newfreq = vrtc_freq(vrtc); + if (newfreq != oldfreq) + vrtc_callout_reset(vrtc, newfreq); + else + vrtc_callout_check(vrtc, newfreq); +} + +int +vrtc_set_time(struct vm *vm, time_t secs) +{ + struct vrtc *vrtc; + int error; + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + error = vrtc_time_update(vrtc, secs, sbinuptime()); + VRTC_UNLOCK(vrtc); + + if (error) { + VM_CTR2(vrtc->vm, "Error %d setting RTC time to %#lx", error, + secs); + } else { + VM_CTR1(vrtc->vm, "RTC time set to %#lx", secs); + } + + return (error); +} + +time_t +vrtc_get_time(struct vm *vm) +{ + struct vrtc *vrtc; + sbintime_t basetime; + time_t t; + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + t = vrtc_curtime(vrtc, &basetime); + VRTC_UNLOCK(vrtc); + + return (t); +} + +int +vrtc_nvram_write(struct vm *vm, int offset, uint8_t value) +{ + struct vrtc *vrtc; + uint8_t *ptr; + + vrtc = vm_rtc(vm); + + /* + * Don't allow writes to RTC control registers or the date/time fields. + */ + if (offset < offsetof(struct rtcdev, nvram[0]) || + offset == RTC_CENTURY || offset >= sizeof(struct rtcdev)) { + VM_CTR1(vrtc->vm, "RTC nvram write to invalid offset %d", + offset); + return (EINVAL); + } + + VRTC_LOCK(vrtc); + ptr = (uint8_t *)(&vrtc->rtcdev); + ptr[offset] = value; + VM_CTR2(vrtc->vm, "RTC nvram write %#x to offset %#x", value, offset); + VRTC_UNLOCK(vrtc); + + return (0); +} + +int +vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval) +{ + struct vrtc *vrtc; + sbintime_t basetime; + time_t curtime; + uint8_t *ptr; + + /* + * Allow all offsets in the RTC to be read. + */ + if (offset < 0 || offset >= sizeof(struct rtcdev)) + return (EINVAL); + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + + /* + * Update RTC date/time fields if necessary. + */ + if (offset < 10 || offset == RTC_CENTURY) { + curtime = vrtc_curtime(vrtc, &basetime); + secs_to_rtc(curtime, vrtc, 0); + } + + ptr = (uint8_t *)(&vrtc->rtcdev); + *retval = ptr[offset]; + + VRTC_UNLOCK(vrtc); + return (0); +} + +int +vrtc_addr_handler(struct vm *vm, bool in, int port, int bytes, uint32_t *val) +{ + struct vrtc *vrtc; + + vrtc = vm_rtc(vm); + + if (bytes != 1) + return (-1); + + if (in) { + *val = 0xff; + return (0); + } + + VRTC_LOCK(vrtc); + vrtc->addr = *val & 0x7f; + VRTC_UNLOCK(vrtc); + + return (0); +} + +int +vrtc_data_handler(struct vm *vm, bool in, int port, int bytes, uint32_t *val) +{ + struct vrtc *vrtc; + struct rtcdev *rtc; + sbintime_t basetime; + time_t curtime; + int error, offset; + + vrtc = vm_rtc(vm); + rtc = &vrtc->rtcdev; + + if (bytes != 1) + return (-1); + + VRTC_LOCK(vrtc); + offset = vrtc->addr; + if (offset >= sizeof(struct rtcdev)) { + VRTC_UNLOCK(vrtc); + return (-1); + } + + error = 0; + curtime = vrtc_curtime(vrtc, &basetime); + vrtc_time_update(vrtc, curtime, basetime); + + /* + * Update RTC date/time fields if necessary. + * + * This is not just for reads of the RTC. The side-effect of writing + * the century byte requires other RTC date/time fields (e.g. sec) + * to be updated here. 
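+ * Offsets 0 through 9 are the time, alarm and date bytes of 'struct + * rtcdev'; the control registers and NVRAM bytes that follow do not + * need to be refreshed.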
+ */ + if (offset < 10 || offset == RTC_CENTURY) + secs_to_rtc(curtime, vrtc, 0); + + if (in) { + if (offset == 12) { + /* + * XXX + * reg_c interrupt flags are updated only if the + * corresponding interrupt enable bit in reg_b is set. + */ + *val = vrtc->rtcdev.reg_c; + vrtc_set_reg_c(vrtc, 0); + } else { + *val = *((uint8_t *)rtc + offset); + } + VM_CTR2(vm, "Read value %#x from RTC offset %#x", + *val, offset); + } else { + switch (offset) { + case 10: + VM_CTR1(vm, "RTC reg_a set to %#x", *val); + vrtc_set_reg_a(vrtc, *val); + break; + case 11: + VM_CTR1(vm, "RTC reg_b set to %#x", *val); + error = vrtc_set_reg_b(vrtc, *val); + break; + case 12: + VM_CTR1(vm, "RTC reg_c set to %#x (ignored)", + *val); + break; + case 13: + VM_CTR1(vm, "RTC reg_d set to %#x (ignored)", + *val); + break; + case 0: + /* + * High order bit of 'seconds' is readonly. + */ + *val &= 0x7f; + /* FALLTHRU */ + default: + VM_CTR2(vm, "RTC offset %#x set to %#x", + offset, *val); + *((uint8_t *)rtc + offset) = *val; + break; + } + + /* + * XXX some guests (e.g. OpenBSD) write the century byte + * outside of RTCSB_HALT so re-calculate the RTC date/time. + */ + if (offset == RTC_CENTURY && !rtc_halted(vrtc)) { + curtime = rtc_to_secs(vrtc); + error = vrtc_time_update(vrtc, curtime, sbinuptime()); + KASSERT(!error, ("vrtc_time_update error %d", error)); + if (curtime == VRTC_BROKEN_TIME && rtc_flag_broken_time) + error = -1; + } + } + VRTC_UNLOCK(vrtc); + return (error); +} + +void +vrtc_reset(struct vrtc *vrtc) +{ + struct rtcdev *rtc; + + VRTC_LOCK(vrtc); + + rtc = &vrtc->rtcdev; + vrtc_set_reg_b(vrtc, rtc->reg_b & ~(RTCSB_ALL_INTRS | RTCSB_SQWE)); + vrtc_set_reg_c(vrtc, 0); + KASSERT(!callout_active(&vrtc->callout), ("rtc callout still active")); + + VRTC_UNLOCK(vrtc); +} + +struct vrtc * +vrtc_init(struct vm *vm) +{ + struct vrtc *vrtc; + struct rtcdev *rtc; + time_t curtime; + + vrtc = malloc(sizeof(struct vrtc), M_VRTC, M_WAITOK | M_ZERO); + vrtc->vm = vm; + mtx_init(&vrtc->mtx, "vrtc lock", NULL, MTX_DEF); + callout_init(&vrtc->callout, 1); + + /* Allow dividers to keep time but disable everything else */ + rtc = &vrtc->rtcdev; + rtc->reg_a = 0x20; + rtc->reg_b = RTCSB_24HR; + rtc->reg_c = 0; + rtc->reg_d = RTCSD_PWR; + + /* Reset the index register to a safe value. */ + vrtc->addr = RTC_STATUSD; + + /* + * Initialize RTC time to 00:00:00 Jan 1, 1970. 
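+ * i.e. a POSIX timestamp of zero. A real date can be programmed later + * via vrtc_set_time().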
+ */ + curtime = 0; + + VRTC_LOCK(vrtc); + vrtc->base_rtctime = VRTC_BROKEN_TIME; + vrtc_time_update(vrtc, curtime, sbinuptime()); + secs_to_rtc(curtime, vrtc, 0); + VRTC_UNLOCK(vrtc); + + return (vrtc); +} + +void +vrtc_cleanup(struct vrtc *vrtc) +{ + + callout_drain(&vrtc->callout); + mtx_destroy(&vrtc->mtx); + free(vrtc, M_VRTC); +} + +#ifdef BHYVE_SNAPSHOT +int +vrtc_snapshot(struct vrtc *vrtc, struct vm_snapshot_meta *meta) +{ + int ret; + + VRTC_LOCK(vrtc); + + SNAPSHOT_VAR_OR_LEAVE(vrtc->addr, meta, ret, done); + if (meta->op == VM_SNAPSHOT_RESTORE) + vrtc->base_uptime = sbinuptime(); + SNAPSHOT_VAR_OR_LEAVE(vrtc->base_rtctime, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.sec, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.alarm_sec, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.min, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.alarm_min, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.hour, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.alarm_hour, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.day_of_week, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.day_of_month, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.month, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.year, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_a, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_b, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_c, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_d, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vrtc->rtcdev.nvram, sizeof(vrtc->rtcdev.nvram), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.century, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vrtc->rtcdev.nvram2, sizeof(vrtc->rtcdev.nvram2), + meta, ret, done); + + vrtc_callout_reset(vrtc, vrtc_freq(vrtc)); + + VRTC_UNLOCK(vrtc); + +done: + return (ret); +} +#endif diff --git a/sys/amd64/vmm/io/vrtc.h b/sys/amd64/vmm/io/vrtc.h new file mode 100644 index 000000000000..ee596389b945 --- /dev/null +++ b/sys/amd64/vmm/io/vrtc.h @@ -0,0 +1,55 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2014 Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _VRTC_H_ +#define _VRTC_H_ + +#include <isa/isareg.h> + +struct vrtc; +struct vm_snapshot_meta; + +struct vrtc *vrtc_init(struct vm *vm); +void vrtc_cleanup(struct vrtc *vrtc); +void vrtc_reset(struct vrtc *vrtc); + +time_t vrtc_get_time(struct vm *vm); +int vrtc_set_time(struct vm *vm, time_t secs); +int vrtc_nvram_write(struct vm *vm, int offset, uint8_t value); +int vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval); + +int vrtc_addr_handler(struct vm *vm, bool in, int port, int bytes, + uint32_t *val); +int vrtc_data_handler(struct vm *vm, bool in, int port, int bytes, + uint32_t *val); + +#ifdef BHYVE_SNAPSHOT +int vrtc_snapshot(struct vrtc *vrtc, struct vm_snapshot_meta *meta); +#endif + +#endif diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c new file mode 100644 index 000000000000..f2bea0d82b5c --- /dev/null +++ b/sys/amd64/vmm/vmm.c @@ -0,0 +1,2805 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "opt_bhyve_snapshot.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/pcpu.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/sched.h> +#include <sys/smp.h> +#include <sys/sx.h> +#include <sys/vnode.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_extern.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_pager.h> +#include <vm/vm_kern.h> +#include <vm/vnode_pager.h> +#include <vm/swap_pager.h> +#include <vm/uma.h> + +#include <machine/cpu.h> +#include <machine/pcb.h> +#include <machine/smp.h> +#include <machine/md_var.h> +#include <x86/psl.h> +#include <x86/apicreg.h> +#include <x86/ifunc.h> + +#include <machine/vmm.h> +#include <machine/vmm_instruction_emul.h> +#include <machine/vmm_snapshot.h> + +#include <dev/vmm/vmm_dev.h> +#include <dev/vmm/vmm_ktr.h> +#include <dev/vmm/vmm_mem.h> + +#include "vmm_ioport.h" +#include "vmm_host.h" +#include "vmm_mem.h" +#include "vmm_util.h" +#include "vatpic.h" +#include "vatpit.h" +#include "vhpet.h" +#include "vioapic.h" +#include "vlapic.h" +#include "vpmtmr.h" +#include "vrtc.h" +#include "vmm_stat.h" +#include "vmm_lapic.h" + +#include "io/ppt.h" +#include "io/iommu.h" + +struct vlapic; + +/* + * Initialization: + * (a) allocated when vcpu is created + * (i) initialized when vcpu is created and when it is reinitialized + * (o) initialized the first time the vcpu is created + * (x) initialized before use + */ +struct vcpu { + struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */ + enum vcpu_state state; /* (o) vcpu state */ + int vcpuid; /* (o) */ + int hostcpu; /* (o) vcpu's host cpu */ + int reqidle; /* (i) request vcpu to idle */ + struct vm *vm; /* (o) */ + void *cookie; /* (i) cpu-specific data */ + struct vlapic *vlapic; /* (i) APIC device model */ + enum x2apic_state x2apic_state; /* (i) APIC mode */ + uint64_t exitintinfo; /* (i) events pending at VM exit */ + int nmi_pending; /* (i) NMI pending */ + int extint_pending; /* (i) INTR pending */ + int exception_pending; /* (i) exception pending */ + int exc_vector; /* (x) exception collateral */ + int exc_errcode_valid; + uint32_t exc_errcode; + struct savefpu *guestfpu; /* (a,i) guest fpu state */ + uint64_t guest_xcr0; /* (i) guest %xcr0 register */ + void *stats; /* (a,i) statistics */ + struct vm_exit exitinfo; /* (x) exit reason and collateral */ + cpuset_t exitinfo_cpuset; /* (x) storage for vmexit handlers */ + uint64_t nextrip; /* (x) next instruction to execute */ + uint64_t tsc_offset; /* (o) TSC offsetting */ +}; + +#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) +#define vcpu_lock_destroy(v) mtx_destroy(&((v)->mtx)) +#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) +#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) +#define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) + +/* + * Initialization: + * (o) initialized the first time the VM is created + * (i) initialized when VM is created and when it is reinitialized + * (x) initialized before use + * + * Locking: + * [m] mem_segs_lock + * [r] rendezvous_mtx + * [v] reads require one frozen vcpu, writes require freezing all vcpus + */ +struct vm { + void *cookie; /* (i) cpu-specific data */ + void *iommu; /* (x) iommu-specific data */ + struct vhpet *vhpet; /* (i) virtual HPET */ + struct vioapic *vioapic; /* (i) 
virtual ioapic */ + struct vatpic *vatpic; /* (i) virtual atpic */ + struct vatpit *vatpit; /* (i) virtual atpit */ + struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ + struct vrtc *vrtc; /* (o) virtual RTC */ + volatile cpuset_t active_cpus; /* (i) active vcpus */ + volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug */ + cpuset_t startup_cpus; /* (i) [r] waiting for startup */ + int suspend; /* (i) stop VM execution */ + bool dying; /* (o) is dying */ + volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ + volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ + cpuset_t rendezvous_req_cpus; /* (x) [r] rendezvous requested */ + cpuset_t rendezvous_done_cpus; /* (x) [r] rendezvous finished */ + void *rendezvous_arg; /* (x) [r] rendezvous func/arg */ + vm_rendezvous_func_t rendezvous_func; + struct mtx rendezvous_mtx; /* (o) rendezvous lock */ + struct vm_mem mem; /* (i) [m+v] guest memory */ + char name[VM_MAX_NAMELEN+1]; /* (o) virtual machine name */ + struct vcpu **vcpu; /* (o) guest vcpus */ + /* The following describe the vm cpu topology */ + uint16_t sockets; /* (o) num of sockets */ + uint16_t cores; /* (o) num of cores/socket */ + uint16_t threads; /* (o) num of threads/core */ + uint16_t maxcpus; /* (o) max pluggable cpus */ + struct sx vcpus_init_lock; /* (o) */ +}; + +#define VMM_CTR0(vcpu, format) \ + VCPU_CTR0((vcpu)->vm, (vcpu)->vcpuid, format) + +#define VMM_CTR1(vcpu, format, p1) \ + VCPU_CTR1((vcpu)->vm, (vcpu)->vcpuid, format, p1) + +#define VMM_CTR2(vcpu, format, p1, p2) \ + VCPU_CTR2((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2) + +#define VMM_CTR3(vcpu, format, p1, p2, p3) \ + VCPU_CTR3((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2, p3) + +#define VMM_CTR4(vcpu, format, p1, p2, p3, p4) \ + VCPU_CTR4((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2, p3, p4) + +static int vmm_initialized; + +static void vmmops_panic(void); + +static void +vmmops_panic(void) +{ + panic("vmm_ops func called when !vmm_is_intel() && !vmm_is_svm()"); +} + +#define DEFINE_VMMOPS_IFUNC(ret_type, opname, args) \ + DEFINE_IFUNC(, ret_type, vmmops_##opname, args) \ + { \ + if (vmm_is_intel()) \ + return (vmm_ops_intel.opname); \ + else if (vmm_is_svm()) \ + return (vmm_ops_amd.opname); \ + else \ + return ((ret_type (*)args)vmmops_panic); \ + } + +DEFINE_VMMOPS_IFUNC(int, modinit, (int ipinum)) +DEFINE_VMMOPS_IFUNC(int, modcleanup, (void)) +DEFINE_VMMOPS_IFUNC(void, modsuspend, (void)) +DEFINE_VMMOPS_IFUNC(void, modresume, (void)) +DEFINE_VMMOPS_IFUNC(void *, init, (struct vm *vm, struct pmap *pmap)) +DEFINE_VMMOPS_IFUNC(int, run, (void *vcpui, register_t rip, struct pmap *pmap, + struct vm_eventinfo *info)) +DEFINE_VMMOPS_IFUNC(void, cleanup, (void *vmi)) +DEFINE_VMMOPS_IFUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu, + int vcpu_id)) +DEFINE_VMMOPS_IFUNC(void, vcpu_cleanup, (void *vcpui)) +DEFINE_VMMOPS_IFUNC(int, getreg, (void *vcpui, int num, uint64_t *retval)) +DEFINE_VMMOPS_IFUNC(int, setreg, (void *vcpui, int num, uint64_t val)) +DEFINE_VMMOPS_IFUNC(int, getdesc, (void *vcpui, int num, struct seg_desc *desc)) +DEFINE_VMMOPS_IFUNC(int, setdesc, (void *vcpui, int num, struct seg_desc *desc)) +DEFINE_VMMOPS_IFUNC(int, getcap, (void *vcpui, int num, int *retval)) +DEFINE_VMMOPS_IFUNC(int, setcap, (void *vcpui, int num, int val)) +DEFINE_VMMOPS_IFUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min, + vm_offset_t max)) +DEFINE_VMMOPS_IFUNC(void, vmspace_free, (struct vmspace *vmspace)) +DEFINE_VMMOPS_IFUNC(struct vlapic *, vlapic_init, (void *vcpui)) 
+DEFINE_VMMOPS_IFUNC(void, vlapic_cleanup, (struct vlapic *vlapic)) +#ifdef BHYVE_SNAPSHOT +DEFINE_VMMOPS_IFUNC(int, vcpu_snapshot, (void *vcpui, + struct vm_snapshot_meta *meta)) +DEFINE_VMMOPS_IFUNC(int, restore_tsc, (void *vcpui, uint64_t now)) +#endif + +SDT_PROVIDER_DEFINE(vmm); + +static MALLOC_DEFINE(M_VM, "vm", "vm"); + +/* statistics */ +static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); + +SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, + NULL); + +/* + * Halt the guest if all vcpus are executing a HLT instruction with + * interrupts disabled. + */ +static int halt_detection_enabled = 1; +SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN, + &halt_detection_enabled, 0, + "Halt VM if all vcpus execute HLT with interrupts disabled"); + +static int vmm_ipinum; +SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, + "IPI vector used for vcpu notifications"); + +static int trace_guest_exceptions; +SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, + &trace_guest_exceptions, 0, + "Trap into hypervisor on all guest exceptions and reflect them back"); + +static int trap_wbinvd; +SYSCTL_INT(_hw_vmm, OID_AUTO, trap_wbinvd, CTLFLAG_RDTUN, &trap_wbinvd, 0, + "WBINVD triggers a VM-exit"); + +u_int vm_maxcpu; +SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, + &vm_maxcpu, 0, "Maximum number of vCPUs"); + +static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr); + +/* global statistics */ +VMM_STAT(VCPU_MIGRATIONS, "vcpu migration across host cpus"); +VMM_STAT(VMEXIT_COUNT, "total number of vm exits"); +VMM_STAT(VMEXIT_EXTINT, "vm exits due to external interrupt"); +VMM_STAT(VMEXIT_HLT, "number of times hlt was intercepted"); +VMM_STAT(VMEXIT_CR_ACCESS, "number of times %cr access was intercepted"); +VMM_STAT(VMEXIT_RDMSR, "number of times rdmsr was intercepted"); +VMM_STAT(VMEXIT_WRMSR, "number of times wrmsr was intercepted"); +VMM_STAT(VMEXIT_MTRAP, "number of monitor trap exits"); +VMM_STAT(VMEXIT_PAUSE, "number of times pause was intercepted"); +VMM_STAT(VMEXIT_INTR_WINDOW, "vm exits due to interrupt window opening"); +VMM_STAT(VMEXIT_NMI_WINDOW, "vm exits due to nmi window opening"); +VMM_STAT(VMEXIT_INOUT, "number of times in/out was intercepted"); +VMM_STAT(VMEXIT_CPUID, "number of times cpuid was intercepted"); +VMM_STAT(VMEXIT_NESTED_FAULT, "vm exits due to nested page fault"); +VMM_STAT(VMEXIT_INST_EMUL, "vm exits for instruction emulation"); +VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason"); +VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit"); +VMM_STAT(VMEXIT_REQIDLE, "number of times idle requested at exit"); +VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace"); +VMM_STAT(VMEXIT_RENDEZVOUS, "number of times rendezvous pending at exit"); +VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions"); + +/* + * Upper limit on vm_maxcpu. Limited by use of uint16_t types for CPU + * counts as well as range of vpid values for VT-x and by the capacity + * of cpuset_t masks. The call to new_unrhdr() in vpid_init() in + * vmx.c requires 'vm_maxcpu + 1 <= 0xffff', hence the '- 1' below. 
+ */ +#define VM_MAXCPU MIN(0xffff - 1, CPU_SETSIZE) + +#ifdef KTR +static const char * +vcpu_state2str(enum vcpu_state state) +{ + + switch (state) { + case VCPU_IDLE: + return ("idle"); + case VCPU_FROZEN: + return ("frozen"); + case VCPU_RUNNING: + return ("running"); + case VCPU_SLEEPING: + return ("sleeping"); + default: + return ("unknown"); + } +} +#endif + +static void +vcpu_cleanup(struct vcpu *vcpu, bool destroy) +{ + vmmops_vlapic_cleanup(vcpu->vlapic); + vmmops_vcpu_cleanup(vcpu->cookie); + vcpu->cookie = NULL; + if (destroy) { + vmm_stat_free(vcpu->stats); + fpu_save_area_free(vcpu->guestfpu); + vcpu_lock_destroy(vcpu); + free(vcpu, M_VM); + } +} + +static struct vcpu * +vcpu_alloc(struct vm *vm, int vcpu_id) +{ + struct vcpu *vcpu; + + KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, + ("vcpu_init: invalid vcpu %d", vcpu_id)); + + vcpu = malloc(sizeof(*vcpu), M_VM, M_WAITOK | M_ZERO); + vcpu_lock_init(vcpu); + vcpu->state = VCPU_IDLE; + vcpu->hostcpu = NOCPU; + vcpu->vcpuid = vcpu_id; + vcpu->vm = vm; + vcpu->guestfpu = fpu_save_area_alloc(); + vcpu->stats = vmm_stat_alloc(); + vcpu->tsc_offset = 0; + return (vcpu); +} + +static void +vcpu_init(struct vcpu *vcpu) +{ + vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid); + vcpu->vlapic = vmmops_vlapic_init(vcpu->cookie); + vm_set_x2apic_state(vcpu, X2APIC_DISABLED); + vcpu->reqidle = 0; + vcpu->exitintinfo = 0; + vcpu->nmi_pending = 0; + vcpu->extint_pending = 0; + vcpu->exception_pending = 0; + vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; + fpu_save_area_reset(vcpu->guestfpu); + vmm_stat_init(vcpu->stats); +} + +int +vcpu_trace_exceptions(struct vcpu *vcpu) +{ + + return (trace_guest_exceptions); +} + +int +vcpu_trap_wbinvd(struct vcpu *vcpu) +{ + return (trap_wbinvd); +} + +struct vm_exit * +vm_exitinfo(struct vcpu *vcpu) +{ + return (&vcpu->exitinfo); +} + +cpuset_t * +vm_exitinfo_cpuset(struct vcpu *vcpu) +{ + return (&vcpu->exitinfo_cpuset); +} + +static int +vmm_init(void) +{ + if (!vmm_is_hw_supported()) + return (ENXIO); + + vm_maxcpu = mp_ncpus; + TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu); + + if (vm_maxcpu > VM_MAXCPU) { + printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU); + vm_maxcpu = VM_MAXCPU; + } + if (vm_maxcpu == 0) + vm_maxcpu = 1; + + vmm_host_state_init(); + + vmm_ipinum = lapic_ipi_alloc(pti ? 
&IDTVEC(justreturn1_pti) : + &IDTVEC(justreturn)); + if (vmm_ipinum < 0) + vmm_ipinum = IPI_AST; + + vmm_suspend_p = vmmops_modsuspend; + vmm_resume_p = vmmops_modresume; + + return (vmmops_modinit(vmm_ipinum)); +} + +static int +vmm_handler(module_t mod, int what, void *arg) +{ + int error; + + switch (what) { + case MOD_LOAD: + if (vmm_is_hw_supported()) { + error = vmmdev_init(); + if (error != 0) + break; + error = vmm_init(); + if (error == 0) + vmm_initialized = 1; + else + (void)vmmdev_cleanup(); + } else { + error = ENXIO; + } + break; + case MOD_UNLOAD: + if (vmm_is_hw_supported()) { + error = vmmdev_cleanup(); + if (error == 0) { + vmm_suspend_p = NULL; + vmm_resume_p = NULL; + iommu_cleanup(); + if (vmm_ipinum != IPI_AST) + lapic_ipi_free(vmm_ipinum); + error = vmmops_modcleanup(); + /* + * Something bad happened - prevent new + * VMs from being created + */ + if (error) + vmm_initialized = 0; + } + } else { + error = 0; + } + break; + default: + error = 0; + break; + } + return (error); +} + +static moduledata_t vmm_kmod = { + "vmm", + vmm_handler, + NULL +}; + +/* + * vmm initialization has the following dependencies: + * + * - VT-x initialization requires smp_rendezvous() and therefore must happen + * after SMP is fully functional (after SI_SUB_SMP). + * - vmm device initialization requires an initialized devfs. + */ +DECLARE_MODULE(vmm, vmm_kmod, MAX(SI_SUB_SMP, SI_SUB_DEVFS) + 1, SI_ORDER_ANY); +MODULE_VERSION(vmm, 1); + +static void +vm_init(struct vm *vm, bool create) +{ + vm->cookie = vmmops_init(vm, vmspace_pmap(vm_vmspace(vm))); + vm->iommu = NULL; + vm->vioapic = vioapic_init(vm); + vm->vhpet = vhpet_init(vm); + vm->vatpic = vatpic_init(vm); + vm->vatpit = vatpit_init(vm); + vm->vpmtmr = vpmtmr_init(vm); + if (create) + vm->vrtc = vrtc_init(vm); + + CPU_ZERO(&vm->active_cpus); + CPU_ZERO(&vm->debug_cpus); + CPU_ZERO(&vm->startup_cpus); + + vm->suspend = 0; + CPU_ZERO(&vm->suspended_cpus); + + if (!create) { + for (int i = 0; i < vm->maxcpus; i++) { + if (vm->vcpu[i] != NULL) + vcpu_init(vm->vcpu[i]); + } + } +} + +void +vm_disable_vcpu_creation(struct vm *vm) +{ + sx_xlock(&vm->vcpus_init_lock); + vm->dying = true; + sx_xunlock(&vm->vcpus_init_lock); +} + +struct vcpu * +vm_alloc_vcpu(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm)) + return (NULL); + + vcpu = (struct vcpu *) + atomic_load_acq_ptr((uintptr_t *)&vm->vcpu[vcpuid]); + if (__predict_true(vcpu != NULL)) + return (vcpu); + + sx_xlock(&vm->vcpus_init_lock); + vcpu = vm->vcpu[vcpuid]; + if (vcpu == NULL && !vm->dying) { + vcpu = vcpu_alloc(vm, vcpuid); + vcpu_init(vcpu); + + /* + * Ensure vCPU is fully created before updating pointer + * to permit unlocked reads above. + */ + atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid], + (uintptr_t)vcpu); + } + sx_xunlock(&vm->vcpus_init_lock); + return (vcpu); +} + +void +vm_lock_vcpus(struct vm *vm) +{ + sx_xlock(&vm->vcpus_init_lock); +} + +void +vm_unlock_vcpus(struct vm *vm) +{ + sx_unlock(&vm->vcpus_init_lock); +} + +/* + * The default CPU topology is a single thread per package. + */ +u_int cores_per_package = 1; +u_int threads_per_core = 1; + +int +vm_create(const char *name, struct vm **retvm) +{ + struct vm *vm; + int error; + + /* + * If vmm.ko could not be successfully initialized then don't attempt + * to create the virtual machine. 
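+ * (vmm_initialized is set by the MOD_LOAD handler above only after
+ * vmm_init() has succeeded.)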
+ */ + if (!vmm_initialized) + return (ENXIO); + + if (name == NULL || strnlen(name, VM_MAX_NAMELEN + 1) == + VM_MAX_NAMELEN + 1) + return (EINVAL); + + vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); + error = vm_mem_init(&vm->mem, 0, VM_MAXUSER_ADDRESS_LA48); + if (error != 0) { + free(vm, M_VM); + return (error); + } + strcpy(vm->name, name); + mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); + sx_init(&vm->vcpus_init_lock, "vm vcpus"); + vm->vcpu = malloc(sizeof(*vm->vcpu) * vm_maxcpu, M_VM, M_WAITOK | + M_ZERO); + + vm->sockets = 1; + vm->cores = cores_per_package; /* XXX backwards compatibility */ + vm->threads = threads_per_core; /* XXX backwards compatibility */ + vm->maxcpus = vm_maxcpu; + + vm_init(vm, true); + + *retvm = vm; + return (0); +} + +void +vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, + uint16_t *threads, uint16_t *maxcpus) +{ + *sockets = vm->sockets; + *cores = vm->cores; + *threads = vm->threads; + *maxcpus = vm->maxcpus; +} + +uint16_t +vm_get_maxcpus(struct vm *vm) +{ + return (vm->maxcpus); +} + +int +vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, + uint16_t threads, uint16_t maxcpus __unused) +{ + /* Ignore maxcpus. */ + if ((sockets * cores * threads) > vm->maxcpus) + return (EINVAL); + vm->sockets = sockets; + vm->cores = cores; + vm->threads = threads; + return(0); +} + +static void +vm_cleanup(struct vm *vm, bool destroy) +{ + if (destroy) + vm_xlock_memsegs(vm); + else + vm_assert_memseg_xlocked(vm); + + ppt_unassign_all(vm); + + if (vm->iommu != NULL) + iommu_destroy_domain(vm->iommu); + + if (destroy) + vrtc_cleanup(vm->vrtc); + else + vrtc_reset(vm->vrtc); + vpmtmr_cleanup(vm->vpmtmr); + vatpit_cleanup(vm->vatpit); + vhpet_cleanup(vm->vhpet); + vatpic_cleanup(vm->vatpic); + vioapic_cleanup(vm->vioapic); + + for (int i = 0; i < vm->maxcpus; i++) { + if (vm->vcpu[i] != NULL) + vcpu_cleanup(vm->vcpu[i], destroy); + } + + vmmops_cleanup(vm->cookie); + + vm_mem_cleanup(vm); + + if (destroy) { + vm_mem_destroy(vm); + + free(vm->vcpu, M_VM); + sx_destroy(&vm->vcpus_init_lock); + mtx_destroy(&vm->rendezvous_mtx); + } +} + +void +vm_destroy(struct vm *vm) +{ + vm_cleanup(vm, true); + free(vm, M_VM); +} + +int +vm_reinit(struct vm *vm) +{ + int error; + + /* + * A virtual machine can be reset only if all vcpus are suspended. 
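+ * This is checked below by comparing 'suspended_cpus' against
+ * 'active_cpus'.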
+ */ + if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { + vm_cleanup(vm, false); + vm_init(vm, false); + error = 0; + } else { + error = EBUSY; + } + + return (error); +} + +const char * +vm_name(struct vm *vm) +{ + return (vm->name); +} + +int +vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) +{ + return (vmm_mmio_alloc(vm_vmspace(vm), gpa, len, hpa)); +} + +int +vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) +{ + + vmm_mmio_free(vm_vmspace(vm), gpa, len); + return (0); +} + +static int +vm_iommu_map(struct vm *vm) +{ + pmap_t pmap; + vm_paddr_t gpa, hpa; + struct vm_mem_map *mm; + int error, i; + + sx_assert(&vm->mem.mem_segs_lock, SX_LOCKED); + + pmap = vmspace_pmap(vm_vmspace(vm)); + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + if (!vm_memseg_sysmem(vm, i)) + continue; + + mm = &vm->mem.mem_maps[i]; + KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0, + ("iommu map found invalid memmap %#lx/%#lx/%#x", + mm->gpa, mm->len, mm->flags)); + if ((mm->flags & VM_MEMMAP_F_WIRED) == 0) + continue; + mm->flags |= VM_MEMMAP_F_IOMMU; + + for (gpa = mm->gpa; gpa < mm->gpa + mm->len; gpa += PAGE_SIZE) { + hpa = pmap_extract(pmap, gpa); + + /* + * All mappings in the vmm vmspace must be + * present since they are managed by vmm in this way. + * Because we are in pass-through mode, the + * mappings must also be wired. This implies + * that all pages must be mapped and wired, + * allowing to use pmap_extract() and avoiding the + * need to use vm_gpa_hold_global(). + * + * This could change if/when we start + * supporting page faults on IOMMU maps. + */ + KASSERT(vm_page_wired(PHYS_TO_VM_PAGE(hpa)), + ("vm_iommu_map: vm %p gpa %jx hpa %jx not wired", + vm, (uintmax_t)gpa, (uintmax_t)hpa)); + + iommu_create_mapping(vm->iommu, gpa, hpa, PAGE_SIZE); + } + } + + error = iommu_invalidate_tlb(iommu_host_domain()); + return (error); +} + +static int +vm_iommu_unmap(struct vm *vm) +{ + vm_paddr_t gpa; + struct vm_mem_map *mm; + int error, i; + + sx_assert(&vm->mem.mem_segs_lock, SX_LOCKED); + + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + if (!vm_memseg_sysmem(vm, i)) + continue; + + mm = &vm->mem.mem_maps[i]; + if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0) + continue; + mm->flags &= ~VM_MEMMAP_F_IOMMU; + KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0, + ("iommu unmap found invalid memmap %#lx/%#lx/%#x", + mm->gpa, mm->len, mm->flags)); + + for (gpa = mm->gpa; gpa < mm->gpa + mm->len; gpa += PAGE_SIZE) { + KASSERT(vm_page_wired(PHYS_TO_VM_PAGE(pmap_extract( + vmspace_pmap(vm_vmspace(vm)), gpa))), + ("vm_iommu_unmap: vm %p gpa %jx not wired", + vm, (uintmax_t)gpa)); + iommu_remove_mapping(vm->iommu, gpa, PAGE_SIZE); + } + } + + /* + * Invalidate the cached translations associated with the domain + * from which pages were removed. 
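+ * (Contrast with vm_iommu_map(), which invalidates the host domain's
+ * translations.)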
+ */ + error = iommu_invalidate_tlb(vm->iommu); + return (error); +} + +int +vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) +{ + int error; + + error = ppt_unassign_device(vm, bus, slot, func); + if (error) + return (error); + + if (ppt_assigned_devices(vm) == 0) + error = vm_iommu_unmap(vm); + + return (error); +} + +int +vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) +{ + int error; + vm_paddr_t maxaddr; + bool map = false; + + /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */ + if (ppt_assigned_devices(vm) == 0) { + KASSERT(vm->iommu == NULL, + ("vm_assign_pptdev: iommu must be NULL")); + maxaddr = vmm_sysmem_maxaddr(vm); + vm->iommu = iommu_create_domain(maxaddr); + if (vm->iommu == NULL) + return (ENXIO); + map = true; + } + + error = ppt_assign_device(vm, bus, slot, func); + if (error == 0 && map) + error = vm_iommu_map(vm); + return (error); +} + +int +vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval) +{ + /* Negative values represent VM control structure fields. */ + if (reg >= VM_REG_LAST) + return (EINVAL); + + return (vmmops_getreg(vcpu->cookie, reg, retval)); +} + +int +vm_set_register(struct vcpu *vcpu, int reg, uint64_t val) +{ + int error; + + /* Negative values represent VM control structure fields. */ + if (reg >= VM_REG_LAST) + return (EINVAL); + + error = vmmops_setreg(vcpu->cookie, reg, val); + if (error || reg != VM_REG_GUEST_RIP) + return (error); + + /* Set 'nextrip' to match the value of %rip */ + VMM_CTR1(vcpu, "Setting nextrip to %#lx", val); + vcpu->nextrip = val; + return (0); +} + +static bool +is_descriptor_table(int reg) +{ + + switch (reg) { + case VM_REG_GUEST_IDTR: + case VM_REG_GUEST_GDTR: + return (true); + default: + return (false); + } +} + +static bool +is_segment_register(int reg) +{ + + switch (reg) { + case VM_REG_GUEST_ES: + case VM_REG_GUEST_CS: + case VM_REG_GUEST_SS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_TR: + case VM_REG_GUEST_LDTR: + return (true); + default: + return (false); + } +} + +int +vm_get_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *desc) +{ + + if (!is_segment_register(reg) && !is_descriptor_table(reg)) + return (EINVAL); + + return (vmmops_getdesc(vcpu->cookie, reg, desc)); +} + +int +vm_set_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *desc) +{ + + if (!is_segment_register(reg) && !is_descriptor_table(reg)) + return (EINVAL); + + return (vmmops_setdesc(vcpu->cookie, reg, desc)); +} + +static void +restore_guest_fpustate(struct vcpu *vcpu) +{ + + /* flush host state to the pcb */ + fpuexit(curthread); + + /* restore guest FPU state */ + fpu_enable(); + fpurestore(vcpu->guestfpu); + + /* restore guest XCR0 if XSAVE is enabled in the host */ + if (rcr4() & CR4_XSAVE) + load_xcr(0, vcpu->guest_xcr0); + + /* + * The FPU is now "dirty" with the guest's state so disable + * the FPU to trap any access by the host. + */ + fpu_disable(); +} + +static void +save_guest_fpustate(struct vcpu *vcpu) +{ + + if ((rcr0() & CR0_TS) == 0) + panic("fpu emulation not enabled in host!"); + + /* save guest XCR0 and restore host XCR0 */ + if (rcr4() & CR4_XSAVE) { + vcpu->guest_xcr0 = rxcr(0); + load_xcr(0, vmm_get_host_xcr0()); + } + + /* save guest FPU state */ + fpu_enable(); + fpusave(vcpu->guestfpu); + fpu_disable(); +} + +static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); + +/* + * Invoke the rendezvous function on the specified vcpu if applicable. Return + * true if the rendezvous is finished, false otherwise. 
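+ * The caller must hold the rendezvous mutex; this is asserted on entry.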
+ */ +static bool +vm_rendezvous(struct vcpu *vcpu) +{ + struct vm *vm = vcpu->vm; + int vcpuid; + + mtx_assert(&vcpu->vm->rendezvous_mtx, MA_OWNED); + KASSERT(vcpu->vm->rendezvous_func != NULL, + ("vm_rendezvous: no rendezvous pending")); + + /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ + CPU_AND(&vm->rendezvous_req_cpus, &vm->rendezvous_req_cpus, + &vm->active_cpus); + + vcpuid = vcpu->vcpuid; + if (CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && + !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { + VMM_CTR0(vcpu, "Calling rendezvous func"); + (*vm->rendezvous_func)(vcpu, vm->rendezvous_arg); + CPU_SET(vcpuid, &vm->rendezvous_done_cpus); + } + if (CPU_CMP(&vm->rendezvous_req_cpus, + &vm->rendezvous_done_cpus) == 0) { + VMM_CTR0(vcpu, "Rendezvous completed"); + CPU_ZERO(&vm->rendezvous_req_cpus); + vm->rendezvous_func = NULL; + wakeup(&vm->rendezvous_func); + return (true); + } + return (false); +} + +static void +vcpu_wait_idle(struct vcpu *vcpu) +{ + KASSERT(vcpu->state != VCPU_IDLE, ("vcpu already idle")); + + vcpu->reqidle = 1; + vcpu_notify_event_locked(vcpu, false); + VMM_CTR1(vcpu, "vcpu state change from %s to " + "idle requested", vcpu_state2str(vcpu->state)); + msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); +} + +static int +vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, + bool from_idle) +{ + int error; + + vcpu_assert_locked(vcpu); + + /* + * State transitions from the vmmdev_ioctl() must always begin from + * the VCPU_IDLE state. This guarantees that there is only a single + * ioctl() operating on a vcpu at any point. + */ + if (from_idle) { + while (vcpu->state != VCPU_IDLE) + vcpu_wait_idle(vcpu); + } else { + KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " + "vcpu idle state")); + } + + if (vcpu->state == VCPU_RUNNING) { + KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " + "mismatch for running vcpu", curcpu, vcpu->hostcpu)); + } else { + KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " + "vcpu that is not running", vcpu->hostcpu)); + } + + /* + * The following state transitions are allowed: + * IDLE -> FROZEN -> IDLE + * FROZEN -> RUNNING -> FROZEN + * FROZEN -> SLEEPING -> FROZEN + */ + switch (vcpu->state) { + case VCPU_IDLE: + case VCPU_RUNNING: + case VCPU_SLEEPING: + error = (newstate != VCPU_FROZEN); + break; + case VCPU_FROZEN: + error = (newstate == VCPU_FROZEN); + break; + default: + error = 1; + break; + } + + if (error) + return (EBUSY); + + VMM_CTR2(vcpu, "vcpu state changed from %s to %s", + vcpu_state2str(vcpu->state), vcpu_state2str(newstate)); + + vcpu->state = newstate; + if (newstate == VCPU_RUNNING) + vcpu->hostcpu = curcpu; + else + vcpu->hostcpu = NOCPU; + + if (newstate == VCPU_IDLE) + wakeup(&vcpu->state); + + return (0); +} + +/* + * Try to lock all of the vCPUs in the VM while taking care to avoid deadlocks + * with vm_smp_rendezvous(). + * + * The complexity here suggests that the rendezvous mechanism needs a rethink. + */ +int +vcpu_set_state_all(struct vm *vm, enum vcpu_state newstate) +{ + cpuset_t locked; + struct vcpu *vcpu; + int error, i; + uint16_t maxcpus; + + KASSERT(newstate != VCPU_IDLE, + ("vcpu_set_state_all: invalid target state %d", newstate)); + + error = 0; + CPU_ZERO(&locked); + maxcpus = vm->maxcpus; + + mtx_lock(&vm->rendezvous_mtx); +restart: + if (vm->rendezvous_func != NULL) { + /* + * If we have a pending rendezvous, then the initiator may be + * blocked waiting for other vCPUs to execute the callback. 
The + * current thread may be a vCPU thread so we must not block + * waiting for the initiator, otherwise we get a deadlock. + * Thus, execute the callback on behalf of any idle vCPUs. + */ + for (i = 0; i < maxcpus; i++) { + vcpu = vm_vcpu(vm, i); + if (vcpu == NULL) + continue; + vcpu_lock(vcpu); + if (vcpu->state == VCPU_IDLE) { + (void)vcpu_set_state_locked(vcpu, VCPU_FROZEN, + true); + CPU_SET(i, &locked); + } + if (CPU_ISSET(i, &locked)) { + /* + * We can safely execute the callback on this + * vCPU's behalf. + */ + vcpu_unlock(vcpu); + (void)vm_rendezvous(vcpu); + vcpu_lock(vcpu); + } + vcpu_unlock(vcpu); + } + } + + /* + * Now wait for remaining vCPUs to become idle. This may include the + * initiator of a rendezvous that is currently blocked on the rendezvous + * mutex. + */ + CPU_FOREACH_ISCLR(i, &locked) { + if (i >= maxcpus) + break; + vcpu = vm_vcpu(vm, i); + if (vcpu == NULL) + continue; + vcpu_lock(vcpu); + while (vcpu->state != VCPU_IDLE) { + mtx_unlock(&vm->rendezvous_mtx); + vcpu_wait_idle(vcpu); + vcpu_unlock(vcpu); + mtx_lock(&vm->rendezvous_mtx); + if (vm->rendezvous_func != NULL) + goto restart; + vcpu_lock(vcpu); + } + error = vcpu_set_state_locked(vcpu, newstate, true); + vcpu_unlock(vcpu); + if (error != 0) { + /* Roll back state changes. */ + CPU_FOREACH_ISSET(i, &locked) + (void)vcpu_set_state(vcpu, VCPU_IDLE, false); + break; + } + CPU_SET(i, &locked); + } + mtx_unlock(&vm->rendezvous_mtx); + return (error); +} + +static void +vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate) +{ + int error; + + if ((error = vcpu_set_state(vcpu, newstate, false)) != 0) + panic("Error %d setting state to %d\n", error, newstate); +} + +static void +vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) +{ + int error; + + if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0) + panic("Error %d setting state to %d", error, newstate); +} + +static int +vm_handle_rendezvous(struct vcpu *vcpu) +{ + struct vm *vm; + struct thread *td; + + td = curthread; + vm = vcpu->vm; + + mtx_lock(&vm->rendezvous_mtx); + while (vm->rendezvous_func != NULL) { + if (vm_rendezvous(vcpu)) + break; + + VMM_CTR0(vcpu, "Wait for rendezvous completion"); + mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, + "vmrndv", hz); + if (td_ast_pending(td, TDA_SUSPEND)) { + int error; + + mtx_unlock(&vm->rendezvous_mtx); + error = thread_check_susp(td, true); + if (error != 0) + return (error); + mtx_lock(&vm->rendezvous_mtx); + } + } + mtx_unlock(&vm->rendezvous_mtx); + return (0); +} + +/* + * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. + */ +static int +vm_handle_hlt(struct vcpu *vcpu, bool intr_disabled, bool *retu) +{ + struct vm *vm = vcpu->vm; + const char *wmesg; + struct thread *td; + int error, t, vcpuid, vcpu_halted, vm_halted; + + vcpuid = vcpu->vcpuid; + vcpu_halted = 0; + vm_halted = 0; + error = 0; + td = curthread; + + KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); + + vcpu_lock(vcpu); + while (1) { + /* + * Do a final check for pending NMI or interrupts before + * really putting this thread to sleep. Also check for + * software events that would cause this vcpu to wakeup. + * + * These interrupts/events could have happened after the + * vcpu returned from vmmops_run() and before it acquired the + * vcpu lock above. 
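+ * Holding the vcpu lock across these checks and the msleep_spin() below
+ * is what closes the race with vcpu_notify_event_locked().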
+ */ + if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle) + break; + if (vm_nmi_pending(vcpu)) + break; + if (!intr_disabled) { + if (vm_extint_pending(vcpu) || + vlapic_pending_intr(vcpu->vlapic, NULL)) { + break; + } + } + + /* Don't go to sleep if the vcpu thread needs to yield */ + if (vcpu_should_yield(vcpu)) + break; + + if (vcpu_debugged(vcpu)) + break; + + /* + * Some Linux guests implement "halt" by having all vcpus + * execute HLT with interrupts disabled. 'halted_cpus' keeps + * track of the vcpus that have entered this state. When all + * vcpus enter the halted state the virtual machine is halted. + */ + if (intr_disabled) { + wmesg = "vmhalt"; + VMM_CTR0(vcpu, "Halted"); + if (!vcpu_halted && halt_detection_enabled) { + vcpu_halted = 1; + CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); + } + if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { + vm_halted = 1; + break; + } + } else { + wmesg = "vmidle"; + } + + t = ticks; + vcpu_require_state_locked(vcpu, VCPU_SLEEPING); + /* + * XXX msleep_spin() cannot be interrupted by signals so + * wake up periodically to check pending signals. + */ + msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); + vcpu_require_state_locked(vcpu, VCPU_FROZEN); + vmm_stat_incr(vcpu, VCPU_IDLE_TICKS, ticks - t); + if (td_ast_pending(td, TDA_SUSPEND)) { + vcpu_unlock(vcpu); + error = thread_check_susp(td, false); + if (error != 0) { + if (vcpu_halted) { + CPU_CLR_ATOMIC(vcpuid, + &vm->halted_cpus); + } + return (error); + } + vcpu_lock(vcpu); + } + } + + if (vcpu_halted) + CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); + + vcpu_unlock(vcpu); + + if (vm_halted) + vm_suspend(vm, VM_SUSPEND_HALT); + + return (0); +} + +static int +vm_handle_paging(struct vcpu *vcpu, bool *retu) +{ + struct vm *vm = vcpu->vm; + int rv, ftype; + struct vm_map *map; + struct vm_exit *vme; + + vme = &vcpu->exitinfo; + + KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", + __func__, vme->inst_length)); + + ftype = vme->u.paging.fault_type; + KASSERT(ftype == VM_PROT_READ || + ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, + ("vm_handle_paging: invalid fault_type %d", ftype)); + + if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { + rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm_vmspace(vm)), + vme->u.paging.gpa, ftype); + if (rv == 0) { + VMM_CTR2(vcpu, "%s bit emulation for gpa %#lx", + ftype == VM_PROT_READ ? 
"accessed" : "dirty", + vme->u.paging.gpa); + goto done; + } + } + + map = &vm_vmspace(vm)->vm_map; + rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL); + + VMM_CTR3(vcpu, "vm_handle_paging rv = %d, gpa = %#lx, " + "ftype = %d", rv, vme->u.paging.gpa, ftype); + + if (rv != KERN_SUCCESS) + return (EFAULT); +done: + return (0); +} + +static int +vm_handle_inst_emul(struct vcpu *vcpu, bool *retu) +{ + struct vie *vie; + struct vm_exit *vme; + uint64_t gla, gpa, cs_base; + struct vm_guest_paging *paging; + mem_region_read_t mread; + mem_region_write_t mwrite; + enum vm_cpu_mode cpu_mode; + int cs_d, error, fault; + + vme = &vcpu->exitinfo; + + KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", + __func__, vme->inst_length)); + + gla = vme->u.inst_emul.gla; + gpa = vme->u.inst_emul.gpa; + cs_base = vme->u.inst_emul.cs_base; + cs_d = vme->u.inst_emul.cs_d; + vie = &vme->u.inst_emul.vie; + paging = &vme->u.inst_emul.paging; + cpu_mode = paging->cpu_mode; + + VMM_CTR1(vcpu, "inst_emul fault accessing gpa %#lx", gpa); + + /* Fetch, decode and emulate the faulting instruction */ + if (vie->num_valid == 0) { + error = vmm_fetch_instruction(vcpu, paging, vme->rip + cs_base, + VIE_INST_SIZE, vie, &fault); + } else { + /* + * The instruction bytes have already been copied into 'vie' + */ + error = fault = 0; + } + if (error || fault) + return (error); + + if (vmm_decode_instruction(vcpu, gla, cpu_mode, cs_d, vie) != 0) { + VMM_CTR1(vcpu, "Error decoding instruction at %#lx", + vme->rip + cs_base); + *retu = true; /* dump instruction bytes in userspace */ + return (0); + } + + /* + * Update 'nextrip' based on the length of the emulated instruction. + */ + vme->inst_length = vie->num_processed; + vcpu->nextrip += vie->num_processed; + VMM_CTR1(vcpu, "nextrip updated to %#lx after instruction decoding", + vcpu->nextrip); + + /* return to userland unless this is an in-kernel emulated device */ + if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { + mread = lapic_mmio_read; + mwrite = lapic_mmio_write; + } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { + mread = vioapic_mmio_read; + mwrite = vioapic_mmio_write; + } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { + mread = vhpet_mmio_read; + mwrite = vhpet_mmio_write; + } else { + *retu = true; + return (0); + } + + error = vmm_emulate_instruction(vcpu, gpa, vie, paging, mread, mwrite, + retu); + + return (error); +} + +static int +vm_handle_suspend(struct vcpu *vcpu, bool *retu) +{ + struct vm *vm = vcpu->vm; + int error, i; + struct thread *td; + + error = 0; + td = curthread; + + CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus); + + /* + * Wait until all 'active_cpus' have suspended themselves. + * + * Since a VM may be suspended at any time including when one or + * more vcpus are doing a rendezvous we need to call the rendezvous + * handler while we are waiting to prevent a deadlock. 
+ */ + vcpu_lock(vcpu); + while (error == 0) { + if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { + VMM_CTR0(vcpu, "All vcpus suspended"); + break; + } + + if (vm->rendezvous_func == NULL) { + VMM_CTR0(vcpu, "Sleeping during suspend"); + vcpu_require_state_locked(vcpu, VCPU_SLEEPING); + msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); + vcpu_require_state_locked(vcpu, VCPU_FROZEN); + if (td_ast_pending(td, TDA_SUSPEND)) { + vcpu_unlock(vcpu); + error = thread_check_susp(td, false); + vcpu_lock(vcpu); + } + } else { + VMM_CTR0(vcpu, "Rendezvous during suspend"); + vcpu_unlock(vcpu); + error = vm_handle_rendezvous(vcpu); + vcpu_lock(vcpu); + } + } + vcpu_unlock(vcpu); + + /* + * Wakeup the other sleeping vcpus and return to userspace. + */ + for (i = 0; i < vm->maxcpus; i++) { + if (CPU_ISSET(i, &vm->suspended_cpus)) { + vcpu_notify_event(vm_vcpu(vm, i), false); + } + } + + *retu = true; + return (error); +} + +static int +vm_handle_reqidle(struct vcpu *vcpu, bool *retu) +{ + vcpu_lock(vcpu); + KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle)); + vcpu->reqidle = 0; + vcpu_unlock(vcpu); + *retu = true; + return (0); +} + +static int +vm_handle_db(struct vcpu *vcpu, struct vm_exit *vme, bool *retu) +{ + int error, fault; + uint64_t rsp; + uint64_t rflags; + struct vm_copyinfo copyinfo[2]; + + *retu = true; + if (!vme->u.dbg.pushf_intercept || vme->u.dbg.tf_shadow_val != 0) { + return (0); + } + + vm_get_register(vcpu, VM_REG_GUEST_RSP, &rsp); + error = vm_copy_setup(vcpu, &vme->u.dbg.paging, rsp, sizeof(uint64_t), + VM_PROT_RW, copyinfo, nitems(copyinfo), &fault); + if (error != 0 || fault != 0) { + *retu = false; + return (EINVAL); + } + + /* Read pushed rflags value from top of stack. */ + vm_copyin(copyinfo, &rflags, sizeof(uint64_t)); + + /* Clear TF bit. */ + rflags &= ~(PSL_T); + + /* Write updated value back to memory. */ + vm_copyout(&rflags, copyinfo, sizeof(uint64_t)); + vm_copy_teardown(copyinfo, nitems(copyinfo)); + + return (0); +} + +int +vm_suspend(struct vm *vm, enum vm_suspend_how how) +{ + int i; + + if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) + return (EINVAL); + + if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) { + VM_CTR2(vm, "virtual machine already suspended %d/%d", + vm->suspend, how); + return (EALREADY); + } + + VM_CTR1(vm, "virtual machine successfully suspended %d", how); + + /* + * Notify all active vcpus that they are now suspended. 
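+ * Each one will eventually exit to userspace with VM_EXITCODE_SUSPENDED
+ * (see vm_exit_suspended() below).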
+ */ + for (i = 0; i < vm->maxcpus; i++) { + if (CPU_ISSET(i, &vm->active_cpus)) + vcpu_notify_event(vm_vcpu(vm, i), false); + } + + return (0); +} + +void +vm_exit_suspended(struct vcpu *vcpu, uint64_t rip) +{ + struct vm *vm = vcpu->vm; + struct vm_exit *vmexit; + + KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, + ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); + + vmexit = vm_exitinfo(vcpu); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_SUSPENDED; + vmexit->u.suspended.how = vm->suspend; +} + +void +vm_exit_debug(struct vcpu *vcpu, uint64_t rip) +{ + struct vm_exit *vmexit; + + vmexit = vm_exitinfo(vcpu); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_DEBUG; +} + +void +vm_exit_rendezvous(struct vcpu *vcpu, uint64_t rip) +{ + struct vm_exit *vmexit; + + vmexit = vm_exitinfo(vcpu); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_RENDEZVOUS; + vmm_stat_incr(vcpu, VMEXIT_RENDEZVOUS, 1); +} + +void +vm_exit_reqidle(struct vcpu *vcpu, uint64_t rip) +{ + struct vm_exit *vmexit; + + vmexit = vm_exitinfo(vcpu); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_REQIDLE; + vmm_stat_incr(vcpu, VMEXIT_REQIDLE, 1); +} + +void +vm_exit_astpending(struct vcpu *vcpu, uint64_t rip) +{ + struct vm_exit *vmexit; + + vmexit = vm_exitinfo(vcpu); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_BOGUS; + vmm_stat_incr(vcpu, VMEXIT_ASTPENDING, 1); +} + +int +vm_run(struct vcpu *vcpu) +{ + struct vm *vm = vcpu->vm; + struct vm_eventinfo evinfo; + int error, vcpuid; + struct pcb *pcb; + uint64_t tscval; + struct vm_exit *vme; + bool retu, intr_disabled; + pmap_t pmap; + + vcpuid = vcpu->vcpuid; + + if (!CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EINVAL); + + if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) + return (EINVAL); + + pmap = vmspace_pmap(vm_vmspace(vm)); + vme = &vcpu->exitinfo; + evinfo.rptr = &vm->rendezvous_req_cpus; + evinfo.sptr = &vm->suspend; + evinfo.iptr = &vcpu->reqidle; +restart: + critical_enter(); + + KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), + ("vm_run: absurd pm_active")); + + tscval = rdtsc(); + + pcb = PCPU_GET(curpcb); + set_pcb_flags(pcb, PCB_FULL_IRET); + + restore_guest_fpustate(vcpu); + + vcpu_require_state(vcpu, VCPU_RUNNING); + error = vmmops_run(vcpu->cookie, vcpu->nextrip, pmap, &evinfo); + vcpu_require_state(vcpu, VCPU_FROZEN); + + save_guest_fpustate(vcpu); + + vmm_stat_incr(vcpu, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); + + critical_exit(); + + if (error == 0) { + retu = false; + vcpu->nextrip = vme->rip + vme->inst_length; + switch (vme->exitcode) { + case VM_EXITCODE_REQIDLE: + error = vm_handle_reqidle(vcpu, &retu); + break; + case VM_EXITCODE_SUSPENDED: + error = vm_handle_suspend(vcpu, &retu); + break; + case VM_EXITCODE_IOAPIC_EOI: + vioapic_process_eoi(vm, vme->u.ioapic_eoi.vector); + break; + case VM_EXITCODE_RENDEZVOUS: + error = vm_handle_rendezvous(vcpu); + break; + case VM_EXITCODE_HLT: + intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); + error = vm_handle_hlt(vcpu, intr_disabled, &retu); + break; + case VM_EXITCODE_PAGING: + error = vm_handle_paging(vcpu, &retu); + break; + case VM_EXITCODE_INST_EMUL: + error = vm_handle_inst_emul(vcpu, &retu); + break; + case VM_EXITCODE_INOUT: + case VM_EXITCODE_INOUT_STR: + error = vm_handle_inout(vcpu, vme, &retu); + break; + case VM_EXITCODE_DB: + error = vm_handle_db(vcpu, vme, &retu); + break; + case 
VM_EXITCODE_MONITOR: + case VM_EXITCODE_MWAIT: + case VM_EXITCODE_VMINSN: + vm_inject_ud(vcpu); + break; + default: + retu = true; /* handled in userland */ + break; + } + } + + /* + * VM_EXITCODE_INST_EMUL could access the apic which could transform the + * exit code into VM_EXITCODE_IPI. + */ + if (error == 0 && vme->exitcode == VM_EXITCODE_IPI) + error = vm_handle_ipi(vcpu, vme, &retu); + + if (error == 0 && retu == false) + goto restart; + + vmm_stat_incr(vcpu, VMEXIT_USERSPACE, 1); + VMM_CTR2(vcpu, "retu %d/%d", error, vme->exitcode); + + return (error); +} + +int +vm_restart_instruction(struct vcpu *vcpu) +{ + enum vcpu_state state; + uint64_t rip; + int error __diagused; + + state = vcpu_get_state(vcpu, NULL); + if (state == VCPU_RUNNING) { + /* + * When a vcpu is "running" the next instruction is determined + * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. + * Thus setting 'inst_length' to zero will cause the current + * instruction to be restarted. + */ + vcpu->exitinfo.inst_length = 0; + VMM_CTR1(vcpu, "restarting instruction at %#lx by " + "setting inst_length to zero", vcpu->exitinfo.rip); + } else if (state == VCPU_FROZEN) { + /* + * When a vcpu is "frozen" it is outside the critical section + * around vmmops_run() and 'nextrip' points to the next + * instruction. Thus instruction restart is achieved by setting + * 'nextrip' to the vcpu's %rip. + */ + error = vm_get_register(vcpu, VM_REG_GUEST_RIP, &rip); + KASSERT(!error, ("%s: error %d getting rip", __func__, error)); + VMM_CTR2(vcpu, "restarting instruction by updating " + "nextrip from %#lx to %#lx", vcpu->nextrip, rip); + vcpu->nextrip = rip; + } else { + panic("%s: invalid state %d", __func__, state); + } + return (0); +} + +int +vm_exit_intinfo(struct vcpu *vcpu, uint64_t info) +{ + int type, vector; + + if (info & VM_INTINFO_VALID) { + type = info & VM_INTINFO_TYPE; + vector = info & 0xff; + if (type == VM_INTINFO_NMI && vector != IDT_NMI) + return (EINVAL); + if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) + return (EINVAL); + if (info & VM_INTINFO_RSVD) + return (EINVAL); + } else { + info = 0; + } + VMM_CTR2(vcpu, "%s: info1(%#lx)", __func__, info); + vcpu->exitintinfo = info; + return (0); +} + +enum exc_class { + EXC_BENIGN, + EXC_CONTRIBUTORY, + EXC_PAGEFAULT +}; + +#define IDT_VE 20 /* Virtualization Exception (Intel specific) */ + +static enum exc_class +exception_class(uint64_t info) +{ + int type, vector; + + KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info)); + type = info & VM_INTINFO_TYPE; + vector = info & 0xff; + + /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ + switch (type) { + case VM_INTINFO_HWINTR: + case VM_INTINFO_SWINTR: + case VM_INTINFO_NMI: + return (EXC_BENIGN); + default: + /* + * Hardware exception. + * + * SVM and VT-x use identical type values to represent NMI, + * hardware interrupt and software interrupt. + * + * SVM uses type '3' for all exceptions. VT-x uses type '3' + * for exceptions except #BP and #OF. #BP and #OF use a type + * value of '5' or '6'. Therefore we don't check for explicit + * values of 'type' to classify 'intinfo' into a hardware + * exception. 
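+ * Instead, anything that is not an interrupt or NMI is classified by
+ * its vector in the switch below.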
+ */ + break; + } + + switch (vector) { + case IDT_PF: + case IDT_VE: + return (EXC_PAGEFAULT); + case IDT_DE: + case IDT_TS: + case IDT_NP: + case IDT_SS: + case IDT_GP: + return (EXC_CONTRIBUTORY); + default: + return (EXC_BENIGN); + } +} + +static int +nested_fault(struct vcpu *vcpu, uint64_t info1, uint64_t info2, + uint64_t *retinfo) +{ + enum exc_class exc1, exc2; + int type1, vector1; + + KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1)); + KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2)); + + /* + * If an exception occurs while attempting to call the double-fault + * handler the processor enters shutdown mode (aka triple fault). + */ + type1 = info1 & VM_INTINFO_TYPE; + vector1 = info1 & 0xff; + if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { + VMM_CTR2(vcpu, "triple fault: info1(%#lx), info2(%#lx)", + info1, info2); + vm_suspend(vcpu->vm, VM_SUSPEND_TRIPLEFAULT); + *retinfo = 0; + return (0); + } + + /* + * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 + */ + exc1 = exception_class(info1); + exc2 = exception_class(info2); + if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || + (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { + /* Convert nested fault into a double fault. */ + *retinfo = IDT_DF; + *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; + *retinfo |= VM_INTINFO_DEL_ERRCODE; + } else { + /* Handle exceptions serially */ + *retinfo = info2; + } + return (1); +} + +static uint64_t +vcpu_exception_intinfo(struct vcpu *vcpu) +{ + uint64_t info = 0; + + if (vcpu->exception_pending) { + info = vcpu->exc_vector & 0xff; + info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; + if (vcpu->exc_errcode_valid) { + info |= VM_INTINFO_DEL_ERRCODE; + info |= (uint64_t)vcpu->exc_errcode << 32; + } + } + return (info); +} + +int +vm_entry_intinfo(struct vcpu *vcpu, uint64_t *retinfo) +{ + uint64_t info1, info2; + int valid; + + info1 = vcpu->exitintinfo; + vcpu->exitintinfo = 0; + + info2 = 0; + if (vcpu->exception_pending) { + info2 = vcpu_exception_intinfo(vcpu); + vcpu->exception_pending = 0; + VMM_CTR2(vcpu, "Exception %d delivered: %#lx", + vcpu->exc_vector, info2); + } + + if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { + valid = nested_fault(vcpu, info1, info2, retinfo); + } else if (info1 & VM_INTINFO_VALID) { + *retinfo = info1; + valid = 1; + } else if (info2 & VM_INTINFO_VALID) { + *retinfo = info2; + valid = 1; + } else { + valid = 0; + } + + if (valid) { + VMM_CTR4(vcpu, "%s: info1(%#lx), info2(%#lx), " + "retinfo(%#lx)", __func__, info1, info2, *retinfo); + } + + return (valid); +} + +int +vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2) +{ + *info1 = vcpu->exitintinfo; + *info2 = vcpu_exception_intinfo(vcpu); + return (0); +} + +int +vm_inject_exception(struct vcpu *vcpu, int vector, int errcode_valid, + uint32_t errcode, int restart_instruction) +{ + uint64_t regval; + int error __diagused; + + if (vector < 0 || vector >= 32) + return (EINVAL); + + /* + * A double fault exception should never be injected directly into + * the guest. It is a derived exception that results from specific + * combinations of nested faults. + */ + if (vector == IDT_DF) + return (EINVAL); + + if (vcpu->exception_pending) { + VMM_CTR2(vcpu, "Unable to inject exception %d due to " + "pending exception %d", vector, vcpu->exc_vector); + return (EBUSY); + } + + if (errcode_valid) { + /* + * Exceptions don't deliver an error code in real mode. 
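+ * CR0.PE is read below and the error code is dropped if the guest is
+ * in real mode.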
+ */
+ error = vm_get_register(vcpu, VM_REG_GUEST_CR0, &regval);
+ KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
+ if (!(regval & CR0_PE))
+ errcode_valid = 0;
+ }
+
+ /*
+ * From section 26.6.1 "Interruptibility State" in Intel SDM:
+ *
+ * Event blocking by "STI" or "MOV SS" is cleared after guest executes
+ * one instruction or incurs an exception.
+ */
+ error = vm_set_register(vcpu, VM_REG_GUEST_INTR_SHADOW, 0);
+ KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
+ __func__, error));
+
+ if (restart_instruction)
+ vm_restart_instruction(vcpu);
+
+ vcpu->exception_pending = 1;
+ vcpu->exc_vector = vector;
+ vcpu->exc_errcode = errcode;
+ vcpu->exc_errcode_valid = errcode_valid;
+ VMM_CTR1(vcpu, "Exception %d pending", vector);
+ return (0);
+}
+
+void
+vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid, int errcode)
+{
+ int error __diagused, restart_instruction;
+
+ restart_instruction = 1;
+
+ error = vm_inject_exception(vcpu, vector, errcode_valid,
+ errcode, restart_instruction);
+ KASSERT(error == 0, ("vm_inject_exception error %d", error));
+}
+
+void
+vm_inject_pf(struct vcpu *vcpu, int error_code, uint64_t cr2)
+{
+ int error __diagused;
+
+ VMM_CTR2(vcpu, "Injecting page fault: error_code %#x, cr2 %#lx",
+ error_code, cr2);
+
+ error = vm_set_register(vcpu, VM_REG_GUEST_CR2, cr2);
+ KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
+
+ vm_inject_fault(vcpu, IDT_PF, 1, error_code);
+}
+
+static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
+
+int
+vm_inject_nmi(struct vcpu *vcpu)
+{
+
+ vcpu->nmi_pending = 1;
+ vcpu_notify_event(vcpu, false);
+ return (0);
+}
+
+int
+vm_nmi_pending(struct vcpu *vcpu)
+{
+ return (vcpu->nmi_pending);
+}
+
+void
+vm_nmi_clear(struct vcpu *vcpu)
+{
+ if (vcpu->nmi_pending == 0)
+ panic("vm_nmi_clear: inconsistent nmi_pending state");
+
+ vcpu->nmi_pending = 0;
+ vmm_stat_incr(vcpu, VCPU_NMI_COUNT, 1);
+}
+
+static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
+
+int
+vm_inject_extint(struct vcpu *vcpu)
+{
+
+ vcpu->extint_pending = 1;
+ vcpu_notify_event(vcpu, false);
+ return (0);
+}
+
+int
+vm_extint_pending(struct vcpu *vcpu)
+{
+ return (vcpu->extint_pending);
+}
+
+void
+vm_extint_clear(struct vcpu *vcpu)
+{
+ if (vcpu->extint_pending == 0)
+ panic("vm_extint_clear: inconsistent extint_pending state");
+
+ vcpu->extint_pending = 0;
+ vmm_stat_incr(vcpu, VCPU_EXTINT_COUNT, 1);
+}
+
+int
+vm_get_capability(struct vcpu *vcpu, int type, int *retval)
+{
+ if (type < 0 || type >= VM_CAP_MAX)
+ return (EINVAL);
+
+ return (vmmops_getcap(vcpu->cookie, type, retval));
+}
+
+int
+vm_set_capability(struct vcpu *vcpu, int type, int val)
+{
+ if (type < 0 || type >= VM_CAP_MAX)
+ return (EINVAL);
+
+ return (vmmops_setcap(vcpu->cookie, type, val));
+}
+
+struct vm *
+vcpu_vm(struct vcpu *vcpu)
+{
+ return (vcpu->vm);
+}
+
+int
+vcpu_vcpuid(struct vcpu *vcpu)
+{
+ return (vcpu->vcpuid);
+}
+
+struct vcpu *
+vm_vcpu(struct vm *vm, int vcpuid)
+{
+ return (vm->vcpu[vcpuid]);
+}
+
+struct vlapic *
+vm_lapic(struct vcpu *vcpu)
+{
+ return (vcpu->vlapic);
+}
+
+struct vioapic *
+vm_ioapic(struct vm *vm)
+{
+
+ return (vm->vioapic);
+}
+
+struct vhpet *
+vm_hpet(struct vm *vm)
+{
+
+ return (vm->vhpet);
+}
+
+bool
+vmm_is_pptdev(int bus, int slot, int func)
+{
+ int b, f, i, n, s;
+ char *val, *cp, *cp2;
+ bool found;
+
+ /*
+ * XXX
+ * The length of an environment variable is limited to 128 bytes which
+ * puts an upper limit on the number of passthru devices that may
be + * specified using a single environment variable. + * + * Work around this by scanning multiple environment variable + * names instead of a single one - yuck! + */ + const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; + + /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ + found = false; + for (i = 0; names[i] != NULL && !found; i++) { + cp = val = kern_getenv(names[i]); + while (cp != NULL && *cp != '\0') { + if ((cp2 = strchr(cp, ' ')) != NULL) + *cp2 = '\0'; + + n = sscanf(cp, "%d/%d/%d", &b, &s, &f); + if (n == 3 && bus == b && slot == s && func == f) { + found = true; + break; + } + + if (cp2 != NULL) + *cp2++ = ' '; + + cp = cp2; + } + freeenv(val); + } + return (found); +} + +void * +vm_iommu_domain(struct vm *vm) +{ + + return (vm->iommu); +} + +int +vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) +{ + int error; + + vcpu_lock(vcpu); + error = vcpu_set_state_locked(vcpu, newstate, from_idle); + vcpu_unlock(vcpu); + + return (error); +} + +enum vcpu_state +vcpu_get_state(struct vcpu *vcpu, int *hostcpu) +{ + enum vcpu_state state; + + vcpu_lock(vcpu); + state = vcpu->state; + if (hostcpu != NULL) + *hostcpu = vcpu->hostcpu; + vcpu_unlock(vcpu); + + return (state); +} + +int +vm_activate_cpu(struct vcpu *vcpu) +{ + struct vm *vm = vcpu->vm; + + if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) + return (EBUSY); + + VMM_CTR0(vcpu, "activated"); + CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus); + return (0); +} + +int +vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu) +{ + if (vcpu == NULL) { + vm->debug_cpus = vm->active_cpus; + for (int i = 0; i < vm->maxcpus; i++) { + if (CPU_ISSET(i, &vm->active_cpus)) + vcpu_notify_event(vm_vcpu(vm, i), false); + } + } else { + if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) + return (EINVAL); + + CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); + vcpu_notify_event(vcpu, false); + } + return (0); +} + +int +vm_resume_cpu(struct vm *vm, struct vcpu *vcpu) +{ + + if (vcpu == NULL) { + CPU_ZERO(&vm->debug_cpus); + } else { + if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus)) + return (EINVAL); + + CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); + } + return (0); +} + +int +vcpu_debugged(struct vcpu *vcpu) +{ + + return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus)); +} + +cpuset_t +vm_active_cpus(struct vm *vm) +{ + + return (vm->active_cpus); +} + +cpuset_t +vm_debug_cpus(struct vm *vm) +{ + + return (vm->debug_cpus); +} + +cpuset_t +vm_suspended_cpus(struct vm *vm) +{ + + return (vm->suspended_cpus); +} + +/* + * Returns the subset of vCPUs in tostart that are awaiting startup. + * These vCPUs are also marked as no longer awaiting startup. 
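+ * 'startup_cpus' is protected by the rendezvous mutex.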
+ */ +cpuset_t +vm_start_cpus(struct vm *vm, const cpuset_t *tostart) +{ + cpuset_t set; + + mtx_lock(&vm->rendezvous_mtx); + CPU_AND(&set, &vm->startup_cpus, tostart); + CPU_ANDNOT(&vm->startup_cpus, &vm->startup_cpus, &set); + mtx_unlock(&vm->rendezvous_mtx); + return (set); +} + +void +vm_await_start(struct vm *vm, const cpuset_t *waiting) +{ + mtx_lock(&vm->rendezvous_mtx); + CPU_OR(&vm->startup_cpus, &vm->startup_cpus, waiting); + mtx_unlock(&vm->rendezvous_mtx); +} + +void * +vcpu_stats(struct vcpu *vcpu) +{ + + return (vcpu->stats); +} + +int +vm_get_x2apic_state(struct vcpu *vcpu, enum x2apic_state *state) +{ + *state = vcpu->x2apic_state; + + return (0); +} + +int +vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state) +{ + if (state >= X2APIC_STATE_LAST) + return (EINVAL); + + vcpu->x2apic_state = state; + + vlapic_set_x2apic_state(vcpu, state); + + return (0); +} + +/* + * This function is called to ensure that a vcpu "sees" a pending event + * as soon as possible: + * - If the vcpu thread is sleeping then it is woken up. + * - If the vcpu is running on a different host_cpu then an IPI will be directed + * to the host_cpu to cause the vcpu to trap into the hypervisor. + */ +static void +vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr) +{ + int hostcpu; + + hostcpu = vcpu->hostcpu; + if (vcpu->state == VCPU_RUNNING) { + KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); + if (hostcpu != curcpu) { + if (lapic_intr) { + vlapic_post_intr(vcpu->vlapic, hostcpu, + vmm_ipinum); + } else { + ipi_cpu(hostcpu, vmm_ipinum); + } + } else { + /* + * If the 'vcpu' is running on 'curcpu' then it must + * be sending a notification to itself (e.g. SELF_IPI). + * The pending event will be picked up when the vcpu + * transitions back to guest context. + */ + } + } else { + KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " + "with hostcpu %d", vcpu->state, hostcpu)); + if (vcpu->state == VCPU_SLEEPING) + wakeup_one(vcpu); + } +} + +void +vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr) +{ + vcpu_lock(vcpu); + vcpu_notify_event_locked(vcpu, lapic_intr); + vcpu_unlock(vcpu); +} + +struct vm_mem * +vm_mem(struct vm *vm) +{ + return (&vm->mem); +} + +int +vm_apicid2vcpuid(struct vm *vm, int apicid) +{ + /* + * XXX apic id is assumed to be numerically identical to vcpu id + */ + return (apicid); +} + +int +vm_smp_rendezvous(struct vcpu *vcpu, cpuset_t dest, + vm_rendezvous_func_t func, void *arg) +{ + struct vm *vm = vcpu->vm; + int error, i; + + /* + * Enforce that this function is called without any locks + */ + WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous"); + +restart: + mtx_lock(&vm->rendezvous_mtx); + if (vm->rendezvous_func != NULL) { + /* + * If a rendezvous is already in progress then we need to + * call the rendezvous handler in case this 'vcpu' is one + * of the targets of the rendezvous. + */ + VMM_CTR0(vcpu, "Rendezvous already in progress"); + mtx_unlock(&vm->rendezvous_mtx); + error = vm_handle_rendezvous(vcpu); + if (error != 0) + return (error); + goto restart; + } + KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous " + "rendezvous is still in progress")); + + VMM_CTR0(vcpu, "Initiating rendezvous"); + vm->rendezvous_req_cpus = dest; + CPU_ZERO(&vm->rendezvous_done_cpus); + vm->rendezvous_arg = arg; + vm->rendezvous_func = func; + mtx_unlock(&vm->rendezvous_mtx); + + /* + * Wake up any sleeping vcpus and trigger a VM-exit in any running + * vcpus so they handle the rendezvous as soon as possible. 
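+ * (vcpu_notify_event() wakes a sleeping vcpu and IPIs a running one.)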
+ */ + for (i = 0; i < vm->maxcpus; i++) { + if (CPU_ISSET(i, &dest)) + vcpu_notify_event(vm_vcpu(vm, i), false); + } + + return (vm_handle_rendezvous(vcpu)); +} + +struct vatpic * +vm_atpic(struct vm *vm) +{ + return (vm->vatpic); +} + +struct vatpit * +vm_atpit(struct vm *vm) +{ + return (vm->vatpit); +} + +struct vpmtmr * +vm_pmtmr(struct vm *vm) +{ + + return (vm->vpmtmr); +} + +struct vrtc * +vm_rtc(struct vm *vm) +{ + + return (vm->vrtc); +} + +enum vm_reg_name +vm_segment_name(int seg) +{ + static enum vm_reg_name seg_names[] = { + VM_REG_GUEST_ES, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS + }; + + KASSERT(seg >= 0 && seg < nitems(seg_names), + ("%s: invalid segment encoding %d", __func__, seg)); + return (seg_names[seg]); +} + +void +vm_copy_teardown(struct vm_copyinfo *copyinfo, int num_copyinfo) +{ + int idx; + + for (idx = 0; idx < num_copyinfo; idx++) { + if (copyinfo[idx].cookie != NULL) + vm_gpa_release(copyinfo[idx].cookie); + } + bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo)); +} + +int +vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging, + uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, + int num_copyinfo, int *fault) +{ + int error, idx, nused; + size_t n, off, remaining; + void *hva, *cookie; + uint64_t gpa; + + bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo); + + nused = 0; + remaining = len; + while (remaining > 0) { + if (nused >= num_copyinfo) + return (EFAULT); + error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault); + if (error || *fault) + return (error); + off = gpa & PAGE_MASK; + n = min(remaining, PAGE_SIZE - off); + copyinfo[nused].gpa = gpa; + copyinfo[nused].len = n; + remaining -= n; + gla += n; + nused++; + } + + for (idx = 0; idx < nused; idx++) { + hva = vm_gpa_hold(vcpu, copyinfo[idx].gpa, + copyinfo[idx].len, prot, &cookie); + if (hva == NULL) + break; + copyinfo[idx].hva = hva; + copyinfo[idx].cookie = cookie; + } + + if (idx != nused) { + vm_copy_teardown(copyinfo, num_copyinfo); + return (EFAULT); + } else { + *fault = 0; + return (0); + } +} + +void +vm_copyin(struct vm_copyinfo *copyinfo, void *kaddr, size_t len) +{ + char *dst; + int idx; + + dst = kaddr; + idx = 0; + while (len > 0) { + bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); + len -= copyinfo[idx].len; + dst += copyinfo[idx].len; + idx++; + } +} + +void +vm_copyout(const void *kaddr, struct vm_copyinfo *copyinfo, size_t len) +{ + const char *src; + int idx; + + src = kaddr; + idx = 0; + while (len > 0) { + bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); + len -= copyinfo[idx].len; + src += copyinfo[idx].len; + idx++; + } +} + +/* + * Return the amount of in-use and wired memory for the VM. 
Since
+ * these are global stats, only return the values for vCPU 0
+ */
+VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
+VMM_STAT_DECLARE(VMM_MEM_WIRED);
+
+static void
+vm_get_rescnt(struct vcpu *vcpu, struct vmm_stat_type *stat)
+{
+
+ if (vcpu->vcpuid == 0) {
+ vmm_stat_set(vcpu, VMM_MEM_RESIDENT, PAGE_SIZE *
+ vmspace_resident_count(vm_vmspace(vcpu->vm)));
+ }
+}
+
+static void
+vm_get_wiredcnt(struct vcpu *vcpu, struct vmm_stat_type *stat)
+{
+
+ if (vcpu->vcpuid == 0) {
+ vmm_stat_set(vcpu, VMM_MEM_WIRED, PAGE_SIZE *
+ pmap_wired_count(vmspace_pmap(vm_vmspace(vcpu->vm))));
+ }
+}
+
+VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
+VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
+
+#ifdef BHYVE_SNAPSHOT
+static int
+vm_snapshot_vcpus(struct vm *vm, struct vm_snapshot_meta *meta)
+{
+ uint64_t tsc, now;
+ int ret;
+ struct vcpu *vcpu;
+ uint16_t i, maxcpus;
+
+ now = rdtsc();
+ maxcpus = vm_get_maxcpus(vm);
+ for (i = 0; i < maxcpus; i++) {
+ vcpu = vm->vcpu[i];
+ if (vcpu == NULL)
+ continue;
+
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->x2apic_state, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->exitintinfo, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_vector, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode_valid, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->guest_xcr0, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->exitinfo, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(vcpu->nextrip, meta, ret, done);
+
+ /*
+ * Save the absolute TSC value by adding now to tsc_offset.
+ *
+ * It will be turned back into an actual offset when the
+ * TSC restore function is called.
+ */
+ tsc = now + vcpu->tsc_offset;
+ SNAPSHOT_VAR_OR_LEAVE(tsc, meta, ret, done);
+ if (meta->op == VM_SNAPSHOT_RESTORE)
+ vcpu->tsc_offset = tsc;
+ }
+
+done:
+ return (ret);
+}
+
+static int
+vm_snapshot_vm(struct vm *vm, struct vm_snapshot_meta *meta)
+{
+ int ret;
+
+ ret = vm_snapshot_vcpus(vm, meta);
+ if (ret != 0)
+ goto done;
+
+ SNAPSHOT_VAR_OR_LEAVE(vm->startup_cpus, meta, ret, done);
+done:
+ return (ret);
+}
+
+static int
+vm_snapshot_vcpu(struct vm *vm, struct vm_snapshot_meta *meta)
+{
+ int error;
+ struct vcpu *vcpu;
+ uint16_t i, maxcpus;
+
+ error = 0;
+
+ maxcpus = vm_get_maxcpus(vm);
+ for (i = 0; i < maxcpus; i++) {
+ vcpu = vm->vcpu[i];
+ if (vcpu == NULL)
+ continue;
+
+ error = vmmops_vcpu_snapshot(vcpu->cookie, meta);
+ if (error != 0) {
+ printf("%s: failed to snapshot vmcs/vmcb data for "
+ "vCPU: %d; error: %d\n", __func__, i, error);
+ goto done;
+ }
+ }
+
+done:
+ return (error);
+}
+
+/*
+ * Save kernel-side structures to user-space for snapshotting.
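+ * The structure to save or restore is selected by meta->dev_req.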
+ */ +int +vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta) +{ + int ret = 0; + + switch (meta->dev_req) { + case STRUCT_VMCX: + ret = vm_snapshot_vcpu(vm, meta); + break; + case STRUCT_VM: + ret = vm_snapshot_vm(vm, meta); + break; + case STRUCT_VIOAPIC: + ret = vioapic_snapshot(vm_ioapic(vm), meta); + break; + case STRUCT_VLAPIC: + ret = vlapic_snapshot(vm, meta); + break; + case STRUCT_VHPET: + ret = vhpet_snapshot(vm_hpet(vm), meta); + break; + case STRUCT_VATPIC: + ret = vatpic_snapshot(vm_atpic(vm), meta); + break; + case STRUCT_VATPIT: + ret = vatpit_snapshot(vm_atpit(vm), meta); + break; + case STRUCT_VPMTMR: + ret = vpmtmr_snapshot(vm_pmtmr(vm), meta); + break; + case STRUCT_VRTC: + ret = vrtc_snapshot(vm_rtc(vm), meta); + break; + default: + printf("%s: failed to find the requested type %#x\n", + __func__, meta->dev_req); + ret = (EINVAL); + } + return (ret); +} + +void +vm_set_tsc_offset(struct vcpu *vcpu, uint64_t offset) +{ + vcpu->tsc_offset = offset; +} + +int +vm_restore_time(struct vm *vm) +{ + int error; + uint64_t now; + struct vcpu *vcpu; + uint16_t i, maxcpus; + + now = rdtsc(); + + error = vhpet_restore_time(vm_hpet(vm)); + if (error) + return (error); + + maxcpus = vm_get_maxcpus(vm); + for (i = 0; i < maxcpus; i++) { + vcpu = vm->vcpu[i]; + if (vcpu == NULL) + continue; + + error = vmmops_restore_tsc(vcpu->cookie, + vcpu->tsc_offset - now); + if (error) + return (error); + } + + return (0); +} +#endif diff --git a/sys/amd64/vmm/vmm_dev_machdep.c b/sys/amd64/vmm/vmm_dev_machdep.c new file mode 100644 index 000000000000..b84be809ea24 --- /dev/null +++ b/sys/amd64/vmm/vmm_dev_machdep.c @@ -0,0 +1,596 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "opt_bhyve_snapshot.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/conf.h> +#include <sys/libkern.h> +#include <sys/ioccom.h> +#include <sys/mman.h> +#include <sys/uio.h> +#include <sys/proc.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> + +#include <machine/vmparam.h> +#include <machine/vmm.h> +#include <machine/vmm_instruction_emul.h> +#include <machine/vmm_snapshot.h> +#include <x86/apicreg.h> + +#include <dev/vmm/vmm_dev.h> +#include <dev/vmm/vmm_mem.h> +#include <dev/vmm/vmm_stat.h> + +#include "vmm_lapic.h" +#include "vmm_mem.h" +#include "io/ppt.h" +#include "io/vatpic.h" +#include "io/vioapic.h" +#include "io/vhpet.h" +#include "io/vrtc.h" + +#ifdef COMPAT_FREEBSD13 +struct vm_stats_13 { + int cpuid; /* in */ + int num_entries; /* out */ + struct timeval tv; + uint64_t statbuf[MAX_VM_STATS]; +}; + +#define VM_STATS_13 _IOWR('v', IOCNUM_VM_STATS, struct vm_stats_13) + +struct vm_snapshot_meta_13 { + void *ctx; /* unused */ + void *dev_data; + const char *dev_name; /* identify userspace devices */ + enum snapshot_req dev_req; /* identify kernel structs */ + + struct vm_snapshot_buffer buffer; + + enum vm_snapshot_op op; +}; + +#define VM_SNAPSHOT_REQ_13 \ + _IOWR('v', IOCNUM_SNAPSHOT_REQ, struct vm_snapshot_meta_13) + +struct vm_exit_ipi_13 { + uint32_t mode; + uint8_t vector; + __BITSET_DEFINE(, 256) dmask; +}; + +struct vm_exit_13 { + uint32_t exitcode; + int32_t inst_length; + uint64_t rip; + uint64_t u[120 / sizeof(uint64_t)]; +}; + +struct vm_run_13 { + int cpuid; + struct vm_exit_13 vm_exit; +}; + +#define VM_RUN_13 \ + _IOWR('v', IOCNUM_RUN, struct vm_run_13) + +#endif /* COMPAT_FREEBSD13 */ + +const struct vmmdev_ioctl vmmdev_machdep_ioctls[] = { + VMMDEV_IOCTL(VM_RUN, VMMDEV_IOCTL_LOCK_ONE_VCPU), +#ifdef COMPAT_FREEBSD13 + VMMDEV_IOCTL(VM_RUN_13, VMMDEV_IOCTL_LOCK_ONE_VCPU), +#endif + VMMDEV_IOCTL(VM_GET_SEGMENT_DESCRIPTOR, VMMDEV_IOCTL_LOCK_ONE_VCPU), + VMMDEV_IOCTL(VM_SET_SEGMENT_DESCRIPTOR, VMMDEV_IOCTL_LOCK_ONE_VCPU), + VMMDEV_IOCTL(VM_INJECT_EXCEPTION, VMMDEV_IOCTL_LOCK_ONE_VCPU), + VMMDEV_IOCTL(VM_SET_X2APIC_STATE, VMMDEV_IOCTL_LOCK_ONE_VCPU), + VMMDEV_IOCTL(VM_GLA2GPA, VMMDEV_IOCTL_LOCK_ONE_VCPU), + VMMDEV_IOCTL(VM_GLA2GPA_NOFAULT, VMMDEV_IOCTL_LOCK_ONE_VCPU), + VMMDEV_IOCTL(VM_SET_INTINFO, VMMDEV_IOCTL_LOCK_ONE_VCPU), + VMMDEV_IOCTL(VM_GET_INTINFO, VMMDEV_IOCTL_LOCK_ONE_VCPU), + VMMDEV_IOCTL(VM_RESTART_INSTRUCTION, VMMDEV_IOCTL_LOCK_ONE_VCPU), + VMMDEV_IOCTL(VM_GET_KERNEMU_DEV, VMMDEV_IOCTL_LOCK_ONE_VCPU), + VMMDEV_IOCTL(VM_SET_KERNEMU_DEV, VMMDEV_IOCTL_LOCK_ONE_VCPU), + + VMMDEV_IOCTL(VM_BIND_PPTDEV, + VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS | + VMMDEV_IOCTL_PRIV_CHECK_DRIVER), + VMMDEV_IOCTL(VM_UNBIND_PPTDEV, + VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS | + VMMDEV_IOCTL_PRIV_CHECK_DRIVER), + + VMMDEV_IOCTL(VM_MAP_PPTDEV_MMIO, VMMDEV_IOCTL_LOCK_ALL_VCPUS | + VMMDEV_IOCTL_PRIV_CHECK_DRIVER), + VMMDEV_IOCTL(VM_UNMAP_PPTDEV_MMIO, VMMDEV_IOCTL_LOCK_ALL_VCPUS | + VMMDEV_IOCTL_PRIV_CHECK_DRIVER), +#ifdef BHYVE_SNAPSHOT +#ifdef COMPAT_FREEBSD13 + VMMDEV_IOCTL(VM_SNAPSHOT_REQ_13, VMMDEV_IOCTL_LOCK_ALL_VCPUS), +#endif + VMMDEV_IOCTL(VM_SNAPSHOT_REQ, VMMDEV_IOCTL_LOCK_ALL_VCPUS), + VMMDEV_IOCTL(VM_RESTORE_TIME, VMMDEV_IOCTL_LOCK_ALL_VCPUS), +#endif + +#ifdef COMPAT_FREEBSD13 + VMMDEV_IOCTL(VM_STATS_13, VMMDEV_IOCTL_LOCK_ONE_VCPU), +#endif + VMMDEV_IOCTL(VM_INJECT_NMI, VMMDEV_IOCTL_LOCK_ONE_VCPU), + VMMDEV_IOCTL(VM_LAPIC_IRQ, VMMDEV_IOCTL_LOCK_ONE_VCPU), + 
VMMDEV_IOCTL(VM_GET_X2APIC_STATE, VMMDEV_IOCTL_LOCK_ONE_VCPU), + + VMMDEV_IOCTL(VM_LAPIC_LOCAL_IRQ, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU), + + VMMDEV_IOCTL(VM_PPTDEV_MSI, VMMDEV_IOCTL_PRIV_CHECK_DRIVER), + VMMDEV_IOCTL(VM_PPTDEV_MSIX, VMMDEV_IOCTL_PRIV_CHECK_DRIVER), + VMMDEV_IOCTL(VM_PPTDEV_DISABLE_MSIX, VMMDEV_IOCTL_PRIV_CHECK_DRIVER), + VMMDEV_IOCTL(VM_LAPIC_MSI, 0), + VMMDEV_IOCTL(VM_IOAPIC_ASSERT_IRQ, 0), + VMMDEV_IOCTL(VM_IOAPIC_DEASSERT_IRQ, 0), + VMMDEV_IOCTL(VM_IOAPIC_PULSE_IRQ, 0), + VMMDEV_IOCTL(VM_IOAPIC_PINCOUNT, 0), + VMMDEV_IOCTL(VM_ISA_ASSERT_IRQ, 0), + VMMDEV_IOCTL(VM_ISA_DEASSERT_IRQ, 0), + VMMDEV_IOCTL(VM_ISA_PULSE_IRQ, 0), + VMMDEV_IOCTL(VM_ISA_SET_IRQ_TRIGGER, 0), + VMMDEV_IOCTL(VM_GET_GPA_PMAP, 0), + VMMDEV_IOCTL(VM_GET_HPET_CAPABILITIES, 0), + VMMDEV_IOCTL(VM_RTC_READ, 0), + VMMDEV_IOCTL(VM_RTC_WRITE, 0), + VMMDEV_IOCTL(VM_RTC_GETTIME, 0), + VMMDEV_IOCTL(VM_RTC_SETTIME, 0), +}; +const size_t vmmdev_machdep_ioctl_count = nitems(vmmdev_machdep_ioctls); + +int +vmmdev_machdep_ioctl(struct vm *vm, struct vcpu *vcpu, u_long cmd, caddr_t data, + int fflag, struct thread *td) +{ + int error; + + error = 0; + switch (cmd) { + case VM_RUN: { + struct vm_exit *vme; + struct vm_run *vmrun; + + vmrun = (struct vm_run *)data; + vme = vm_exitinfo(vcpu); + + error = vm_run(vcpu); + if (error != 0) + break; + + error = copyout(vme, vmrun->vm_exit, sizeof(*vme)); + if (error != 0) + break; + if (vme->exitcode == VM_EXITCODE_IPI) { + error = copyout(vm_exitinfo_cpuset(vcpu), + vmrun->cpuset, + min(vmrun->cpusetsize, sizeof(cpuset_t))); + if (error != 0) + break; + if (sizeof(cpuset_t) < vmrun->cpusetsize) { + uint8_t *p; + + p = (uint8_t *)vmrun->cpuset + + sizeof(cpuset_t); + while (p < (uint8_t *)vmrun->cpuset + + vmrun->cpusetsize) { + if (subyte(p++, 0) != 0) { + error = EFAULT; + break; + } + } + } + } + break; + } +#ifdef COMPAT_FREEBSD13 + case VM_RUN_13: { + struct vm_exit *vme; + struct vm_exit_13 *vme_13; + struct vm_run_13 *vmrun_13; + + vmrun_13 = (struct vm_run_13 *)data; + vme_13 = &vmrun_13->vm_exit; + vme = vm_exitinfo(vcpu); + + error = vm_run(vcpu); + if (error == 0) { + vme_13->exitcode = vme->exitcode; + vme_13->inst_length = vme->inst_length; + vme_13->rip = vme->rip; + memcpy(vme_13->u, &vme->u, sizeof(vme_13->u)); + if (vme->exitcode == VM_EXITCODE_IPI) { + struct vm_exit_ipi_13 *ipi; + cpuset_t *dmask; + int cpu; + + dmask = vm_exitinfo_cpuset(vcpu); + ipi = (struct vm_exit_ipi_13 *)&vme_13->u[0]; + BIT_ZERO(256, &ipi->dmask); + CPU_FOREACH_ISSET(cpu, dmask) { + if (cpu >= 256) + break; + BIT_SET(256, cpu, &ipi->dmask); + } + } + } + break; + } + case VM_STATS_13: { + struct vm_stats_13 *vmstats_13; + + vmstats_13 = (struct vm_stats_13 *)data; + getmicrotime(&vmstats_13->tv); + error = vmm_stat_copy(vcpu, 0, nitems(vmstats_13->statbuf), + &vmstats_13->num_entries, vmstats_13->statbuf); + break; + } +#endif + case VM_PPTDEV_MSI: { + struct vm_pptdev_msi *pptmsi; + + pptmsi = (struct vm_pptdev_msi *)data; + error = ppt_setup_msi(vm, pptmsi->bus, pptmsi->slot, + pptmsi->func, pptmsi->addr, pptmsi->msg, pptmsi->numvec); + break; + } + case VM_PPTDEV_MSIX: { + struct vm_pptdev_msix *pptmsix; + + pptmsix = (struct vm_pptdev_msix *)data; + error = ppt_setup_msix(vm, pptmsix->bus, pptmsix->slot, + pptmsix->func, pptmsix->idx, pptmsix->addr, pptmsix->msg, + pptmsix->vector_control); + break; + } + case VM_PPTDEV_DISABLE_MSIX: { + struct vm_pptdev *pptdev; + + pptdev = (struct vm_pptdev *)data; + error = ppt_disable_msix(vm, pptdev->bus, pptdev->slot, + pptdev->func); + break; 
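A minimal sketch of a userspace caller for the VM_RUN path handled earlier in this function: the kernel copies the exit record back through vmrun->vm_exit and, for VM_EXITCODE_IPI exits, also copies out the destination cpuset (zero-filling any tail when the user buffer is larger than the kernel's cpuset_t). The sketch assumes the struct vm_run layout used above (vm_exit, cpuset, cpusetsize, plus a leading cpuid field selecting the vCPU) and the usual <machine/vmm_dev.h> ioctl definitions; treat the exact names as illustrative rather than definitive.

#include <sys/ioctl.h>
#include <sys/cpuset.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>

/* Illustrative sketch only: run one vCPU and collect its exit information. */
static int
run_vcpu(int vmfd, int vcpuid, struct vm_exit *vme)
{
	cpuset_t dmask;
	struct vm_run vmrun = {
		.cpuid = vcpuid,		/* assumed field, see note above */
		.cpuset = &dmask,		/* filled in on VM_EXITCODE_IPI exits */
		.cpusetsize = sizeof(dmask),
		.vm_exit = vme,			/* exit record is copied out here */
	};

	return (ioctl(vmfd, VM_RUN, &vmrun));
}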
+ } + case VM_MAP_PPTDEV_MMIO: { + struct vm_pptdev_mmio *pptmmio; + + pptmmio = (struct vm_pptdev_mmio *)data; + error = ppt_map_mmio(vm, pptmmio->bus, pptmmio->slot, + pptmmio->func, pptmmio->gpa, pptmmio->len, pptmmio->hpa); + break; + } + case VM_UNMAP_PPTDEV_MMIO: { + struct vm_pptdev_mmio *pptmmio; + + pptmmio = (struct vm_pptdev_mmio *)data; + error = ppt_unmap_mmio(vm, pptmmio->bus, pptmmio->slot, + pptmmio->func, pptmmio->gpa, pptmmio->len); + break; + } + case VM_BIND_PPTDEV: { + struct vm_pptdev *pptdev; + + pptdev = (struct vm_pptdev *)data; + error = vm_assign_pptdev(vm, pptdev->bus, pptdev->slot, + pptdev->func); + break; + } + case VM_UNBIND_PPTDEV: { + struct vm_pptdev *pptdev; + + pptdev = (struct vm_pptdev *)data; + error = vm_unassign_pptdev(vm, pptdev->bus, pptdev->slot, + pptdev->func); + break; + } + case VM_INJECT_EXCEPTION: { + struct vm_exception *vmexc; + + vmexc = (struct vm_exception *)data; + error = vm_inject_exception(vcpu, + vmexc->vector, vmexc->error_code_valid, vmexc->error_code, + vmexc->restart_instruction); + break; + } + case VM_INJECT_NMI: + error = vm_inject_nmi(vcpu); + break; + case VM_LAPIC_IRQ: { + struct vm_lapic_irq *vmirq; + + vmirq = (struct vm_lapic_irq *)data; + error = lapic_intr_edge(vcpu, vmirq->vector); + break; + } + case VM_LAPIC_LOCAL_IRQ: { + struct vm_lapic_irq *vmirq; + + vmirq = (struct vm_lapic_irq *)data; + error = lapic_set_local_intr(vm, vcpu, vmirq->vector); + break; + } + case VM_LAPIC_MSI: { + struct vm_lapic_msi *vmmsi; + + vmmsi = (struct vm_lapic_msi *)data; + error = lapic_intr_msi(vm, vmmsi->addr, vmmsi->msg); + break; + } + case VM_IOAPIC_ASSERT_IRQ: { + struct vm_ioapic_irq *ioapic_irq; + + ioapic_irq = (struct vm_ioapic_irq *)data; + error = vioapic_assert_irq(vm, ioapic_irq->irq); + break; + } + case VM_IOAPIC_DEASSERT_IRQ: { + struct vm_ioapic_irq *ioapic_irq; + + ioapic_irq = (struct vm_ioapic_irq *)data; + error = vioapic_deassert_irq(vm, ioapic_irq->irq); + break; + } + case VM_IOAPIC_PULSE_IRQ: { + struct vm_ioapic_irq *ioapic_irq; + + ioapic_irq = (struct vm_ioapic_irq *)data; + error = vioapic_pulse_irq(vm, ioapic_irq->irq); + break; + } + case VM_IOAPIC_PINCOUNT: + *(int *)data = vioapic_pincount(vm); + break; + case VM_SET_KERNEMU_DEV: + case VM_GET_KERNEMU_DEV: { + struct vm_readwrite_kernemu_device *kernemu; + mem_region_write_t mwrite; + mem_region_read_t mread; + int size; + bool arg; + + kernemu = (void *)data; + + if (kernemu->access_width > 0) + size = (1u << kernemu->access_width); + else + size = 1; + + if (kernemu->gpa >= DEFAULT_APIC_BASE && + kernemu->gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { + mread = lapic_mmio_read; + mwrite = lapic_mmio_write; + } else if (kernemu->gpa >= VIOAPIC_BASE && + kernemu->gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { + mread = vioapic_mmio_read; + mwrite = vioapic_mmio_write; + } else if (kernemu->gpa >= VHPET_BASE && + kernemu->gpa < VHPET_BASE + VHPET_SIZE) { + mread = vhpet_mmio_read; + mwrite = vhpet_mmio_write; + } else { + error = EINVAL; + break; + } + + if (cmd == VM_SET_KERNEMU_DEV) + error = mwrite(vcpu, kernemu->gpa, + kernemu->value, size, &arg); + else + error = mread(vcpu, kernemu->gpa, + &kernemu->value, size, &arg); + break; + } + case VM_ISA_ASSERT_IRQ: { + struct vm_isa_irq *isa_irq; + + isa_irq = (struct vm_isa_irq *)data; + error = vatpic_assert_irq(vm, isa_irq->atpic_irq); + if (error == 0 && isa_irq->ioapic_irq != -1) + error = vioapic_assert_irq(vm, isa_irq->ioapic_irq); + break; + } + case VM_ISA_DEASSERT_IRQ: { + struct vm_isa_irq *isa_irq; + + 
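The VM_ISA_*_IRQ handlers around this point drive a legacy interrupt on the virtual 8259 and, when ioapic_irq is not -1, mirror it on the matching I/O APIC pin. A minimal userspace sketch, assuming the struct vm_isa_irq layout used here and the usual <machine/vmm_dev.h> ioctl definitions:

#include <sys/ioctl.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>

/* Illustrative sketch only: assert legacy IRQ 4 (COM1) on both controllers. */
static int
assert_com1_intr(int vmfd)
{
	struct vm_isa_irq isa_irq = {
		.atpic_irq = 4,
		.ioapic_irq = 4,	/* pass -1 to leave the I/O APIC alone */
	};

	return (ioctl(vmfd, VM_ISA_ASSERT_IRQ, &isa_irq));
}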
isa_irq = (struct vm_isa_irq *)data; + error = vatpic_deassert_irq(vm, isa_irq->atpic_irq); + if (error == 0 && isa_irq->ioapic_irq != -1) + error = vioapic_deassert_irq(vm, isa_irq->ioapic_irq); + break; + } + case VM_ISA_PULSE_IRQ: { + struct vm_isa_irq *isa_irq; + + isa_irq = (struct vm_isa_irq *)data; + error = vatpic_pulse_irq(vm, isa_irq->atpic_irq); + if (error == 0 && isa_irq->ioapic_irq != -1) + error = vioapic_pulse_irq(vm, isa_irq->ioapic_irq); + break; + } + case VM_ISA_SET_IRQ_TRIGGER: { + struct vm_isa_irq_trigger *isa_irq_trigger; + + isa_irq_trigger = (struct vm_isa_irq_trigger *)data; + error = vatpic_set_irq_trigger(vm, + isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger); + break; + } + case VM_SET_SEGMENT_DESCRIPTOR: { + struct vm_seg_desc *vmsegdesc; + + vmsegdesc = (struct vm_seg_desc *)data; + error = vm_set_seg_desc(vcpu, vmsegdesc->regnum, + &vmsegdesc->desc); + break; + } + case VM_GET_SEGMENT_DESCRIPTOR: { + struct vm_seg_desc *vmsegdesc; + + vmsegdesc = (struct vm_seg_desc *)data; + error = vm_get_seg_desc(vcpu, vmsegdesc->regnum, + &vmsegdesc->desc); + break; + } + case VM_SET_X2APIC_STATE: { + struct vm_x2apic *x2apic; + + x2apic = (struct vm_x2apic *)data; + error = vm_set_x2apic_state(vcpu, x2apic->state); + break; + } + case VM_GET_X2APIC_STATE: { + struct vm_x2apic *x2apic; + + x2apic = (struct vm_x2apic *)data; + error = vm_get_x2apic_state(vcpu, &x2apic->state); + break; + } + case VM_GET_GPA_PMAP: { + struct vm_gpa_pte *gpapte; + + gpapte = (struct vm_gpa_pte *)data; + pmap_get_mapping(vmspace_pmap(vm_vmspace(vm)), gpapte->gpa, + gpapte->pte, &gpapte->ptenum); + break; + } + case VM_GET_HPET_CAPABILITIES: + error = vhpet_getcap((struct vm_hpet_cap *)data); + break; + case VM_GLA2GPA: { + struct vm_gla2gpa *gg; + + CTASSERT(PROT_READ == VM_PROT_READ); + CTASSERT(PROT_WRITE == VM_PROT_WRITE); + CTASSERT(PROT_EXEC == VM_PROT_EXECUTE); + gg = (struct vm_gla2gpa *)data; + error = vm_gla2gpa(vcpu, &gg->paging, gg->gla, + gg->prot, &gg->gpa, &gg->fault); + KASSERT(error == 0 || error == EFAULT, + ("%s: vm_gla2gpa unknown error %d", __func__, error)); + break; + } + case VM_GLA2GPA_NOFAULT: { + struct vm_gla2gpa *gg; + + gg = (struct vm_gla2gpa *)data; + error = vm_gla2gpa_nofault(vcpu, &gg->paging, gg->gla, + gg->prot, &gg->gpa, &gg->fault); + KASSERT(error == 0 || error == EFAULT, + ("%s: vm_gla2gpa unknown error %d", __func__, error)); + break; + } + case VM_SET_INTINFO: { + struct vm_intinfo *vmii; + + vmii = (struct vm_intinfo *)data; + error = vm_exit_intinfo(vcpu, vmii->info1); + break; + } + case VM_GET_INTINFO: { + struct vm_intinfo *vmii; + + vmii = (struct vm_intinfo *)data; + error = vm_get_intinfo(vcpu, &vmii->info1, &vmii->info2); + break; + } + case VM_RTC_WRITE: { + struct vm_rtc_data *rtcdata; + + rtcdata = (struct vm_rtc_data *)data; + error = vrtc_nvram_write(vm, rtcdata->offset, + rtcdata->value); + break; + } + case VM_RTC_READ: { + struct vm_rtc_data *rtcdata; + + rtcdata = (struct vm_rtc_data *)data; + error = vrtc_nvram_read(vm, rtcdata->offset, + &rtcdata->value); + break; + } + case VM_RTC_SETTIME: { + struct vm_rtc_time *rtctime; + + rtctime = (struct vm_rtc_time *)data; + error = vrtc_set_time(vm, rtctime->secs); + break; + } + case VM_RTC_GETTIME: { + struct vm_rtc_time *rtctime; + + rtctime = (struct vm_rtc_time *)data; + rtctime->secs = vrtc_get_time(vm); + break; + } + case VM_RESTART_INSTRUCTION: + error = vm_restart_instruction(vcpu); + break; +#ifdef BHYVE_SNAPSHOT + case VM_SNAPSHOT_REQ: { + struct vm_snapshot_meta 
*snapshot_meta; + + snapshot_meta = (struct vm_snapshot_meta *)data; + error = vm_snapshot_req(vm, snapshot_meta); + break; + } +#ifdef COMPAT_FREEBSD13 + case VM_SNAPSHOT_REQ_13: { + struct vm_snapshot_meta *snapshot_meta; + struct vm_snapshot_meta_13 *snapshot_13; + + /* + * The old structure just has an additional pointer at + * the start that is ignored. + */ + snapshot_13 = (struct vm_snapshot_meta_13 *)data; + snapshot_meta = + (struct vm_snapshot_meta *)&snapshot_13->dev_data; + error = vm_snapshot_req(vm, snapshot_meta); + break; + } +#endif + case VM_RESTORE_TIME: + error = vm_restore_time(vm); + break; +#endif + default: + error = ENOTTY; + break; + } + + return (error); +} diff --git a/sys/amd64/vmm/vmm_host.c b/sys/amd64/vmm/vmm_host.c new file mode 100644 index 000000000000..78811b488fdb --- /dev/null +++ b/sys/amd64/vmm/vmm_host.c @@ -0,0 +1,167 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/pcpu.h> + +#include <machine/cpufunc.h> +#include <machine/segments.h> +#include <machine/specialreg.h> + +#include "vmm_host.h" + +static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4, + vmm_host_xcr0; +static struct xsave_limits vmm_xsave_limits; + +void +vmm_host_state_init(void) +{ + int regs[4]; + + vmm_host_efer = rdmsr(MSR_EFER); + vmm_host_pat = rdmsr(MSR_PAT); + + /* + * We always want CR0.TS to be set when the processor does a VM exit. + * + * With emulation turned on unconditionally after a VM exit, we are + * able to trap inadvertent use of the FPU until the guest FPU state + * has been safely squirreled away. + */ + vmm_host_cr0 = rcr0() | CR0_TS; + + /* + * On non-PCID or PCID but without INVPCID support machines, + * we flush kernel i.e. global TLB entries, by temporary + * clearing the CR4.PGE bit, see invltlb_glob(). If + * preemption occurs at the wrong time, cached vmm_host_cr4 + * might store the value with CR4.PGE cleared. Since FreeBSD + * requires support for PG_G on amd64, just set it + * unconditionally. + */ + vmm_host_cr4 = rcr4() | CR4_PGE; + + /* + * Only permit a guest to use XSAVE if the host is using + * XSAVE. 
Only permit a guest to use XSAVE features supported + * by the host. This ensures that the FPU state used by the + * guest is always a subset of the saved guest FPU state. + * + * In addition, only permit known XSAVE features where the + * rules for which features depend on other features is known + * to properly emulate xsetbv. + */ + if (vmm_host_cr4 & CR4_XSAVE) { + vmm_xsave_limits.xsave_enabled = 1; + vmm_host_xcr0 = rxcr(0); + vmm_xsave_limits.xcr0_allowed = vmm_host_xcr0 & + (XFEATURE_AVX | XFEATURE_MPX | XFEATURE_AVX512); + + cpuid_count(0xd, 0x0, regs); + vmm_xsave_limits.xsave_max_size = regs[1]; + } +} + +uint64_t +vmm_get_host_pat(void) +{ + + return (vmm_host_pat); +} + +uint64_t +vmm_get_host_efer(void) +{ + + return (vmm_host_efer); +} + +uint64_t +vmm_get_host_cr0(void) +{ + + return (vmm_host_cr0); +} + +uint64_t +vmm_get_host_cr4(void) +{ + + return (vmm_host_cr4); +} + +uint64_t +vmm_get_host_xcr0(void) +{ + + return (vmm_host_xcr0); +} + +uint64_t +vmm_get_host_datasel(void) +{ + + return (GSEL(GDATA_SEL, SEL_KPL)); + +} + +uint64_t +vmm_get_host_codesel(void) +{ + + return (GSEL(GCODE_SEL, SEL_KPL)); +} + +uint64_t +vmm_get_host_tsssel(void) +{ + + return (GSEL(GPROC0_SEL, SEL_KPL)); +} + +uint64_t +vmm_get_host_fsbase(void) +{ + + return (0); +} + +uint64_t +vmm_get_host_idtrbase(void) +{ + + return (r_idt.rd_base); +} + +const struct xsave_limits * +vmm_get_xsave_limits(void) +{ + + return (&vmm_xsave_limits); +} diff --git a/sys/amd64/vmm/vmm_host.h b/sys/amd64/vmm/vmm_host.h new file mode 100644 index 000000000000..eebb794843b6 --- /dev/null +++ b/sys/amd64/vmm/vmm_host.h @@ -0,0 +1,80 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _VMM_HOST_H_ +#define _VMM_HOST_H_ + +#ifndef _KERNEL +#error "no user-serviceable parts inside" +#endif + +struct xsave_limits { + int xsave_enabled; + uint64_t xcr0_allowed; + uint32_t xsave_max_size; +}; + +void vmm_host_state_init(void); + +uint64_t vmm_get_host_pat(void); +uint64_t vmm_get_host_efer(void); +uint64_t vmm_get_host_cr0(void); +uint64_t vmm_get_host_cr4(void); +uint64_t vmm_get_host_xcr0(void); +uint64_t vmm_get_host_datasel(void); +uint64_t vmm_get_host_codesel(void); +uint64_t vmm_get_host_tsssel(void); +uint64_t vmm_get_host_fsbase(void); +uint64_t vmm_get_host_idtrbase(void); +const struct xsave_limits *vmm_get_xsave_limits(void); + +/* + * Inline access to host state that is used on every VM entry + */ +static __inline uint64_t +vmm_get_host_trbase(void) +{ + + return ((uint64_t)PCPU_GET(tssp)); +} + +static __inline uint64_t +vmm_get_host_gdtrbase(void) +{ + + return ((uint64_t)*PCPU_PTR(gdt)); +} + +static __inline uint64_t +vmm_get_host_gsbase(void) +{ + + return ((uint64_t)get_pcpu()); +} + +#endif diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c new file mode 100644 index 000000000000..c54b6e6d0074 --- /dev/null +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -0,0 +1,2940 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2012 Sandvine, Inc. + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +#ifdef _KERNEL +#include <sys/param.h> +#include <sys/pcpu.h> +#include <sys/systm.h> +#include <sys/proc.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/vmparam.h> +#include <machine/vmm.h> + +#include <dev/vmm/vmm_mem.h> +#else /* !_KERNEL */ +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/_iovec.h> + +#include <machine/vmm.h> + +#include <err.h> +#include <assert.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdio.h> +#include <string.h> +#include <strings.h> +#include <vmmapi.h> +#define __diagused +#define KASSERT(exp,msg) assert((exp)) +#define panic(...) 
errx(4, __VA_ARGS__) +#endif /* _KERNEL */ + +#include <machine/vmm_instruction_emul.h> +#include <x86/psl.h> +#include <x86/specialreg.h> + +/* struct vie_op.op_flags */ +#define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */ +#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ +#define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */ +#define VIE_OP_F_NO_MODRM (1 << 3) +#define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4) + +static const struct vie_op three_byte_opcodes_0f38[256] = { + [0xF7] = { + .op_byte = 0xF7, + .op_type = VIE_OP_TYPE_BEXTR, + }, +}; + +static const struct vie_op two_byte_opcodes[256] = { + [0xAE] = { + .op_byte = 0xAE, + .op_type = VIE_OP_TYPE_TWOB_GRP15, + }, + [0xB6] = { + .op_byte = 0xB6, + .op_type = VIE_OP_TYPE_MOVZX, + }, + [0xB7] = { + .op_byte = 0xB7, + .op_type = VIE_OP_TYPE_MOVZX, + }, + [0xBA] = { + .op_byte = 0xBA, + .op_type = VIE_OP_TYPE_BITTEST, + .op_flags = VIE_OP_F_IMM8, + }, + [0xBE] = { + .op_byte = 0xBE, + .op_type = VIE_OP_TYPE_MOVSX, + }, +}; + +static const struct vie_op one_byte_opcodes[256] = { + [0x03] = { + .op_byte = 0x03, + .op_type = VIE_OP_TYPE_ADD, + }, + [0x0F] = { + .op_byte = 0x0F, + .op_type = VIE_OP_TYPE_TWO_BYTE + }, + [0x0B] = { + .op_byte = 0x0B, + .op_type = VIE_OP_TYPE_OR, + }, + [0x2B] = { + .op_byte = 0x2B, + .op_type = VIE_OP_TYPE_SUB, + }, + [0x39] = { + .op_byte = 0x39, + .op_type = VIE_OP_TYPE_CMP, + }, + [0x3B] = { + .op_byte = 0x3B, + .op_type = VIE_OP_TYPE_CMP, + }, + [0x6E] = { + .op_byte = 0x6E, + .op_type = VIE_OP_TYPE_OUTS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION, + }, + [0x6F] = { + .op_byte = 0x6F, + .op_type = VIE_OP_TYPE_OUTS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION, + }, + [0x88] = { + .op_byte = 0x88, + .op_type = VIE_OP_TYPE_MOV, + }, + [0x89] = { + .op_byte = 0x89, + .op_type = VIE_OP_TYPE_MOV, + }, + [0x8A] = { + .op_byte = 0x8A, + .op_type = VIE_OP_TYPE_MOV, + }, + [0x8B] = { + .op_byte = 0x8B, + .op_type = VIE_OP_TYPE_MOV, + }, + [0xA1] = { + .op_byte = 0xA1, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, + }, + [0xA3] = { + .op_byte = 0xA3, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, + }, + [0xA4] = { + .op_byte = 0xA4, + .op_type = VIE_OP_TYPE_MOVS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xA5] = { + .op_byte = 0xA5, + .op_type = VIE_OP_TYPE_MOVS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xAA] = { + .op_byte = 0xAA, + .op_type = VIE_OP_TYPE_STOS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xAB] = { + .op_byte = 0xAB, + .op_type = VIE_OP_TYPE_STOS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xC6] = { + /* XXX Group 11 extended opcode - not just MOV */ + .op_byte = 0xC6, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_IMM8, + }, + [0xC7] = { + .op_byte = 0xC7, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_IMM, + }, + [0x23] = { + .op_byte = 0x23, + .op_type = VIE_OP_TYPE_AND, + }, + [0x80] = { + /* Group 1 extended opcode */ + .op_byte = 0x80, + .op_type = VIE_OP_TYPE_GROUP1, + .op_flags = VIE_OP_F_IMM8, + }, + [0x81] = { + /* Group 1 extended opcode */ + .op_byte = 0x81, + .op_type = VIE_OP_TYPE_GROUP1, + .op_flags = VIE_OP_F_IMM, + }, + [0x83] = { + /* Group 1 extended opcode */ + .op_byte = 0x83, + .op_type = VIE_OP_TYPE_GROUP1, + .op_flags = VIE_OP_F_IMM8, + }, + [0x8F] = { + /* XXX Group 1A extended opcode - not just POP */ + 
.op_byte = 0x8F, + .op_type = VIE_OP_TYPE_POP, + }, + [0xF6] = { + /* XXX Group 3 extended opcode - not just TEST */ + .op_byte = 0xF6, + .op_type = VIE_OP_TYPE_TEST, + .op_flags = VIE_OP_F_IMM8, + }, + [0xF7] = { + /* XXX Group 3 extended opcode - not just TEST */ + .op_byte = 0xF7, + .op_type = VIE_OP_TYPE_TEST, + .op_flags = VIE_OP_F_IMM, + }, + [0xFF] = { + /* XXX Group 5 extended opcode - not just PUSH */ + .op_byte = 0xFF, + .op_type = VIE_OP_TYPE_PUSH, + } +}; + +/* struct vie.mod */ +#define VIE_MOD_INDIRECT 0 +#define VIE_MOD_INDIRECT_DISP8 1 +#define VIE_MOD_INDIRECT_DISP32 2 +#define VIE_MOD_DIRECT 3 + +/* struct vie.rm */ +#define VIE_RM_SIB 4 +#define VIE_RM_DISP32 5 + +#define GB (1024 * 1024 * 1024) + +static enum vm_reg_name gpr_map[16] = { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RSP, + VM_REG_GUEST_RBP, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15 +}; + +static uint64_t size2mask[] = { + [1] = 0xff, + [2] = 0xffff, + [4] = 0xffffffff, + [8] = 0xffffffffffffffff, +}; + +static int +vie_read_register(struct vcpu *vcpu, enum vm_reg_name reg, uint64_t *rval) +{ + int error; + + error = vm_get_register(vcpu, reg, rval); + + return (error); +} + +static void +vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr) +{ + *lhbr = 0; + *reg = gpr_map[vie->reg]; + + /* + * 64-bit mode imposes limitations on accessing legacy high byte + * registers (lhbr). + * + * The legacy high-byte registers cannot be addressed if the REX + * prefix is present. In this case the values 4, 5, 6 and 7 of the + * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively. + * + * If the REX prefix is not present then the values 4, 5, 6 and 7 + * of the 'ModRM:reg' field address the legacy high-byte registers, + * %ah, %ch, %dh and %bh respectively. + */ + if (!vie->rex_present) { + if (vie->reg & 0x4) { + *lhbr = 1; + *reg = gpr_map[vie->reg & 0x3]; + } + } +} + +static int +vie_read_bytereg(struct vcpu *vcpu, struct vie *vie, uint8_t *rval) +{ + uint64_t val; + int error, lhbr; + enum vm_reg_name reg; + + vie_calc_bytereg(vie, ®, &lhbr); + error = vm_get_register(vcpu, reg, &val); + + /* + * To obtain the value of a legacy high byte register shift the + * base register right by 8 bits (%ah = %rax >> 8). + */ + if (lhbr) + *rval = val >> 8; + else + *rval = val; + return (error); +} + +static int +vie_write_bytereg(struct vcpu *vcpu, struct vie *vie, uint8_t byte) +{ + uint64_t origval, val, mask; + int error, lhbr; + enum vm_reg_name reg; + + vie_calc_bytereg(vie, ®, &lhbr); + error = vm_get_register(vcpu, reg, &origval); + if (error == 0) { + val = byte; + mask = 0xff; + if (lhbr) { + /* + * Shift left by 8 to store 'byte' in a legacy high + * byte register. 
+ */ + val <<= 8; + mask <<= 8; + } + val |= origval & ~mask; + error = vm_set_register(vcpu, reg, val); + } + return (error); +} + +int +vie_update_register(struct vcpu *vcpu, enum vm_reg_name reg, + uint64_t val, int size) +{ + int error; + uint64_t origval; + + switch (size) { + case 1: + case 2: + error = vie_read_register(vcpu, reg, &origval); + if (error) + return (error); + val &= size2mask[size]; + val |= origval & ~size2mask[size]; + break; + case 4: + val &= 0xffffffffUL; + break; + case 8: + break; + default: + return (EINVAL); + } + + error = vm_set_register(vcpu, reg, val); + return (error); +} + +#define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V) + +/* + * Return the status flags that would result from doing (x - y). + */ +#define GETCC(sz) \ +static u_long \ +getcc##sz(uint##sz##_t x, uint##sz##_t y) \ +{ \ + u_long rflags; \ + \ + __asm __volatile("sub %2,%1; pushfq; popq %0" : \ + "=r" (rflags), "+r" (x) : "m" (y)); \ + return (rflags); \ +} struct __hack + +GETCC(8); +GETCC(16); +GETCC(32); +GETCC(64); + +static u_long +getcc(int opsize, uint64_t x, uint64_t y) +{ + KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, + ("getcc: invalid operand size %d", opsize)); + + if (opsize == 1) + return (getcc8(x, y)); + else if (opsize == 2) + return (getcc16(x, y)); + else if (opsize == 4) + return (getcc32(x, y)); + else + return (getcc64(x, y)); +} + +/* + * Macro creation of functions getaddflags{8,16,32,64} + */ +#define GETADDFLAGS(sz) \ +static u_long \ +getaddflags##sz(uint##sz##_t x, uint##sz##_t y) \ +{ \ + u_long rflags; \ + \ + __asm __volatile("add %2,%1; pushfq; popq %0" : \ + "=r" (rflags), "+r" (x) : "m" (y)); \ + return (rflags); \ +} struct __hack + +GETADDFLAGS(8); +GETADDFLAGS(16); +GETADDFLAGS(32); +GETADDFLAGS(64); + +static u_long +getaddflags(int opsize, uint64_t x, uint64_t y) +{ + KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, + ("getaddflags: invalid operand size %d", opsize)); + + if (opsize == 1) + return (getaddflags8(x, y)); + else if (opsize == 2) + return (getaddflags16(x, y)); + else if (opsize == 4) + return (getaddflags32(x, y)); + else + return (getaddflags64(x, y)); +} + +/* + * Return the status flags that would result from doing (x & y). 
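 *
 * An illustrative worked example: getandflags(1, 0xf0, 0x0f) computes
 * 0xf0 & 0x0f == 0, so the returned RFLAGS image has PSL_Z set and
 * PSL_C/PSL_V clear; emulate_test() copies PSL_PF/PSL_Z/PSL_N out of
 * that value and leaves CF/OF cleared in the guest's %rflags.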
+ */ +#define GETANDFLAGS(sz) \ +static u_long \ +getandflags##sz(uint##sz##_t x, uint##sz##_t y) \ +{ \ + u_long rflags; \ + \ + __asm __volatile("and %2,%1; pushfq; popq %0" : \ + "=r" (rflags), "+r" (x) : "m" (y)); \ + return (rflags); \ +} struct __hack + +GETANDFLAGS(8); +GETANDFLAGS(16); +GETANDFLAGS(32); +GETANDFLAGS(64); + +static u_long +getandflags(int opsize, uint64_t x, uint64_t y) +{ + KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, + ("getandflags: invalid operand size %d", opsize)); + + if (opsize == 1) + return (getandflags8(x, y)); + else if (opsize == 2) + return (getandflags16(x, y)); + else if (opsize == 4) + return (getandflags32(x, y)); + else + return (getandflags64(x, y)); +} + +static int +emulate_mov(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint8_t byte; + uint64_t val; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x88: + /* + * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m) + * 88/r: mov r/m8, r8 + * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available) + */ + size = 1; /* override for byte operation */ + error = vie_read_bytereg(vcpu, vie, &byte); + if (error == 0) + error = memwrite(vcpu, gpa, byte, size, arg); + break; + case 0x89: + /* + * MOV from reg (ModRM:reg) to mem (ModRM:r/m) + * 89/r: mov r/m16, r16 + * 89/r: mov r/m32, r32 + * REX.W + 89/r mov r/m64, r64 + */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vcpu, reg, &val); + if (error == 0) { + val &= size2mask[size]; + error = memwrite(vcpu, gpa, val, size, arg); + } + break; + case 0x8A: + /* + * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg) + * 8A/r: mov r8, r/m8 + * REX + 8A/r: mov r8, r/m8 + */ + size = 1; /* override for byte operation */ + error = memread(vcpu, gpa, &val, size, arg); + if (error == 0) + error = vie_write_bytereg(vcpu, vie, val); + break; + case 0x8B: + /* + * MOV from mem (ModRM:r/m) to reg (ModRM:reg) + * 8B/r: mov r16, r/m16 + * 8B/r: mov r32, r/m32 + * REX.W 8B/r: mov r64, r/m64 + */ + error = memread(vcpu, gpa, &val, size, arg); + if (error == 0) { + reg = gpr_map[vie->reg]; + error = vie_update_register(vcpu, reg, val, size); + } + break; + case 0xA1: + /* + * MOV from seg:moffset to AX/EAX/RAX + * A1: mov AX, moffs16 + * A1: mov EAX, moffs32 + * REX.W + A1: mov RAX, moffs64 + */ + error = memread(vcpu, gpa, &val, size, arg); + if (error == 0) { + reg = VM_REG_GUEST_RAX; + error = vie_update_register(vcpu, reg, val, size); + } + break; + case 0xA3: + /* + * MOV from AX/EAX/RAX to seg:moffset + * A3: mov moffs16, AX + * A3: mov moffs32, EAX + * REX.W + A3: mov moffs64, RAX + */ + error = vie_read_register(vcpu, VM_REG_GUEST_RAX, &val); + if (error == 0) { + val &= size2mask[size]; + error = memwrite(vcpu, gpa, val, size, arg); + } + break; + case 0xC6: + /* + * MOV from imm8 to mem (ModRM:r/m) + * C6/0 mov r/m8, imm8 + * REX + C6/0 mov r/m8, imm8 + */ + size = 1; /* override for byte operation */ + error = memwrite(vcpu, gpa, vie->immediate, size, arg); + break; + case 0xC7: + /* + * MOV from imm16/imm32 to mem (ModRM:r/m) + * C7/0 mov r/m16, imm16 + * C7/0 mov r/m32, imm32 + * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits) + */ + val = vie->immediate & size2mask[size]; + error = memwrite(vcpu, gpa, val, size, arg); + break; + default: + break; + } + + return (error); +} + +static int +emulate_movx(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, + mem_region_read_t 
memread, mem_region_write_t memwrite __unused, void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint64_t val; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0xB6: + /* + * MOV and zero extend byte from mem (ModRM:r/m) to + * reg (ModRM:reg). + * + * 0F B6/r movzx r16, r/m8 + * 0F B6/r movzx r32, r/m8 + * REX.W + 0F B6/r movzx r64, r/m8 + */ + + /* get the first operand */ + error = memread(vcpu, gpa, &val, 1, arg); + if (error) + break; + + /* get the second operand */ + reg = gpr_map[vie->reg]; + + /* zero-extend byte */ + val = (uint8_t)val; + + /* write the result */ + error = vie_update_register(vcpu, reg, val, size); + break; + case 0xB7: + /* + * MOV and zero extend word from mem (ModRM:r/m) to + * reg (ModRM:reg). + * + * 0F B7/r movzx r32, r/m16 + * REX.W + 0F B7/r movzx r64, r/m16 + */ + error = memread(vcpu, gpa, &val, 2, arg); + if (error) + return (error); + + reg = gpr_map[vie->reg]; + + /* zero-extend word */ + val = (uint16_t)val; + + error = vie_update_register(vcpu, reg, val, size); + break; + case 0xBE: + /* + * MOV and sign extend byte from mem (ModRM:r/m) to + * reg (ModRM:reg). + * + * 0F BE/r movsx r16, r/m8 + * 0F BE/r movsx r32, r/m8 + * REX.W + 0F BE/r movsx r64, r/m8 + */ + + /* get the first operand */ + error = memread(vcpu, gpa, &val, 1, arg); + if (error) + break; + + /* get the second operand */ + reg = gpr_map[vie->reg]; + + /* sign extend byte */ + val = (int8_t)val; + + /* write the result */ + error = vie_update_register(vcpu, reg, val, size); + break; + default: + break; + } + return (error); +} + +/* + * Helper function to calculate and validate a linear address. + */ +static int +get_gla(struct vcpu *vcpu, struct vie *vie __unused, + struct vm_guest_paging *paging, int opsize, int addrsize, int prot, + enum vm_reg_name seg, enum vm_reg_name gpr, uint64_t *gla, int *fault) +{ + struct seg_desc desc; + uint64_t cr0, val, rflags; + int error __diagused; + + error = vie_read_register(vcpu, VM_REG_GUEST_CR0, &cr0); + KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); + + error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = vm_get_seg_desc(vcpu, seg, &desc); + KASSERT(error == 0, ("%s: error %d getting segment descriptor %d", + __func__, error, seg)); + + error = vie_read_register(vcpu, gpr, &val); + KASSERT(error == 0, ("%s: error %d getting register %d", __func__, + error, gpr)); + + if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize, + addrsize, prot, gla)) { + if (seg == VM_REG_GUEST_SS) + vm_inject_ss(vcpu, 0); + else + vm_inject_gp(vcpu); + goto guest_fault; + } + + if (vie_canonical_check(paging->cpu_mode, *gla)) { + if (seg == VM_REG_GUEST_SS) + vm_inject_ss(vcpu, 0); + else + vm_inject_gp(vcpu); + goto guest_fault; + } + + if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) { + vm_inject_ac(vcpu, 0); + goto guest_fault; + } + + *fault = 0; + return (0); + +guest_fault: + *fault = 1; + return (0); +} + +static int +emulate_movs(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ +#ifdef _KERNEL + struct vm_copyinfo copyinfo[2]; +#else + struct iovec copyinfo[2]; +#endif + uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val; + uint64_t rcx, rdi, rsi, rflags; + int error, fault, opsize, seg, repeat; + + opsize = (vie->op.op_byte == 0xA4) ? 
1 : vie->opsize; + val = 0; + error = 0; + + /* + * XXX although the MOVS instruction is only supposed to be used with + * the "rep" prefix some guests like FreeBSD will use "repnz" instead. + * + * Empirically the "repnz" prefix has identical behavior to "rep" + * and the zero flag does not make a difference. + */ + repeat = vie->repz_present | vie->repnz_present; + + if (repeat) { + error = vie_read_register(vcpu, VM_REG_GUEST_RCX, &rcx); + KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); + + /* + * The count register is %rcx, %ecx or %cx depending on the + * address size of the instruction. + */ + if ((rcx & vie_size2mask(vie->addrsize)) == 0) { + error = 0; + goto done; + } + } + + /* + * Source Destination Comments + * -------------------------------------------- + * (1) memory memory n/a + * (2) memory mmio emulated + * (3) mmio memory emulated + * (4) mmio mmio emulated + * + * At this point we don't have sufficient information to distinguish + * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this + * out because it will succeed only when operating on regular memory. + * + * XXX the emulation doesn't properly handle the case where 'gpa' + * is straddling the boundary between the normal memory and MMIO. + */ + + seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS; + error = get_gla(vcpu, vie, paging, opsize, vie->addrsize, + PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault); + if (error || fault) + goto done; + + error = vm_copy_setup(vcpu, paging, srcaddr, opsize, PROT_READ, + copyinfo, nitems(copyinfo), &fault); + if (error == 0) { + if (fault) + goto done; /* Resume guest to handle fault */ + + /* + * case (2): read from system memory and write to mmio. + */ + vm_copyin(copyinfo, &val, opsize); + vm_copy_teardown(copyinfo, nitems(copyinfo)); + error = memwrite(vcpu, gpa, val, opsize, arg); + if (error) + goto done; + } else { + /* + * 'vm_copy_setup()' is expected to fail for cases (3) and (4) + * if 'srcaddr' is in the mmio space. + */ + + error = get_gla(vcpu, vie, paging, opsize, vie->addrsize, + PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr, + &fault); + if (error || fault) + goto done; + + error = vm_copy_setup(vcpu, paging, dstaddr, opsize, + PROT_WRITE, copyinfo, nitems(copyinfo), &fault); + if (error == 0) { + if (fault) + goto done; /* Resume guest to handle fault */ + + /* + * case (3): read from MMIO and write to system memory. + * + * A MMIO read can have side-effects so we + * commit to it only after vm_copy_setup() is + * successful. If a page-fault needs to be + * injected into the guest then it will happen + * before the MMIO read is attempted. + */ + error = memread(vcpu, gpa, &val, opsize, arg); + if (error) + goto done; + + vm_copyout(&val, copyinfo, opsize); + vm_copy_teardown(copyinfo, nitems(copyinfo)); + } else { + /* + * Case (4): read from and write to mmio. + * + * Commit to the MMIO read/write (with potential + * side-effects) only after we are sure that the + * instruction is not going to be restarted due + * to address translation faults. 
+ */ + error = vm_gla2gpa(vcpu, paging, srcaddr, + PROT_READ, &srcgpa, &fault); + if (error || fault) + goto done; + + error = vm_gla2gpa(vcpu, paging, dstaddr, + PROT_WRITE, &dstgpa, &fault); + if (error || fault) + goto done; + + error = memread(vcpu, srcgpa, &val, opsize, arg); + if (error) + goto done; + + error = memwrite(vcpu, dstgpa, val, opsize, arg); + if (error) + goto done; + } + } + + error = vie_read_register(vcpu, VM_REG_GUEST_RSI, &rsi); + KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error)); + + error = vie_read_register(vcpu, VM_REG_GUEST_RDI, &rdi); + KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); + + error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + if (rflags & PSL_D) { + rsi -= opsize; + rdi -= opsize; + } else { + rsi += opsize; + rdi += opsize; + } + + error = vie_update_register(vcpu, VM_REG_GUEST_RSI, rsi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error)); + + error = vie_update_register(vcpu, VM_REG_GUEST_RDI, rdi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); + + if (repeat) { + rcx = rcx - 1; + error = vie_update_register(vcpu, VM_REG_GUEST_RCX, + rcx, vie->addrsize); + KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); + + /* + * Repeat the instruction if the count register is not zero. + */ + if ((rcx & vie_size2mask(vie->addrsize)) != 0) + vm_restart_instruction(vcpu); + } +done: + KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d", + __func__, error)); + return (error); +} + +static int +emulate_stos(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging __unused, mem_region_read_t memread __unused, + mem_region_write_t memwrite, void *arg) +{ + int error, opsize, repeat; + uint64_t val; + uint64_t rcx, rdi, rflags; + + opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize; + repeat = vie->repz_present | vie->repnz_present; + + if (repeat) { + error = vie_read_register(vcpu, VM_REG_GUEST_RCX, &rcx); + KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); + + /* + * The count register is %rcx, %ecx or %cx depending on the + * address size of the instruction. + */ + if ((rcx & vie_size2mask(vie->addrsize)) == 0) + return (0); + } + + error = vie_read_register(vcpu, VM_REG_GUEST_RAX, &val); + KASSERT(!error, ("%s: error %d getting rax", __func__, error)); + + error = memwrite(vcpu, gpa, val, opsize, arg); + if (error) + return (error); + + error = vie_read_register(vcpu, VM_REG_GUEST_RDI, &rdi); + KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); + + error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + if (rflags & PSL_D) + rdi -= opsize; + else + rdi += opsize; + + error = vie_update_register(vcpu, VM_REG_GUEST_RDI, rdi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); + + if (repeat) { + rcx = rcx - 1; + error = vie_update_register(vcpu, VM_REG_GUEST_RCX, + rcx, vie->addrsize); + KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); + + /* + * Repeat the instruction if the count register is not zero. 
+ */ + if ((rcx & vie_size2mask(vie->addrsize)) != 0) + vm_restart_instruction(vcpu); + } + + return (0); +} + +static int +emulate_and(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint64_t result, rflags, rflags2, val1, val2; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x23: + /* + * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the + * result in reg. + * + * 23/r and r16, r/m16 + * 23/r and r32, r/m32 + * REX.W + 23/r and r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vcpu, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = memread(vcpu, gpa, &val2, size, arg); + if (error) + break; + + /* perform the operation and write the result */ + result = val1 & val2; + error = vie_update_register(vcpu, reg, result, size); + break; + case 0x81: + case 0x83: + /* + * AND mem (ModRM:r/m) with immediate and store the + * result in mem. + * + * 81 /4 and r/m16, imm16 + * 81 /4 and r/m32, imm32 + * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64 + * + * 83 /4 and r/m16, imm8 sign-extended to 16 + * 83 /4 and r/m32, imm8 sign-extended to 32 + * REX.W + 83/4 and r/m64, imm8 sign-extended to 64 + */ + + /* get the first operand */ + error = memread(vcpu, gpa, &val1, size, arg); + if (error) + break; + + /* + * perform the operation with the pre-fetched immediate + * operand and write the result + */ + result = val1 & vie->immediate; + error = memwrite(vcpu, gpa, result, size, arg); + break; + default: + break; + } + if (error) + return (error); + + error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + + /* + * OF and CF are cleared; the SF, ZF and PF flags are set according + * to the result; AF is undefined. + * + * The updated status flags are obtained by subtracting 0 from 'result'. + */ + rflags2 = getcc(size, result, 0); + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); + + error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8); + return (error); +} + +static int +emulate_or(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint64_t result, rflags, rflags2, val1, val2; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x0B: + /* + * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the + * result in reg. + * + * 0b/r or r16, r/m16 + * 0b/r or r32, r/m32 + * REX.W + 0b/r or r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vcpu, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = memread(vcpu, gpa, &val2, size, arg); + if (error) + break; + + /* perform the operation and write the result */ + result = val1 | val2; + error = vie_update_register(vcpu, reg, result, size); + break; + case 0x81: + case 0x83: + /* + * OR mem (ModRM:r/m) with immediate and store the + * result in mem. 
+ * + * 81 /1 or r/m16, imm16 + * 81 /1 or r/m32, imm32 + * REX.W + 81 /1 or r/m64, imm32 sign-extended to 64 + * + * 83 /1 or r/m16, imm8 sign-extended to 16 + * 83 /1 or r/m32, imm8 sign-extended to 32 + * REX.W + 83/1 or r/m64, imm8 sign-extended to 64 + */ + + /* get the first operand */ + error = memread(vcpu, gpa, &val1, size, arg); + if (error) + break; + + /* + * perform the operation with the pre-fetched immediate + * operand and write the result + */ + result = val1 | vie->immediate; + error = memwrite(vcpu, gpa, result, size, arg); + break; + default: + break; + } + if (error) + return (error); + + error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + + /* + * OF and CF are cleared; the SF, ZF and PF flags are set according + * to the result; AF is undefined. + * + * The updated status flags are obtained by subtracting 0 from 'result'. + */ + rflags2 = getcc(size, result, 0); + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); + + error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8); + return (error); +} + +static int +emulate_cmp(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) +{ + int error, size; + uint64_t regop, memop, op1, op2, rflags, rflags2; + enum vm_reg_name reg; + + size = vie->opsize; + switch (vie->op.op_byte) { + case 0x39: + case 0x3B: + /* + * 39/r CMP r/m16, r16 + * 39/r CMP r/m32, r32 + * REX.W 39/r CMP r/m64, r64 + * + * 3B/r CMP r16, r/m16 + * 3B/r CMP r32, r/m32 + * REX.W + 3B/r CMP r64, r/m64 + * + * Compare the first operand with the second operand and + * set status flags in EFLAGS register. The comparison is + * performed by subtracting the second operand from the first + * operand and then setting the status flags. + */ + + /* Get the register operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vcpu, reg, ®op); + if (error) + return (error); + + /* Get the memory operand */ + error = memread(vcpu, gpa, &memop, size, arg); + if (error) + return (error); + + if (vie->op.op_byte == 0x3B) { + op1 = regop; + op2 = memop; + } else { + op1 = memop; + op2 = regop; + } + rflags2 = getcc(size, op1, op2); + break; + case 0x80: + case 0x81: + case 0x83: + /* + * 80 /7 cmp r/m8, imm8 + * REX + 80 /7 cmp r/m8, imm8 + * + * 81 /7 cmp r/m16, imm16 + * 81 /7 cmp r/m32, imm32 + * REX.W + 81 /7 cmp r/m64, imm32 sign-extended to 64 + * + * 83 /7 cmp r/m16, imm8 sign-extended to 16 + * 83 /7 cmp r/m32, imm8 sign-extended to 32 + * REX.W + 83 /7 cmp r/m64, imm8 sign-extended to 64 + * + * Compare mem (ModRM:r/m) with immediate and set + * status flags according to the results. The + * comparison is performed by subtracting the + * immediate from the first operand and then setting + * the status flags. 
+ * + */ + if (vie->op.op_byte == 0x80) + size = 1; + + /* get the first operand */ + error = memread(vcpu, gpa, &op1, size, arg); + if (error) + return (error); + + rflags2 = getcc(size, op1, vie->immediate); + break; + default: + return (EINVAL); + } + error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & RFLAGS_STATUS_BITS; + + error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8); + return (error); +} + +static int +emulate_test(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) +{ + int error, size; + uint64_t op1, rflags, rflags2; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0xF6: + /* + * F6 /0 test r/m8, imm8 + */ + size = 1; /* override for byte operation */ + /* FALLTHROUGH */ + case 0xF7: + /* + * F7 /0 test r/m16, imm16 + * F7 /0 test r/m32, imm32 + * REX.W + F7 /0 test r/m64, imm32 sign-extended to 64 + * + * Test mem (ModRM:r/m) with immediate and set status + * flags according to the results. The comparison is + * performed by anding the immediate from the first + * operand and then setting the status flags. + */ + if ((vie->reg & 7) != 0) + return (EINVAL); + + error = memread(vcpu, gpa, &op1, size, arg); + if (error) + return (error); + + rflags2 = getandflags(size, op1, vie->immediate); + break; + default: + return (EINVAL); + } + error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + + /* + * OF and CF are cleared; the SF, ZF and PF flags are set according + * to the result; AF is undefined. + */ + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); + + error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8); + return (error); +} + +static int +emulate_bextr(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite __unused, void *arg) +{ + uint64_t src1, src2, dst, rflags; + unsigned start, len, size; + int error; + + size = vie->opsize; + error = EINVAL; + + /* + * VEX.LZ.0F38.W0 F7 /r BEXTR r32a, r/m32, r32b + * VEX.LZ.0F38.W1 F7 /r BEXTR r64a, r/m64, r64b + * + * Destination operand is ModRM:reg. Source operands are ModRM:r/m and + * Vex.vvvv. + * + * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored). + */ + if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT) + size = 4; + + /* + * Extracts contiguous bits from the first /source/ operand (second + * operand) using an index and length specified in the second /source/ + * operand (third operand). + */ + error = memread(vcpu, gpa, &src1, size, arg); + if (error) + return (error); + error = vie_read_register(vcpu, gpr_map[vie->vex_reg], &src2); + if (error) + return (error); + error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + + start = (src2 & 0xff); + len = (src2 & 0xff00) >> 8; + + /* If no bits are extracted, the destination register is cleared. */ + dst = 0; + + /* If START exceeds the operand size, no bits are extracted. */ + if (start > size * 8) + goto done; + /* Length is bounded by both the destination size and start offset. 
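 *
 * Illustrative example: with a 4-byte operand, src2 == 0x0804 gives start 4
 * and len 8, so src1 == 0xABCD extracts bits 4..11 and dst becomes 0xBC;
 * a request such as src2 == 0x4010 (start 16, len 64) is clamped to the
 * remaining 16 bits.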
*/ + if (start + len > size * 8) + len = (size * 8) - start; + if (len == 0) + goto done; + + if (start > 0) + src1 = (src1 >> start); + if (len < 64) + src1 = src1 & ((1ull << len) - 1); + dst = src1; + +done: + error = vie_update_register(vcpu, gpr_map[vie->reg], dst, size); + if (error) + return (error); + + /* + * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result. + * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared. + */ + rflags &= ~RFLAGS_STATUS_BITS; + if (dst == 0) + rflags |= PSL_Z; + error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, + 8); + return (error); +} + +static int +emulate_add(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) +{ + int error, size; + uint64_t nval, rflags, rflags2, val1, val2; + enum vm_reg_name reg; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x03: + /* + * ADD r/m to r and store the result in r + * + * 03/r ADD r16, r/m16 + * 03/r ADD r32, r/m32 + * REX.W + 03/r ADD r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vcpu, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = memread(vcpu, gpa, &val2, size, arg); + if (error) + break; + + /* perform the operation and write the result */ + nval = val1 + val2; + error = vie_update_register(vcpu, reg, nval, size); + break; + default: + break; + } + + if (!error) { + rflags2 = getaddflags(size, val1, val2); + error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, + &rflags); + if (error) + return (error); + + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & RFLAGS_STATUS_BITS; + error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, + rflags, 8); + } + + return (error); +} + +static int +emulate_sub(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg) +{ + int error, size; + uint64_t nval, rflags, rflags2, val1, val2; + enum vm_reg_name reg; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x2B: + /* + * SUB r/m from r and store the result in r + * + * 2B/r SUB r16, r/m16 + * 2B/r SUB r32, r/m32 + * REX.W + 2B/r SUB r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vcpu, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = memread(vcpu, gpa, &val2, size, arg); + if (error) + break; + + /* perform the operation and write the result */ + nval = val1 - val2; + error = vie_update_register(vcpu, reg, nval, size); + break; + default: + break; + } + + if (!error) { + rflags2 = getcc(size, val1, val2); + error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, + &rflags); + if (error) + return (error); + + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & RFLAGS_STATUS_BITS; + error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, + rflags, 8); + } + + return (error); +} + +static int +emulate_stack_op(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ +#ifdef _KERNEL + struct vm_copyinfo copyinfo[2]; +#else + struct iovec copyinfo[2]; +#endif + struct seg_desc ss_desc; + uint64_t cr0, rflags, rsp, stack_gla, val; + int error, fault, size, stackaddrsize, pushop; + + val = 0; + size = vie->opsize; + pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 
1 : 0; + + /* + * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1 + */ + if (paging->cpu_mode == CPU_MODE_REAL) { + stackaddrsize = 2; + } else if (paging->cpu_mode == CPU_MODE_64BIT) { + /* + * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3 + * - Stack pointer size is always 64-bits. + * - PUSH/POP of 32-bit values is not possible in 64-bit mode. + * - 16-bit PUSH/POP is supported by using the operand size + * override prefix (66H). + */ + stackaddrsize = 8; + size = vie->opsize_override ? 2 : 8; + } else { + /* + * In protected or compatibility mode the 'B' flag in the + * stack-segment descriptor determines the size of the + * stack pointer. + */ + error = vm_get_seg_desc(vcpu, VM_REG_GUEST_SS, &ss_desc); + KASSERT(error == 0, ("%s: error %d getting SS descriptor", + __func__, error)); + if (SEG_DESC_DEF32(ss_desc.access)) + stackaddrsize = 4; + else + stackaddrsize = 2; + } + + error = vie_read_register(vcpu, VM_REG_GUEST_CR0, &cr0); + KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); + + error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = vie_read_register(vcpu, VM_REG_GUEST_RSP, &rsp); + KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error)); + if (pushop) { + rsp -= size; + } + + if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc, + rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ, + &stack_gla)) { + vm_inject_ss(vcpu, 0); + return (0); + } + + if (vie_canonical_check(paging->cpu_mode, stack_gla)) { + vm_inject_ss(vcpu, 0); + return (0); + } + + if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) { + vm_inject_ac(vcpu, 0); + return (0); + } + + error = vm_copy_setup(vcpu, paging, stack_gla, size, + pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo), + &fault); + if (error || fault) + return (error); + + if (pushop) { + error = memread(vcpu, mmio_gpa, &val, size, arg); + if (error == 0) + vm_copyout(&val, copyinfo, size); + } else { + vm_copyin(copyinfo, &val, size); + error = memwrite(vcpu, mmio_gpa, val, size, arg); + rsp += size; + } + vm_copy_teardown(copyinfo, nitems(copyinfo)); + + if (error == 0) { + error = vie_update_register(vcpu, VM_REG_GUEST_RSP, rsp, + stackaddrsize); + KASSERT(error == 0, ("error %d updating rsp", error)); + } + return (error); +} + +static int +emulate_push(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ + int error; + + /* + * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. + * + * PUSH is part of the group 5 extended opcodes and is identified + * by ModRM:reg = b110. + */ + if ((vie->reg & 7) != 6) + return (EINVAL); + + error = emulate_stack_op(vcpu, mmio_gpa, vie, paging, memread, + memwrite, arg); + return (error); +} + +static int +emulate_pop(struct vcpu *vcpu, uint64_t mmio_gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ + int error; + + /* + * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. + * + * POP is part of the group 1A extended opcodes and is identified + * by ModRM:reg = b000. 
+ */ + if ((vie->reg & 7) != 0) + return (EINVAL); + + error = emulate_stack_op(vcpu, mmio_gpa, vie, paging, memread, + memwrite, arg); + return (error); +} + +static int +emulate_group1(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging __unused, mem_region_read_t memread, + mem_region_write_t memwrite, void *memarg) +{ + int error; + + switch (vie->reg & 7) { + case 0x1: /* OR */ + error = emulate_or(vcpu, gpa, vie, + memread, memwrite, memarg); + break; + case 0x4: /* AND */ + error = emulate_and(vcpu, gpa, vie, + memread, memwrite, memarg); + break; + case 0x7: /* CMP */ + error = emulate_cmp(vcpu, gpa, vie, + memread, memwrite, memarg); + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +static int +emulate_bittest(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite __unused, + void *memarg) +{ + uint64_t val, rflags; + int error, bitmask, bitoff; + + /* + * 0F BA is a Group 8 extended opcode. + * + * Currently we only emulate the 'Bit Test' instruction which is + * identified by a ModR/M:reg encoding of 100b. + */ + if ((vie->reg & 7) != 4) + return (EINVAL); + + error = vie_read_register(vcpu, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = memread(vcpu, gpa, &val, vie->opsize, memarg); + if (error) + return (error); + + /* + * Intel SDM, Vol 2, Table 3-2: + * "Range of Bit Positions Specified by Bit Offset Operands" + */ + bitmask = vie->opsize * 8 - 1; + bitoff = vie->immediate & bitmask; + + /* Copy the bit into the Carry flag in %rflags */ + if (val & (1UL << bitoff)) + rflags |= PSL_C; + else + rflags &= ~PSL_C; + + error = vie_update_register(vcpu, VM_REG_GUEST_RFLAGS, rflags, 8); + KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error)); + + return (0); +} + +static int +emulate_twob_group15(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite __unused, + void *memarg) +{ + int error; + uint64_t buf; + + switch (vie->reg & 7) { + case 0x7: /* CLFLUSH, CLFLUSHOPT, and SFENCE */ + if (vie->mod == 0x3) { + /* + * SFENCE. Ignore it, VM exit provides enough + * barriers on its own. + */ + error = 0; + } else { + /* + * CLFLUSH, CLFLUSHOPT. Only check for access + * rights. 
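+			 * A one-byte read of the target address below is enough + * to surface any access fault; nothing actually needs to + * be flushed by the hypervisor.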
+ */ + error = memread(vcpu, gpa, &buf, 1, memarg); + } + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +int +vmm_emulate_instruction(struct vcpu *vcpu, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *memarg) +{ + int error; + + if (!vie->decoded) + return (EINVAL); + + switch (vie->op.op_type) { + case VIE_OP_TYPE_GROUP1: + error = emulate_group1(vcpu, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_POP: + error = emulate_pop(vcpu, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_PUSH: + error = emulate_push(vcpu, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_CMP: + error = emulate_cmp(vcpu, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_MOV: + error = emulate_mov(vcpu, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_MOVSX: + case VIE_OP_TYPE_MOVZX: + error = emulate_movx(vcpu, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_MOVS: + error = emulate_movs(vcpu, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_STOS: + error = emulate_stos(vcpu, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_AND: + error = emulate_and(vcpu, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_OR: + error = emulate_or(vcpu, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_SUB: + error = emulate_sub(vcpu, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_BITTEST: + error = emulate_bittest(vcpu, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_TWOB_GRP15: + error = emulate_twob_group15(vcpu, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_ADD: + error = emulate_add(vcpu, gpa, vie, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_TEST: + error = emulate_test(vcpu, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_BEXTR: + error = emulate_bextr(vcpu, gpa, vie, paging, + memread, memwrite, memarg); + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +int +vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla) +{ + KASSERT(size == 1 || size == 2 || size == 4 || size == 8, + ("%s: invalid size %d", __func__, size)); + KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl)); + + if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0) + return (0); + + return ((gla & (size - 1)) ? 1 : 0); +} + +int +vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla) +{ + uint64_t mask; + + if (cpu_mode != CPU_MODE_64BIT) + return (0); + + /* + * The value of the bit 47 in the 'gla' should be replicated in the + * most significant 16 bits. 
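+	 * For example, 0xffff800000000000 and 0x00007fffffffffff are canonical + * while 0x0000800000000000 is not.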
+ */ + mask = ~((1UL << 48) - 1); + if (gla & (1UL << 47)) + return ((gla & mask) != mask); + else + return ((gla & mask) != 0); +} + +uint64_t +vie_size2mask(int size) +{ + KASSERT(size == 1 || size == 2 || size == 4 || size == 8, + ("vie_size2mask: invalid size %d", size)); + return (size2mask[size]); +} + +int +vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, + struct seg_desc *desc, uint64_t offset, int length, int addrsize, + int prot, uint64_t *gla) +{ + uint64_t firstoff, low_limit, high_limit, segbase; + int glasize, type; + + KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS, + ("%s: invalid segment %d", __func__, seg)); + KASSERT(length == 1 || length == 2 || length == 4 || length == 8, + ("%s: invalid operand size %d", __func__, length)); + KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0, + ("%s: invalid prot %#x", __func__, prot)); + + firstoff = offset; + if (cpu_mode == CPU_MODE_64BIT) { + KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address " + "size %d for cpu_mode %d", __func__, addrsize, cpu_mode)); + glasize = 8; + } else { + KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address " + "size %d for cpu mode %d", __func__, addrsize, cpu_mode)); + glasize = 4; + /* + * If the segment selector is loaded with a NULL selector + * then the descriptor is unusable and attempting to use + * it results in a #GP(0). + */ + if (SEG_DESC_UNUSABLE(desc->access)) + return (-1); + + /* + * The processor generates a #NP exception when a segment + * register is loaded with a selector that points to a + * descriptor that is not present. If this was the case then + * it would have been checked before the VM-exit. + */ + KASSERT(SEG_DESC_PRESENT(desc->access), + ("segment %d not present: %#x", seg, desc->access)); + + /* + * The descriptor type must indicate a code/data segment. + */ + type = SEG_DESC_TYPE(desc->access); + KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " + "descriptor type %#x", seg, type)); + + if (prot & PROT_READ) { + /* #GP on a read access to a exec-only code segment */ + if ((type & 0xA) == 0x8) + return (-1); + } + + if (prot & PROT_WRITE) { + /* + * #GP on a write access to a code segment or a + * read-only data segment. + */ + if (type & 0x8) /* code segment */ + return (-1); + + if ((type & 0xA) == 0) /* read-only data seg */ + return (-1); + } + + /* + * 'desc->limit' is fully expanded taking granularity into + * account. + */ + if ((type & 0xC) == 0x4) { + /* expand-down data segment */ + low_limit = desc->limit + 1; + high_limit = SEG_DESC_DEF32(desc->access) ? + 0xffffffff : 0xffff; + } else { + /* code segment or expand-up data segment */ + low_limit = 0; + high_limit = desc->limit; + } + + while (length > 0) { + offset &= vie_size2mask(addrsize); + if (offset < low_limit || offset > high_limit) + return (-1); + offset++; + length--; + } + } + + /* + * In 64-bit mode all segments except %fs and %gs have a segment + * base address of 0. + */ + if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && + seg != VM_REG_GUEST_GS) { + segbase = 0; + } else { + segbase = desc->base; + } + + /* + * Truncate 'firstoff' to the effective address size before adding + * it to the segment base. + */ + firstoff &= vie_size2mask(addrsize); + *gla = (segbase + firstoff) & vie_size2mask(glasize); + return (0); +} + +/* + * Prepare a partially decoded vie for a 2nd attempt. 
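+ * Everything from 'vie_startzero' onwards is zeroed below; the fetched + * instruction bytes and their length are preserved.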
+ */ +void +vie_restart(struct vie *vie) +{ + _Static_assert( + offsetof(struct vie, inst) < offsetof(struct vie, vie_startzero) && + offsetof(struct vie, num_valid) < offsetof(struct vie, vie_startzero), + "restart should not erase instruction length or contents"); + + memset((char *)vie + offsetof(struct vie, vie_startzero), 0, + sizeof(*vie) - offsetof(struct vie, vie_startzero)); + + vie->base_register = VM_REG_LAST; + vie->index_register = VM_REG_LAST; + vie->segment_register = VM_REG_LAST; +} + +void +vie_init(struct vie *vie, const char *inst_bytes, int inst_length) +{ + KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE, + ("%s: invalid instruction length (%d)", __func__, inst_length)); + + vie_restart(vie); + memset(vie->inst, 0, sizeof(vie->inst)); + if (inst_length != 0) + memcpy(vie->inst, inst_bytes, inst_length); + vie->num_valid = inst_length; +} + +#ifdef _KERNEL +static int +pf_error_code(int usermode, int prot, int rsvd, uint64_t pte) +{ + int error_code = 0; + + if (pte & PG_V) + error_code |= PGEX_P; + if (prot & VM_PROT_WRITE) + error_code |= PGEX_W; + if (usermode) + error_code |= PGEX_U; + if (rsvd) + error_code |= PGEX_RSV; + if (prot & VM_PROT_EXECUTE) + error_code |= PGEX_I; + + return (error_code); +} + +static void +ptp_release(void **cookie) +{ + if (*cookie != NULL) { + vm_gpa_release(*cookie); + *cookie = NULL; + } +} + +static void * +ptp_hold(struct vcpu *vcpu, vm_paddr_t ptpphys, size_t len, void **cookie) +{ + void *ptr; + + ptp_release(cookie); + ptr = vm_gpa_hold(vcpu, ptpphys, len, VM_PROT_RW, cookie); + return (ptr); +} + +static int +_vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only) +{ + int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable; + u_int retries; + uint64_t *ptpbase, ptpphys, pte, pgsize; + uint32_t *ptpbase32, pte32; + void *cookie; + + *guest_fault = 0; + + usermode = (paging->cpl == 3 ? 1 : 0); + writable = prot & VM_PROT_WRITE; + cookie = NULL; + retval = 0; + retries = 0; +restart: + ptpphys = paging->cr3; /* root of the page tables */ + ptp_release(&cookie); + if (retries++ > 0) + maybe_yield(); + + if (vie_canonical_check(paging->cpu_mode, gla)) { + /* + * XXX assuming a non-stack reference otherwise a stack fault + * should be generated. + */ + if (!check_only) + vm_inject_gp(vcpu); + goto fault; + } + + if (paging->paging_mode == PAGING_MODE_FLAT) { + *gpa = gla; + goto done; + } + + if (paging->paging_mode == PAGING_MODE_32) { + nlevels = 2; + while (--nlevels >= 0) { + /* Zero out the lower 12 bits. */ + ptpphys &= ~0xfff; + + ptpbase32 = ptp_hold(vcpu, ptpphys, PAGE_SIZE, + &cookie); + + if (ptpbase32 == NULL) + goto error; + + ptpshift = PAGE_SHIFT + nlevels * 10; + ptpindex = (gla >> ptpshift) & 0x3FF; + pgsize = 1UL << ptpshift; + + pte32 = ptpbase32[ptpindex]; + + if ((pte32 & PG_V) == 0 || + (usermode && (pte32 & PG_U) == 0) || + (writable && (pte32 & PG_RW) == 0)) { + if (!check_only) { + pfcode = pf_error_code(usermode, prot, 0, + pte32); + vm_inject_pf(vcpu, pfcode, gla); + } + goto fault; + } + + /* + * Emulate the x86 MMU's management of the accessed + * and dirty flags. While the accessed flag is set + * at every level of the page table, the dirty flag + * is only set at the last level providing the guest + * physical address. 
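+			 * Both flags are set with a compare-and-swap so that a + * concurrent guest update of the PTE causes the walk to + * restart rather than being overwritten.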
+ */ + if (!check_only && (pte32 & PG_A) == 0) { + if (atomic_cmpset_32(&ptpbase32[ptpindex], + pte32, pte32 | PG_A) == 0) { + goto restart; + } + } + + /* XXX must be ignored if CR4.PSE=0 */ + if (nlevels > 0 && (pte32 & PG_PS) != 0) + break; + + ptpphys = pte32; + } + + /* Set the dirty bit in the page table entry if necessary */ + if (!check_only && writable && (pte32 & PG_M) == 0) { + if (atomic_cmpset_32(&ptpbase32[ptpindex], + pte32, pte32 | PG_M) == 0) { + goto restart; + } + } + + /* Zero out the lower 'ptpshift' bits */ + pte32 >>= ptpshift; pte32 <<= ptpshift; + *gpa = pte32 | (gla & (pgsize - 1)); + goto done; + } + + if (paging->paging_mode == PAGING_MODE_PAE) { + /* Zero out the lower 5 bits and the upper 32 bits */ + ptpphys &= 0xffffffe0UL; + + ptpbase = ptp_hold(vcpu, ptpphys, sizeof(*ptpbase) * 4, + &cookie); + if (ptpbase == NULL) + goto error; + + ptpindex = (gla >> 30) & 0x3; + + pte = ptpbase[ptpindex]; + + if ((pte & PG_V) == 0) { + if (!check_only) { + pfcode = pf_error_code(usermode, prot, 0, pte); + vm_inject_pf(vcpu, pfcode, gla); + } + goto fault; + } + + ptpphys = pte; + + nlevels = 2; + } else if (paging->paging_mode == PAGING_MODE_64_LA57) { + nlevels = 5; + } else { + nlevels = 4; + } + + while (--nlevels >= 0) { + /* Zero out the lower 12 bits and the upper 12 bits */ + ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12; + + ptpbase = ptp_hold(vcpu, ptpphys, PAGE_SIZE, &cookie); + if (ptpbase == NULL) + goto error; + + ptpshift = PAGE_SHIFT + nlevels * 9; + ptpindex = (gla >> ptpshift) & 0x1FF; + pgsize = 1UL << ptpshift; + + pte = ptpbase[ptpindex]; + + if ((pte & PG_V) == 0 || + (usermode && (pte & PG_U) == 0) || + (writable && (pte & PG_RW) == 0)) { + if (!check_only) { + pfcode = pf_error_code(usermode, prot, 0, pte); + vm_inject_pf(vcpu, pfcode, gla); + } + goto fault; + } + + /* Set the accessed bit in the page table entry */ + if (!check_only && (pte & PG_A) == 0) { + if (atomic_cmpset_64(&ptpbase[ptpindex], + pte, pte | PG_A) == 0) { + goto restart; + } + } + + if (nlevels > 0 && (pte & PG_PS) != 0) { + if (pgsize > 1 * GB) { + if (!check_only) { + pfcode = pf_error_code(usermode, prot, 1, + pte); + vm_inject_pf(vcpu, pfcode, gla); + } + goto fault; + } + break; + } + + ptpphys = pte; + } + + /* Set the dirty bit in the page table entry if necessary */ + if (!check_only && writable && (pte & PG_M) == 0) { + if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0) + goto restart; + } + + /* Zero out the lower 'ptpshift' bits and the upper 12 bits */ + pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12; + *gpa = pte | (gla & (pgsize - 1)); +done: + ptp_release(&cookie); + KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d", + __func__, retval)); + return (retval); +error: + retval = EFAULT; + goto done; +fault: + *guest_fault = 1; + goto done; +} + +int +vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) +{ + + return (_vm_gla2gpa(vcpu, paging, gla, prot, gpa, guest_fault, + false)); +} + +int +vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) +{ + + return (_vm_gla2gpa(vcpu, paging, gla, prot, gpa, guest_fault, + true)); +} + +int +vmm_fetch_instruction(struct vcpu *vcpu, struct vm_guest_paging *paging, + uint64_t rip, int inst_length, struct vie *vie, int *faultptr) +{ + struct vm_copyinfo copyinfo[2]; + int error, prot; + + if (inst_length > VIE_INST_SIZE) + 
panic("vmm_fetch_instruction: invalid length %d", inst_length); + + prot = PROT_READ | PROT_EXEC; + error = vm_copy_setup(vcpu, paging, rip, inst_length, prot, + copyinfo, nitems(copyinfo), faultptr); + if (error || *faultptr) + return (error); + + vm_copyin(copyinfo, vie->inst, inst_length); + vm_copy_teardown(copyinfo, nitems(copyinfo)); + vie->num_valid = inst_length; + return (0); +} +#endif /* _KERNEL */ + +static int +vie_peek(struct vie *vie, uint8_t *x) +{ + + if (vie->num_processed < vie->num_valid) { + *x = vie->inst[vie->num_processed]; + return (0); + } else + return (-1); +} + +static void +vie_advance(struct vie *vie) +{ + + vie->num_processed++; +} + +static bool +segment_override(uint8_t x, int *seg) +{ + + switch (x) { + case 0x2E: + *seg = VM_REG_GUEST_CS; + break; + case 0x36: + *seg = VM_REG_GUEST_SS; + break; + case 0x3E: + *seg = VM_REG_GUEST_DS; + break; + case 0x26: + *seg = VM_REG_GUEST_ES; + break; + case 0x64: + *seg = VM_REG_GUEST_FS; + break; + case 0x65: + *seg = VM_REG_GUEST_GS; + break; + default: + return (false); + } + return (true); +} + +static int +decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d) +{ + uint8_t x; + + while (1) { + if (vie_peek(vie, &x)) + return (-1); + + if (x == 0x66) + vie->opsize_override = 1; + else if (x == 0x67) + vie->addrsize_override = 1; + else if (x == 0xF3) + vie->repz_present = 1; + else if (x == 0xF2) + vie->repnz_present = 1; + else if (segment_override(x, &vie->segment_register)) + vie->segment_override = 1; + else + break; + + vie_advance(vie); + } + + /* + * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2: + * - Only one REX prefix is allowed per instruction. + * - The REX prefix must immediately precede the opcode byte or the + * escape opcode byte. + * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3) + * the mandatory prefix must come before the REX prefix. + */ + if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) { + vie->rex_present = 1; + vie->rex_w = x & 0x8 ? 1 : 0; + vie->rex_r = x & 0x4 ? 1 : 0; + vie->rex_x = x & 0x2 ? 1 : 0; + vie->rex_b = x & 0x1 ? 1 : 0; + vie_advance(vie); + } + + /* + * § 2.3.5, "The VEX Prefix", SDM Vol 2. + */ + if ((cpu_mode == CPU_MODE_64BIT || cpu_mode == CPU_MODE_COMPATIBILITY) + && x == 0xC4) { + const struct vie_op *optab; + + /* 3-byte VEX prefix. */ + vie->vex_present = 1; + + vie_advance(vie); + if (vie_peek(vie, &x)) + return (-1); + + /* + * 2nd byte: [R', X', B', mmmmm[4:0]]. Bits are inverted + * relative to REX encoding. + */ + vie->rex_r = x & 0x80 ? 0 : 1; + vie->rex_x = x & 0x40 ? 0 : 1; + vie->rex_b = x & 0x20 ? 0 : 1; + + switch (x & 0x1F) { + case 0x2: + /* 0F 38. */ + optab = three_byte_opcodes_0f38; + break; + case 0x1: + /* 0F class - nothing handled here yet. */ + /* FALLTHROUGH */ + case 0x3: + /* 0F 3A class - nothing handled here yet. */ + /* FALLTHROUGH */ + default: + /* Reserved (#UD). */ + return (-1); + } + + vie_advance(vie); + if (vie_peek(vie, &x)) + return (-1); + + /* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */ + vie->rex_w = x & 0x80 ? 1 : 0; + + vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3); + vie->vex_l = !!(x & 0x4); + vie->vex_pp = (x & 0x3); + + /* PP: 1=66 2=F3 3=F2 prefixes. */ + switch (vie->vex_pp) { + case 0x1: + vie->opsize_override = 1; + break; + case 0x2: + vie->repz_present = 1; + break; + case 0x3: + vie->repnz_present = 1; + break; + } + + vie_advance(vie); + + /* Opcode, sans literal prefix prefix. 
*/ + if (vie_peek(vie, &x)) + return (-1); + + vie->op = optab[x]; + if (vie->op.op_type == VIE_OP_TYPE_NONE) + return (-1); + + vie_advance(vie); + } + + /* + * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1 + */ + if (cpu_mode == CPU_MODE_64BIT) { + /* + * Default address size is 64-bits and default operand size + * is 32-bits. + */ + vie->addrsize = vie->addrsize_override ? 4 : 8; + if (vie->rex_w) + vie->opsize = 8; + else if (vie->opsize_override) + vie->opsize = 2; + else + vie->opsize = 4; + } else if (cs_d) { + /* Default address and operand sizes are 32-bits */ + vie->addrsize = vie->addrsize_override ? 2 : 4; + vie->opsize = vie->opsize_override ? 2 : 4; + } else { + /* Default address and operand sizes are 16-bits */ + vie->addrsize = vie->addrsize_override ? 4 : 2; + vie->opsize = vie->opsize_override ? 4 : 2; + } + return (0); +} + +static int +decode_two_byte_opcode(struct vie *vie) +{ + uint8_t x; + + if (vie_peek(vie, &x)) + return (-1); + + vie->op = two_byte_opcodes[x]; + + if (vie->op.op_type == VIE_OP_TYPE_NONE) + return (-1); + + vie_advance(vie); + return (0); +} + +static int +decode_opcode(struct vie *vie) +{ + uint8_t x; + + if (vie_peek(vie, &x)) + return (-1); + + /* Already did this via VEX prefix. */ + if (vie->op.op_type != VIE_OP_TYPE_NONE) + return (0); + + vie->op = one_byte_opcodes[x]; + + if (vie->op.op_type == VIE_OP_TYPE_NONE) + return (-1); + + vie_advance(vie); + + if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE) + return (decode_two_byte_opcode(vie)); + + return (0); +} + +static int +decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode) +{ + uint8_t x; + + if (vie->op.op_flags & VIE_OP_F_NO_MODRM) + return (0); + + if (cpu_mode == CPU_MODE_REAL) + return (-1); + + if (vie_peek(vie, &x)) + return (-1); + + vie->mod = (x >> 6) & 0x3; + vie->rm = (x >> 0) & 0x7; + vie->reg = (x >> 3) & 0x7; + + /* + * A direct addressing mode makes no sense in the context of an EPT + * fault. There has to be a memory access involved to cause the + * EPT fault. + */ + if (vie->mod == VIE_MOD_DIRECT) + return (-1); + + if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || + (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { + /* + * Table 2-5: Special Cases of REX Encodings + * + * mod=0, r/m=5 is used in the compatibility mode to + * indicate a disp32 without a base register. + * + * mod!=3, r/m=4 is used in the compatibility mode to + * indicate that the SIB byte is present. + * + * The 'b' bit in the REX prefix is don't care in + * this case. + */ + } else { + vie->rm |= (vie->rex_b << 3); + } + + vie->reg |= (vie->rex_r << 3); + + /* SIB */ + if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB) + goto done; + + vie->base_register = gpr_map[vie->rm]; + + switch (vie->mod) { + case VIE_MOD_INDIRECT_DISP8: + vie->disp_bytes = 1; + break; + case VIE_MOD_INDIRECT_DISP32: + vie->disp_bytes = 4; + break; + case VIE_MOD_INDIRECT: + if (vie->rm == VIE_RM_DISP32) { + vie->disp_bytes = 4; + /* + * Table 2-7. RIP-Relative Addressing + * + * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32 + * whereas in compatibility mode it just implies disp32. 
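+			 * For example, 'mov %eax,0x10(%rip)' encodes as 89 05 + * followed by a disp32 and is decoded below with %rip as + * the base register.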
+ */ + + if (cpu_mode == CPU_MODE_64BIT) + vie->base_register = VM_REG_GUEST_RIP; + else + vie->base_register = VM_REG_LAST; + } + break; + } + +done: + vie_advance(vie); + + return (0); +} + +static int +decode_sib(struct vie *vie) +{ + uint8_t x; + + /* Proceed only if SIB byte is present */ + if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB) + return (0); + + if (vie_peek(vie, &x)) + return (-1); + + /* De-construct the SIB byte */ + vie->ss = (x >> 6) & 0x3; + vie->index = (x >> 3) & 0x7; + vie->base = (x >> 0) & 0x7; + + /* Apply the REX prefix modifiers */ + vie->index |= vie->rex_x << 3; + vie->base |= vie->rex_b << 3; + + switch (vie->mod) { + case VIE_MOD_INDIRECT_DISP8: + vie->disp_bytes = 1; + break; + case VIE_MOD_INDIRECT_DISP32: + vie->disp_bytes = 4; + break; + } + + if (vie->mod == VIE_MOD_INDIRECT && + (vie->base == 5 || vie->base == 13)) { + /* + * Special case when base register is unused if mod = 0 + * and base = %rbp or %r13. + * + * Documented in: + * Table 2-3: 32-bit Addressing Forms with the SIB Byte + * Table 2-5: Special Cases of REX Encodings + */ + vie->disp_bytes = 4; + } else { + vie->base_register = gpr_map[vie->base]; + } + + /* + * All encodings of 'index' are valid except for %rsp (4). + * + * Documented in: + * Table 2-3: 32-bit Addressing Forms with the SIB Byte + * Table 2-5: Special Cases of REX Encodings + */ + if (vie->index != 4) + vie->index_register = gpr_map[vie->index]; + + /* 'scale' makes sense only in the context of an index register */ + if (vie->index_register < VM_REG_LAST) + vie->scale = 1 << vie->ss; + + vie_advance(vie); + + return (0); +} + +static int +decode_displacement(struct vie *vie) +{ + int n, i; + uint8_t x; + + union { + char buf[4]; + int8_t signed8; + int32_t signed32; + } u; + + if ((n = vie->disp_bytes) == 0) + return (0); + + if (n != 1 && n != 4) + panic("decode_displacement: invalid disp_bytes %d", n); + + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + + u.buf[i] = x; + vie_advance(vie); + } + + if (n == 1) + vie->displacement = u.signed8; /* sign-extended */ + else + vie->displacement = u.signed32; /* sign-extended */ + + return (0); +} + +static int +decode_immediate(struct vie *vie) +{ + int i, n; + uint8_t x; + union { + char buf[4]; + int8_t signed8; + int16_t signed16; + int32_t signed32; + } u; + + /* Figure out immediate operand size (if any) */ + if (vie->op.op_flags & VIE_OP_F_IMM) { + /* + * Section 2.2.1.5 "Immediates", Intel SDM: + * In 64-bit mode the typical size of immediate operands + * remains 32-bits. When the operand size if 64-bits, the + * processor sign-extends all immediates to 64-bits prior + * to their use. 
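+		 * Hence a 4-byte immediate is fetched below even when the + * operand size is 8 bytes, and 2 bytes are fetched for 16-bit + * operands.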
+ */ + if (vie->opsize == 4 || vie->opsize == 8) + vie->imm_bytes = 4; + else + vie->imm_bytes = 2; + } else if (vie->op.op_flags & VIE_OP_F_IMM8) { + vie->imm_bytes = 1; + } + + if ((n = vie->imm_bytes) == 0) + return (0); + + KASSERT(n == 1 || n == 2 || n == 4, + ("%s: invalid number of immediate bytes: %d", __func__, n)); + + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + + u.buf[i] = x; + vie_advance(vie); + } + + /* sign-extend the immediate value before use */ + if (n == 1) + vie->immediate = u.signed8; + else if (n == 2) + vie->immediate = u.signed16; + else + vie->immediate = u.signed32; + + return (0); +} + +static int +decode_moffset(struct vie *vie) +{ + int i, n; + uint8_t x; + union { + char buf[8]; + uint64_t u64; + } u; + + if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0) + return (0); + + /* + * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM: + * The memory offset size follows the address-size of the instruction. + */ + n = vie->addrsize; + KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n)); + + u.u64 = 0; + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + + u.buf[i] = x; + vie_advance(vie); + } + vie->displacement = u.u64; + return (0); +} + +#ifdef _KERNEL +/* + * Verify that the 'guest linear address' provided as collateral of the nested + * page table fault matches with our instruction decoding. + */ +static int +verify_gla(struct vcpu *vcpu, uint64_t gla, struct vie *vie, + enum vm_cpu_mode cpu_mode) +{ + int error; + uint64_t base, segbase, idx, gla2; + enum vm_reg_name seg; + struct seg_desc desc; + + /* Skip 'gla' verification */ + if (gla == VIE_INVALID_GLA) + return (0); + + base = 0; + if (vie->base_register != VM_REG_LAST) { + error = vm_get_register(vcpu, vie->base_register, &base); + if (error) { + printf("verify_gla: error %d getting base reg %d\n", + error, vie->base_register); + return (-1); + } + + /* + * RIP-relative addressing starts from the following + * instruction + */ + if (vie->base_register == VM_REG_GUEST_RIP) + base += vie->num_processed; + } + + idx = 0; + if (vie->index_register != VM_REG_LAST) { + error = vm_get_register(vcpu, vie->index_register, &idx); + if (error) { + printf("verify_gla: error %d getting index reg %d\n", + error, vie->index_register); + return (-1); + } + } + + /* + * From "Specifying a Segment Selector", Intel SDM, Vol 1 + * + * In 64-bit mode, segmentation is generally (but not + * completely) disabled. The exceptions are the FS and GS + * segments. + * + * In legacy IA-32 mode, when the ESP or EBP register is used + * as the base, the SS segment is the default segment. For + * other data references, except when relative to stack or + * string destination the DS segment is the default. These + * can be overridden to allow other segments to be accessed. 
+ */ + if (vie->segment_override) + seg = vie->segment_register; + else if (vie->base_register == VM_REG_GUEST_RSP || + vie->base_register == VM_REG_GUEST_RBP) + seg = VM_REG_GUEST_SS; + else + seg = VM_REG_GUEST_DS; + if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && + seg != VM_REG_GUEST_GS) { + segbase = 0; + } else { + error = vm_get_seg_desc(vcpu, seg, &desc); + if (error) { + printf("verify_gla: error %d getting segment" + " descriptor %d", error, + vie->segment_register); + return (-1); + } + segbase = desc.base; + } + + gla2 = segbase + base + vie->scale * idx + vie->displacement; + gla2 &= size2mask[vie->addrsize]; + if (gla != gla2) { + printf("verify_gla mismatch: segbase(0x%0lx)" + "base(0x%0lx), scale(%d), index(0x%0lx), " + "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", + segbase, base, vie->scale, idx, vie->displacement, + gla, gla2); + return (-1); + } + + return (0); +} +#endif /* _KERNEL */ + +int +#ifdef _KERNEL +vmm_decode_instruction(struct vcpu *vcpu, uint64_t gla, + enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie) +#else +vmm_decode_instruction(enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie) +#endif +{ + + if (decode_prefixes(vie, cpu_mode, cs_d)) + return (-1); + + if (decode_opcode(vie)) + return (-1); + + if (decode_modrm(vie, cpu_mode)) + return (-1); + + if (decode_sib(vie)) + return (-1); + + if (decode_displacement(vie)) + return (-1); + + if (decode_immediate(vie)) + return (-1); + + if (decode_moffset(vie)) + return (-1); + +#ifdef _KERNEL + if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) { + if (verify_gla(vcpu, gla, vie, cpu_mode)) + return (-1); + } +#endif + + vie->decoded = 1; /* success */ + + return (0); +} diff --git a/sys/amd64/vmm/vmm_ioport.c b/sys/amd64/vmm/vmm_ioport.c new file mode 100644 index 000000000000..8aab28f5e68e --- /dev/null +++ b/sys/amd64/vmm/vmm_ioport.c @@ -0,0 +1,215 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/param.h> +#include <sys/systm.h> + +#include <machine/vmm.h> +#include <machine/vmm_instruction_emul.h> + +#include <dev/vmm/vmm_ktr.h> + +#include "vatpic.h" +#include "vatpit.h" +#include "vpmtmr.h" +#include "vrtc.h" +#include "vmm_ioport.h" + +#define MAX_IOPORTS 1280 + +ioport_handler_func_t ioport_handler[MAX_IOPORTS] = { + [TIMER_MODE] = vatpit_handler, + [TIMER_CNTR0] = vatpit_handler, + [TIMER_CNTR1] = vatpit_handler, + [TIMER_CNTR2] = vatpit_handler, + [NMISC_PORT] = vatpit_nmisc_handler, + [IO_ICU1] = vatpic_master_handler, + [IO_ICU1 + ICU_IMR_OFFSET] = vatpic_master_handler, + [IO_ICU2] = vatpic_slave_handler, + [IO_ICU2 + ICU_IMR_OFFSET] = vatpic_slave_handler, + [IO_ELCR1] = vatpic_elc_handler, + [IO_ELCR2] = vatpic_elc_handler, + [IO_PMTMR] = vpmtmr_handler, + [IO_RTC] = vrtc_addr_handler, + [IO_RTC + 1] = vrtc_data_handler, +}; + +#ifdef KTR +static const char * +inout_instruction(struct vm_exit *vmexit) +{ + int index; + + static const char *iodesc[] = { + "outb", "outw", "outl", + "inb", "inw", "inl", + "outsb", "outsw", "outsd", + "insb", "insw", "insd", + }; + + switch (vmexit->u.inout.bytes) { + case 1: + index = 0; + break; + case 2: + index = 1; + break; + default: + index = 2; + break; + } + + if (vmexit->u.inout.in) + index += 3; + + if (vmexit->u.inout.string) + index += 6; + + KASSERT(index < nitems(iodesc), ("%s: invalid index %d", + __func__, index)); + + return (iodesc[index]); +} +#endif /* KTR */ + +static int +emulate_inout_port(struct vcpu *vcpu, struct vm_exit *vmexit, bool *retu) +{ + ioport_handler_func_t handler; + uint32_t mask, val = 0; + int error; + + /* + * If there is no handler for the I/O port then punt to userspace. + */ + if (vmexit->u.inout.port >= MAX_IOPORTS || + (handler = ioport_handler[vmexit->u.inout.port]) == NULL) { + *retu = true; + return (0); + } + + mask = vie_size2mask(vmexit->u.inout.bytes); + + if (!vmexit->u.inout.in) { + val = vmexit->u.inout.eax & mask; + } + + error = (*handler)(vcpu_vm(vcpu), vmexit->u.inout.in, + vmexit->u.inout.port, vmexit->u.inout.bytes, &val); + if (error) { + /* + * The value returned by this function is also the return value + * of vm_run(). This needs to be a positive number otherwise it + * can be interpreted as a "pseudo-error" like ERESTART. + * + * Enforce this by mapping all errors to EIO. 
+ */ + return (EIO); + } + + if (vmexit->u.inout.in) { + vmexit->u.inout.eax &= ~mask; + vmexit->u.inout.eax |= val & mask; + error = vm_set_register(vcpu, VM_REG_GUEST_RAX, + vmexit->u.inout.eax); + KASSERT(error == 0, ("emulate_ioport: error %d setting guest " + "rax register", error)); + } + *retu = false; + return (0); +} + +static int +decode_segment(struct vcpu *vcpu, enum vm_reg_name *segment) +{ + struct vm_guest_paging *paging; + struct vie vie; + struct vm_exit *vme; + int err; + int fault; + + vme = vm_exitinfo(vcpu); + paging = &vme->u.inout_str.paging; + + vie_init(&vie, NULL, 0); + err = vmm_fetch_instruction(vcpu, paging, + vme->rip + vme->u.inout_str.cs_base, VIE_INST_SIZE, &vie, &fault); + if (err || fault) + return (err); + + err = vmm_decode_instruction(vcpu, VIE_INVALID_GLA, paging->cpu_mode, + vme->u.inout_str.cs_d, &vie); + + if (err || vie.op.op_type != VIE_OP_TYPE_OUTS) + return (EINVAL); + if (vie.segment_override) + *segment = vie.segment_register; + else + *segment = VM_REG_GUEST_DS; + + return (0); +} + +static int +emulate_inout_str(struct vcpu *vcpu, struct vm_exit *vmexit, bool *retu) +{ + int err; + + *retu = true; + if (vmexit->u.inout_str.seg_name == VM_REG_LAST) { + err = decode_segment(vcpu, &vmexit->u.inout_str.seg_name); + if (err) + return (err); + return (vm_get_seg_desc(vcpu, vmexit->u.inout_str.seg_name, + &vmexit->u.inout_str.seg_desc)); + } + return (0); /* Return to userspace to finish emulation */ +} + +int +vm_handle_inout(struct vcpu *vcpu, struct vm_exit *vmexit, bool *retu) +{ + int bytes __diagused, error; + + bytes = vmexit->u.inout.bytes; + KASSERT(bytes == 1 || bytes == 2 || bytes == 4, + ("vm_handle_inout: invalid operand size %d", bytes)); + + if (vmexit->u.inout.string) + error = emulate_inout_str(vcpu, vmexit, retu); + else + error = emulate_inout_port(vcpu, vmexit, retu); + + VCPU_CTR4(vcpu_vm(vcpu), vcpu_vcpuid(vcpu), "%s%s 0x%04x: %s", + vmexit->u.inout.rep ? "rep " : "", + inout_instruction(vmexit), + vmexit->u.inout.port, + error ? "error" : (*retu ? "userspace" : "handled")); + + return (error); +} diff --git a/sys/amd64/vmm/vmm_ioport.h b/sys/amd64/vmm/vmm_ioport.h new file mode 100644 index 000000000000..e24e5ad57185 --- /dev/null +++ b/sys/amd64/vmm/vmm_ioport.h @@ -0,0 +1,37 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_IOPORT_H_ +#define _VMM_IOPORT_H_ + +typedef int (*ioport_handler_func_t)(struct vm *vm, + bool in, int port, int bytes, uint32_t *val); + +int vm_handle_inout(struct vcpu *vcpu, struct vm_exit *vme, bool *retu); + +#endif /* _VMM_IOPORT_H_ */ diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c new file mode 100644 index 000000000000..0cae01f172ec --- /dev/null +++ b/sys/amd64/vmm/vmm_lapic.c @@ -0,0 +1,238 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/smp.h> + +#include <x86/specialreg.h> +#include <x86/apicreg.h> + +#include <dev/vmm/vmm_ktr.h> + +#include <machine/vmm.h> +#include "vmm_lapic.h" +#include "vlapic.h" + +/* + * Some MSI message definitions + */ +#define MSI_X86_ADDR_MASK 0xfff00000 +#define MSI_X86_ADDR_BASE 0xfee00000 +#define MSI_X86_ADDR_RH 0x00000008 /* Redirection Hint */ +#define MSI_X86_ADDR_LOG 0x00000004 /* Destination Mode */ + +int +lapic_set_intr(struct vcpu *vcpu, int vector, bool level) +{ + struct vlapic *vlapic; + + /* + * According to section "Maskable Hardware Interrupts" in Intel SDM + * vectors 16 through 255 can be delivered through the local APIC. 
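+	 * Requests to deliver a vector below 16 are therefore rejected here.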
+ */ + if (vector < 16 || vector > 255) + return (EINVAL); + + vlapic = vm_lapic(vcpu); + if (vlapic_set_intr_ready(vlapic, vector, level)) + vcpu_notify_event(vcpu, true); + return (0); +} + +int +lapic_set_local_intr(struct vm *vm, struct vcpu *vcpu, int vector) +{ + struct vlapic *vlapic; + cpuset_t dmask; + int cpu, error; + + if (vcpu == NULL) { + error = 0; + dmask = vm_active_cpus(vm); + CPU_FOREACH_ISSET(cpu, &dmask) { + vlapic = vm_lapic(vm_vcpu(vm, cpu)); + error = vlapic_trigger_lvt(vlapic, vector); + if (error) + break; + } + } else { + vlapic = vm_lapic(vcpu); + error = vlapic_trigger_lvt(vlapic, vector); + } + + return (error); +} + +int +lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg) +{ + int delmode, vec; + uint32_t dest; + bool phys; + + VM_CTR2(vm, "lapic MSI addr: %#lx msg: %#lx", addr, msg); + + if ((addr & MSI_X86_ADDR_MASK) != MSI_X86_ADDR_BASE) { + VM_CTR1(vm, "lapic MSI invalid addr %#lx", addr); + return (-1); + } + + /* + * Extract the x86-specific fields from the MSI addr/msg + * params according to the Intel Arch spec, Vol3 Ch 10. + * + * The PCI specification does not support level triggered + * MSI/MSI-X so ignore trigger level in 'msg'. + * + * The 'dest' is interpreted as a logical APIC ID if both + * the Redirection Hint and Destination Mode are '1' and + * physical otherwise. + */ + dest = (addr >> 12) & 0xff; + /* + * Extended Destination ID support uses bits 5-11 of the address: + * http://david.woodhou.se/ExtDestId.pdf + */ + dest |= ((addr >> 5) & 0x7f) << 8; + phys = ((addr & (MSI_X86_ADDR_RH | MSI_X86_ADDR_LOG)) != + (MSI_X86_ADDR_RH | MSI_X86_ADDR_LOG)); + delmode = msg & APIC_DELMODE_MASK; + vec = msg & 0xff; + + VM_CTR3(vm, "lapic MSI %s dest %#x, vec %d", + phys ? "physical" : "logical", dest, vec); + + vlapic_deliver_intr(vm, LAPIC_TRIG_EDGE, dest, phys, delmode, vec); + return (0); +} + +static bool +x2apic_msr(u_int msr) +{ + return (msr >= 0x800 && msr <= 0xBFF); +} + +static u_int +x2apic_msr_to_regoff(u_int msr) +{ + + return ((msr - 0x800) << 4); +} + +bool +lapic_msr(u_int msr) +{ + + return (x2apic_msr(msr) || msr == MSR_APICBASE); +} + +int +lapic_rdmsr(struct vcpu *vcpu, u_int msr, uint64_t *rval, bool *retu) +{ + int error; + u_int offset; + struct vlapic *vlapic; + + vlapic = vm_lapic(vcpu); + + if (msr == MSR_APICBASE) { + *rval = vlapic_get_apicbase(vlapic); + error = 0; + } else { + offset = x2apic_msr_to_regoff(msr); + error = vlapic_read(vlapic, 0, offset, rval, retu); + } + + return (error); +} + +int +lapic_wrmsr(struct vcpu *vcpu, u_int msr, uint64_t val, bool *retu) +{ + int error; + u_int offset; + struct vlapic *vlapic; + + vlapic = vm_lapic(vcpu); + + if (msr == MSR_APICBASE) { + error = vlapic_set_apicbase(vlapic, val); + } else { + offset = x2apic_msr_to_regoff(msr); + error = vlapic_write(vlapic, 0, offset, val, retu); + } + + return (error); +} + +int +lapic_mmio_write(struct vcpu *vcpu, uint64_t gpa, uint64_t wval, int size, + void *arg) +{ + int error; + uint64_t off; + struct vlapic *vlapic; + + off = gpa - DEFAULT_APIC_BASE; + + /* + * Memory mapped local apic accesses must be 4 bytes wide and + * aligned on a 16-byte boundary. 
+ */ + if (size != 4 || off & 0xf) + return (EINVAL); + + vlapic = vm_lapic(vcpu); + error = vlapic_write(vlapic, 1, off, wval, arg); + return (error); +} + +int +lapic_mmio_read(struct vcpu *vcpu, uint64_t gpa, uint64_t *rval, int size, + void *arg) +{ + int error; + uint64_t off; + struct vlapic *vlapic; + + off = gpa - DEFAULT_APIC_BASE; + + /* + * Memory mapped local apic accesses should be aligned on a + * 16-byte boundary. They are also suggested to be 4 bytes + * wide, alas not all OSes follow suggestions. + */ + off &= ~3; + if (off & 0xf) + return (EINVAL); + + vlapic = vm_lapic(vcpu); + error = vlapic_read(vlapic, 1, off, rval, arg); + return (error); +} diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h new file mode 100644 index 000000000000..1c0e17b15c18 --- /dev/null +++ b/sys/amd64/vmm/vmm_lapic.h @@ -0,0 +1,74 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_LAPIC_H_ +#define _VMM_LAPIC_H_ + +struct vcpu; +struct vm; + +bool lapic_msr(u_int num); +int lapic_rdmsr(struct vcpu *vcpu, u_int msr, uint64_t *rval, bool *retu); +int lapic_wrmsr(struct vcpu *vcpu, u_int msr, uint64_t wval, bool *retu); + +int lapic_mmio_read(struct vcpu *vcpu, uint64_t gpa, + uint64_t *rval, int size, void *arg); +int lapic_mmio_write(struct vcpu *vcpu, uint64_t gpa, + uint64_t wval, int size, void *arg); + +/* + * Signals to the LAPIC that an interrupt at 'vector' needs to be generated + * to the 'cpu', the state is recorded in IRR. + */ +int lapic_set_intr(struct vcpu *vcpu, int vector, bool trig); + +#define LAPIC_TRIG_LEVEL true +#define LAPIC_TRIG_EDGE false +static __inline int +lapic_intr_level(struct vcpu *vcpu, int vector) +{ + + return (lapic_set_intr(vcpu, vector, LAPIC_TRIG_LEVEL)); +} + +static __inline int +lapic_intr_edge(struct vcpu *vcpu, int vector) +{ + + return (lapic_set_intr(vcpu, vector, LAPIC_TRIG_EDGE)); +} + +/* + * Triggers the LAPIC local interrupt (LVT) 'vector' on 'cpu'. 'cpu' can + * be set to -1 to trigger the interrupt on all CPUs. 
+ */ +int lapic_set_local_intr(struct vm *vm, struct vcpu *vcpu, int vector); + +int lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg); + +#endif diff --git a/sys/amd64/vmm/vmm_mem.h b/sys/amd64/vmm/vmm_mem.h new file mode 100644 index 000000000000..d905fd37001d --- /dev/null +++ b/sys/amd64/vmm/vmm_mem.h @@ -0,0 +1,39 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_MEM_H_ +#define _VMM_MEM_H_ + +struct vmspace; + +int vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa); +void vmm_mmio_free(struct vmspace *, vm_paddr_t gpa, size_t size); +vm_paddr_t vmm_mem_maxaddr(void); + +#endif diff --git a/sys/amd64/vmm/vmm_mem_machdep.c b/sys/amd64/vmm/vmm_mem_machdep.c new file mode 100644 index 000000000000..afb3a0274e2a --- /dev/null +++ b/sys/amd64/vmm/vmm_mem_machdep.c @@ -0,0 +1,121 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/sglist.h> +#include <sys/lock.h> +#include <sys/rwlock.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_extern.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> + +#include <machine/md_var.h> + +#include "vmm_mem.h" + +int +vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa) +{ + struct sglist *sg; + vm_object_t obj; + int error; + + if (gpa + len < gpa || hpa + len < hpa || (gpa & PAGE_MASK) != 0 || + (hpa & PAGE_MASK) != 0 || (len & PAGE_MASK) != 0) + return (EINVAL); + + sg = sglist_alloc(1, M_WAITOK); + error = sglist_append_phys(sg, hpa, len); + KASSERT(error == 0, ("error %d appending physaddr to sglist", error)); + + obj = vm_pager_allocate(OBJT_SG, sg, len, VM_PROT_RW, 0, NULL); + if (obj == NULL) + return (ENOMEM); + + /* + * VT-x ignores the MTRR settings when figuring out the memory type for + * translations obtained through EPT. + * + * Therefore we explicitly force the pages provided by this object to be + * mapped as uncacheable. + */ + VM_OBJECT_WLOCK(obj); + error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE); + VM_OBJECT_WUNLOCK(obj); + if (error != KERN_SUCCESS) + panic("vmm_mmio_alloc: vm_object_set_memattr error %d", error); + + vm_map_lock(&vmspace->vm_map); + error = vm_map_insert(&vmspace->vm_map, obj, 0, gpa, gpa + len, + VM_PROT_RW, VM_PROT_RW, 0); + vm_map_unlock(&vmspace->vm_map); + if (error != KERN_SUCCESS) { + error = vm_mmap_to_errno(error); + vm_object_deallocate(obj); + } else { + error = 0; + } + + /* + * Drop the reference on the sglist. + * + * If the scatter/gather object was successfully allocated then it + * has incremented the reference count on the sglist. Dropping the + * initial reference count ensures that the sglist will be freed + * when the object is deallocated. + * + * If the object could not be allocated then we end up freeing the + * sglist. + */ + sglist_free(sg); + + return (error); +} + +void +vmm_mmio_free(struct vmspace *vmspace, vm_paddr_t gpa, size_t len) +{ + + vm_map_remove(&vmspace->vm_map, gpa, gpa + len); +} + +vm_paddr_t +vmm_mem_maxaddr(void) +{ + + return (ptoa(Maxmem)); +} diff --git a/sys/amd64/vmm/vmm_snapshot.c b/sys/amd64/vmm/vmm_snapshot.c new file mode 100644 index 000000000000..cd53f05a1603 --- /dev/null +++ b/sys/amd64/vmm/vmm_snapshot.c @@ -0,0 +1,103 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2016 Flavius Anton + * Copyright (c) 2016 Mihai Tiganus + * Copyright (c) 2016-2019 Mihai Carabas + * Copyright (c) 2017-2019 Darius Mihai + * Copyright (c) 2017-2019 Elena Mihailescu + * Copyright (c) 2018-2019 Sergiu Weisz + * All rights reserved. + * The bhyve-snapshot feature was developed under sponsorships + * from Matthew Grooms. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/types.h> +#include <sys/systm.h> + +#include <machine/vmm_snapshot.h> + +void +vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op) +{ + const char *opstr; + + if (op == VM_SNAPSHOT_SAVE) + opstr = "save"; + else if (op == VM_SNAPSHOT_RESTORE) + opstr = "restore"; + else + opstr = "unknown"; + + printf("%s: snapshot-%s failed for %s\r\n", __func__, opstr, bufname); +} + +int +vm_snapshot_buf(void *data, size_t data_size, struct vm_snapshot_meta *meta) +{ + struct vm_snapshot_buffer *buffer; + int op, error; + + buffer = &meta->buffer; + op = meta->op; + + if (buffer->buf_rem < data_size) { + printf("%s: buffer too small\r\n", __func__); + return (E2BIG); + } + + if (op == VM_SNAPSHOT_SAVE) + error = copyout(data, buffer->buf, data_size); + else if (op == VM_SNAPSHOT_RESTORE) + error = copyin(buffer->buf, data, data_size); + else + error = EINVAL; + + if (error) + return (error); + + buffer->buf += data_size; + buffer->buf_rem -= data_size; + + return (0); +} + +size_t +vm_get_snapshot_size(struct vm_snapshot_meta *meta) +{ + size_t length; + struct vm_snapshot_buffer *buffer; + + buffer = &meta->buffer; + + if (buffer->buf_size < buffer->buf_rem) { + printf("%s: Invalid buffer: size = %zu, rem = %zu\r\n", + __func__, buffer->buf_size, buffer->buf_rem); + length = 0; + } else { + length = buffer->buf_size - buffer->buf_rem; + } + + return (length); +} diff --git a/sys/amd64/vmm/vmm_stat.h b/sys/amd64/vmm/vmm_stat.h new file mode 100644 index 000000000000..cf3895001528 --- /dev/null +++ b/sys/amd64/vmm/vmm_stat.h @@ -0,0 +1,66 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_STAT_H_ +#define _VMM_STAT_H_ + +#include <dev/vmm/vmm_stat.h> + +#include "vmm_util.h" + +VMM_STAT_DECLARE(VCPU_MIGRATIONS); +VMM_STAT_DECLARE(VMEXIT_COUNT); +VMM_STAT_DECLARE(VMEXIT_EXTINT); +VMM_STAT_DECLARE(VMEXIT_HLT); +VMM_STAT_DECLARE(VMEXIT_CR_ACCESS); +VMM_STAT_DECLARE(VMEXIT_RDMSR); +VMM_STAT_DECLARE(VMEXIT_WRMSR); +VMM_STAT_DECLARE(VMEXIT_MTRAP); +VMM_STAT_DECLARE(VMEXIT_PAUSE); +VMM_STAT_DECLARE(VMEXIT_INTR_WINDOW); +VMM_STAT_DECLARE(VMEXIT_NMI_WINDOW); +VMM_STAT_DECLARE(VMEXIT_INOUT); +VMM_STAT_DECLARE(VMEXIT_CPUID); +VMM_STAT_DECLARE(VMEXIT_NESTED_FAULT); +VMM_STAT_DECLARE(VMEXIT_INST_EMUL); +VMM_STAT_DECLARE(VMEXIT_UNKNOWN); +VMM_STAT_DECLARE(VMEXIT_ASTPENDING); +VMM_STAT_DECLARE(VMEXIT_USERSPACE); +VMM_STAT_DECLARE(VMEXIT_RENDEZVOUS); +VMM_STAT_DECLARE(VMEXIT_EXCEPTION); +VMM_STAT_DECLARE(VMEXIT_REQIDLE); + +#define VMM_STAT_INTEL(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, vmm_is_intel) +#define VMM_STAT_AMD(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, vmm_is_svm) + +#endif diff --git a/sys/amd64/vmm/vmm_util.c b/sys/amd64/vmm/vmm_util.c new file mode 100644 index 000000000000..6c921e218a34 --- /dev/null +++ b/sys/amd64/vmm/vmm_util.c @@ -0,0 +1,109 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/libkern.h> + +#include <machine/md_var.h> + +#include "vmm_util.h" + +bool +vmm_is_hw_supported(void) +{ + return (vmm_is_intel() || vmm_is_svm()); +} + +bool +vmm_is_intel(void) +{ + + return (strcmp(cpu_vendor, "GenuineIntel") == 0); +} + +bool +vmm_is_svm(void) +{ + return (strcmp(cpu_vendor, "AuthenticAMD") == 0 || + strcmp(cpu_vendor, "HygonGenuine") == 0); +} + +bool +vmm_supports_1G_pages(void) +{ + unsigned int regs[4]; + + /* + * CPUID.80000001:EDX[bit 26] = 1 indicates support for 1GB pages + * + * Both Intel and AMD support this bit. + */ + if (cpu_exthigh >= 0x80000001) { + do_cpuid(0x80000001, regs); + if (regs[3] & (1 << 26)) + return (true); + } + return (false); +} + +#include <sys/proc.h> +#include <machine/frame.h> +#define DUMP_REG(x) printf(#x "\t\t0x%016lx\n", (long)(tf->tf_ ## x)) +#define DUMP_SEG(x) printf(#x "\t\t0x%04x\n", (unsigned)(tf->tf_ ## x)) +void +dump_trapframe(struct trapframe *tf) +{ + DUMP_REG(rdi); + DUMP_REG(rsi); + DUMP_REG(rdx); + DUMP_REG(rcx); + DUMP_REG(r8); + DUMP_REG(r9); + DUMP_REG(rax); + DUMP_REG(rbx); + DUMP_REG(rbp); + DUMP_REG(r10); + DUMP_REG(r11); + DUMP_REG(r12); + DUMP_REG(r13); + DUMP_REG(r14); + DUMP_REG(r15); + DUMP_REG(trapno); + DUMP_REG(addr); + DUMP_REG(flags); + DUMP_REG(err); + DUMP_REG(rip); + DUMP_REG(rflags); + DUMP_REG(rsp); + DUMP_SEG(cs); + DUMP_SEG(ss); + DUMP_SEG(fs); + DUMP_SEG(gs); + DUMP_SEG(es); + DUMP_SEG(ds); +} diff --git a/sys/amd64/vmm/vmm_util.h b/sys/amd64/vmm/vmm_util.h new file mode 100644 index 000000000000..c689f2f81721 --- /dev/null +++ b/sys/amd64/vmm/vmm_util.h @@ -0,0 +1,41 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VMM_UTIL_H_ +#define _VMM_UTIL_H_ + +struct trapframe; + +bool vmm_is_hw_supported(void); +bool vmm_is_intel(void); +bool vmm_is_svm(void); +bool vmm_supports_1G_pages(void); + +void dump_trapframe(struct trapframe *tf); + +#endif diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c new file mode 100644 index 000000000000..2e2224595ab4 --- /dev/null +++ b/sys/amd64/vmm/x86.c @@ -0,0 +1,757 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/pcpu.h> +#include <sys/systm.h> +#include <sys/sysctl.h> + +#include <machine/clock.h> +#include <machine/cpufunc.h> +#include <machine/md_var.h> +#include <machine/segments.h> +#include <machine/specialreg.h> +#include <machine/vmm.h> + +#include <dev/vmm/vmm_ktr.h> + +#include "vmm_host.h" +#include "vmm_util.h" +#include "x86.h" + +SYSCTL_DECL(_hw_vmm); +static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + NULL); + +#define CPUID_VM_SIGNATURE 0x40000000 +#define CPUID_BHYVE_FEATURES 0x40000001 +#define CPUID_VM_HIGH CPUID_BHYVE_FEATURES + +/* Features advertised in CPUID_BHYVE_FEATURES %eax */ +#define CPUID_BHYVE_FEAT_EXT_DEST_ID (1UL << 0) /* MSI Extended Dest ID */ + +static const char bhyve_id[12] = "bhyve bhyve "; + +static uint64_t bhyve_xcpuids; +SYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0, + "Number of times an unknown cpuid leaf was accessed"); + +static int cpuid_leaf_b = 1; +SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN, + &cpuid_leaf_b, 0, NULL); + +/* + * Compute ceil(log2(x)). Returns -1 if x is zero. 
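+ * For example, log2(6) == 3: a package with six logical CPUs needs a
+ * 3-bit field to encode a thread id, which is how the topology leaves
+ * below use this helper.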
+ */ +static __inline int +log2(u_int x) +{ + + return (x == 0 ? -1 : order_base_2(x)); +} + +int +x86_emulate_cpuid(struct vcpu *vcpu, uint64_t *rax, uint64_t *rbx, + uint64_t *rcx, uint64_t *rdx) +{ + struct vm *vm = vcpu_vm(vcpu); + int vcpu_id = vcpu_vcpuid(vcpu); + const struct xsave_limits *limits; + uint64_t cr4; + int error, enable_invpcid, enable_rdpid, enable_rdtscp, level, + width, x2apic_id; + unsigned int func, regs[4], logical_cpus, param; + enum x2apic_state x2apic_state; + uint16_t cores, maxcpus, sockets, threads; + + /* + * The function of CPUID is controlled through the provided value of + * %eax (and secondarily %ecx, for certain leaf data). + */ + func = (uint32_t)*rax; + param = (uint32_t)*rcx; + + VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", func, param); + + /* + * Requests for invalid CPUID levels should map to the highest + * available level instead. + */ + if (cpu_exthigh != 0 && func >= 0x80000000) { + if (func > cpu_exthigh) + func = cpu_exthigh; + } else if (func >= CPUID_VM_SIGNATURE) { + if (func > CPUID_VM_HIGH) + func = CPUID_VM_HIGH; + } else if (func > cpu_high) { + func = cpu_high; + } + + /* + * In general the approach used for CPU topology is to + * advertise a flat topology where all CPUs are packages with + * no multi-core or SMT. + */ + switch (func) { + /* + * Pass these through to the guest + */ + case CPUID_0000_0000: + case CPUID_0000_0002: + case CPUID_0000_0003: + case CPUID_8000_0000: + case CPUID_8000_0002: + case CPUID_8000_0003: + case CPUID_8000_0004: + case CPUID_8000_0006: + cpuid_count(func, param, regs); + break; + case CPUID_8000_0008: + cpuid_count(func, param, regs); + if (vmm_is_svm()) { + /* + * As on Intel (0000_0007:0, EDX), mask out + * unsupported or unsafe AMD extended features + * (8000_0008 EBX). + */ + regs[1] &= (AMDFEID_CLZERO | AMDFEID_IRPERF | + AMDFEID_XSAVEERPTR); + + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + /* + * Here, width is ApicIdCoreIdSize, present on + * at least Family 15h and newer. It + * represents the "number of bits in the + * initial apicid that indicate thread id + * within a package." + * + * Our topo_probe_amd() uses it for + * pkg_id_shift and other OSes may rely on it. + */ + width = MIN(0xF, log2(threads * cores)); + logical_cpus = MIN(0xFF, threads * cores - 1); + regs[2] = (width << AMDID_COREID_SIZE_SHIFT) | logical_cpus; + } + break; + + case CPUID_8000_0001: + cpuid_count(func, param, regs); + + /* + * Hide SVM from guest. + */ + regs[2] &= ~AMDID2_SVM; + + /* + * Don't advertise extended performance counter MSRs + * to the guest. + */ + regs[2] &= ~AMDID2_PCXC; + regs[2] &= ~AMDID2_PNXC; + regs[2] &= ~AMDID2_PTSCEL2I; + + /* + * Don't advertise Instruction Based Sampling feature. + */ + regs[2] &= ~AMDID2_IBS; + + /* NodeID MSR not available */ + regs[2] &= ~AMDID2_NODE_ID; + + /* Don't advertise the OS visible workaround feature */ + regs[2] &= ~AMDID2_OSVW; + + /* Hide mwaitx/monitorx capability from the guest */ + regs[2] &= ~AMDID2_MWAITX; + + /* Advertise RDTSCP if it is enabled. */ + error = vm_get_capability(vcpu, + VM_CAP_RDTSCP, &enable_rdtscp); + if (error == 0 && enable_rdtscp) + regs[3] |= AMDID_RDTSCP; + else + regs[3] &= ~AMDID_RDTSCP; + break; + + case CPUID_8000_0007: + /* + * AMD uses this leaf to advertise the processor's + * power monitoring and RAS capabilities. These + * features are hardware-specific and exposing + * them to a guest doesn't make a lot of sense. 
+ * + * Intel uses this leaf only to advertise the + * "Invariant TSC" feature with all other bits + * being reserved (set to zero). + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + + /* + * "Invariant TSC" can be advertised to the guest if: + * - host TSC frequency is invariant + * - host TSCs are synchronized across physical cpus + * + * XXX This still falls short because the vcpu + * can observe the TSC moving backwards as it + * migrates across physical cpus. But at least + * it should discourage the guest from using the + * TSC to keep track of time. + */ + if (tsc_is_invariant && smp_tsc) + regs[3] |= AMDPM_TSC_INVARIANT; + break; + + case CPUID_8000_001D: + /* AMD Cache topology, like 0000_0004 for Intel. */ + if (!vmm_is_svm()) + goto default_leaf; + + /* + * Similar to Intel, generate a fictitious cache + * topology for the guest with L3 shared by the + * package, and L1 and L2 local to a core. + */ + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + switch (param) { + case 0: + logical_cpus = threads; + level = 1; + func = 1; /* data cache */ + break; + case 1: + logical_cpus = threads; + level = 2; + func = 3; /* unified cache */ + break; + case 2: + logical_cpus = threads * cores; + level = 3; + func = 3; /* unified cache */ + break; + default: + logical_cpus = sockets * threads * cores; + level = 0; + func = 0; + break; + } + + logical_cpus = MIN(0xfff, logical_cpus - 1); + regs[0] = (logical_cpus << 14) | (1 << 8) | + (level << 5) | func; + regs[1] = (func > 0) ? (CACHE_LINE_SIZE - 1) : 0; + + /* + * ecx: Number of cache ways for non-fully + * associative cache, minus 1. Reported value + * of zero means there is one way. + */ + regs[2] = 0; + + regs[3] = 0; + break; + + case CPUID_8000_001E: + /* + * AMD Family 16h+ and Hygon Family 18h additional + * identifiers. + */ + if (!vmm_is_svm() || CPUID_TO_FAMILY(cpu_id) < 0x16) + goto default_leaf; + + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + regs[0] = vcpu_id; + threads = MIN(0xFF, threads - 1); + regs[1] = (threads << 8) | + (vcpu_id >> log2(threads + 1)); + /* + * XXX Bhyve topology cannot yet represent >1 node per + * processor. + */ + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_0000_0001: + do_cpuid(1, regs); + + error = vm_get_x2apic_state(vcpu, &x2apic_state); + if (error) { + panic("x86_emulate_cpuid: error %d " + "fetching x2apic state", error); + } + + /* + * Override the APIC ID only in ebx + */ + regs[1] &= ~(CPUID_LOCAL_APIC_ID); + regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT); + + /* + * Don't expose VMX, SpeedStep, TME or SMX capability. + * Advertise x2APIC capability and Hypervisor guest. + */ + regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2); + regs[2] &= ~(CPUID2_SMX); + + regs[2] |= CPUID2_HV; + + if (x2apic_state != X2APIC_DISABLED) + regs[2] |= CPUID2_X2APIC; + else + regs[2] &= ~CPUID2_X2APIC; + + /* + * Only advertise CPUID2_XSAVE in the guest if + * the host is using XSAVE. + */ + if (!(regs[2] & CPUID2_OSXSAVE)) + regs[2] &= ~CPUID2_XSAVE; + + /* + * If CPUID2_XSAVE is being advertised and the + * guest has set CR4_XSAVE, set + * CPUID2_OSXSAVE. + */ + regs[2] &= ~CPUID2_OSXSAVE; + if (regs[2] & CPUID2_XSAVE) { + error = vm_get_register(vcpu, + VM_REG_GUEST_CR4, &cr4); + if (error) + panic("x86_emulate_cpuid: error %d " + "fetching %%cr4", error); + if (cr4 & CR4_XSAVE) + regs[2] |= CPUID2_OSXSAVE; + } + + /* + * Hide monitor/mwait until we know how to deal with + * these instructions. 
+ */ + regs[2] &= ~CPUID2_MON; + + /* + * Hide the performance and debug features. + */ + regs[2] &= ~CPUID2_PDCM; + + /* + * No TSC deadline support in the APIC yet + */ + regs[2] &= ~CPUID2_TSCDLT; + + /* + * Hide thermal monitoring + */ + regs[3] &= ~(CPUID_ACPI | CPUID_TM); + + /* + * Hide the debug store capability. + */ + regs[3] &= ~CPUID_DS; + + /* + * Advertise the Machine Check and MTRR capability. + * + * Some guest OSes (e.g. Windows) will not boot if + * these features are absent. + */ + regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR); + + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + logical_cpus = threads * cores; + regs[1] &= ~CPUID_HTT_CORES; + regs[1] |= (logical_cpus & 0xff) << 16; + regs[3] |= CPUID_HTT; + break; + + case CPUID_0000_0004: + cpuid_count(func, param, regs); + + if (regs[0] || regs[1] || regs[2] || regs[3]) { + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + regs[0] &= 0x3ff; + regs[0] |= (cores - 1) << 26; + /* + * Cache topology: + * - L1 and L2 are shared only by the logical + * processors in a single core. + * - L3 and above are shared by all logical + * processors in the package. + */ + logical_cpus = threads; + level = (regs[0] >> 5) & 0x7; + if (level >= 3) + logical_cpus *= cores; + regs[0] |= (logical_cpus - 1) << 14; + } + break; + + case CPUID_0000_0007: + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + + /* leaf 0 */ + if (param == 0) { + cpuid_count(func, param, regs); + + /* Only leaf 0 is supported */ + regs[0] = 0; + + /* + * Expose known-safe features. + */ + regs[1] &= CPUID_STDEXT_FSGSBASE | + CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE | + CPUID_STDEXT_AVX2 | CPUID_STDEXT_SMEP | + CPUID_STDEXT_BMI2 | + CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM | + CPUID_STDEXT_AVX512F | + CPUID_STDEXT_AVX512DQ | + CPUID_STDEXT_RDSEED | + CPUID_STDEXT_SMAP | + CPUID_STDEXT_AVX512PF | + CPUID_STDEXT_AVX512ER | + CPUID_STDEXT_AVX512CD | CPUID_STDEXT_SHA | + CPUID_STDEXT_AVX512BW | + CPUID_STDEXT_AVX512VL; + regs[2] &= CPUID_STDEXT2_VAES | + CPUID_STDEXT2_VPCLMULQDQ; + regs[3] &= CPUID_STDEXT3_MD_CLEAR; + + /* Advertise RDPID if it is enabled. */ + error = vm_get_capability(vcpu, VM_CAP_RDPID, + &enable_rdpid); + if (error == 0 && enable_rdpid) + regs[2] |= CPUID_STDEXT2_RDPID; + + /* Advertise INVPCID if it is enabled. 
*/ + error = vm_get_capability(vcpu, + VM_CAP_ENABLE_INVPCID, &enable_invpcid); + if (error == 0 && enable_invpcid) + regs[1] |= CPUID_STDEXT_INVPCID; + } + break; + + case CPUID_0000_0006: + regs[0] = CPUTPM1_ARAT; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_0000_000A: + /* + * Handle the access, but report 0 for + * all options + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_0000_000B: + /* + * Intel processor topology enumeration + */ + if (vmm_is_intel()) { + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + if (param == 0) { + logical_cpus = threads; + width = log2(logical_cpus); + level = CPUID_TYPE_SMT; + x2apic_id = vcpu_id; + } + + if (param == 1) { + logical_cpus = threads * cores; + width = log2(logical_cpus); + level = CPUID_TYPE_CORE; + x2apic_id = vcpu_id; + } + + if (!cpuid_leaf_b || param >= 2) { + width = 0; + logical_cpus = 0; + level = 0; + x2apic_id = 0; + } + + regs[0] = width & 0x1f; + regs[1] = logical_cpus & 0xffff; + regs[2] = (level << 8) | (param & 0xff); + regs[3] = x2apic_id; + } else { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + } + break; + + case CPUID_0000_000D: + limits = vmm_get_xsave_limits(); + if (!limits->xsave_enabled) { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + } + + cpuid_count(func, param, regs); + switch (param) { + case 0: + /* + * Only permit the guest to use bits + * that are active in the host in + * %xcr0. Also, claim that the + * maximum save area size is + * equivalent to the host's current + * save area size. Since this runs + * "inside" of vmrun(), it runs with + * the guest's xcr0, so the current + * save area size is correct as-is. + */ + regs[0] &= limits->xcr0_allowed; + regs[2] = limits->xsave_max_size; + regs[3] &= (limits->xcr0_allowed >> 32); + break; + case 1: + /* Only permit XSAVEOPT. */ + regs[0] &= CPUID_EXTSTATE_XSAVEOPT; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + default: + /* + * If the leaf is for a permitted feature, + * pass through as-is, otherwise return + * all zeroes. + */ + if (!(limits->xcr0_allowed & (1ul << param))) { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + } + break; + } + break; + + case CPUID_0000_000F: + case CPUID_0000_0010: + /* + * Do not report any Resource Director Technology + * capabilities. Exposing control of cache or memory + * controller resource partitioning to the guest is not + * at all sensible. + * + * This is already hidden at a high level by masking of + * leaf 0x7. Even still, a guest may look here for + * detailed capability information. + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_0000_0015: + /* + * Don't report CPU TSC/Crystal ratio and clock + * values since guests may use these to derive the + * local APIC frequency.. + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_VM_SIGNATURE: + regs[0] = CPUID_VM_HIGH; + bcopy(bhyve_id, ®s[1], 4); + bcopy(bhyve_id + 4, ®s[2], 4); + bcopy(bhyve_id + 8, ®s[3], 4); + break; + + case CPUID_BHYVE_FEATURES: + regs[0] = CPUID_BHYVE_FEAT_EXT_DEST_ID; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + + default: +default_leaf: + /* + * The leaf value has already been clamped so + * simply pass this through, keeping count of + * how many unhandled leaf values have been seen. 
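+ * The running count is exported through the hw.vmm.bhyve_xcpuids
+ * sysctl declared above.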
+ */ + atomic_add_long(&bhyve_xcpuids, 1); + cpuid_count(func, param, regs); + break; + } + + /* + * CPUID clears the upper 32-bits of the long-mode registers. + */ + *rax = regs[0]; + *rbx = regs[1]; + *rcx = regs[2]; + *rdx = regs[3]; + + return (1); +} + +bool +vm_cpuid_capability(struct vcpu *vcpu, enum vm_cpuid_capability cap) +{ + bool rv; + + KASSERT(cap > 0 && cap < VCC_LAST, ("%s: invalid vm_cpu_capability %d", + __func__, cap)); + + /* + * Simply passthrough the capabilities of the host cpu for now. + */ + rv = false; + switch (cap) { + case VCC_NO_EXECUTE: + if (amd_feature & AMDID_NX) + rv = true; + break; + case VCC_FFXSR: + if (amd_feature & AMDID_FFXSR) + rv = true; + break; + case VCC_TCE: + if (amd_feature2 & AMDID2_TCE) + rv = true; + break; + default: + panic("%s: unknown vm_cpu_capability %d", __func__, cap); + } + return (rv); +} + +int +vm_rdmtrr(struct vm_mtrr *mtrr, u_int num, uint64_t *val) +{ + switch (num) { + case MSR_MTRRcap: + *val = MTRR_CAP_WC | MTRR_CAP_FIXED | VMM_MTRR_VAR_MAX; + break; + case MSR_MTRRdefType: + *val = mtrr->def_type; + break; + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: + *val = mtrr->fixed4k[num - MSR_MTRR4kBase]; + break; + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + *val = mtrr->fixed16k[num - MSR_MTRR16kBase]; + break; + case MSR_MTRR64kBase: + *val = mtrr->fixed64k; + break; + case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: { + u_int offset = num - MSR_MTRRVarBase; + if (offset % 2 == 0) { + *val = mtrr->var[offset / 2].base; + } else { + *val = mtrr->var[offset / 2].mask; + } + break; + } + default: + return (-1); + } + + return (0); +} + +int +vm_wrmtrr(struct vm_mtrr *mtrr, u_int num, uint64_t val) +{ + switch (num) { + case MSR_MTRRcap: + /* MTRRCAP is read only */ + return (-1); + case MSR_MTRRdefType: + if (val & ~VMM_MTRR_DEF_MASK) { + /* generate #GP on writes to reserved fields */ + return (-1); + } + mtrr->def_type = val; + break; + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: + mtrr->fixed4k[num - MSR_MTRR4kBase] = val; + break; + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + mtrr->fixed16k[num - MSR_MTRR16kBase] = val; + break; + case MSR_MTRR64kBase: + mtrr->fixed64k = val; + break; + case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: { + u_int offset = num - MSR_MTRRVarBase; + if (offset % 2 == 0) { + if (val & ~VMM_MTRR_PHYSBASE_MASK) { + /* generate #GP on writes to reserved fields */ + return (-1); + } + mtrr->var[offset / 2].base = val; + } else { + if (val & ~VMM_MTRR_PHYSMASK_MASK) { + /* generate #GP on writes to reserved fields */ + return (-1); + } + mtrr->var[offset / 2].mask = val; + } + break; + } + default: + return (-1); + } + + return (0); +} diff --git a/sys/amd64/vmm/x86.h b/sys/amd64/vmm/x86.h new file mode 100644 index 000000000000..56364f4f5cb4 --- /dev/null +++ b/sys/amd64/vmm/x86.h @@ -0,0 +1,103 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _X86_H_ +#define _X86_H_ + +#define CPUID_0000_0000 (0x0) +#define CPUID_0000_0001 (0x1) +#define CPUID_0000_0002 (0x2) +#define CPUID_0000_0003 (0x3) +#define CPUID_0000_0004 (0x4) +#define CPUID_0000_0006 (0x6) +#define CPUID_0000_0007 (0x7) +#define CPUID_0000_000A (0xA) +#define CPUID_0000_000B (0xB) +#define CPUID_0000_000D (0xD) +#define CPUID_0000_000F (0xF) +#define CPUID_0000_0010 (0x10) +#define CPUID_0000_0015 (0x15) +#define CPUID_8000_0000 (0x80000000) +#define CPUID_8000_0001 (0x80000001) +#define CPUID_8000_0002 (0x80000002) +#define CPUID_8000_0003 (0x80000003) +#define CPUID_8000_0004 (0x80000004) +#define CPUID_8000_0006 (0x80000006) +#define CPUID_8000_0007 (0x80000007) +#define CPUID_8000_0008 (0x80000008) +#define CPUID_8000_001D (0x8000001D) +#define CPUID_8000_001E (0x8000001E) + +/* + * CPUID instruction Fn0000_0001: + */ +#define CPUID_0000_0001_APICID_MASK (0xff<<24) +#define CPUID_0000_0001_APICID_SHIFT 24 + +/* + * CPUID instruction Fn0000_0001 ECX + */ +#define CPUID_0000_0001_FEAT0_VMX (1<<5) + +int x86_emulate_cpuid(struct vcpu *vcpu, uint64_t *rax, uint64_t *rbx, + uint64_t *rcx, uint64_t *rdx); + +enum vm_cpuid_capability { + VCC_NONE, + VCC_NO_EXECUTE, + VCC_FFXSR, + VCC_TCE, + VCC_LAST +}; + +/* + * Return 'true' if the capability 'cap' is enabled in this virtual cpu + * and 'false' otherwise. + */ +bool vm_cpuid_capability(struct vcpu *vcpu, enum vm_cpuid_capability); + +#define VMM_MTRR_VAR_MAX 10 +#define VMM_MTRR_DEF_MASK \ + (MTRR_DEF_ENABLE | MTRR_DEF_FIXED_ENABLE | MTRR_DEF_TYPE) +#define VMM_MTRR_PHYSBASE_MASK (MTRR_PHYSBASE_PHYSBASE | MTRR_PHYSBASE_TYPE) +#define VMM_MTRR_PHYSMASK_MASK (MTRR_PHYSMASK_PHYSMASK | MTRR_PHYSMASK_VALID) +struct vm_mtrr { + uint64_t def_type; + uint64_t fixed4k[8]; + uint64_t fixed16k[2]; + uint64_t fixed64k; + struct { + uint64_t base; + uint64_t mask; + } var[VMM_MTRR_VAR_MAX]; +}; + +int vm_rdmtrr(struct vm_mtrr *mtrr, u_int num, uint64_t *val); +int vm_wrmtrr(struct vm_mtrr *mtrr, u_int num, uint64_t val); + +#endif |
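The variable-range MTRR state above packs each range's PHYSBASE/PHYSMASK pair into consecutive MSR numbers starting at MSR_MTRRVarBase, which is the offset/2 arithmetic used by vm_rdmtrr() and vm_wrmtrr(). Below is a minimal sketch of that mapping, assuming it sits next to x86.h in the vmm tree; the helper name vm_mtrr_var_slot is invented for illustration and is not part of this commit.

#include <sys/param.h>
#include <machine/specialreg.h>

#include "x86.h"

/*
 * Illustrative only: return a pointer to the vm_mtrr field backing the
 * variable-range MTRR MSR 'msr', or NULL if 'msr' is not a variable-range
 * MTRR.  Even offsets from MSR_MTRRVarBase are PHYSBASEn, odd offsets
 * are PHYSMASKn, exactly as decoded in vm_rdmtrr()/vm_wrmtrr().
 */
static uint64_t *
vm_mtrr_var_slot(struct vm_mtrr *mtrr, u_int msr)
{
	u_int offset;

	if (msr < MSR_MTRRVarBase ||
	    msr >= MSR_MTRRVarBase + 2 * VMM_MTRR_VAR_MAX)
		return (NULL);

	offset = msr - MSR_MTRRVarBase;
	if (offset % 2 == 0)
		return (&mtrr->var[offset / 2].base);
	return (&mtrr->var[offset / 2].mask);
}

A caller in an MSR-exit path would typically turn a -1 return from vm_rdmtrr() or vm_wrmtrr() (an MSR outside the emulated MTRR set, the read-only MTRRcap, or a write touching reserved bits) into an injected #GP for the guest.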
