Diffstat (limited to 'lib/libvmmapi/vmmapi.c')
-rw-r--r-- | lib/libvmmapi/vmmapi.c | 1319 |
1 file changed, 1319 insertions, 0 deletions
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c new file mode 100644 index 000000000000..77f0f8f5c581 --- /dev/null +++ b/lib/libvmmapi/vmmapi.c @@ -0,0 +1,1319 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/capsicum.h> +#include <sys/cpuset.h> +#include <sys/domainset.h> +#include <sys/sysctl.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/linker.h> +#include <sys/module.h> +#include <sys/_iovec.h> + +#include <capsicum_helpers.h> +#include <err.h> +#include <errno.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <string.h> +#include <fcntl.h> +#include <unistd.h> + +#include <libutil.h> + +#include <vm/vm.h> +#include <machine/vmm.h> +#ifdef WITH_VMMAPI_SNAPSHOT +#include <machine/vmm_snapshot.h> +#endif + +#include <dev/vmm/vmm_dev.h> + +#include "vmmapi.h" +#include "internal.h" + +#define MB (1024 * 1024UL) +#define GB (1024 * 1024 * 1024UL) + +#ifdef __amd64__ +#define VM_LOWMEM_LIMIT (3 * GB) +#else +#define VM_LOWMEM_LIMIT 0 +#endif +#define VM_HIGHMEM_BASE (4 * GB) + +/* + * Size of the guard region before and after the virtual address space + * mapping the guest physical memory. This must be a multiple of the + * superpage size for performance reasons. 
+ */ +#define VM_MMAP_GUARD_SIZE (4 * MB) + +#define PROT_RW (PROT_READ | PROT_WRITE) +#define PROT_ALL (PROT_READ | PROT_WRITE | PROT_EXEC) + +static int +vm_device_open(const char *name) +{ + char devpath[PATH_MAX]; + + assert(strlen(name) <= VM_MAX_NAMELEN); + (void)snprintf(devpath, sizeof(devpath), "/dev/vmm/%s", name); + return (open(devpath, O_RDWR)); +} + +static int +vm_ctl_open(void) +{ + if (modfind("vmm") < 0) + (void)kldload("vmm"); + return (open("/dev/vmmctl", O_RDWR, 0)); +} + +static int +vm_ctl_create(const char *name, int ctlfd) +{ + struct vmmctl_vm_create vmc; + + memset(&vmc, 0, sizeof(vmc)); + if (strlcpy(vmc.name, name, sizeof(vmc.name)) >= sizeof(vmc.name)) { + errno = ENAMETOOLONG; + return (-1); + } + return (ioctl(ctlfd, VMMCTL_VM_CREATE, &vmc)); +} + +int +vm_create(const char *name) +{ + int error, fd; + + fd = vm_ctl_open(); + if (fd < 0) + return (-1); + + error = vm_ctl_create(name, fd); + if (error != 0) { + error = errno; + (void)close(fd); + errno = error; + return (-1); + } + (void)close(fd); + return (0); +} + +struct vmctx * +vm_open(const char *name) +{ + return (vm_openf(name, 0)); +} + +struct vmctx * +vm_openf(const char *name, int flags) +{ + struct vmctx *vm; + int saved_errno; + bool created; + + created = false; + + vm = malloc(sizeof(struct vmctx) + strlen(name) + 1); + assert(vm != NULL); + + vm->fd = vm->ctlfd = -1; + vm->memflags = 0; + vm->name = (char *)(vm + 1); + strcpy(vm->name, name); + memset(vm->memsegs, 0, sizeof(vm->memsegs)); + + if ((vm->ctlfd = vm_ctl_open()) < 0) + goto err; + + vm->fd = vm_device_open(vm->name); + if (vm->fd < 0 && errno == ENOENT) { + if (flags & VMMAPI_OPEN_CREATE) { + if (vm_ctl_create(vm->name, vm->ctlfd) != 0) + goto err; + vm->fd = vm_device_open(vm->name); + created = true; + } + } + if (vm->fd < 0) + goto err; + + if (!created && (flags & VMMAPI_OPEN_REINIT) != 0 && vm_reinit(vm) != 0) + goto err; + + return (vm); +err: + saved_errno = errno; + if (created) + vm_destroy(vm); + else + vm_close(vm); + errno = saved_errno; + return (NULL); +} + +void +vm_close(struct vmctx *vm) +{ + assert(vm != NULL); + + if (vm->fd >= 0) + (void)close(vm->fd); + if (vm->ctlfd >= 0) + (void)close(vm->ctlfd); + free(vm); +} + +void +vm_destroy(struct vmctx *vm) +{ + struct vmmctl_vm_destroy vmd; + + memset(&vmd, 0, sizeof(vmd)); + (void)strlcpy(vmd.name, vm->name, sizeof(vmd.name)); + if (ioctl(vm->ctlfd, VMMCTL_VM_DESTROY, &vmd) != 0) + warn("ioctl(VMMCTL_VM_DESTROY)"); + + vm_close(vm); +} + +struct vcpu * +vm_vcpu_open(struct vmctx *ctx, int vcpuid) +{ + struct vcpu *vcpu; + + vcpu = malloc(sizeof(*vcpu)); + vcpu->ctx = ctx; + vcpu->vcpuid = vcpuid; + return (vcpu); +} + +void +vm_vcpu_close(struct vcpu *vcpu) +{ + free(vcpu); +} + +int +vcpu_id(struct vcpu *vcpu) +{ + return (vcpu->vcpuid); +} + +int +vm_parse_memsize(const char *opt, size_t *ret_memsize) +{ + char *endptr; + size_t optval; + int error; + + optval = strtoul(opt, &endptr, 0); + if (*opt != '\0' && *endptr == '\0') { + /* + * For the sake of backward compatibility if the memory size + * specified on the command line is less than a megabyte then + * it is interpreted as being in units of MB. 
+ */ + if (optval < MB) + optval *= MB; + *ret_memsize = optval; + error = 0; + } else + error = expand_number(opt, ret_memsize); + + return (error); +} + +uint32_t +vm_get_lowmem_limit(struct vmctx *ctx __unused) +{ + + return (VM_LOWMEM_LIMIT); +} + +void +vm_set_memflags(struct vmctx *ctx, int flags) +{ + + ctx->memflags = flags; +} + +int +vm_get_memflags(struct vmctx *ctx) +{ + + return (ctx->memflags); +} + +/* + * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len). + */ +int +vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off, + size_t len, int prot) +{ + struct vm_memmap memmap; + int error, flags; + + memmap.gpa = gpa; + memmap.segid = segid; + memmap.segoff = off; + memmap.len = len; + memmap.prot = prot; + memmap.flags = 0; + + if (ctx->memflags & VM_MEM_F_WIRED) + memmap.flags |= VM_MEMMAP_F_WIRED; + + /* + * If this mapping already exists then don't create it again. This + * is the common case for SYSMEM mappings created by bhyveload(8). + */ + error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags); + if (error == 0 && gpa == memmap.gpa) { + if (segid != memmap.segid || off != memmap.segoff || + prot != memmap.prot || flags != memmap.flags) { + errno = EEXIST; + return (-1); + } else { + return (0); + } + } + + error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap); + return (error); +} + +int +vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr, + size_t *lowmem_size, size_t *highmem_size) +{ + + *guest_baseaddr = ctx->baseaddr; + *lowmem_size = ctx->lowmem_size; + *highmem_size = ctx->highmem_size; + return (0); +} + +int +vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len) +{ + struct vm_munmap munmap; + int error; + + munmap.gpa = gpa; + munmap.len = len; + + error = ioctl(ctx->fd, VM_MUNMAP_MEMSEG, &munmap); + return (error); +} + +int +vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) +{ + struct vm_memmap memmap; + int error; + + bzero(&memmap, sizeof(struct vm_memmap)); + memmap.gpa = *gpa; + error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap); + if (error == 0) { + *gpa = memmap.gpa; + *segid = memmap.segid; + *segoff = memmap.segoff; + *len = memmap.len; + *prot = memmap.prot; + *flags = memmap.flags; + } + return (error); +} + +/* + * Return 0 if the segments are identical and non-zero otherwise. + * + * This is slightly complicated by the fact that only device memory segments + * are named. + */ +static int +cmpseg(size_t len, const char *str, size_t len2, const char *str2) +{ + + if (len == len2) { + if ((!str && !str2) || (str && str2 && !strcmp(str, str2))) + return (0); + } + return (-1); +} + +static int +vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name, + int ds_policy, domainset_t *ds_mask, size_t ds_size) +{ + struct vm_memseg memseg; + size_t n; + int error; + + /* + * If the memory segment has already been created then just return. + * This is the usual case for the SYSMEM segment created by userspace + * loaders like bhyveload(8). 
+ */ + error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name, + sizeof(memseg.name)); + if (error) + return (error); + + if (memseg.len != 0) { + if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) { + errno = EINVAL; + return (-1); + } else { + return (0); + } + } + + bzero(&memseg, sizeof(struct vm_memseg)); + memseg.segid = segid; + memseg.len = len; + if (ds_mask == NULL) { + memseg.ds_policy = DOMAINSET_POLICY_INVALID; + } else { + memseg.ds_policy = ds_policy; + memseg.ds_mask = ds_mask; + memseg.ds_mask_size = ds_size; + } + if (name != NULL) { + n = strlcpy(memseg.name, name, sizeof(memseg.name)); + if (n >= sizeof(memseg.name)) { + errno = ENAMETOOLONG; + return (-1); + } + } + + error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg); + return (error); +} + +int +vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf, + size_t bufsize) +{ + struct vm_memseg memseg; + size_t n; + int error; + + bzero(&memseg, sizeof(memseg)); + memseg.segid = segid; + error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg); + if (error == 0) { + *lenp = memseg.len; + n = strlcpy(namebuf, memseg.name, bufsize); + if (n >= bufsize) { + errno = ENAMETOOLONG; + error = -1; + } + } + return (error); +} + +static int +map_memory_segment(struct vmctx *ctx, int segid, vm_paddr_t gpa, size_t len, + size_t segoff, char *base) +{ + char *ptr; + int error, flags; + + /* Map 'len' bytes starting at 'gpa' in the guest address space */ + error = vm_mmap_memseg(ctx, gpa, segid, segoff, len, PROT_ALL); + if (error) + return (error); + + flags = MAP_SHARED | MAP_FIXED; + if ((ctx->memflags & VM_MEM_F_INCORE) == 0) + flags |= MAP_NOCORE; + + /* mmap into the process address space on the host */ + ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa); + if (ptr == MAP_FAILED) + return (-1); + + return (0); +} + +/* + * Allocates and maps virtual machine memory segments according + * to the NUMA topology specified by the 'doms' array. + * + * The domains are laid out sequentially in the guest's physical address space. + * The [VM_LOWMEM_LIMIT, VM_HIGHMEM_BASE) address range is skipped and + * left unmapped. + */ +int +vm_setup_memory_domains(struct vmctx *ctx, enum vm_mmap_style vms, + struct vm_mem_domain *doms, int ndoms) +{ + size_t low_len, len, totalsize; + struct vm_mem_domain *dom; + struct vm_memseg memseg; + char *baseaddr, *ptr; + int error, i, segid; + vm_paddr_t gpa; + + /* Sanity checks. */ + assert(vms == VM_MMAP_ALL); + if (doms == NULL || ndoms <= 0 || ndoms > VM_MAXMEMDOM) { + errno = EINVAL; + return (-1); + } + + /* Calculate total memory size. */ + totalsize = 0; + for (i = 0; i < ndoms; i++) + totalsize += doms[i].size; + + if (totalsize > VM_LOWMEM_LIMIT) + totalsize = VM_HIGHMEM_BASE + (totalsize - VM_LOWMEM_LIMIT); + + /* + * Stake out a contiguous region covering the guest physical memory + * and the adjoining guard regions. + */ + len = VM_MMAP_GUARD_SIZE + totalsize + VM_MMAP_GUARD_SIZE; + ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0); + if (ptr == MAP_FAILED) + return (-1); + baseaddr = ptr + VM_MMAP_GUARD_SIZE; + + /* + * Allocate and map memory segments for the virtual machine. + */ + gpa = VM_LOWMEM_LIMIT > 0 ? 0 : VM_HIGHMEM_BASE; + ctx->lowmem_size = 0; + ctx->highmem_size = 0; + for (i = 0; i < ndoms; i++) { + segid = VM_SYSMEM + i; + dom = &doms[i]; + + /* + * Check if the memory segment already exists. + * If 'ndoms' is greater than one, refuse to proceed if the + * memseg already exists. 
If only one domain was requested, use + * the existing segment to preserve the behaviour of the previous + * implementation. + * + * Splitting existing memory segments is tedious and + * error-prone, which is why we don't support NUMA + * domains for bhyveload(8)-loaded VMs. + */ + error = vm_get_memseg(ctx, segid, &len, memseg.name, + sizeof(memseg.name)); + if (error == 0 && len != 0) { + if (ndoms != 1) { + errno = EEXIST; + return (-1); + } else + doms[0].size = len; + } else { + error = vm_alloc_memseg(ctx, segid, dom->size, NULL, + dom->ds_policy, dom->ds_mask, dom->ds_size); + if (error) + return (error); + } + + /* + * If a domain is split by VM_LOWMEM_LIMIT then break + * its segment mapping into two parts, one below VM_LOWMEM_LIMIT + * and one above VM_HIGHMEM_BASE. + */ + if (gpa <= VM_LOWMEM_LIMIT && + gpa + dom->size > VM_LOWMEM_LIMIT) { + low_len = VM_LOWMEM_LIMIT - gpa; + error = map_memory_segment(ctx, segid, gpa, low_len, 0, + baseaddr); + if (error) + return (error); + ctx->lowmem_size = VM_LOWMEM_LIMIT; + /* Map the remainder. */ + gpa = VM_HIGHMEM_BASE; + len = dom->size - low_len; + error = map_memory_segment(ctx, segid, gpa, len, + low_len, baseaddr); + if (error) + return (error); + } else { + len = dom->size; + error = map_memory_segment(ctx, segid, gpa, len, 0, + baseaddr); + if (error) + return (error); + } + if (gpa <= VM_LOWMEM_LIMIT) + ctx->lowmem_size += len; + else + ctx->highmem_size += len; + gpa += len; + } + ctx->baseaddr = baseaddr; + + return (0); +} + +int +vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) +{ + struct vm_mem_domain dom0; + + memset(&dom0, 0, sizeof(dom0)); + dom0.ds_policy = DOMAINSET_POLICY_INVALID; + dom0.size = memsize; + + return (vm_setup_memory_domains(ctx, vms, &dom0, 1)); +} + +/* + * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in + * the lowmem or highmem regions. + * + * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region. + * The instruction emulation code depends on this behavior. 
+ */ +void * +vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len) +{ + vm_size_t lowsize, highsize; + + lowsize = ctx->lowmem_size; + if (lowsize > 0) { + if (gaddr < lowsize && len <= lowsize && gaddr + len <= lowsize) + return (ctx->baseaddr + gaddr); + } + + highsize = ctx->highmem_size; + if (highsize > 0 && gaddr >= VM_HIGHMEM_BASE) { + if (gaddr < VM_HIGHMEM_BASE + highsize && len <= highsize && + gaddr + len <= VM_HIGHMEM_BASE + highsize) + return (ctx->baseaddr + gaddr); + } + + return (NULL); +} + +vm_paddr_t +vm_rev_map_gpa(struct vmctx *ctx, void *addr) +{ + vm_paddr_t offaddr; + vm_size_t lowsize, highsize; + + offaddr = (char *)addr - ctx->baseaddr; + + lowsize = ctx->lowmem_size; + if (lowsize > 0) + if (offaddr <= lowsize) + return (offaddr); + + highsize = ctx->highmem_size; + if (highsize > 0) + if (offaddr >= VM_HIGHMEM_BASE && + offaddr < VM_HIGHMEM_BASE + highsize) + return (offaddr); + + return ((vm_paddr_t)-1); +} + +const char * +vm_get_name(struct vmctx *ctx) +{ + + return (ctx->name); +} + +size_t +vm_get_lowmem_size(struct vmctx *ctx) +{ + return (ctx->lowmem_size); +} + +vm_paddr_t +vm_get_highmem_base(struct vmctx *ctx __unused) +{ + + return (VM_HIGHMEM_BASE); +} + +size_t +vm_get_highmem_size(struct vmctx *ctx) +{ + return (ctx->highmem_size); +} + +void * +vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len) +{ + char pathname[MAXPATHLEN]; + size_t len2; + char *base, *ptr; + int fd, error, flags; + + fd = -1; + ptr = MAP_FAILED; + if (name == NULL || strlen(name) == 0) { + errno = EINVAL; + goto done; + } + + error = vm_alloc_memseg(ctx, segid, len, name, 0, NULL, 0); + if (error) + goto done; + + strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname)); + strlcat(pathname, ctx->name, sizeof(pathname)); + strlcat(pathname, ".", sizeof(pathname)); + strlcat(pathname, name, sizeof(pathname)); + + fd = open(pathname, O_RDWR); + if (fd < 0) + goto done; + + /* + * Stake out a contiguous region covering the device memory and the + * adjoining guard regions. + */ + len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE; + base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, + 0); + if (base == MAP_FAILED) + goto done; + + flags = MAP_SHARED | MAP_FIXED; + if ((ctx->memflags & VM_MEM_F_INCORE) == 0) + flags |= MAP_NOCORE; + + /* mmap the devmem region in the host address space */ + ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0); +done: + if (fd >= 0) + close(fd); + return (ptr); +} + +int +vcpu_ioctl(struct vcpu *vcpu, u_long cmd, void *arg) +{ + /* + * XXX: fragile, handle with care + * Assumes that the first field of the ioctl data + * is the vcpuid. 
+ */ + *(int *)arg = vcpu->vcpuid; + return (ioctl(vcpu->ctx->fd, cmd, arg)); +} + +int +vm_set_register(struct vcpu *vcpu, int reg, uint64_t val) +{ + int error; + struct vm_register vmreg; + + bzero(&vmreg, sizeof(vmreg)); + vmreg.regnum = reg; + vmreg.regval = val; + + error = vcpu_ioctl(vcpu, VM_SET_REGISTER, &vmreg); + return (error); +} + +int +vm_get_register(struct vcpu *vcpu, int reg, uint64_t *ret_val) +{ + int error; + struct vm_register vmreg; + + bzero(&vmreg, sizeof(vmreg)); + vmreg.regnum = reg; + + error = vcpu_ioctl(vcpu, VM_GET_REGISTER, &vmreg); + *ret_val = vmreg.regval; + return (error); +} + +int +vm_set_register_set(struct vcpu *vcpu, unsigned int count, + const int *regnums, uint64_t *regvals) +{ + int error; + struct vm_register_set vmregset; + + bzero(&vmregset, sizeof(vmregset)); + vmregset.count = count; + vmregset.regnums = regnums; + vmregset.regvals = regvals; + + error = vcpu_ioctl(vcpu, VM_SET_REGISTER_SET, &vmregset); + return (error); +} + +int +vm_get_register_set(struct vcpu *vcpu, unsigned int count, + const int *regnums, uint64_t *regvals) +{ + int error; + struct vm_register_set vmregset; + + bzero(&vmregset, sizeof(vmregset)); + vmregset.count = count; + vmregset.regnums = regnums; + vmregset.regvals = regvals; + + error = vcpu_ioctl(vcpu, VM_GET_REGISTER_SET, &vmregset); + return (error); +} + +int +vm_run(struct vcpu *vcpu, struct vm_run *vmrun) +{ + return (vcpu_ioctl(vcpu, VM_RUN, vmrun)); +} + +int +vm_suspend(struct vmctx *ctx, enum vm_suspend_how how) +{ + struct vm_suspend vmsuspend; + + bzero(&vmsuspend, sizeof(vmsuspend)); + vmsuspend.how = how; + return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend)); +} + +int +vm_reinit(struct vmctx *ctx) +{ + + return (ioctl(ctx->fd, VM_REINIT, 0)); +} + +int +vm_capability_name2type(const char *capname) +{ + int i; + + for (i = 0; i < VM_CAP_MAX; i++) { + if (vm_capstrmap[i] != NULL && + strcmp(vm_capstrmap[i], capname) == 0) + return (i); + } + + return (-1); +} + +const char * +vm_capability_type2name(int type) +{ + if (type >= 0 && type < VM_CAP_MAX) + return (vm_capstrmap[type]); + + return (NULL); +} + +int +vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval) +{ + int error; + struct vm_capability vmcap; + + bzero(&vmcap, sizeof(vmcap)); + vmcap.captype = cap; + + error = vcpu_ioctl(vcpu, VM_GET_CAPABILITY, &vmcap); + *retval = vmcap.capval; + return (error); +} + +int +vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val) +{ + struct vm_capability vmcap; + + bzero(&vmcap, sizeof(vmcap)); + vmcap.captype = cap; + vmcap.capval = val; + + return (vcpu_ioctl(vcpu, VM_SET_CAPABILITY, &vmcap)); +} + +uint64_t * +vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv, + int *ret_entries) +{ + static _Thread_local uint64_t *stats_buf; + static _Thread_local u_int stats_count; + uint64_t *new_stats; + struct vm_stats vmstats; + u_int count, index; + bool have_stats; + + have_stats = false; + count = 0; + for (index = 0;; index += nitems(vmstats.statbuf)) { + vmstats.index = index; + if (vcpu_ioctl(vcpu, VM_STATS, &vmstats) != 0) + break; + if (stats_count < index + vmstats.num_entries) { + new_stats = realloc(stats_buf, + (index + vmstats.num_entries) * sizeof(uint64_t)); + if (new_stats == NULL) { + errno = ENOMEM; + return (NULL); + } + stats_count = index + vmstats.num_entries; + stats_buf = new_stats; + } + memcpy(stats_buf + index, vmstats.statbuf, + vmstats.num_entries * sizeof(uint64_t)); + count += vmstats.num_entries; + have_stats = true; + + if (vmstats.num_entries != 
nitems(vmstats.statbuf)) + break; + } + if (have_stats) { + if (ret_entries) + *ret_entries = count; + if (ret_tv) + *ret_tv = vmstats.tv; + return (stats_buf); + } else + return (NULL); +} + +const char * +vm_get_stat_desc(struct vmctx *ctx, int index) +{ + static struct vm_stat_desc statdesc; + + statdesc.index = index; + if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0) + return (statdesc.desc); + else + return (NULL); +} + +#ifdef __amd64__ +int +vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num) +{ + int error, i; + struct vm_gpa_pte gpapte; + + bzero(&gpapte, sizeof(gpapte)); + gpapte.gpa = gpa; + + error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte); + + if (error == 0) { + *num = gpapte.ptenum; + for (i = 0; i < gpapte.ptenum; i++) + pte[i] = gpapte.pte[i]; + } + + return (error); +} + +int +vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *fault) +{ + struct vm_gla2gpa gg; + int error; + + bzero(&gg, sizeof(struct vm_gla2gpa)); + gg.prot = prot; + gg.gla = gla; + gg.paging = *paging; + + error = vcpu_ioctl(vcpu, VM_GLA2GPA, &gg); + if (error == 0) { + *fault = gg.fault; + *gpa = gg.gpa; + } + return (error); +} +#endif + +int +vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *fault) +{ + struct vm_gla2gpa gg; + int error; + + bzero(&gg, sizeof(struct vm_gla2gpa)); + gg.prot = prot; + gg.gla = gla; + gg.paging = *paging; + + error = vcpu_ioctl(vcpu, VM_GLA2GPA_NOFAULT, &gg); + if (error == 0) { + *fault = gg.fault; + *gpa = gg.gpa; + } + return (error); +} + +#ifndef min +#define min(a,b) (((a) < (b)) ? (a) : (b)) +#endif + +#ifdef __amd64__ +int +vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging, + uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, + int *fault) +{ + void *va; + uint64_t gpa, off; + int error, i, n; + + for (i = 0; i < iovcnt; i++) { + iov[i].iov_base = 0; + iov[i].iov_len = 0; + } + + while (len) { + assert(iovcnt > 0); + error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault); + if (error || *fault) + return (error); + + off = gpa & PAGE_MASK; + n = MIN(len, PAGE_SIZE - off); + + va = vm_map_gpa(vcpu->ctx, gpa, n); + if (va == NULL) + return (EFAULT); + + iov->iov_base = va; + iov->iov_len = n; + iov++; + iovcnt--; + + gla += n; + len -= n; + } + return (0); +} +#endif + +void +vm_copy_teardown(struct iovec *iov __unused, int iovcnt __unused) +{ + /* + * Intentionally empty. This is used by the instruction + * emulation code shared with the kernel. The in-kernel + * version of this is non-empty. 
+ */ +} + +void +vm_copyin(struct iovec *iov, void *vp, size_t len) +{ + const char *src; + char *dst; + size_t n; + + dst = vp; + while (len) { + assert(iov->iov_len); + n = min(len, iov->iov_len); + src = iov->iov_base; + bcopy(src, dst, n); + + iov++; + dst += n; + len -= n; + } +} + +void +vm_copyout(const void *vp, struct iovec *iov, size_t len) +{ + const char *src; + char *dst; + size_t n; + + src = vp; + while (len) { + assert(iov->iov_len); + n = min(len, iov->iov_len); + dst = iov->iov_base; + bcopy(src, dst, n); + + iov++; + src += n; + len -= n; + } +} + +static int +vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus) +{ + struct vm_cpuset vm_cpuset; + int error; + + bzero(&vm_cpuset, sizeof(struct vm_cpuset)); + vm_cpuset.which = which; + vm_cpuset.cpusetsize = sizeof(cpuset_t); + vm_cpuset.cpus = cpus; + + error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset); + return (error); +} + +int +vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus) +{ + + return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus)); +} + +int +vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus) +{ + + return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus)); +} + +int +vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus) +{ + + return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus)); +} + +int +vm_activate_cpu(struct vcpu *vcpu) +{ + struct vm_activate_cpu ac; + int error; + + bzero(&ac, sizeof(struct vm_activate_cpu)); + error = vcpu_ioctl(vcpu, VM_ACTIVATE_CPU, &ac); + return (error); +} + +int +vm_suspend_all_cpus(struct vmctx *ctx) +{ + struct vm_activate_cpu ac; + int error; + + bzero(&ac, sizeof(struct vm_activate_cpu)); + ac.vcpuid = -1; + error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac); + return (error); +} + +int +vm_suspend_cpu(struct vcpu *vcpu) +{ + struct vm_activate_cpu ac; + int error; + + bzero(&ac, sizeof(struct vm_activate_cpu)); + error = vcpu_ioctl(vcpu, VM_SUSPEND_CPU, &ac); + return (error); +} + +int +vm_resume_cpu(struct vcpu *vcpu) +{ + struct vm_activate_cpu ac; + int error; + + bzero(&ac, sizeof(struct vm_activate_cpu)); + error = vcpu_ioctl(vcpu, VM_RESUME_CPU, &ac); + return (error); +} + +int +vm_resume_all_cpus(struct vmctx *ctx) +{ + struct vm_activate_cpu ac; + int error; + + bzero(&ac, sizeof(struct vm_activate_cpu)); + ac.vcpuid = -1; + error = ioctl(ctx->fd, VM_RESUME_CPU, &ac); + return (error); +} + +#ifdef __amd64__ +int +vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2) +{ + struct vm_intinfo vmii; + int error; + + bzero(&vmii, sizeof(struct vm_intinfo)); + error = vcpu_ioctl(vcpu, VM_GET_INTINFO, &vmii); + if (error == 0) { + *info1 = vmii.info1; + *info2 = vmii.info2; + } + return (error); +} + +int +vm_set_intinfo(struct vcpu *vcpu, uint64_t info1) +{ + struct vm_intinfo vmii; + int error; + + bzero(&vmii, sizeof(struct vm_intinfo)); + vmii.info1 = info1; + error = vcpu_ioctl(vcpu, VM_SET_INTINFO, &vmii); + return (error); +} +#endif + +#ifdef WITH_VMMAPI_SNAPSHOT +int +vm_restart_instruction(struct vcpu *vcpu) +{ + int arg; + + return (vcpu_ioctl(vcpu, VM_RESTART_INSTRUCTION, &arg)); +} + +int +vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta) +{ + + if (ioctl(ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) { +#ifdef SNAPSHOT_DEBUG + fprintf(stderr, "%s: snapshot failed for %s: %d\r\n", + __func__, meta->dev_name, errno); +#endif + return (-1); + } + return (0); +} + +int +vm_restore_time(struct vmctx *ctx) +{ + int dummy; + + dummy = 0; + return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy)); +} +#endif + +int +vm_set_topology(struct vmctx *ctx, + uint16_t sockets, uint16_t 
cores, uint16_t threads, uint16_t maxcpus) +{ + struct vm_cpu_topology topology; + + bzero(&topology, sizeof (struct vm_cpu_topology)); + topology.sockets = sockets; + topology.cores = cores; + topology.threads = threads; + topology.maxcpus = maxcpus; + return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology)); +} + +int +vm_get_topology(struct vmctx *ctx, + uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) +{ + struct vm_cpu_topology topology; + int error; + + bzero(&topology, sizeof (struct vm_cpu_topology)); + error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology); + if (error == 0) { + *sockets = topology.sockets; + *cores = topology.cores; + *threads = topology.threads; + *maxcpus = topology.maxcpus; + } + return (error); +} + +int +vm_limit_rights(struct vmctx *ctx) +{ + cap_rights_t rights; + + cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW); + if (caph_rights_limit(ctx->fd, &rights) != 0) + return (-1); + if (caph_ioctls_limit(ctx->fd, vm_ioctl_cmds, vm_ioctl_ncmds) != 0) + return (-1); + return (0); +} + +/* + * Avoid using in new code. Operations on the fd should be wrapped here so that + * capability rights can be kept in sync. + */ +int +vm_get_device_fd(struct vmctx *ctx) +{ + + return (ctx->fd); +} + +/* Legacy interface, do not use. */ +const cap_ioctl_t * +vm_get_ioctls(size_t *len) +{ + cap_ioctl_t *cmds; + size_t sz; + + if (len == NULL) { + sz = vm_ioctl_ncmds * sizeof(vm_ioctl_cmds[0]); + cmds = malloc(sz); + if (cmds == NULL) + return (NULL); + bcopy(vm_ioctl_cmds, cmds, sz); + return (cmds); + } + + *len = vm_ioctl_ncmds; + return (NULL); +} |
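
Usage sketches (not part of the diff above). The fragments below show how a consumer such as bhyve(8) might drive the API added by this file; they are illustrative only, the include order follows bhyve's usage, error handling is trimmed, and the example_* function names are hypothetical. First, creating (or reusing) a VM, sizing its guest memory, and opening vcpu 0:

#include <sys/param.h>
#include <sys/cpuset.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include <err.h>

int
example_boot(const char *name, const char *memopt)
{
	struct vmctx *ctx;
	struct vcpu *vcpu;
	size_t memsize;

	/* "512" is taken as MB; suffixed forms like "2G" go through expand_number(). */
	if (vm_parse_memsize(memopt, &memsize) != 0)
		errx(1, "invalid memory size '%s'", memopt);

	/* VMMAPI_OPEN_CREATE creates the VM if /dev/vmm/<name> does not exist yet. */
	ctx = vm_openf(name, VMMAPI_OPEN_CREATE);
	if (ctx == NULL)
		err(1, "vm_openf(%s)", name);

	/* VM_MMAP_ALL is the only mapping style the library accepts. */
	if (vm_setup_memory(ctx, memsize, VM_MMAP_ALL) != 0)
		err(1, "vm_setup_memory");

	vcpu = vm_vcpu_open(ctx, 0);
	/* ... load the guest and run it ... */
	vm_vcpu_close(vcpu);
	vm_destroy(ctx);	/* vm_close() instead keeps the VM alive */
	return (0);
}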
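
vm_setup_memory_domains() lays the requested domains out back to back in the guest physical address space, skipping the [VM_LOWMEM_LIMIT, VM_HIGHMEM_BASE) hole (3 GB to 4 GB on amd64). A hedged sketch of a two-domain layout follows; the DOMAINSET_* macros and DOMAINSET_POLICY_PREFER constant come from <sys/domainset.h> and are not part of this diff:

#include <sys/param.h>
#include <sys/cpuset.h>
#include <sys/domainset.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include <string.h>

int
example_numa(struct vmctx *ctx)
{
	struct vm_mem_domain doms[2];
	domainset_t masks[2];
	int i;

	memset(doms, 0, sizeof(doms));
	for (i = 0; i < 2; i++) {
		/* Back guest domain i with host memory domain i. */
		DOMAINSET_ZERO(&masks[i]);
		DOMAINSET_SET(i, &masks[i]);
		doms[i].size = 2UL * 1024 * 1024 * 1024;
		doms[i].ds_policy = DOMAINSET_POLICY_PREFER;
		doms[i].ds_mask = &masks[i];
		doms[i].ds_size = sizeof(masks[i]);
	}

	/* Domains are placed sequentially; the 3 GB..4 GB range stays unmapped. */
	return (vm_setup_memory_domains(ctx, VM_MMAP_ALL, doms, 2));
}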
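
vm_map_gpa() only succeeds when the whole [gaddr, gaddr+len) range is backed by the lowmem or highmem segment, so a NULL return is how callers detect accesses that touch the MMIO hole; vm_rev_map_gpa() is its inverse. A small sketch:

#include <sys/param.h>
#include <sys/cpuset.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include <string.h>

int
example_poke(struct vmctx *ctx, vm_paddr_t gpa, const void *buf, size_t len)
{
	void *hva;

	/* NULL means the range is not entirely inside lowmem or highmem. */
	hva = vm_map_gpa(ctx, gpa, len);
	if (hva == NULL)
		return (-1);
	memcpy(hva, buf, len);

	/* The reverse translation recovers the guest physical address. */
	return (vm_rev_map_gpa(ctx, hva) == gpa ? 0 : -1);
}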
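
Per-vcpu state is manipulated through the vcpu_ioctl() wrappers such as vm_set_register() and vm_set_capability(). A sketch for amd64; VM_REG_GUEST_RIP and VM_CAP_HALT_EXIT are assumed from <machine/vmm.h>, not defined in this file:

#include <sys/param.h>
#include <sys/cpuset.h>
#include <machine/vmm.h>
#include <vmmapi.h>

int
example_vcpu_init(struct vcpu *vcpu, uint64_t entry)
{
	uint64_t rip;

	if (vm_set_register(vcpu, VM_REG_GUEST_RIP, entry) != 0)
		return (-1);
	if (vm_get_register(vcpu, VM_REG_GUEST_RIP, &rip) != 0 || rip != entry)
		return (-1);

	/* Ask vmm(4) to exit to userspace when the guest executes HLT. */
	return (vm_set_capability(vcpu, VM_CAP_HALT_EXIT, 1));
}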
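
vm_get_stats() pages through the kernel's counters in nitems(vmstats.statbuf)-sized chunks and hands back a thread-local buffer that is reused by the next call, so copy anything that must outlive the loop. A sketch that prints every counter with its description:

#include <sys/param.h>
#include <sys/cpuset.h>
#include <sys/time.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include <stdint.h>
#include <stdio.h>

void
example_stats(struct vmctx *ctx, struct vcpu *vcpu)
{
	struct timeval tv;
	uint64_t *stats;
	const char *desc;
	int i, nstats;

	stats = vm_get_stats(vcpu, &tv, &nstats);
	if (stats == NULL)
		return;
	for (i = 0; i < nstats; i++) {
		desc = vm_get_stat_desc(ctx, i);
		printf("%-48s %ju\n", desc != NULL ? desc : "<unknown>",
		    (uintmax_t)stats[i]);
	}
}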
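
Finally, vm_limit_rights() narrows the VM descriptor to CAP_IOCTL/CAP_MMAP_RW and limits it to the vmm ioctl command list, so the caller can enter capability mode once setup is done. A sketch, assuming caph_enter() from <capsicum_helpers.h>:

#include <sys/param.h>
#include <sys/cpuset.h>
#include <capsicum_helpers.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include <err.h>

void
example_sandbox(struct vmctx *ctx)
{
	/* Restrict the VM descriptor to the vmm ioctls and mmap access. */
	if (vm_limit_rights(ctx) != 0)
		err(1, "vm_limit_rights");
	if (caph_enter() != 0)
		err(1, "cap_enter");
}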