aboutsummaryrefslogtreecommitdiff
path: root/usr.sbin/bhyve/pci_emul.c
diff options
context:
space:
mode:
Diffstat (limited to 'usr.sbin/bhyve/pci_emul.c')
-rw-r--r--usr.sbin/bhyve/pci_emul.c2805
1 files changed, 2805 insertions, 0 deletions
diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c
new file mode 100644
index 000000000000..9d6060e3e254
--- /dev/null
+++ b/usr.sbin/bhyve/pci_emul.c
@@ -0,0 +1,2805 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/mman.h>
+
+#include <ctype.h>
+#include <err.h>
+#include <errno.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <assert.h>
+#include <stdbool.h>
+#include <sysexits.h>
+
+#include <dev/vmm/vmm_mem.h>
+#include <machine/vmm.h>
+#include <machine/vmm_snapshot.h>
+#include <vmmapi.h>
+
+#include "acpi.h"
+#include "bhyverun.h"
+#include "bootrom.h"
+#include "config.h"
+#include "debug.h"
+#ifdef __amd64__
+#include "amd64/inout.h"
+#endif
+#include "mem.h"
+#include "pci_emul.h"
+#ifdef __amd64__
+#include "amd64/pci_lpc.h"
+#include "pci_passthru.h"
+#endif
+#include "qemu_fwcfg.h"
+
+/* I/O port numbers of the CONF1 address and data registers. */
+#define CONF1_ADDR_PORT 0x0cf8
+#define CONF1_DATA_PORT 0x0cfc
+
+/* Bit 31 of a CONF1 address value (not referenced in this part of the file). */
+#define CONF1_ENABLE 0x80000000ul
+
+/* Number of valid bus, slot and function numbers: each maximum plus one. */
+#define MAXBUSES (PCI_BUSMAX + 1)
+#define MAXSLOTS (PCI_SLOTMAX + 1)
+#define MAXFUNCS (PCI_FUNCMAX + 1)
+
+/* 2^30 bytes, typed as unsigned long. */
+#define GB (1024 * 1024 * 1024UL)
+
+/* Bookkeeping for a single PCI function within a slot. */
+struct funcinfo {
+ /* Configuration node handed to the device's pe_init callback. */
+ nvlist_t *fi_config;
+ /* NOTE(review): presumably the emulation selected for this function - confirm. */
+ struct pci_devemu *fi_pde;
+ /* Device instance; filled in by pci_emul_init() once pe_init succeeds. */
+ struct pci_devinst *fi_devi;
+};
+
+/*
+ * State of one INTx pin of a slot.
+ * NOTE(review): neither field is referenced in the visible part of the file;
+ * ii_count presumably counts the functions sharing the pin - confirm.
+ */
+struct intxinfo {
+ int ii_count;
+ struct pci_irq ii_irq;
+};
+
+/* Per-slot state: four INTx pin records and one funcinfo per function number. */
+struct slotinfo {
+ struct intxinfo si_intpins[4];
+ struct funcinfo si_funcs[MAXFUNCS];
+};
+
+/* Per-bus state: the bus's I/O and MMIO windows plus one slotinfo per slot. */
+struct businfo {
+ uint16_t iobase, iolimit; /* I/O window */
+ uint32_t membase32, memlimit32; /* mmio window below 4GB */
+ uint64_t membase64, memlimit64; /* mmio window above 4GB */
+ struct slotinfo slotinfo[MAXSLOTS];
+};
+
+/* One businfo pointer per possible bus (presumably indexed by bus number). */
+static struct businfo *pci_businfo[MAXBUSES];
+
+/* Linker set of device emulations; walked by pci_emul_finddev(). */
+SET_DECLARE(pci_devemu_set, struct pci_devemu);
+
+/* Allocation cursor for I/O BARs (advanced by pci_emul_alloc_resource()). */
+static uint64_t pci_emul_iobase;
+/*
+ * ROM segment created on first use by pci_emul_alloc_rom(): its host mapping,
+ * the offset of the next free byte inside it, and the end of the mapping.
+ */
+static uint8_t *pci_emul_rombase;
+static uint64_t pci_emul_romoffset;
+static uint8_t *pci_emul_romlim;
+/* Allocation cursors for memory BARs placed in the 32-bit and 64-bit windows. */
+static uint64_t pci_emul_membase32;
+static uint64_t pci_emul_membase64;
+/* Upper limit used for allocations from the 64-bit window. */
+static uint64_t pci_emul_memlim64;
+
+/*
+ * A BAR request recorded by pci_emul_alloc_bar().  Requests are queued on
+ * pci_bars, ordered from largest to smallest size, so that addresses can be
+ * assigned later.
+ */
+struct pci_bar_allocation {
+ TAILQ_ENTRY(pci_bar_allocation) chain;
+ struct pci_devinst *pdi; /* device owning the BAR */
+ int idx; /* BAR index within the device */
+ enum pcibar_type type;
+ uint64_t size; /* size after rounding and minimum-size enforcement */
+};
+
+/* Pending BAR requests, kept sorted by descending size. */
+static TAILQ_HEAD(pci_bar_list, pci_bar_allocation) pci_bars =
+ TAILQ_HEAD_INITIALIZER(pci_bars);
+
+/* A device registered through pci_emul_add_boot_device(). */
+struct boot_device {
+ TAILQ_ENTRY(boot_device) boot_device_chain;
+ struct pci_devinst *pdi;
+ int bootindex; /* non-negative and unique within the list */
+};
+/* Registered boot devices, kept sorted by ascending bootindex. */
+static TAILQ_HEAD(boot_list, boot_device) boot_devices = TAILQ_HEAD_INITIALIZER(
+ boot_devices);
+
+/*
+ * Platform-specific guest address map for BAR allocation.
+ * PCI_EMUL_IOLIMIT bounds I/O BAR allocations; PCI_EMUL_IOBASE and
+ * PCI_EMUL_MEMBASE32 are presumably the initial values of the matching
+ * allocation cursors (their initialisation is not in this part of the file).
+ */
+#if defined(__amd64__)
+#define PCI_EMUL_IOBASE 0x2000
+#define PCI_EMUL_IOLIMIT 0x10000
+#define PCI_EMUL_IOMASK 0xffff
+/*
+ * OVMF always uses 0xc0000000 as base address for 32 bit PCI MMIO. Don't
+ * change this address without changing it in OVMF.
+ */
+#define PCI_EMUL_MEMBASE32 0xc0000000
+#elif defined(__aarch64__) || defined(__riscv)
+#define PCI_EMUL_IOBASE 0xdf000000UL
+#define PCI_EMUL_IOLIMIT 0xe0000000UL
+#define PCI_EMUL_MEMBASE32 0xa0000000UL
+#else
+#error Unsupported platform
+#endif
+
+/* Size of the ROM segment created by pci_emul_alloc_rom(). */
+#define PCI_EMUL_ROMSIZE 0x10000000
+
+#define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */
+#define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */
+#ifdef __amd64__
+SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE);
+#endif
+
+/* The 32-bit MMIO window ends where the ECFG region begins. */
+#define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE
+#define PCI_EMUL_MEMSIZE64 (32*GB)
+
+/* Forward declarations of file-local functions. */
+static void pci_lintr_route(struct pci_devinst *pi);
+static void pci_lintr_update(struct pci_devinst *pi);
+
+static struct pci_devemu *pci_emul_finddev(const char *name);
+static void pci_cfgrw(int in, int bus, int slot, int func, int coff,
+ int bytes, uint32_t *val);
+
+/*
+ * Store 'val' at config-space offset 'coff' of device 'pi' using an access
+ * that is 'bytes' wide.  Any width other than 1 or 2 is treated as 4 bytes.
+ */
+static __inline void
+CFGWRITE(struct pci_devinst *pi, int coff, uint32_t val, int bytes)
+{
+	switch (bytes) {
+	case 1:
+		pci_set_cfgdata8(pi, coff, val);
+		break;
+	case 2:
+		pci_set_cfgdata16(pi, coff, val);
+		break;
+	default:
+		pci_set_cfgdata32(pi, coff, val);
+		break;
+	}
+}
+
+/*
+ * Fetch the value at config-space offset 'coff' of device 'pi' using an
+ * access that is 'bytes' wide.  Any width other than 1 or 2 is treated as
+ * 4 bytes.
+ */
+static __inline uint32_t
+CFGREAD(struct pci_devinst *pi, int coff, int bytes)
+{
+	switch (bytes) {
+	case 1:
+		return (pci_get_cfgdata8(pi, coff));
+	case 2:
+		return (pci_get_cfgdata16(pi, coff));
+	default:
+		return (pci_get_cfgdata32(pi, coff));
+	}
+}
+
+/* Return non-zero when config offset 'coff' lies within the BAR registers. */
+static int
+is_pcir_bar(int coff)
+{
+	if (coff < PCIR_BAR(0))
+		return (0);
+	return (coff < PCIR_BAR(PCI_BARMAX + 1));
+}
+
+/*
+ * Return non-zero when config offset 'coff' lies within the four bytes that
+ * start at PCIR_BIOS.
+ */
+static int
+is_pcir_bios(int coff)
+{
+	if (coff < PCIR_BIOS)
+		return (0);
+	return (coff < PCIR_BIOS + 4);
+}
+
+/*
+ * I/O access
+ */
+
+/*
+ * Slot options are in the form:
+ *
+ * <bus>:<slot>:<func>,<emul>[,<config>]
+ * <slot>[:<func>],<emul>[,<config>]
+ *
+ * slot is 0..31
+ * func is 0..7
+ * emul is a string describing the type of PCI device e.g. virtio-net
+ * config is an optional string, depending on the device, that can be
+ * used for configuration.
+ * Examples are:
+ * 1,virtio-net,tap0
+ * 3:0,dummy
+ */
+/* Report the malformed slot option string 'aopt' via EPRINTLN(). */
+static void
+pci_parse_slot_usage(char *aopt)
+{
+	EPRINTLN("Invalid PCI slot info field \"%s\"", aopt);
+}
+
+/*
+ * Helper function to parse a list of comma-separated options where
+ * each option is formatted as "name[=value]". If no value is
+ * provided, the option is treated as a boolean and is given a value
+ * of true.
+ *
+ * Each option is stored as a child of 'nvl'.  A NULL 'opt' is accepted and
+ * stores nothing.  Returns 0 on success, or -1 if the working copy of 'opt'
+ * cannot be allocated.
+ */
+int
+pci_parse_legacy_config(nvlist_t *nvl, const char *opt)
+{
+	char *config, *name, *tofree, *value;
+
+	if (opt == NULL)
+		return (0);
+
+	/* strsep() modifies its input, so work on a private copy. */
+	config = tofree = strdup(opt);
+	if (tofree == NULL)
+		return (-1);
+
+	while ((name = strsep(&config, ",")) != NULL) {
+		value = strchr(name, '=');
+		if (value != NULL) {
+			/* Split "name=value" in place. */
+			*value = '\0';
+			value++;
+			set_config_value_node(nvl, name, value);
+		} else
+			set_config_bool_node(nvl, name, true);
+	}
+	free(tofree);
+	return (0);
+}
+
+/*
+ * PCI device configuration is stored in MIBs that encode the device's
+ * location:
+ *
+ * pci.<bus>.<slot>.<func>
+ *
+ * Where "bus", "slot", and "func" are all decimal values without
+ * leading zeroes. Each valid device must have a "device" node which
+ * identifies the driver model of the device.
+ *
+ * Device backends can provide a parser for the "config" string. If
+ * a custom parser is not provided, pci_parse_legacy_config() is used
+ * to parse the string.
+ *
+ * 'opt' has the form "[<bus>:]<slot>[:<func>],<emul>[,<config>]"; it is not
+ * modified.  Returns the result of the config parser on success and -1 when
+ * the option is malformed, names an unknown device, targets an occupied
+ * slot, or memory cannot be allocated.
+ */
+int
+pci_parse_slot(char *opt)
+{
+	char node_name[sizeof("pci.XXX.XX.X")];
+	struct pci_devemu *pde;
+	char *emul, *config, *str, *cp;
+	int error, bnum, snum, fnum;
+	nvlist_t *nvl;
+
+	error = -1;
+	str = strdup(opt);
+	if (str == NULL) {
+		EPRINTLN("Out of memory while parsing PCI slot \"%s\"", opt);
+		goto done;
+	}
+
+	/* Split the copy into "<location>", "<emul>" and optional "<config>". */
+	emul = config = NULL;
+	if ((cp = strchr(str, ',')) != NULL) {
+		*cp = '\0';
+		emul = cp + 1;
+		if ((cp = strchr(emul, ',')) != NULL) {
+			*cp = '\0';
+			config = cp + 1;
+		}
+	} else {
+		pci_parse_slot_usage(opt);
+		goto done;
+	}
+
+	/* <bus>:<slot>:<func> */
+	if (sscanf(str, "%d:%d:%d", &bnum, &snum, &fnum) != 3) {
+		bnum = 0;
+		/* <slot>:<func> */
+		if (sscanf(str, "%d:%d", &snum, &fnum) != 2) {
+			fnum = 0;
+			/* <slot> */
+			if (sscanf(str, "%d", &snum) != 1) {
+				/* Forces the range check below to fail. */
+				snum = -1;
+			}
+		}
+	}
+
+	if (bnum < 0 || bnum >= MAXBUSES || snum < 0 || snum >= MAXSLOTS ||
+	    fnum < 0 || fnum >= MAXFUNCS) {
+		pci_parse_slot_usage(opt);
+		goto done;
+	}
+
+	pde = pci_emul_finddev(emul);
+	if (pde == NULL) {
+		EPRINTLN("pci slot %d:%d:%d: unknown device \"%s\"", bnum, snum,
+		    fnum, emul);
+		goto done;
+	}
+
+	snprintf(node_name, sizeof(node_name), "pci.%d.%d.%d", bnum, snum,
+	    fnum);
+	nvl = find_config_node(node_name);
+	if (nvl != NULL) {
+		EPRINTLN("pci slot %d:%d:%d already occupied!", bnum, snum,
+		    fnum);
+		goto done;
+	}
+	nvl = create_config_node(node_name);
+	/* Record the alias, when the emulation has one, as the device name. */
+	if (pde->pe_alias != NULL)
+		set_config_value_node(nvl, "device", pde->pe_alias);
+	else
+		set_config_value_node(nvl, "device", pde->pe_emu);
+
+	if (pde->pe_legacy_config != NULL)
+		error = pde->pe_legacy_config(nvl, config);
+	else
+		error = pci_parse_legacy_config(nvl, config);
+done:
+	free(str);
+	return (error);
+}
+
+/* Print the pe_emu name of every entry in pci_devemu_set, one per line. */
+void
+pci_print_supported_devices(void)
+{
+	struct pci_devemu **entry;
+
+	SET_FOREACH(entry, pci_devemu_set)
+		printf("%s\n", (*entry)->pe_emu);
+}
+
+/*
+ * Look up a user-supplied value for config register 'reg' in the "pcireg"
+ * child of 'nvl'.
+ *
+ * Returns 'def' when there is no "pcireg" node or no entry for the register,
+ * and (uint32_t)-1 when 'reg' is not one of the supported registers (only
+ * checked once a "pcireg" node exists).  An entry of "host" together with a
+ * non-NULL 'host_sel' reads the register from the host device (amd64 only;
+ * other platforms exit with an error).  Any other entry is parsed as a
+ * hexadecimal number.  'size' must be 1, 2 or 4.
+ */
+uint32_t
+pci_config_read_reg(const struct pcisel *const host_sel, nvlist_t *nvl,
+    const uint32_t reg, const uint8_t size, const uint32_t def)
+{
+	const nvlist_t *regs;
+	const char *key, *setting;
+
+	assert(size == 1 || size == 2 || size == 4);
+
+	regs = find_relative_config_node(nvl, "pcireg");
+	if (regs == NULL)
+		return (def);
+
+	/* Map the register offset to its configuration key. */
+	switch (reg) {
+	case PCIR_DEVICE:
+		key = "device";
+		break;
+	case PCIR_VENDOR:
+		key = "vendor";
+		break;
+	case PCIR_REVID:
+		key = "revid";
+		break;
+	case PCIR_SUBVEND_0:
+		key = "subvendor";
+		break;
+	case PCIR_SUBDEV_0:
+		key = "subdevice";
+		break;
+	default:
+		return (-1);
+	}
+
+	setting = get_config_value_node(regs, key);
+	if (setting == NULL)
+		return (def);
+
+	if (host_sel != NULL && strcmp(setting, "host") == 0) {
+#ifdef __amd64__
+		return (pci_host_read_config(host_sel, reg, size));
+#else
+		errx(1, "cannot fetch host PCI configuration");
+#endif
+	}
+
+	return (strtol(setting, NULL, 16));
+}
+
+/*
+ * Return 1 when 'offset' falls inside the device's PBA, i.e. within
+ * [pba_offset, pba_offset + pba_size), and 0 otherwise.
+ */
+static int
+pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset)
+{
+	return (offset >= pi->pi_msix.pba_offset &&
+	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size);
+}
+
+/*
+ * Write 'value' into the device's MSI-X table at byte 'offset'.
+ *
+ * Only naturally aligned 4- and 8-byte writes to an entry below table_count
+ * are accepted.  Returns 0 on success and -1 if the access is rejected.
+ */
+int
+pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size,
+    uint64_t value)
+{
+	int entry, within;
+	char *target;
+
+	/* support only 4 or 8 byte writes */
+	if (size != 4 && size != 8)
+		return (-1);
+
+	/* Reject entries beyond what the device supports. */
+	entry = offset / MSIX_TABLE_ENTRY_SIZE;
+	if (entry >= pi->pi_msix.table_count)
+		return (-1);
+
+	/* support only aligned writes */
+	within = offset % MSIX_TABLE_ENTRY_SIZE;
+	if ((within % size) != 0)
+		return (-1);
+
+	target = (char *)(pi->pi_msix.table + entry) + within;
+	if (size == 4)
+		*((uint32_t *)target) = value;
+	else
+		*((uint64_t *)target) = value;
+
+	return (0);
+}
+
+/*
+ * Read 'size' bytes from the device's MSI-X table at byte 'offset'.
+ *
+ * The PCI standard only allows 4 and 8 byte accesses to the MSI-X table,
+ * but 1 byte reads are also accepted to accommodate ddb.  Reads that are
+ * misaligned, of another size, or outside both the table and the PBA return
+ * all ones; reads that land in the PBA return 0.
+ */
+uint64_t
+pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size)
+{
+	const uint64_t invalid = ~0;
+	char *src;
+	int entry, within;
+
+	if (size != 1 && size != 4 && size != 8)
+		return (invalid);
+
+	/* support only aligned reads */
+	within = offset % MSIX_TABLE_ENTRY_SIZE;
+	if ((within % size) != 0)
+		return (invalid);
+
+	entry = offset / MSIX_TABLE_ENTRY_SIZE;
+	if (entry >= pi->pi_msix.table_count) {
+		/* Not a table access: the PBA reads as 0, anything else as ~0. */
+		if (pci_valid_pba_offset(pi, offset))
+			return (0);
+		return (invalid);
+	}
+
+	/* valid MSI-X Table access */
+	src = (char *)(pi->pi_msix.table + entry) + within;
+	switch (size) {
+	case 1:
+		return (*((uint8_t *)src));
+	case 4:
+		return (*((uint32_t *)src));
+	default:
+		return (*((uint64_t *)src));
+	}
+}
+
+/*
+ * Return the BAR index holding the MSI-X table, or -1 when the device has
+ * no MSI-X table allocated.
+ */
+int
+pci_msix_table_bar(struct pci_devinst *pi)
+{
+	return (pi->pi_msix.table != NULL ? pi->pi_msix.table_bar : -1);
+}
+
+/*
+ * Return the BAR index holding the MSI-X PBA, or -1 when the device has no
+ * MSI-X table allocated.
+ */
+int
+pci_msix_pba_bar(struct pci_devinst *pi)
+{
+	return (pi->pi_msix.table != NULL ? pi->pi_msix.pba_bar : -1);
+}
+
+#ifdef __amd64__
+/*
+ * Port I/O handler registered for I/O BARs (see modify_bar_registration()).
+ *
+ * Finds the first I/O BAR of the device ('arg') that fully contains the
+ * access [port, port + bytes) and forwards it to the device's pe_barread or
+ * pe_barwrite callback.  Returns 0 when a BAR matched and -1 otherwise.
+ */
+static int
+pci_emul_io_handler(struct vmctx *ctx __unused, int in, int port,
+    int bytes, uint32_t *eax, void *arg)
+{
+	struct pci_devinst *dev = arg;
+	struct pci_devemu *emu = dev->pi_d;
+	uint64_t off;
+	int bar;
+
+	assert(port >= 0);
+
+	for (bar = 0; bar <= PCI_BARMAX; bar++) {
+		if (dev->pi_bar[bar].type != PCIBAR_IO)
+			continue;
+		if ((uint64_t)port < dev->pi_bar[bar].addr)
+			continue;
+		if ((uint64_t)port + bytes >
+		    dev->pi_bar[bar].addr + dev->pi_bar[bar].size)
+			continue;
+
+		off = port - dev->pi_bar[bar].addr;
+		if (in)
+			*eax = (*emu->pe_barread)(dev, bar, off, bytes);
+		else
+			(*emu->pe_barwrite)(dev, bar, off, bytes, *eax);
+		return (0);
+	}
+	return (-1);
+}
+#else
+/*
+ * Memory-range handler used for I/O BARs on platforms other than amd64.
+ *
+ * 'arg1' is the device instance and 'arg2' the BAR index.  The access must
+ * be 1, 2 or 4 bytes wide and lie entirely within the BAR.  It is forwarded
+ * to the device's pe_barread or pe_barwrite callback.  Always returns 0.
+ */
+static int
+pci_emul_iomem_handler(struct vcpu *vcpu __unused, int dir,
+    uint64_t addr, int size, uint64_t *val, void *arg1, long arg2)
+{
+	struct pci_devinst *dev = arg1;
+	struct pci_devemu *emu = dev->pi_d;
+	int bar = (int)arg2;
+	uint64_t off;
+
+	assert(bar <= PCI_BARMAX);
+	assert(dev->pi_bar[bar].type == PCIBAR_IO);
+	assert(addr >= dev->pi_bar[bar].addr &&
+	    addr + size <= dev->pi_bar[bar].addr + dev->pi_bar[bar].size);
+	assert(size == 1 || size == 2 || size == 4);
+
+	off = addr - dev->pi_bar[bar].addr;
+	if (dir != MEM_F_READ) {
+		(*emu->pe_barwrite)(dev, bar, off, size, *val);
+		return (0);
+	}
+
+	*val = (*emu->pe_barread)(dev, bar, off, size);
+	return (0);
+}
+#endif /* !__amd64__ */
+
+/*
+ * Memory-range handler registered for 32-bit and 64-bit memory BARs.
+ *
+ * 'arg1' is the device instance and 'arg2' the BAR index.  The access must
+ * lie entirely within the BAR.  8-byte accesses are split into two 4-byte
+ * calls to the device callback, low half first; all other sizes are passed
+ * through unchanged.  Always returns 0.
+ */
+static int
+pci_emul_mem_handler(struct vcpu *vcpu __unused, int dir,
+    uint64_t addr, int size, uint64_t *val, void *arg1, long arg2)
+{
+	struct pci_devinst *dev = arg1;
+	struct pci_devemu *emu = dev->pi_d;
+	int bar = (int)arg2;
+	uint64_t off;
+
+	assert(bar <= PCI_BARMAX);
+	assert(dev->pi_bar[bar].type == PCIBAR_MEM32 ||
+	    dev->pi_bar[bar].type == PCIBAR_MEM64);
+	assert(addr >= dev->pi_bar[bar].addr &&
+	    addr + size <= dev->pi_bar[bar].addr + dev->pi_bar[bar].size);
+
+	off = addr - dev->pi_bar[bar].addr;
+
+	if (dir == MEM_F_WRITE) {
+		if (size != 8) {
+			(*emu->pe_barwrite)(dev, bar, off, size, *val);
+			return (0);
+		}
+		(*emu->pe_barwrite)(dev, bar, off, 4, *val & 0xffffffff);
+		(*emu->pe_barwrite)(dev, bar, off + 4, 4, *val >> 32);
+		return (0);
+	}
+
+	if (size != 8) {
+		*val = (*emu->pe_barread)(dev, bar, off, size);
+		return (0);
+	}
+	/* Low dword first, then merge in the high dword. */
+	*val = (*emu->pe_barread)(dev, bar, off, 4);
+	*val |= (*emu->pe_barread)(dev, bar, off + 4, 4) << 32;
+	return (0);
+}
+
+
+/*
+ * Carve 'size' bytes out of the region that starts at *baseptr and ends at
+ * 'limit'.
+ *
+ * 'size' must be a power of 2; the allocation is aligned to its size.  On
+ * success the address is stored in *addr, *baseptr is advanced past the
+ * allocation and 0 is returned.  If the aligned allocation would extend
+ * beyond 'limit', -1 is returned and neither output is modified.
+ */
+static int
+pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size,
+    uint64_t *addr)
+{
+	uint64_t start;
+
+	assert((size & (size - 1)) == 0);	/* must be a power of 2 */
+
+	/* Round the cursor up to the next multiple of 'size'. */
+	start = (*baseptr + (size - 1)) & ~(size - 1);
+	if (start + size > limit)
+		return (-1);
+
+	*addr = start;
+	*baseptr = start + size;
+	return (0);
+}
+
+/*
+ * Register (or unregister) the MMIO or I/O region associated with the BAR
+ * register 'idx' of an emulated pci device.
+ *
+ * 'registration' is non-zero to register and zero to unregister.  The range
+ * is taken from the BAR's current addr and size.  ROM BARs need no handler;
+ * any other unknown BAR type, or a failing (un)registration, trips the
+ * assertion.  Afterwards the device's optional pe_baraddr callback is
+ * invoked with the BAR's address.
+ */
+static void
+modify_bar_registration(struct pci_devinst *pi, int idx, int registration)
+{
+ struct pci_devemu *pe;
+ int error;
+ enum pcibar_type type;
+
+ pe = pi->pi_d;
+ type = pi->pi_bar[idx].type;
+ switch (type) {
+ case PCIBAR_IO:
+ {
+#ifdef __amd64__
+ /* amd64: I/O BARs are backed by port I/O handlers. */
+ struct inout_port iop;
+
+ bzero(&iop, sizeof(struct inout_port));
+ iop.name = pi->pi_name;
+ iop.port = pi->pi_bar[idx].addr;
+ iop.size = pi->pi_bar[idx].size;
+ if (registration) {
+ iop.flags = IOPORT_F_INOUT;
+ iop.handler = pci_emul_io_handler;
+ iop.arg = pi;
+ error = register_inout(&iop);
+ } else
+ error = unregister_inout(&iop);
+#else
+ /* Other platforms: I/O BARs are backed by a memory range. */
+ struct mem_range mr;
+
+ bzero(&mr, sizeof(struct mem_range));
+ mr.name = pi->pi_name;
+ mr.base = pi->pi_bar[idx].addr;
+ mr.size = pi->pi_bar[idx].size;
+ if (registration) {
+ mr.flags = MEM_F_RW;
+ mr.handler = pci_emul_iomem_handler;
+ mr.arg1 = pi;
+ mr.arg2 = idx;
+ error = register_mem(&mr);
+ } else
+ error = unregister_mem(&mr);
+#endif
+ break;
+ }
+ case PCIBAR_MEM32:
+ case PCIBAR_MEM64:
+ {
+ struct mem_range mr;
+
+ bzero(&mr, sizeof(struct mem_range));
+ mr.name = pi->pi_name;
+ mr.base = pi->pi_bar[idx].addr;
+ mr.size = pi->pi_bar[idx].size;
+ if (registration) {
+ mr.flags = MEM_F_RW;
+ mr.handler = pci_emul_mem_handler;
+ /* The handler receives the device and the BAR index. */
+ mr.arg1 = pi;
+ mr.arg2 = idx;
+ error = register_mem(&mr);
+ } else
+ error = unregister_mem(&mr);
+ break;
+ }
+ case PCIBAR_ROM:
+ /* Nothing is registered for ROM BARs. */
+ error = 0;
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ assert(error == 0);
+
+ /* Tell the device model about the change, if it asked to be told. */
+ if (pe->pe_baraddr != NULL)
+ (*pe->pe_baraddr)(pi, idx, registration, pi->pi_bar[idx].addr);
+}
+
+/* Stop intercepting accesses to the range decoded by BAR 'idx' of 'pi'. */
+static void
+unregister_bar(struct pci_devinst *pi, int idx)
+{
+	modify_bar_registration(pi, idx, 0);
+}
+
+/* Start intercepting accesses to the range decoded by BAR 'idx' of 'pi'. */
+static void
+register_bar(struct pci_devinst *pi, int idx)
+{
+	modify_bar_registration(pi, idx, 1);
+}
+
+/*
+ * Return 1 when the enable bits (PCIM_BIOS_ENABLE) are set in the low bits
+ * of the device's ROM BAR, 0 otherwise.
+ */
+static int
+romen(struct pci_devinst *pi)
+{
+	const int enable = pi->pi_bar[PCI_ROM_IDX].lobits & PCIM_BIOS_ENABLE;
+
+	return (enable == PCIM_BIOS_ENABLE);
+}
+
+/*
+ * Return non-zero when I/O port decoding (PCIM_CMD_PORTEN) is enabled in the
+ * device's command register.
+ */
+static int
+porten(struct pci_devinst *pi)
+{
+	const uint16_t command = pci_get_cfgdata16(pi, PCIR_COMMAND);
+
+	return (command & PCIM_CMD_PORTEN);
+}
+
+/*
+ * Return non-zero when memory decoding (PCIM_CMD_MEMEN) is enabled in the
+ * device's command register.
+ */
+static int
+memen(struct pci_devinst *pi)
+{
+	const uint16_t command = pci_get_cfgdata16(pi, PCIR_COMMAND);
+
+	return (command & PCIM_CMD_MEMEN);
+}
+
+/*
+ * Change the address decoded by BAR 'idx' of device 'pi'.
+ *
+ * When the matching decode bit (I/O or memory) is enabled in the command
+ * register, the old range is unregistered before the update and the new one
+ * registered afterwards.  'type' selects which part of the address changes:
+ * I/O and 32-bit memory BARs take 'addr' as the whole address, PCIBAR_MEM64
+ * replaces the low 32 bits and PCIBAR_MEMHI64 the high 32 bits.  'addr' is
+ * OR-ed in as given, so for PCIBAR_MEMHI64 it presumably already carries
+ * the value in bits 63:32 (confirm against the caller).
+ */
+static void
+update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type)
+{
+	const int decoding = (pi->pi_bar[idx].type == PCIBAR_IO) ?
+	    porten(pi) : memen(pi);
+
+	if (decoding)
+		unregister_bar(pi, idx);
+
+	if (type == PCIBAR_IO || type == PCIBAR_MEM32) {
+		pi->pi_bar[idx].addr = addr;
+	} else if (type == PCIBAR_MEM64) {
+		pi->pi_bar[idx].addr =
+		    (pi->pi_bar[idx].addr & ~0xffffffffUL) | addr;
+	} else if (type == PCIBAR_MEMHI64) {
+		pi->pi_bar[idx].addr =
+		    (pi->pi_bar[idx].addr & 0xffffffff) | addr;
+	} else {
+		assert(0);
+	}
+
+	if (decoding)
+		register_bar(pi, idx);
+}
+
+/*
+ * Request a BAR of 'size' bytes for BAR 'idx' of device 'pdi'.
+ *
+ * The size is rounded up to a power of 2 and to the minimum for the BAR
+ * type.  No address is assigned here: the request is queued on pci_bars,
+ * which is kept sorted by descending size.  When BARs are enabled by
+ * bhyve itself ("pci.enable_bars", defaulting to true without a boot ROM),
+ * the matching decode bit is also set in the device's command register.
+ *
+ * Returns 0 on success and ENOMEM if the request cannot be recorded.
+ */
+int
+pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type,
+    uint64_t size)
+{
+	struct pci_bar_allocation *bar, *new_bar;
+	uint16_t cmd, enbit;
+
+	assert((type == PCIBAR_ROM) || (idx >= 0 && idx <= PCI_BARMAX));
+	assert((type != PCIBAR_ROM) || (idx == PCI_ROM_IDX));
+
+	if ((size & (size - 1)) != 0)
+		size = 1UL << flsl(size);	/* round up to a power of 2 */
+
+	/* Enforce minimum BAR sizes required by the PCI standard */
+	if (type == PCIBAR_IO) {
+		if (size < 4)
+			size = 4;
+	} else if (type == PCIBAR_ROM) {
+		if (size < ~PCIM_BIOS_ADDR_MASK + 1)
+			size = ~PCIM_BIOS_ADDR_MASK + 1;
+	} else {
+		if (size < 16)
+			size = 16;
+	}
+
+	/*
+	 * To reduce fragmentation of the MMIO space, we allocate the BARs by
+	 * size. Therefore, don't allocate the BAR yet. We create a list of all
+	 * BAR allocation which is sorted by BAR size. When all PCI devices are
+	 * initialized, we will assign an address to the BARs.
+	 */
+
+	/* create a new list entry */
+	new_bar = calloc(1, sizeof(*new_bar));
+	if (new_bar == NULL)
+		return (ENOMEM);
+	new_bar->pdi = pdi;
+	new_bar->idx = idx;
+	new_bar->type = type;
+	new_bar->size = size;
+
+	/*
+	 * Search for a BAR which size is lower than the size of our newly
+	 * allocated BAR.
+	 */
+	TAILQ_FOREACH(bar, &pci_bars, chain) {
+		if (bar->size < size)
+			break;
+	}
+
+	if (bar == NULL) {
+		/*
+		 * Either the list is empty or new BAR is the smallest BAR of
+		 * the list. Append it to the end of our list.
+		 */
+		TAILQ_INSERT_TAIL(&pci_bars, new_bar, chain);
+	} else {
+		/*
+		 * The found BAR is smaller than our new BAR. For that reason,
+		 * insert our new BAR before the found BAR.
+		 */
+		TAILQ_INSERT_BEFORE(bar, new_bar, chain);
+	}
+
+	/*
+	 * Enable PCI BARs only if we don't have a boot ROM, i.e., bhyveload was
+	 * used to load the initial guest image. Otherwise, we rely on the boot
+	 * ROM to handle this.
+	 */
+	if (!get_config_bool_default("pci.enable_bars", !bootrom_boot()))
+		return (0);
+
+	/*
+	 * pci_passthru devices synchronize their physical and virtual command
+	 * register on init. For that reason, the virtual cmd reg should be
+	 * updated as early as possible.
+	 */
+	switch (type) {
+	case PCIBAR_IO:
+		enbit = PCIM_CMD_PORTEN;
+		break;
+	case PCIBAR_MEM64:
+	case PCIBAR_MEM32:
+		enbit = PCIM_CMD_MEMEN;
+		break;
+	default:
+		enbit = 0;
+		break;
+	}
+
+	cmd = pci_get_cfgdata16(pdi, PCIR_COMMAND);
+	pci_set_cfgdata16(pdi, PCIR_COMMAND, cmd | enbit);
+
+	return (0);
+}
+
+/*
+ * Assign a guest address to BAR 'idx' of device 'pdi' and initialise the
+ * BAR register(s) in config space.
+ *
+ * I/O and memory BARs are carved out of the matching allocation window;
+ * PCIBAR_NONE and PCIBAR_ROM get address 0.  If the BAR already carries
+ * non-zero lobits, those are kept instead of the defaults for the type.
+ * The BAR is registered immediately when the relevant decode bit is already
+ * enabled in the command register.
+ *
+ * Returns 0 on success and a non-zero value when the window is exhausted or
+ * 'type' is invalid.
+ */
+static int
+pci_emul_assign_bar(struct pci_devinst *const pdi, const int idx,
+    const enum pcibar_type type, const uint64_t size)
+{
+	int error;
+	uint64_t *baseptr, limit, addr, mask, lobits, bar;
+
+	switch (type) {
+	case PCIBAR_NONE:
+		baseptr = NULL;
+		limit = 0;
+		addr = mask = lobits = 0;
+		break;
+	case PCIBAR_IO:
+		baseptr = &pci_emul_iobase;
+		limit = PCI_EMUL_IOLIMIT;
+		mask = PCIM_BAR_IO_BASE;
+		lobits = PCIM_BAR_IO_SPACE;
+		break;
+	case PCIBAR_MEM64:
+		/*
+		 * XXX
+		 * Some drivers do not work well if the 64-bit BAR is allocated
+		 * above 4GB. Allow for this by allocating small requests under
+		 * 4GB unless the allocation size is larger than some arbitrary
+		 * number (128MB currently).
+		 */
+		if (size > 128 * 1024 * 1024) {
+			baseptr = &pci_emul_membase64;
+			limit = pci_emul_memlim64;
+			mask = PCIM_BAR_MEM_BASE;
+			lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
+			    PCIM_BAR_MEM_PREFETCH;
+		} else {
+			baseptr = &pci_emul_membase32;
+			limit = PCI_EMUL_MEMLIMIT32;
+			mask = PCIM_BAR_MEM_BASE;
+			lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64;
+		}
+		break;
+	case PCIBAR_MEM32:
+		baseptr = &pci_emul_membase32;
+		limit = PCI_EMUL_MEMLIMIT32;
+		mask = PCIM_BAR_MEM_BASE;
+		lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
+		break;
+	case PCIBAR_ROM:
+		/* do not claim memory for ROM. OVMF will do it for us. */
+		baseptr = NULL;
+		limit = 0;
+		mask = PCIM_BIOS_ADDR_MASK;
+		lobits = 0;
+		break;
+	default:
+		printf("%s: invalid bar type %d\n", __func__, type);
+		assert(0);
+		/* Do not continue with uninitialised state if asserts are off. */
+		return (-1);
+	}
+
+	if (baseptr != NULL) {
+		error = pci_emul_alloc_resource(baseptr, limit, size, &addr);
+		if (error != 0)
+			return (error);
+	} else {
+		addr = 0;
+	}
+
+	pdi->pi_bar[idx].type = type;
+	pdi->pi_bar[idx].addr = addr;
+	pdi->pi_bar[idx].size = size;
+	/*
+	 * passthru devices are using same lobits as physical device they set
+	 * this property
+	 */
+	if (pdi->pi_bar[idx].lobits != 0) {
+		lobits = pdi->pi_bar[idx].lobits;
+	} else {
+		pdi->pi_bar[idx].lobits = lobits;
+	}
+
+	/* Initialize the BAR register in config space */
+	bar = (addr & mask) | lobits;
+	pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar);
+
+	if (type == PCIBAR_MEM64) {
+		/* A 64-bit BAR also occupies the next BAR register. */
+		assert(idx + 1 <= PCI_BARMAX);
+		pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64;
+		pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32);
+	}
+
+	switch (type) {
+	case PCIBAR_IO:
+		if (porten(pdi))
+			register_bar(pdi, idx);
+		break;
+	case PCIBAR_MEM32:
+	case PCIBAR_MEM64:
+	case PCIBAR_MEMHI64:
+		if (memen(pdi))
+			register_bar(pdi, idx);
+		break;
+	default:
+		break;
+	}
+
+	return (0);
+}
+
+/*
+ * Reserve space for an option ROM of 'size' bytes for device 'pdi'.
+ *
+ * The shared ROM segment is created on the first call.  On success the host
+ * address of the reserved space is stored in *addr, the device's offset into
+ * the segment is recorded in pi_romoffset, a ROM BAR is requested and 0 is
+ * returned.  A non-zero value is returned if the segment cannot be created,
+ * is full, or the BAR request fails.
+ */
+int
+pci_emul_alloc_rom(struct pci_devinst *const pdi, const uint64_t size,
+    void **const addr)
+{
+	/* allocate ROM space once on first call */
+	if (pci_emul_rombase == NULL) {
+		pci_emul_rombase = vm_create_devmem(pdi->pi_vmctx, VM_PCIROM,
+		    "pcirom", PCI_EMUL_ROMSIZE);
+		if (pci_emul_rombase == MAP_FAILED) {
+			warnx("%s: failed to create rom segment", __func__);
+			/*
+			 * Forget the failed mapping; otherwise a later call
+			 * would treat MAP_FAILED as a valid segment base.
+			 */
+			pci_emul_rombase = NULL;
+			return (-1);
+		}
+		pci_emul_romlim = pci_emul_rombase + PCI_EMUL_ROMSIZE;
+		pci_emul_romoffset = 0;
+	}
+
+	/*
+	 * ROM size should be a power of 2 and greater than 2 KB.
+	 * NOTE(review): 1UL << flsl(size) also doubles a size that already is
+	 * a power of 2; kept as is to preserve the existing BAR sizes.
+	 */
+	const uint64_t rom_size = MAX(1UL << flsl(size),
+	    ~PCIM_BIOS_ADDR_MASK + 1);
+
+	/* check if ROM fits into ROM space */
+	if (pci_emul_romoffset + rom_size > PCI_EMUL_ROMSIZE) {
+		warnx("%s: no space left in rom segment:", __func__);
+		warnx("%16lu bytes left",
+		    PCI_EMUL_ROMSIZE - pci_emul_romoffset);
+		warnx("%16lu bytes required by %d/%d/%d", rom_size, pdi->pi_bus,
+		    pdi->pi_slot, pdi->pi_func);
+		return (-1);
+	}
+
+	/* allocate ROM BAR */
+	const int error = pci_emul_alloc_bar(pdi, PCI_ROM_IDX, PCIBAR_ROM,
+	    rom_size);
+	if (error)
+		return (error);
+
+	/* return address */
+	*addr = pci_emul_rombase + pci_emul_romoffset;
+
+	/* save offset into ROM Space */
+	pdi->pi_romoffset = pci_emul_romoffset;
+
+	/* increase offset for next ROM */
+	pci_emul_romoffset += rom_size;
+
+	return (0);
+}
+
+/*
+ * Register device 'pi' as a boot device with priority 'bootindex'.
+ *
+ * The boot_devices list is kept sorted by ascending bootindex.  A negative
+ * or already used bootindex terminates the process via errx().  Returns 0 on
+ * success and ENOMEM if the list entry cannot be allocated.
+ */
+int
+pci_emul_add_boot_device(struct pci_devinst *pi, int bootindex)
+{
+	struct boot_device *entry, *cur;
+
+	/* don't permit a negative bootindex */
+	if (bootindex < 0)
+		errx(4, "Invalid bootindex %d for %s", bootindex, pi->pi_name);
+
+	entry = calloc(1, sizeof(*entry));
+	if (entry == NULL)
+		return (ENOMEM);
+	entry->pdi = pi;
+	entry->bootindex = bootindex;
+
+	/* Find the first entry that has to come after the new one. */
+	TAILQ_FOREACH(cur, &boot_devices, boot_device_chain) {
+		if (cur->bootindex == bootindex)
+			errx(4,
+			    "Could not set bootindex %d for %s. Bootindex already occupied by %s",
+			    bootindex, pi->pi_name, cur->pdi->pi_name);
+		if (cur->bootindex > bootindex)
+			break;
+	}
+
+	/* No such entry: the new device goes to the end of the list. */
+	if (cur != NULL)
+		TAILQ_INSERT_BEFORE(cur, entry, boot_device_chain);
+	else
+		TAILQ_INSERT_TAIL(&boot_devices, entry, boot_device_chain);
+
+	return (0);
+}
+
+/* Config-space offset at which the first capability is placed. */
+#define CAP_START_OFFSET 0x40
+/*
+ * Append the 'caplen' bytes at 'capdata' to the capability list of 'pi'.
+ *
+ * Each capability occupies a dword-aligned amount of config space.  The
+ * first one starts at CAP_START_OFFSET and sets the capability pointer and
+ * the CAPPRESENT status bit; later ones follow the previous capability and
+ * are linked from its next pointer.  The new capability's own next pointer
+ * is set to 0.  Returns 0 on success and -1 if config space is exhausted.
+ */
+static int
+pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen)
+{
+	uint16_t status;
+	int first, n, off, padded;
+
+	assert(caplen > 0);
+
+	padded = roundup2(caplen, 4);		/* dword aligned */
+
+	status = pci_get_cfgdata16(pi, PCIR_STATUS);
+	first = (status & PCIM_STATUS_CAPPRESENT) == 0;
+	off = first ? CAP_START_OFFSET : pi->pi_capend + 1;
+
+	/* Check if we have enough space */
+	if (off + padded > PCI_REGMAX + 1)
+		return (-1);
+
+	/* Link the new capability into the list. */
+	if (first) {
+		pci_set_cfgdata8(pi, PCIR_CAP_PTR, off);
+		pci_set_cfgdata16(pi, PCIR_STATUS,
+		    status | PCIM_STATUS_CAPPRESENT);
+	} else {
+		pci_set_cfgdata8(pi, pi->pi_prevcap + 1, off);
+	}
+
+	/* Copy the capability */
+	for (n = 0; n < caplen; n++)
+		pci_set_cfgdata8(pi, off + n, capdata[n]);
+
+	/* The new capability is the last one in the list. */
+	pci_set_cfgdata8(pi, off + 1, 0);
+
+	pi->pi_prevcap = off;
+	pi->pi_capend = off + padded - 1;
+	return (0);
+}
+
+/*
+ * Return the entry of pci_devemu_set whose pe_emu equals 'name', or NULL if
+ * there is none.
+ */
+static struct pci_devemu *
+pci_emul_finddev(const char *name)
+{
+	struct pci_devemu **entry;
+
+	SET_FOREACH(entry, pci_devemu_set) {
+		if (strcmp((*entry)->pe_emu, name) == 0)
+			return (*entry);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Create and initialise the device instance for 'pde' at bus/slot/func.
+ *
+ * The instance starts with legacy interrupts disabled and, when BARs are
+ * enabled by bhyve itself, with bus mastering enabled.  The device's pe_init
+ * callback is then run with the function's configuration.  On success the
+ * instance is stored in fi->fi_devi and 0 is returned; otherwise the
+ * instance is released and the error is returned (ENOMEM if the instance
+ * itself cannot be allocated).
+ */
+static int
+pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int bus, int slot,
+    int func, struct funcinfo *fi)
+{
+	struct pci_devinst *pdi;
+	int err;
+
+	pdi = calloc(1, sizeof(*pdi));
+	if (pdi == NULL)
+		return (ENOMEM);
+
+	pdi->pi_vmctx = ctx;
+	pdi->pi_bus = bus;
+	pdi->pi_slot = slot;
+	pdi->pi_func = func;
+	pthread_mutex_init(&pdi->pi_lintr.lock, NULL);
+	pdi->pi_lintr.pin = 0;
+	pdi->pi_lintr.state = IDLE;
+	pci_irq_init_irq(&pdi->pi_lintr.irq);
+	pdi->pi_d = pde;
+	snprintf(pdi->pi_name, PI_NAMESZ, "%s@pci.%d.%d.%d", pde->pe_emu, bus,
+	    slot, func);
+
+	/* Disable legacy interrupts */
+	pci_set_cfgdata8(pdi, PCIR_INTLINE, 255);
+	pci_set_cfgdata8(pdi, PCIR_INTPIN, 0);
+
+	if (get_config_bool_default("pci.enable_bars", !bootrom_boot()))
+		pci_set_cfgdata8(pdi, PCIR_COMMAND, PCIM_CMD_BUSMASTEREN);
+
+	err = (*pde->pe_init)(pdi, fi->fi_config);
+	if (err != 0) {
+		/* Release everything acquired above before giving up. */
+		pthread_mutex_destroy(&pdi->pi_lintr.lock);
+		free(pdi);
+		return (err);
+	}
+
+	fi->fi_devi = pdi;
+	return (0);
+}
+
+/*
+ * Fill '*msicap' with an MSI capability that advertises 'msgnum' messages
+ * and 64-bit addressing, linked to 'nextptr'.  'msgnum' must be a power of
+ * 2 between 1 and 32.
+ */
+void
+pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr)
+{
+	int log2_msgs;
+
+	/* Number of msi messages must be a power of 2 between 1 and 32 */
+	assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32);
+	log2_msgs = ffs(msgnum) - 1;
+
+	memset(msicap, 0, sizeof(*msicap));
+	msicap->capid = PCIY_MSI;
+	msicap->nextptr = nextptr;
+	msicap->msgctrl = PCIM_MSICTRL_64BIT | (log2_msgs << 1);
+}
+
+/*
+ * Add an MSI capability for 'msgnum' messages to device 'pi'.  Returns the
+ * result of pci_emul_add_capability().
+ */
+int
+pci_emul_add_msicap(struct pci_devinst *pi, int msgnum)
+{
+	struct msicap cap;
+
+	pci_populate_msicap(&cap, msgnum, 0);
+	return (pci_emul_add_capability(pi, (u_char *)&cap, sizeof(cap)));
+}
+
+/*
+ * Fill '*msixcap' with an MSI-X capability for 'msgnum' vectors whose table
+ * and PBA both live in BAR 'barnum': the table at offset 0 and the PBA at
+ * offset 'msix_tab_size', which must be a multiple of 4096.
+ */
+static void
+pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum,
+    uint32_t msix_tab_size)
+{
+	const uint32_t bir = barnum & PCIM_MSIX_BIR_MASK;
+
+	assert(msix_tab_size % 4096 == 0);
+
+	memset(msixcap, 0, sizeof(*msixcap));
+	msixcap->capid = PCIY_MSIX;
+
+	/*
+	 * Message Control: everything is zero except the Table Size field,
+	 * which encodes a table of N entries as N-1.
+	 */
+	msixcap->msgctrl = msgnum - 1;
+
+	/* The table starts at offset 0; the PBA follows the 4K-aligned table. */
+	msixcap->table_info = bir;
+	msixcap->pba_info = msix_tab_size | bir;
+}
+
+/*
+ * Allocate the MSI-X table of device 'pi' with 'table_entries' entries and
+ * mark every vector as masked.  'table_entries' must be between 1 and
+ * MAX_MSIX_TABLE_ENTRIES.  Terminates the process if the table cannot be
+ * allocated.
+ */
+static void
+pci_msix_table_init(struct pci_devinst *pi, int table_entries)
+{
+	int i;
+
+	assert(table_entries > 0);
+	assert(table_entries <= MAX_MSIX_TABLE_ENTRIES);
+
+	pi->pi_msix.table = calloc(table_entries, MSIX_TABLE_ENTRY_SIZE);
+	if (pi->pi_msix.table == NULL)
+		err(4, "%s: calloc", __func__);
+
+	/* set mask bit of vector control register */
+	for (i = 0; i < table_entries; i++)
+		pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK;
+}
+
+/*
+ * Add an MSI-X capability with 'msgnum' vectors to device 'pi', placing both
+ * the table and the PBA in BAR 'barnum'.
+ *
+ * The table starts at offset 0 of the BAR and the PBA at the table size
+ * rounded up to 4K.  A 32-bit memory BAR covering both is requested.
+ * Returns 0 on success and a non-zero value if the BAR request or the
+ * capability insertion fails.
+ */
+int
+pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum)
+{
+	uint32_t tab_size;
+	struct msixcap msixcap;
+	int error;
+
+	assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES);
+	assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0);
+
+	tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE;
+
+	/* Align table size to nearest 4K */
+	tab_size = roundup2(tab_size, 4096);
+
+	pi->pi_msix.table_bar = barnum;
+	pi->pi_msix.pba_bar = barnum;
+	pi->pi_msix.table_offset = 0;
+	pi->pi_msix.table_count = msgnum;
+	pi->pi_msix.pba_offset = tab_size;
+	pi->pi_msix.pba_size = PBA_SIZE(msgnum);
+
+	pci_msix_table_init(pi, msgnum);
+
+	pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size);
+
+	/* allocate memory for MSI-X Table and PBA */
+	error = pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32,
+	    tab_size + pi->pi_msix.pba_size);
+	if (error != 0) {
+		/* A NULL table is how callers recognise "no MSI-X". */
+		free(pi->pi_msix.table);
+		pi->pi_msix.table = NULL;
+		return (error);
+	}
+
+	return (pci_emul_add_capability(pi, (u_char *)&msixcap,
+	    sizeof(msixcap)));
+}
+
+/*
+ * Handle a config-space write of 'bytes' bytes at 'offset' that falls inside
+ * the MSI-X capability located at 'capoff'.
+ *
+ * A 2-byte write to the Message Control register may only change the
+ * MSI-X Enable and Function Mask bits; the cached enabled/function_mask
+ * state is refreshed and the legacy interrupt state updated.  All other
+ * writes are stored unmodified.
+ */
+static void
+msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+    int bytes, uint32_t val)
+{
+	const uint16_t writable = PCIM_MSIXCTRL_MSIX_ENABLE |
+	    PCIM_MSIXCTRL_FUNCTION_MASK;
+	uint16_t ctrl;
+
+	/* Message Control Register */
+	if (offset - capoff == 2 && bytes == 2) {
+		/* Keep the read-only bits, take the writable ones from 'val'. */
+		ctrl = pci_get_cfgdata16(pi, offset);
+		ctrl = (ctrl & ~writable) | (val & writable);
+		val = ctrl;
+
+		pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE;
+		pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK;
+		pci_lintr_update(pi);
+	}
+
+	CFGWRITE(pi, offset, val, bytes);
+}
+
+/*
+ * Handle a config-space write of 'bytes' bytes at 'offset' that falls inside
+ * the MSI capability located at 'capoff'.
+ *
+ * A 2-byte write to the Message Control register may only change the
+ * Multiple Message Enable field and the MSI Enable bit.  After the write is
+ * stored, the cached MSI state (enabled flag, address, data and number of
+ * messages) is recomputed from the capability and the legacy interrupt
+ * state updated.
+ */
+static void
+msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+    int bytes, uint32_t val)
+{
+	const uint16_t writable = PCIM_MSICTRL_MME_MASK |
+	    PCIM_MSICTRL_MSI_ENABLE;
+	uint16_t ctrl, data, mme;
+	uint32_t addrlo;
+
+	/* Do not let the guest overwrite read-only Message Control fields. */
+	if (offset - capoff == 2 && bytes == 2) {
+		ctrl = pci_get_cfgdata16(pi, offset);
+		ctrl = (ctrl & ~writable) | (val & writable);
+		val = ctrl;
+	}
+	CFGWRITE(pi, offset, val, bytes);
+
+	/* Re-read the capability to refresh the cached state. */
+	ctrl = pci_get_cfgdata16(pi, capoff + 2);
+	addrlo = pci_get_cfgdata32(pi, capoff + 4);
+	/* The data register sits after the upper address dword, if present. */
+	data = pci_get_cfgdata16(pi,
+	    (ctrl & PCIM_MSICTRL_64BIT) ? capoff + 12 : capoff + 8);
+
+	mme = ctrl & PCIM_MSICTRL_MME_MASK;
+	pi->pi_msi.enabled = ctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0;
+	if (!pi->pi_msi.enabled) {
+		pi->pi_msi.maxmsgnum = 0;
+	} else {
+		pi->pi_msi.addr = addrlo;
+		pi->pi_msi.msg_data = data;
+		pi->pi_msi.maxmsgnum = 1 << (mme >> 4);
+	}
+	pci_lintr_update(pi);
+}
+
+/*
+ * Handle a guest write that lands inside the PCI Express capability.
+ * The value is stored as-is; 'capoff' is not used.
+ */
+static void
+pciecap_cfgwrite(struct pci_devinst *pi, int capoff __unused, int offset,
+    int bytes, uint32_t val)
+{
+	/* XXX read-only fields are not protected from guest writes */
+	CFGWRITE(pi, offset, val, bytes);
+}
+
+#define	PCIECAP_VERSION	0x2
+/*
+ * Add a PCI Express capability of the given port 'type' to 'pi'.
+ *
+ * An endpoint on bus 0 is reported as a root complex integrated endpoint.
+ * Link capabilities and status (gen1, x1) are only filled in for types
+ * other than the integrated endpoint.
+ *
+ * Returns the result of pci_emul_add_capability().
+ */
+int
+pci_emul_add_pciecap(struct pci_devinst *pi, int type)
+{
+	struct pciecap cap;
+
+	bzero(&cap, sizeof(cap));
+
+	/*
+	 * NB: bhyve currently only supports a single PCI bus that is the
+	 * root complex bus, so all endpoints are integrated.
+	 */
+	if (pi->pi_bus == 0 && type == PCIEM_TYPE_ENDPOINT)
+		type = PCIEM_TYPE_ROOT_INT_EP;
+
+	cap.capid = PCIY_EXPRESS;
+	cap.pcie_capabilities = PCIECAP_VERSION | type;
+	if (type != PCIEM_TYPE_ROOT_INT_EP) {
+		cap.link_capabilities = 0x411;	/* gen1, x1 */
+		cap.link_status = 0x11;		/* gen1, x1 */
+	}
+
+	return (pci_emul_add_capability(pi, (u_char *)&cap, sizeof(cap)));
+}
+
+/*
+ * Write 'val' ('bytes' wide) at config offset 'offset', which the caller
+ * guarantees lies in the capabilities region.
+ *
+ * If 'capoff' is zero the capability list is walked to find the capability
+ * containing 'offset', and 'capid' is read from it.  Otherwise 'capoff' and
+ * 'capid' identify the capability directly.  Misaligned writes are dropped.
+ */
+void
+pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val,
+    uint8_t capoff, int capid)
+{
+	uint8_t next;
+
+	/* Do not allow un-aligned writes */
+	if ((offset & (bytes - 1)) != 0)
+		return;
+
+	if (capoff == 0) {
+		/*
+		 * Walk the list until 'offset' falls between a capability
+		 * and its successor, or the last capability is reached.
+		 */
+		for (capoff = CAP_START_OFFSET; ; capoff = next) {
+			next = pci_get_cfgdata8(pi, capoff + 1);
+			if (next == 0)
+				break;
+			if (offset >= capoff && offset < next)
+				break;
+		}
+		assert(offset >= capoff);
+		capid = pci_get_cfgdata8(pi, capoff);
+	}
+
+	/*
+	 * The ID and Next Pointer bytes are read-only.  Some guests issue a
+	 * 4-byte write covering them; keep only the upper 2 bytes of such a
+	 * write and drop any other write that touches these bytes.
+	 */
+	if (offset == capoff + 1)
+		return;
+	if (offset == capoff) {
+		if (bytes != 4)
+			return;
+		bytes = 2;
+		offset += 2;
+		val >>= 16;
+	}
+
+	if (capid == PCIY_MSI)
+		msicap_cfgwrite(pi, capoff, offset, bytes, val);
+	else if (capid == PCIY_MSIX)
+		msixcap_cfgwrite(pi, capoff, offset, bytes, val);
+	else if (capid == PCIY_EXPRESS)
+		pciecap_cfgwrite(pi, capoff, offset, bytes, val);
+	/* Writes to any other capability are ignored. */
+}
+
+/*
+ * Return 1 if 'offset' lies within the capabilities region of 'pi'
+ * (CAP_START_OFFSET through pi_capend inclusive) and the status register
+ * advertises a capability list; return 0 otherwise.
+ */
+static int
+pci_emul_iscap(struct pci_devinst *pi, int offset)
+{
+	uint16_t status;
+
+	status = pci_get_cfgdata16(pi, PCIR_STATUS);
+	if ((status & PCIM_STATUS_CAPPRESENT) == 0)
+		return (0);
+	if (offset < CAP_START_OFFSET || offset > pi->pi_capend)
+		return (0);
+	return (1);
+}
+
+/*
+ * Memory handler for addresses that nothing else claims: reads return all
+ * ones and writes are discarded.  Always returns 0.
+ */
+static int
+pci_emul_fallback_handler(struct vcpu *vcpu __unused, int dir,
+    uint64_t addr __unused, int size __unused, uint64_t *val,
+    void *arg1 __unused, long arg2 __unused)
+{
+	/* Writes are dropped on the floor. */
+	if (dir != MEM_F_READ)
+		return (0);
+
+	/*
+	 * The mem read code will take care of truncating to the
+	 * correct size.
+	 */
+	*val = 0xffffffffffffffff;
+	return (0);
+}
+
+/*
+ * Memory handler for the PCI extended config window.  The bus, slot,
+ * function and register offset are decoded from the low bits of 'addr' and
+ * the access is forwarded to pci_cfgrw().  Always returns 0.
+ */
+static int
+pci_emul_ecfg_handler(struct vcpu *vcpu __unused, int dir, uint64_t addr,
+    int bytes, uint64_t *val, void *arg1 __unused, long arg2 __unused)
+{
+	int bus, coff, func, is_read, slot;
+
+	is_read = (dir == MEM_F_READ);
+
+	/* addr[11:0] register, [14:12] function, [19:15] slot, [27:20] bus */
+	bus = (addr >> 20) & 0xff;
+	slot = (addr >> 15) & 0x1f;
+	func = (addr >> 12) & 0x7;
+	coff = addr & 0xfff;
+
+	/* Pre-fill all 64 bits; pci_cfgrw() only stores a 32-bit result. */
+	if (is_read)
+		*val = ~0UL;
+	pci_cfgrw(is_read, bus, slot, func, coff, bytes, (uint32_t *)val);
+	return (0);
+}
+
+/*
+ * Return the guest physical base address of the PCI extended config window.
+ */
+uint64_t
+pci_ecfg_base(void)
+{
+	uint64_t base;
+
+	base = PCI_EMUL_ECFG_BASE;
+	return (base);
+}
+
+/*
+ * Publish the configured boot order as the fw_cfg file "bootorder".
+ *
+ * Each entry on the boot_devices list contributes one line that names the
+ * device by its PCI slot and function.
+ *
+ * Returns 0 if no boot devices are configured, ENOMEM if the in-memory
+ * stream cannot be created, and otherwise the result of
+ * qemu_fwcfg_add_file().
+ */
+static int
+init_bootorder(void)
+{
+	struct boot_device *device;
+	FILE *fp;
+	char *bootorder;
+	size_t bootorder_len;
+
+	if (TAILQ_EMPTY(&boot_devices))
+		return (0);
+
+	fp = open_memstream(&bootorder, &bootorder_len);
+	if (fp == NULL) {
+		/* Without a stream there is nothing to write into. */
+		return (ENOMEM);
+	}
+	TAILQ_FOREACH(device, &boot_devices, boot_device_chain) {
+		fprintf(fp, "/pci@i0cf8/pci@%d,%d\n",
+		    device->pdi->pi_slot, device->pdi->pi_func);
+	}
+	/* Closing the stream finalizes 'bootorder' and 'bootorder_len'. */
+	fclose(fp);
+
+	return (qemu_fwcfg_add_file("bootorder", bootorder_len, bootorder));
+}
+
+/* Extra space, and alignment, added to each bus's resource windows. */
+#define	BUSIO_ROUNDUP		32
+#define	BUSMEM32_ROUNDUP	(1024 * 1024)
+#define	BUSMEM64_ROUNDUP	(512 * 1024 * 1024)
+
+/*
+ * Instantiate every PCI device named in the configuration and set up the
+ * PCI address space.
+ *
+ * For each configured bus: initialize every configured slot/function,
+ * assign the BARs queued on 'pci_bars', and record the I/O and memory
+ * windows the bus decodes.  Afterwards INTx interrupts are routed, the boot
+ * order is published, and the "PCI hole" fallback and extended config
+ * window memory ranges are registered.
+ *
+ * Returns 0 on success, EINVAL for a bad device configuration, ENOMEM if
+ * per-bus state cannot be allocated, or the error from device
+ * initialization or init_bootorder().
+ */
+int
+init_pci(struct vmctx *ctx)
+{
+	char node_name[sizeof("pci.XXX.XX.X")];
+	struct mem_range mr;
+	struct pci_devemu *pde;
+	struct businfo *bi;
+	struct slotinfo *si;
+	struct funcinfo *fi;
+	struct pci_bar_allocation *bar;
+	struct pci_bar_allocation *bar_tmp;
+	nvlist_t *nvl;
+	const char *emul;
+	size_t lowmem;
+	int bus, slot, func;
+	int error;
+
+	if (vm_get_lowmem_limit(ctx) > PCI_EMUL_MEMBASE32)
+		errx(EX_OSERR, "Invalid lowmem limit");
+
+	pci_emul_iobase = PCI_EMUL_IOBASE;
+	pci_emul_membase32 = PCI_EMUL_MEMBASE32;
+
+	/* The 64-bit BAR window starts above the end of high memory. */
+	pci_emul_membase64 = vm_get_highmem_base(ctx) +
+	    vm_get_highmem_size(ctx);
+	pci_emul_membase64 = roundup2(pci_emul_membase64, PCI_EMUL_MEMSIZE64);
+	pci_emul_memlim64 = pci_emul_membase64 + PCI_EMUL_MEMSIZE64;
+
+	TAILQ_INIT(&boot_devices);
+
+	for (bus = 0; bus < MAXBUSES; bus++) {
+		snprintf(node_name, sizeof(node_name), "pci.%d", bus);
+		nvl = find_config_node(node_name);
+		if (nvl == NULL)
+			continue;
+		bi = calloc(1, sizeof(struct businfo));
+		if (bi == NULL)
+			return (ENOMEM);
+		pci_businfo[bus] = bi;
+
+		/*
+		 * Keep track of the i/o and memory resources allocated to
+		 * this bus.
+		 */
+		bi->iobase = pci_emul_iobase;
+		bi->membase32 = pci_emul_membase32;
+		bi->membase64 = pci_emul_membase64;
+
+		/* first run: init devices */
+		for (slot = 0; slot < MAXSLOTS; slot++) {
+			si = &bi->slotinfo[slot];
+			for (func = 0; func < MAXFUNCS; func++) {
+				fi = &si->si_funcs[func];
+				snprintf(node_name, sizeof(node_name),
+				    "pci.%d.%d.%d", bus, slot, func);
+				nvl = find_config_node(node_name);
+				if (nvl == NULL)
+					continue;
+
+				fi->fi_config = nvl;
+				emul = get_config_value_node(nvl, "device");
+				if (emul == NULL) {
+					EPRINTLN("pci slot %d:%d:%d: missing "
+					    "\"device\" value", bus, slot, func);
+					return (EINVAL);
+				}
+				pde = pci_emul_finddev(emul);
+				if (pde == NULL) {
+					EPRINTLN("pci slot %d:%d:%d: unknown "
+					    "device \"%s\"", bus, slot, func,
+					    emul);
+					return (EINVAL);
+				}
+				if (pde->pe_alias != NULL) {
+					EPRINTLN("pci slot %d:%d:%d: legacy "
+					    "device \"%s\", use \"%s\" instead",
+					    bus, slot, func, emul,
+					    pde->pe_alias);
+					return (EINVAL);
+				}
+				fi->fi_pde = pde;
+				error = pci_emul_init(ctx, pde, bus, slot,
+				    func, fi);
+				if (error)
+					return (error);
+			}
+		}
+
+		/* second run: assign BARs and free list */
+		TAILQ_FOREACH_SAFE(bar, &pci_bars, chain, bar_tmp) {
+			pci_emul_assign_bar(bar->pdi, bar->idx, bar->type,
+			    bar->size);
+			free(bar);
+		}
+		TAILQ_INIT(&pci_bars);
+
+		/*
+		 * Add some slop to the I/O and memory resources decoded by
+		 * this bus to give a guest some flexibility if it wants to
+		 * reprogram the BARs.
+		 */
+		pci_emul_iobase += BUSIO_ROUNDUP;
+		pci_emul_iobase = roundup2(pci_emul_iobase, BUSIO_ROUNDUP);
+		bi->iolimit = pci_emul_iobase;
+
+		pci_emul_membase32 += BUSMEM32_ROUNDUP;
+		pci_emul_membase32 = roundup2(pci_emul_membase32,
+		    BUSMEM32_ROUNDUP);
+		bi->memlimit32 = pci_emul_membase32;
+
+		pci_emul_membase64 += BUSMEM64_ROUNDUP;
+		pci_emul_membase64 = roundup2(pci_emul_membase64,
+		    BUSMEM64_ROUNDUP);
+		bi->memlimit64 = pci_emul_membase64;
+	}
+
+	/*
+	 * PCI backends are initialized before routing INTx interrupts
+	 * so that LPC devices are able to reserve ISA IRQs before
+	 * routing PIRQ pins.
+	 */
+	for (bus = 0; bus < MAXBUSES; bus++) {
+		if ((bi = pci_businfo[bus]) == NULL)
+			continue;
+
+		for (slot = 0; slot < MAXSLOTS; slot++) {
+			si = &bi->slotinfo[slot];
+			for (func = 0; func < MAXFUNCS; func++) {
+				fi = &si->si_funcs[func];
+				if (fi->fi_devi == NULL)
+					continue;
+				pci_lintr_route(fi->fi_devi);
+			}
+		}
+	}
+#ifdef __amd64__
+	lpc_pirq_routed();
+#endif
+
+	if ((error = init_bootorder()) != 0) {
+		warnx("%s: Unable to init bootorder", __func__);
+		return (error);
+	}
+
+	/*
+	 * The guest physical memory map looks like the following on amd64:
+	 * [0, lowmem) guest system memory
+	 * [lowmem, 0xC0000000) memory hole (may be absent)
+	 * [0xC0000000, 0xE0000000) PCI hole (32-bit BAR allocation)
+	 * [0xE0000000, 0xF0000000) PCI extended config window
+	 * [0xF0000000, 4GB) LAPIC, IOAPIC, HPET, firmware
+	 * [4GB, 4GB + highmem) guest system memory
+	 * [roundup(4GB + highmem, 32GB), ...) PCI 64-bit BAR allocation
+	 *
+	 * On arm64 the guest physical memory map looks like this:
+	 * [0x0DF00000, 0x10000000) PCI I/O memory
+	 * [0xA0000000, 0xE0000000) PCI 32-bit BAR allocation
+	 * [0xE0000000, 0xF0000000) PCI extended config window
+	 * [4GB, 4GB + highmem) guest system memory
+	 * [roundup(4GB + highmem, 32GB), ...) PCI 64-bit BAR allocation
+	 *
+	 * "lowmem" is guest memory below 0xC0000000. amd64 guests provisioned
+	 * with less than 3GB of RAM will have no memory above the 4GB boundary.
+	 * System memory for arm64 guests is all above the 4GB boundary.
+	 */
+
+	/*
+	 * Accesses to memory addresses that are not allocated to system
+	 * memory or PCI devices return 0xff's.
+	 */
+	lowmem = vm_get_lowmem_size(ctx);
+	bzero(&mr, sizeof(struct mem_range));
+	mr.name = "PCI hole";
+	mr.flags = MEM_F_RW | MEM_F_IMMUTABLE;
+	mr.base = lowmem;
+	mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem;
+	mr.handler = pci_emul_fallback_handler;
+	error = register_mem_fallback(&mr);
+	assert(error == 0);
+
+	/* PCI extended config space */
+	bzero(&mr, sizeof(struct mem_range));
+	mr.name = "PCI ECFG";
+	mr.flags = MEM_F_RW | MEM_F_IMMUTABLE;
+	mr.base = PCI_EMUL_ECFG_BASE;
+	mr.size = PCI_EMUL_ECFG_SIZE;
+	mr.handler = pci_emul_ecfg_handler;
+	error = register_mem(&mr);
+	assert(error == 0);
+
+	return (0);
+}
+
+#ifdef __amd64__
+/*
+ * pci_walk_lintr() callback: emit one _PRT package that maps the given
+ * slot/pin to the I/O APIC interrupt recorded in 'irq'.
+ */
+static void
+pci_apic_prt_entry(int bus __unused, int slot, int pin, struct pci_irq *irq,
+    void *arg __unused)
+{
+	int prt_addr, prt_pin;
+
+	/* Slot in the high word, 0xffff function wildcard in the low word. */
+	prt_addr = slot << 16 | 0xffff;
+	/* Pins are numbered from 1 here and from 0 in the table. */
+	prt_pin = pin - 1;
+
+	dsdt_line(" Package ()");
+	dsdt_line(" {");
+	dsdt_line(" 0x%X,", prt_addr);
+	dsdt_line(" 0x%02X,", prt_pin);
+	dsdt_line(" Zero,");
+	dsdt_line(" 0x%X", irq->ioapic_irq);
+	dsdt_line(" },");
+}
+
+/*
+ * pci_walk_lintr() callback: emit one _PRT package that maps the given
+ * slot/pin to the named PIRQ link device for 'irq'.  Nothing is emitted
+ * when lpc_pirq_name() yields no name.
+ */
+static void
+pci_pirq_prt_entry(int bus __unused, int slot, int pin, struct pci_irq *irq,
+    void *arg __unused)
+{
+	char *link;
+
+	link = lpc_pirq_name(irq->pirq_pin);
+	if (link != NULL) {
+		dsdt_line(" Package ()");
+		dsdt_line(" {");
+		dsdt_line(" 0x%X,", slot << 16 | 0xffff);
+		dsdt_line(" 0x%02X,", pin - 1);
+		dsdt_line(" %s,", link);
+		dsdt_line(" 0x00");
+		dsdt_line(" },");
+		/* The name is released here once it has been printed. */
+		free(link);
+	}
+}
+#endif
+
+/*
+ * A bhyve virtual machine has a flat PCI hierarchy with a root port
+ * corresponding to each PCI bus.
+ *
+ * Emit the DSDT "Device (PCxx)" node for 'bus': its _HID, _BBN and _CRS
+ * resource template (bus number, I/O window, 32-bit and 64-bit memory
+ * windows), the interrupt routing tables on amd64, and finally whatever
+ * each device on the bus contributes through its pe_write_dsdt hook.
+ *
+ * Nothing is emitted for an unconfigured bus other than bus 0.
+ */
+static void
+pci_bus_write_dsdt(int bus)
+{
+ struct businfo *bi;
+ struct slotinfo *si;
+ struct pci_devinst *pi;
+ int func, slot;
+
+ /*
+ * If there are no devices on this 'bus' then just return.
+ */
+ if ((bi = pci_businfo[bus]) == NULL) {
+ /*
+ * Bus 0 is special because it decodes the I/O ports used
+ * for PCI config space access even if there are no devices
+ * on it.
+ */
+ if (bus != 0)
+ return;
+ }
+
+ dsdt_line(" Device (PC%02X)", bus);
+ dsdt_line(" {");
+ dsdt_line(" Name (_HID, EisaId (\"PNP0A03\"))");
+
+ dsdt_line(" Method (_BBN, 0, NotSerialized)");
+ dsdt_line(" {");
+ dsdt_line(" Return (0x%08X)", bus);
+ dsdt_line(" }");
+ dsdt_line(" Name (_CRS, ResourceTemplate ()");
+ dsdt_line(" {");
+ /* The bus number range covers exactly this one bus. */
+ dsdt_line(" WordBusNumber (ResourceProducer, MinFixed, "
+ "MaxFixed, PosDecode,");
+ dsdt_line(" 0x0000, // Granularity");
+ dsdt_line(" 0x%04X, // Range Minimum", bus);
+ dsdt_line(" 0x%04X, // Range Maximum", bus);
+ dsdt_line(" 0x0000, // Translation Offset");
+ dsdt_line(" 0x0001, // Length");
+ dsdt_line(" ,, )");
+
+#ifdef __amd64__
+ if (bus == 0) {
+ /* Config-address/data ports starting at 0xCF8. */
+ dsdt_indent(3);
+ dsdt_fixed_ioport(0xCF8, 8);
+ dsdt_unindent(3);
+
+ /* I/O ports below the config ports: 0x0000-0x0CF7. */
+ dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, "
+ "PosDecode, EntireRange,");
+ dsdt_line(" 0x0000, // Granularity");
+ dsdt_line(" 0x0000, // Range Minimum");
+ dsdt_line(" 0x0CF7, // Range Maximum");
+ dsdt_line(" 0x0000, // Translation Offset");
+ dsdt_line(" 0x0CF8, // Length");
+ dsdt_line(" ,, , TypeStatic)");
+
+ /* I/O ports from 0x0D00 up to the start of PCI I/O space. */
+ dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, "
+ "PosDecode, EntireRange,");
+ dsdt_line(" 0x0000, // Granularity");
+ dsdt_line(" 0x0D00, // Range Minimum");
+ dsdt_line(" 0x%04X, // Range Maximum",
+ PCI_EMUL_IOBASE - 1);
+ dsdt_line(" 0x0000, // Translation Offset");
+ dsdt_line(" 0x%04X, // Length",
+ PCI_EMUL_IOBASE - 0x0D00);
+ dsdt_line(" ,, , TypeStatic)");
+
+ /* Bus 0 without devices: close _CRS and the device node. */
+ if (bi == NULL) {
+ dsdt_line(" })");
+ goto done;
+ }
+ }
+#endif
+ assert(bi != NULL);
+
+ /* i/o window */
+ dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, "
+ "PosDecode, EntireRange,");
+ dsdt_line(" 0x0000, // Granularity");
+ dsdt_line(" 0x%04X, // Range Minimum", bi->iobase);
+ dsdt_line(" 0x%04X, // Range Maximum",
+ bi->iolimit - 1);
+ dsdt_line(" 0x0000, // Translation Offset");
+ dsdt_line(" 0x%04X, // Length",
+ bi->iolimit - bi->iobase);
+ dsdt_line(" ,, , TypeStatic)");
+
+ /*
+ * NOTE(review): the "Range Minimum", "Range Maximum" and "Length" format
+ * strings of the two memory windows below end in an explicit "\n",
+ * unlike every other dsdt_line() call here -- confirm this is intended.
+ */
+ /* mmio window (32-bit) */
+ dsdt_line(" DWordMemory (ResourceProducer, PosDecode, "
+ "MinFixed, MaxFixed, NonCacheable, ReadWrite,");
+ dsdt_line(" 0x00000000, // Granularity");
+ dsdt_line(" 0x%08X, // Range Minimum\n", bi->membase32);
+ dsdt_line(" 0x%08X, // Range Maximum\n",
+ bi->memlimit32 - 1);
+ dsdt_line(" 0x00000000, // Translation Offset");
+ dsdt_line(" 0x%08X, // Length\n",
+ bi->memlimit32 - bi->membase32);
+ dsdt_line(" ,, , AddressRangeMemory, TypeStatic)");
+
+ /* mmio window (64-bit) */
+ dsdt_line(" QWordMemory (ResourceProducer, PosDecode, "
+ "MinFixed, MaxFixed, NonCacheable, ReadWrite,");
+ dsdt_line(" 0x0000000000000000, // Granularity");
+ dsdt_line(" 0x%016lX, // Range Minimum\n", bi->membase64);
+ dsdt_line(" 0x%016lX, // Range Maximum\n",
+ bi->memlimit64 - 1);
+ dsdt_line(" 0x0000000000000000, // Translation Offset");
+ dsdt_line(" 0x%016lX, // Length\n",
+ bi->memlimit64 - bi->membase64);
+ dsdt_line(" ,, , AddressRangeMemory, TypeStatic)");
+ dsdt_line(" })");
+
+#ifdef __amd64__
+ /*
+ * Interrupt routing: PPRT holds the PIRQ link entries and APRT the
+ * I/O APIC entries; _PRT returns APRT when PICM is non-zero.
+ */
+ if (pci_count_lintr(bus) != 0) {
+ dsdt_indent(2);
+ dsdt_line("Name (PPRT, Package ()");
+ dsdt_line("{");
+ pci_walk_lintr(bus, pci_pirq_prt_entry, NULL);
+ dsdt_line("})");
+ dsdt_line("Name (APRT, Package ()");
+ dsdt_line("{");
+ pci_walk_lintr(bus, pci_apic_prt_entry, NULL);
+ dsdt_line("})");
+ dsdt_line("Method (_PRT, 0, NotSerialized)");
+ dsdt_line("{");
+ dsdt_line(" If (PICM)");
+ dsdt_line(" {");
+ dsdt_line(" Return (APRT)");
+ dsdt_line(" }");
+ dsdt_line(" Else");
+ dsdt_line(" {");
+ dsdt_line(" Return (PPRT)");
+ dsdt_line(" }");
+ dsdt_line("}");
+ dsdt_unindent(2);
+ }
+#endif
+
+ /* Let each device that provides a hook add its own DSDT content. */
+ dsdt_indent(2);
+ for (slot = 0; slot < MAXSLOTS; slot++) {
+ si = &bi->slotinfo[slot];
+ for (func = 0; func < MAXFUNCS; func++) {
+ pi = si->si_funcs[func].fi_devi;
+ if (pi != NULL && pi->pi_d->pe_write_dsdt != NULL)
+ pi->pi_d->pe_write_dsdt(pi);
+ }
+ }
+ dsdt_unindent(2);
+#ifdef __amd64__
+done:
+#endif
+ dsdt_line(" }");
+}
+
+/*
+ * Emit the PCI portion of the DSDT: the PICM variable with the _PIC method
+ * that stores into it, followed by a _SB scope holding one node per bus as
+ * written by pci_bus_write_dsdt().
+ */
+void
+pci_write_dsdt(void)
+{
+	int bus;
+
+	dsdt_indent(1);
+
+	/* _PIC stores its argument in PICM. */
+	dsdt_line("Name (PICM, 0x00)");
+	dsdt_line("Method (_PIC, 1, NotSerialized)");
+	dsdt_line("{");
+	dsdt_line(" Store (Arg0, PICM)");
+	dsdt_line("}");
+	dsdt_line("");
+
+	dsdt_line("Scope (_SB)");
+	dsdt_line("{");
+	bus = 0;
+	while (bus < MAXBUSES) {
+		pci_bus_write_dsdt(bus);
+		bus++;
+	}
+	dsdt_line("}");
+
+	dsdt_unindent(1);
+}
+
+/*
+ * Return non-zero if per-bus state exists for 'bus', which must be in the
+ * range [0, MAXBUSES).
+ */
+int
+pci_bus_configured(int bus)
+{
+	assert(bus >= 0 && bus < MAXBUSES);
+	if (pci_businfo[bus] == NULL)
+		return (0);
+	return (1);
+}
+
+/*
+ * Return the cached MSI enable state of 'pi'.
+ */
+int
+pci_msi_enabled(struct pci_devinst *pi)
+{
+	int enabled;
+
+	enabled = pi->pi_msi.enabled;
+	return (enabled);
+}
+
+/*
+ * Return the number of MSI messages available to 'pi', or 0 while MSI is
+ * disabled.
+ */
+int
+pci_msi_maxmsgnum(struct pci_devinst *pi)
+{
+	return (pi->pi_msi.enabled ? pi->pi_msi.maxmsgnum : 0);
+}
+
+/*
+ * Return non-zero if MSI-X is enabled for 'pi' and MSI is not.
+ */
+int
+pci_msix_enabled(struct pci_devinst *pi)
+{
+	/* MSI being enabled takes MSI-X out of the picture. */
+	if (pi->pi_msi.enabled)
+		return (0);
+	return (pi->pi_msix.enabled != 0);
+}
+
+/*
+ * Deliver MSI-X vector 'index' of 'pi' to the guest.
+ *
+ * Nothing is delivered if MSI-X is not enabled, the function mask is set,
+ * 'index' is outside the MSI-X table, or the vector itself is masked.
+ */
+void
+pci_generate_msix(struct pci_devinst *pi, int index)
+{
+	struct msix_table_entry *mte;
+
+	if (!pci_msix_enabled(pi))
+		return;
+
+	if (pi->pi_msix.function_mask)
+		return;
+
+	/* Reject negative indices as well so the table is never misindexed. */
+	if (index < 0 || index >= pi->pi_msix.table_count)
+		return;
+
+	mte = &pi->pi_msix.table[index];
+	if ((mte->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
+		/* XXX Set PBA bit if interrupt is disabled */
+		vm_raise_msi(pi->pi_vmctx, mte->addr, mte->msg_data,
+		    pi->pi_bus, pi->pi_slot, pi->pi_func);
+	}
+}
+
+/*
+ * Deliver MSI message 'index' of 'pi' to the guest.
+ *
+ * Nothing is delivered if MSI is disabled or 'index' is not in the range
+ * [0, pci_msi_maxmsgnum(pi)).  The message data is the programmed data
+ * value plus 'index'.
+ */
+void
+pci_generate_msi(struct pci_devinst *pi, int index)
+{
+
+	if (!pci_msi_enabled(pi))
+		return;
+
+	/* A negative index would otherwise pass the upper-bound check. */
+	if (index < 0 || index >= pci_msi_maxmsgnum(pi))
+		return;
+
+	vm_raise_msi(pi->pi_vmctx, pi->pi_msi.addr,
+	    pi->pi_msi.msg_data + index,
+	    pi->pi_bus, pi->pi_slot, pi->pi_func);
+}
+
+/*
+ * Return true if 'pi' may currently signal its legacy INTx interrupt:
+ * neither MSI nor MSI-X is enabled and the command register does not have
+ * INTx disabled.
+ */
+static bool
+pci_lintr_permitted(struct pci_devinst *pi)
+{
+	uint16_t command;
+
+	command = pci_get_cfgdata16(pi, PCIR_COMMAND);
+	if (pi->pi_msi.enabled || pi->pi_msix.enabled)
+		return (false);
+	if ((command & PCIM_CMD_INTxDIS) != 0)
+		return (false);
+	return (true);
+}
+
+/*
+ * Reserve a legacy interrupt pin for 'pi'.
+ *
+ * The pin with the fewest users in the device's slot is chosen (the lowest
+ * numbered pin wins a tie).  Its use count is bumped and the 1-based pin
+ * number is stored in the device state and the INTPIN config register.
+ */
+void
+pci_lintr_request(struct pci_devinst *pi)
+{
+	struct businfo *bi;
+	struct slotinfo *si;
+	int candidate, chosen, lowest;
+
+	bi = pci_businfo[pi->pi_bus];
+	assert(bi != NULL);
+
+	/*
+	 * Only a pin is picked here.  IRQs are bound to pins later, when
+	 * interrupts are routed.
+	 */
+	si = &bi->slotinfo[pi->pi_slot];
+	chosen = 0;
+	lowest = si->si_intpins[0].ii_count;
+	for (candidate = 1; candidate < 4; candidate++) {
+		if (si->si_intpins[candidate].ii_count >= lowest)
+			continue;
+		chosen = candidate;
+		lowest = si->si_intpins[candidate].ii_count;
+	}
+
+	si->si_intpins[chosen].ii_count++;
+	pi->pi_lintr.pin = chosen + 1;
+	pci_set_cfgdata8(pi, PCIR_INTPIN, chosen + 1);
+}
+
+/*
+ * Route the legacy interrupt pin of 'pi', if it has one.
+ *
+ * The IRQ state shared by all users of the pin in this slot is routed via
+ * pci_irq_route(), copied into the device, and the resulting interrupt line
+ * is written to the INTLINE config register.
+ */
+static void
+pci_lintr_route(struct pci_devinst *pi)
+{
+	struct businfo *bi;
+	struct slotinfo *si;
+	struct pci_irq *irq;
+
+	/* A pin of 0 means no legacy interrupt was requested. */
+	if (pi->pi_lintr.pin == 0)
+		return;
+
+	bi = pci_businfo[pi->pi_bus];
+	assert(bi != NULL);
+
+	si = &bi->slotinfo[pi->pi_slot];
+	irq = &si->si_intpins[pi->pi_lintr.pin - 1].ii_irq;
+
+	pci_irq_route(pi, irq);
+	pi->pi_lintr.irq = *irq;
+	pci_set_cfgdata8(pi, PCIR_INTLINE, pci_irq_intline(irq));
+}
+
+/*
+ * Assert the legacy interrupt of 'pi'.
+ *
+ * From the IDLE state the interrupt is asserted right away when INTx is
+ * permitted, and otherwise remembered as PENDING.  In any other state the
+ * call has no effect.
+ */
+void
+pci_lintr_assert(struct pci_devinst *pi)
+{
+
+	assert(pi->pi_lintr.pin > 0);
+
+	pthread_mutex_lock(&pi->pi_lintr.lock);
+	if (pi->pi_lintr.state == IDLE) {
+		if (!pci_lintr_permitted(pi)) {
+			pi->pi_lintr.state = PENDING;
+		} else {
+			pi->pi_lintr.state = ASSERTED;
+			pci_irq_assert(pi);
+		}
+	}
+	pthread_mutex_unlock(&pi->pi_lintr.lock);
+}
+
+/*
+ * Deassert the legacy interrupt of 'pi'.
+ *
+ * An ASSERTED interrupt is lowered and a PENDING one is forgotten; both
+ * return to IDLE.  An IDLE interrupt is left alone.
+ */
+void
+pci_lintr_deassert(struct pci_devinst *pi)
+{
+
+	assert(pi->pi_lintr.pin > 0);
+
+	pthread_mutex_lock(&pi->pi_lintr.lock);
+	switch (pi->pi_lintr.state) {
+	case ASSERTED:
+		pi->pi_lintr.state = IDLE;
+		pci_irq_deassert(pi);
+		break;
+	case PENDING:
+		pi->pi_lintr.state = IDLE;
+		break;
+	default:
+		break;
+	}
+	pthread_mutex_unlock(&pi->pi_lintr.lock);
+}
+
+/*
+ * Re-evaluate the legacy interrupt of 'pi' after something that affects
+ * whether INTx is permitted has changed.
+ *
+ * An ASSERTED interrupt that is no longer permitted is lowered and parked
+ * as PENDING; a PENDING interrupt that has become permitted is asserted.
+ */
+static void
+pci_lintr_update(struct pci_devinst *pi)
+{
+
+	pthread_mutex_lock(&pi->pi_lintr.lock);
+	if (pi->pi_lintr.state == ASSERTED) {
+		if (!pci_lintr_permitted(pi)) {
+			pci_irq_deassert(pi);
+			pi->pi_lintr.state = PENDING;
+		}
+	} else if (pi->pi_lintr.state == PENDING) {
+		if (pci_lintr_permitted(pi)) {
+			pi->pi_lintr.state = ASSERTED;
+			pci_irq_assert(pi);
+		}
+	}
+	pthread_mutex_unlock(&pi->pi_lintr.lock);
+}
+
+/*
+ * Return the number of (slot, pin) pairs on 'bus' that have at least one
+ * user.  An unconfigured bus yields 0.
+ */
+int
+pci_count_lintr(int bus)
+{
+	struct businfo *bi;
+	struct slotinfo *si;
+	int pin, slot, total;
+
+	bi = pci_businfo[bus];
+	if (bi == NULL)
+		return (0);
+
+	total = 0;
+	for (slot = 0; slot < MAXSLOTS; slot++) {
+		si = &bi->slotinfo[slot];
+		for (pin = 0; pin < 4; pin++) {
+			if (si->si_intpins[pin].ii_count == 0)
+				continue;
+			total++;
+		}
+	}
+	return (total);
+}
+
+/*
+ * Invoke 'cb' for every (slot, pin) pair on 'bus' that has at least one
+ * user.  The callback receives the bus, the slot, the 1-based pin number,
+ * the pin's IRQ state and 'arg'.  An unconfigured bus is skipped.
+ */
+void
+pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg)
+{
+	struct businfo *bi;
+	struct intxinfo *pins;
+	int pin, slot;
+
+	bi = pci_businfo[bus];
+	if (bi == NULL)
+		return;
+
+	for (slot = 0; slot < MAXSLOTS; slot++) {
+		pins = bi->slotinfo[slot].si_intpins;
+		for (pin = 0; pin < 4; pin++) {
+			if (pins[pin].ii_count == 0)
+				continue;
+			cb(bus, slot, pin + 1, &pins[pin].ii_irq, arg);
+		}
+	}
+}
+
+/*
+ * Report whether more than one function is emulated in 'slot' of 'bus'.
+ * Returns 1 for a multi-function device and 0 otherwise, including when
+ * the bus is not configured.
+ */
+static int
+pci_emul_is_mfdev(int bus, int slot)
+{
+	struct businfo *bi;
+	struct funcinfo *funcs;
+	int f, present;
+
+	bi = pci_businfo[bus];
+	if (bi == NULL)
+		return (0);
+
+	funcs = bi->slotinfo[slot].si_funcs;
+	present = 0;
+	for (f = 0; f < MAXFUNCS; f++)
+		present += (funcs[f].fi_devi != NULL);
+
+	return (present > 1);
+}
+
+/*
+ * Force the PCIM_MFDEV bit in a config read result to reflect whether a
+ * multi-function device is being emulated in 'slot' of 'bus'.
+ *
+ * '*rv' holds the value read at offset 'off' with width 'bytes'; it is only
+ * touched when that access covers the header type register.  For 1- and
+ * 2-byte reads the bit is patched in the low byte; for 4-byte reads it is
+ * patched 16 bits higher.
+ */
+static void
+pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv)
+{
+	uint32_t mfbit;
+
+	/* Does this access include the header type byte at all? */
+	if (off > PCIR_HDRTYPE || off + bytes <= PCIR_HDRTYPE)
+		return;
+
+	if (bytes == 1 || bytes == 2)
+		mfbit = PCIM_MFDEV;
+	else if (bytes == 4)
+		mfbit = PCIM_MFDEV << 16;
+	else
+		return;
+
+	if (pci_emul_is_mfdev(bus, slot))
+		*rv |= mfbit;
+	else
+		*rv &= ~mfbit;
+}
+
+/*
+ * React to a change of the PCI command register.  'old' is the register
+ * value before the write; the new value is read from config space.
+ *
+ * When the I/O or memory decode enable bit flips, every BAR that decodes
+ * that address space is registered or unregistered to match.  A ROM BAR is
+ * skipped while the ROM itself is disabled.  Finally the legacy interrupt
+ * state is re-evaluated.
+ */
+void
+pci_emul_cmd_changed(struct pci_devinst *pi, uint16_t old)
+{
+	uint16_t cur, decode, flipped;
+	int bar;
+
+	cur = pci_get_cfgdata16(pi, PCIR_COMMAND);
+	flipped = cur ^ old;
+
+	for (bar = 0; bar <= PCI_BARMAX_WITH_ROM; bar++) {
+		/* Pick the enable bit that governs this BAR's decoding. */
+		switch (pi->pi_bar[bar].type) {
+		case PCIBAR_NONE:
+		case PCIBAR_MEMHI64:
+			continue;
+		case PCIBAR_IO:
+			decode = PCIM_CMD_PORTEN;
+			break;
+		case PCIBAR_ROM:
+			/* skip (un-)register of ROM if it disabled */
+			if (!romen(pi))
+				continue;
+			/* fallthrough */
+		case PCIBAR_MEM32:
+		case PCIBAR_MEM64:
+			decode = PCIM_CMD_MEMEN;
+			break;
+		default:
+			assert(0);
+			continue;
+		}
+
+		/* Nothing to do unless that enable bit actually changed. */
+		if ((flipped & decode) == 0)
+			continue;
+		if ((cur & decode) != 0)
+			register_bar(pi, bar);
+		else
+			unregister_bar(pi, bar);
+	}
+
+	/*
+	 * If INTx has been unmasked and is pending, assert the
+	 * interrupt.
+	 */
+	pci_lintr_update(pi);
+}
+
+/*
+ * Handle a guest write of 'new' ('bytes' wide) at offset 'coff' inside the
+ * command/status registers.
+ *
+ * Read-only bits keep their current value, the result is stored, and
+ * pci_emul_cmd_changed() is told what the command register held before.
+ */
+static void
+pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes)
+{
+	uint32_t current, prev_cmd, romask;
+
+	/* Remember the command register as it was before this write. */
+	prev_cmd = pci_get_cfgdata16(pi, PCIR_COMMAND);
+
+	/*
+	 * From PCI Local Bus Specification 3.0 sections 6.2.2 and 6.2.3.
+	 *
+	 * XXX Bits 8, 11, 12, 13, 14 and 15 in the status register are
+	 * 'write 1 to clear'. However these bits are not set to '1' by
+	 * any device emulation so it is simpler to treat them as readonly.
+	 *
+	 * The mask is shifted so that it lines up with the bytes being
+	 * written.
+	 */
+	romask = 0xFFFFF880 >> ((coff & 0x3) * 8);
+
+	current = CFGREAD(pi, coff, bytes);
+	new = (new & ~romask) | (current & romask);
+	CFGWRITE(pi, coff, new, bytes);
+
+	pci_emul_cmd_changed(pi, prev_cmd);
+}
+
+/*
+ * Perform one PCI config space access on behalf of the guest.
+ *
+ * 'in' is non-zero for a read and zero for a write.  'bus', 'slot' and
+ * 'func' select the device, 'coff' is the register offset and 'bytes' the
+ * access width.  For a read the result is stored in '*valp'; for a write
+ * '*valp' supplies the value.
+ *
+ * Reads that cannot be satisfied return all ones (all zeroes for the first
+ * dword past the standard config space); writes that cannot be satisfied
+ * are dropped.
+ */
+static void
+pci_cfgrw(int in, int bus, int slot, int func, int coff, int bytes,
+ uint32_t *valp)
+{
+ struct businfo *bi;
+ struct slotinfo *si;
+ struct pci_devinst *pi;
+ struct pci_devemu *pe;
+ int idx, needcfg;
+ uint64_t addr, bar, mask;
+
+ /* Look up the device instance, if any, at bus/slot/func. */
+ if ((bi = pci_businfo[bus]) != NULL) {
+ si = &bi->slotinfo[slot];
+ pi = si->si_funcs[func].fi_devi;
+ } else
+ pi = NULL;
+
+ /*
+ * Just return if there is no device at this slot:func or if the
+ * guest is doing an un-aligned access.
+ */
+ if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) ||
+ (coff & (bytes - 1)) != 0) {
+ if (in)
+ *valp = 0xffffffff;
+ return;
+ }
+
+ /*
+ * Ignore all writes beyond the standard config space and return all
+ * ones on reads.
+ */
+ if (coff >= PCI_REGMAX + 1) {
+ if (in) {
+ *valp = 0xffffffff;
+ /*
+ * Extended capabilities begin at offset 256 in config
+ * space. Absence of extended capabilities is signaled
+ * with all 0s in the extended capability header at
+ * offset 256.
+ */
+ if (coff <= PCI_REGMAX + 4)
+ *valp = 0x00000000;
+ }
+ return;
+ }
+
+ pe = pi->pi_d;
+
+ /*
+ * Config read
+ */
+ if (in) {
+ /* Let the device emulation override the default handler */
+ if (pe->pe_cfgread != NULL) {
+ needcfg = pe->pe_cfgread(pi, coff, bytes, valp);
+ } else {
+ needcfg = 1;
+ }
+
+ /* A non-zero 'needcfg' selects the default config space read. */
+ if (needcfg)
+ *valp = CFGREAD(pi, coff, bytes);
+
+ pci_emul_hdrtype_fixup(bus, slot, coff, bytes, valp);
+ } else {
+ /*
+ * Let the device emulation override the default handler; a
+ * return value of 0 from pe_cfgwrite ends the write here.
+ */
+ if (pe->pe_cfgwrite != NULL &&
+ (*pe->pe_cfgwrite)(pi, coff, bytes, *valp) == 0)
+ return;
+
+ /*
+ * Special handling for write to BAR and ROM registers
+ */
+ if (is_pcir_bar(coff) || is_pcir_bios(coff)) {
+ /*
+ * Ignore writes to BAR registers that are not
+ * 4-byte aligned.
+ */
+ if (bytes != 4 || (coff & 0x3) != 0)
+ return;
+
+ /* Map the register offset to an index into pi_bar[]. */
+ if (is_pcir_bar(coff)) {
+ idx = (coff - PCIR_BAR(0)) / 4;
+ } else if (is_pcir_bios(coff)) {
+ idx = PCI_ROM_IDX;
+ } else {
+ errx(4, "%s: invalid BAR offset %d", __func__,
+ coff);
+ }
+
+ /* Address bits below the BAR size are masked off. */
+ mask = ~(pi->pi_bar[idx].size - 1);
+ switch (pi->pi_bar[idx].type) {
+ case PCIBAR_NONE:
+ pi->pi_bar[idx].addr = bar = 0;
+ break;
+ case PCIBAR_IO:
+ addr = *valp & mask;
+#if defined(PCI_EMUL_IOMASK)
+ addr &= PCI_EMUL_IOMASK;
+#endif
+ bar = addr | pi->pi_bar[idx].lobits;
+ /*
+ * Register the new BAR value for interception
+ */
+ if (addr != pi->pi_bar[idx].addr) {
+ update_bar_address(pi, addr, idx,
+ PCIBAR_IO);
+ }
+ break;
+ case PCIBAR_MEM32:
+ addr = bar = *valp & mask;
+ bar |= pi->pi_bar[idx].lobits;
+ if (addr != pi->pi_bar[idx].addr) {
+ update_bar_address(pi, addr, idx,
+ PCIBAR_MEM32);
+ }
+ break;
+ case PCIBAR_MEM64:
+ /* Low dword: compare against the low 32 address bits. */
+ addr = bar = *valp & mask;
+ bar |= pi->pi_bar[idx].lobits;
+ if (addr != (uint32_t)pi->pi_bar[idx].addr) {
+ update_bar_address(pi, addr, idx,
+ PCIBAR_MEM64);
+ }
+ break;
+ case PCIBAR_MEMHI64:
+ /* High dword: size and address live in the previous BAR. */
+ mask = ~(pi->pi_bar[idx - 1].size - 1);
+ addr = ((uint64_t)*valp << 32) & mask;
+ bar = addr >> 32;
+ if (bar != pi->pi_bar[idx - 1].addr >> 32) {
+ update_bar_address(pi, addr, idx - 1,
+ PCIBAR_MEMHI64);
+ }
+ break;
+ case PCIBAR_ROM:
+ addr = bar = *valp & mask;
+ /* Unregister under the old address/enable state ... */
+ if (memen(pi) && romen(pi)) {
+ unregister_bar(pi, idx);
+ }
+ pi->pi_bar[idx].addr = addr;
+ pi->pi_bar[idx].lobits = *valp &
+ PCIM_BIOS_ENABLE;
+ /* romen could have changed it value */
+ if (memen(pi) && romen(pi)) {
+ register_bar(pi, idx);
+ }
+ bar |= pi->pi_bar[idx].lobits;
+ break;
+ default:
+ assert(0);
+ }
+ /* Store the value the guest reads back from the BAR. */
+ pci_set_cfgdata32(pi, coff, bar);
+
+ } else if (pci_emul_iscap(pi, coff)) {
+ /* capoff 0 makes pci_emul_capwrite() locate the capability. */
+ pci_emul_capwrite(pi, coff, bytes, *valp, 0, 0);
+ } else if (coff >= PCIR_COMMAND && coff < PCIR_REVID) {
+ pci_emul_cmdsts_write(pi, coff, *valp, bytes);
+ } else {
+ CFGWRITE(pi, coff, *valp, bytes);
+ }
+ }
+}
+
+#ifdef __amd64__
+/* Last value written to the config address port, split into its fields. */
+static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff;
+
+/*
+ * I/O port handler for the PCI config address port.
+ *
+ * Only 4-byte accesses are meaningful: a write latches the enable bit and
+ * the bus/slot/function/register fields, a read returns them reassembled.
+ * Narrower reads return all ones of the access width and narrower writes
+ * are ignored.  Always returns 0.
+ */
+static int
+pci_emul_cfgaddr(struct vmctx *ctx __unused, int in,
+    int port __unused, int bytes, uint32_t *eax, void *arg __unused)
+{
+	uint32_t reg;
+
+	if (bytes != 4) {
+		if (in)
+			*eax = (bytes == 2) ? 0xffff : 0xff;
+		return (0);
+	}
+
+	if (!in) {
+		/* Latch the fields; the register offset is dword aligned. */
+		reg = *eax;
+		cfgenable = (reg & CONF1_ENABLE) == CONF1_ENABLE;
+		cfgoff = (reg & PCI_REGMAX) & ~0x03;
+		cfgfunc = (reg >> 8) & PCI_FUNCMAX;
+		cfgslot = (reg >> 11) & PCI_SLOTMAX;
+		cfgbus = (reg >> 16) & PCI_BUSMAX;
+		return (0);
+	}
+
+	reg = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff;
+	if (cfgenable)
+		reg |= CONF1_ENABLE;
+	*eax = reg;
+
+	return (0);
+}
+INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr);
+
+/*
+ * I/O port handler for the PCI config data ports.
+ *
+ * The register offset is the latched offset plus the distance of 'port'
+ * from the first data port.  While config access is disabled, reads return
+ * all ones and writes are ignored.  Always returns 0.
+ */
+static int
+pci_emul_cfgdata(struct vmctx *ctx __unused, int in, int port,
+    int bytes, uint32_t *eax, void *arg __unused)
+{
+	int coff;
+
+	assert(bytes == 1 || bytes == 2 || bytes == 4);
+
+	coff = cfgoff + (port - CONF1_DATA_PORT);
+	if (!cfgenable) {
+		/* Ignore accesses to cfgdata if not enabled by cfgaddr */
+		if (in)
+			*eax = 0xffffffff;
+		return (0);
+	}
+
+	pci_cfgrw(in, cfgbus, cfgslot, cfgfunc, coff, bytes, eax);
+	return (0);
+}
+
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata);
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata);
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata);
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata);
+#endif
+
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Saves/restores PCI device emulated state. Returns 0 on success.
+ *
+ * The device instance is taken from 'meta->dev_data'.  Fields are processed
+ * in a fixed order: MSI state, MSI-X state, the raw config space, the
+ * type/size/address of every BAR and finally the MSI-X table entries.
+ * Each SNAPSHOT_*_OR_LEAVE step is handed 'ret' and the 'done' label.
+ */
+static int
+pci_snapshot_pci_dev(struct vm_snapshot_meta *meta)
+{
+ struct pci_devinst *pi;
+ int i;
+ int ret;
+
+ pi = meta->dev_data;
+
+ /* MSI state */
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.enabled, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.addr, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.msg_data, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.maxmsgnum, meta, ret, done);
+
+ /* MSI-X state (the table itself follows further down) */
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.enabled, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_bar, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_bar, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_offset, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_count, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_offset, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_size, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.function_mask, meta, ret, done);
+
+ /* Entire config space image */
+ SNAPSHOT_BUF_OR_LEAVE(pi->pi_cfgdata, sizeof(pi->pi_cfgdata),
+ meta, ret, done);
+
+ /* Per-BAR bookkeeping */
+ for (i = 0; i < (int)nitems(pi->pi_bar); i++) {
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].type, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].size, meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].addr, meta, ret, done);
+ }
+
+ /*
+ * MSI-X table entries.  The loop bound is the table_count value
+ * processed above.
+ */
+ for (i = 0; i < pi->pi_msix.table_count; i++) {
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].addr,
+ meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].msg_data,
+ meta, ret, done);
+ SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].vector_control,
+ meta, ret, done);
+ }
+
+done:
+ return (ret);
+}
+
+/*
+ * Snapshot a PCI device: first the generic PCI state, then the device
+ * specific state through the emulation's pe_snapshot hook.
+ *
+ * Returns ENOTSUP if the emulation has no pe_snapshot hook, otherwise 0 on
+ * success or the first error encountered.
+ */
+int
+pci_snapshot(struct vm_snapshot_meta *meta)
+{
+	struct pci_devinst *pdi;
+	struct pci_devemu *pde;
+	int error;
+
+	assert(meta->dev_name != NULL);
+
+	pdi = meta->dev_data;
+	pde = pdi->pi_d;
+	if (pde->pe_snapshot == NULL)
+		return (ENOTSUP);
+
+	error = pci_snapshot_pci_dev(meta);
+	if (error != 0)
+		return (error);
+
+	return ((*pde->pe_snapshot)(meta));
+}
+
+/*
+ * Pause the device 'pdi' through its emulation's pe_pause hook.
+ * Returns the hook's result, or 0 when the emulation provides no hook.
+ */
+int
+pci_pause(struct pci_devinst *pdi)
+{
+	struct pci_devemu *pde;
+
+	pde = pdi->pi_d;
+
+	/* The pause/resume functionality is optional. */
+	if (pde->pe_pause != NULL)
+		return ((*pde->pe_pause)(pdi));
+
+	return (0);
+}
+
+/*
+ * Resume the device 'pdi' through its emulation's pe_resume hook.
+ * Returns the hook's result, or 0 when the emulation provides no hook.
+ */
+int
+pci_resume(struct pci_devinst *pdi)
+{
+	struct pci_devemu *pde;
+
+	pde = pdi->pi_d;
+
+	/* The pause/resume functionality is optional. */
+	if (pde->pe_resume != NULL)
+		return ((*pde->pe_resume)(pdi));
+
+	return (0);
+}
+#endif
+
+#define PCI_EMUL_TEST
+#ifdef PCI_EMUL_TEST
+/*
+ * Define a dummy test device
+ */
+/* Size in bytes of the test device's I/O register block. */
+#define DIOSZ 8
+/* Size in bytes of each of the test device's memory register blocks. */
+#define DMEMSZ 4096
+/* Per-instance state of the test device: plain backing store for its BARs. */
+struct pci_emul_dsoftc {
+ /* Backing store for the I/O BAR (BAR 0). */
+ uint8_t ioregs[DIOSZ];
+ /* Backing store for the two memory BARs (BAR 1 and BAR 2). */
+ uint8_t memregs[2][DMEMSZ];
+};
+
+/* Number of MSI messages requested when the test device is initialized. */
+#define PCI_EMUL_MSI_MSGS 4
+/* NOTE(review): not referenced in the visible part of the file. */
+#define PCI_EMUL_MSIX_MSGS 16
+
+/*
+ * Initialize an instance of the dummy test device.
+ *
+ * Allocates the per-instance state, fills in the device, vendor and class
+ * IDs, adds an MSI capability with PCI_EMUL_MSI_MSGS messages and requests
+ * one I/O BAR (BAR 0) and two 32-bit memory BARs (BAR 1 and BAR 2).
+ *
+ * Returns 0 on success or ENOMEM if the instance state cannot be allocated.
+ */
+static int
+pci_emul_dinit(struct pci_devinst *pi, nvlist_t *nvl __unused)
+{
+	int error;
+	struct pci_emul_dsoftc *sc;
+
+	sc = calloc(1, sizeof(struct pci_emul_dsoftc));
+	if (sc == NULL)
+		return (ENOMEM);
+
+	pi->pi_arg = sc;
+
+	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD);
+	pci_set_cfgdata8(pi, PCIR_CLASS, 0x02);
+
+	error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS);
+	assert(error == 0);
+
+	error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ);
+	assert(error == 0);
+
+	error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ);
+	assert(error == 0);
+
+	error = pci_emul_alloc_bar(pi, 2, PCIBAR_MEM32, DMEMSZ);
+	assert(error == 0);
+
+	return (0);
+}
+
+/*
+ * pe_barwrite callback of the dummy test device.
+ *
+ * BAR 0 writes (1, 2 or 4 bytes) land in sc->ioregs; BAR 1 and BAR 2
+ * writes (1, 2, 4 or 8 bytes) land in sc->memregs[baridx - 1].  Accesses
+ * that do not fit inside the backing array are logged and dropped;
+ * unsupported sizes and unknown BAR indices are logged.
+ *
+ * Two BAR 0 side effects exist for testing interrupts:
+ *  - a 4-byte write at offset 4 while MSI is enabled raises MSI number
+ *    'value % pci_msi_maxmsgnum(pi)';
+ *  - any write whose value is 0xabcdef raises every MSI from 0 up to
+ *    pci_msi_maxmsgnum(pi) - 1.
+ */
+static void
+pci_emul_diow(struct pci_devinst *pi, int baridx, uint64_t offset, int size,
+    uint64_t value)
+{
+	int i;
+	struct pci_emul_dsoftc *sc = pi->pi_arg;
+
+	if (baridx == 0) {
+		/*
+		 * Written so that it cannot wrap: 'offset + size' is
+		 * evaluated in uint64_t and would overflow for a huge
+		 * offset or a negative size, letting the access through.
+		 */
+		if (size < 0 || offset > DIOSZ ||
+		    (uint64_t)size > DIOSZ - offset) {
+			printf("diow: iow too large, offset %ld size %d\n",
+			    offset, size);
+			return;
+		}
+
+		if (size == 1) {
+			sc->ioregs[offset] = value & 0xff;
+		} else if (size == 2) {
+			*(uint16_t *)&sc->ioregs[offset] = value & 0xffff;
+		} else if (size == 4) {
+			*(uint32_t *)&sc->ioregs[offset] = value;
+		} else {
+			printf("diow: iow unknown size %d\n", size);
+		}
+
+		/*
+		 * Special magic value to generate an interrupt
+		 */
+		if (offset == 4 && size == 4 && pci_msi_enabled(pi))
+			pci_generate_msi(pi, value % pci_msi_maxmsgnum(pi));
+
+		if (value == 0xabcdef) {
+			for (i = 0; i < pci_msi_maxmsgnum(pi); i++)
+				pci_generate_msi(pi, i);
+		}
+	}
+
+	if (baridx == 1 || baridx == 2) {
+		/* Same wrap-free bounds check as for the I/O BAR. */
+		if (size < 0 || offset > DMEMSZ ||
+		    (uint64_t)size > DMEMSZ - offset) {
+			printf("diow: memw too large, offset %ld size %d\n",
+			    offset, size);
+			return;
+		}
+
+		i = baridx - 1;		/* 'memregs' index */
+
+		if (size == 1) {
+			sc->memregs[i][offset] = value;
+		} else if (size == 2) {
+			*(uint16_t *)&sc->memregs[i][offset] = value;
+		} else if (size == 4) {
+			*(uint32_t *)&sc->memregs[i][offset] = value;
+		} else if (size == 8) {
+			*(uint64_t *)&sc->memregs[i][offset] = value;
+		} else {
+			printf("diow: memw unknown size %d\n", size);
+		}
+
+		/*
+		 * magic interrupt ??
+		 */
+	}
+
+	if (baridx > 2 || baridx < 0) {
+		printf("diow: unknown bar idx %d\n", baridx);
+	}
+}
+
+/*
+ * pe_barread callback of the dummy test device.
+ *
+ * BAR 0 reads (1, 2 or 4 bytes) are served from sc->ioregs; BAR 1 and
+ * BAR 2 reads (1, 2, 4 or 8 bytes) are served from
+ * sc->memregs[baridx - 1].
+ *
+ * Returns the value read, zero-extended to 64 bits.  Accesses that do not
+ * fit inside the backing array, unsupported sizes and unknown BAR indices
+ * are logged and read as 0.
+ */
+static uint64_t
+pci_emul_dior(struct pci_devinst *pi, int baridx, uint64_t offset, int size)
+{
+	struct pci_emul_dsoftc *sc = pi->pi_arg;
+	uint64_t value;
+	int i;
+
+	/*
+	 * 'value' is 64 bits wide so that 8-byte reads are not truncated,
+	 * and it is zeroed up front so that every path that falls through
+	 * to the final return (e.g. an unsupported size) yields 0 instead
+	 * of an uninitialised value.
+	 */
+	value = 0;
+
+	if (baridx == 0) {
+		/*
+		 * Written so that it cannot wrap: 'offset + size' is
+		 * evaluated in uint64_t and would overflow for a huge
+		 * offset or a negative size, letting the access through.
+		 */
+		if (size < 0 || offset > DIOSZ ||
+		    (uint64_t)size > DIOSZ - offset) {
+			printf("dior: ior too large, offset %ld size %d\n",
+			    offset, size);
+			return (0);
+		}
+
+		if (size == 1) {
+			value = sc->ioregs[offset];
+		} else if (size == 2) {
+			value = *(uint16_t *)&sc->ioregs[offset];
+		} else if (size == 4) {
+			value = *(uint32_t *)&sc->ioregs[offset];
+		} else {
+			printf("dior: ior unknown size %d\n", size);
+		}
+	}
+
+	if (baridx == 1 || baridx == 2) {
+		/* Same wrap-free bounds check as for the I/O BAR. */
+		if (size < 0 || offset > DMEMSZ ||
+		    (uint64_t)size > DMEMSZ - offset) {
+			printf("dior: memr too large, offset %ld size %d\n",
+			    offset, size);
+			return (0);
+		}
+
+		i = baridx - 1;		/* 'memregs' index */
+
+		if (size == 1) {
+			value = sc->memregs[i][offset];
+		} else if (size == 2) {
+			value = *(uint16_t *)&sc->memregs[i][offset];
+		} else if (size == 4) {
+			value = *(uint32_t *)&sc->memregs[i][offset];
+		} else if (size == 8) {
+			value = *(uint64_t *)&sc->memregs[i][offset];
+		} else {
+			printf("dior: memr unknown size %d\n", size);
+		}
+	}
+
+	if (baridx > 2 || baridx < 0) {
+		printf("dior: unknown bar idx %d\n", baridx);
+		return (0);
+	}
+
+	return (value);
+}
+
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Iterate over all emulated PCI device instances in bus/slot/function
+ * order.
+ *
+ * Pass NULL to obtain the first device, or a previously returned device
+ * to obtain the one following it.  Returns NULL once there are no more
+ * devices.
+ */
+struct pci_devinst *
+pci_next(const struct pci_devinst *cursor)
+{
+	unsigned bus, slot, func;
+	struct businfo *bi;
+	struct slotinfo *si;
+	struct funcinfo *fi;
+
+	if (cursor == NULL) {
+		bus = slot = func = 0;
+	} else {
+		/*
+		 * Resume right after the cursor.  Carry an overflowing
+		 * function number into the slot (and an overflowing slot
+		 * into the bus) here; merely wrapping 'func' back to 0
+		 * would rescan the cursor's own slot and hand back a
+		 * device that was already visited.
+		 */
+		bus = cursor->pi_bus;
+		slot = cursor->pi_slot;
+		func = cursor->pi_func + 1;
+		if (func >= MAXFUNCS) {
+			func = 0;
+			slot++;
+		}
+		if (slot >= MAXSLOTS) {
+			slot = 0;
+			bus++;
+		}
+	}
+
+	/* The lower-order indices restart at 0 whenever a level advances. */
+	for (; bus < MAXBUSES; bus++, slot = 0, func = 0) {
+		if ((bi = pci_businfo[bus]) == NULL)
+			continue;
+
+		for (; slot < MAXSLOTS; slot++, func = 0) {
+			si = &bi->slotinfo[slot];
+			for (; func < MAXFUNCS; func++) {
+				fi = &si->si_funcs[func];
+				if (fi->fi_devi != NULL)
+					return (fi->fi_devi);
+			}
+		}
+	}
+
+	return (NULL);
+}
+
+/*
+ * pe_snapshot callback of the dummy test device.  It ignores 'meta',
+ * saves or restores nothing and always returns 0.
+ */
+static int
+pci_emul_snapshot(struct vm_snapshot_meta *meta __unused)
+{
+ return (0);
+}
+#endif
+
+/*
+ * Device model descriptor for the dummy test device.  It is selected by
+ * the emulation name "dummy" and wires up the init and BAR read/write
+ * handlers defined above; the snapshot handler is only present when
+ * BHYVE_SNAPSHOT is defined.
+ */
+static const struct pci_devemu pci_dummy = {
+ .pe_emu = "dummy",
+ .pe_init = pci_emul_dinit,
+ .pe_barwrite = pci_emul_diow,
+ .pe_barread = pci_emul_dior,
+#ifdef BHYVE_SNAPSHOT
+ .pe_snapshot = pci_emul_snapshot,
+#endif
+};
+/* Hand the descriptor to PCI_EMUL_SET to make the device model available. */
+PCI_EMUL_SET(pci_dummy);
+
+#endif /* PCI_EMUL_TEST */