Diffstat (limited to 'sys/amd64')
| -rw-r--r-- | sys/amd64/amd64/apic_vector.S | 11 |
| -rw-r--r-- | sys/amd64/amd64/genassym.c | 12 |
| -rw-r--r-- | sys/amd64/amd64/kexec_support.c | 300 |
| -rw-r--r-- | sys/amd64/amd64/kexec_tramp.S | 91 |
| -rw-r--r-- | sys/amd64/amd64/mp_machdep.c | 13 |
| -rw-r--r-- | sys/amd64/amd64/trap.c | 4 |
| -rw-r--r-- | sys/amd64/conf/GENERIC | 1 |
| -rw-r--r-- | sys/amd64/conf/MINIMAL | 1 |
| -rw-r--r-- | sys/amd64/include/kexec.h | 41 |
| -rw-r--r-- | sys/amd64/include/smp.h | 1 |
| -rw-r--r-- | sys/amd64/pt/pt.c | 221 |
| -rw-r--r-- | sys/amd64/sgx/sgx_linux.c | 11 |
| -rw-r--r-- | sys/amd64/vmm/vmm.c | 10 |
| -rw-r--r-- | sys/amd64/vmm/vmm_dev_machdep.c | 255 |
| -rw-r--r-- | sys/amd64/vmm/vmm_mem.h | 5 |
| -rw-r--r-- | sys/amd64/vmm/vmm_mem_machdep.c | 61 |
16 files changed, 797 insertions, 241 deletions
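The bulk of this change is the new amd64 kexec support below, whose kexec_generate_page_tables() walks the 4-level page-table hierarchy with pmap_pml4e_index() and friends. As a reading aid for that loop, here is a minimal, self-contained sketch of the same index math, assuming the standard amd64 4-level (non-LA57) layout; the constants mirror sys/amd64/include/pmap.h but are redefined locally, so this is an illustration rather than the kernel's code.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Standard amd64 4-level paging: 9 index bits per level, 4 KiB pages. */
#define PAGE_SHIFT	12
#define NPTEPGSHIFT	9
#define PDRSHIFT	(PAGE_SHIFT + NPTEPGSHIFT)	/* 21: 2 MiB per PDE */
#define PDPSHIFT	(PDRSHIFT + NPTEPGSHIFT)	/* 30: 1 GiB per PDPE */
#define PML4SHIFT	(PDPSHIFT + NPTEPGSHIFT)	/* 39: 512 GiB per PML4E */
#define IDXMASK		0x1ffULL			/* 9 bits per level */

int
main(void)
{
	uint64_t va = 0xffffffff80201000ULL;	/* arbitrary kernel VA */

	/* Same decomposition as pmap_pml4e_index() .. pmap_pte_index(). */
	printf("PML4 %" PRIu64 " PDPT %" PRIu64 " PD %" PRIu64 " PT %" PRIu64 "\n",
	    (va >> PML4SHIFT) & IDXMASK,
	    (va >> PDPSHIFT) & IDXMASK,
	    (va >> PDRSHIFT) & IDXMASK,
	    (va >> PAGE_SHIFT) & IDXMASK);
	return (0);
}

In kexec_generate_page_tables() the j, k, and l indices reset to zero whenever a higher-level index advances, which is exactly the wrap behavior the 9-bit masks above imply.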
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index e98bae9eb6c5..8691387a5a8e 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -204,6 +204,17 @@ IDTVEC(spuriousint) jmp doreti /* + * Executed by a CPU when it receives an IPI_OFF from another CPU. + * Should never return + */ + INTR_HANDLER cpuoff + KMSAN_ENTER + call cpuoff_handler + call as_lapic_eoi + KMSAN_LEAVE + jmp doreti + +/* * Executed by a CPU when it receives an IPI_SWI. */ INTR_HANDLER ipi_swi diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c index eb1b746f5893..2716784ee871 100644 --- a/sys/amd64/amd64/genassym.c +++ b/sys/amd64/amd64/genassym.c @@ -57,6 +57,7 @@ #include <vm/vm_param.h> #include <vm/pmap.h> #include <vm/vm_map.h> +#include <sys/kexec.h> #include <sys/proc.h> #include <x86/apicreg.h> #include <machine/cpu.h> @@ -65,6 +66,7 @@ #include <machine/proc.h> #include <machine/segments.h> #include <machine/efi.h> +#include <machine/kexec.h> ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); @@ -295,3 +297,13 @@ ASSYM(EC_R13, offsetof(struct efirt_callinfo, ec_r13)); ASSYM(EC_R14, offsetof(struct efirt_callinfo, ec_r14)); ASSYM(EC_R15, offsetof(struct efirt_callinfo, ec_r15)); ASSYM(EC_RFLAGS, offsetof(struct efirt_callinfo, ec_rflags)); + +/* Kexec */ +ASSYM(KEXEC_ENTRY, offsetof(struct kexec_image, entry)); +ASSYM(KEXEC_SEGMENTS, offsetof(struct kexec_image, segments)); +ASSYM(KEXEC_SEGMENT_MAX, KEXEC_SEGMENT_MAX); +ASSYM(KEXEC_IMAGE_SIZE, sizeof(struct kexec_image)); +ASSYM(KEXEC_STAGED_SEGMENT_SIZE, sizeof(struct kexec_segment_stage)); +ASSYM(KEXEC_SEGMENT_SIZE, offsetof(struct kexec_segment_stage, size)); +ASSYM(KEXEC_SEGMENT_MAP, offsetof(struct kexec_segment_stage, map_buf)); +ASSYM(KEXEC_SEGMENT_TARGET, offsetof(struct kexec_segment_stage, target)); diff --git a/sys/amd64/amd64/kexec_support.c b/sys/amd64/amd64/kexec_support.c new file mode 100644 index 000000000000..8189a48e9ae9 --- /dev/null +++ b/sys/amd64/amd64/kexec_support.c @@ -0,0 +1,300 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/conf.h> +#include <sys/interrupt.h> +#include <sys/kernel.h> +#include <sys/kexec.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_object.h> +#include <vm/vm_phys.h> +#include <vm/pmap.h> +#include <vm/vm_page.h> +#include <vm/vm_radix.h> + +#include <machine/intr_machdep.h> +#include <machine/kexec.h> +#include <machine/md_var.h> +#include <machine/pmap.h> +#include <x86/apicvar.h> + +/* + * Idea behind this: + * + * kexec_load_md(): + * - Update boot page tables (identity map) to include all pages needed before + * disabling MMU. + * + * kexec_reboot_md(): + * - Copy pages into target(s) + * - Do "other stuff" + * - Does not return + */ + +/* + * do_pte: Create PTE entries (4k pages). If false, create 2MB superpages. + * identity: This is for an identity map, treat `start` as a physical address. + * Only valid here if do_pte is false. + */ +static void +kexec_generate_page_tables(pml4_entry_t *root, vm_offset_t start, + vm_size_t size, bool do_pte, bool identity, struct pctrie_iter *pages) +{ + vm_paddr_t mpa; + vm_offset_t pg; + vm_size_t stride = do_pte ? PAGE_SIZE : NBPDR; + vm_page_t m; + vm_pindex_t i, j, k, l; + + pg = start & ~(stride - 1); + i = pmap_pml4e_index(pg); + j = pmap_pdpe_index(pg); + k = pmap_pde_index(pg); + l = pmap_pte_index(pg); + for (; pg < start + size; i++, j = 0, k = 0, l = 0) { + /* + * Walk linearly, as above, but one fell swoop, one page at a + * time. + */ + if (root[i] == 0) { + m = vm_radix_iter_next(pages); + mpa = VM_PAGE_TO_PHYS(m); + root[i] = mpa | PG_RW | PG_V; + } + pdp_entry_t *pdp = + (pdp_entry_t *)(PHYS_TO_DMAP(root[i] & PG_FRAME)); + for (; j < NPDPEPG && pg < start + size; j++, k = 0, l = 0) { + if (pdp[j] == 0) { + m = vm_radix_iter_next(pages); + mpa = VM_PAGE_TO_PHYS(m); + pdp[j] = mpa | PG_RW | PG_V; + } + pd_entry_t *pde = + (pd_entry_t *)(PHYS_TO_DMAP(pdp[j] & PG_FRAME)); + for (; k < NPDEPG && pg < start + size; k++, l = 0) { + if (pde[k] == 0) { + if (!do_pte) { + pde[k] = + (identity ? pg : pmap_kextract(pg)) | + PG_RW | PG_PS | PG_V; + pg += NBPDR; + continue; + } + m = vm_radix_iter_next(pages); + mpa = VM_PAGE_TO_PHYS(m); + pde[k] = mpa | PG_V | PG_RW; + } else if ((pde[k] & PG_PS) != 0) { + pg += NBPDR; + continue; + } + /* Populate the PTEs. */ + for (; l < NPTEPG && pg < start + size; + l++, pg += PAGE_SIZE) { + pt_entry_t *pte = + (pt_entry_t *)PHYS_TO_DMAP(pde[pmap_pde_index(pg)] & PG_FRAME); + pte[pmap_pte_index(pg)] = + pmap_kextract(pg) | PG_RW | PG_V; + } + } + } + } +} + +void +kexec_reboot_md(struct kexec_image *image) +{ + void (*kexec_do_tramp)(void) = image->md_image; + + intr_disable_all(); + lapic_disable(); + kexec_do_reboot_trampoline(VM_PAGE_TO_PHYS(image->first_md_page), + kexec_do_tramp); + + for (;;) + ; +} + +int +kexec_load_md(struct kexec_image *image) +{ + struct pctrie_iter pct_iter; + pml4_entry_t *PT4; + pdp_entry_t *PDP_l; + pd_entry_t *PD_l0; + vm_offset_t va; + int i; + + /* + * Start building the page table. + * First part of the page table is standard for all. + */ + vm_offset_t pa_pdp_l, pa_pd_l0, pa_pd_l1, pa_pd_l2, pa_pd_l3; + vm_page_t m; + + if (la57) + return (EINVAL); + + vm_radix_iter_init(&pct_iter, &image->map_obj->rtree); + /* Working in linear space in the mapped space, `va` is our tracker. 
*/ + m = vm_radix_iter_lookup(&pct_iter, image->first_md_page->pindex); + va = (vm_offset_t)image->map_addr + ptoa(m->pindex); + /* We'll find a place for these later */ + PT4 = (void *)va; + va += PAGE_SIZE; + m = vm_radix_iter_next(&pct_iter); + pa_pdp_l = VM_PAGE_TO_PHYS(m); + PDP_l = (void *)va; + va += PAGE_SIZE; + m = vm_radix_iter_next(&pct_iter); + pa_pd_l0 = VM_PAGE_TO_PHYS(m); + PD_l0 = (void *)va; + va += PAGE_SIZE; + m = vm_radix_iter_next(&pct_iter); + pa_pd_l1 = VM_PAGE_TO_PHYS(m); + m = vm_radix_iter_next(&pct_iter); + pa_pd_l2 = VM_PAGE_TO_PHYS(m); + m = vm_radix_iter_next(&pct_iter); + pa_pd_l3 = VM_PAGE_TO_PHYS(m); + m = vm_radix_iter_next(&pct_iter); + + /* 1:1 mapping of lower 4G */ + PT4[0] = (pml4_entry_t)pa_pdp_l | PG_V | PG_RW; + PDP_l[0] = (pdp_entry_t)pa_pd_l0 | PG_V | PG_RW; + PDP_l[1] = (pdp_entry_t)pa_pd_l1 | PG_V | PG_RW; + PDP_l[2] = (pdp_entry_t)pa_pd_l2 | PG_V | PG_RW; + PDP_l[3] = (pdp_entry_t)pa_pd_l3 | PG_V | PG_RW; + for (i = 0; i < 4 * NPDEPG; i++) { /* we overflow PD_l0 into _l1, etc */ + PD_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V | + PG_RW | PG_PS; + } + + /* Map the target(s) in 2MB chunks. */ + for (i = 0; i < KEXEC_SEGMENT_MAX; i++) { + struct kexec_segment_stage *s = &image->segments[i]; + + if (s->size == 0) + break; + kexec_generate_page_tables(PT4, s->target, s->size, false, + true, &pct_iter); + } + /* Now create the source page tables */ + kexec_generate_page_tables(PT4, image->map_addr, image->map_size, true, + false, &pct_iter); + kexec_generate_page_tables(PT4, + trunc_page((vm_offset_t)kexec_do_reboot_trampoline), + PAGE_SIZE, true, false, &pct_iter); + KASSERT(m != NULL, ("kexec_load_md: Missing trampoline page!\n")); + + /* MD control pages start at this next page. */ + image->md_image = (void *)(image->map_addr + ptoa(m->pindex)); + bcopy(kexec_do_reboot, image->md_image, kexec_do_reboot_size); + + /* Save the image into the MD page(s) right after the trampoline */ + bcopy(image, (void *)((vm_offset_t)image->md_image + + (vm_offset_t)&kexec_saved_image - (vm_offset_t)&kexec_do_reboot), + sizeof(*image)); + + return (0); +} + +/* + * Required pages: + * - L4 (1) (root) + * - L3 (PDPE) - 2 (bottom 512GB, bottom 4 used, top range for kernel map) + * - L2 (PDP) - 5 (2MB superpage mappings, 1GB each, for bottom 4GB, top 1) + * - L1 (PDR) - 1 (kexec trampoline page, first MD page) + * - kexec_do_reboot trampoline - 1 + * - Slop pages for staging (in case it's not aligned nicely) - 3 (worst case) + * + * Minimum 9 pages for the direct map. + */ +int +kexec_md_pages(struct kexec_segment *seg_in) +{ + struct kexec_segment *segs = seg_in; + vm_size_t pages = 13; /* Minimum number of starting pages */ + vm_paddr_t cur_addr = (1UL << 32) - 1; /* Bottom 4G will be identity mapped in full */ + vm_size_t source_total = 0; + + for (int i = 0; i < KEXEC_SEGMENT_MAX; i++) { + vm_offset_t start, end; + if (segs[i].memsz == 0) + break; + + end = round_2mpage((vm_offset_t)segs[i].mem + segs[i].memsz); + start = trunc_2mpage((vm_offset_t)segs[i].mem); + start = max(start, cur_addr + 1); + /* + * Round to cover the full range of page table pages for each + * segment. + */ + source_total += round_2mpage(end - start); + + /* + * Bottom 4GB are identity mapped already in the count, so skip + * any segments that end up there, this will short-circuit that. 
+ */ + if (end <= cur_addr + 1) + continue; + + if (pmap_pml4e_index(end) != pmap_pml4e_index(cur_addr)) { + /* Need a new 512GB mapping page */ + pages++; + pages += howmany(end - (start & ~PML4MASK), NBPML4); + pages += howmany(end - (start & ~PDPMASK), NBPDP); + pages += howmany(end - (start & ~PDRMASK), NBPDR); + + } else if (pmap_pdpe_index(end) != pmap_pdpe_index(cur_addr)) { + pages++; + pages += howmany(end - (start & ~PDPMASK), NBPDP) - 1; + pages += howmany(end - (start & ~PDRMASK), NBPDR); + } + + } + /* Be pessimistic when totaling up source pages. We likely + * can't use superpages, so need to map each page individually. + */ + pages += howmany(source_total, NBPDR); + pages += howmany(source_total, NBPDP); + pages += howmany(source_total, NBPML4); + + /* + * Be intentionally sloppy adding in the extra page table pages. It's + * better to go over than under. + */ + pages += howmany(pages * PAGE_SIZE, NBPDR); + pages += howmany(pages * PAGE_SIZE, NBPDP); + pages += howmany(pages * PAGE_SIZE, NBPML4); + + /* Add in the trampoline pages */ + pages += howmany(kexec_do_reboot_size, PAGE_SIZE); + + return (pages); +} diff --git a/sys/amd64/amd64/kexec_tramp.S b/sys/amd64/amd64/kexec_tramp.S new file mode 100644 index 000000000000..6a2de676bc35 --- /dev/null +++ b/sys/amd64/amd64/kexec_tramp.S @@ -0,0 +1,91 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <machine/asmacros.h> +#include <machine/specialreg.h> +#include "assym.inc" + +/* + * Take a pointer to the image, copy each segment, and jump to the trampoline. + * + * Assumptions: + * - image is in safe memory + * - We're already running out of the new "identity" map. + * - All registers are free game, so go nuts + * - Interrupts are disabled + * - All APs are disabled + */ +ENTRY(kexec_do_reboot) + /* + r9: image pointer + r10: segment pointer + r11: segment counter + */ + leaq kexec_stack(%rip), %rsp + /* Get the saved kexec_image. 
*/ + leaq kexec_saved_image(%rip), %r9 + leaq KEXEC_SEGMENTS(%r9), %r10 + movq $KEXEC_SEGMENT_MAX, %r11 +copy_segment: + movq KEXEC_SEGMENT_SIZE(%r10), %rcx + cmpq $0, %rcx + je done + shrq $3, %rcx + movq KEXEC_SEGMENT_TARGET(%r10), %rdi + movq KEXEC_SEGMENT_MAP(%r10), %rsi + rep + movsq + addq $KEXEC_STAGED_SEGMENT_SIZE, %r10 + decq %r11 + jg copy_segment + +done: + pushq KEXEC_ENTRY(%r9) + ret +fail: + jmp fail +END(kexec_do_reboot) +ENTRY(kexec_do_reboot_trampoline) + /* Set new page table, clears most of TLB. */ + movq %rdi, %cr3 + + /* Now flush the rest of the TLB, including global pages. */ + movq %cr4, %rax + andq $~CR4_PGE, %rax + movq %rax, %cr4 + jmp *%rsi +END(kexec_do_reboot_trampoline) +CNAME(kexec_saved_image): + .globl kexec_saved_image + .space KEXEC_IMAGE_SIZE + .quad 0 + /* We don't need more than quad, so just fill out the page. */ + .p2align PAGE_SHIFT + kexec_stack: +CNAME(kexec_do_reboot_size): + .globl kexec_do_reboot_size + .quad . - kexec_do_reboot diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index 00e99f9df192..96ed0a2cc3ba 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -140,6 +140,10 @@ cpu_mp_start(void) setidt(IPI_STOP, pti ? IDTVEC(cpustop_pti) : IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0); + /* Install an inter-CPU IPI for CPU offline */ + setidt(IPI_OFF, pti ? IDTVEC(cpuoff_pti) : IDTVEC(cpuoff), + SDT_SYSIGT, SEL_KPL, 0); + /* Install an inter-CPU IPI for CPU suspend/resume */ setidt(IPI_SUSPEND, pti ? IDTVEC(cpususpend_pti) : IDTVEC(cpususpend), SDT_SYSIGT, SEL_KPL, 0); @@ -176,6 +180,15 @@ cpu_mp_start(void) #endif } +void +cpu_mp_stop(void) +{ + cpuset_t other_cpus = all_cpus; + + CPU_CLR(PCPU_GET(cpuid), &other_cpus); + offline_cpus(other_cpus); +} + /* * AP CPU's call this to initialize themselves. */ diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index f3469ed5e2bc..84305ca918df 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -435,9 +435,9 @@ trap(struct trapframe *frame) if ((print_efirt_faults == 1 && cnt == 0) || print_efirt_faults == 2) { - trap_diag(frame, 0); printf("EFI RT fault %s\n", traptype_to_msg(type)); + trap_diag(frame, 0); } frame->tf_rip = (long)curpcb->pcb_onfault; return; @@ -870,8 +870,8 @@ after_vmfault: if ((print_efirt_faults == 1 && cnt == 0) || print_efirt_faults == 2) { - trap_diag(frame, eva); printf("EFI RT page fault\n"); + trap_diag(frame, eva); } } frame->tf_rip = (long)curpcb->pcb_onfault; diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index 2e41ed26403a..fb8473505128 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -309,7 +309,6 @@ device wpi # Intel 3945ABG wireless NICs. device crypto # core crypto support device aesni # AES-NI OpenCrypto module device loop # Network loopback -device padlock_rng # VIA Padlock RNG device rdrand_rng # Intel Bull Mountain RNG device ether # Ethernet support device vlan # 802.1Q VLAN support diff --git a/sys/amd64/conf/MINIMAL b/sys/amd64/conf/MINIMAL index 0baf6d6431de..61c713c609a4 100644 --- a/sys/amd64/conf/MINIMAL +++ b/sys/amd64/conf/MINIMAL @@ -113,7 +113,6 @@ device uart # Generic UART driver # Pseudo devices. 
device loop # Network loopback -device padlock_rng # VIA Padlock RNG device rdrand_rng # Intel Bull Mountain RNG device ether # Ethernet support diff --git a/sys/amd64/include/kexec.h b/sys/amd64/include/kexec.h new file mode 100644 index 000000000000..70bc2991be3f --- /dev/null +++ b/sys/amd64/include/kexec.h @@ -0,0 +1,41 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _AMD64_KEXEC_H_ +#define _AMD64_KEXEC_H_ + +struct kexec_segment; +struct kexec_image; +int kexec_md_pages(struct kexec_segment *); +extern void kexec_do_reboot(void); +extern long kexec_do_reboot_size; +extern void *kexec_saved_image; +extern void kexec_do_reboot_trampoline(unsigned long, void (*)(void)); +#define KEXEC_MD_PAGES(x) kexec_md_pages(x) + + +#endif /* _AMD64_KEXEC_H_ */ diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h index bff92570ff82..28c372a2e556 100644 --- a/sys/amd64/include/smp.h +++ b/sys/amd64/include/smp.h @@ -30,6 +30,7 @@ inthand_t IDTVEC(ipi_intr_bitmap_handler_pti), IDTVEC(ipi_swi_pti), IDTVEC(cpustop_pti), + IDTVEC(cpuoff_pti), IDTVEC(cpususpend_pti), IDTVEC(rendezvous_pti); diff --git a/sys/amd64/pt/pt.c b/sys/amd64/pt/pt.c index c7b75767680a..6b2296de049c 100644 --- a/sys/amd64/pt/pt.c +++ b/sys/amd64/pt/pt.c @@ -42,15 +42,15 @@ */ #include <sys/systm.h> +#include <sys/bus.h> #include <sys/hwt.h> +#include <sys/interrupt.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/malloc.h> #include <sys/module.h> #include <sys/mutex.h> -#include <sys/sdt.h> #include <sys/smp.h> -#include <sys/taskqueue.h> #include <vm/vm.h> #include <vm/vm_page.h> @@ -94,12 +94,7 @@ MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace"); -SDT_PROVIDER_DEFINE(pt); -SDT_PROBE_DEFINE(pt, , , topa__intr); - -TASKQUEUE_FAST_DEFINE_THREAD(pt); - -static void pt_send_buffer_record(void *arg, int pending __unused); +static void pt_send_buffer_record(void *arg); static int pt_topa_intr(struct trapframe *tf); /* @@ -122,29 +117,24 @@ struct pt_buffer { size_t size; struct mtx lock; /* Lock for fields below. 
*/ vm_offset_t offset; - uint64_t wrap_count; - int curpage; }; struct pt_ctx { int id; struct pt_buffer buf; /* ToPA buffer metadata */ - struct task task; /* ToPA buffer notification task */ struct hwt_context *hwt_ctx; uint8_t *save_area; /* PT XSAVE area */ }; /* PT tracing contexts used for CPU mode. */ static struct pt_ctx *pt_pcpu_ctx; -enum pt_cpu_state { - PT_DISABLED = 0, - PT_STOPPED, - PT_ACTIVE -}; +enum pt_cpu_state { PT_INACTIVE = 0, PT_ACTIVE }; static struct pt_cpu { struct pt_ctx *ctx; /* active PT tracing context */ enum pt_cpu_state state; /* used as part of trace stop protocol */ + void *swi_cookie; /* Software interrupt handler context */ + int in_pcint_handler; } *pt_pcpu; /* @@ -199,31 +189,28 @@ static __inline void pt_update_buffer(struct pt_buffer *buf) { uint64_t reg; - int curpage; + uint64_t offset; /* Update buffer offset. */ reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS); - curpage = (reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT; - mtx_lock_spin(&buf->lock); - /* Check if the output wrapped. */ - if (buf->curpage > curpage) - buf->wrap_count++; - buf->curpage = curpage; - buf->offset = reg >> 32; - mtx_unlock_spin(&buf->lock); - - dprintf("%s: wrap_cnt: %lu, curpage: %d, offset: %zu\n", __func__, - buf->wrap_count, buf->curpage, buf->offset); + offset = ((reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT) * PAGE_SIZE; + offset += (reg >> 32); + + atomic_store_rel_64(&buf->offset, offset); } static __inline void pt_fill_buffer_record(int id, struct pt_buffer *buf, struct hwt_record_entry *rec) { + vm_offset_t offset; + + offset = atomic_load_acq_64(&buf->offset); + rec->record_type = HWT_RECORD_BUFFER; rec->buf_id = id; - rec->curpage = buf->curpage; - rec->offset = buf->offset + (buf->wrap_count * buf->size); + rec->curpage = offset / PAGE_SIZE; + rec->offset = offset & PAGE_MASK; } /* @@ -273,9 +260,9 @@ pt_cpu_start(void *dummy) MPASS(cpu->ctx != NULL); dprintf("%s: curcpu %d\n", __func__, curcpu); + pt_cpu_set_state(curcpu, PT_ACTIVE); load_cr4(rcr4() | CR4_XSAVE); wrmsr(MSR_IA32_RTIT_STATUS, 0); - pt_cpu_set_state(curcpu, PT_ACTIVE); pt_cpu_toggle_local(cpu->ctx->save_area, true); } @@ -291,16 +278,16 @@ pt_cpu_stop(void *dummy) struct pt_cpu *cpu; struct pt_ctx *ctx; - /* Shutdown may occur before PT gets properly configured. */ - if (pt_cpu_get_state(curcpu) == PT_DISABLED) - return; - cpu = &pt_pcpu[curcpu]; ctx = cpu->ctx; - MPASS(ctx != NULL); - dprintf("%s: curcpu %d\n", __func__, curcpu); - pt_cpu_set_state(curcpu, PT_STOPPED); + dprintf("%s: curcpu %d\n", __func__, curcpu); + /* Shutdown may occur before PT gets properly configured. 
*/ + if (ctx == NULL) { + dprintf("%s: missing context on cpu %d; bailing\n", __func__, + curcpu); + return; + } pt_cpu_toggle_local(cpu->ctx->save_area, false); pt_update_buffer(&ctx->buf); } @@ -406,13 +393,11 @@ pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id) return (ENOMEM); dprintf("%s: preparing ToPA buffer\n", __func__); if (pt_topa_prepare(pt_ctx, vm) != 0) { - dprintf("%s: failed to prepare ToPA buffer\n", __func__); free(pt_ctx->save_area, M_PT); return (ENOMEM); } pt_ctx->id = ctx_id; - TASK_INIT(&pt_ctx->task, 0, pt_send_buffer_record, pt_ctx); return (0); } @@ -426,7 +411,6 @@ pt_deinit_ctx(struct pt_ctx *pt_ctx) if (pt_ctx->save_area != NULL) free(pt_ctx->save_area, M_PT); memset(pt_ctx, 0, sizeof(*pt_ctx)); - pt_ctx->buf.topa_hw = NULL; } /* @@ -519,7 +503,6 @@ pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id) XSTATE_XCOMP_BV_COMPACT; pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN; pt_pcpu[cpu_id].ctx = pt_ctx; - pt_cpu_set_state(cpu_id, PT_STOPPED); return (0); } @@ -549,12 +532,19 @@ pt_backend_disable(struct hwt_context *ctx, int cpu_id) if (ctx->mode == HWT_MODE_CPU) return; - KASSERT(curcpu == cpu_id, ("%s: attempting to disable PT on another cpu", __func__)); + + cpu = &pt_pcpu[cpu_id]; + + dprintf("%s: waiting for cpu %d to exit interrupt handler\n", __func__, + cpu_id); + pt_cpu_set_state(cpu_id, PT_INACTIVE); + while (atomic_cmpset_int(&cpu->in_pcint_handler, 1, 0)) + ; + pt_cpu_stop(NULL); CPU_CLR(cpu_id, &ctx->cpu_map); - cpu = &pt_pcpu[cpu_id]; cpu->ctx = NULL; } @@ -564,14 +554,14 @@ pt_backend_disable(struct hwt_context *ctx, int cpu_id) static int pt_backend_enable_smp(struct hwt_context *ctx) { - dprintf("%s\n", __func__); + + KASSERT(ctx->mode == HWT_MODE_CPU, + ("%s: should only be used for CPU mode", __func__)); if (ctx->mode == HWT_MODE_CPU && atomic_swap_32(&cpu_mode_ctr, 1) != 0) return (-1); - KASSERT(ctx->mode == HWT_MODE_CPU, - ("%s: should only be used for CPU mode", __func__)); smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL); return (0); @@ -583,6 +573,7 @@ pt_backend_enable_smp(struct hwt_context *ctx) static int pt_backend_disable_smp(struct hwt_context *ctx) { + struct pt_cpu *cpu; dprintf("%s\n", __func__); if (ctx->mode == HWT_MODE_CPU && @@ -593,6 +584,14 @@ pt_backend_disable_smp(struct hwt_context *ctx) dprintf("%s: empty cpu map\n", __func__); return (-1); } + CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) { + cpu = &pt_pcpu[cpu_id]; + dprintf("%s: waiting for cpu %d to exit interrupt handler\n", + __func__, cpu_id); + pt_cpu_set_state(cpu_id, PT_INACTIVE); + while (atomic_cmpset_int(&cpu->in_pcint_handler, 1, 0)) + ; + } smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL); return (0); @@ -611,13 +610,13 @@ pt_backend_init(struct hwt_context *ctx) int error; dprintf("%s\n", __func__); - if (ctx->mode == HWT_MODE_CPU) { - TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) { - error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id], - hwt_cpu->vm, hwt_cpu->cpu_id); - if (error) - return (error); - } + if (ctx->mode != HWT_MODE_CPU) + return (0); + TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) { + error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id], hwt_cpu->vm, + hwt_cpu->cpu_id); + if (error) + return (error); } return (0); @@ -647,20 +646,16 @@ pt_backend_deinit(struct hwt_context *ctx) pt_deinit_ctx(pt_ctx); } } else { - CPU_FOREACH(cpu_id) { - if (!CPU_ISSET(cpu_id, &ctx->cpu_map)) + CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) { + if (pt_pcpu[cpu_id].ctx == NULL) continue; - if (pt_pcpu[cpu_id].ctx != NULL) 
{ - KASSERT(pt_pcpu[cpu_id].ctx == - &pt_pcpu_ctx[cpu_id], - ("%s: CPU mode tracing with non-cpu mode PT" - "context active", - __func__)); - pt_pcpu[cpu_id].ctx = NULL; - } - pt_ctx = &pt_pcpu_ctx[cpu_id]; - pt_deinit_ctx(pt_ctx); - memset(&pt_pcpu[cpu_id], 0, sizeof(struct pt_cpu)); + KASSERT(pt_pcpu[cpu_id].ctx == &pt_pcpu_ctx[cpu_id], + ("%s: CPU mode tracing with non-cpu mode PT" + "context active", + __func__)); + pt_deinit_ctx(pt_pcpu[cpu_id].ctx); + pt_pcpu[cpu_id].ctx = NULL; + atomic_set_int(&pt_pcpu[cpu_id].in_pcint_handler, 0); } } @@ -675,15 +670,15 @@ pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset, uint64_t *data) { struct pt_buffer *buf; + uint64_t offset; if (vm->ctx->mode == HWT_MODE_THREAD) buf = &((struct pt_ctx *)vm->thr->private)->buf; else buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf; - mtx_lock_spin(&buf->lock); - *curpage = buf->curpage; - *curpage_offset = buf->offset + (buf->wrap_count * vm->ctx->bufsize); - mtx_unlock_spin(&buf->lock); + offset = atomic_load_acq_64(&buf->offset); + *curpage = offset / PAGE_SIZE; + *curpage_offset = offset & PAGE_MASK; return (0); } @@ -762,15 +757,13 @@ static struct hwt_backend backend = { * Used as a taskqueue routine from the ToPA interrupt handler. */ static void -pt_send_buffer_record(void *arg, int pending __unused) +pt_send_buffer_record(void *arg) { + struct pt_cpu *cpu = (struct pt_cpu *)arg; struct hwt_record_entry record; - struct pt_ctx *ctx = (struct pt_ctx *)arg; - /* Prepare buffer record. */ - mtx_lock_spin(&ctx->buf.lock); + struct pt_ctx *ctx = cpu->ctx; pt_fill_buffer_record(ctx->id, &ctx->buf, &record); - mtx_unlock_spin(&ctx->buf.lock); hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT); } static void @@ -795,36 +788,40 @@ static int pt_topa_intr(struct trapframe *tf) { struct pt_buffer *buf; + struct pt_cpu *cpu; struct pt_ctx *ctx; uint64_t reg; - SDT_PROBE0(pt, , , topa__intr); - - if (pt_cpu_get_state(curcpu) != PT_ACTIVE) { - return (0); - } + cpu = &pt_pcpu[curcpu]; reg = rdmsr(MSR_IA_GLOBAL_STATUS); if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) { - /* ACK spurious or leftover interrupt. 
*/ pt_topa_status_clear(); + return (0); + } + + if (pt_cpu_get_state(curcpu) != PT_ACTIVE) { return (1); } + atomic_set_int(&cpu->in_pcint_handler, 1); - ctx = pt_pcpu[curcpu].ctx; + ctx = cpu->ctx; + KASSERT(ctx != NULL, + ("%s: cpu %d: ToPA PMI interrupt without an active context", + __func__, curcpu)); buf = &ctx->buf; KASSERT(buf->topa_hw != NULL, - ("%s: ToPA PMI interrupt with invalid buffer", __func__)); - + ("%s: cpu %d: ToPA PMI interrupt with invalid buffer", __func__, + curcpu)); pt_cpu_toggle_local(ctx->save_area, false); pt_update_buffer(buf); pt_topa_status_clear(); - taskqueue_enqueue_flags(taskqueue_pt, &ctx->task, - TASKQUEUE_FAIL_IF_PENDING); if (pt_cpu_get_state(curcpu) == PT_ACTIVE) { + swi_sched(cpu->swi_cookie, SWI_FROMNMI); pt_cpu_toggle_local(ctx->save_area, true); lapic_reenable_pcint(); } + atomic_set_int(&cpu->in_pcint_handler, 0); return (1); } @@ -839,7 +836,7 @@ static int pt_init(void) { u_int cp[4]; - int error; + int error, i; dprintf("pt: Enumerating part 1\n"); cpuid_count(CPUID_PT_LEAF, 0, cp); @@ -869,20 +866,38 @@ pt_init(void) pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT, M_ZERO | M_WAITOK); + for (i = 0; i < mp_ncpus; i++) { + error = swi_add(&clk_intr_event, "pt", pt_send_buffer_record, + &pt_pcpu[i], SWI_CLOCK, INTR_MPSAFE, + &pt_pcpu[i].swi_cookie); + if (error != 0) { + dprintf( + "%s: failed to add interrupt handler for cpu: %d\n", + __func__, error); + goto err; + } + } + nmi_register_handler(pt_topa_intr); - if (!lapic_enable_pcint()) { - nmi_remove_handler(pt_topa_intr); - hwt_backend_unregister(&backend); - free(pt_pcpu, M_PT); - free(pt_pcpu_ctx, M_PT); - pt_pcpu = NULL; - pt_pcpu_ctx = NULL; + if (lapic_enable_pcint()) { + initialized = true; + return (0); + } else printf("pt: failed to setup interrupt line\n"); - return (error); +err: + nmi_remove_handler(pt_topa_intr); + hwt_backend_unregister(&backend); + + for (i = 0; i < mp_ncpus; i++) { + if (pt_pcpu[i].swi_cookie != 0) + swi_remove(pt_pcpu[i].swi_cookie); } - initialized = true; + free(pt_pcpu, M_PT); + free(pt_pcpu_ctx, M_PT); + pt_pcpu = NULL; + pt_pcpu_ctx = NULL; - return (0); + return (error); } /* @@ -941,14 +956,24 @@ pt_supported(void) static void pt_deinit(void) { + int i; + struct pt_cpu *cpu; + if (!initialized) return; nmi_remove_handler(pt_topa_intr); lapic_disable_pcint(); hwt_backend_unregister(&backend); + + for (i = 0; i < mp_ncpus; i++) { + cpu = &pt_pcpu[i]; + swi_remove(cpu->swi_cookie); + } + free(pt_pcpu, M_PT); free(pt_pcpu_ctx, M_PT); pt_pcpu = NULL; + pt_pcpu_ctx = NULL; initialized = false; } diff --git a/sys/amd64/sgx/sgx_linux.c b/sys/amd64/sgx/sgx_linux.c index 6ecef9207a38..d389edc1b2b0 100644 --- a/sys/amd64/sgx/sgx_linux.c +++ b/sys/amd64/sgx/sgx_linux.c @@ -92,16 +92,7 @@ out: return (error); } -static struct linux_ioctl_handler sgx_linux_handler = { - sgx_linux_ioctl, - SGX_LINUX_IOCTL_MIN, - SGX_LINUX_IOCTL_MAX, -}; - -SYSINIT(sgx_linux_register, SI_SUB_KLD, SI_ORDER_MIDDLE, - linux_ioctl_register_handler, &sgx_linux_handler); -SYSUNINIT(sgx_linux_unregister, SI_SUB_KLD, SI_ORDER_MIDDLE, - linux_ioctl_unregister_handler, &sgx_linux_handler); +LINUX_IOCTL_SET(sgx, SGX_LINUX_IOCTL_MIN, SGX_LINUX_IOCTL_MAX); static int sgx_linux_modevent(module_t mod, int type, void *data) diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index f7c59847140b..f2bea0d82b5c 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -724,12 +724,7 @@ vm_name(struct vm *vm) int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t 
hpa) { - vm_object_t obj; - - if ((obj = vmm_mmio_alloc(vm_vmspace(vm), gpa, len, hpa)) == NULL) - return (ENOMEM); - else - return (0); + return (vmm_mmio_alloc(vm_vmspace(vm), gpa, len, hpa)); } int @@ -870,7 +865,7 @@ vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) int vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval) { - + /* Negative values represent VM control structure fields. */ if (reg >= VM_REG_LAST) return (EINVAL); @@ -882,6 +877,7 @@ vm_set_register(struct vcpu *vcpu, int reg, uint64_t val) { int error; + /* Negative values represent VM control structure fields. */ if (reg >= VM_REG_LAST) return (EINVAL); diff --git a/sys/amd64/vmm/vmm_dev_machdep.c b/sys/amd64/vmm/vmm_dev_machdep.c index dfebc9dcadbf..b84be809ea24 100644 --- a/sys/amd64/vmm/vmm_dev_machdep.c +++ b/sys/amd64/vmm/vmm_dev_machdep.c @@ -124,12 +124,16 @@ const struct vmmdev_ioctl vmmdev_machdep_ioctls[] = { VMMDEV_IOCTL(VM_SET_KERNEMU_DEV, VMMDEV_IOCTL_LOCK_ONE_VCPU), VMMDEV_IOCTL(VM_BIND_PPTDEV, - VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), + VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS | + VMMDEV_IOCTL_PRIV_CHECK_DRIVER), VMMDEV_IOCTL(VM_UNBIND_PPTDEV, - VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS), + VMMDEV_IOCTL_XLOCK_MEMSEGS | VMMDEV_IOCTL_LOCK_ALL_VCPUS | + VMMDEV_IOCTL_PRIV_CHECK_DRIVER), - VMMDEV_IOCTL(VM_MAP_PPTDEV_MMIO, VMMDEV_IOCTL_LOCK_ALL_VCPUS), - VMMDEV_IOCTL(VM_UNMAP_PPTDEV_MMIO, VMMDEV_IOCTL_LOCK_ALL_VCPUS), + VMMDEV_IOCTL(VM_MAP_PPTDEV_MMIO, VMMDEV_IOCTL_LOCK_ALL_VCPUS | + VMMDEV_IOCTL_PRIV_CHECK_DRIVER), + VMMDEV_IOCTL(VM_UNMAP_PPTDEV_MMIO, VMMDEV_IOCTL_LOCK_ALL_VCPUS | + VMMDEV_IOCTL_PRIV_CHECK_DRIVER), #ifdef BHYVE_SNAPSHOT #ifdef COMPAT_FREEBSD13 VMMDEV_IOCTL(VM_SNAPSHOT_REQ_13, VMMDEV_IOCTL_LOCK_ALL_VCPUS), @@ -147,9 +151,9 @@ const struct vmmdev_ioctl vmmdev_machdep_ioctls[] = { VMMDEV_IOCTL(VM_LAPIC_LOCAL_IRQ, VMMDEV_IOCTL_MAYBE_ALLOC_VCPU), - VMMDEV_IOCTL(VM_PPTDEV_MSI, 0), - VMMDEV_IOCTL(VM_PPTDEV_MSIX, 0), - VMMDEV_IOCTL(VM_PPTDEV_DISABLE_MSIX, 0), + VMMDEV_IOCTL(VM_PPTDEV_MSI, VMMDEV_IOCTL_PRIV_CHECK_DRIVER), + VMMDEV_IOCTL(VM_PPTDEV_MSIX, VMMDEV_IOCTL_PRIV_CHECK_DRIVER), + VMMDEV_IOCTL(VM_PPTDEV_DISABLE_MSIX, VMMDEV_IOCTL_PRIV_CHECK_DRIVER), VMMDEV_IOCTL(VM_LAPIC_MSI, 0), VMMDEV_IOCTL(VM_IOAPIC_ASSERT_IRQ, 0), VMMDEV_IOCTL(VM_IOAPIC_DEASSERT_IRQ, 0), @@ -172,40 +176,13 @@ int vmmdev_machdep_ioctl(struct vm *vm, struct vcpu *vcpu, u_long cmd, caddr_t data, int fflag, struct thread *td) { - struct vm_seg_desc *vmsegdesc; - struct vm_run *vmrun; -#ifdef COMPAT_FREEBSD13 - struct vm_run_13 *vmrun_13; -#endif - struct vm_exception *vmexc; - struct vm_lapic_irq *vmirq; - struct vm_lapic_msi *vmmsi; - struct vm_ioapic_irq *ioapic_irq; - struct vm_isa_irq *isa_irq; - struct vm_isa_irq_trigger *isa_irq_trigger; - struct vm_pptdev *pptdev; - struct vm_pptdev_mmio *pptmmio; - struct vm_pptdev_msi *pptmsi; - struct vm_pptdev_msix *pptmsix; - struct vm_x2apic *x2apic; - struct vm_gpa_pte *gpapte; - struct vm_gla2gpa *gg; - struct vm_intinfo *vmii; - struct vm_rtc_time *rtctime; - struct vm_rtc_data *rtcdata; - struct vm_readwrite_kernemu_device *kernemu; -#ifdef BHYVE_SNAPSHOT - struct vm_snapshot_meta *snapshot_meta; -#ifdef COMPAT_FREEBSD13 - struct vm_snapshot_meta_13 *snapshot_13; -#endif -#endif int error; error = 0; switch (cmd) { case VM_RUN: { struct vm_exit *vme; + struct vm_run *vmrun; vmrun = (struct vm_run *)data; vme = vm_exitinfo(vcpu); @@ -243,6 +220,7 @@ vmmdev_machdep_ioctl(struct vm *vm, struct vcpu *vcpu, u_long 
cmd, caddr_t data, case VM_RUN_13: { struct vm_exit *vme; struct vm_exit_13 *vme_13; + struct vm_run_13 *vmrun_13; vmrun_13 = (struct vm_run_13 *)data; vme_13 = &vmrun_13->vm_exit; @@ -281,85 +259,123 @@ vmmdev_machdep_ioctl(struct vm *vm, struct vcpu *vcpu, u_long cmd, caddr_t data, break; } #endif - case VM_PPTDEV_MSI: + case VM_PPTDEV_MSI: { + struct vm_pptdev_msi *pptmsi; + pptmsi = (struct vm_pptdev_msi *)data; - error = ppt_setup_msi(vm, - pptmsi->bus, pptmsi->slot, pptmsi->func, - pptmsi->addr, pptmsi->msg, - pptmsi->numvec); + error = ppt_setup_msi(vm, pptmsi->bus, pptmsi->slot, + pptmsi->func, pptmsi->addr, pptmsi->msg, pptmsi->numvec); break; - case VM_PPTDEV_MSIX: + } + case VM_PPTDEV_MSIX: { + struct vm_pptdev_msix *pptmsix; + pptmsix = (struct vm_pptdev_msix *)data; - error = ppt_setup_msix(vm, - pptmsix->bus, pptmsix->slot, - pptmsix->func, pptmsix->idx, - pptmsix->addr, pptmsix->msg, - pptmsix->vector_control); + error = ppt_setup_msix(vm, pptmsix->bus, pptmsix->slot, + pptmsix->func, pptmsix->idx, pptmsix->addr, pptmsix->msg, + pptmsix->vector_control); break; - case VM_PPTDEV_DISABLE_MSIX: + } + case VM_PPTDEV_DISABLE_MSIX: { + struct vm_pptdev *pptdev; + pptdev = (struct vm_pptdev *)data; error = ppt_disable_msix(vm, pptdev->bus, pptdev->slot, - pptdev->func); + pptdev->func); break; - case VM_MAP_PPTDEV_MMIO: + } + case VM_MAP_PPTDEV_MMIO: { + struct vm_pptdev_mmio *pptmmio; + pptmmio = (struct vm_pptdev_mmio *)data; error = ppt_map_mmio(vm, pptmmio->bus, pptmmio->slot, - pptmmio->func, pptmmio->gpa, pptmmio->len, - pptmmio->hpa); + pptmmio->func, pptmmio->gpa, pptmmio->len, pptmmio->hpa); break; - case VM_UNMAP_PPTDEV_MMIO: + } + case VM_UNMAP_PPTDEV_MMIO: { + struct vm_pptdev_mmio *pptmmio; + pptmmio = (struct vm_pptdev_mmio *)data; error = ppt_unmap_mmio(vm, pptmmio->bus, pptmmio->slot, - pptmmio->func, pptmmio->gpa, pptmmio->len); + pptmmio->func, pptmmio->gpa, pptmmio->len); break; - case VM_BIND_PPTDEV: + } + case VM_BIND_PPTDEV: { + struct vm_pptdev *pptdev; + pptdev = (struct vm_pptdev *)data; error = vm_assign_pptdev(vm, pptdev->bus, pptdev->slot, - pptdev->func); + pptdev->func); break; - case VM_UNBIND_PPTDEV: + } + case VM_UNBIND_PPTDEV: { + struct vm_pptdev *pptdev; + pptdev = (struct vm_pptdev *)data; error = vm_unassign_pptdev(vm, pptdev->bus, pptdev->slot, - pptdev->func); + pptdev->func); break; - case VM_INJECT_EXCEPTION: + } + case VM_INJECT_EXCEPTION: { + struct vm_exception *vmexc; + vmexc = (struct vm_exception *)data; error = vm_inject_exception(vcpu, vmexc->vector, vmexc->error_code_valid, vmexc->error_code, vmexc->restart_instruction); break; + } case VM_INJECT_NMI: error = vm_inject_nmi(vcpu); break; - case VM_LAPIC_IRQ: + case VM_LAPIC_IRQ: { + struct vm_lapic_irq *vmirq; + vmirq = (struct vm_lapic_irq *)data; error = lapic_intr_edge(vcpu, vmirq->vector); break; - case VM_LAPIC_LOCAL_IRQ: + } + case VM_LAPIC_LOCAL_IRQ: { + struct vm_lapic_irq *vmirq; + vmirq = (struct vm_lapic_irq *)data; error = lapic_set_local_intr(vm, vcpu, vmirq->vector); break; - case VM_LAPIC_MSI: + } + case VM_LAPIC_MSI: { + struct vm_lapic_msi *vmmsi; + vmmsi = (struct vm_lapic_msi *)data; error = lapic_intr_msi(vm, vmmsi->addr, vmmsi->msg); break; - case VM_IOAPIC_ASSERT_IRQ: + } + case VM_IOAPIC_ASSERT_IRQ: { + struct vm_ioapic_irq *ioapic_irq; + ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_assert_irq(vm, ioapic_irq->irq); break; - case VM_IOAPIC_DEASSERT_IRQ: + } + case VM_IOAPIC_DEASSERT_IRQ: { + struct vm_ioapic_irq *ioapic_irq; + ioapic_irq = (struct 
vm_ioapic_irq *)data; error = vioapic_deassert_irq(vm, ioapic_irq->irq); break; - case VM_IOAPIC_PULSE_IRQ: + } + case VM_IOAPIC_PULSE_IRQ: { + struct vm_ioapic_irq *ioapic_irq; + ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_pulse_irq(vm, ioapic_irq->irq); break; + } case VM_IOAPIC_PINCOUNT: *(int *)data = vioapic_pincount(vm); break; case VM_SET_KERNEMU_DEV: case VM_GET_KERNEMU_DEV: { + struct vm_readwrite_kernemu_device *kernemu; mem_region_write_t mwrite; mem_region_read_t mread; int size; @@ -396,60 +412,86 @@ vmmdev_machdep_ioctl(struct vm *vm, struct vcpu *vcpu, u_long cmd, caddr_t data, error = mread(vcpu, kernemu->gpa, &kernemu->value, size, &arg); break; - } - case VM_ISA_ASSERT_IRQ: + } + case VM_ISA_ASSERT_IRQ: { + struct vm_isa_irq *isa_irq; + isa_irq = (struct vm_isa_irq *)data; error = vatpic_assert_irq(vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_assert_irq(vm, isa_irq->ioapic_irq); break; - case VM_ISA_DEASSERT_IRQ: + } + case VM_ISA_DEASSERT_IRQ: { + struct vm_isa_irq *isa_irq; + isa_irq = (struct vm_isa_irq *)data; error = vatpic_deassert_irq(vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_deassert_irq(vm, isa_irq->ioapic_irq); break; - case VM_ISA_PULSE_IRQ: + } + case VM_ISA_PULSE_IRQ: { + struct vm_isa_irq *isa_irq; + isa_irq = (struct vm_isa_irq *)data; error = vatpic_pulse_irq(vm, isa_irq->atpic_irq); if (error == 0 && isa_irq->ioapic_irq != -1) error = vioapic_pulse_irq(vm, isa_irq->ioapic_irq); break; - case VM_ISA_SET_IRQ_TRIGGER: + } + case VM_ISA_SET_IRQ_TRIGGER: { + struct vm_isa_irq_trigger *isa_irq_trigger; + isa_irq_trigger = (struct vm_isa_irq_trigger *)data; error = vatpic_set_irq_trigger(vm, isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger); break; - case VM_SET_SEGMENT_DESCRIPTOR: + } + case VM_SET_SEGMENT_DESCRIPTOR: { + struct vm_seg_desc *vmsegdesc; + vmsegdesc = (struct vm_seg_desc *)data; - error = vm_set_seg_desc(vcpu, - vmsegdesc->regnum, - &vmsegdesc->desc); + error = vm_set_seg_desc(vcpu, vmsegdesc->regnum, + &vmsegdesc->desc); break; - case VM_GET_SEGMENT_DESCRIPTOR: + } + case VM_GET_SEGMENT_DESCRIPTOR: { + struct vm_seg_desc *vmsegdesc; + vmsegdesc = (struct vm_seg_desc *)data; - error = vm_get_seg_desc(vcpu, - vmsegdesc->regnum, - &vmsegdesc->desc); + error = vm_get_seg_desc(vcpu, vmsegdesc->regnum, + &vmsegdesc->desc); break; - case VM_SET_X2APIC_STATE: + } + case VM_SET_X2APIC_STATE: { + struct vm_x2apic *x2apic; + x2apic = (struct vm_x2apic *)data; error = vm_set_x2apic_state(vcpu, x2apic->state); break; - case VM_GET_X2APIC_STATE: + } + case VM_GET_X2APIC_STATE: { + struct vm_x2apic *x2apic; + x2apic = (struct vm_x2apic *)data; error = vm_get_x2apic_state(vcpu, &x2apic->state); break; - case VM_GET_GPA_PMAP: + } + case VM_GET_GPA_PMAP: { + struct vm_gpa_pte *gpapte; + gpapte = (struct vm_gpa_pte *)data; - pmap_get_mapping(vmspace_pmap(vm_vmspace(vm)), - gpapte->gpa, gpapte->pte, &gpapte->ptenum); - error = 0; + pmap_get_mapping(vmspace_pmap(vm_vmspace(vm)), gpapte->gpa, + gpapte->pte, &gpapte->ptenum); break; + } case VM_GET_HPET_CAPABILITIES: error = vhpet_getcap((struct vm_hpet_cap *)data); break; case VM_GLA2GPA: { + struct vm_gla2gpa *gg; + CTASSERT(PROT_READ == VM_PROT_READ); CTASSERT(PROT_WRITE == VM_PROT_WRITE); CTASSERT(PROT_EXEC == VM_PROT_EXECUTE); @@ -460,50 +502,76 @@ vmmdev_machdep_ioctl(struct vm *vm, struct vcpu *vcpu, u_long cmd, caddr_t data, ("%s: vm_gla2gpa unknown error %d", __func__, error)); break; } - case 
VM_GLA2GPA_NOFAULT: + case VM_GLA2GPA_NOFAULT: { + struct vm_gla2gpa *gg; + gg = (struct vm_gla2gpa *)data; error = vm_gla2gpa_nofault(vcpu, &gg->paging, gg->gla, gg->prot, &gg->gpa, &gg->fault); KASSERT(error == 0 || error == EFAULT, ("%s: vm_gla2gpa unknown error %d", __func__, error)); break; - case VM_SET_INTINFO: + } + case VM_SET_INTINFO: { + struct vm_intinfo *vmii; + vmii = (struct vm_intinfo *)data; error = vm_exit_intinfo(vcpu, vmii->info1); break; - case VM_GET_INTINFO: + } + case VM_GET_INTINFO: { + struct vm_intinfo *vmii; + vmii = (struct vm_intinfo *)data; error = vm_get_intinfo(vcpu, &vmii->info1, &vmii->info2); break; - case VM_RTC_WRITE: + } + case VM_RTC_WRITE: { + struct vm_rtc_data *rtcdata; + rtcdata = (struct vm_rtc_data *)data; error = vrtc_nvram_write(vm, rtcdata->offset, rtcdata->value); break; - case VM_RTC_READ: + } + case VM_RTC_READ: { + struct vm_rtc_data *rtcdata; + rtcdata = (struct vm_rtc_data *)data; error = vrtc_nvram_read(vm, rtcdata->offset, &rtcdata->value); break; - case VM_RTC_SETTIME: + } + case VM_RTC_SETTIME: { + struct vm_rtc_time *rtctime; + rtctime = (struct vm_rtc_time *)data; error = vrtc_set_time(vm, rtctime->secs); break; - case VM_RTC_GETTIME: - error = 0; + } + case VM_RTC_GETTIME: { + struct vm_rtc_time *rtctime; + rtctime = (struct vm_rtc_time *)data; rtctime->secs = vrtc_get_time(vm); break; + } case VM_RESTART_INSTRUCTION: error = vm_restart_instruction(vcpu); break; #ifdef BHYVE_SNAPSHOT - case VM_SNAPSHOT_REQ: + case VM_SNAPSHOT_REQ: { + struct vm_snapshot_meta *snapshot_meta; + snapshot_meta = (struct vm_snapshot_meta *)data; error = vm_snapshot_req(vm, snapshot_meta); break; + } #ifdef COMPAT_FREEBSD13 - case VM_SNAPSHOT_REQ_13: + case VM_SNAPSHOT_REQ_13: { + struct vm_snapshot_meta *snapshot_meta; + struct vm_snapshot_meta_13 *snapshot_13; + /* * The old structure just has an additional pointer at * the start that is ignored. 
@@ -513,6 +581,7 @@ vmmdev_machdep_ioctl(struct vm *vm, struct vcpu *vcpu, u_long cmd, caddr_t data, (struct vm_snapshot_meta *)&snapshot_13->dev_data; error = vm_snapshot_req(vm, snapshot_meta); break; + } #endif case VM_RESTORE_TIME: error = vm_restore_time(vm); diff --git a/sys/amd64/vmm/vmm_mem.h b/sys/amd64/vmm/vmm_mem.h index 41b9bf07c4fc..d905fd37001d 100644 --- a/sys/amd64/vmm/vmm_mem.h +++ b/sys/amd64/vmm/vmm_mem.h @@ -30,10 +30,9 @@ #define _VMM_MEM_H_ struct vmspace; -struct vm_object; -struct vm_object *vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len, - vm_paddr_t hpa); +int vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa); void vmm_mmio_free(struct vmspace *, vm_paddr_t gpa, size_t size); vm_paddr_t vmm_mem_maxaddr(void); diff --git a/sys/amd64/vmm/vmm_mem_machdep.c b/sys/amd64/vmm/vmm_mem_machdep.c index e96c9e4bdc66..afb3a0274e2a 100644 --- a/sys/amd64/vmm/vmm_mem_machdep.c +++ b/sys/amd64/vmm/vmm_mem_machdep.c @@ -36,6 +36,7 @@ #include <vm/vm.h> #include <vm/vm_param.h> #include <vm/pmap.h> +#include <vm/vm_extern.h> #include <vm/vm_map.h> #include <vm/vm_object.h> #include <vm/vm_page.h> @@ -45,40 +46,48 @@ #include "vmm_mem.h" -vm_object_t +int vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len, - vm_paddr_t hpa) + vm_paddr_t hpa) { - int error; - vm_object_t obj; struct sglist *sg; + vm_object_t obj; + int error; + + if (gpa + len < gpa || hpa + len < hpa || (gpa & PAGE_MASK) != 0 || + (hpa & PAGE_MASK) != 0 || (len & PAGE_MASK) != 0) + return (EINVAL); sg = sglist_alloc(1, M_WAITOK); error = sglist_append_phys(sg, hpa, len); KASSERT(error == 0, ("error %d appending physaddr to sglist", error)); obj = vm_pager_allocate(OBJT_SG, sg, len, VM_PROT_RW, 0, NULL); - if (obj != NULL) { - /* - * VT-x ignores the MTRR settings when figuring out the - * memory type for translations obtained through EPT. - * - * Therefore we explicitly force the pages provided by - * this object to be mapped as uncacheable. - */ - VM_OBJECT_WLOCK(obj); - error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE); - VM_OBJECT_WUNLOCK(obj); - if (error != KERN_SUCCESS) { - panic("vmm_mmio_alloc: vm_object_set_memattr error %d", - error); - } - error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0, - VMFS_NO_SPACE, VM_PROT_RW, VM_PROT_RW, 0); - if (error != KERN_SUCCESS) { - vm_object_deallocate(obj); - obj = NULL; - } + if (obj == NULL) + return (ENOMEM); + + /* + * VT-x ignores the MTRR settings when figuring out the memory type for + * translations obtained through EPT. + * + * Therefore we explicitly force the pages provided by this object to be + * mapped as uncacheable. + */ + VM_OBJECT_WLOCK(obj); + error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE); + VM_OBJECT_WUNLOCK(obj); + if (error != KERN_SUCCESS) + panic("vmm_mmio_alloc: vm_object_set_memattr error %d", error); + + vm_map_lock(&vmspace->vm_map); + error = vm_map_insert(&vmspace->vm_map, obj, 0, gpa, gpa + len, + VM_PROT_RW, VM_PROT_RW, 0); + vm_map_unlock(&vmspace->vm_map); + if (error != KERN_SUCCESS) { + error = vm_mmap_to_errno(error); + vm_object_deallocate(obj); + } else { + error = 0; } /* @@ -94,7 +103,7 @@ vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len, */ sglist_free(sg); - return (obj); + return (error); } void |
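The pt.c rework above drops the wrap_count/curpage bookkeeping in favor of a single linearized byte offset computed from MSR_IA32_RTIT_OUTPUT_MASK_PTRS. Below is a minimal sketch of that round trip, assuming the field layout the driver relies on (ToPA page index in bits 31:7, byte offset within the current output page in bits 63:32) and 4 KiB output pages; the constants and names are local stand-ins, not the kernel's definitions.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE		4096ULL
#define PAGE_MASK		(PAGE_SIZE - 1)
#define PT_TOPA_PAGE_SHIFT	7
#define PT_TOPA_PAGE_MASK	0xffffff80ULL	/* bits 31:7 */

/* Linearize the MSR value the way pt_update_buffer() does. */
static uint64_t
topa_linear_offset(uint64_t msr)
{
	uint64_t page = (msr & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT;

	return (page * PAGE_SIZE + (msr >> 32));
}

int
main(void)
{
	/* Fabricated MSR value: output page 3, 0x40 bytes into it. */
	uint64_t msr = (0x40ULL << 32) | (3ULL << PT_TOPA_PAGE_SHIFT);
	uint64_t off = topa_linear_offset(msr);

	/* pt_fill_buffer_record() splits the offset back apart. */
	printf("curpage %" PRIu64 " offset %" PRIu64 "\n",
	    off / PAGE_SIZE, off & PAGE_MASK);
	return (0);
}

Collapsing the state into one 64-bit offset is what lets pt_backend_read() and the ToPA interrupt path share it with plain atomic_load_acq_64()/atomic_store_rel_64() pairs, which in turn is why the buffer spin lock fields and the taskqueue could be retired in this change.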
