diff options
Diffstat (limited to 'sys/amd64')
| -rw-r--r-- | sys/amd64/amd64/apic_vector.S | 11 | ||||
| -rw-r--r-- | sys/amd64/amd64/genassym.c | 12 | ||||
| -rw-r--r-- | sys/amd64/amd64/kexec_support.c | 300 | ||||
| -rw-r--r-- | sys/amd64/amd64/kexec_tramp.S | 91 | ||||
| -rw-r--r-- | sys/amd64/amd64/mp_machdep.c | 13 | ||||
| -rw-r--r-- | sys/amd64/amd64/trap.c | 4 | ||||
| -rw-r--r-- | sys/amd64/conf/GENERIC | 1 | ||||
| -rw-r--r-- | sys/amd64/conf/MINIMAL | 1 | ||||
| -rw-r--r-- | sys/amd64/include/kexec.h | 41 | ||||
| -rw-r--r-- | sys/amd64/include/smp.h | 1 | ||||
| -rw-r--r-- | sys/amd64/include/vmm.h | 31 | ||||
| -rw-r--r-- | sys/amd64/include/vmm_dev.h | 2 | ||||
| -rw-r--r-- | sys/amd64/vmm/intel/vmx.c | 2 | ||||
| -rw-r--r-- | sys/amd64/vmm/io/ppt.c | 7 | ||||
| -rw-r--r-- | sys/amd64/vmm/io/ppt.h | 6 | ||||
| -rw-r--r-- | sys/amd64/vmm/io/vlapic.c | 2 | ||||
| -rw-r--r-- | sys/amd64/vmm/vmm.c | 168 | ||||
| -rw-r--r-- | sys/amd64/vmm/vmm_lapic.c | 2 | ||||
| -rw-r--r-- | sys/amd64/vmm/vmm_mem.h | 5 | ||||
| -rw-r--r-- | sys/amd64/vmm/vmm_mem_machdep.c | 61 |
20 files changed, 552 insertions, 209 deletions
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index e98bae9eb6c5..8691387a5a8e 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -204,6 +204,17 @@ IDTVEC(spuriousint) jmp doreti /* + * Executed by a CPU when it receives an IPI_OFF from another CPU. + * Should never return + */ + INTR_HANDLER cpuoff + KMSAN_ENTER + call cpuoff_handler + call as_lapic_eoi + KMSAN_LEAVE + jmp doreti + +/* * Executed by a CPU when it receives an IPI_SWI. */ INTR_HANDLER ipi_swi diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c index eb1b746f5893..2716784ee871 100644 --- a/sys/amd64/amd64/genassym.c +++ b/sys/amd64/amd64/genassym.c @@ -57,6 +57,7 @@ #include <vm/vm_param.h> #include <vm/pmap.h> #include <vm/vm_map.h> +#include <sys/kexec.h> #include <sys/proc.h> #include <x86/apicreg.h> #include <machine/cpu.h> @@ -65,6 +66,7 @@ #include <machine/proc.h> #include <machine/segments.h> #include <machine/efi.h> +#include <machine/kexec.h> ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); @@ -295,3 +297,13 @@ ASSYM(EC_R13, offsetof(struct efirt_callinfo, ec_r13)); ASSYM(EC_R14, offsetof(struct efirt_callinfo, ec_r14)); ASSYM(EC_R15, offsetof(struct efirt_callinfo, ec_r15)); ASSYM(EC_RFLAGS, offsetof(struct efirt_callinfo, ec_rflags)); + +/* Kexec */ +ASSYM(KEXEC_ENTRY, offsetof(struct kexec_image, entry)); +ASSYM(KEXEC_SEGMENTS, offsetof(struct kexec_image, segments)); +ASSYM(KEXEC_SEGMENT_MAX, KEXEC_SEGMENT_MAX); +ASSYM(KEXEC_IMAGE_SIZE, sizeof(struct kexec_image)); +ASSYM(KEXEC_STAGED_SEGMENT_SIZE, sizeof(struct kexec_segment_stage)); +ASSYM(KEXEC_SEGMENT_SIZE, offsetof(struct kexec_segment_stage, size)); +ASSYM(KEXEC_SEGMENT_MAP, offsetof(struct kexec_segment_stage, map_buf)); +ASSYM(KEXEC_SEGMENT_TARGET, offsetof(struct kexec_segment_stage, target)); diff --git a/sys/amd64/amd64/kexec_support.c b/sys/amd64/amd64/kexec_support.c new file mode 100644 index 000000000000..8189a48e9ae9 --- /dev/null +++ b/sys/amd64/amd64/kexec_support.c @@ -0,0 +1,300 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/conf.h> +#include <sys/interrupt.h> +#include <sys/kernel.h> +#include <sys/kexec.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_object.h> +#include <vm/vm_phys.h> +#include <vm/pmap.h> +#include <vm/vm_page.h> +#include <vm/vm_radix.h> + +#include <machine/intr_machdep.h> +#include <machine/kexec.h> +#include <machine/md_var.h> +#include <machine/pmap.h> +#include <x86/apicvar.h> + +/* + * Idea behind this: + * + * kexec_load_md(): + * - Update boot page tables (identity map) to include all pages needed before + * disabling MMU. + * + * kexec_reboot_md(): + * - Copy pages into target(s) + * - Do "other stuff" + * - Does not return + */ + +/* + * do_pte: Create PTE entries (4k pages). If false, create 2MB superpages. + * identity: This is for an identity map, treat `start` as a physical address. + * Only valid here if do_pte is false. + */ +static void +kexec_generate_page_tables(pml4_entry_t *root, vm_offset_t start, + vm_size_t size, bool do_pte, bool identity, struct pctrie_iter *pages) +{ + vm_paddr_t mpa; + vm_offset_t pg; + vm_size_t stride = do_pte ? PAGE_SIZE : NBPDR; + vm_page_t m; + vm_pindex_t i, j, k, l; + + pg = start & ~(stride - 1); + i = pmap_pml4e_index(pg); + j = pmap_pdpe_index(pg); + k = pmap_pde_index(pg); + l = pmap_pte_index(pg); + for (; pg < start + size; i++, j = 0, k = 0, l = 0) { + /* + * Walk linearly, as above, but one fell swoop, one page at a + * time. + */ + if (root[i] == 0) { + m = vm_radix_iter_next(pages); + mpa = VM_PAGE_TO_PHYS(m); + root[i] = mpa | PG_RW | PG_V; + } + pdp_entry_t *pdp = + (pdp_entry_t *)(PHYS_TO_DMAP(root[i] & PG_FRAME)); + for (; j < NPDPEPG && pg < start + size; j++, k = 0, l = 0) { + if (pdp[j] == 0) { + m = vm_radix_iter_next(pages); + mpa = VM_PAGE_TO_PHYS(m); + pdp[j] = mpa | PG_RW | PG_V; + } + pd_entry_t *pde = + (pd_entry_t *)(PHYS_TO_DMAP(pdp[j] & PG_FRAME)); + for (; k < NPDEPG && pg < start + size; k++, l = 0) { + if (pde[k] == 0) { + if (!do_pte) { + pde[k] = + (identity ? pg : pmap_kextract(pg)) | + PG_RW | PG_PS | PG_V; + pg += NBPDR; + continue; + } + m = vm_radix_iter_next(pages); + mpa = VM_PAGE_TO_PHYS(m); + pde[k] = mpa | PG_V | PG_RW; + } else if ((pde[k] & PG_PS) != 0) { + pg += NBPDR; + continue; + } + /* Populate the PTEs. */ + for (; l < NPTEPG && pg < start + size; + l++, pg += PAGE_SIZE) { + pt_entry_t *pte = + (pt_entry_t *)PHYS_TO_DMAP(pde[pmap_pde_index(pg)] & PG_FRAME); + pte[pmap_pte_index(pg)] = + pmap_kextract(pg) | PG_RW | PG_V; + } + } + } + } +} + +void +kexec_reboot_md(struct kexec_image *image) +{ + void (*kexec_do_tramp)(void) = image->md_image; + + intr_disable_all(); + lapic_disable(); + kexec_do_reboot_trampoline(VM_PAGE_TO_PHYS(image->first_md_page), + kexec_do_tramp); + + for (;;) + ; +} + +int +kexec_load_md(struct kexec_image *image) +{ + struct pctrie_iter pct_iter; + pml4_entry_t *PT4; + pdp_entry_t *PDP_l; + pd_entry_t *PD_l0; + vm_offset_t va; + int i; + + /* + * Start building the page table. + * First part of the page table is standard for all. + */ + vm_offset_t pa_pdp_l, pa_pd_l0, pa_pd_l1, pa_pd_l2, pa_pd_l3; + vm_page_t m; + + if (la57) + return (EINVAL); + + vm_radix_iter_init(&pct_iter, &image->map_obj->rtree); + /* Working in linear space in the mapped space, `va` is our tracker. */ + m = vm_radix_iter_lookup(&pct_iter, image->first_md_page->pindex); + va = (vm_offset_t)image->map_addr + ptoa(m->pindex); + /* We'll find a place for these later */ + PT4 = (void *)va; + va += PAGE_SIZE; + m = vm_radix_iter_next(&pct_iter); + pa_pdp_l = VM_PAGE_TO_PHYS(m); + PDP_l = (void *)va; + va += PAGE_SIZE; + m = vm_radix_iter_next(&pct_iter); + pa_pd_l0 = VM_PAGE_TO_PHYS(m); + PD_l0 = (void *)va; + va += PAGE_SIZE; + m = vm_radix_iter_next(&pct_iter); + pa_pd_l1 = VM_PAGE_TO_PHYS(m); + m = vm_radix_iter_next(&pct_iter); + pa_pd_l2 = VM_PAGE_TO_PHYS(m); + m = vm_radix_iter_next(&pct_iter); + pa_pd_l3 = VM_PAGE_TO_PHYS(m); + m = vm_radix_iter_next(&pct_iter); + + /* 1:1 mapping of lower 4G */ + PT4[0] = (pml4_entry_t)pa_pdp_l | PG_V | PG_RW; + PDP_l[0] = (pdp_entry_t)pa_pd_l0 | PG_V | PG_RW; + PDP_l[1] = (pdp_entry_t)pa_pd_l1 | PG_V | PG_RW; + PDP_l[2] = (pdp_entry_t)pa_pd_l2 | PG_V | PG_RW; + PDP_l[3] = (pdp_entry_t)pa_pd_l3 | PG_V | PG_RW; + for (i = 0; i < 4 * NPDEPG; i++) { /* we overflow PD_l0 into _l1, etc */ + PD_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V | + PG_RW | PG_PS; + } + + /* Map the target(s) in 2MB chunks. */ + for (i = 0; i < KEXEC_SEGMENT_MAX; i++) { + struct kexec_segment_stage *s = &image->segments[i]; + + if (s->size == 0) + break; + kexec_generate_page_tables(PT4, s->target, s->size, false, + true, &pct_iter); + } + /* Now create the source page tables */ + kexec_generate_page_tables(PT4, image->map_addr, image->map_size, true, + false, &pct_iter); + kexec_generate_page_tables(PT4, + trunc_page((vm_offset_t)kexec_do_reboot_trampoline), + PAGE_SIZE, true, false, &pct_iter); + KASSERT(m != NULL, ("kexec_load_md: Missing trampoline page!\n")); + + /* MD control pages start at this next page. */ + image->md_image = (void *)(image->map_addr + ptoa(m->pindex)); + bcopy(kexec_do_reboot, image->md_image, kexec_do_reboot_size); + + /* Save the image into the MD page(s) right after the trampoline */ + bcopy(image, (void *)((vm_offset_t)image->md_image + + (vm_offset_t)&kexec_saved_image - (vm_offset_t)&kexec_do_reboot), + sizeof(*image)); + + return (0); +} + +/* + * Required pages: + * - L4 (1) (root) + * - L3 (PDPE) - 2 (bottom 512GB, bottom 4 used, top range for kernel map) + * - L2 (PDP) - 5 (2MB superpage mappings, 1GB each, for bottom 4GB, top 1) + * - L1 (PDR) - 1 (kexec trampoline page, first MD page) + * - kexec_do_reboot trampoline - 1 + * - Slop pages for staging (in case it's not aligned nicely) - 3 (worst case) + * + * Minimum 9 pages for the direct map. + */ +int +kexec_md_pages(struct kexec_segment *seg_in) +{ + struct kexec_segment *segs = seg_in; + vm_size_t pages = 13; /* Minimum number of starting pages */ + vm_paddr_t cur_addr = (1UL << 32) - 1; /* Bottom 4G will be identity mapped in full */ + vm_size_t source_total = 0; + + for (int i = 0; i < KEXEC_SEGMENT_MAX; i++) { + vm_offset_t start, end; + if (segs[i].memsz == 0) + break; + + end = round_2mpage((vm_offset_t)segs[i].mem + segs[i].memsz); + start = trunc_2mpage((vm_offset_t)segs[i].mem); + start = max(start, cur_addr + 1); + /* + * Round to cover the full range of page table pages for each + * segment. + */ + source_total += round_2mpage(end - start); + + /* + * Bottom 4GB are identity mapped already in the count, so skip + * any segments that end up there, this will short-circuit that. + */ + if (end <= cur_addr + 1) + continue; + + if (pmap_pml4e_index(end) != pmap_pml4e_index(cur_addr)) { + /* Need a new 512GB mapping page */ + pages++; + pages += howmany(end - (start & ~PML4MASK), NBPML4); + pages += howmany(end - (start & ~PDPMASK), NBPDP); + pages += howmany(end - (start & ~PDRMASK), NBPDR); + + } else if (pmap_pdpe_index(end) != pmap_pdpe_index(cur_addr)) { + pages++; + pages += howmany(end - (start & ~PDPMASK), NBPDP) - 1; + pages += howmany(end - (start & ~PDRMASK), NBPDR); + } + + } + /* Be pessimistic when totaling up source pages. We likely + * can't use superpages, so need to map each page individually. + */ + pages += howmany(source_total, NBPDR); + pages += howmany(source_total, NBPDP); + pages += howmany(source_total, NBPML4); + + /* + * Be intentionally sloppy adding in the extra page table pages. It's + * better to go over than under. + */ + pages += howmany(pages * PAGE_SIZE, NBPDR); + pages += howmany(pages * PAGE_SIZE, NBPDP); + pages += howmany(pages * PAGE_SIZE, NBPML4); + + /* Add in the trampoline pages */ + pages += howmany(kexec_do_reboot_size, PAGE_SIZE); + + return (pages); +} diff --git a/sys/amd64/amd64/kexec_tramp.S b/sys/amd64/amd64/kexec_tramp.S new file mode 100644 index 000000000000..6a2de676bc35 --- /dev/null +++ b/sys/amd64/amd64/kexec_tramp.S @@ -0,0 +1,91 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <machine/asmacros.h> +#include <machine/specialreg.h> +#include "assym.inc" + +/* + * Take a pointer to the image, copy each segment, and jump to the trampoline. + * + * Assumptions: + * - image is in safe memory + * - We're already running out of the new "identity" map. + * - All registers are free game, so go nuts + * - Interrupts are disabled + * - All APs are disabled + */ +ENTRY(kexec_do_reboot) + /* + r9: image pointer + r10: segment pointer + r11: segment counter + */ + leaq kexec_stack(%rip), %rsp + /* Get the saved kexec_image. */ + leaq kexec_saved_image(%rip), %r9 + leaq KEXEC_SEGMENTS(%r9), %r10 + movq $KEXEC_SEGMENT_MAX, %r11 +copy_segment: + movq KEXEC_SEGMENT_SIZE(%r10), %rcx + cmpq $0, %rcx + je done + shrq $3, %rcx + movq KEXEC_SEGMENT_TARGET(%r10), %rdi + movq KEXEC_SEGMENT_MAP(%r10), %rsi + rep + movsq + addq $KEXEC_STAGED_SEGMENT_SIZE, %r10 + decq %r11 + jg copy_segment + +done: + pushq KEXEC_ENTRY(%r9) + ret +fail: + jmp fail +END(kexec_do_reboot) +ENTRY(kexec_do_reboot_trampoline) + /* Set new page table, clears most of TLB. */ + movq %rdi, %cr3 + + /* Now flush the rest of the TLB, including global pages. */ + movq %cr4, %rax + andq $~CR4_PGE, %rax + movq %rax, %cr4 + jmp *%rsi +END(kexec_do_reboot_trampoline) +CNAME(kexec_saved_image): + .globl kexec_saved_image + .space KEXEC_IMAGE_SIZE + .quad 0 + /* We don't need more than quad, so just fill out the page. */ + .p2align PAGE_SHIFT + kexec_stack: +CNAME(kexec_do_reboot_size): + .globl kexec_do_reboot_size + .quad . - kexec_do_reboot diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index 00e99f9df192..96ed0a2cc3ba 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -140,6 +140,10 @@ cpu_mp_start(void) setidt(IPI_STOP, pti ? IDTVEC(cpustop_pti) : IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0); + /* Install an inter-CPU IPI for CPU offline */ + setidt(IPI_OFF, pti ? IDTVEC(cpuoff_pti) : IDTVEC(cpuoff), + SDT_SYSIGT, SEL_KPL, 0); + /* Install an inter-CPU IPI for CPU suspend/resume */ setidt(IPI_SUSPEND, pti ? IDTVEC(cpususpend_pti) : IDTVEC(cpususpend), SDT_SYSIGT, SEL_KPL, 0); @@ -176,6 +180,15 @@ cpu_mp_start(void) #endif } +void +cpu_mp_stop(void) +{ + cpuset_t other_cpus = all_cpus; + + CPU_CLR(PCPU_GET(cpuid), &other_cpus); + offline_cpus(other_cpus); +} + /* * AP CPU's call this to initialize themselves. */ diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index f3469ed5e2bc..84305ca918df 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -435,9 +435,9 @@ trap(struct trapframe *frame) if ((print_efirt_faults == 1 && cnt == 0) || print_efirt_faults == 2) { - trap_diag(frame, 0); printf("EFI RT fault %s\n", traptype_to_msg(type)); + trap_diag(frame, 0); } frame->tf_rip = (long)curpcb->pcb_onfault; return; @@ -870,8 +870,8 @@ after_vmfault: if ((print_efirt_faults == 1 && cnt == 0) || print_efirt_faults == 2) { - trap_diag(frame, eva); printf("EFI RT page fault\n"); + trap_diag(frame, eva); } } frame->tf_rip = (long)curpcb->pcb_onfault; diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index 2e41ed26403a..fb8473505128 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -309,7 +309,6 @@ device wpi # Intel 3945ABG wireless NICs. device crypto # core crypto support device aesni # AES-NI OpenCrypto module device loop # Network loopback -device padlock_rng # VIA Padlock RNG device rdrand_rng # Intel Bull Mountain RNG device ether # Ethernet support device vlan # 802.1Q VLAN support diff --git a/sys/amd64/conf/MINIMAL b/sys/amd64/conf/MINIMAL index 0baf6d6431de..61c713c609a4 100644 --- a/sys/amd64/conf/MINIMAL +++ b/sys/amd64/conf/MINIMAL @@ -113,7 +113,6 @@ device uart # Generic UART driver # Pseudo devices. device loop # Network loopback -device padlock_rng # VIA Padlock RNG device rdrand_rng # Intel Bull Mountain RNG device ether # Ethernet support diff --git a/sys/amd64/include/kexec.h b/sys/amd64/include/kexec.h new file mode 100644 index 000000000000..70bc2991be3f --- /dev/null +++ b/sys/amd64/include/kexec.h @@ -0,0 +1,41 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _AMD64_KEXEC_H_ +#define _AMD64_KEXEC_H_ + +struct kexec_segment; +struct kexec_image; +int kexec_md_pages(struct kexec_segment *); +extern void kexec_do_reboot(void); +extern long kexec_do_reboot_size; +extern void *kexec_saved_image; +extern void kexec_do_reboot_trampoline(unsigned long, void (*)(void)); +#define KEXEC_MD_PAGES(x) kexec_md_pages(x) + + +#endif /* _AMD64_KEXEC_H_ */ diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h index bff92570ff82..28c372a2e556 100644 --- a/sys/amd64/include/smp.h +++ b/sys/amd64/include/smp.h @@ -30,6 +30,7 @@ inthand_t IDTVEC(ipi_intr_bitmap_handler_pti), IDTVEC(ipi_swi_pti), IDTVEC(cpustop_pti), + IDTVEC(cpuoff_pti), IDTVEC(cpususpend_pti), IDTVEC(rendezvous_pti); diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index ad67510fecf3..5cf1ae2d769c 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -122,33 +122,7 @@ enum x2apic_state { #define VM_INTINFO_HWEXCEPTION (3 << 8) #define VM_INTINFO_SWINTR (4 << 8) -/* - * The VM name has to fit into the pathname length constraints of devfs, - * governed primarily by SPECNAMELEN. The length is the total number of - * characters in the full path, relative to the mount point and not - * including any leading '/' characters. - * A prefix and a suffix are added to the name specified by the user. - * The prefix is usually "vmm/" or "vmm.io/", but can be a few characters - * longer for future use. - * The suffix is a string that identifies a bootrom image or some similar - * image that is attached to the VM. A separator character gets added to - * the suffix automatically when generating the full path, so it must be - * accounted for, reducing the effective length by 1. - * The effective length of a VM name is 229 bytes for FreeBSD 13 and 37 - * bytes for FreeBSD 12. A minimum length is set for safety and supports - * a SPECNAMELEN as small as 32 on old systems. - */ -#define VM_MAX_PREFIXLEN 10 -#define VM_MAX_SUFFIXLEN 15 -#define VM_MIN_NAMELEN 6 -#define VM_MAX_NAMELEN \ - (SPECNAMELEN - VM_MAX_PREFIXLEN - VM_MAX_SUFFIXLEN - 1) - #ifdef _KERNEL -#include <sys/kassert.h> - -CTASSERT(VM_MAX_NAMELEN >= VM_MIN_NAMELEN); - struct vm; struct vm_exception; struct vm_mem; @@ -232,8 +206,6 @@ struct vmm_ops { extern const struct vmm_ops vmm_ops_intel; extern const struct vmm_ops vmm_ops_amd; -extern u_int vm_maxcpu; /* maximum virtual cpus */ - int vm_create(const char *name, struct vm **retvm); struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid); void vm_disable_vcpu_creation(struct vm *vm); @@ -383,7 +355,8 @@ vcpu_should_yield(struct vcpu *vcpu) #endif void *vcpu_stats(struct vcpu *vcpu); -void vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr); +void vcpu_notify_event(struct vcpu *vcpu); +void vcpu_notify_lapic(struct vcpu *vcpu); struct vm_mem *vm_mem(struct vm *vm); struct vatpic *vm_atpic(struct vm *vm); struct vatpit *vm_atpit(struct vm *vm); diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h index 441330fd57b8..f1c07a983a4b 100644 --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -34,6 +34,8 @@ #include <machine/vmm.h> #include <machine/vmm_snapshot.h> +#include <dev/vmm/vmm_param.h> + struct vm_memmap { vm_paddr_t gpa; int segid; /* memory segment */ diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 842281ab862e..4189c1214b40 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -27,7 +27,6 @@ * SUCH DAMAGE. */ -#include <sys/cdefs.h> #include "opt_bhyve_snapshot.h" #include <sys/param.h> @@ -58,6 +57,7 @@ #include <machine/vmm_instruction_emul.h> #include <machine/vmm_snapshot.h> +#include <dev/vmm/vmm_dev.h> #include <dev/vmm/vmm_ktr.h> #include <dev/vmm/vmm_mem.h> diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c index 2cb459fb848f..6feac5dcbbed 100644 --- a/sys/amd64/vmm/io/ppt.c +++ b/sys/amd64/vmm/io/ppt.c @@ -336,13 +336,6 @@ ppt_teardown_msix(struct pptdev *ppt) } int -ppt_avail_devices(void) -{ - - return (num_pptdevs); -} - -int ppt_assigned_devices(struct vm *vm) { struct pptdev *ppt; diff --git a/sys/amd64/vmm/io/ppt.h b/sys/amd64/vmm/io/ppt.h index f97c399564d7..9377f34d50e6 100644 --- a/sys/amd64/vmm/io/ppt.h +++ b/sys/amd64/vmm/io/ppt.h @@ -43,12 +43,6 @@ int ppt_assigned_devices(struct vm *vm); bool ppt_is_mmio(struct vm *vm, vm_paddr_t gpa); /* - * Returns the number of devices sequestered by the ppt driver for assignment - * to virtual machines. - */ -int ppt_avail_devices(void); - -/* * The following functions should never be called directly. * Use 'vm_assign_pptdev()' and 'vm_unassign_pptdev()' instead. */ diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index 9879dfa164a4..afd5045de574 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -456,7 +456,7 @@ vlapic_fire_lvt(struct vlapic *vlapic, u_int lvt) return (0); } if (vlapic_set_intr_ready(vlapic, vec, false)) - vcpu_notify_event(vlapic->vcpu, true); + vcpu_notify_lapic(vlapic->vcpu); break; case APIC_LVT_DM_NMI: vm_inject_nmi(vlapic->vcpu); diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 473887240b9b..2890e990633d 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -31,7 +31,6 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> -#include <sys/module.h> #include <sys/sysctl.h> #include <sys/malloc.h> #include <sys/pcpu.h> @@ -189,8 +188,6 @@ struct vm { #define VMM_CTR4(vcpu, format, p1, p2, p3, p4) \ VCPU_CTR4((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2, p3, p4) -static int vmm_initialized; - static void vmmops_panic(void); static void @@ -270,11 +267,7 @@ static int trap_wbinvd; SYSCTL_INT(_hw_vmm, OID_AUTO, trap_wbinvd, CTLFLAG_RDTUN, &trap_wbinvd, 0, "WBINVD triggers a VM-exit"); -u_int vm_maxcpu; -SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, - &vm_maxcpu, 0, "Maximum number of vCPUs"); - -static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr); +static void vcpu_notify_event_locked(struct vcpu *vcpu); /* global statistics */ VMM_STAT(VCPU_MIGRATIONS, "vcpu migration across host cpus"); @@ -299,14 +292,6 @@ VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace"); VMM_STAT(VMEXIT_RENDEZVOUS, "number of times rendezvous pending at exit"); VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions"); -/* - * Upper limit on vm_maxcpu. Limited by use of uint16_t types for CPU - * counts as well as range of vpid values for VT-x and by the capacity - * of cpuset_t masks. The call to new_unrhdr() in vpid_init() in - * vmx.c requires 'vm_maxcpu + 1 <= 0xffff', hence the '- 1' below. - */ -#define VM_MAXCPU MIN(0xffff - 1, CPU_SETSIZE) - #ifdef KTR static const char * vcpu_state2str(enum vcpu_state state) @@ -402,22 +387,12 @@ vm_exitinfo_cpuset(struct vcpu *vcpu) return (&vcpu->exitinfo_cpuset); } -static int -vmm_init(void) +int +vmm_modinit(void) { if (!vmm_is_hw_supported()) return (ENXIO); - vm_maxcpu = mp_ncpus; - TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu); - - if (vm_maxcpu > VM_MAXCPU) { - printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU); - vm_maxcpu = VM_MAXCPU; - } - if (vm_maxcpu == 0) - vm_maxcpu = 1; - vmm_host_state_init(); vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : @@ -431,70 +406,17 @@ vmm_init(void) return (vmmops_modinit(vmm_ipinum)); } -static int -vmm_handler(module_t mod, int what, void *arg) +int +vmm_modcleanup(void) { - int error; - - switch (what) { - case MOD_LOAD: - if (vmm_is_hw_supported()) { - error = vmmdev_init(); - if (error != 0) - break; - error = vmm_init(); - if (error == 0) - vmm_initialized = 1; - else - (void)vmmdev_cleanup(); - } else { - error = ENXIO; - } - break; - case MOD_UNLOAD: - if (vmm_is_hw_supported()) { - error = vmmdev_cleanup(); - if (error == 0) { - vmm_suspend_p = NULL; - vmm_resume_p = NULL; - iommu_cleanup(); - if (vmm_ipinum != IPI_AST) - lapic_ipi_free(vmm_ipinum); - error = vmmops_modcleanup(); - /* - * Something bad happened - prevent new - * VMs from being created - */ - if (error) - vmm_initialized = 0; - } - } else { - error = 0; - } - break; - default: - error = 0; - break; - } - return (error); + vmm_suspend_p = NULL; + vmm_resume_p = NULL; + iommu_cleanup(); + if (vmm_ipinum != IPI_AST) + lapic_ipi_free(vmm_ipinum); + return (vmmops_modcleanup()); } -static moduledata_t vmm_kmod = { - "vmm", - vmm_handler, - NULL -}; - -/* - * vmm initialization has the following dependencies: - * - * - VT-x initialization requires smp_rendezvous() and therefore must happen - * after SMP is fully functional (after SI_SUB_SMP). - * - vmm device initialization requires an initialized devfs. - */ -DECLARE_MODULE(vmm, vmm_kmod, MAX(SI_SUB_SMP, SI_SUB_DEVFS) + 1, SI_ORDER_ANY); -MODULE_VERSION(vmm, 1); - static void vm_init(struct vm *vm, bool create) { @@ -573,29 +495,12 @@ vm_unlock_vcpus(struct vm *vm) sx_unlock(&vm->vcpus_init_lock); } -/* - * The default CPU topology is a single thread per package. - */ -u_int cores_per_package = 1; -u_int threads_per_core = 1; - int vm_create(const char *name, struct vm **retvm) { struct vm *vm; int error; - /* - * If vmm.ko could not be successfully initialized then don't attempt - * to create the virtual machine. - */ - if (!vmm_initialized) - return (ENXIO); - - if (name == NULL || strnlen(name, VM_MAX_NAMELEN + 1) == - VM_MAX_NAMELEN + 1) - return (EINVAL); - vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); error = vm_mem_init(&vm->mem, 0, VM_MAXUSER_ADDRESS_LA48); if (error != 0) { @@ -609,8 +514,8 @@ vm_create(const char *name, struct vm **retvm) M_ZERO); vm->sockets = 1; - vm->cores = cores_per_package; /* XXX backwards compatibility */ - vm->threads = threads_per_core; /* XXX backwards compatibility */ + vm->cores = 1; /* XXX backwards compatibility */ + vm->threads = 1; /* XXX backwards compatibility */ vm->maxcpus = vm_maxcpu; vm_init(vm, true); @@ -724,12 +629,7 @@ vm_name(struct vm *vm) int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { - vm_object_t obj; - - if ((obj = vmm_mmio_alloc(vm_vmspace(vm), gpa, len, hpa)) == NULL) - return (ENOMEM); - else - return (0); + return (vmm_mmio_alloc(vm_vmspace(vm), gpa, len, hpa)); } int @@ -1033,7 +933,7 @@ vcpu_wait_idle(struct vcpu *vcpu) KASSERT(vcpu->state != VCPU_IDLE, ("vcpu already idle")); vcpu->reqidle = 1; - vcpu_notify_event_locked(vcpu, false); + vcpu_notify_event_locked(vcpu); VMM_CTR1(vcpu, "vcpu state change from %s to " "idle requested", vcpu_state2str(vcpu->state)); msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); @@ -1514,7 +1414,7 @@ vm_handle_suspend(struct vcpu *vcpu, bool *retu) */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->suspended_cpus)) { - vcpu_notify_event(vm_vcpu(vm, i), false); + vcpu_notify_event(vm_vcpu(vm, i)); } } @@ -1588,7 +1488,7 @@ vm_suspend(struct vm *vm, enum vm_suspend_how how) */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) - vcpu_notify_event(vm_vcpu(vm, i), false); + vcpu_notify_event(vm_vcpu(vm, i)); } return (0); @@ -2068,7 +1968,7 @@ vm_inject_nmi(struct vcpu *vcpu) { vcpu->nmi_pending = 1; - vcpu_notify_event(vcpu, false); + vcpu_notify_event(vcpu); return (0); } @@ -2095,7 +1995,7 @@ vm_inject_extint(struct vcpu *vcpu) { vcpu->extint_pending = 1; - vcpu_notify_event(vcpu, false); + vcpu_notify_event(vcpu); return (0); } @@ -2266,14 +2166,14 @@ vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu) vm->debug_cpus = vm->active_cpus; for (int i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) - vcpu_notify_event(vm_vcpu(vm, i), false); + vcpu_notify_event(vm_vcpu(vm, i)); } } else { if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) return (EINVAL); CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); - vcpu_notify_event(vcpu, false); + vcpu_notify_event(vcpu); } return (0); } @@ -2381,7 +2281,7 @@ vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state) * to the host_cpu to cause the vcpu to trap into the hypervisor. */ static void -vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr) +vcpu_notify_event_locked(struct vcpu *vcpu) { int hostcpu; @@ -2389,12 +2289,7 @@ vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr) if (vcpu->state == VCPU_RUNNING) { KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); if (hostcpu != curcpu) { - if (lapic_intr) { - vlapic_post_intr(vcpu->vlapic, hostcpu, - vmm_ipinum); - } else { - ipi_cpu(hostcpu, vmm_ipinum); - } + ipi_cpu(hostcpu, vmm_ipinum); } else { /* * If the 'vcpu' is running on 'curcpu' then it must @@ -2412,10 +2307,21 @@ vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr) } void -vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr) +vcpu_notify_event(struct vcpu *vcpu) { vcpu_lock(vcpu); - vcpu_notify_event_locked(vcpu, lapic_intr); + vcpu_notify_event_locked(vcpu); + vcpu_unlock(vcpu); +} + +void +vcpu_notify_lapic(struct vcpu *vcpu) +{ + vcpu_lock(vcpu); + if (vcpu->state == VCPU_RUNNING && vcpu->hostcpu != curcpu) + vlapic_post_intr(vcpu->vlapic, vcpu->hostcpu, vmm_ipinum); + else + vcpu_notify_event_locked(vcpu); vcpu_unlock(vcpu); } @@ -2477,7 +2383,7 @@ restart: */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &dest)) - vcpu_notify_event(vm_vcpu(vm, i), false); + vcpu_notify_event(vm_vcpu(vm, i)); } return (vm_handle_rendezvous(vcpu)); diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c index 0cae01f172ec..63bdee69bb59 100644 --- a/sys/amd64/vmm/vmm_lapic.c +++ b/sys/amd64/vmm/vmm_lapic.c @@ -61,7 +61,7 @@ lapic_set_intr(struct vcpu *vcpu, int vector, bool level) vlapic = vm_lapic(vcpu); if (vlapic_set_intr_ready(vlapic, vector, level)) - vcpu_notify_event(vcpu, true); + vcpu_notify_lapic(vcpu); return (0); } diff --git a/sys/amd64/vmm/vmm_mem.h b/sys/amd64/vmm/vmm_mem.h index 41b9bf07c4fc..d905fd37001d 100644 --- a/sys/amd64/vmm/vmm_mem.h +++ b/sys/amd64/vmm/vmm_mem.h @@ -30,10 +30,9 @@ #define _VMM_MEM_H_ struct vmspace; -struct vm_object; -struct vm_object *vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len, - vm_paddr_t hpa); +int vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa); void vmm_mmio_free(struct vmspace *, vm_paddr_t gpa, size_t size); vm_paddr_t vmm_mem_maxaddr(void); diff --git a/sys/amd64/vmm/vmm_mem_machdep.c b/sys/amd64/vmm/vmm_mem_machdep.c index e96c9e4bdc66..afb3a0274e2a 100644 --- a/sys/amd64/vmm/vmm_mem_machdep.c +++ b/sys/amd64/vmm/vmm_mem_machdep.c @@ -36,6 +36,7 @@ #include <vm/vm.h> #include <vm/vm_param.h> #include <vm/pmap.h> +#include <vm/vm_extern.h> #include <vm/vm_map.h> #include <vm/vm_object.h> #include <vm/vm_page.h> @@ -45,40 +46,48 @@ #include "vmm_mem.h" -vm_object_t +int vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len, - vm_paddr_t hpa) + vm_paddr_t hpa) { - int error; - vm_object_t obj; struct sglist *sg; + vm_object_t obj; + int error; + + if (gpa + len < gpa || hpa + len < hpa || (gpa & PAGE_MASK) != 0 || + (hpa & PAGE_MASK) != 0 || (len & PAGE_MASK) != 0) + return (EINVAL); sg = sglist_alloc(1, M_WAITOK); error = sglist_append_phys(sg, hpa, len); KASSERT(error == 0, ("error %d appending physaddr to sglist", error)); obj = vm_pager_allocate(OBJT_SG, sg, len, VM_PROT_RW, 0, NULL); - if (obj != NULL) { - /* - * VT-x ignores the MTRR settings when figuring out the - * memory type for translations obtained through EPT. - * - * Therefore we explicitly force the pages provided by - * this object to be mapped as uncacheable. - */ - VM_OBJECT_WLOCK(obj); - error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE); - VM_OBJECT_WUNLOCK(obj); - if (error != KERN_SUCCESS) { - panic("vmm_mmio_alloc: vm_object_set_memattr error %d", - error); - } - error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0, - VMFS_NO_SPACE, VM_PROT_RW, VM_PROT_RW, 0); - if (error != KERN_SUCCESS) { - vm_object_deallocate(obj); - obj = NULL; - } + if (obj == NULL) + return (ENOMEM); + + /* + * VT-x ignores the MTRR settings when figuring out the memory type for + * translations obtained through EPT. + * + * Therefore we explicitly force the pages provided by this object to be + * mapped as uncacheable. + */ + VM_OBJECT_WLOCK(obj); + error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE); + VM_OBJECT_WUNLOCK(obj); + if (error != KERN_SUCCESS) + panic("vmm_mmio_alloc: vm_object_set_memattr error %d", error); + + vm_map_lock(&vmspace->vm_map); + error = vm_map_insert(&vmspace->vm_map, obj, 0, gpa, gpa + len, + VM_PROT_RW, VM_PROT_RW, 0); + vm_map_unlock(&vmspace->vm_map); + if (error != KERN_SUCCESS) { + error = vm_mmap_to_errno(error); + vm_object_deallocate(obj); + } else { + error = 0; } /* @@ -94,7 +103,7 @@ vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len, */ sglist_free(sg); - return (obj); + return (error); } void |
