Diffstat (limited to 'sys/powerpc/aim')
| -rw-r--r-- | sys/powerpc/aim/aim_machdep.c   |  813 |
| -rw-r--r-- | sys/powerpc/aim/locore.S        |   15 |
| -rw-r--r-- | sys/powerpc/aim/locore32.S      |  123 |
| -rw-r--r-- | sys/powerpc/aim/locore64.S      |  287 |
| -rw-r--r-- | sys/powerpc/aim/mmu_oea.c       | 2843 |
| -rw-r--r-- | sys/powerpc/aim/mmu_oea64.c     | 4341 |
| -rw-r--r-- | sys/powerpc/aim/mmu_oea64.h     |  143 |
| -rw-r--r-- | sys/powerpc/aim/mmu_radix.c     | 6552 |
| -rw-r--r-- | sys/powerpc/aim/moea64_native.c | 1031 |
| -rw-r--r-- | sys/powerpc/aim/mp_cpudep.c     |  425 |
| -rw-r--r-- | sys/powerpc/aim/slb.c           |  624 |
| -rw-r--r-- | sys/powerpc/aim/trap_subr32.S   |  929 |
| -rw-r--r-- | sys/powerpc/aim/trap_subr64.S   |  997 |
13 files changed, 19123 insertions, 0 deletions
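Note: the aim_machdep.c hunk below measures the data cache line size at boot by filling a cache-aligned scratch area with 0xff, issuing dcbz (which zeroes exactly one cache block), and scanning for the first byte left untouched. A minimal standalone sketch of that probe follows; it is illustrative only and not part of the diff. The buffer name, the 256-byte bound, and running it as a userland program on a PowerPC machine are assumptions here; the kernel version instead reuses the EXC_PGM vector area with translation disabled and falls back to 32 bytes if dcbz zeroes nothing (psim workaround).

/*
 * Hypothetical userland sketch of the dcbz cache-line probe used in
 * aim_cpu_init().  Must run on PowerPC hardware; 'probe' is an
 * illustrative name, not taken from the diff.
 */
#include <stdint.h>
#include <stdio.h>

static uint8_t probe[256] __attribute__((aligned(256)));

int
main(void)
{
	int size;

	/* Fill the scratch buffer with a nonzero pattern. */
	for (size = 0; size < 256; size++)
		probe[size] = 0xff;

	/* dcbz zeroes exactly one data cache block starting at 'probe'. */
	__asm__ __volatile__("dcbz 0,%0" : : "r"(probe) : "memory");

	/* The first byte dcbz did not zero marks the cache line size. */
	for (size = 0; size < 256 && probe[size] == 0; size++)
		;

	printf("cache line size: %d bytes\n", size);
	return (0);
}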
diff --git a/sys/powerpc/aim/aim_machdep.c b/sys/powerpc/aim/aim_machdep.c new file mode 100644 index 000000000000..814c679ff47e --- /dev/null +++ b/sys/powerpc/aim/aim_machdep.c @@ -0,0 +1,813 @@ +/*- + * Copyright (C) 1995, 1996 Wolfgang Solfrank. + * Copyright (C) 1995, 1996 TooLs GmbH. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by TooLs GmbH. + * 4. The name of TooLs GmbH may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +/*- + * Copyright (C) 2001 Benno Rice + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * $NetBSD: machdep.c,v 1.74.2.1 2000/11/01 16:13:48 tv Exp $ + */ + +#include <sys/cdefs.h> +#include "opt_ddb.h" +#include "opt_kstack_pages.h" +#include "opt_platform.h" + +#include <sys/endian.h> +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/bus.h> +#include <sys/cons.h> +#include <sys/cpu.h> +#include <sys/eventhandler.h> +#include <sys/exec.h> +#include <sys/imgact.h> +#include <sys/kdb.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/linker.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/msgbuf.h> +#include <sys/mutex.h> +#include <sys/ptrace.h> +#include <sys/reboot.h> +#include <sys/rwlock.h> +#include <sys/signalvar.h> +#include <sys/syscallsubr.h> +#include <sys/sysctl.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> +#include <sys/ucontext.h> +#include <sys/uio.h> +#include <sys/vmmeter.h> +#include <sys/vnode.h> + +#include <net/netisr.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_pager.h> + +#include <machine/altivec.h> +#ifndef __powerpc64__ +#include <machine/bat.h> +#endif +#include <machine/cpu.h> +#include <machine/elf.h> +#include <machine/fpu.h> +#include <machine/hid.h> +#include <machine/kdb.h> +#include <machine/md_var.h> +#include <machine/metadata.h> +#include <machine/mmuvar.h> +#include <machine/pcb.h> +#include <machine/sigframe.h> +#include <machine/spr.h> +#include <machine/trap.h> +#include <machine/vmparam.h> +#include <machine/ofw_machdep.h> + +#include <ddb/ddb.h> + +#include <dev/ofw/openfirm.h> + +#ifdef __powerpc64__ +#include "mmu_oea64.h" +#endif + +#ifndef __powerpc64__ +struct bat battable[16]; +#endif + +int radix_mmu = 0; + +#ifndef __powerpc64__ +/* Bits for running on 64-bit systems in 32-bit mode. */ +extern void *testppc64, *testppc64size; +extern void *restorebridge, *restorebridgesize; +extern void *rfid_patch, *rfi_patch1, *rfi_patch2; +extern void *trapcode64; + +extern Elf_Addr _GLOBAL_OFFSET_TABLE_[]; +#endif + +extern void *rstcode, *rstcodeend; +extern void *trapcode, *trapcodeend; +extern void *hypertrapcode, *hypertrapcodeend; +extern void *generictrap, *generictrap64; +extern void *alitrap, *aliend; +extern void *dsitrap, *dsiend; +extern void *decrint, *decrsize; +extern void *extint, *extsize; +extern void *dblow, *dbend; +extern void *imisstrap, *imisssize; +extern void *dlmisstrap, *dlmisssize; +extern void *dsmisstrap, *dsmisssize; + +extern void *ap_pcpu; +extern void __restartkernel(vm_offset_t, vm_offset_t, vm_offset_t, void *, uint32_t, register_t offset, register_t msr); +extern void __restartkernel_virtual(vm_offset_t, vm_offset_t, vm_offset_t, void *, uint32_t, register_t offset, register_t msr); + +void aim_early_init(vm_offset_t fdt, vm_offset_t toc, vm_offset_t ofentry, + void *mdp, uint32_t mdp_cookie); +void aim_cpu_init(vm_offset_t toc); + +void +aim_early_init(vm_offset_t fdt, vm_offset_t toc, vm_offset_t ofentry, void *mdp, + uint32_t mdp_cookie) +{ + register_t scratch; + + /* + * If running from an FDT, make sure we are in real mode to avoid + * tromping on firmware page tables. Everything in the kernel assumes + * 1:1 mappings out of firmware, so this won't break anything not + * already broken. This doesn't work if there is live OF, since OF + * may internally use non-1:1 mappings. 
+ */ + if (ofentry == 0) + mtmsr(mfmsr() & ~(PSL_IR | PSL_DR)); + +#ifdef __powerpc64__ + /* + * Relocate to high memory so that the kernel + * can execute from the direct map. + * + * If we are in virtual mode already, use a special entry point + * that sets up a temporary DMAP to execute from until we can + * properly set up the MMU. + */ + if ((vm_offset_t)&aim_early_init < DMAP_BASE_ADDRESS) { + if (mfmsr() & PSL_DR) { + __restartkernel_virtual(fdt, 0, ofentry, mdp, + mdp_cookie, DMAP_BASE_ADDRESS, mfmsr()); + } else { + __restartkernel(fdt, 0, ofentry, mdp, mdp_cookie, + DMAP_BASE_ADDRESS, mfmsr()); + } + } +#endif + + /* Various very early CPU fix ups */ + switch (mfpvr() >> 16) { + /* + * PowerPC 970 CPUs have a misfeature requested by Apple that + * makes them pretend they have a 32-byte cacheline. Turn this + * off before we measure the cacheline size. + */ + case IBM970: + case IBM970FX: + case IBM970MP: + case IBM970GX: + scratch = mfspr(SPR_HID5); + scratch &= ~HID5_970_DCBZ_SIZE_HI; + mtspr(SPR_HID5, scratch); + break; + #ifdef __powerpc64__ + case IBMPOWER7: + case IBMPOWER7PLUS: + case IBMPOWER8: + case IBMPOWER8E: + case IBMPOWER8NVL: + case IBMPOWER9: + /* XXX: get from ibm,slb-size in device tree */ + n_slbs = 32; + break; + #endif + } +} + +void +aim_cpu_init(vm_offset_t toc) +{ + size_t trap_offset, trapsize; + vm_offset_t trap; + register_t msr; + uint8_t *cache_check; + int cacheline_warn; +#ifndef __powerpc64__ + register_t scratch; + int ppc64; +#endif + + trap_offset = 0; + cacheline_warn = 0; + + /* General setup for AIM CPUs */ + psl_kernset = PSL_EE | PSL_ME | PSL_IR | PSL_DR | PSL_RI; + +#ifdef __powerpc64__ + psl_kernset |= PSL_SF; + if (mfmsr() & PSL_HV) + psl_kernset |= PSL_HV; + +#if BYTE_ORDER == LITTLE_ENDIAN + psl_kernset |= PSL_LE; +#endif + +#endif + psl_userset = psl_kernset | PSL_PR; +#ifdef __powerpc64__ + psl_userset32 = psl_userset & ~PSL_SF; +#endif + + /* + * Zeroed bits in this variable signify that the value of the bit + * in its position is allowed to vary between userspace contexts. + * + * All other bits are required to be identical for every userspace + * context. The actual *value* of the bit is determined by + * psl_userset and/or psl_userset32, and is not allowed to change. + * + * Remember to update this set when implementing support for + * *conditionally* enabling a processor facility. Failing to do + * this will cause swapcontext() in userspace to break when a + * process uses a conditionally-enabled facility. + * + * When *unconditionally* implementing support for a processor + * facility, update psl_userset / psl_userset32 instead. + * + * See the access control check in set_mcontext(). + */ + psl_userstatic = ~(PSL_VSX | PSL_VEC | PSL_FP | PSL_FE0 | PSL_FE1); + /* + * Mask bits from the SRR1 that aren't really the MSR: + * Bits 1-4, 10-15 (ppc32), 33-36, 42-47 (ppc64) + */ + psl_userstatic &= ~0x783f0000UL; + + /* + * Initialize the interrupt tables and figure out our cache line + * size and whether or not we need the 64-bit bridge code. + */ + + /* + * Disable translation in case the vector area hasn't been + * mapped (G5). Note that no OFW calls can be made until + * translation is re-enabled. + */ + + msr = mfmsr(); + mtmsr((msr & ~(PSL_IR | PSL_DR)) | PSL_RI); + + /* + * Measure the cacheline size using dcbz + * + * Use EXC_PGM as a playground. We are about to overwrite it + * anyway, we know it exists, and we know it is cache-aligned. 
+ */ + + cache_check = (void *)EXC_PGM; + + for (cacheline_size = 0; cacheline_size < 0x100; cacheline_size++) + cache_check[cacheline_size] = 0xff; + + __asm __volatile("dcbz 0,%0":: "r" (cache_check) : "memory"); + + /* Find the first byte dcbz did not zero to get the cache line size */ + for (cacheline_size = 0; cacheline_size < 0x100 && + cache_check[cacheline_size] == 0; cacheline_size++); + + /* Work around psim bug */ + if (cacheline_size == 0) { + cacheline_warn = 1; + cacheline_size = 32; + } + + #ifndef __powerpc64__ + /* + * Figure out whether we need to use the 64 bit PMAP. This works by + * executing an instruction that is only legal on 64-bit PPC (mtmsrd), + * and setting ppc64 = 0 if that causes a trap. + */ + + ppc64 = 1; + + bcopy(&testppc64, (void *)EXC_PGM, (size_t)&testppc64size); + __syncicache((void *)EXC_PGM, (size_t)&testppc64size); + + __asm __volatile("\ + mfmsr %0; \ + mtsprg2 %1; \ + \ + mtmsrd %0; \ + mfsprg2 %1;" + : "=r"(scratch), "=r"(ppc64)); + + if (ppc64) + cpu_features |= PPC_FEATURE_64; + + /* + * Now copy restorebridge into all the handlers, if necessary, + * and set up the trap tables. + */ + + if (cpu_features & PPC_FEATURE_64) { + /* Patch the two instances of rfi -> rfid */ + bcopy(&rfid_patch,&rfi_patch1,4); + #ifdef KDB + /* rfi_patch2 is at the end of dbleave */ + bcopy(&rfid_patch,&rfi_patch2,4); + #endif + } + #else /* powerpc64 */ + cpu_features |= PPC_FEATURE_64; + #endif + + trapsize = (size_t)&trapcodeend - (size_t)&trapcode; + + /* + * Copy generic handler into every possible trap. Special cases will get + * different ones in a minute. + */ + for (trap = EXC_RST; trap < EXC_LAST; trap += 0x20) + bcopy(&trapcode, (void *)trap, trapsize); + + #ifndef __powerpc64__ + if (cpu_features & PPC_FEATURE_64) { + /* + * Copy a code snippet to restore 32-bit bridge mode + * to the top of every non-generic trap handler + */ + + trap_offset += (size_t)&restorebridgesize; + bcopy(&restorebridge, (void *)EXC_RST, trap_offset); + bcopy(&restorebridge, (void *)EXC_DSI, trap_offset); + bcopy(&restorebridge, (void *)EXC_ALI, trap_offset); + bcopy(&restorebridge, (void *)EXC_PGM, trap_offset); + bcopy(&restorebridge, (void *)EXC_MCHK, trap_offset); + bcopy(&restorebridge, (void *)EXC_TRC, trap_offset); + bcopy(&restorebridge, (void *)EXC_BPT, trap_offset); + } else { + /* + * Use an IBAT and a DBAT to map the bottom 256M segment. + * + * It is very important to do it *now* to avoid taking a + * fault in .text / .data before the MMU is bootstrapped, + * because until then, the translation data has not been + * copied over from OpenFirmware, so our DSI/ISI will fail + * to find a match. 
+ */ + + battable[0x0].batl = BATL(0x00000000, BAT_M, BAT_PP_RW); + battable[0x0].batu = BATU(0x00000000, BAT_BL_256M, BAT_Vs); + + __asm (".balign 32; \n" + "mtibatu 0,%0; mtibatl 0,%1; isync; \n" + "mtdbatu 0,%0; mtdbatl 0,%1; isync" + :: "r"(battable[0].batu), "r"(battable[0].batl)); + } + #else + trapsize = (size_t)&hypertrapcodeend - (size_t)&hypertrapcode; + bcopy(&hypertrapcode, (void *)(EXC_HEA + trap_offset), trapsize); + bcopy(&hypertrapcode, (void *)(EXC_HMI + trap_offset), trapsize); + bcopy(&hypertrapcode, (void *)(EXC_HVI + trap_offset), trapsize); + bcopy(&hypertrapcode, (void *)(EXC_HFAC + trap_offset), trapsize); + bcopy(&hypertrapcode, (void *)(EXC_SOFT_PATCH + trap_offset), trapsize); + #endif + + bcopy(&rstcode, (void *)(EXC_RST + trap_offset), (size_t)&rstcodeend - + (size_t)&rstcode); + +#ifdef KDB + bcopy(&dblow, (void *)(EXC_MCHK + trap_offset), (size_t)&dbend - + (size_t)&dblow); + bcopy(&dblow, (void *)(EXC_PGM + trap_offset), (size_t)&dbend - + (size_t)&dblow); + bcopy(&dblow, (void *)(EXC_TRC + trap_offset), (size_t)&dbend - + (size_t)&dblow); + bcopy(&dblow, (void *)(EXC_BPT + trap_offset), (size_t)&dbend - + (size_t)&dblow); +#endif + bcopy(&alitrap, (void *)(EXC_ALI + trap_offset), (size_t)&aliend - + (size_t)&alitrap); + bcopy(&dsitrap, (void *)(EXC_DSI + trap_offset), (size_t)&dsiend - + (size_t)&dsitrap); + + /* Set address of generictrap for self-reloc calculations */ + *((void **)TRAP_GENTRAP) = &generictrap; + #ifdef __powerpc64__ + /* Set TOC base so that the interrupt code can get at it */ + *((void **)TRAP_ENTRY) = &generictrap; + *((register_t *)TRAP_TOCBASE) = toc; + #else + /* Set branch address for trap code */ + if (cpu_features & PPC_FEATURE_64) + *((void **)TRAP_ENTRY) = &generictrap64; + else + *((void **)TRAP_ENTRY) = &generictrap; + *((void **)TRAP_TOCBASE) = _GLOBAL_OFFSET_TABLE_; + + /* G2-specific TLB miss helper handlers */ + bcopy(&imisstrap, (void *)EXC_IMISS, (size_t)&imisssize); + bcopy(&dlmisstrap, (void *)EXC_DLMISS, (size_t)&dlmisssize); + bcopy(&dsmisstrap, (void *)EXC_DSMISS, (size_t)&dsmisssize); + #endif + __syncicache(EXC_RSVD, EXC_LAST - EXC_RSVD); + + /* + * Restore MSR + */ + mtmsr(msr); + + /* Warn if cachline size was not determined */ + if (cacheline_warn == 1) { + printf("WARNING: cacheline size undetermined, setting to 32\n"); + } + + /* + * Initialise virtual memory. Use BUS_PROBE_GENERIC priority + * in case the platform module had a better idea of what we + * should do. + */ + if (radix_mmu) + pmap_mmu_install(MMU_TYPE_RADIX, BUS_PROBE_GENERIC); + else if (cpu_features & PPC_FEATURE_64) + pmap_mmu_install(MMU_TYPE_G5, BUS_PROBE_GENERIC); + else + pmap_mmu_install(MMU_TYPE_OEA, BUS_PROBE_GENERIC); +} + +/* + * Shutdown the CPU as much as possible. + */ +void +cpu_halt(void) +{ + + OF_exit(); +} + +int +ptrace_single_step(struct thread *td) +{ + struct trapframe *tf; + + tf = td->td_frame; + tf->srr1 |= PSL_SE; + + return (0); +} + +int +ptrace_clear_single_step(struct thread *td) +{ + struct trapframe *tf; + + tf = td->td_frame; + tf->srr1 &= ~PSL_SE; + + return (0); +} + +void +kdb_cpu_clear_singlestep(void) +{ + + kdb_frame->srr1 &= ~PSL_SE; +} + +void +kdb_cpu_set_singlestep(void) +{ + + kdb_frame->srr1 |= PSL_SE; +} + +/* + * Initialise a struct pcpu. 
+ */ +void +cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t sz) +{ +#ifdef __powerpc64__ +/* Copy the SLB contents from the current CPU */ +memcpy(pcpu->pc_aim.slb, PCPU_GET(aim.slb), sizeof(pcpu->pc_aim.slb)); +#endif +} + +/* Return 0 on handled success, otherwise signal number. */ +int +cpu_machine_check(struct thread *td, struct trapframe *frame, int *ucode) +{ +#ifdef __powerpc64__ + /* + * This block is 64-bit CPU specific currently. Punt running in 32-bit + * mode on 64-bit CPUs. + */ + /* Check if the important information is in DSISR */ + if ((frame->srr1 & SRR1_MCHK_DATA) != 0) { + printf("Machine check, DSISR: %016lx\n", frame->cpu.aim.dsisr); + /* SLB multi-hit is recoverable. */ + if ((frame->cpu.aim.dsisr & DSISR_MC_SLB_MULTIHIT) != 0) + return (0); + if ((frame->cpu.aim.dsisr & + (DSISR_MC_DERAT_MULTIHIT | DSISR_MC_TLB_MULTIHIT)) != 0) { + pmap_tlbie_all(); + return (0); + } + /* TODO: Add other machine check recovery procedures. */ + } else { + if ((frame->srr1 & SRR1_MCHK_IFETCH_M) == SRR1_MCHK_IFETCH_SLBMH) + return (0); + } +#endif + *ucode = BUS_OBJERR; + return (SIGBUS); +} + +#ifndef __powerpc64__ +uint64_t +va_to_vsid(pmap_t pm, vm_offset_t va) +{ + return ((pm->pm_sr[(uintptr_t)va >> ADDR_SR_SHFT]) & SR_VSID_MASK); +} + +#endif + +void +pmap_early_io_map_init(void) +{ + if ((cpu_features2 & PPC_FEATURE2_ARCH_3_00) == 0) + radix_mmu = 0; + else { + radix_mmu = 1; + TUNABLE_INT_FETCH("radix_mmu", &radix_mmu); + } + + /* + * When using Radix, set the start and end of kva early, to be able to + * use KVAs on pmap_early_io_map and avoid issues when remapping them + * later. + */ + if (radix_mmu) { + virtual_avail = VM_MIN_KERNEL_ADDRESS; + virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS; + } +} + +/* + * These functions need to provide addresses that both (a) work in real mode + * (or whatever mode/circumstances the kernel is in in early boot (now)) and + * (b) can still, in principle, work once the kernel is going. Because these + * rely on existing mappings/real mode, unmap is a no-op. + */ +vm_offset_t +pmap_early_io_map(vm_paddr_t pa, vm_size_t size) +{ + KASSERT(!pmap_bootstrapped, ("Not available after PMAP started!")); + + /* + * If we have the MMU up in early boot, assume it is 1:1. Otherwise, + * try to get the address in a memory region compatible with the + * direct map for efficiency later. + * Except for Radix MMU, for which current implementation doesn't + * support mapping arbitrary virtual addresses, such as the ones + * generated by "direct mapping" I/O addresses. In this case, use + * addresses from KVA area. 
+ */ + if (mfmsr() & PSL_DR) + return (pa); + else if (radix_mmu) { + vm_offset_t va; + + va = virtual_avail; + virtual_avail += round_page(size + pa - trunc_page(pa)); + return (va); + } else + return (DMAP_BASE_ADDRESS + pa); +} + +void +pmap_early_io_unmap(vm_offset_t va, vm_size_t size) +{ + + KASSERT(!pmap_bootstrapped, ("Not available after PMAP started!")); +} + +/* From p3-53 of the MPC7450 RISC Microprocessor Family Reference Manual */ +void +flush_disable_caches(void) +{ + register_t msr; + register_t msscr0; + register_t cache_reg; + volatile uint32_t *memp; + int i; + int x; + + msr = mfmsr(); + powerpc_sync(); + mtmsr(msr & ~(PSL_EE | PSL_DR)); + msscr0 = mfspr(SPR_MSSCR0); + msscr0 &= ~MSSCR0_L2PFE; + mtspr(SPR_MSSCR0, msscr0); + powerpc_sync(); + isync(); + /* 7e00066c: dssall */ + __asm__ __volatile__(".long 0x7e00066c; sync"); + powerpc_sync(); + isync(); + __asm__ __volatile__("dcbf 0,%0" :: "r"(0)); + __asm__ __volatile__("dcbf 0,%0" :: "r"(0)); + __asm__ __volatile__("dcbf 0,%0" :: "r"(0)); + + /* Lock the L1 Data cache. */ + mtspr(SPR_LDSTCR, mfspr(SPR_LDSTCR) | 0xFF); + powerpc_sync(); + isync(); + + mtspr(SPR_LDSTCR, 0); + + /* + * Perform this in two stages: Flush the cache starting in RAM, then do it + * from ROM. + */ + memp = (volatile uint32_t *)0x00000000; + for (i = 0; i < 128 * 1024; i++) { + (void)*memp; + __asm__ __volatile__("dcbf 0,%0" :: "r"(memp)); + memp += 32/sizeof(*memp); + } + + memp = (volatile uint32_t *)0xfff00000; + x = 0xfe; + + for (; x != 0xff;) { + mtspr(SPR_LDSTCR, x); + for (i = 0; i < 128; i++) { + (void)*memp; + __asm__ __volatile__("dcbf 0,%0" :: "r"(memp)); + memp += 32/sizeof(*memp); + } + x = ((x << 1) | 1) & 0xff; + } + mtspr(SPR_LDSTCR, 0); + + cache_reg = mfspr(SPR_L2CR); + if (cache_reg & L2CR_L2E) { + cache_reg &= ~(L2CR_L2IO_7450 | L2CR_L2DO_7450); + mtspr(SPR_L2CR, cache_reg); + powerpc_sync(); + mtspr(SPR_L2CR, cache_reg | L2CR_L2HWF); + while (mfspr(SPR_L2CR) & L2CR_L2HWF) + ; /* Busy wait for cache to flush */ + powerpc_sync(); + cache_reg &= ~L2CR_L2E; + mtspr(SPR_L2CR, cache_reg); + powerpc_sync(); + mtspr(SPR_L2CR, cache_reg | L2CR_L2I); + powerpc_sync(); + while (mfspr(SPR_L2CR) & L2CR_L2I) + ; /* Busy wait for L2 cache invalidate */ + powerpc_sync(); + } + + cache_reg = mfspr(SPR_L3CR); + if (cache_reg & L3CR_L3E) { + cache_reg &= ~(L3CR_L3IO | L3CR_L3DO); + mtspr(SPR_L3CR, cache_reg); + powerpc_sync(); + mtspr(SPR_L3CR, cache_reg | L3CR_L3HWF); + while (mfspr(SPR_L3CR) & L3CR_L3HWF) + ; /* Busy wait for cache to flush */ + powerpc_sync(); + cache_reg &= ~L3CR_L3E; + mtspr(SPR_L3CR, cache_reg); + powerpc_sync(); + mtspr(SPR_L3CR, cache_reg | L3CR_L3I); + powerpc_sync(); + while (mfspr(SPR_L3CR) & L3CR_L3I) + ; /* Busy wait for L3 cache invalidate */ + powerpc_sync(); + } + + mtspr(SPR_HID0, mfspr(SPR_HID0) & ~HID0_DCE); + powerpc_sync(); + isync(); + + mtmsr(msr); +} + +#ifndef __powerpc64__ +void +mpc745x_sleep(void) +{ + static u_quad_t timebase = 0; + static register_t sprgs[4]; + static register_t srrs[2]; + + jmp_buf resetjb; + struct thread *fputd; + struct thread *vectd; + register_t hid0; + register_t msr; + register_t saved_msr; + + ap_pcpu = pcpup; + + PCPU_SET(restore, &resetjb); + + saved_msr = mfmsr(); + fputd = PCPU_GET(fputhread); + vectd = PCPU_GET(vecthread); + if (fputd != NULL) + save_fpu(fputd); + if (vectd != NULL) + save_vec(vectd); + if (setjmp(resetjb) == 0) { + sprgs[0] = mfspr(SPR_SPRG0); + sprgs[1] = mfspr(SPR_SPRG1); + sprgs[2] = mfspr(SPR_SPRG2); + sprgs[3] = mfspr(SPR_SPRG3); + srrs[0] = 
mfspr(SPR_SRR0); + srrs[1] = mfspr(SPR_SRR1); + timebase = mftb(); + powerpc_sync(); + flush_disable_caches(); + hid0 = mfspr(SPR_HID0); + hid0 = (hid0 & ~(HID0_DOZE | HID0_NAP)) | HID0_SLEEP; + powerpc_sync(); + isync(); + msr = mfmsr() | PSL_POW; + mtspr(SPR_HID0, hid0); + powerpc_sync(); + + while (1) + mtmsr(msr); + } + /* XXX: The mttb() means this *only* works on single-CPU systems. */ + mttb(timebase); + PCPU_SET(curthread, curthread); + PCPU_SET(curpcb, curthread->td_pcb); + pmap_activate(curthread); + powerpc_sync(); + mtspr(SPR_SPRG0, sprgs[0]); + mtspr(SPR_SPRG1, sprgs[1]); + mtspr(SPR_SPRG2, sprgs[2]); + mtspr(SPR_SPRG3, sprgs[3]); + mtspr(SPR_SRR0, srrs[0]); + mtspr(SPR_SRR1, srrs[1]); + mtmsr(saved_msr); + if (fputd == curthread) + enable_fpu(curthread); + if (vectd == curthread) + enable_vec(curthread); + powerpc_sync(); +} +#endif diff --git a/sys/powerpc/aim/locore.S b/sys/powerpc/aim/locore.S new file mode 100644 index 000000000000..085719547fa1 --- /dev/null +++ b/sys/powerpc/aim/locore.S @@ -0,0 +1,15 @@ + +#ifdef __powerpc64__ +#include <powerpc/aim/locore64.S> +#else +#include <powerpc/aim/locore32.S> +#endif + +/* + * XXX: This should be moved to a shared AIM/booke asm file, if one ever is + * created. + */ +ENTRY(get_spr) + mfspr %r3, 0 + blr +END(get_spr) diff --git a/sys/powerpc/aim/locore32.S b/sys/powerpc/aim/locore32.S new file mode 100644 index 000000000000..3fba703794b0 --- /dev/null +++ b/sys/powerpc/aim/locore32.S @@ -0,0 +1,123 @@ + +/*- + * Copyright (C) 2010-2016 Nathan Whitehorn + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "assym.inc" + +#include <sys/syscall.h> + +#include <machine/trap.h> +#include <machine/param.h> +#include <machine/spr.h> +#include <machine/asm.h> +#include <machine/vmparam.h> +#include "opt_platform.h" + +/* Locate the per-CPU data structure */ +#define GET_CPUINFO(r) \ + mfsprg0 r + +/* + * Compiled KERNBASE location and the kernel load address + */ + .globl kernbase + .set kernbase, KERNBASE + +/* + * Globals + */ + .data + .align 3 +GLOBAL(__startkernel) + .long begin +GLOBAL(__endkernel) + .long end + .align 4 +#define TMPSTKSZ 8192 /* 8K temporary stack */ +GLOBAL(tmpstk) + .space TMPSTKSZ + +#ifdef KDB +#define TRAPSTKSZ 4096 /* 4k trap stack */ +GLOBAL(trapstk) + .space TRAPSTKSZ +#endif + + .text + .globl btext +btext: + +/* + * Main kernel entry point. + */ + .text + .globl __start +__start: + /* Figure out where we are */ + bl 1f + .long _DYNAMIC-. + .long _GLOBAL_OFFSET_TABLE_-. + .long tmpstk-. +1: mflr %r30 + + /* Set up temporary stack pointer */ + lwz %r1,8(%r30) + add %r1,%r1,%r30 + addi %r1,%r1,(8+TMPSTKSZ-40) + + /* Relocate self */ + stw %r3,16(%r1) + stw %r4,20(%r1) + stw %r5,24(%r1) + stw %r6,28(%r1) + stw %r7,32(%r1) + + lwz %r3,0(%r30) /* _DYNAMIC in %r3 */ + add %r3,%r3,%r30 + lwz %r4,4(%r30) /* GOT pointer */ + add %r4,%r4,%r30 + lwz %r4,4(%r4) /* got[0] is _DYNAMIC link addr */ + subf %r4,%r4,%r3 /* subtract to calculate relocbase */ + bl elf_reloc_self + + lwz %r3,16(%r1) + lwz %r4,20(%r1) + lwz %r5,24(%r1) + lwz %r6,28(%r1) + lwz %r7,32(%r1) + + /* MD setup */ + bl powerpc_init + + /* Set stack pointer to new value and branch to mi_startup */ + mr %r1, %r3 + li %r3, 0 + stw %r3, 0(%r1) + bl mi_startup + + /* mi_startup() does not return */ + b . + +#include <powerpc/aim/trap_subr32.S> diff --git a/sys/powerpc/aim/locore64.S b/sys/powerpc/aim/locore64.S new file mode 100644 index 000000000000..9e49b605b8b4 --- /dev/null +++ b/sys/powerpc/aim/locore64.S @@ -0,0 +1,287 @@ + +/*- + * Copyright (C) 2010-2016 Nathan Whitehorn + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "assym.inc" + +#include <sys/syscall.h> + +#include <machine/trap.h> +#include <machine/param.h> +#include <machine/spr.h> +#include <machine/asm.h> +#include <machine/vmparam.h> + +#ifdef _CALL_ELF +.abiversion _CALL_ELF +#endif + +/* Glue for linker script */ +.globl kernbase +.set kernbase, KERNBASE + +/* + * Globals + */ + .data + .align 3 +GLOBAL(__startkernel) + .llong begin +GLOBAL(__endkernel) + .llong end +GLOBAL(can_wakeup) + .llong 0x0 + + .align 4 +#define TMPSTKSZ 16384 /* 16K temporary stack */ +GLOBAL(tmpstk) + .space TMPSTKSZ + +TOC_ENTRY(tmpstk) +TOC_ENTRY(can_wakeup) + +#ifdef KDB +#define TRAPSTKSZ 8192 /* 8k trap stack */ +GLOBAL(trapstk) + .space TRAPSTKSZ +TOC_ENTRY(trapstk) +#endif + + +/* + * Entry point for bootloaders that do not fully implement ELF and start + * at the beginning of the image (kexec, notably). In its own section so + * that it ends up before any linker-generated call stubs and actually at + * the beginning of the image. kexec on some systems also enters at + * (start of image) + 0x60, so put a spin loop there. + */ + .section ".text.kboot", "x", @progbits +kbootentry: +#ifdef __LITTLE_ENDIAN__ + RETURN_TO_NATIVE_ENDIAN +#endif + b __start +. = kbootentry + 0x40 /* Magic address used in platform layer */ + .global smp_spin_sem +ap_kexec_spin_sem: + .long -1 +. = kbootentry + 0x60 /* Entry point for kexec APs */ +ap_kexec_start: /* At 0x60 past start, copied to 0x60 by kexec */ + /* r3 set to CPU ID by kexec */ + + /* Invalidate icache for low-memory copy and jump there */ + li %r0,0x80 + dcbst 0,%r0 + sync + icbi 0,%r0 + isync + ba 0x80 /* Absolute branch to next inst */ + +. = kbootentry + 0x80 /* Aligned to cache line */ +1: or 31,31,31 /* yield */ + sync + lwz %r1,0x40(0) /* Spin on ap_kexec_spin_sem */ + cmpw %r1,%r3 /* Until it equals our CPU ID */ + bne 1b + + /* Released */ + or 2,2,2 /* unyield */ + + /* Make sure that it will be software reset. Clear SRR1 */ + li %r1,0 + mtsrr1 %r1 + ba EXC_RST + +/* + * Now start the real text section + */ + + .text + .globl btext +btext: + +/* + * Main kernel entry point. + * + * Calling convention: + * r3: Flattened Device Tree pointer (or zero) + * r4: ignored + * r5: OF client interface pointer (or zero) + * r6: Loader metadata pointer (or zero) + * r7: Magic cookie (0xfb5d104d) to indicate that r6 has loader metadata + */ + .text +_NAKED_ENTRY(__start) + +#ifdef __LITTLE_ENDIAN__ + RETURN_TO_NATIVE_ENDIAN +#endif + /* Set 64-bit mode if not yet set before branching to C */ + mfmsr %r20 + li %r21,1 + insrdi %r20,%r21,1,0 + mtmsrd %r20 + isync + nop /* Make this block a multiple of 8 bytes */ + + /* Set up the TOC pointer */ + b 0f + .align 3 +0: nop + bl 1f + .llong __tocbase + 0x8000 - . +1: mflr %r2 + ld %r1,0(%r2) + add %r2,%r1,%r2 + + /* Get load offset */ + ld %r31,-0x8000(%r2) /* First TOC entry is TOC base */ + subf %r31,%r31,%r2 /* Subtract from real TOC base to get base */ + + /* Set up the stack pointer */ + bl 1f + .llong tmpstk + TMPSTKSZ - 96 - . +1: mflr %r30 + ld %r1,0(%r30) + add %r1,%r1,%r30 + nop + + /* Relocate kernel */ + std %r3,48(%r1) + std %r4,56(%r1) + std %r5,64(%r1) + std %r6,72(%r1) + std %r7,80(%r1) + + bl 1f + .llong _DYNAMIC-. 
+1: mflr %r3 + ld %r4,0(%r3) + add %r3,%r4,%r3 + mr %r4,%r31 + bl elf_reloc_self + nop + ld %r3,48(%r1) + ld %r4,56(%r1) + ld %r5,64(%r1) + ld %r6,72(%r1) + ld %r7,80(%r1) + + /* Begin CPU init */ + mr %r4,%r2 /* Replace ignored r4 with tocbase for trap handlers */ + bl powerpc_init + nop + + /* Set stack pointer to new value and branch to mi_startup */ + mr %r1, %r3 + li %r3, 0 + std %r3, 0(%r1) + bl mi_startup + nop + + /* Unreachable */ + b . +_END(__start) + +ASENTRY_NOPROF(__restartkernel_virtual) + /* + * When coming in via this entry point, we need to alter the SLB to + * shadow the segment register emulation entries in DMAP space. + * We need to do this dance because we are running with virtual-mode + * OpenFirmware and have not yet taken over the MMU. + * + * Assumptions: + * 1) The kernel is currently identity-mapped. + * 2) We are currently executing at an address compatible with + * real mode. + * 3) The first 16 SLB entries are emulating SRs. + * 4) The rest of the SLB is not in use. + * 5) OpenFirmware is not manipulating the SLB at runtime. + * 6) We are running on 64-bit AIM. + * + * Tested on a G5. + */ + mfmsr %r14 + /* Switch to real mode because we are about to mess with the SLB. */ + andi. %r14, %r14, ~(PSL_DR|PSL_IR|PSL_ME|PSL_RI)@l + mtmsr %r14 + isync + /* Prepare variables for later use. */ + li %r14, 0 + li %r18, 0 + oris %r18, %r18, 0xc000 + sldi %r18, %r18, 32 /* r18: 0xc000000000000000 */ +1: + /* + * Loop over the first 16 SLB entries. + * Offset the SLBE into the DMAP, add 16 to the index, and write + * it back to the SLB. + */ + /* XXX add more safety checks */ + slbmfev %r15, %r14 + slbmfee %r16, %r14 + or %r16, %r16, %r14 /* index is 0-15 */ + ori %r16, %r16, 0x10 /* add 16 to index. */ + or %r16, %r16, %r18 /* SLBE DMAP offset */ + rldicr %r17, %r16, 0, 37 /* Invalidation SLBE */ + + isync + slbie %r17 + /* isync */ + slbmte %r15, %r16 + isync + addi %r14, %r14, 1 + cmpdi %r14, 16 + blt 1b + + /* + * Now that we are set up with a temporary direct map, we can + * continue with __restartkernel. Translation will be switched + * back on at the rfid, at which point we will be executing from + * the temporary direct map we just installed, until the kernel + * takes over responsibility for the MMU. + */ + bl __restartkernel + nop +ASEND(__restartkernel_virtual) + +ASENTRY_NOPROF(__restartkernel) + /* + * r3-r7: arguments to go to __start + * r8: offset from current kernel address to apply + * r9: MSR to set when (atomically) jumping to __start + r8 + */ + mtsrr1 %r9 + bl 1f +1: mflr %r25 + add %r25,%r8,%r25 + addi %r25,%r25,2f-1b + mtsrr0 %r25 + rfid +2: bl __start + nop +ASEND(__restartkernel) + +#include <powerpc/aim/trap_subr64.S> diff --git a/sys/powerpc/aim/mmu_oea.c b/sys/powerpc/aim/mmu_oea.c new file mode 100644 index 000000000000..ae17b3289593 --- /dev/null +++ b/sys/powerpc/aim/mmu_oea.c @@ -0,0 +1,2843 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause AND BSD-4-Clause + * + * Copyright (c) 2001 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Matt Thomas <matt@3am-software.com> of Allegro Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/*- + * Copyright (C) 1995, 1996 Wolfgang Solfrank. + * Copyright (C) 1995, 1996 TooLs GmbH. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by TooLs GmbH. + * 4. The name of TooLs GmbH may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $NetBSD: pmap.c,v 1.28 2000/03/26 20:42:36 kleink Exp $ + */ +/*- + * Copyright (C) 2001 Benno Rice. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +/* + * Manages physical address maps. + * + * Since the information managed by this module is also stored by the + * logical address mapping module, this module may throw away valid virtual + * to physical mappings at almost any time. However, invalidations of + * mappings must be done as requested. + * + * In order to cope with hardware architectures which make virtual to + * physical map invalidates expensive, this module may delay invalidate + * reduced protection operations until such time as they are actually + * necessary. This module is given full information as to which processors + * are currently using which maps, and to when physical maps must be made + * correct. + */ + +#include "opt_kstack_pages.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/conf.h> +#include <sys/queue.h> +#include <sys/cpuset.h> +#include <sys/kerneldump.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/mman.h> +#include <sys/msgbuf.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/sched.h> +#include <sys/sysctl.h> +#include <sys/systm.h> +#include <sys/vmmeter.h> + +#include <dev/ofw/openfirm.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/vm_page.h> +#include <vm/vm_phys.h> +#include <vm/vm_pageout.h> +#include <vm/vm_radix.h> +#include <vm/uma.h> + +#include <machine/cpu.h> +#include <machine/platform.h> +#include <machine/bat.h> +#include <machine/frame.h> +#include <machine/md_var.h> +#include <machine/psl.h> +#include <machine/pte.h> +#include <machine/smp.h> +#include <machine/sr.h> +#include <machine/mmuvar.h> +#include <machine/trap.h> + +#define MOEA_DEBUG + +#define TODO panic("%s: not implemented", __func__); + +#define VSID_MAKE(sr, hash) ((sr) | (((hash) & 0xfffff) << 4)) +#define VSID_TO_SR(vsid) ((vsid) & 0xf) +#define VSID_TO_HASH(vsid) (((vsid) >> 4) & 0xfffff) + +/* Get physical address from PVO. */ +#define PVO_PADDR(pvo) ((pvo)->pvo_pte.pte.pte_lo & PTE_RPGN) + +struct ofw_map { + vm_offset_t om_va; + vm_size_t om_len; + vm_offset_t om_pa; + u_int om_mode; +}; + +extern unsigned char _etext[]; +extern unsigned char _end[]; + +/* + * Map of physical memory regions. + */ +static struct mem_region *regions; +static struct mem_region *pregions; +static u_int phys_avail_count; +static int regions_sz, pregions_sz; +static struct ofw_map *translations; + +/* + * Lock for the pteg and pvo tables. + */ +struct mtx moea_table_mutex; +struct mtx moea_vsid_mutex; + +/* tlbie instruction synchronization */ +static struct mtx tlbie_mtx; + +/* + * PTEG data. 
+ */ +static struct pteg *moea_pteg_table; +u_int moea_pteg_count; +u_int moea_pteg_mask; + +/* + * PVO data. + */ +struct pvo_head *moea_pvo_table; /* pvo entries by pteg index */ +struct pvo_head moea_pvo_kunmanaged = + LIST_HEAD_INITIALIZER(moea_pvo_kunmanaged); /* list of unmanaged pages */ + +static struct rwlock_padalign pvh_global_lock; + +uma_zone_t moea_upvo_zone; /* zone for pvo entries for unmanaged pages */ +uma_zone_t moea_mpvo_zone; /* zone for pvo entries for managed pages */ + +#define BPVO_POOL_SIZE 32768 +static struct pvo_entry *moea_bpvo_pool; +static int moea_bpvo_pool_index = 0; + +#define VSID_NBPW (sizeof(u_int32_t) * 8) +static u_int moea_vsid_bitmap[NPMAPS / VSID_NBPW]; + +static bool moea_initialized = false; + +/* + * Statistics. + */ +u_int moea_pte_valid = 0; +u_int moea_pte_overflow = 0; +u_int moea_pte_replacements = 0; +u_int moea_pvo_entries = 0; +u_int moea_pvo_enter_calls = 0; +u_int moea_pvo_remove_calls = 0; +u_int moea_pte_spills = 0; +SYSCTL_INT(_machdep, OID_AUTO, moea_pte_valid, CTLFLAG_RD, &moea_pte_valid, + 0, ""); +SYSCTL_INT(_machdep, OID_AUTO, moea_pte_overflow, CTLFLAG_RD, + &moea_pte_overflow, 0, ""); +SYSCTL_INT(_machdep, OID_AUTO, moea_pte_replacements, CTLFLAG_RD, + &moea_pte_replacements, 0, ""); +SYSCTL_INT(_machdep, OID_AUTO, moea_pvo_entries, CTLFLAG_RD, &moea_pvo_entries, + 0, ""); +SYSCTL_INT(_machdep, OID_AUTO, moea_pvo_enter_calls, CTLFLAG_RD, + &moea_pvo_enter_calls, 0, ""); +SYSCTL_INT(_machdep, OID_AUTO, moea_pvo_remove_calls, CTLFLAG_RD, + &moea_pvo_remove_calls, 0, ""); +SYSCTL_INT(_machdep, OID_AUTO, moea_pte_spills, CTLFLAG_RD, + &moea_pte_spills, 0, ""); + +/* + * Allocate physical memory for use in moea_bootstrap. + */ +static vm_offset_t moea_bootstrap_alloc(vm_size_t, u_int); + +/* + * PTE calls. + */ +static int moea_pte_insert(u_int, struct pte *); + +/* + * PVO calls. + */ +static int moea_pvo_enter(pmap_t, uma_zone_t, struct pvo_head *, + vm_offset_t, vm_paddr_t, u_int, int); +static void moea_pvo_remove(struct pvo_entry *, int); +static struct pvo_entry *moea_pvo_find_va(pmap_t, vm_offset_t, int *); +static struct pte *moea_pvo_to_pte(const struct pvo_entry *, int); + +/* + * Utility routines. 
+ */ +static int moea_enter_locked(pmap_t, vm_offset_t, vm_page_t, + vm_prot_t, u_int, int8_t); +static void moea_syncicache(vm_paddr_t, vm_size_t); +static bool moea_query_bit(vm_page_t, int); +static u_int moea_clear_bit(vm_page_t, int); +static void moea_kremove(vm_offset_t); +int moea_pte_spill(vm_offset_t); + +/* + * Kernel MMU interface + */ +void moea_clear_modify(vm_page_t); +void moea_copy_page(vm_page_t, vm_page_t); +void moea_copy_pages(vm_page_t *ma, vm_offset_t a_offset, + vm_page_t *mb, vm_offset_t b_offset, int xfersize); +int moea_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int, + int8_t); +void moea_enter_object(pmap_t, vm_offset_t, vm_offset_t, vm_page_t, + vm_prot_t); +void moea_enter_quick(pmap_t, vm_offset_t, vm_page_t, vm_prot_t); +vm_paddr_t moea_extract(pmap_t, vm_offset_t); +vm_page_t moea_extract_and_hold(pmap_t, vm_offset_t, vm_prot_t); +void moea_init(void); +bool moea_is_modified(vm_page_t); +bool moea_is_prefaultable(pmap_t, vm_offset_t); +bool moea_is_referenced(vm_page_t); +int moea_ts_referenced(vm_page_t); +vm_offset_t moea_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int); +static int moea_mincore(pmap_t, vm_offset_t, vm_paddr_t *); +bool moea_page_exists_quick(pmap_t, vm_page_t); +void moea_page_init(vm_page_t); +int moea_page_wired_mappings(vm_page_t); +int moea_pinit(pmap_t); +void moea_pinit0(pmap_t); +void moea_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); +void moea_qenter(vm_offset_t, vm_page_t *, int); +void moea_qremove(vm_offset_t, int); +void moea_release(pmap_t); +void moea_remove(pmap_t, vm_offset_t, vm_offset_t); +void moea_remove_all(vm_page_t); +void moea_remove_write(vm_page_t); +void moea_unwire(pmap_t, vm_offset_t, vm_offset_t); +void moea_zero_page(vm_page_t); +void moea_zero_page_area(vm_page_t, int, int); +void moea_activate(struct thread *); +void moea_deactivate(struct thread *); +void moea_cpu_bootstrap(int); +void moea_bootstrap(vm_offset_t, vm_offset_t); +void *moea_mapdev(vm_paddr_t, vm_size_t); +void *moea_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t); +void moea_unmapdev(void *, vm_size_t); +vm_paddr_t moea_kextract(vm_offset_t); +void moea_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t); +void moea_kenter(vm_offset_t, vm_paddr_t); +void moea_page_set_memattr(vm_page_t m, vm_memattr_t ma); +int moea_dev_direct_mapped(vm_paddr_t, vm_size_t); +static void moea_sync_icache(pmap_t, vm_offset_t, vm_size_t); +void moea_dumpsys_map(vm_paddr_t pa, size_t sz, void **va); +void moea_scan_init(void); +vm_offset_t moea_quick_enter_page(vm_page_t m); +void moea_quick_remove_page(vm_offset_t addr); +bool moea_page_is_mapped(vm_page_t m); +bool moea_ps_enabled(pmap_t pmap); +static int moea_map_user_ptr(pmap_t pm, + volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen); +static int moea_decode_kernel_ptr(vm_offset_t addr, + int *is_user, vm_offset_t *decoded_addr); + +static struct pmap_funcs moea_methods = { + .clear_modify = moea_clear_modify, + .copy_page = moea_copy_page, + .copy_pages = moea_copy_pages, + .enter = moea_enter, + .enter_object = moea_enter_object, + .enter_quick = moea_enter_quick, + .extract = moea_extract, + .extract_and_hold = moea_extract_and_hold, + .init = moea_init, + .is_modified = moea_is_modified, + .is_prefaultable = moea_is_prefaultable, + .is_referenced = moea_is_referenced, + .ts_referenced = moea_ts_referenced, + .map = moea_map, + .page_exists_quick = moea_page_exists_quick, + .page_init = moea_page_init, + .page_wired_mappings = moea_page_wired_mappings, + .pinit = 
moea_pinit, + .pinit0 = moea_pinit0, + .protect = moea_protect, + .qenter = moea_qenter, + .qremove = moea_qremove, + .release = moea_release, + .remove = moea_remove, + .remove_all = moea_remove_all, + .mincore = moea_mincore, + .remove_write = moea_remove_write, + .sync_icache = moea_sync_icache, + .unwire = moea_unwire, + .zero_page = moea_zero_page, + .zero_page_area = moea_zero_page_area, + .activate = moea_activate, + .deactivate = moea_deactivate, + .page_set_memattr = moea_page_set_memattr, + .quick_enter_page = moea_quick_enter_page, + .quick_remove_page = moea_quick_remove_page, + .page_is_mapped = moea_page_is_mapped, + .ps_enabled = moea_ps_enabled, + + /* Internal interfaces */ + .bootstrap = moea_bootstrap, + .cpu_bootstrap = moea_cpu_bootstrap, + .mapdev_attr = moea_mapdev_attr, + .mapdev = moea_mapdev, + .unmapdev = moea_unmapdev, + .kextract = moea_kextract, + .kenter = moea_kenter, + .kenter_attr = moea_kenter_attr, + .dev_direct_mapped = moea_dev_direct_mapped, + .dumpsys_pa_init = moea_scan_init, + .dumpsys_map_chunk = moea_dumpsys_map, + .map_user_ptr = moea_map_user_ptr, + .decode_kernel_ptr = moea_decode_kernel_ptr, +}; + +MMU_DEF(oea_mmu, MMU_TYPE_OEA, moea_methods); + +static __inline uint32_t +moea_calc_wimg(vm_paddr_t pa, vm_memattr_t ma) +{ + uint32_t pte_lo; + int i; + + if (ma != VM_MEMATTR_DEFAULT) { + switch (ma) { + case VM_MEMATTR_UNCACHEABLE: + return (PTE_I | PTE_G); + case VM_MEMATTR_CACHEABLE: + return (PTE_M); + case VM_MEMATTR_WRITE_COMBINING: + case VM_MEMATTR_WRITE_BACK: + case VM_MEMATTR_PREFETCHABLE: + return (PTE_I); + case VM_MEMATTR_WRITE_THROUGH: + return (PTE_W | PTE_M); + } + } + + /* + * Assume the page is cache inhibited and access is guarded unless + * it's in our available memory array. + */ + pte_lo = PTE_I | PTE_G; + for (i = 0; i < pregions_sz; i++) { + if ((pa >= pregions[i].mr_start) && + (pa < (pregions[i].mr_start + pregions[i].mr_size))) { + pte_lo = PTE_M; + break; + } + } + + return pte_lo; +} + +/* + * Translate OFW translations into VM attributes. + */ +static __inline vm_memattr_t +moea_bootstrap_convert_wimg(uint32_t mode) +{ + + switch (mode) { + case (PTE_I | PTE_G): + /* PCI device memory */ + return VM_MEMATTR_UNCACHEABLE; + case (PTE_M): + /* Explicitly coherent */ + return VM_MEMATTR_CACHEABLE; + case 0: /* Default claim */ + case 2: /* Alternate PP bits set by OF for the original payload */ + /* "Normal" memory. */ + return VM_MEMATTR_DEFAULT; + + default: + /* Err on the side of caution for unknowns */ + /* XXX should we panic instead? 
*/ + return VM_MEMATTR_UNCACHEABLE; + } +} + +static void +tlbie(vm_offset_t va) +{ + + mtx_lock_spin(&tlbie_mtx); + __asm __volatile("ptesync"); + __asm __volatile("tlbie %0" :: "r"(va)); + __asm __volatile("eieio; tlbsync; ptesync"); + mtx_unlock_spin(&tlbie_mtx); +} + +static void +tlbia(void) +{ + vm_offset_t va; + + for (va = 0; va < 0x00040000; va += 0x00001000) { + __asm __volatile("tlbie %0" :: "r"(va)); + powerpc_sync(); + } + __asm __volatile("tlbsync"); + powerpc_sync(); +} + +static __inline int +va_to_sr(u_int *sr, vm_offset_t va) +{ + return (sr[(uintptr_t)va >> ADDR_SR_SHFT]); +} + +static __inline u_int +va_to_pteg(u_int sr, vm_offset_t addr) +{ + u_int hash; + + hash = (sr & SR_VSID_MASK) ^ (((u_int)addr & ADDR_PIDX) >> + ADDR_PIDX_SHFT); + return (hash & moea_pteg_mask); +} + +static __inline struct pvo_head * +vm_page_to_pvoh(vm_page_t m) +{ + + return (&m->md.mdpg_pvoh); +} + +static __inline void +moea_attr_clear(vm_page_t m, int ptebit) +{ + + rw_assert(&pvh_global_lock, RA_WLOCKED); + m->md.mdpg_attrs &= ~ptebit; +} + +static __inline int +moea_attr_fetch(vm_page_t m) +{ + + return (m->md.mdpg_attrs); +} + +static __inline void +moea_attr_save(vm_page_t m, int ptebit) +{ + + rw_assert(&pvh_global_lock, RA_WLOCKED); + m->md.mdpg_attrs |= ptebit; +} + +static __inline int +moea_pte_compare(const struct pte *pt, const struct pte *pvo_pt) +{ + if (pt->pte_hi == pvo_pt->pte_hi) + return (1); + + return (0); +} + +static __inline int +moea_pte_match(struct pte *pt, u_int sr, vm_offset_t va, int which) +{ + return (pt->pte_hi & ~PTE_VALID) == + (((sr & SR_VSID_MASK) << PTE_VSID_SHFT) | + ((va >> ADDR_API_SHFT) & PTE_API) | which); +} + +static __inline void +moea_pte_create(struct pte *pt, u_int sr, vm_offset_t va, u_int pte_lo) +{ + + mtx_assert(&moea_table_mutex, MA_OWNED); + + /* + * Construct a PTE. Default to IMB initially. Valid bit only gets + * set when the real pte is set in memory. + * + * Note: Don't set the valid bit for correct operation of tlb update. + */ + pt->pte_hi = ((sr & SR_VSID_MASK) << PTE_VSID_SHFT) | + (((va & ADDR_PIDX) >> ADDR_API_SHFT) & PTE_API); + pt->pte_lo = pte_lo; +} + +static __inline void +moea_pte_synch(struct pte *pt, struct pte *pvo_pt) +{ + + mtx_assert(&moea_table_mutex, MA_OWNED); + pvo_pt->pte_lo |= pt->pte_lo & (PTE_REF | PTE_CHG); +} + +static __inline void +moea_pte_clear(struct pte *pt, vm_offset_t va, int ptebit) +{ + + mtx_assert(&moea_table_mutex, MA_OWNED); + + /* + * As shown in Section 7.6.3.2.3 + */ + pt->pte_lo &= ~ptebit; + tlbie(va); +} + +static __inline void +moea_pte_set(struct pte *pt, struct pte *pvo_pt) +{ + + mtx_assert(&moea_table_mutex, MA_OWNED); + pvo_pt->pte_hi |= PTE_VALID; + + /* + * Update the PTE as defined in section 7.6.3.1. + * Note that the REF/CHG bits are from pvo_pt and thus should have + * been saved so this routine can restore them (if desired). + */ + pt->pte_lo = pvo_pt->pte_lo; + powerpc_sync(); + pt->pte_hi = pvo_pt->pte_hi; + powerpc_sync(); + moea_pte_valid++; +} + +static __inline void +moea_pte_unset(struct pte *pt, struct pte *pvo_pt, vm_offset_t va) +{ + + mtx_assert(&moea_table_mutex, MA_OWNED); + pvo_pt->pte_hi &= ~PTE_VALID; + + /* + * Force the reg & chg bits back into the PTEs. + */ + powerpc_sync(); + + /* + * Invalidate the pte. + */ + pt->pte_hi &= ~PTE_VALID; + + tlbie(va); + + /* + * Save the reg & chg bits. 
+ */ + moea_pte_synch(pt, pvo_pt); + moea_pte_valid--; +} + +static __inline void +moea_pte_change(struct pte *pt, struct pte *pvo_pt, vm_offset_t va) +{ + + /* + * Invalidate the PTE + */ + moea_pte_unset(pt, pvo_pt, va); + moea_pte_set(pt, pvo_pt); +} + +/* + * Quick sort callout for comparing memory regions. + */ +static int om_cmp(const void *a, const void *b); + +static int +om_cmp(const void *a, const void *b) +{ + const struct ofw_map *mapa; + const struct ofw_map *mapb; + + mapa = a; + mapb = b; + if (mapa->om_pa < mapb->om_pa) + return (-1); + else if (mapa->om_pa > mapb->om_pa) + return (1); + else + return (0); +} + +void +moea_cpu_bootstrap(int ap) +{ + u_int sdr; + int i; + + if (ap) { + powerpc_sync(); + __asm __volatile("mtdbatu 0,%0" :: "r"(battable[0].batu)); + __asm __volatile("mtdbatl 0,%0" :: "r"(battable[0].batl)); + isync(); + __asm __volatile("mtibatu 0,%0" :: "r"(battable[0].batu)); + __asm __volatile("mtibatl 0,%0" :: "r"(battable[0].batl)); + isync(); + } + + __asm __volatile("mtdbatu 1,%0" :: "r"(battable[8].batu)); + __asm __volatile("mtdbatl 1,%0" :: "r"(battable[8].batl)); + isync(); + + __asm __volatile("mtibatu 1,%0" :: "r"(0)); + __asm __volatile("mtdbatu 2,%0" :: "r"(0)); + __asm __volatile("mtibatu 2,%0" :: "r"(0)); + __asm __volatile("mtdbatu 3,%0" :: "r"(0)); + __asm __volatile("mtibatu 3,%0" :: "r"(0)); + isync(); + + for (i = 0; i < 16; i++) + mtsrin(i << ADDR_SR_SHFT, kernel_pmap->pm_sr[i]); + powerpc_sync(); + + sdr = (u_int)moea_pteg_table | (moea_pteg_mask >> 10); + __asm __volatile("mtsdr1 %0" :: "r"(sdr)); + isync(); + + tlbia(); +} + +void +moea_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend) +{ + ihandle_t mmui; + phandle_t chosen, mmu; + int sz; + int i, j; + vm_size_t size, physsz, hwphyssz; + vm_offset_t pa, va, off; + void *dpcpu; + + /* + * Map PCI memory space. + */ + battable[0x8].batl = BATL(0x80000000, BAT_I|BAT_G, BAT_PP_RW); + battable[0x8].batu = BATU(0x80000000, BAT_BL_256M, BAT_Vs); + + battable[0x9].batl = BATL(0x90000000, BAT_I|BAT_G, BAT_PP_RW); + battable[0x9].batu = BATU(0x90000000, BAT_BL_256M, BAT_Vs); + + battable[0xa].batl = BATL(0xa0000000, BAT_I|BAT_G, BAT_PP_RW); + battable[0xa].batu = BATU(0xa0000000, BAT_BL_256M, BAT_Vs); + + battable[0xb].batl = BATL(0xb0000000, BAT_I|BAT_G, BAT_PP_RW); + battable[0xb].batu = BATU(0xb0000000, BAT_BL_256M, BAT_Vs); + + powerpc_sync(); + + /* map pci space */ + __asm __volatile("mtdbatu 1,%0" :: "r"(battable[8].batu)); + __asm __volatile("mtdbatl 1,%0" :: "r"(battable[8].batl)); + isync(); + + /* set global direct map flag */ + hw_direct_map = 1; + + mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); + CTR0(KTR_PMAP, "moea_bootstrap: physical memory"); + + for (i = 0; i < pregions_sz; i++) { + vm_offset_t pa; + vm_offset_t end; + + CTR3(KTR_PMAP, "physregion: %#x - %#x (%#x)", + pregions[i].mr_start, + pregions[i].mr_start + pregions[i].mr_size, + pregions[i].mr_size); + /* + * Install entries into the BAT table to allow all + * of physmem to be convered by on-demand BAT entries. + * The loop will sometimes set the same battable element + * twice, but that's fine since they won't be used for + * a while yet. 
+ */ + pa = pregions[i].mr_start & 0xf0000000; + end = pregions[i].mr_start + pregions[i].mr_size; + do { + u_int n = pa >> ADDR_SR_SHFT; + + battable[n].batl = BATL(pa, BAT_M, BAT_PP_RW); + battable[n].batu = BATU(pa, BAT_BL_256M, BAT_Vs); + pa += SEGMENT_LENGTH; + } while (pa < end); + } + + if (PHYS_AVAIL_ENTRIES < regions_sz) + panic("moea_bootstrap: phys_avail too small"); + + phys_avail_count = 0; + physsz = 0; + hwphyssz = 0; + TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz); + for (i = 0, j = 0; i < regions_sz; i++, j += 2) { + CTR3(KTR_PMAP, "region: %#x - %#x (%#x)", regions[i].mr_start, + regions[i].mr_start + regions[i].mr_size, + regions[i].mr_size); + if (hwphyssz != 0 && + (physsz + regions[i].mr_size) >= hwphyssz) { + if (physsz < hwphyssz) { + phys_avail[j] = regions[i].mr_start; + phys_avail[j + 1] = regions[i].mr_start + + hwphyssz - physsz; + physsz = hwphyssz; + phys_avail_count++; + } + break; + } + phys_avail[j] = regions[i].mr_start; + phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size; + phys_avail_count++; + physsz += regions[i].mr_size; + } + + /* Check for overlap with the kernel and exception vectors */ + for (j = 0; j < 2*phys_avail_count; j+=2) { + if (phys_avail[j] < EXC_LAST) + phys_avail[j] += EXC_LAST; + + if (kernelstart >= phys_avail[j] && + kernelstart < phys_avail[j+1]) { + if (kernelend < phys_avail[j+1]) { + phys_avail[2*phys_avail_count] = + (kernelend & ~PAGE_MASK) + PAGE_SIZE; + phys_avail[2*phys_avail_count + 1] = + phys_avail[j+1]; + phys_avail_count++; + } + + phys_avail[j+1] = kernelstart & ~PAGE_MASK; + } + + if (kernelend >= phys_avail[j] && + kernelend < phys_avail[j+1]) { + if (kernelstart > phys_avail[j]) { + phys_avail[2*phys_avail_count] = phys_avail[j]; + phys_avail[2*phys_avail_count + 1] = + kernelstart & ~PAGE_MASK; + phys_avail_count++; + } + + phys_avail[j] = (kernelend & ~PAGE_MASK) + PAGE_SIZE; + } + } + + physmem = btoc(physsz); + + /* + * Allocate PTEG table. + */ +#ifdef PTEGCOUNT + moea_pteg_count = PTEGCOUNT; +#else + moea_pteg_count = 0x1000; + + while (moea_pteg_count < physmem) + moea_pteg_count <<= 1; + + moea_pteg_count >>= 1; +#endif /* PTEGCOUNT */ + + size = moea_pteg_count * sizeof(struct pteg); + CTR2(KTR_PMAP, "moea_bootstrap: %d PTEGs, %d bytes", moea_pteg_count, + size); + moea_pteg_table = (struct pteg *)moea_bootstrap_alloc(size, size); + CTR1(KTR_PMAP, "moea_bootstrap: PTEG table at %p", moea_pteg_table); + bzero((void *)moea_pteg_table, moea_pteg_count * sizeof(struct pteg)); + moea_pteg_mask = moea_pteg_count - 1; + + /* + * Allocate pv/overflow lists. + */ + size = sizeof(struct pvo_head) * moea_pteg_count; + moea_pvo_table = (struct pvo_head *)moea_bootstrap_alloc(size, + PAGE_SIZE); + CTR1(KTR_PMAP, "moea_bootstrap: PVO table at %p", moea_pvo_table); + for (i = 0; i < moea_pteg_count; i++) + LIST_INIT(&moea_pvo_table[i]); + + /* + * Initialize the lock that synchronizes access to the pteg and pvo + * tables. + */ + mtx_init(&moea_table_mutex, "pmap table", NULL, MTX_DEF | + MTX_RECURSE); + mtx_init(&moea_vsid_mutex, "VSID table", NULL, MTX_DEF); + + mtx_init(&tlbie_mtx, "tlbie", NULL, MTX_SPIN); + + /* + * Initialise the unmanaged pvo pool. + */ + moea_bpvo_pool = (struct pvo_entry *)moea_bootstrap_alloc( + BPVO_POOL_SIZE*sizeof(struct pvo_entry), 0); + moea_bpvo_pool_index = 0; + + /* + * Make sure kernel vsid is allocated as well as VSID 0. 
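+ * The bitmap tracks VSID groups one word (VSID_NBPW bits) at a time,
+ * so the kernel VSID lands in bit (KERNEL_VSIDBITS % VSID_NBPW) of
+ * word (KERNEL_VSIDBITS & (NPMAPS - 1)) / VSID_NBPW, exactly as
+ * written below.  Marking group 0 used as well keeps the collision
+ * path in moea_pinit() from ever handing it out.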
+ */ + moea_vsid_bitmap[(KERNEL_VSIDBITS & (NPMAPS - 1)) / VSID_NBPW] + |= 1 << (KERNEL_VSIDBITS % VSID_NBPW); + moea_vsid_bitmap[0] |= 1; + + /* + * Initialize the kernel pmap (which is statically allocated). + */ + PMAP_LOCK_INIT(kernel_pmap); + for (i = 0; i < 16; i++) + kernel_pmap->pm_sr[i] = EMPTY_SEGMENT + i; + CPU_FILL(&kernel_pmap->pm_active); + RB_INIT(&kernel_pmap->pmap_pvo); + + /* + * Initialize the global pv list lock. + */ + rw_init(&pvh_global_lock, "pmap pv global"); + + /* + * Set up the Open Firmware mappings + */ + chosen = OF_finddevice("/chosen"); + if (chosen != -1 && OF_getprop(chosen, "mmu", &mmui, 4) != -1 && + (mmu = OF_instance_to_package(mmui)) != -1 && + (sz = OF_getproplen(mmu, "translations")) != -1) { + translations = NULL; + for (i = 0; phys_avail[i] != 0; i += 2) { + if (phys_avail[i + 1] >= sz) { + translations = (struct ofw_map *)phys_avail[i]; + break; + } + } + if (translations == NULL) + panic("moea_bootstrap: no space to copy translations"); + bzero(translations, sz); + if (OF_getprop(mmu, "translations", translations, sz) == -1) + panic("moea_bootstrap: can't get ofw translations"); + CTR0(KTR_PMAP, "moea_bootstrap: translations"); + sz /= sizeof(*translations); + qsort(translations, sz, sizeof (*translations), om_cmp); + for (i = 0; i < sz; i++) { + CTR3(KTR_PMAP, "translation: pa=%#x va=%#x len=%#x", + translations[i].om_pa, translations[i].om_va, + translations[i].om_len); + + /* + * If the mapping is 1:1, let the RAM and device + * on-demand BAT tables take care of the translation. + * + * However, always enter mappings for segment 16, + * which is mixed-protection and therefore not + * compatible with a BAT entry. + */ + if ((translations[i].om_va >> ADDR_SR_SHFT) != 0xf && + translations[i].om_va == translations[i].om_pa) + continue; + + /* Enter the pages */ + for (off = 0; off < translations[i].om_len; + off += PAGE_SIZE) + moea_kenter_attr(translations[i].om_va + off, + translations[i].om_pa + off, + moea_bootstrap_convert_wimg(translations[i].om_mode)); + } + } + + /* + * Calculate the last available physical address. + */ + for (i = 0; phys_avail[i + 2] != 0; i += 2) + ; + Maxmem = powerpc_btop(phys_avail[i + 1]); + + moea_cpu_bootstrap(0); + mtmsr(mfmsr() | PSL_DR | PSL_IR); + pmap_bootstrapped++; + + /* + * Set the start and end of kva. + */ + virtual_avail = VM_MIN_KERNEL_ADDRESS; + virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS; + + /* + * Allocate a kernel stack with a guard page for thread0 and map it + * into the kernel page map. + */ + pa = moea_bootstrap_alloc(kstack_pages * PAGE_SIZE, PAGE_SIZE); + va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE; + virtual_avail = va + kstack_pages * PAGE_SIZE; + CTR2(KTR_PMAP, "moea_bootstrap: kstack0 at %#x (%#x)", pa, va); + thread0.td_kstack = va; + thread0.td_kstack_pages = kstack_pages; + for (i = 0; i < kstack_pages; i++) { + moea_kenter(va, pa); + pa += PAGE_SIZE; + va += PAGE_SIZE; + } + + /* + * Allocate virtual address space for the message buffer. + */ + pa = msgbuf_phys = moea_bootstrap_alloc(msgbufsize, PAGE_SIZE); + msgbufp = (struct msgbuf *)virtual_avail; + va = virtual_avail; + virtual_avail += round_page(msgbufsize); + while (va < virtual_avail) { + moea_kenter(va, pa); + pa += PAGE_SIZE; + va += PAGE_SIZE; + } + + /* + * Allocate virtual address space for the dynamic percpu area. 
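+ * This mirrors the message buffer setup above: carve DPCPU_SIZE out
+ * of virtual_avail, wire each page with moea_kenter(), and hand the
+ * region to dpcpu_init().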
+ */ + pa = moea_bootstrap_alloc(DPCPU_SIZE, PAGE_SIZE); + dpcpu = (void *)virtual_avail; + va = virtual_avail; + virtual_avail += DPCPU_SIZE; + while (va < virtual_avail) { + moea_kenter(va, pa); + pa += PAGE_SIZE; + va += PAGE_SIZE; + } + dpcpu_init(dpcpu, 0); +} + +/* + * Activate a user pmap. The pmap must be activated before it's address + * space can be accessed in any way. + */ +void +moea_activate(struct thread *td) +{ + pmap_t pm, pmr; + + /* + * Load all the data we need up front to encourage the compiler to + * not issue any loads while we have interrupts disabled below. + */ + pm = &td->td_proc->p_vmspace->vm_pmap; + pmr = pm->pmap_phys; + + CPU_SET(PCPU_GET(cpuid), &pm->pm_active); + PCPU_SET(curpmap, pmr); + + mtsrin(USER_SR << ADDR_SR_SHFT, td->td_pcb->pcb_cpu.aim.usr_vsid); +} + +void +moea_deactivate(struct thread *td) +{ + pmap_t pm; + + pm = &td->td_proc->p_vmspace->vm_pmap; + CPU_CLR(PCPU_GET(cpuid), &pm->pm_active); + PCPU_SET(curpmap, NULL); +} + +void +moea_unwire(pmap_t pm, vm_offset_t sva, vm_offset_t eva) +{ + struct pvo_entry key, *pvo; + + PMAP_LOCK(pm); + key.pvo_vaddr = sva; + for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); + pvo != NULL && PVO_VADDR(pvo) < eva; + pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { + if ((pvo->pvo_vaddr & PVO_WIRED) == 0) + panic("moea_unwire: pvo %p is missing PVO_WIRED", pvo); + pvo->pvo_vaddr &= ~PVO_WIRED; + pm->pm_stats.wired_count--; + } + PMAP_UNLOCK(pm); +} + +void +moea_copy_page(vm_page_t msrc, vm_page_t mdst) +{ + vm_offset_t dst; + vm_offset_t src; + + dst = VM_PAGE_TO_PHYS(mdst); + src = VM_PAGE_TO_PHYS(msrc); + + bcopy((void *)src, (void *)dst, PAGE_SIZE); +} + +void +moea_copy_pages(vm_page_t *ma, vm_offset_t a_offset, + vm_page_t *mb, vm_offset_t b_offset, int xfersize) +{ + void *a_cp, *b_cp; + vm_offset_t a_pg_offset, b_pg_offset; + int cnt; + + while (xfersize > 0) { + a_pg_offset = a_offset & PAGE_MASK; + cnt = min(xfersize, PAGE_SIZE - a_pg_offset); + a_cp = (char *)VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT]) + + a_pg_offset; + b_pg_offset = b_offset & PAGE_MASK; + cnt = min(cnt, PAGE_SIZE - b_pg_offset); + b_cp = (char *)VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT]) + + b_pg_offset; + bcopy(a_cp, b_cp, cnt); + a_offset += cnt; + b_offset += cnt; + xfersize -= cnt; + } +} + +/* + * Zero a page of physical memory by temporarily mapping it into the tlb. + */ +void +moea_zero_page(vm_page_t m) +{ + vm_offset_t off, pa = VM_PAGE_TO_PHYS(m); + + for (off = 0; off < PAGE_SIZE; off += cacheline_size) + __asm __volatile("dcbz 0,%0" :: "r"(pa + off)); +} + +void +moea_zero_page_area(vm_page_t m, int off, int size) +{ + vm_offset_t pa = VM_PAGE_TO_PHYS(m); + void *va = (void *)(pa + off); + + bzero(va, size); +} + +vm_offset_t +moea_quick_enter_page(vm_page_t m) +{ + + return (VM_PAGE_TO_PHYS(m)); +} + +void +moea_quick_remove_page(vm_offset_t addr) +{ +} + +bool +moea_page_is_mapped(vm_page_t m) +{ + return (!LIST_EMPTY(&(m)->md.mdpg_pvoh)); +} + +bool +moea_ps_enabled(pmap_t pmap __unused) +{ + return (false); +} + +/* + * Map the given physical page at the specified virtual address in the + * target pmap with the protection requested. If specified the page + * will be wired down. 
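+ *
+ * The return value is a KERN_* code: an internal ENOMEM from
+ * moea_enter_locked() becomes KERN_RESOURCE_SHORTAGE when
+ * PMAP_ENTER_NOSLEEP is set; otherwise the caller sleeps in vm_wait()
+ * and retries until the PVO allocation succeeds.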
+ */ +int +moea_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, + u_int flags, int8_t psind) +{ + int error; + + for (;;) { + rw_wlock(&pvh_global_lock); + PMAP_LOCK(pmap); + error = moea_enter_locked(pmap, va, m, prot, flags, psind); + rw_wunlock(&pvh_global_lock); + PMAP_UNLOCK(pmap); + if (error != ENOMEM) + return (KERN_SUCCESS); + if ((flags & PMAP_ENTER_NOSLEEP) != 0) + return (KERN_RESOURCE_SHORTAGE); + VM_OBJECT_ASSERT_UNLOCKED(m->object); + vm_wait(NULL); + } +} + +/* + * Map the given physical page at the specified virtual address in the + * target pmap with the protection requested. If specified the page + * will be wired down. + * + * The global pvh and pmap must be locked. + */ +static int +moea_enter_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, + u_int flags, int8_t psind __unused) +{ + struct pvo_head *pvo_head; + uma_zone_t zone; + u_int pte_lo, pvo_flags; + int error; + + if (pmap_bootstrapped) + rw_assert(&pvh_global_lock, RA_WLOCKED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + if ((m->oflags & VPO_UNMANAGED) == 0) { + if ((flags & PMAP_ENTER_QUICK_LOCKED) == 0) + VM_PAGE_OBJECT_BUSY_ASSERT(m); + else + VM_OBJECT_ASSERT_LOCKED(m->object); + } + + if ((m->oflags & VPO_UNMANAGED) != 0 || !moea_initialized) { + pvo_head = &moea_pvo_kunmanaged; + zone = moea_upvo_zone; + pvo_flags = 0; + } else { + pvo_head = vm_page_to_pvoh(m); + zone = moea_mpvo_zone; + pvo_flags = PVO_MANAGED; + } + + pte_lo = moea_calc_wimg(VM_PAGE_TO_PHYS(m), pmap_page_get_memattr(m)); + + if (prot & VM_PROT_WRITE) { + pte_lo |= PTE_BW; + if (pmap_bootstrapped && + (m->oflags & VPO_UNMANAGED) == 0) + vm_page_aflag_set(m, PGA_WRITEABLE); + } else + pte_lo |= PTE_BR; + + if ((flags & PMAP_ENTER_WIRED) != 0) + pvo_flags |= PVO_WIRED; + + error = moea_pvo_enter(pmap, zone, pvo_head, va, VM_PAGE_TO_PHYS(m), + pte_lo, pvo_flags); + + /* + * Flush the real page from the instruction cache. This has be done + * for all user mappings to prevent information leakage via the + * instruction cache. moea_pvo_enter() returns ENOENT for the first + * mapping for a page. + */ + if (pmap != kernel_pmap && error == ENOENT && + (pte_lo & (PTE_I | PTE_G)) == 0) + moea_syncicache(VM_PAGE_TO_PHYS(m), PAGE_SIZE); + + return (error); +} + +/* + * Maps a sequence of resident pages belonging to the same object. + * The sequence begins with the given page m_start. This page is + * mapped at the given virtual address start. Each subsequent page is + * mapped at a virtual address that is offset from start by the same + * amount as the page is offset from m_start within the object. The + * last page in the sequence is the page with the largest offset from + * m_start that can be mapped at a virtual address less than the given + * virtual address end. Not every virtual page between start and end + * is mapped; only those for which a resident page exists with the + * corresponding offset from m_start are mapped. 
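+ * In other words, a resident page whose pindex is N pages past that
+ * of m_start is entered at start + ptoa(N), and holes in the object
+ * are simply skipped by the radix iterator.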
+ */ +void +moea_enter_object(pmap_t pm, vm_offset_t start, vm_offset_t end, + vm_page_t m_start, vm_prot_t prot) +{ + struct pctrie_iter pages; + vm_offset_t va; + vm_page_t m; + + VM_OBJECT_ASSERT_LOCKED(m_start->object); + + vm_page_iter_limit_init(&pages, m_start->object, + m_start->pindex + atop(end - start)); + m = vm_radix_iter_lookup(&pages, m_start->pindex); + rw_wlock(&pvh_global_lock); + PMAP_LOCK(pm); + while (m != NULL) { + va = start + ptoa(m->pindex - m_start->pindex); + moea_enter_locked(pm, va, m, prot & + (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_QUICK_LOCKED, + 0); + m = vm_radix_iter_step(&pages); + } + rw_wunlock(&pvh_global_lock); + PMAP_UNLOCK(pm); +} + +void +moea_enter_quick(pmap_t pm, vm_offset_t va, vm_page_t m, + vm_prot_t prot) +{ + + rw_wlock(&pvh_global_lock); + PMAP_LOCK(pm); + moea_enter_locked(pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), + PMAP_ENTER_QUICK_LOCKED, 0); + rw_wunlock(&pvh_global_lock); + PMAP_UNLOCK(pm); +} + +vm_paddr_t +moea_extract(pmap_t pm, vm_offset_t va) +{ + struct pvo_entry *pvo; + vm_paddr_t pa; + + PMAP_LOCK(pm); + pvo = moea_pvo_find_va(pm, va & ~ADDR_POFF, NULL); + if (pvo == NULL) + pa = 0; + else + pa = PVO_PADDR(pvo) | (va & ADDR_POFF); + PMAP_UNLOCK(pm); + return (pa); +} + +/* + * Atomically extract and hold the physical page with the given + * pmap and virtual address pair if that mapping permits the given + * protection. + */ +vm_page_t +moea_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) +{ + struct pvo_entry *pvo; + vm_page_t m; + + m = NULL; + PMAP_LOCK(pmap); + pvo = moea_pvo_find_va(pmap, va & ~ADDR_POFF, NULL); + if (pvo != NULL && (pvo->pvo_pte.pte.pte_hi & PTE_VALID) && + ((pvo->pvo_pte.pte.pte_lo & PTE_PP) == PTE_RW || + (prot & VM_PROT_WRITE) == 0)) { + m = PHYS_TO_VM_PAGE(PVO_PADDR(pvo)); + if (!vm_page_wire_mapped(m)) + m = NULL; + } + PMAP_UNLOCK(pmap); + return (m); +} + +void +moea_init(void) +{ + + moea_upvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, + UMA_ZONE_VM | UMA_ZONE_NOFREE); + moea_mpvo_zone = uma_zcreate("MPVO entry", sizeof(struct pvo_entry), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, + UMA_ZONE_VM | UMA_ZONE_NOFREE); + moea_initialized = true; +} + +bool +moea_is_referenced(vm_page_t m) +{ + bool rv; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("moea_is_referenced: page %p is not managed", m)); + rw_wlock(&pvh_global_lock); + rv = moea_query_bit(m, PTE_REF); + rw_wunlock(&pvh_global_lock); + return (rv); +} + +bool +moea_is_modified(vm_page_t m) +{ + bool rv; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("moea_is_modified: page %p is not managed", m)); + + /* + * If the page is not busied then this check is racy. 
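+ * Without the busy lock a new writable mapping could appear and set
+ * PTE_CHG at any time, so the PGA_WRITEABLE short-circuit below is
+ * only an approximation; a busied page with no writeable mappings,
+ * by contrast, cannot pick up a change bit behind our back.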
+ */ + if (!pmap_page_is_write_mapped(m)) + return (false); + + rw_wlock(&pvh_global_lock); + rv = moea_query_bit(m, PTE_CHG); + rw_wunlock(&pvh_global_lock); + return (rv); +} + +bool +moea_is_prefaultable(pmap_t pmap, vm_offset_t va) +{ + struct pvo_entry *pvo; + bool rv; + + PMAP_LOCK(pmap); + pvo = moea_pvo_find_va(pmap, va & ~ADDR_POFF, NULL); + rv = pvo == NULL || (pvo->pvo_pte.pte.pte_hi & PTE_VALID) == 0; + PMAP_UNLOCK(pmap); + return (rv); +} + +void +moea_clear_modify(vm_page_t m) +{ + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("moea_clear_modify: page %p is not managed", m)); + vm_page_assert_busied(m); + + if (!pmap_page_is_write_mapped(m)) + return; + rw_wlock(&pvh_global_lock); + moea_clear_bit(m, PTE_CHG); + rw_wunlock(&pvh_global_lock); +} + +/* + * Clear the write and modified bits in each of the given page's mappings. + */ +void +moea_remove_write(vm_page_t m) +{ + struct pvo_entry *pvo; + struct pte *pt; + pmap_t pmap; + u_int lo; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("moea_remove_write: page %p is not managed", m)); + vm_page_assert_busied(m); + + if (!pmap_page_is_write_mapped(m)) + return; + rw_wlock(&pvh_global_lock); + lo = moea_attr_fetch(m); + powerpc_sync(); + LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { + pmap = pvo->pvo_pmap; + PMAP_LOCK(pmap); + if ((pvo->pvo_pte.pte.pte_lo & PTE_PP) != PTE_BR) { + pt = moea_pvo_to_pte(pvo, -1); + pvo->pvo_pte.pte.pte_lo &= ~PTE_PP; + pvo->pvo_pte.pte.pte_lo |= PTE_BR; + if (pt != NULL) { + moea_pte_synch(pt, &pvo->pvo_pte.pte); + lo |= pvo->pvo_pte.pte.pte_lo; + pvo->pvo_pte.pte.pte_lo &= ~PTE_CHG; + moea_pte_change(pt, &pvo->pvo_pte.pte, + pvo->pvo_vaddr); + mtx_unlock(&moea_table_mutex); + } + } + PMAP_UNLOCK(pmap); + } + if ((lo & PTE_CHG) != 0) { + moea_attr_clear(m, PTE_CHG); + vm_page_dirty(m); + } + vm_page_aflag_clear(m, PGA_WRITEABLE); + rw_wunlock(&pvh_global_lock); +} + +/* + * moea_ts_referenced: + * + * Return a count of reference bits for a page, clearing those bits. + * It is not necessary for every reference bit to be cleared, but it + * is necessary that 0 only be returned when there are truly no + * reference bits set. + * + * XXX: The exact number of bits to check and clear is a matter that + * should be tested and standardized at some point in the future for + * optimal aging of shared pages. + */ +int +moea_ts_referenced(vm_page_t m) +{ + int count; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("moea_ts_referenced: page %p is not managed", m)); + rw_wlock(&pvh_global_lock); + count = moea_clear_bit(m, PTE_REF); + rw_wunlock(&pvh_global_lock); + return (count); +} + +/* + * Modify the WIMG settings of all mappings for a page. 
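+ * (W, I, M and G are the PowerPC storage attributes: write-through,
+ * caching-inhibited, memory coherence required and guarded.)  Each
+ * mapping's cached PTE is rewritten with the new bits and, if it is
+ * currently resident in the page table, moea_pte_change() updates the
+ * hardware entry as well.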
+ */ +void +moea_page_set_memattr(vm_page_t m, vm_memattr_t ma) +{ + struct pvo_entry *pvo; + struct pvo_head *pvo_head; + struct pte *pt; + pmap_t pmap; + u_int lo; + + if (m->md.mdpg_cache_attrs == ma) + return; + + if ((m->oflags & VPO_UNMANAGED) != 0) { + m->md.mdpg_cache_attrs = ma; + return; + } + + rw_wlock(&pvh_global_lock); + pvo_head = vm_page_to_pvoh(m); + lo = moea_calc_wimg(VM_PAGE_TO_PHYS(m), ma); + + LIST_FOREACH(pvo, pvo_head, pvo_vlink) { + pmap = pvo->pvo_pmap; + PMAP_LOCK(pmap); + pt = moea_pvo_to_pte(pvo, -1); + pvo->pvo_pte.pte.pte_lo &= ~PTE_WIMG; + pvo->pvo_pte.pte.pte_lo |= lo; + if (pt != NULL) { + moea_pte_change(pt, &pvo->pvo_pte.pte, + pvo->pvo_vaddr); + if (pvo->pvo_pmap == kernel_pmap) + isync(); + } + mtx_unlock(&moea_table_mutex); + PMAP_UNLOCK(pmap); + } + m->md.mdpg_cache_attrs = ma; + rw_wunlock(&pvh_global_lock); +} + +/* + * Map a wired page into kernel virtual address space. + */ +void +moea_kenter(vm_offset_t va, vm_paddr_t pa) +{ + + moea_kenter_attr(va, pa, VM_MEMATTR_DEFAULT); +} + +void +moea_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) +{ + u_int pte_lo; + int error; + +#if 0 + if (va < VM_MIN_KERNEL_ADDRESS) + panic("moea_kenter: attempt to enter non-kernel address %#x", + va); +#endif + + pte_lo = moea_calc_wimg(pa, ma); + + PMAP_LOCK(kernel_pmap); + error = moea_pvo_enter(kernel_pmap, moea_upvo_zone, + &moea_pvo_kunmanaged, va, pa, pte_lo, PVO_WIRED); + + if (error != 0 && error != ENOENT) + panic("moea_kenter: failed to enter va %#x pa %#x: %d", va, + pa, error); + + PMAP_UNLOCK(kernel_pmap); +} + +/* + * Extract the physical page address associated with the given kernel virtual + * address. + */ +vm_paddr_t +moea_kextract(vm_offset_t va) +{ + struct pvo_entry *pvo; + vm_paddr_t pa; + + /* + * Allow direct mappings on 32-bit OEA + */ + if (va < VM_MIN_KERNEL_ADDRESS) { + return (va); + } + + PMAP_LOCK(kernel_pmap); + pvo = moea_pvo_find_va(kernel_pmap, va & ~ADDR_POFF, NULL); + KASSERT(pvo != NULL, ("moea_kextract: no addr found")); + pa = PVO_PADDR(pvo) | (va & ADDR_POFF); + PMAP_UNLOCK(kernel_pmap); + return (pa); +} + +/* + * Remove a wired page from kernel virtual address space. + */ +void +moea_kremove(vm_offset_t va) +{ + + moea_remove(kernel_pmap, va, va + PAGE_SIZE); +} + +/* + * Provide a kernel pointer corresponding to a given userland pointer. + * The returned pointer is valid until the next time this function is + * called in this thread. This is used internally in copyin/copyout. + */ +int +moea_map_user_ptr(pmap_t pm, volatile const void *uaddr, + void **kaddr, size_t ulen, size_t *klen) +{ + size_t l; + register_t vsid; + + *kaddr = (char *)USER_ADDR + ((uintptr_t)uaddr & ~SEGMENT_MASK); + l = ((char *)USER_ADDR + SEGMENT_LENGTH) - (char *)(*kaddr); + if (l > ulen) + l = ulen; + if (klen) + *klen = l; + else if (l != ulen) + return (EFAULT); + + vsid = va_to_vsid(pm, (vm_offset_t)uaddr); + + /* Mark segment no-execute */ + vsid |= SR_N; + + /* If we have already set this VSID, we can just return */ + if (curthread->td_pcb->pcb_cpu.aim.usr_vsid == vsid) + return (0); + + __asm __volatile("isync"); + curthread->td_pcb->pcb_cpu.aim.usr_segm = + (uintptr_t)uaddr >> ADDR_SR_SHFT; + curthread->td_pcb->pcb_cpu.aim.usr_vsid = vsid; + __asm __volatile("mtsr %0,%1; isync" :: "n"(USER_SR), "r"(vsid)); + + return (0); +} + +/* + * Figure out where a given kernel pointer (usually in a fault) points + * to from the VM's perspective, potentially remapping into userland's + * address space. 
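+ *
+ * Only addresses inside the USER_ADDR segment are rewritten: their
+ * segment bits are replaced with the user segment recorded in the PCB
+ * by moea_map_user_ptr(), so a fault taken during copyin()/copyout()
+ * is attributed to the user address it really refers to.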
+ */ +static int +moea_decode_kernel_ptr(vm_offset_t addr, int *is_user, + vm_offset_t *decoded_addr) +{ + vm_offset_t user_sr; + + if ((addr >> ADDR_SR_SHFT) == (USER_ADDR >> ADDR_SR_SHFT)) { + user_sr = curthread->td_pcb->pcb_cpu.aim.usr_segm; + addr &= ADDR_PIDX | ADDR_POFF; + addr |= user_sr << ADDR_SR_SHFT; + *decoded_addr = addr; + *is_user = 1; + } else { + *decoded_addr = addr; + *is_user = 0; + } + + return (0); +} + +/* + * Map a range of physical addresses into kernel virtual address space. + * + * The value passed in *virt is a suggested virtual address for the mapping. + * Architectures which can support a direct-mapped physical to virtual region + * can return the appropriate address within that region, leaving '*virt' + * unchanged. We cannot and therefore do not; *virt is updated with the + * first usable address after the mapped region. + */ +vm_offset_t +moea_map(vm_offset_t *virt, vm_paddr_t pa_start, + vm_paddr_t pa_end, int prot) +{ + vm_offset_t sva, va; + + sva = *virt; + va = sva; + for (; pa_start < pa_end; pa_start += PAGE_SIZE, va += PAGE_SIZE) + moea_kenter(va, pa_start); + *virt = va; + return (sva); +} + +/* + * Returns true if the pmap's pv is one of the first + * 16 pvs linked to from this page. This count may + * be changed upwards or downwards in the future; it + * is only necessary that true be returned for a small + * subset of pmaps for proper page aging. + */ +bool +moea_page_exists_quick(pmap_t pmap, vm_page_t m) +{ + int loops; + struct pvo_entry *pvo; + bool rv; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("moea_page_exists_quick: page %p is not managed", m)); + loops = 0; + rv = false; + rw_wlock(&pvh_global_lock); + LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { + if (pvo->pvo_pmap == pmap) { + rv = true; + break; + } + if (++loops >= 16) + break; + } + rw_wunlock(&pvh_global_lock); + return (rv); +} + +void +moea_page_init(vm_page_t m) +{ + + m->md.mdpg_attrs = 0; + m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT; + LIST_INIT(&m->md.mdpg_pvoh); +} + +/* + * Return the number of managed mappings to the given physical page + * that are wired. + */ +int +moea_page_wired_mappings(vm_page_t m) +{ + struct pvo_entry *pvo; + int count; + + count = 0; + if ((m->oflags & VPO_UNMANAGED) != 0) + return (count); + rw_wlock(&pvh_global_lock); + LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) + if ((pvo->pvo_vaddr & PVO_WIRED) != 0) + count++; + rw_wunlock(&pvh_global_lock); + return (count); +} + +static u_int moea_vsidcontext; + +int +moea_pinit(pmap_t pmap) +{ + int i, mask; + u_int entropy; + + RB_INIT(&pmap->pmap_pvo); + + entropy = 0; + __asm __volatile("mftb %0" : "=r"(entropy)); + + if ((pmap->pmap_phys = (pmap_t)moea_kextract((vm_offset_t)pmap)) + == NULL) { + pmap->pmap_phys = pmap; + } + + mtx_lock(&moea_vsid_mutex); + /* + * Allocate some segment registers for this pmap. + */ + for (i = 0; i < NPMAPS; i += VSID_NBPW) { + u_int hash, n; + + /* + * Create a new value by multiplying by a prime and adding in + * entropy from the timebase register. This is to make the + * VSID more random so that the PT hash function collides + * less often. (Note that the prime casues gcc to do shifts + * instead of a multiply.) + */ + moea_vsidcontext = (moea_vsidcontext * 0x1105) + entropy; + hash = moea_vsidcontext & (NPMAPS - 1); + if (hash == 0) /* 0 is special, avoid it */ + continue; + n = hash >> 5; + mask = 1 << (hash & (VSID_NBPW - 1)); + hash = (moea_vsidcontext & 0xfffff); + if (moea_vsid_bitmap[n] & mask) { /* collision? 
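+ * the chosen hash is already in use;
+ * scan this bitmap word for a free bit
+ * with ffs(~bitmap) and splice that bit
+ * index into the low bits of the hash
+ * instead.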
*/ + /* anything free in this bucket? */ + if (moea_vsid_bitmap[n] == 0xffffffff) { + entropy = (moea_vsidcontext >> 20); + continue; + } + i = ffs(~moea_vsid_bitmap[n]) - 1; + mask = 1 << i; + hash &= rounddown2(0xfffff, VSID_NBPW); + hash |= i; + } + KASSERT(!(moea_vsid_bitmap[n] & mask), + ("Allocating in-use VSID group %#x\n", hash)); + moea_vsid_bitmap[n] |= mask; + for (i = 0; i < 16; i++) + pmap->pm_sr[i] = VSID_MAKE(i, hash); + mtx_unlock(&moea_vsid_mutex); + return (1); + } + + mtx_unlock(&moea_vsid_mutex); + panic("moea_pinit: out of segments"); +} + +/* + * Initialize the pmap associated with process 0. + */ +void +moea_pinit0(pmap_t pm) +{ + + PMAP_LOCK_INIT(pm); + moea_pinit(pm); + bzero(&pm->pm_stats, sizeof(pm->pm_stats)); +} + +/* + * Set the physical protection on the specified range of this map as requested. + */ +void +moea_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva, + vm_prot_t prot) +{ + struct pvo_entry *pvo, *tpvo, key; + struct pte *pt; + + KASSERT(pm == &curproc->p_vmspace->vm_pmap || pm == kernel_pmap, + ("moea_protect: non current pmap")); + + if ((prot & VM_PROT_READ) == VM_PROT_NONE) { + moea_remove(pm, sva, eva); + return; + } + + rw_wlock(&pvh_global_lock); + PMAP_LOCK(pm); + key.pvo_vaddr = sva; + for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); + pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { + tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); + + /* + * Grab the PTE pointer before we diddle with the cached PTE + * copy. + */ + pt = moea_pvo_to_pte(pvo, -1); + /* + * Change the protection of the page. + */ + pvo->pvo_pte.pte.pte_lo &= ~PTE_PP; + pvo->pvo_pte.pte.pte_lo |= PTE_BR; + + /* + * If the PVO is in the page table, update that pte as well. + */ + if (pt != NULL) { + moea_pte_change(pt, &pvo->pvo_pte.pte, pvo->pvo_vaddr); + mtx_unlock(&moea_table_mutex); + } + } + rw_wunlock(&pvh_global_lock); + PMAP_UNLOCK(pm); +} + +/* + * Map a list of wired pages into kernel virtual address space. This is + * intended for temporary mappings which do not need page modification or + * references recorded. Existing mappings in the region are overwritten. + */ +void +moea_qenter(vm_offset_t sva, vm_page_t *m, int count) +{ + vm_offset_t va; + + va = sva; + while (count-- > 0) { + moea_kenter(va, VM_PAGE_TO_PHYS(*m)); + va += PAGE_SIZE; + m++; + } +} + +/* + * Remove page mappings from kernel virtual address space. Intended for + * temporary mappings entered by moea_qenter. + */ +void +moea_qremove(vm_offset_t sva, int count) +{ + vm_offset_t va; + + va = sva; + while (count-- > 0) { + moea_kremove(va); + va += PAGE_SIZE; + } +} + +void +moea_release(pmap_t pmap) +{ + int idx, mask; + + /* + * Free segment register's VSID + */ + if (pmap->pm_sr[0] == 0) + panic("moea_release"); + + mtx_lock(&moea_vsid_mutex); + idx = VSID_TO_HASH(pmap->pm_sr[0]) & (NPMAPS-1); + mask = 1 << (idx % VSID_NBPW); + idx /= VSID_NBPW; + moea_vsid_bitmap[idx] &= ~mask; + mtx_unlock(&moea_vsid_mutex); +} + +/* + * Remove the given range of addresses from the specified map. + */ +void +moea_remove(pmap_t pm, vm_offset_t sva, vm_offset_t eva) +{ + struct pvo_entry *pvo, *tpvo, key; + + rw_wlock(&pvh_global_lock); + PMAP_LOCK(pm); + key.pvo_vaddr = sva; + for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); + pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { + tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); + moea_pvo_remove(pvo, -1); + } + PMAP_UNLOCK(pm); + rw_wunlock(&pvh_global_lock); +} + +/* + * Remove physical page from all pmaps in which it resides. 
moea_pvo_remove() + * will reflect changes in pte's back to the vm_page. + */ +void +moea_remove_all(vm_page_t m) +{ + struct pvo_head *pvo_head; + struct pvo_entry *pvo, *next_pvo; + pmap_t pmap; + + rw_wlock(&pvh_global_lock); + pvo_head = vm_page_to_pvoh(m); + for (pvo = LIST_FIRST(pvo_head); pvo != NULL; pvo = next_pvo) { + next_pvo = LIST_NEXT(pvo, pvo_vlink); + + pmap = pvo->pvo_pmap; + PMAP_LOCK(pmap); + moea_pvo_remove(pvo, -1); + PMAP_UNLOCK(pmap); + } + if ((m->a.flags & PGA_WRITEABLE) && moea_query_bit(m, PTE_CHG)) { + moea_attr_clear(m, PTE_CHG); + vm_page_dirty(m); + } + vm_page_aflag_clear(m, PGA_WRITEABLE); + rw_wunlock(&pvh_global_lock); +} + +static int +moea_mincore(pmap_t pm, vm_offset_t va, vm_paddr_t *pap) +{ + struct pvo_entry *pvo; + vm_paddr_t pa; + vm_page_t m; + int val; + bool managed; + + PMAP_LOCK(pm); + + pvo = moea_pvo_find_va(pm, va & ~ADDR_POFF, NULL); + if (pvo != NULL) { + pa = PVO_PADDR(pvo); + m = PHYS_TO_VM_PAGE(pa); + managed = (pvo->pvo_vaddr & PVO_MANAGED) == PVO_MANAGED; + val = MINCORE_INCORE; + } else { + PMAP_UNLOCK(pm); + return (0); + } + + PMAP_UNLOCK(pm); + + if (m == NULL) + return (0); + + if (managed) { + if (moea_is_modified(m)) + val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; + + if (moea_is_referenced(m)) + val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; + } + + if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != + (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && + managed) { + *pap = pa; + } + + return (val); +} + +/* + * Allocate a physical page of memory directly from the phys_avail map. + * Can only be called from moea_bootstrap before avail start and end are + * calculated. + */ +static vm_offset_t +moea_bootstrap_alloc(vm_size_t size, u_int align) +{ + vm_offset_t s, e; + int i, j; + + size = round_page(size); + for (i = 0; phys_avail[i + 1] != 0; i += 2) { + if (align != 0) + s = roundup2(phys_avail[i], align); + else + s = phys_avail[i]; + e = s + size; + + if (s < phys_avail[i] || e > phys_avail[i + 1]) + continue; + + if (s == phys_avail[i]) { + phys_avail[i] += size; + } else if (e == phys_avail[i + 1]) { + phys_avail[i + 1] -= size; + } else { + for (j = phys_avail_count * 2; j > i; j -= 2) { + phys_avail[j] = phys_avail[j - 2]; + phys_avail[j + 1] = phys_avail[j - 1]; + } + + phys_avail[i + 3] = phys_avail[i + 1]; + phys_avail[i + 1] = s; + phys_avail[i + 2] = e; + phys_avail_count++; + } + + return (s); + } + panic("moea_bootstrap_alloc: could not allocate memory"); +} + +static void +moea_syncicache(vm_paddr_t pa, vm_size_t len) +{ + __syncicache((void *)pa, len); +} + +static int +moea_pvo_enter(pmap_t pm, uma_zone_t zone, struct pvo_head *pvo_head, + vm_offset_t va, vm_paddr_t pa, u_int pte_lo, int flags) +{ + struct pvo_entry *pvo; + u_int sr; + int first; + u_int ptegidx; + int i; + int bootstrap; + + moea_pvo_enter_calls++; + first = 0; + bootstrap = 0; + + /* + * Compute the PTE Group index. + */ + va &= ~ADDR_POFF; + sr = va_to_sr(pm->pm_sr, va); + ptegidx = va_to_pteg(sr, va); + + /* + * Remove any existing mapping for this page. Reuse the pvo entry if + * there is a mapping. + */ + mtx_lock(&moea_table_mutex); + LIST_FOREACH(pvo, &moea_pvo_table[ptegidx], pvo_olink) { + if (pvo->pvo_pmap == pm && PVO_VADDR(pvo) == va) { + if (PVO_PADDR(pvo) == pa && + (pvo->pvo_pte.pte.pte_lo & PTE_PP) == + (pte_lo & PTE_PP)) { + /* + * The PTE is not changing. Instead, this may + * be a request to change the mapping's wired + * attribute. 
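+ * Only PVO_WIRED and the pmap's wired_count are adjusted in
+ * that case, and 0 is returned so the caller sees it as an
+ * existing mapping.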
+ */ + mtx_unlock(&moea_table_mutex); + if ((flags & PVO_WIRED) != 0 && + (pvo->pvo_vaddr & PVO_WIRED) == 0) { + pvo->pvo_vaddr |= PVO_WIRED; + pm->pm_stats.wired_count++; + } else if ((flags & PVO_WIRED) == 0 && + (pvo->pvo_vaddr & PVO_WIRED) != 0) { + pvo->pvo_vaddr &= ~PVO_WIRED; + pm->pm_stats.wired_count--; + } + return (0); + } + moea_pvo_remove(pvo, -1); + break; + } + } + + /* + * If we aren't overwriting a mapping, try to allocate. + */ + if (moea_initialized) { + pvo = uma_zalloc(zone, M_NOWAIT); + } else { + if (moea_bpvo_pool_index >= BPVO_POOL_SIZE) { + panic("moea_enter: bpvo pool exhausted, %d, %d, %d", + moea_bpvo_pool_index, BPVO_POOL_SIZE, + BPVO_POOL_SIZE * sizeof(struct pvo_entry)); + } + pvo = &moea_bpvo_pool[moea_bpvo_pool_index]; + moea_bpvo_pool_index++; + bootstrap = 1; + } + + if (pvo == NULL) { + mtx_unlock(&moea_table_mutex); + return (ENOMEM); + } + + moea_pvo_entries++; + pvo->pvo_vaddr = va; + pvo->pvo_pmap = pm; + LIST_INSERT_HEAD(&moea_pvo_table[ptegidx], pvo, pvo_olink); + pvo->pvo_vaddr &= ~ADDR_POFF; + if (flags & PVO_WIRED) + pvo->pvo_vaddr |= PVO_WIRED; + if (pvo_head != &moea_pvo_kunmanaged) + pvo->pvo_vaddr |= PVO_MANAGED; + if (bootstrap) + pvo->pvo_vaddr |= PVO_BOOTSTRAP; + + moea_pte_create(&pvo->pvo_pte.pte, sr, va, pa | pte_lo); + + /* + * Add to pmap list + */ + RB_INSERT(pvo_tree, &pm->pmap_pvo, pvo); + + /* + * Remember if the list was empty and therefore will be the first + * item. + */ + if (LIST_FIRST(pvo_head) == NULL) + first = 1; + LIST_INSERT_HEAD(pvo_head, pvo, pvo_vlink); + + if (pvo->pvo_vaddr & PVO_WIRED) + pm->pm_stats.wired_count++; + pm->pm_stats.resident_count++; + + i = moea_pte_insert(ptegidx, &pvo->pvo_pte.pte); + KASSERT(i < 8, ("Invalid PTE index")); + if (i >= 0) { + PVO_PTEGIDX_SET(pvo, i); + } else { + panic("moea_pvo_enter: overflow"); + moea_pte_overflow++; + } + mtx_unlock(&moea_table_mutex); + + return (first ? ENOENT : 0); +} + +static void +moea_pvo_remove(struct pvo_entry *pvo, int pteidx) +{ + struct pte *pt; + + /* + * If there is an active pte entry, we need to deactivate it (and + * save the ref & cfg bits). + */ + pt = moea_pvo_to_pte(pvo, pteidx); + if (pt != NULL) { + moea_pte_unset(pt, &pvo->pvo_pte.pte, pvo->pvo_vaddr); + mtx_unlock(&moea_table_mutex); + PVO_PTEGIDX_CLR(pvo); + } else { + moea_pte_overflow--; + } + + /* + * Update our statistics. + */ + pvo->pvo_pmap->pm_stats.resident_count--; + if (pvo->pvo_vaddr & PVO_WIRED) + pvo->pvo_pmap->pm_stats.wired_count--; + + /* + * Remove this PVO from the PV and pmap lists. + */ + LIST_REMOVE(pvo, pvo_vlink); + RB_REMOVE(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo); + + /* + * Save the REF/CHG bits into their cache if the page is managed. + * Clear PGA_WRITEABLE if all mappings of the page have been removed. + */ + if ((pvo->pvo_vaddr & PVO_MANAGED) == PVO_MANAGED) { + struct vm_page *pg; + + pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo)); + if (pg != NULL) { + moea_attr_save(pg, pvo->pvo_pte.pte.pte_lo & + (PTE_REF | PTE_CHG)); + if (LIST_EMPTY(&pg->md.mdpg_pvoh)) + vm_page_aflag_clear(pg, PGA_WRITEABLE); + } + } + + /* + * Remove this from the overflow list and return it to the pool + * if we aren't going to reuse it. + */ + LIST_REMOVE(pvo, pvo_olink); + if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP)) + uma_zfree(pvo->pvo_vaddr & PVO_MANAGED ? 
moea_mpvo_zone : + moea_upvo_zone, pvo); + moea_pvo_entries--; + moea_pvo_remove_calls++; +} + +static __inline int +moea_pvo_pte_index(const struct pvo_entry *pvo, int ptegidx) +{ + int pteidx; + + /* + * We can find the actual pte entry without searching by grabbing + * the PTEG index from 3 unused bits in pte_lo[11:9] and by + * noticing the HID bit. + */ + pteidx = ptegidx * 8 + PVO_PTEGIDX_GET(pvo); + if (pvo->pvo_pte.pte.pte_hi & PTE_HID) + pteidx ^= moea_pteg_mask * 8; + + return (pteidx); +} + +static struct pvo_entry * +moea_pvo_find_va(pmap_t pm, vm_offset_t va, int *pteidx_p) +{ + struct pvo_entry *pvo; + int ptegidx; + u_int sr; + + va &= ~ADDR_POFF; + sr = va_to_sr(pm->pm_sr, va); + ptegidx = va_to_pteg(sr, va); + + mtx_lock(&moea_table_mutex); + LIST_FOREACH(pvo, &moea_pvo_table[ptegidx], pvo_olink) { + if (pvo->pvo_pmap == pm && PVO_VADDR(pvo) == va) { + if (pteidx_p) + *pteidx_p = moea_pvo_pte_index(pvo, ptegidx); + break; + } + } + mtx_unlock(&moea_table_mutex); + + return (pvo); +} + +static struct pte * +moea_pvo_to_pte(const struct pvo_entry *pvo, int pteidx) +{ + struct pte *pt; + + /* + * If we haven't been supplied the ptegidx, calculate it. + */ + if (pteidx == -1) { + int ptegidx; + u_int sr; + + sr = va_to_sr(pvo->pvo_pmap->pm_sr, pvo->pvo_vaddr); + ptegidx = va_to_pteg(sr, pvo->pvo_vaddr); + pteidx = moea_pvo_pte_index(pvo, ptegidx); + } + + pt = &moea_pteg_table[pteidx >> 3].pt[pteidx & 7]; + mtx_lock(&moea_table_mutex); + + if ((pvo->pvo_pte.pte.pte_hi & PTE_VALID) && !PVO_PTEGIDX_ISSET(pvo)) { + panic("moea_pvo_to_pte: pvo %p has valid pte in pvo but no " + "valid pte index", pvo); + } + + if ((pvo->pvo_pte.pte.pte_hi & PTE_VALID) == 0 && PVO_PTEGIDX_ISSET(pvo)) { + panic("moea_pvo_to_pte: pvo %p has valid pte index in pvo " + "pvo but no valid pte", pvo); + } + + if ((pt->pte_hi ^ (pvo->pvo_pte.pte.pte_hi & ~PTE_VALID)) == PTE_VALID) { + if ((pvo->pvo_pte.pte.pte_hi & PTE_VALID) == 0) { + panic("moea_pvo_to_pte: pvo %p has valid pte in " + "moea_pteg_table %p but invalid in pvo", pvo, pt); + } + + if (((pt->pte_lo ^ pvo->pvo_pte.pte.pte_lo) & ~(PTE_CHG|PTE_REF)) + != 0) { + panic("moea_pvo_to_pte: pvo %p pte does not match " + "pte %p in moea_pteg_table", pvo, pt); + } + + mtx_assert(&moea_table_mutex, MA_OWNED); + return (pt); + } + + if (pvo->pvo_pte.pte.pte_hi & PTE_VALID) { + panic("moea_pvo_to_pte: pvo %p has invalid pte %p in " + "moea_pteg_table but valid in pvo: %8x, %8x", pvo, pt, pvo->pvo_pte.pte.pte_hi, pt->pte_hi); + } + + mtx_unlock(&moea_table_mutex); + return (NULL); +} + +/* + * XXX: THIS STUFF SHOULD BE IN pte.c? + */ +int +moea_pte_spill(vm_offset_t addr) +{ + struct pvo_entry *source_pvo, *victim_pvo; + struct pvo_entry *pvo; + int ptegidx, i, j; + u_int sr; + struct pteg *pteg; + struct pte *pt; + + moea_pte_spills++; + + sr = mfsrin(addr); + ptegidx = va_to_pteg(sr, addr); + + /* + * Have to substitute some entry. Use the primary hash for this. + * Use low bits of timebase as random generator. + */ + pteg = &moea_pteg_table[ptegidx]; + mtx_lock(&moea_table_mutex); + __asm __volatile("mftb %0" : "=r"(i)); + i &= 7; + pt = &pteg->pt[i]; + + source_pvo = NULL; + victim_pvo = NULL; + LIST_FOREACH(pvo, &moea_pvo_table[ptegidx], pvo_olink) { + /* + * We need to find a pvo entry for this address. + */ + if (source_pvo == NULL && + moea_pte_match(&pvo->pvo_pte.pte, sr, addr, + pvo->pvo_pte.pte.pte_hi & PTE_HID)) { + /* + * Now found an entry to be spilled into the pteg. + * The PTE is now valid, so we know it's active. 
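+ * moea_pte_insert() below pushes it back into the hash table;
+ * if that succeeds the overflow is resolved and the faulting
+ * access can simply be retried, otherwise this PVO becomes the
+ * source of the explicit victim replacement further down.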
+ */ + j = moea_pte_insert(ptegidx, &pvo->pvo_pte.pte); + + if (j >= 0) { + PVO_PTEGIDX_SET(pvo, j); + moea_pte_overflow--; + mtx_unlock(&moea_table_mutex); + return (1); + } + + source_pvo = pvo; + + if (victim_pvo != NULL) + break; + } + + /* + * We also need the pvo entry of the victim we are replacing + * so save the R & C bits of the PTE. + */ + if ((pt->pte_hi & PTE_HID) == 0 && victim_pvo == NULL && + moea_pte_compare(pt, &pvo->pvo_pte.pte)) { + victim_pvo = pvo; + if (source_pvo != NULL) + break; + } + } + + if (source_pvo == NULL) { + mtx_unlock(&moea_table_mutex); + return (0); + } + + if (victim_pvo == NULL) { + if ((pt->pte_hi & PTE_HID) == 0) + panic("moea_pte_spill: victim p-pte (%p) has no pvo" + "entry", pt); + + /* + * If this is a secondary PTE, we need to search it's primary + * pvo bucket for the matching PVO. + */ + LIST_FOREACH(pvo, &moea_pvo_table[ptegidx ^ moea_pteg_mask], + pvo_olink) { + /* + * We also need the pvo entry of the victim we are + * replacing so save the R & C bits of the PTE. + */ + if (moea_pte_compare(pt, &pvo->pvo_pte.pte)) { + victim_pvo = pvo; + break; + } + } + + if (victim_pvo == NULL) + panic("moea_pte_spill: victim s-pte (%p) has no pvo" + "entry", pt); + } + + /* + * We are invalidating the TLB entry for the EA we are replacing even + * though it's valid. If we don't, we lose any ref/chg bit changes + * contained in the TLB entry. + */ + source_pvo->pvo_pte.pte.pte_hi &= ~PTE_HID; + + moea_pte_unset(pt, &victim_pvo->pvo_pte.pte, victim_pvo->pvo_vaddr); + moea_pte_set(pt, &source_pvo->pvo_pte.pte); + + PVO_PTEGIDX_CLR(victim_pvo); + PVO_PTEGIDX_SET(source_pvo, i); + moea_pte_replacements++; + + mtx_unlock(&moea_table_mutex); + return (1); +} + +static __inline struct pvo_entry * +moea_pte_spillable_ident(u_int ptegidx) +{ + struct pte *pt; + struct pvo_entry *pvo_walk, *pvo = NULL; + + LIST_FOREACH(pvo_walk, &moea_pvo_table[ptegidx], pvo_olink) { + if (pvo_walk->pvo_vaddr & PVO_WIRED) + continue; + + if (!(pvo_walk->pvo_pte.pte.pte_hi & PTE_VALID)) + continue; + + pt = moea_pvo_to_pte(pvo_walk, -1); + + if (pt == NULL) + continue; + + pvo = pvo_walk; + + mtx_unlock(&moea_table_mutex); + if (!(pt->pte_lo & PTE_REF)) + return (pvo_walk); + } + + return (pvo); +} + +static int +moea_pte_insert(u_int ptegidx, struct pte *pvo_pt) +{ + struct pte *pt; + struct pvo_entry *victim_pvo; + int i; + int victim_idx; + u_int pteg_bkpidx = ptegidx; + + mtx_assert(&moea_table_mutex, MA_OWNED); + + /* + * First try primary hash. + */ + for (pt = moea_pteg_table[ptegidx].pt, i = 0; i < 8; i++, pt++) { + if ((pt->pte_hi & PTE_VALID) == 0) { + pvo_pt->pte_hi &= ~PTE_HID; + moea_pte_set(pt, pvo_pt); + return (i); + } + } + + /* + * Now try secondary hash. + */ + ptegidx ^= moea_pteg_mask; + + for (pt = moea_pteg_table[ptegidx].pt, i = 0; i < 8; i++, pt++) { + if ((pt->pte_hi & PTE_VALID) == 0) { + pvo_pt->pte_hi |= PTE_HID; + moea_pte_set(pt, pvo_pt); + return (i); + } + } + + /* Try again, but this time try to force a PTE out. */ + ptegidx = pteg_bkpidx; + + victim_pvo = moea_pte_spillable_ident(ptegidx); + if (victim_pvo == NULL) { + ptegidx ^= moea_pteg_mask; + victim_pvo = moea_pte_spillable_ident(ptegidx); + } + + if (victim_pvo == NULL) { + panic("moea_pte_insert: overflow"); + return (-1); + } + + victim_idx = moea_pvo_pte_index(victim_pvo, ptegidx); + + if (pteg_bkpidx == ptegidx) + pvo_pt->pte_hi &= ~PTE_HID; + else + pvo_pt->pte_hi |= PTE_HID; + + /* + * Synchronize the sacrifice PTE with its PVO, then mark both + * invalid. 
The PVO will be reused when/if the VM system comes + * here after a fault. + */ + pt = &moea_pteg_table[victim_idx >> 3].pt[victim_idx & 7]; + + if (pt->pte_hi != victim_pvo->pvo_pte.pte.pte_hi) + panic("Victim PVO doesn't match PTE! PVO: %8x, PTE: %8x", victim_pvo->pvo_pte.pte.pte_hi, pt->pte_hi); + + /* + * Set the new PTE. + */ + moea_pte_unset(pt, &victim_pvo->pvo_pte.pte, victim_pvo->pvo_vaddr); + PVO_PTEGIDX_CLR(victim_pvo); + moea_pte_overflow++; + moea_pte_set(pt, pvo_pt); + + return (victim_idx & 7); +} + +static bool +moea_query_bit(vm_page_t m, int ptebit) +{ + struct pvo_entry *pvo; + struct pte *pt; + + rw_assert(&pvh_global_lock, RA_WLOCKED); + if (moea_attr_fetch(m) & ptebit) + return (true); + + LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { + /* + * See if we saved the bit off. If so, cache it and return + * success. + */ + if (pvo->pvo_pte.pte.pte_lo & ptebit) { + moea_attr_save(m, ptebit); + return (true); + } + } + + /* + * No luck, now go through the hard part of looking at the PTEs + * themselves. Sync so that any pending REF/CHG bits are flushed to + * the PTEs. + */ + powerpc_sync(); + LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { + /* + * See if this pvo has a valid PTE. if so, fetch the + * REF/CHG bits from the valid PTE. If the appropriate + * ptebit is set, cache it and return success. + */ + pt = moea_pvo_to_pte(pvo, -1); + if (pt != NULL) { + moea_pte_synch(pt, &pvo->pvo_pte.pte); + mtx_unlock(&moea_table_mutex); + if (pvo->pvo_pte.pte.pte_lo & ptebit) { + moea_attr_save(m, ptebit); + return (true); + } + } + } + + return (false); +} + +static u_int +moea_clear_bit(vm_page_t m, int ptebit) +{ + u_int count; + struct pvo_entry *pvo; + struct pte *pt; + + rw_assert(&pvh_global_lock, RA_WLOCKED); + + /* + * Clear the cached value. + */ + moea_attr_clear(m, ptebit); + + /* + * Sync so that any pending REF/CHG bits are flushed to the PTEs (so + * we can reset the right ones). note that since the pvo entries and + * list heads are accessed via BAT0 and are never placed in the page + * table, we don't have to worry about further accesses setting the + * REF/CHG bits. + */ + powerpc_sync(); + + /* + * For each pvo entry, clear the pvo's ptebit. If this pvo has a + * valid pte clear the ptebit from the valid pte. + */ + count = 0; + LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { + pt = moea_pvo_to_pte(pvo, -1); + if (pt != NULL) { + moea_pte_synch(pt, &pvo->pvo_pte.pte); + if (pvo->pvo_pte.pte.pte_lo & ptebit) { + count++; + moea_pte_clear(pt, PVO_VADDR(pvo), ptebit); + } + mtx_unlock(&moea_table_mutex); + } + pvo->pvo_pte.pte.pte_lo &= ~ptebit; + } + + return (count); +} + +/* + * Return true if the physical range is encompassed by the battable[idx] + */ +static int +moea_bat_mapped(int idx, vm_paddr_t pa, vm_size_t size) +{ + u_int prot; + u_int32_t start; + u_int32_t end; + u_int32_t bat_ble; + + /* + * Return immediately if not a valid mapping + */ + if (!(battable[idx].batu & BAT_Vs)) + return (EINVAL); + + /* + * The BAT entry must be cache-inhibited, guarded, and r/w + * so it can function as an i/o page + */ + prot = battable[idx].batl & (BAT_I|BAT_G|BAT_PP_RW); + if (prot != (BAT_I|BAT_G|BAT_PP_RW)) + return (EPERM); + + /* + * The address should be within the BAT range. 
Assume that the + * start address in the BAT has the correct alignment (thus + * not requiring masking) + */ + start = battable[idx].batl & BAT_PBS; + bat_ble = (battable[idx].batu & ~(BAT_EBS)) | 0x03; + end = start | (bat_ble << 15) | 0x7fff; + + if ((pa < start) || ((pa + size) > end)) + return (ERANGE); + + return (0); +} + +int +moea_dev_direct_mapped(vm_paddr_t pa, vm_size_t size) +{ + int i; + + /* + * This currently does not work for entries that + * overlap 256M BAT segments. + */ + + for(i = 0; i < 16; i++) + if (moea_bat_mapped(i, pa, size) == 0) + return (0); + + return (EFAULT); +} + +/* + * Map a set of physical memory pages into the kernel virtual + * address space. Return a pointer to where it is mapped. This + * routine is intended to be used for mapping device memory, + * NOT real memory. + */ +void * +moea_mapdev(vm_paddr_t pa, vm_size_t size) +{ + + return (moea_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT)); +} + +void * +moea_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t ma) +{ + vm_offset_t va, tmpva, ppa, offset; + int i; + + ppa = trunc_page(pa); + offset = pa & PAGE_MASK; + size = roundup(offset + size, PAGE_SIZE); + + /* + * If the physical address lies within a valid BAT table entry, + * return the 1:1 mapping. This currently doesn't work + * for regions that overlap 256M BAT segments. + */ + for (i = 0; i < 16; i++) { + if (moea_bat_mapped(i, pa, size) == 0) + return ((void *) pa); + } + + va = kva_alloc(size); + if (!va) + panic("moea_mapdev: Couldn't alloc kernel virtual memory"); + + for (tmpva = va; size > 0;) { + moea_kenter_attr(tmpva, ppa, ma); + tlbie(tmpva); + size -= PAGE_SIZE; + tmpva += PAGE_SIZE; + ppa += PAGE_SIZE; + } + + return ((void *)(va + offset)); +} + +void +moea_unmapdev(void *p, vm_size_t size) +{ + vm_offset_t base, offset, va; + + /* + * If this is outside kernel virtual space, then it's a + * battable entry and doesn't require unmapping + */ + va = (vm_offset_t)p; + if ((va >= VM_MIN_KERNEL_ADDRESS) && (va <= virtual_end)) { + base = trunc_page(va); + offset = va & PAGE_MASK; + size = roundup(offset + size, PAGE_SIZE); + moea_qremove(base, atop(size)); + kva_free(base, size); + } +} + +static void +moea_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) +{ + struct pvo_entry *pvo; + vm_offset_t lim; + vm_paddr_t pa; + vm_size_t len; + + PMAP_LOCK(pm); + while (sz > 0) { + lim = round_page(va + 1); + len = MIN(lim - va, sz); + pvo = moea_pvo_find_va(pm, va & ~ADDR_POFF, NULL); + if (pvo != NULL) { + pa = PVO_PADDR(pvo) | (va & ADDR_POFF); + moea_syncicache(pa, len); + } + va += len; + sz -= len; + } + PMAP_UNLOCK(pm); +} + +void +moea_dumpsys_map(vm_paddr_t pa, size_t sz, void **va) +{ + + *va = (void *)pa; +} + +extern struct dump_pa dump_map[PHYS_AVAIL_SZ + 1]; + +void +moea_scan_init(void) +{ + struct pvo_entry *pvo; + vm_offset_t va; + int i; + + if (!do_minidump) { + /* Initialize phys. segments for dumpsys(). */ + memset(&dump_map, 0, sizeof(dump_map)); + mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); + for (i = 0; i < pregions_sz; i++) { + dump_map[i].pa_start = pregions[i].mr_start; + dump_map[i].pa_size = pregions[i].mr_size; + } + return; + } + + /* Virtual segments for minidumps: */ + memset(&dump_map, 0, sizeof(dump_map)); + + /* 1st: kernel .data and .bss. */ + dump_map[0].pa_start = trunc_page((uintptr_t)_etext); + dump_map[0].pa_size = + round_page((uintptr_t)_end) - dump_map[0].pa_start; + + /* 2nd: msgbuf and tables (see pmap_bootstrap()). 
*/ + dump_map[1].pa_start = (vm_paddr_t)msgbufp->msg_ptr; + dump_map[1].pa_size = round_page(msgbufp->msg_size); + + /* 3rd: kernel VM. */ + va = dump_map[1].pa_start + dump_map[1].pa_size; + /* Find start of next chunk (from va). */ + while (va < virtual_end) { + /* Don't dump the buffer cache. */ + if (va >= kmi.buffer_sva && va < kmi.buffer_eva) { + va = kmi.buffer_eva; + continue; + } + pvo = moea_pvo_find_va(kernel_pmap, va & ~ADDR_POFF, NULL); + if (pvo != NULL && (pvo->pvo_pte.pte.pte_hi & PTE_VALID)) + break; + va += PAGE_SIZE; + } + if (va < virtual_end) { + dump_map[2].pa_start = va; + va += PAGE_SIZE; + /* Find last page in chunk. */ + while (va < virtual_end) { + /* Don't run into the buffer cache. */ + if (va == kmi.buffer_sva) + break; + pvo = moea_pvo_find_va(kernel_pmap, va & ~ADDR_POFF, + NULL); + if (pvo == NULL || + !(pvo->pvo_pte.pte.pte_hi & PTE_VALID)) + break; + va += PAGE_SIZE; + } + dump_map[2].pa_size = va - dump_map[2].pa_start; + } +} diff --git a/sys/powerpc/aim/mmu_oea64.c b/sys/powerpc/aim/mmu_oea64.c new file mode 100644 index 000000000000..01bf4c7e90a8 --- /dev/null +++ b/sys/powerpc/aim/mmu_oea64.c @@ -0,0 +1,4341 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2008-2015 Nathan Whitehorn + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +/* + * Manages physical address maps. + * + * Since the information managed by this module is also stored by the + * logical address mapping module, this module may throw away valid virtual + * to physical mappings at almost any time. However, invalidations of + * mappings must be done as requested. + * + * In order to cope with hardware architectures which make virtual to + * physical map invalidates expensive, this module may delay invalidate + * reduced protection operations until such time as they are actually + * necessary. This module is given full information as to which processors + * are currently using which maps, and to when physical maps must be made + * correct. 
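+ *
+ * In this file the pmap interface is implemented for the 64-bit
+ * PowerPC (OEA64) hashed page table: each mapping is tracked as a PVO
+ * entry that shadows the corresponding hardware hash-table slot.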
+ */ + +#include "opt_kstack_pages.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/conf.h> +#include <sys/queue.h> +#include <sys/cpuset.h> +#include <sys/kerneldump.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/msgbuf.h> +#include <sys/malloc.h> +#include <sys/mman.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/sched.h> +#include <sys/sysctl.h> +#include <sys/systm.h> +#include <sys/vmmeter.h> +#include <sys/smp.h> +#include <sys/reboot.h> + +#include <sys/kdb.h> + +#include <dev/ofw/openfirm.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_page.h> +#include <vm/vm_phys.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/vm_pageout.h> +#include <vm/vm_dumpset.h> +#include <vm/vm_radix.h> +#include <vm/vm_reserv.h> +#include <vm/uma.h> + +#include <machine/_inttypes.h> +#include <machine/cpu.h> +#include <machine/ifunc.h> +#include <machine/platform.h> +#include <machine/frame.h> +#include <machine/md_var.h> +#include <machine/psl.h> +#include <machine/bat.h> +#include <machine/hid.h> +#include <machine/pte.h> +#include <machine/sr.h> +#include <machine/trap.h> +#include <machine/mmuvar.h> + +#include "mmu_oea64.h" + +void moea64_release_vsid(uint64_t vsid); +uintptr_t moea64_get_unique_vsid(void); + +#define DISABLE_TRANS(msr) msr = mfmsr(); mtmsr(msr & ~PSL_DR) +#define ENABLE_TRANS(msr) mtmsr(msr) + +#define VSID_MAKE(sr, hash) ((sr) | (((hash) & 0xfffff) << 4)) +#define VSID_TO_HASH(vsid) (((vsid) >> 4) & 0xfffff) +#define VSID_HASH_MASK 0x0000007fffffffffULL + +/* + * Locking semantics: + * + * There are two locks of interest: the page locks and the pmap locks, which + * protect their individual PVO lists and are locked in that order. The contents + * of all PVO entries are protected by the locks of their respective pmaps. + * The pmap of any PVO is guaranteed not to change so long as the PVO is linked + * into any list. + * + */ + +#define PV_LOCK_COUNT MAXCPU +static struct mtx_padalign pv_lock[PV_LOCK_COUNT]; + +#define PV_LOCK_SHIFT 21 +#define pa_index(pa) ((pa) >> PV_LOCK_SHIFT) + +/* + * Cheap NUMA-izing of the pv locks, to reduce contention across domains. + * NUMA domains on POWER9 appear to be indexed as sparse memory spaces, with the + * index at (N << 45). 
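+ * Folding ((pa) >> 45) + 1 into the multiplier below therefore
+ * spreads addresses from different domains over different lock
+ * indices, while the hash otherwise works on PV_LOCK_SHIFT-sized
+ * (2MB) chunks of physical address space.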
+ */ +#ifdef __powerpc64__ +#define PV_LOCK_IDX(pa) ((pa_index(pa) * (((pa) >> 45) + 1)) % PV_LOCK_COUNT) +#else +#define PV_LOCK_IDX(pa) (pa_index(pa) % PV_LOCK_COUNT) +#endif +#define PV_LOCKPTR(pa) ((struct mtx *)(&pv_lock[PV_LOCK_IDX(pa)])) +#define PV_LOCK(pa) mtx_lock(PV_LOCKPTR(pa)) +#define PV_UNLOCK(pa) mtx_unlock(PV_LOCKPTR(pa)) +#define PV_LOCKASSERT(pa) mtx_assert(PV_LOCKPTR(pa), MA_OWNED) +#define PV_PAGE_LOCK(m) PV_LOCK(VM_PAGE_TO_PHYS(m)) +#define PV_PAGE_UNLOCK(m) PV_UNLOCK(VM_PAGE_TO_PHYS(m)) +#define PV_PAGE_LOCKASSERT(m) PV_LOCKASSERT(VM_PAGE_TO_PHYS(m)) + +/* Superpage PV lock */ + +#define PV_LOCK_SIZE (1 << PV_LOCK_SHIFT) + +static __always_inline void +moea64_sp_pv_lock(vm_paddr_t pa) +{ + vm_paddr_t pa_end; + + /* Note: breaking when pa_end is reached to avoid overflows */ + pa_end = pa + (HPT_SP_SIZE - PV_LOCK_SIZE); + for (;;) { + mtx_lock_flags(PV_LOCKPTR(pa), MTX_DUPOK); + if (pa == pa_end) + break; + pa += PV_LOCK_SIZE; + } +} + +static __always_inline void +moea64_sp_pv_unlock(vm_paddr_t pa) +{ + vm_paddr_t pa_end; + + /* Note: breaking when pa_end is reached to avoid overflows */ + pa_end = pa; + pa += HPT_SP_SIZE - PV_LOCK_SIZE; + for (;;) { + mtx_unlock_flags(PV_LOCKPTR(pa), MTX_DUPOK); + if (pa == pa_end) + break; + pa -= PV_LOCK_SIZE; + } +} + +#define SP_PV_LOCK_ALIGNED(pa) moea64_sp_pv_lock(pa) +#define SP_PV_UNLOCK_ALIGNED(pa) moea64_sp_pv_unlock(pa) +#define SP_PV_LOCK(pa) moea64_sp_pv_lock((pa) & ~HPT_SP_MASK) +#define SP_PV_UNLOCK(pa) moea64_sp_pv_unlock((pa) & ~HPT_SP_MASK) +#define SP_PV_PAGE_LOCK(m) SP_PV_LOCK(VM_PAGE_TO_PHYS(m)) +#define SP_PV_PAGE_UNLOCK(m) SP_PV_UNLOCK(VM_PAGE_TO_PHYS(m)) + +struct ofw_map { + cell_t om_va; + cell_t om_len; + uint64_t om_pa; + cell_t om_mode; +}; + +extern unsigned char _etext[]; +extern unsigned char _end[]; + +extern void *slbtrap, *slbtrapend; + +/* + * Map of physical memory regions. + */ +static struct mem_region *regions; +static struct mem_region *pregions; +static struct numa_mem_region *numa_pregions; +static u_int phys_avail_count; +static int regions_sz, pregions_sz, numapregions_sz; + +extern void bs_remap_earlyboot(void); + +/* + * Lock for the SLB tables. + */ +struct mtx moea64_slb_mutex; + +/* + * PTEG data. + */ +u_long moea64_pteg_count; +u_long moea64_pteg_mask; + +/* + * PVO data. + */ + +uma_zone_t moea64_pvo_zone; /* zone for pvo entries */ + +static struct pvo_entry *moea64_bpvo_pool; +static int moea64_bpvo_pool_index = 0; +static int moea64_bpvo_pool_size = 0; +SYSCTL_INT(_machdep, OID_AUTO, moea64_allocated_bpvo_entries, CTLFLAG_RD, + &moea64_bpvo_pool_index, 0, ""); + +#define BPVO_POOL_SIZE 327680 /* Sensible historical default value */ +#define BPVO_POOL_EXPANSION_FACTOR 3 +#define VSID_NBPW (sizeof(u_int32_t) * 8) +#ifdef __powerpc64__ +#define NVSIDS (NPMAPS * 16) +#define VSID_HASHMASK 0xffffffffUL +#else +#define NVSIDS NPMAPS +#define VSID_HASHMASK 0xfffffUL +#endif +static u_int moea64_vsid_bitmap[NVSIDS / VSID_NBPW]; + +static bool moea64_initialized = false; + +#ifdef MOEA64_STATS +/* + * Statistics. 
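+ * These counters are only compiled in when MOEA64_STATS is defined
+ * and are exported read-only under machdep.* by the SYSCTL_INT()
+ * entries below.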
+ */ +u_int moea64_pte_valid = 0; +u_int moea64_pte_overflow = 0; +u_int moea64_pvo_entries = 0; +u_int moea64_pvo_enter_calls = 0; +u_int moea64_pvo_remove_calls = 0; +SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_valid, CTLFLAG_RD, + &moea64_pte_valid, 0, ""); +SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_overflow, CTLFLAG_RD, + &moea64_pte_overflow, 0, ""); +SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_entries, CTLFLAG_RD, + &moea64_pvo_entries, 0, ""); +SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_enter_calls, CTLFLAG_RD, + &moea64_pvo_enter_calls, 0, ""); +SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_remove_calls, CTLFLAG_RD, + &moea64_pvo_remove_calls, 0, ""); +#endif + +vm_offset_t moea64_scratchpage_va[2]; +struct pvo_entry *moea64_scratchpage_pvo[2]; +struct mtx moea64_scratchpage_mtx; + +uint64_t moea64_large_page_mask = 0; +uint64_t moea64_large_page_size = 0; +int moea64_large_page_shift = 0; +bool moea64_has_lp_4k_16m = false; + +/* + * PVO calls. + */ +static int moea64_pvo_enter(struct pvo_entry *pvo, + struct pvo_head *pvo_head, struct pvo_entry **oldpvo); +static void moea64_pvo_remove_from_pmap(struct pvo_entry *pvo); +static void moea64_pvo_remove_from_page(struct pvo_entry *pvo); +static void moea64_pvo_remove_from_page_locked( + struct pvo_entry *pvo, vm_page_t m); +static struct pvo_entry *moea64_pvo_find_va(pmap_t, vm_offset_t); + +/* + * Utility routines. + */ +static bool moea64_query_bit(vm_page_t, uint64_t); +static u_int moea64_clear_bit(vm_page_t, uint64_t); +static void moea64_kremove(vm_offset_t); +static void moea64_syncicache(pmap_t pmap, vm_offset_t va, + vm_paddr_t pa, vm_size_t sz); +static void moea64_pmap_init_qpages(void *); +static void moea64_remove_locked(pmap_t, vm_offset_t, + vm_offset_t, struct pvo_dlist *); + +/* + * Superpages data and routines. + */ + +/* + * PVO flags (in vaddr) that must match for promotion to succeed. + * Note that protection bits are checked separately, as they reside in + * another field. + */ +#define PVO_FLAGS_PROMOTE (PVO_WIRED | PVO_MANAGED | PVO_PTEGIDX_VALID) + +#define PVO_IS_SP(pvo) (((pvo)->pvo_vaddr & PVO_LARGE) && \ + (pvo)->pvo_pmap != kernel_pmap) + +/* Get physical address from PVO. */ +#define PVO_PADDR(pvo) moea64_pvo_paddr(pvo) + +/* MD page flag indicating that the page is a superpage. 
*/ +#define MDPG_ATTR_SP 0x40000000 + +SYSCTL_DECL(_vm_pmap); + +static SYSCTL_NODE(_vm_pmap, OID_AUTO, sp, CTLFLAG_RD, 0, + "SP page mapping counters"); + +static u_long sp_demotions; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, demotions, CTLFLAG_RD, + &sp_demotions, 0, "SP page demotions"); + +static u_long sp_mappings; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, mappings, CTLFLAG_RD, + &sp_mappings, 0, "SP page mappings"); + +static u_long sp_p_failures; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_failures, CTLFLAG_RD, + &sp_p_failures, 0, "SP page promotion failures"); + +static u_long sp_p_fail_pa; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_pa, CTLFLAG_RD, + &sp_p_fail_pa, 0, "SP page promotion failure: PAs don't match"); + +static u_long sp_p_fail_flags; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_flags, CTLFLAG_RD, + &sp_p_fail_flags, 0, "SP page promotion failure: page flags don't match"); + +static u_long sp_p_fail_prot; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_prot, CTLFLAG_RD, + &sp_p_fail_prot, 0, + "SP page promotion failure: page protections don't match"); + +static u_long sp_p_fail_wimg; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_wimg, CTLFLAG_RD, + &sp_p_fail_wimg, 0, "SP page promotion failure: WIMG bits don't match"); + +static u_long sp_promotions; +SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, promotions, CTLFLAG_RD, + &sp_promotions, 0, "SP page promotions"); + +static bool moea64_ps_enabled(pmap_t); +static void moea64_align_superpage(vm_object_t, vm_ooffset_t, + vm_offset_t *, vm_size_t); + +static int moea64_sp_enter(pmap_t pmap, vm_offset_t va, + vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind); +static struct pvo_entry *moea64_sp_remove(struct pvo_entry *sp, + struct pvo_dlist *tofree); + +#if VM_NRESERVLEVEL > 0 +static void moea64_sp_promote(pmap_t pmap, vm_offset_t va, vm_page_t m); +#endif +static void moea64_sp_demote_aligned(struct pvo_entry *sp); +static void moea64_sp_demote(struct pvo_entry *pvo); + +static struct pvo_entry *moea64_sp_unwire(struct pvo_entry *sp); +static struct pvo_entry *moea64_sp_protect(struct pvo_entry *sp, + vm_prot_t prot); + +static int64_t moea64_sp_query(struct pvo_entry *pvo, uint64_t ptebit); +static int64_t moea64_sp_clear(struct pvo_entry *pvo, vm_page_t m, + uint64_t ptebit); + +static __inline bool moea64_sp_pvo_in_range(struct pvo_entry *pvo, + vm_offset_t sva, vm_offset_t eva); + +/* + * Kernel MMU interface + */ +void moea64_clear_modify(vm_page_t); +void moea64_copy_page(vm_page_t, vm_page_t); +void moea64_copy_page_dmap(vm_page_t, vm_page_t); +void moea64_copy_pages(vm_page_t *ma, vm_offset_t a_offset, + vm_page_t *mb, vm_offset_t b_offset, int xfersize); +void moea64_copy_pages_dmap(vm_page_t *ma, vm_offset_t a_offset, + vm_page_t *mb, vm_offset_t b_offset, int xfersize); +int moea64_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, + u_int flags, int8_t psind); +void moea64_enter_object(pmap_t, vm_offset_t, vm_offset_t, vm_page_t, + vm_prot_t); +void moea64_enter_quick(pmap_t, vm_offset_t, vm_page_t, vm_prot_t); +vm_paddr_t moea64_extract(pmap_t, vm_offset_t); +vm_page_t moea64_extract_and_hold(pmap_t, vm_offset_t, vm_prot_t); +void moea64_init(void); +bool moea64_is_modified(vm_page_t); +bool moea64_is_prefaultable(pmap_t, vm_offset_t); +bool moea64_is_referenced(vm_page_t); +int moea64_ts_referenced(vm_page_t); +vm_offset_t moea64_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int); +bool moea64_page_exists_quick(pmap_t, vm_page_t); +void moea64_page_init(vm_page_t); +int moea64_page_wired_mappings(vm_page_t); +int 
moea64_pinit(pmap_t); +void moea64_pinit0(pmap_t); +void moea64_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); +void moea64_qenter(vm_offset_t, vm_page_t *, int); +void moea64_qremove(vm_offset_t, int); +void moea64_release(pmap_t); +void moea64_remove(pmap_t, vm_offset_t, vm_offset_t); +void moea64_remove_pages(pmap_t); +void moea64_remove_all(vm_page_t); +void moea64_remove_write(vm_page_t); +void moea64_unwire(pmap_t, vm_offset_t, vm_offset_t); +void moea64_zero_page(vm_page_t); +void moea64_zero_page_dmap(vm_page_t); +void moea64_zero_page_area(vm_page_t, int, int); +void moea64_activate(struct thread *); +void moea64_deactivate(struct thread *); +void *moea64_mapdev(vm_paddr_t, vm_size_t); +void *moea64_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t); +void moea64_unmapdev(void *, vm_size_t); +vm_paddr_t moea64_kextract(vm_offset_t); +void moea64_page_set_memattr(vm_page_t m, vm_memattr_t ma); +void moea64_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t ma); +void moea64_kenter(vm_offset_t, vm_paddr_t); +int moea64_dev_direct_mapped(vm_paddr_t, vm_size_t); +static void moea64_sync_icache(pmap_t, vm_offset_t, vm_size_t); +void moea64_dumpsys_map(vm_paddr_t pa, size_t sz, + void **va); +void moea64_scan_init(void); +vm_offset_t moea64_quick_enter_page(vm_page_t m); +vm_offset_t moea64_quick_enter_page_dmap(vm_page_t m); +void moea64_quick_remove_page(vm_offset_t addr); +bool moea64_page_is_mapped(vm_page_t m); +static int moea64_map_user_ptr(pmap_t pm, + volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen); +static int moea64_decode_kernel_ptr(vm_offset_t addr, + int *is_user, vm_offset_t *decoded_addr); +static size_t moea64_scan_pmap(struct bitset *dump_bitset); +static void *moea64_dump_pmap_init(unsigned blkpgs); +#ifdef __powerpc64__ +static void moea64_page_array_startup(long); +#endif +static int moea64_mincore(pmap_t, vm_offset_t, vm_paddr_t *); + +static struct pmap_funcs moea64_methods = { + .clear_modify = moea64_clear_modify, + .copy_page = moea64_copy_page, + .copy_pages = moea64_copy_pages, + .enter = moea64_enter, + .enter_object = moea64_enter_object, + .enter_quick = moea64_enter_quick, + .extract = moea64_extract, + .extract_and_hold = moea64_extract_and_hold, + .init = moea64_init, + .is_modified = moea64_is_modified, + .is_prefaultable = moea64_is_prefaultable, + .is_referenced = moea64_is_referenced, + .ts_referenced = moea64_ts_referenced, + .map = moea64_map, + .mincore = moea64_mincore, + .page_exists_quick = moea64_page_exists_quick, + .page_init = moea64_page_init, + .page_wired_mappings = moea64_page_wired_mappings, + .pinit = moea64_pinit, + .pinit0 = moea64_pinit0, + .protect = moea64_protect, + .qenter = moea64_qenter, + .qremove = moea64_qremove, + .release = moea64_release, + .remove = moea64_remove, + .remove_pages = moea64_remove_pages, + .remove_all = moea64_remove_all, + .remove_write = moea64_remove_write, + .sync_icache = moea64_sync_icache, + .unwire = moea64_unwire, + .zero_page = moea64_zero_page, + .zero_page_area = moea64_zero_page_area, + .activate = moea64_activate, + .deactivate = moea64_deactivate, + .page_set_memattr = moea64_page_set_memattr, + .quick_enter_page = moea64_quick_enter_page, + .quick_remove_page = moea64_quick_remove_page, + .page_is_mapped = moea64_page_is_mapped, +#ifdef __powerpc64__ + .page_array_startup = moea64_page_array_startup, +#endif + .ps_enabled = moea64_ps_enabled, + .align_superpage = moea64_align_superpage, + + /* Internal interfaces */ + .mapdev = moea64_mapdev, + .mapdev_attr = 
moea64_mapdev_attr, + .unmapdev = moea64_unmapdev, + .kextract = moea64_kextract, + .kenter = moea64_kenter, + .kenter_attr = moea64_kenter_attr, + .dev_direct_mapped = moea64_dev_direct_mapped, + .dumpsys_pa_init = moea64_scan_init, + .dumpsys_scan_pmap = moea64_scan_pmap, + .dumpsys_dump_pmap_init = moea64_dump_pmap_init, + .dumpsys_map_chunk = moea64_dumpsys_map, + .map_user_ptr = moea64_map_user_ptr, + .decode_kernel_ptr = moea64_decode_kernel_ptr, +}; + +MMU_DEF(oea64_mmu, "mmu_oea64_base", moea64_methods); + +/* + * Get physical address from PVO. + * + * For superpages, the lower bits are not stored on pvo_pte.pa and must be + * obtained from VA. + */ +static __always_inline vm_paddr_t +moea64_pvo_paddr(struct pvo_entry *pvo) +{ + vm_paddr_t pa; + + pa = (pvo)->pvo_pte.pa & LPTE_RPGN; + + if (PVO_IS_SP(pvo)) { + pa &= ~HPT_SP_MASK; /* This is needed to clear LPTE_LP bits. */ + pa |= PVO_VADDR(pvo) & HPT_SP_MASK; + } + return (pa); +} + +static struct pvo_head * +vm_page_to_pvoh(vm_page_t m) +{ + + mtx_assert(PV_LOCKPTR(VM_PAGE_TO_PHYS(m)), MA_OWNED); + return (&m->md.mdpg_pvoh); +} + +static struct pvo_entry * +alloc_pvo_entry(int bootstrap) +{ + struct pvo_entry *pvo; + + if (!moea64_initialized || bootstrap) { + if (moea64_bpvo_pool_index >= moea64_bpvo_pool_size) { + panic("%s: bpvo pool exhausted, index=%d, size=%d, bytes=%zd." + "Try setting machdep.moea64_bpvo_pool_size tunable", + __func__, moea64_bpvo_pool_index, + moea64_bpvo_pool_size, + moea64_bpvo_pool_size * sizeof(struct pvo_entry)); + } + pvo = &moea64_bpvo_pool[ + atomic_fetchadd_int(&moea64_bpvo_pool_index, 1)]; + bzero(pvo, sizeof(*pvo)); + pvo->pvo_vaddr = PVO_BOOTSTRAP; + } else + pvo = uma_zalloc(moea64_pvo_zone, M_NOWAIT | M_ZERO); + + return (pvo); +} + +static void +init_pvo_entry(struct pvo_entry *pvo, pmap_t pmap, vm_offset_t va) +{ + uint64_t vsid; + uint64_t hash; + int shift; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + pvo->pvo_pmap = pmap; + va &= ~ADDR_POFF; + pvo->pvo_vaddr |= va; + vsid = va_to_vsid(pmap, va); + pvo->pvo_vpn = (uint64_t)((va & ADDR_PIDX) >> ADDR_PIDX_SHFT) + | (vsid << 16); + + if (pmap == kernel_pmap && (pvo->pvo_vaddr & PVO_LARGE) != 0) + shift = moea64_large_page_shift; + else + shift = ADDR_PIDX_SHFT; + hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)va & ADDR_PIDX) >> shift); + pvo->pvo_pte.slot = (hash & moea64_pteg_mask) << 3; +} + +static void +free_pvo_entry(struct pvo_entry *pvo) +{ + + if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP)) + uma_zfree(moea64_pvo_zone, pvo); +} + +void +moea64_pte_from_pvo(const struct pvo_entry *pvo, struct lpte *lpte) +{ + + lpte->pte_hi = moea64_pte_vpn_from_pvo_vpn(pvo); + lpte->pte_hi |= LPTE_VALID; + + if (pvo->pvo_vaddr & PVO_LARGE) + lpte->pte_hi |= LPTE_BIG; + if (pvo->pvo_vaddr & PVO_WIRED) + lpte->pte_hi |= LPTE_WIRED; + if (pvo->pvo_vaddr & PVO_HID) + lpte->pte_hi |= LPTE_HID; + + lpte->pte_lo = pvo->pvo_pte.pa; /* Includes WIMG bits */ + if (pvo->pvo_pte.prot & VM_PROT_WRITE) + lpte->pte_lo |= LPTE_BW; + else + lpte->pte_lo |= LPTE_BR; + + if (!(pvo->pvo_pte.prot & VM_PROT_EXECUTE)) + lpte->pte_lo |= LPTE_NOEXEC; +} + +static __inline uint64_t +moea64_calc_wimg(vm_paddr_t pa, vm_memattr_t ma) +{ + uint64_t pte_lo; + int i; + + if (ma != VM_MEMATTR_DEFAULT) { + switch (ma) { + case VM_MEMATTR_UNCACHEABLE: + return (LPTE_I | LPTE_G); + case VM_MEMATTR_CACHEABLE: + return (LPTE_M); + case VM_MEMATTR_WRITE_COMBINING: + case VM_MEMATTR_WRITE_BACK: + case VM_MEMATTR_PREFETCHABLE: + return (LPTE_I); + case VM_MEMATTR_WRITE_THROUGH: + return (LPTE_W | 
LPTE_M); + } + } + + /* + * Assume the page is cache inhibited and access is guarded unless + * it's in our available memory array. + */ + pte_lo = LPTE_I | LPTE_G; + for (i = 0; i < pregions_sz; i++) { + if ((pa >= pregions[i].mr_start) && + (pa < (pregions[i].mr_start + pregions[i].mr_size))) { + pte_lo &= ~(LPTE_I | LPTE_G); + pte_lo |= LPTE_M; + break; + } + } + + return pte_lo; +} + +/* + * Quick sort callout for comparing memory regions. + */ +static int om_cmp(const void *a, const void *b); + +static int +om_cmp(const void *a, const void *b) +{ + const struct ofw_map *mapa; + const struct ofw_map *mapb; + + mapa = a; + mapb = b; + if (mapa->om_pa < mapb->om_pa) + return (-1); + else if (mapa->om_pa > mapb->om_pa) + return (1); + else + return (0); +} + +static void +moea64_add_ofw_mappings(phandle_t mmu, size_t sz) +{ + struct ofw_map translations[sz/(4*sizeof(cell_t))]; /*>= 4 cells per */ + pcell_t acells, trans_cells[sz/sizeof(cell_t)]; + struct pvo_entry *pvo; + register_t msr; + vm_offset_t off; + vm_paddr_t pa_base; + int i, j; + + bzero(translations, sz); + OF_getencprop(OF_finddevice("/"), "#address-cells", &acells, + sizeof(acells)); + if (OF_getencprop(mmu, "translations", trans_cells, sz) == -1) + panic("moea64_bootstrap: can't get ofw translations"); + + CTR0(KTR_PMAP, "moea64_add_ofw_mappings: translations"); + sz /= sizeof(cell_t); + for (i = 0, j = 0; i < sz; j++) { + translations[j].om_va = trans_cells[i++]; + translations[j].om_len = trans_cells[i++]; + translations[j].om_pa = trans_cells[i++]; + if (acells == 2) { + translations[j].om_pa <<= 32; + translations[j].om_pa |= trans_cells[i++]; + } + translations[j].om_mode = trans_cells[i++]; + } + KASSERT(i == sz, ("Translations map has incorrect cell count (%d/%zd)", + i, sz)); + + sz = j; + qsort(translations, sz, sizeof (*translations), om_cmp); + + for (i = 0; i < sz; i++) { + pa_base = translations[i].om_pa; + #ifndef __powerpc64__ + if ((translations[i].om_pa >> 32) != 0) + panic("OFW translations above 32-bit boundary!"); + #endif + + if (pa_base % PAGE_SIZE) + panic("OFW translation not page-aligned (phys)!"); + if (translations[i].om_va % PAGE_SIZE) + panic("OFW translation not page-aligned (virt)!"); + + CTR3(KTR_PMAP, "translation: pa=%#zx va=%#x len=%#x", + pa_base, translations[i].om_va, translations[i].om_len); + + /* Now enter the pages for this mapping */ + + DISABLE_TRANS(msr); + for (off = 0; off < translations[i].om_len; off += PAGE_SIZE) { + /* If this address is direct-mapped, skip remapping */ + if (hw_direct_map && + translations[i].om_va == PHYS_TO_DMAP(pa_base) && + moea64_calc_wimg(pa_base + off, VM_MEMATTR_DEFAULT) + == LPTE_M) + continue; + + PMAP_LOCK(kernel_pmap); + pvo = moea64_pvo_find_va(kernel_pmap, + translations[i].om_va + off); + PMAP_UNLOCK(kernel_pmap); + if (pvo != NULL) + continue; + + moea64_kenter(translations[i].om_va + off, + pa_base + off); + } + ENABLE_TRANS(msr); + } +} + +#ifdef __powerpc64__ +static void +moea64_probe_large_page(void) +{ + uint16_t pvr = mfpvr() >> 16; + + switch (pvr) { + case IBM970: + case IBM970FX: + case IBM970MP: + powerpc_sync(); isync(); + mtspr(SPR_HID4, mfspr(SPR_HID4) & ~HID4_970_DISABLE_LG_PG); + powerpc_sync(); isync(); + + /* FALLTHROUGH */ + default: + if (moea64_large_page_size == 0) { + moea64_large_page_size = 0x1000000; /* 16 MB */ + moea64_large_page_shift = 24; + } + } + + moea64_large_page_mask = moea64_large_page_size - 1; +} + +static void +moea64_bootstrap_slb_prefault(vm_offset_t va, int large) +{ + struct slb *cache; + struct 
slb entry; + uint64_t esid, slbe; + uint64_t i; + + cache = PCPU_GET(aim.slb); + esid = va >> ADDR_SR_SHFT; + slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID; + + for (i = 0; i < 64; i++) { + if (cache[i].slbe == (slbe | i)) + return; + } + + entry.slbe = slbe; + entry.slbv = KERNEL_VSID(esid) << SLBV_VSID_SHIFT; + if (large) + entry.slbv |= SLBV_L; + + slb_insert_kernel(entry.slbe, entry.slbv); +} +#endif + +static int +moea64_kenter_large(vm_offset_t va, vm_paddr_t pa, uint64_t attr, int bootstrap) +{ + struct pvo_entry *pvo; + uint64_t pte_lo; + int error; + + pte_lo = LPTE_M; + pte_lo |= attr; + + pvo = alloc_pvo_entry(bootstrap); + pvo->pvo_vaddr |= PVO_WIRED | PVO_LARGE; + init_pvo_entry(pvo, kernel_pmap, va); + + pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE | + VM_PROT_EXECUTE; + pvo->pvo_pte.pa = pa | pte_lo; + error = moea64_pvo_enter(pvo, NULL, NULL); + if (error != 0) + panic("Error %d inserting large page\n", error); + return (0); +} + +static void +moea64_setup_direct_map(vm_offset_t kernelstart, + vm_offset_t kernelend) +{ + register_t msr; + vm_paddr_t pa, pkernelstart, pkernelend; + vm_offset_t size, off; + uint64_t pte_lo; + int i; + + if (moea64_large_page_size == 0) + hw_direct_map = 0; + + DISABLE_TRANS(msr); + if (hw_direct_map) { + PMAP_LOCK(kernel_pmap); + for (i = 0; i < pregions_sz; i++) { + for (pa = pregions[i].mr_start; pa < pregions[i].mr_start + + pregions[i].mr_size; pa += moea64_large_page_size) { + pte_lo = LPTE_M; + if (pa & moea64_large_page_mask) { + pa &= moea64_large_page_mask; + pte_lo |= LPTE_G; + } + if (pa + moea64_large_page_size > + pregions[i].mr_start + pregions[i].mr_size) + pte_lo |= LPTE_G; + + moea64_kenter_large(PHYS_TO_DMAP(pa), pa, pte_lo, 1); + } + } + PMAP_UNLOCK(kernel_pmap); + } + + /* + * Make sure the kernel and BPVO pool stay mapped on systems either + * without a direct map or on which the kernel is not already executing + * out of the direct-mapped region. + */ + if (kernelstart < DMAP_BASE_ADDRESS) { + /* + * For pre-dmap execution, we need to use identity mapping + * because we will be operating with the mmu on but in the + * wrong address configuration until we __restartkernel(). + */ + for (pa = kernelstart & ~PAGE_MASK; pa < kernelend; + pa += PAGE_SIZE) + moea64_kenter(pa, pa); + } else if (!hw_direct_map) { + pkernelstart = kernelstart & ~DMAP_BASE_ADDRESS; + pkernelend = kernelend & ~DMAP_BASE_ADDRESS; + for (pa = pkernelstart & ~PAGE_MASK; pa < pkernelend; + pa += PAGE_SIZE) + moea64_kenter(pa | DMAP_BASE_ADDRESS, pa); + } + + if (!hw_direct_map) { + size = moea64_bpvo_pool_size*sizeof(struct pvo_entry); + off = (vm_offset_t)(moea64_bpvo_pool); + for (pa = off; pa < off + size; pa += PAGE_SIZE) + moea64_kenter(pa, pa); + + /* Map exception vectors */ + for (pa = EXC_RSVD; pa < EXC_LAST; pa += PAGE_SIZE) + moea64_kenter(pa | DMAP_BASE_ADDRESS, pa); + } + ENABLE_TRANS(msr); + + /* + * Allow user to override unmapped_buf_allowed for testing. + * XXXKIB Only direct map implementation was tested. + */ + if (!TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed", + &unmapped_buf_allowed)) + unmapped_buf_allowed = hw_direct_map; +} + +/* Quick sort callout for comparing physical addresses. 
*/ +static int +pa_cmp(const void *a, const void *b) +{ + const vm_paddr_t *pa = a, *pb = b; + + if (*pa < *pb) + return (-1); + else if (*pa > *pb) + return (1); + else + return (0); +} + +void +moea64_early_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend) +{ + int i, j; + vm_size_t physsz, hwphyssz; + vm_paddr_t kernelphysstart, kernelphysend; + int rm_pavail; + + /* Level 0 reservations consist of 4096 pages (16MB superpage). */ + vm_level_0_order = 12; + +#ifndef __powerpc64__ + /* We don't have a direct map since there is no BAT */ + hw_direct_map = 0; + + /* Make sure battable is zero, since we have no BAT */ + for (i = 0; i < 16; i++) { + battable[i].batu = 0; + battable[i].batl = 0; + } +#else + /* Install trap handlers for SLBs */ + bcopy(&slbtrap, (void *)EXC_DSE,(size_t)&slbtrapend - (size_t)&slbtrap); + bcopy(&slbtrap, (void *)EXC_ISE,(size_t)&slbtrapend - (size_t)&slbtrap); + __syncicache((void *)EXC_DSE, 0x80); + __syncicache((void *)EXC_ISE, 0x80); +#endif + + kernelphysstart = kernelstart & ~DMAP_BASE_ADDRESS; + kernelphysend = kernelend & ~DMAP_BASE_ADDRESS; + + /* Get physical memory regions from firmware */ + mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); + CTR0(KTR_PMAP, "moea64_bootstrap: physical memory"); + + if (PHYS_AVAIL_ENTRIES < regions_sz) + panic("moea64_bootstrap: phys_avail too small"); + + phys_avail_count = 0; + physsz = 0; + hwphyssz = 0; + TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz); + for (i = 0, j = 0; i < regions_sz; i++, j += 2) { + CTR3(KTR_PMAP, "region: %#zx - %#zx (%#zx)", + regions[i].mr_start, regions[i].mr_start + + regions[i].mr_size, regions[i].mr_size); + if (hwphyssz != 0 && + (physsz + regions[i].mr_size) >= hwphyssz) { + if (physsz < hwphyssz) { + phys_avail[j] = regions[i].mr_start; + phys_avail[j + 1] = regions[i].mr_start + + hwphyssz - physsz; + physsz = hwphyssz; + phys_avail_count++; + dump_avail[j] = phys_avail[j]; + dump_avail[j + 1] = phys_avail[j + 1]; + } + break; + } + phys_avail[j] = regions[i].mr_start; + phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size; + phys_avail_count++; + physsz += regions[i].mr_size; + dump_avail[j] = phys_avail[j]; + dump_avail[j + 1] = phys_avail[j + 1]; + } + + /* Check for overlap with the kernel and exception vectors */ + rm_pavail = 0; + for (j = 0; j < 2*phys_avail_count; j+=2) { + if (phys_avail[j] < EXC_LAST) + phys_avail[j] += EXC_LAST; + + if (phys_avail[j] >= kernelphysstart && + phys_avail[j+1] <= kernelphysend) { + phys_avail[j] = phys_avail[j+1] = ~0; + rm_pavail++; + continue; + } + + if (kernelphysstart >= phys_avail[j] && + kernelphysstart < phys_avail[j+1]) { + if (kernelphysend < phys_avail[j+1]) { + phys_avail[2*phys_avail_count] = + (kernelphysend & ~PAGE_MASK) + PAGE_SIZE; + phys_avail[2*phys_avail_count + 1] = + phys_avail[j+1]; + phys_avail_count++; + } + + phys_avail[j+1] = kernelphysstart & ~PAGE_MASK; + } + + if (kernelphysend >= phys_avail[j] && + kernelphysend < phys_avail[j+1]) { + if (kernelphysstart > phys_avail[j]) { + phys_avail[2*phys_avail_count] = phys_avail[j]; + phys_avail[2*phys_avail_count + 1] = + kernelphysstart & ~PAGE_MASK; + phys_avail_count++; + } + + phys_avail[j] = (kernelphysend & ~PAGE_MASK) + + PAGE_SIZE; + } + } + + /* Remove physical available regions marked for removal (~0) */ + if (rm_pavail) { + qsort(phys_avail, 2*phys_avail_count, sizeof(phys_avail[0]), + pa_cmp); + phys_avail_count -= rm_pavail; + for (i = 2*phys_avail_count; + i < 2*(phys_avail_count + rm_pavail); i+=2) + phys_avail[i] = 
phys_avail[i+1] = 0; + } + + physmem = btoc(physsz); + +#ifdef PTEGCOUNT + moea64_pteg_count = PTEGCOUNT; +#else + moea64_pteg_count = 0x1000; + + while (moea64_pteg_count < physmem) + moea64_pteg_count <<= 1; + + moea64_pteg_count >>= 1; +#endif /* PTEGCOUNT */ +} + +void +moea64_mid_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend) +{ + int i; + + /* + * Set PTEG mask + */ + moea64_pteg_mask = moea64_pteg_count - 1; + + /* + * Initialize SLB table lock and page locks + */ + mtx_init(&moea64_slb_mutex, "SLB table", NULL, MTX_DEF); + for (i = 0; i < PV_LOCK_COUNT; i++) + mtx_init(&pv_lock[i], "page pv", NULL, MTX_DEF); + + /* + * Initialise the bootstrap pvo pool. + */ + TUNABLE_INT_FETCH("machdep.moea64_bpvo_pool_size", &moea64_bpvo_pool_size); + if (moea64_bpvo_pool_size == 0) { + if (!hw_direct_map) + moea64_bpvo_pool_size = ((ptoa((uintmax_t)physmem) * sizeof(struct vm_page)) / + (PAGE_SIZE * PAGE_SIZE)) * BPVO_POOL_EXPANSION_FACTOR; + else + moea64_bpvo_pool_size = BPVO_POOL_SIZE; + } + + if (boothowto & RB_VERBOSE) { + printf("mmu_oea64: bpvo pool entries = %d, bpvo pool size = %zu MB\n", + moea64_bpvo_pool_size, + moea64_bpvo_pool_size*sizeof(struct pvo_entry) / 1048576); + } + + moea64_bpvo_pool = (struct pvo_entry *)moea64_bootstrap_alloc( + moea64_bpvo_pool_size*sizeof(struct pvo_entry), PAGE_SIZE); + moea64_bpvo_pool_index = 0; + + /* Place at address usable through the direct map */ + if (hw_direct_map) + moea64_bpvo_pool = (struct pvo_entry *) + PHYS_TO_DMAP((uintptr_t)moea64_bpvo_pool); + + /* + * Make sure kernel vsid is allocated as well as VSID 0. + */ + #ifndef __powerpc64__ + moea64_vsid_bitmap[(KERNEL_VSIDBITS & (NVSIDS - 1)) / VSID_NBPW] + |= 1 << (KERNEL_VSIDBITS % VSID_NBPW); + moea64_vsid_bitmap[0] |= 1; + #endif + + /* + * Initialize the kernel pmap (which is statically allocated). + */ + #ifdef __powerpc64__ + for (i = 0; i < 64; i++) { + pcpup->pc_aim.slb[i].slbv = 0; + pcpup->pc_aim.slb[i].slbe = 0; + } + #else + for (i = 0; i < 16; i++) + kernel_pmap->pm_sr[i] = EMPTY_SEGMENT + i; + #endif + + kernel_pmap->pmap_phys = kernel_pmap; + CPU_FILL(&kernel_pmap->pm_active); + RB_INIT(&kernel_pmap->pmap_pvo); + + PMAP_LOCK_INIT(kernel_pmap); + + /* + * Now map in all the other buffers we allocated earlier + */ + + moea64_setup_direct_map(kernelstart, kernelend); +} + +void +moea64_late_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend) +{ + ihandle_t mmui; + phandle_t chosen; + phandle_t mmu; + ssize_t sz; + int i; + vm_offset_t pa, va; + void *dpcpu; + + /* + * Set up the Open Firmware pmap and add its mappings if not in real + * mode. + */ + + chosen = OF_finddevice("/chosen"); + if (chosen != -1 && OF_getencprop(chosen, "mmu", &mmui, 4) != -1) { + mmu = OF_instance_to_package(mmui); + if (mmu == -1 || + (sz = OF_getproplen(mmu, "translations")) == -1) + sz = 0; + if (sz > 6144 /* tmpstksz - 2 KB headroom */) + panic("moea64_bootstrap: too many ofw translations"); + + if (sz > 0) + moea64_add_ofw_mappings(mmu, sz); + } + + /* + * Calculate the last available physical address. + */ + Maxmem = 0; + for (i = 0; phys_avail[i + 1] != 0; i += 2) + Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1])); + + /* + * Initialize MMU. + */ + pmap_cpu_bootstrap(0); + mtmsr(mfmsr() | PSL_DR | PSL_IR); + pmap_bootstrapped++; + + /* + * Set the start and end of kva. + */ + virtual_avail = VM_MIN_KERNEL_ADDRESS; + virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS; + + /* + * Map the entire KVA range into the SLB. We must not fault there. 
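+ * Each loop iteration below prefaults one segment (SEGMENT_LENGTH,
+ * normally 256 MB on AIM) by installing a kernel SLB entry through
+ * moea64_bootstrap_slb_prefault(), so later accesses to kernel
+ * virtual addresses cannot take an SLB miss.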
+ */ + #ifdef __powerpc64__ + for (va = virtual_avail; va < virtual_end; va += SEGMENT_LENGTH) + moea64_bootstrap_slb_prefault(va, 0); + #endif + + /* + * Remap any early IO mappings (console framebuffer, etc.) + */ + bs_remap_earlyboot(); + + /* + * Figure out how far we can extend virtual_end into segment 16 + * without running into existing mappings. Segment 16 is guaranteed + * to contain neither RAM nor devices (at least on Apple hardware), + * but will generally contain some OFW mappings we should not + * step on. + */ + + #ifndef __powerpc64__ /* KVA is in high memory on PPC64 */ + PMAP_LOCK(kernel_pmap); + while (virtual_end < VM_MAX_KERNEL_ADDRESS && + moea64_pvo_find_va(kernel_pmap, virtual_end+1) == NULL) + virtual_end += PAGE_SIZE; + PMAP_UNLOCK(kernel_pmap); + #endif + + /* + * Allocate a kernel stack with a guard page for thread0 and map it + * into the kernel page map. + */ + pa = moea64_bootstrap_alloc(kstack_pages * PAGE_SIZE, PAGE_SIZE); + va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE; + virtual_avail = va + kstack_pages * PAGE_SIZE; + CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va); + thread0.td_kstack = va; + thread0.td_kstack_pages = kstack_pages; + for (i = 0; i < kstack_pages; i++) { + moea64_kenter(va, pa); + pa += PAGE_SIZE; + va += PAGE_SIZE; + } + + /* + * Allocate virtual address space for the message buffer. + */ + pa = msgbuf_phys = moea64_bootstrap_alloc(msgbufsize, PAGE_SIZE); + msgbufp = (struct msgbuf *)virtual_avail; + va = virtual_avail; + virtual_avail += round_page(msgbufsize); + while (va < virtual_avail) { + moea64_kenter(va, pa); + pa += PAGE_SIZE; + va += PAGE_SIZE; + } + + /* + * Allocate virtual address space for the dynamic percpu area. + */ + pa = moea64_bootstrap_alloc(DPCPU_SIZE, PAGE_SIZE); + dpcpu = (void *)virtual_avail; + va = virtual_avail; + virtual_avail += DPCPU_SIZE; + while (va < virtual_avail) { + moea64_kenter(va, pa); + pa += PAGE_SIZE; + va += PAGE_SIZE; + } + dpcpu_init(dpcpu, curcpu); + + crashdumpmap = (caddr_t)virtual_avail; + virtual_avail += MAXDUMPPGS * PAGE_SIZE; + + /* + * Allocate some things for page zeroing. We put this directly + * in the page table and use MOEA64_PTE_REPLACE to avoid any + * of the PVO book-keeping or other parts of the VM system + * from even knowing that this hack exists. + */ + + if (!hw_direct_map) { + mtx_init(&moea64_scratchpage_mtx, "pvo zero page", NULL, + MTX_DEF); + for (i = 0; i < 2; i++) { + moea64_scratchpage_va[i] = (virtual_end+1) - PAGE_SIZE; + virtual_end -= PAGE_SIZE; + + moea64_kenter(moea64_scratchpage_va[i], 0); + + PMAP_LOCK(kernel_pmap); + moea64_scratchpage_pvo[i] = moea64_pvo_find_va( + kernel_pmap, (vm_offset_t)moea64_scratchpage_va[i]); + PMAP_UNLOCK(kernel_pmap); + } + } + + numa_mem_regions(&numa_pregions, &numapregions_sz); +} + +static void +moea64_pmap_init_qpages(void *dummy __unused) +{ + struct pcpu *pc; + int i; + + if (hw_direct_map) + return; + + CPU_FOREACH(i) { + pc = pcpu_find(i); + pc->pc_qmap_addr = kva_alloc(PAGE_SIZE); + if (pc->pc_qmap_addr == 0) + panic("pmap_init_qpages: unable to allocate KVA"); + PMAP_LOCK(kernel_pmap); + pc->pc_aim.qmap_pvo = + moea64_pvo_find_va(kernel_pmap, pc->pc_qmap_addr); + PMAP_UNLOCK(kernel_pmap); + mtx_init(&pc->pc_aim.qmap_lock, "qmap lock", NULL, MTX_DEF); + } +} + +SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, moea64_pmap_init_qpages, NULL); + +/* + * Activate a user pmap. This mostly involves setting some non-CPU + * state. 
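+ *
+ * On powerpc64 this publishes the pmap's SLB cache through the
+ * per-CPU aim.userslb pointer and reloads the USER_SLB_SLBE slot
+ * with the thread's cached usr_vsid; on 32-bit AIM it sets curpmap
+ * and rewrites the user segment register with mtsrin().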
+ */ +void +moea64_activate(struct thread *td) +{ + pmap_t pm; + + pm = &td->td_proc->p_vmspace->vm_pmap; + CPU_SET(PCPU_GET(cpuid), &pm->pm_active); + + #ifdef __powerpc64__ + PCPU_SET(aim.userslb, pm->pm_slb); + __asm __volatile("slbmte %0, %1; isync" :: + "r"(td->td_pcb->pcb_cpu.aim.usr_vsid), "r"(USER_SLB_SLBE)); + #else + PCPU_SET(curpmap, pm->pmap_phys); + mtsrin(USER_SR << ADDR_SR_SHFT, td->td_pcb->pcb_cpu.aim.usr_vsid); + #endif +} + +void +moea64_deactivate(struct thread *td) +{ + pmap_t pm; + + __asm __volatile("isync; slbie %0" :: "r"(USER_ADDR)); + + pm = &td->td_proc->p_vmspace->vm_pmap; + CPU_CLR(PCPU_GET(cpuid), &pm->pm_active); + #ifdef __powerpc64__ + PCPU_SET(aim.userslb, NULL); + #else + PCPU_SET(curpmap, NULL); + #endif +} + +void +moea64_unwire(pmap_t pm, vm_offset_t sva, vm_offset_t eva) +{ + struct pvo_entry key, *pvo; + vm_page_t m; + int64_t refchg; + + key.pvo_vaddr = sva; + PMAP_LOCK(pm); + for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); + pvo != NULL && PVO_VADDR(pvo) < eva; + pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { + if (PVO_IS_SP(pvo)) { + if (moea64_sp_pvo_in_range(pvo, sva, eva)) { + pvo = moea64_sp_unwire(pvo); + continue; + } else { + CTR1(KTR_PMAP, "%s: demote before unwire", + __func__); + moea64_sp_demote(pvo); + } + } + + if ((pvo->pvo_vaddr & PVO_WIRED) == 0) + panic("moea64_unwire: pvo %p is missing PVO_WIRED", + pvo); + pvo->pvo_vaddr &= ~PVO_WIRED; + refchg = moea64_pte_replace(pvo, 0 /* No invalidation */); + if ((pvo->pvo_vaddr & PVO_MANAGED) && + (pvo->pvo_pte.prot & VM_PROT_WRITE)) { + if (refchg < 0) + refchg = LPTE_CHG; + m = PHYS_TO_VM_PAGE(PVO_PADDR(pvo)); + + refchg |= atomic_readandclear_32(&m->md.mdpg_attrs); + if (refchg & LPTE_CHG) + vm_page_dirty(m); + if (refchg & LPTE_REF) + vm_page_aflag_set(m, PGA_REFERENCED); + } + pm->pm_stats.wired_count--; + } + PMAP_UNLOCK(pm); +} + +static int +moea64_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) +{ + struct pvo_entry *pvo; + vm_paddr_t pa; + vm_page_t m; + int val; + bool managed; + + PMAP_LOCK(pmap); + + pvo = moea64_pvo_find_va(pmap, addr); + if (pvo != NULL) { + pa = PVO_PADDR(pvo); + m = PHYS_TO_VM_PAGE(pa); + managed = (pvo->pvo_vaddr & PVO_MANAGED) == PVO_MANAGED; + if (PVO_IS_SP(pvo)) + val = MINCORE_INCORE | MINCORE_PSIND(1); + else + val = MINCORE_INCORE; + } else { + PMAP_UNLOCK(pmap); + return (0); + } + + PMAP_UNLOCK(pmap); + + if (m == NULL) + return (0); + + if (managed) { + if (moea64_is_modified(m)) + val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; + + if (moea64_is_referenced(m)) + val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; + } + + if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != + (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && + managed) { + *pap = pa; + } + + return (val); +} + +/* + * This goes through and sets the physical address of our + * special scratch PTE to the PA we want to zero or copy. 
Because + * of locking issues (this can get called in pvo_enter() by + * the UMA allocator), we can't use most other utility functions here + */ + +static __inline +void moea64_set_scratchpage_pa(int which, vm_paddr_t pa) +{ + struct pvo_entry *pvo; + + KASSERT(!hw_direct_map, ("Using OEA64 scratchpage with a direct map!")); + mtx_assert(&moea64_scratchpage_mtx, MA_OWNED); + + pvo = moea64_scratchpage_pvo[which]; + PMAP_LOCK(pvo->pvo_pmap); + pvo->pvo_pte.pa = + moea64_calc_wimg(pa, VM_MEMATTR_DEFAULT) | (uint64_t)pa; + moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE); + PMAP_UNLOCK(pvo->pvo_pmap); + isync(); +} + +void +moea64_copy_page(vm_page_t msrc, vm_page_t mdst) +{ + mtx_lock(&moea64_scratchpage_mtx); + + moea64_set_scratchpage_pa(0, VM_PAGE_TO_PHYS(msrc)); + moea64_set_scratchpage_pa(1, VM_PAGE_TO_PHYS(mdst)); + + bcopy((void *)moea64_scratchpage_va[0], + (void *)moea64_scratchpage_va[1], PAGE_SIZE); + + mtx_unlock(&moea64_scratchpage_mtx); +} + +void +moea64_copy_page_dmap(vm_page_t msrc, vm_page_t mdst) +{ + vm_offset_t dst; + vm_offset_t src; + + dst = VM_PAGE_TO_PHYS(mdst); + src = VM_PAGE_TO_PHYS(msrc); + + bcopy((void *)PHYS_TO_DMAP(src), (void *)PHYS_TO_DMAP(dst), + PAGE_SIZE); +} + +inline void +moea64_copy_pages_dmap(vm_page_t *ma, vm_offset_t a_offset, + vm_page_t *mb, vm_offset_t b_offset, int xfersize) +{ + void *a_cp, *b_cp; + vm_offset_t a_pg_offset, b_pg_offset; + int cnt; + + while (xfersize > 0) { + a_pg_offset = a_offset & PAGE_MASK; + cnt = min(xfersize, PAGE_SIZE - a_pg_offset); + a_cp = (char *)(uintptr_t)PHYS_TO_DMAP( + VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])) + + a_pg_offset; + b_pg_offset = b_offset & PAGE_MASK; + cnt = min(cnt, PAGE_SIZE - b_pg_offset); + b_cp = (char *)(uintptr_t)PHYS_TO_DMAP( + VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])) + + b_pg_offset; + bcopy(a_cp, b_cp, cnt); + a_offset += cnt; + b_offset += cnt; + xfersize -= cnt; + } +} + +void +moea64_copy_pages(vm_page_t *ma, vm_offset_t a_offset, + vm_page_t *mb, vm_offset_t b_offset, int xfersize) +{ + void *a_cp, *b_cp; + vm_offset_t a_pg_offset, b_pg_offset; + int cnt; + + mtx_lock(&moea64_scratchpage_mtx); + while (xfersize > 0) { + a_pg_offset = a_offset & PAGE_MASK; + cnt = min(xfersize, PAGE_SIZE - a_pg_offset); + moea64_set_scratchpage_pa(0, + VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])); + a_cp = (char *)moea64_scratchpage_va[0] + a_pg_offset; + b_pg_offset = b_offset & PAGE_MASK; + cnt = min(cnt, PAGE_SIZE - b_pg_offset); + moea64_set_scratchpage_pa(1, + VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])); + b_cp = (char *)moea64_scratchpage_va[1] + b_pg_offset; + bcopy(a_cp, b_cp, cnt); + a_offset += cnt; + b_offset += cnt; + xfersize -= cnt; + } + mtx_unlock(&moea64_scratchpage_mtx); +} + +void +moea64_zero_page_area(vm_page_t m, int off, int size) +{ + vm_paddr_t pa = VM_PAGE_TO_PHYS(m); + + if (size + off > PAGE_SIZE) + panic("moea64_zero_page: size + off > PAGE_SIZE"); + + if (hw_direct_map) { + bzero((caddr_t)(uintptr_t)PHYS_TO_DMAP(pa) + off, size); + } else { + mtx_lock(&moea64_scratchpage_mtx); + moea64_set_scratchpage_pa(0, pa); + bzero((caddr_t)moea64_scratchpage_va[0] + off, size); + mtx_unlock(&moea64_scratchpage_mtx); + } +} + +/* + * Zero a page of physical memory by temporarily mapping it + */ +void +moea64_zero_page(vm_page_t m) +{ + vm_paddr_t pa = VM_PAGE_TO_PHYS(m); + vm_offset_t va, off; + + mtx_lock(&moea64_scratchpage_mtx); + + moea64_set_scratchpage_pa(0, pa); + va = moea64_scratchpage_va[0]; + + for (off = 0; off < PAGE_SIZE; off += cacheline_size) + __asm 
__volatile("dcbz 0,%0" :: "r"(va + off)); + + mtx_unlock(&moea64_scratchpage_mtx); +} + +void +moea64_zero_page_dmap(vm_page_t m) +{ + vm_paddr_t pa = VM_PAGE_TO_PHYS(m); + vm_offset_t va, off; + + va = PHYS_TO_DMAP(pa); + for (off = 0; off < PAGE_SIZE; off += cacheline_size) + __asm __volatile("dcbz 0,%0" :: "r"(va + off)); +} + +vm_offset_t +moea64_quick_enter_page(vm_page_t m) +{ + struct pvo_entry *pvo; + vm_paddr_t pa = VM_PAGE_TO_PHYS(m); + + /* + * MOEA64_PTE_REPLACE does some locking, so we can't just grab + * a critical section and access the PCPU data like on i386. + * Instead, pin the thread and grab the PCPU lock to prevent + * a preempting thread from using the same PCPU data. + */ + sched_pin(); + + mtx_assert(PCPU_PTR(aim.qmap_lock), MA_NOTOWNED); + pvo = PCPU_GET(aim.qmap_pvo); + + mtx_lock(PCPU_PTR(aim.qmap_lock)); + pvo->pvo_pte.pa = moea64_calc_wimg(pa, pmap_page_get_memattr(m)) | + (uint64_t)pa; + moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE); + isync(); + + return (PCPU_GET(qmap_addr)); +} + +vm_offset_t +moea64_quick_enter_page_dmap(vm_page_t m) +{ + + return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); +} + +void +moea64_quick_remove_page(vm_offset_t addr) +{ + + mtx_assert(PCPU_PTR(aim.qmap_lock), MA_OWNED); + KASSERT(PCPU_GET(qmap_addr) == addr, + ("moea64_quick_remove_page: invalid address")); + mtx_unlock(PCPU_PTR(aim.qmap_lock)); + sched_unpin(); +} + +bool +moea64_page_is_mapped(vm_page_t m) +{ + return (!LIST_EMPTY(&(m)->md.mdpg_pvoh)); +} + +/* + * Map the given physical page at the specified virtual address in the + * target pmap with the protection requested. If specified the page + * will be wired down. + */ + +int +moea64_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot, u_int flags, int8_t psind) +{ + struct pvo_entry *pvo, *oldpvo, *tpvo; + struct pvo_head *pvo_head; + uint64_t pte_lo; + int error; + vm_paddr_t pa; + + if ((m->oflags & VPO_UNMANAGED) == 0) { + if ((flags & PMAP_ENTER_QUICK_LOCKED) == 0) + VM_PAGE_OBJECT_BUSY_ASSERT(m); + else + VM_OBJECT_ASSERT_LOCKED(m->object); + } + + if (psind > 0) + return (moea64_sp_enter(pmap, va, m, prot, flags, psind)); + + pvo = alloc_pvo_entry(0); + if (pvo == NULL) + return (KERN_RESOURCE_SHORTAGE); + pvo->pvo_pmap = NULL; /* to be filled in later */ + pvo->pvo_pte.prot = prot; + + pa = VM_PAGE_TO_PHYS(m); + pte_lo = moea64_calc_wimg(pa, pmap_page_get_memattr(m)); + pvo->pvo_pte.pa = pa | pte_lo; + + if ((flags & PMAP_ENTER_WIRED) != 0) + pvo->pvo_vaddr |= PVO_WIRED; + + if ((m->oflags & VPO_UNMANAGED) != 0 || !moea64_initialized) { + pvo_head = NULL; + } else { + pvo_head = &m->md.mdpg_pvoh; + pvo->pvo_vaddr |= PVO_MANAGED; + } + + PV_LOCK(pa); + PMAP_LOCK(pmap); + if (pvo->pvo_pmap == NULL) + init_pvo_entry(pvo, pmap, va); + + if (moea64_ps_enabled(pmap) && + (tpvo = moea64_pvo_find_va(pmap, va & ~HPT_SP_MASK)) != NULL && + PVO_IS_SP(tpvo)) { + /* Demote SP before entering a regular page */ + CTR2(KTR_PMAP, "%s: demote before enter: va=%#jx", + __func__, (uintmax_t)va); + moea64_sp_demote_aligned(tpvo); + } + + if (prot & VM_PROT_WRITE) + if (pmap_bootstrapped && + (m->oflags & VPO_UNMANAGED) == 0) + vm_page_aflag_set(m, PGA_WRITEABLE); + + error = moea64_pvo_enter(pvo, pvo_head, &oldpvo); + if (error == EEXIST) { + if (oldpvo->pvo_vaddr == pvo->pvo_vaddr && + oldpvo->pvo_pte.pa == pvo->pvo_pte.pa && + oldpvo->pvo_pte.prot == prot) { + /* Identical mapping already exists */ + error = 0; + + /* If not in page table, reinsert it */ + if (moea64_pte_synch(oldpvo) < 0) { + 
STAT_MOEA64(moea64_pte_overflow--); + moea64_pte_insert(oldpvo); + } + + /* Then just clean up and go home */ + PMAP_UNLOCK(pmap); + PV_UNLOCK(pa); + free_pvo_entry(pvo); + pvo = NULL; + goto out; + } else { + /* Otherwise, need to kill it first */ + KASSERT(oldpvo->pvo_pmap == pmap, ("pmap of old " + "mapping does not match new mapping")); + moea64_pvo_remove_from_pmap(oldpvo); + moea64_pvo_enter(pvo, pvo_head, NULL); + } + } + PMAP_UNLOCK(pmap); + PV_UNLOCK(pa); + + /* Free any dead pages */ + if (error == EEXIST) { + moea64_pvo_remove_from_page(oldpvo); + free_pvo_entry(oldpvo); + } + +out: + /* + * Flush the page from the instruction cache if this page is + * mapped executable and cacheable. + */ + if (pmap != kernel_pmap && (m->a.flags & PGA_EXECUTABLE) == 0 && + (pte_lo & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { + vm_page_aflag_set(m, PGA_EXECUTABLE); + moea64_syncicache(pmap, va, pa, PAGE_SIZE); + } + +#if VM_NRESERVLEVEL > 0 + /* + * Try to promote pages. + * + * If the VA of the entered page is not aligned with its PA, + * don't try page promotion as it is not possible. + * This reduces the number of promotion failures dramatically. + * + * Ignore VM_PROT_NO_PROMOTE unless PMAP_ENTER_QUICK_LOCKED. + */ + if (moea64_ps_enabled(pmap) && pmap != kernel_pmap && pvo != NULL && + (pvo->pvo_vaddr & PVO_MANAGED) != 0 && + (va & HPT_SP_MASK) == (pa & HPT_SP_MASK) && + ((prot & VM_PROT_NO_PROMOTE) == 0 || + (flags & PMAP_ENTER_QUICK_LOCKED) == 0) && + (m->flags & PG_FICTITIOUS) == 0 && + vm_reserv_level_iffullpop(m) == 0) + moea64_sp_promote(pmap, va, m); +#endif + + return (KERN_SUCCESS); +} + +static void +moea64_syncicache(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, + vm_size_t sz) +{ + + /* + * This is much trickier than on older systems because + * we can't sync the icache on physical addresses directly + * without a direct map. Instead we check a couple of cases + * where the memory is already mapped in and, failing that, + * use the same trick we use for page zeroing to create + * a temporary mapping for this physical address. + */ + + if (!pmap_bootstrapped) { + /* + * If PMAP is not bootstrapped, we are likely to be + * in real mode. + */ + __syncicache((void *)(uintptr_t)pa, sz); + } else if (pmap == kernel_pmap) { + __syncicache((void *)va, sz); + } else if (hw_direct_map) { + __syncicache((void *)(uintptr_t)PHYS_TO_DMAP(pa), sz); + } else { + /* Use the scratch page to set up a temp mapping */ + + mtx_lock(&moea64_scratchpage_mtx); + + moea64_set_scratchpage_pa(1, pa & ~ADDR_POFF); + __syncicache((void *)(moea64_scratchpage_va[1] + + (va & ADDR_POFF)), sz); + + mtx_unlock(&moea64_scratchpage_mtx); + } +} + +/* + * Maps a sequence of resident pages belonging to the same object. + * The sequence begins with the given page m_start. This page is + * mapped at the given virtual address start. Each subsequent page is + * mapped at a virtual address that is offset from start by the same + * amount as the page is offset from m_start within the object. The + * last page in the sequence is the page with the largest offset from + * m_start that can be mapped at a virtual address less than the given + * virtual address end. Not every virtual page between start and end + * is mapped; only those for which a resident page exists with the + * corresponding offset from m_start are mapped. 
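+ *
+ * When superpages are enabled, an HPT_SP_SIZE-aligned run whose first
+ * page has psind == 1 and which still fits before 'end' is entered
+ * with a single psind == 1 call, and the iterator then jumps
+ * HPT_SP_SIZE / PAGE_SIZE pages instead of stepping one at a time.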
+ */ +void +moea64_enter_object(pmap_t pm, vm_offset_t start, vm_offset_t end, + vm_page_t m_start, vm_prot_t prot) +{ + struct pctrie_iter pages; + vm_page_t m; + vm_offset_t va; + int8_t psind; + + VM_OBJECT_ASSERT_LOCKED(m_start->object); + + vm_page_iter_limit_init(&pages, m_start->object, + m_start->pindex + atop(end - start)); + m = vm_radix_iter_lookup(&pages, m_start->pindex); + while (m != NULL) { + va = start + ptoa(m->pindex - m_start->pindex); + if ((va & HPT_SP_MASK) == 0 && va + HPT_SP_SIZE <= end && + m->psind == 1 && moea64_ps_enabled(pm)) + psind = 1; + else + psind = 0; + moea64_enter(pm, va, m, prot & + (VM_PROT_READ | VM_PROT_EXECUTE), + PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED, psind); + if (psind == 1) + m = vm_radix_iter_jump(&pages, HPT_SP_SIZE / PAGE_SIZE); + else + m = vm_radix_iter_step(&pages); + } +} + +void +moea64_enter_quick(pmap_t pm, vm_offset_t va, vm_page_t m, + vm_prot_t prot) +{ + + moea64_enter(pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE | + VM_PROT_NO_PROMOTE), PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED, + 0); +} + +vm_paddr_t +moea64_extract(pmap_t pm, vm_offset_t va) +{ + struct pvo_entry *pvo; + vm_paddr_t pa; + + PMAP_LOCK(pm); + pvo = moea64_pvo_find_va(pm, va); + if (pvo == NULL) + pa = 0; + else + pa = PVO_PADDR(pvo) | (va - PVO_VADDR(pvo)); + PMAP_UNLOCK(pm); + + return (pa); +} + +/* + * Atomically extract and hold the physical page with the given + * pmap and virtual address pair if that mapping permits the given + * protection. + */ +vm_page_t +moea64_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) +{ + struct pvo_entry *pvo; + vm_page_t m; + + m = NULL; + PMAP_LOCK(pmap); + pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF); + if (pvo != NULL && (pvo->pvo_pte.prot & prot) == prot) { + m = PHYS_TO_VM_PAGE(PVO_PADDR(pvo)); + if (!vm_page_wire_mapped(m)) + m = NULL; + } + PMAP_UNLOCK(pmap); + return (m); +} + +static void * +moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, + uint8_t *flags, int wait) +{ + struct pvo_entry *pvo; + vm_offset_t va; + vm_page_t m; + int needed_lock; + + /* + * This entire routine is a horrible hack to avoid bothering kmem + * for new KVA addresses. Because this can get called from inside + * kmem allocation routines, calling kmem for a new address here + * can lead to multiply locking non-recursive mutexes. + */ + + *flags = UMA_SLAB_PRIV; + needed_lock = !PMAP_LOCKED(kernel_pmap); + + m = vm_page_alloc_noobj_domain(domain, malloc2vm_flags(wait) | + VM_ALLOC_WIRED); + if (m == NULL) + return (NULL); + + va = VM_PAGE_TO_PHYS(m); + + pvo = alloc_pvo_entry(1 /* bootstrap */); + + pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE; + pvo->pvo_pte.pa = VM_PAGE_TO_PHYS(m) | LPTE_M; + + if (needed_lock) + PMAP_LOCK(kernel_pmap); + + init_pvo_entry(pvo, kernel_pmap, va); + pvo->pvo_vaddr |= PVO_WIRED; + + moea64_pvo_enter(pvo, NULL, NULL); + + if (needed_lock) + PMAP_UNLOCK(kernel_pmap); + + return (void *)va; +} + +extern int elf32_nxstack; + +void +moea64_init(void) +{ + + CTR0(KTR_PMAP, "moea64_init"); + + moea64_pvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, + UMA_ZONE_VM | UMA_ZONE_NOFREE); + + /* + * Are large page mappings enabled? + * + * While HPT superpages are not better tested, leave it disabled by + * default. 
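+ *
+ * They can be requested with the vm.pmap.superpages_enabled tunable,
+ * but stay enabled below only if the hardware reports a large page
+ * size (moea64_large_page_size != 0) and supports mixed 4KB/16MB
+ * pages (moea64_has_lp_4k_16m); otherwise the request is dropped and
+ * pagesizes[1] is left unset.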
+ */ + superpages_enabled = 0; + TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled); + if (superpages_enabled) { + KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, + ("moea64_init: can't assign to pagesizes[1]")); + + if (moea64_large_page_size == 0) { + printf("mmu_oea64: HW does not support large pages. " + "Disabling superpages...\n"); + superpages_enabled = 0; + } else if (!moea64_has_lp_4k_16m) { + printf("mmu_oea64: " + "HW does not support mixed 4KB/16MB page sizes. " + "Disabling superpages...\n"); + superpages_enabled = 0; + } else + pagesizes[1] = HPT_SP_SIZE; + } + + if (!hw_direct_map) { + uma_zone_set_allocf(moea64_pvo_zone, moea64_uma_page_alloc); + } + +#ifdef COMPAT_FREEBSD32 + elf32_nxstack = 1; +#endif + + moea64_initialized = true; +} + +bool +moea64_is_referenced(vm_page_t m) +{ + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("moea64_is_referenced: page %p is not managed", m)); + + return (moea64_query_bit(m, LPTE_REF)); +} + +bool +moea64_is_modified(vm_page_t m) +{ + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("moea64_is_modified: page %p is not managed", m)); + + /* + * If the page is not busied then this check is racy. + */ + if (!pmap_page_is_write_mapped(m)) + return (false); + + return (moea64_query_bit(m, LPTE_CHG)); +} + +bool +moea64_is_prefaultable(pmap_t pmap, vm_offset_t va) +{ + struct pvo_entry *pvo; + bool rv = true; + + PMAP_LOCK(pmap); + pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF); + if (pvo != NULL) + rv = false; + PMAP_UNLOCK(pmap); + return (rv); +} + +void +moea64_clear_modify(vm_page_t m) +{ + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("moea64_clear_modify: page %p is not managed", m)); + vm_page_assert_busied(m); + + if (!pmap_page_is_write_mapped(m)) + return; + moea64_clear_bit(m, LPTE_CHG); +} + +/* + * Clear the write and modified bits in each of the given page's mappings. + */ +void +moea64_remove_write(vm_page_t m) +{ + struct pvo_entry *pvo; + int64_t refchg, ret; + pmap_t pmap; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("moea64_remove_write: page %p is not managed", m)); + vm_page_assert_busied(m); + + if (!pmap_page_is_write_mapped(m)) + return; + + powerpc_sync(); + PV_PAGE_LOCK(m); + refchg = 0; + LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { + pmap = pvo->pvo_pmap; + PMAP_LOCK(pmap); + if (!(pvo->pvo_vaddr & PVO_DEAD) && + (pvo->pvo_pte.prot & VM_PROT_WRITE)) { + if (PVO_IS_SP(pvo)) { + CTR1(KTR_PMAP, "%s: demote before remwr", + __func__); + moea64_sp_demote(pvo); + } + pvo->pvo_pte.prot &= ~VM_PROT_WRITE; + ret = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE); + if (ret < 0) + ret = LPTE_CHG; + refchg |= ret; + if (pvo->pvo_pmap == kernel_pmap) + isync(); + } + PMAP_UNLOCK(pmap); + } + if ((refchg | atomic_readandclear_32(&m->md.mdpg_attrs)) & LPTE_CHG) + vm_page_dirty(m); + vm_page_aflag_clear(m, PGA_WRITEABLE); + PV_PAGE_UNLOCK(m); +} + +/* + * moea64_ts_referenced: + * + * Return a count of reference bits for a page, clearing those bits. + * It is not necessary for every reference bit to be cleared, but it + * is necessary that 0 only be returned when there are truly no + * reference bits set. + * + * XXX: The exact number of bits to check and clear is a matter that + * should be tested and standardized at some point in the future for + * optimal aging of shared pages. 
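+ *
+ * The work is delegated to moea64_clear_bit(m, LPTE_REF) below, which
+ * walks the page's mappings, clears the hardware reference bit in
+ * each, and returns how many had it set.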
+ */ +int +moea64_ts_referenced(vm_page_t m) +{ + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("moea64_ts_referenced: page %p is not managed", m)); + return (moea64_clear_bit(m, LPTE_REF)); +} + +/* + * Modify the WIMG settings of all mappings for a page. + */ +void +moea64_page_set_memattr(vm_page_t m, vm_memattr_t ma) +{ + struct pvo_entry *pvo; + int64_t refchg; + pmap_t pmap; + uint64_t lo; + + CTR3(KTR_PMAP, "%s: pa=%#jx, ma=%#x", + __func__, (uintmax_t)VM_PAGE_TO_PHYS(m), ma); + + if (m->md.mdpg_cache_attrs == ma) + return; + + if ((m->oflags & VPO_UNMANAGED) != 0) { + m->md.mdpg_cache_attrs = ma; + return; + } + + lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), ma); + + PV_PAGE_LOCK(m); + LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { + pmap = pvo->pvo_pmap; + PMAP_LOCK(pmap); + if (!(pvo->pvo_vaddr & PVO_DEAD)) { + if (PVO_IS_SP(pvo)) { + CTR1(KTR_PMAP, + "%s: demote before set_memattr", __func__); + moea64_sp_demote(pvo); + } + pvo->pvo_pte.pa &= ~LPTE_WIMG; + pvo->pvo_pte.pa |= lo; + refchg = moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE); + if (refchg < 0) + refchg = (pvo->pvo_pte.prot & VM_PROT_WRITE) ? + LPTE_CHG : 0; + if ((pvo->pvo_vaddr & PVO_MANAGED) && + (pvo->pvo_pte.prot & VM_PROT_WRITE)) { + refchg |= + atomic_readandclear_32(&m->md.mdpg_attrs); + if (refchg & LPTE_CHG) + vm_page_dirty(m); + if (refchg & LPTE_REF) + vm_page_aflag_set(m, PGA_REFERENCED); + } + if (pvo->pvo_pmap == kernel_pmap) + isync(); + } + PMAP_UNLOCK(pmap); + } + m->md.mdpg_cache_attrs = ma; + PV_PAGE_UNLOCK(m); +} + +/* + * Map a wired page into kernel virtual address space. + */ +void +moea64_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) +{ + int error; + struct pvo_entry *pvo, *oldpvo; + + do { + pvo = alloc_pvo_entry(0); + if (pvo == NULL) + vm_wait(NULL); + } while (pvo == NULL); + pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; + pvo->pvo_pte.pa = (pa & ~ADDR_POFF) | moea64_calc_wimg(pa, ma); + pvo->pvo_vaddr |= PVO_WIRED; + + PMAP_LOCK(kernel_pmap); + oldpvo = moea64_pvo_find_va(kernel_pmap, va); + if (oldpvo != NULL) + moea64_pvo_remove_from_pmap(oldpvo); + init_pvo_entry(pvo, kernel_pmap, va); + error = moea64_pvo_enter(pvo, NULL, NULL); + PMAP_UNLOCK(kernel_pmap); + + /* Free any dead pages */ + if (oldpvo != NULL) { + moea64_pvo_remove_from_page(oldpvo); + free_pvo_entry(oldpvo); + } + + if (error != 0) + panic("moea64_kenter: failed to enter va %#zx pa %#jx: %d", va, + (uintmax_t)pa, error); +} + +void +moea64_kenter(vm_offset_t va, vm_paddr_t pa) +{ + + moea64_kenter_attr(va, pa, VM_MEMATTR_DEFAULT); +} + +/* + * Extract the physical page address associated with the given kernel virtual + * address. + */ +vm_paddr_t +moea64_kextract(vm_offset_t va) +{ + struct pvo_entry *pvo; + vm_paddr_t pa; + + /* + * Shortcut the direct-mapped case when applicable. We never put + * anything but 1:1 (or 62-bit aliased) mappings below + * VM_MIN_KERNEL_ADDRESS. + */ + if (va < VM_MIN_KERNEL_ADDRESS) + return (va & ~DMAP_BASE_ADDRESS); + + PMAP_LOCK(kernel_pmap); + pvo = moea64_pvo_find_va(kernel_pmap, va); + KASSERT(pvo != NULL, ("moea64_kextract: no addr found for %#" PRIxPTR, + va)); + pa = PVO_PADDR(pvo) | (va - PVO_VADDR(pvo)); + PMAP_UNLOCK(kernel_pmap); + return (pa); +} + +/* + * Remove a wired page from kernel virtual address space. + */ +void +moea64_kremove(vm_offset_t va) +{ + moea64_remove(kernel_pmap, va, va + PAGE_SIZE); +} + +/* + * Provide a kernel pointer corresponding to a given userland pointer. 
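+ * The user segment containing uaddr is aliased at USER_ADDR by
+ * installing that segment's VSID (an SLB entry on powerpc64, a
+ * segment register on 32-bit AIM), with the mapping marked
+ * no-execute.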
+ * The returned pointer is valid until the next time this function is + * called in this thread. This is used internally in copyin/copyout. + */ +static int +moea64_map_user_ptr(pmap_t pm, volatile const void *uaddr, + void **kaddr, size_t ulen, size_t *klen) +{ + size_t l; +#ifdef __powerpc64__ + struct slb *slb; +#endif + register_t slbv; + + *kaddr = (char *)USER_ADDR + ((uintptr_t)uaddr & ~SEGMENT_MASK); + l = ((char *)USER_ADDR + SEGMENT_LENGTH) - (char *)(*kaddr); + if (l > ulen) + l = ulen; + if (klen) + *klen = l; + else if (l != ulen) + return (EFAULT); + +#ifdef __powerpc64__ + /* Try lockless look-up first */ + slb = user_va_to_slb_entry(pm, (vm_offset_t)uaddr); + + if (slb == NULL) { + /* If it isn't there, we need to pre-fault the VSID */ + PMAP_LOCK(pm); + slbv = va_to_vsid(pm, (vm_offset_t)uaddr) << SLBV_VSID_SHIFT; + PMAP_UNLOCK(pm); + } else { + slbv = slb->slbv; + } + + /* Mark segment no-execute */ + slbv |= SLBV_N; +#else + slbv = va_to_vsid(pm, (vm_offset_t)uaddr); + + /* Mark segment no-execute */ + slbv |= SR_N; +#endif + + /* If we have already set this VSID, we can just return */ + if (curthread->td_pcb->pcb_cpu.aim.usr_vsid == slbv) + return (0); + + __asm __volatile("isync"); + curthread->td_pcb->pcb_cpu.aim.usr_segm = + (uintptr_t)uaddr >> ADDR_SR_SHFT; + curthread->td_pcb->pcb_cpu.aim.usr_vsid = slbv; +#ifdef __powerpc64__ + __asm __volatile ("slbie %0; slbmte %1, %2; isync" :: + "r"(USER_ADDR), "r"(slbv), "r"(USER_SLB_SLBE)); +#else + __asm __volatile("mtsr %0,%1; isync" :: "n"(USER_SR), "r"(slbv)); +#endif + + return (0); +} + +/* + * Figure out where a given kernel pointer (usually in a fault) points + * to from the VM's perspective, potentially remapping into userland's + * address space. + */ +static int +moea64_decode_kernel_ptr(vm_offset_t addr, int *is_user, + vm_offset_t *decoded_addr) +{ + vm_offset_t user_sr; + + if ((addr >> ADDR_SR_SHFT) == (USER_ADDR >> ADDR_SR_SHFT)) { + user_sr = curthread->td_pcb->pcb_cpu.aim.usr_segm; + addr &= ADDR_PIDX | ADDR_POFF; + addr |= user_sr << ADDR_SR_SHFT; + *decoded_addr = addr; + *is_user = 1; + } else { + *decoded_addr = addr; + *is_user = 0; + } + + return (0); +} + +/* + * Map a range of physical addresses into kernel virtual address space. + * + * The value passed in *virt is a suggested virtual address for the mapping. + * Architectures which can support a direct-mapped physical to virtual region + * can return the appropriate address within that region, leaving '*virt' + * unchanged. Other architectures should map the pages starting at '*virt' and + * update '*virt' with the first usable address after the mapped region. + */ +vm_offset_t +moea64_map(vm_offset_t *virt, vm_paddr_t pa_start, + vm_paddr_t pa_end, int prot) +{ + vm_offset_t sva, va; + + if (hw_direct_map) { + /* + * Check if every page in the region is covered by the direct + * map. The direct map covers all of physical memory. Use + * moea64_calc_wimg() as a shortcut to see if the page is in + * physical memory as a way to see if the direct map covers it. + */ + for (va = pa_start; va < pa_end; va += PAGE_SIZE) + if (moea64_calc_wimg(va, VM_MEMATTR_DEFAULT) != LPTE_M) + break; + if (va == pa_end) + return (PHYS_TO_DMAP(pa_start)); + } + sva = *virt; + va = sva; + /* XXX respect prot argument */ + for (; pa_start < pa_end; pa_start += PAGE_SIZE, va += PAGE_SIZE) + moea64_kenter(va, pa_start); + *virt = va; + + return (sva); +} + +/* + * Returns true if the pmap's pv is one of the first + * 16 pvs linked to from this page. 
This count may + * be changed upwards or downwards in the future; it + * is only necessary that true be returned for a small + * subset of pmaps for proper page aging. + */ +bool +moea64_page_exists_quick(pmap_t pmap, vm_page_t m) +{ + int loops; + struct pvo_entry *pvo; + bool rv; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("moea64_page_exists_quick: page %p is not managed", m)); + loops = 0; + rv = false; + PV_PAGE_LOCK(m); + LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { + if (!(pvo->pvo_vaddr & PVO_DEAD) && pvo->pvo_pmap == pmap) { + rv = true; + break; + } + if (++loops >= 16) + break; + } + PV_PAGE_UNLOCK(m); + return (rv); +} + +void +moea64_page_init(vm_page_t m) +{ + + m->md.mdpg_attrs = 0; + m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT; + LIST_INIT(&m->md.mdpg_pvoh); +} + +/* + * Return the number of managed mappings to the given physical page + * that are wired. + */ +int +moea64_page_wired_mappings(vm_page_t m) +{ + struct pvo_entry *pvo; + int count; + + count = 0; + if ((m->oflags & VPO_UNMANAGED) != 0) + return (count); + PV_PAGE_LOCK(m); + LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) + if ((pvo->pvo_vaddr & (PVO_DEAD | PVO_WIRED)) == PVO_WIRED) + count++; + PV_PAGE_UNLOCK(m); + return (count); +} + +static uintptr_t moea64_vsidcontext; + +uintptr_t +moea64_get_unique_vsid(void) { + u_int entropy; + register_t hash; + uint32_t mask; + int i; + + entropy = 0; + __asm __volatile("mftb %0" : "=r"(entropy)); + + mtx_lock(&moea64_slb_mutex); + for (i = 0; i < NVSIDS; i += VSID_NBPW) { + u_int n; + + /* + * Create a new value by multiplying by a prime and adding in + * entropy from the timebase register. This is to make the + * VSID more random so that the PT hash function collides + * less often. (Note that the prime casues gcc to do shifts + * instead of a multiply.) + */ + moea64_vsidcontext = (moea64_vsidcontext * 0x1105) + entropy; + hash = moea64_vsidcontext & (NVSIDS - 1); + if (hash == 0) /* 0 is special, avoid it */ + continue; + n = hash >> 5; + mask = 1 << (hash & (VSID_NBPW - 1)); + hash = (moea64_vsidcontext & VSID_HASHMASK); + if (moea64_vsid_bitmap[n] & mask) { /* collision? */ + /* anything free in this bucket? */ + if (moea64_vsid_bitmap[n] == 0xffffffff) { + entropy = (moea64_vsidcontext >> 20); + continue; + } + i = ffs(~moea64_vsid_bitmap[n]) - 1; + mask = 1 << i; + hash &= rounddown2(VSID_HASHMASK, VSID_NBPW); + hash |= i; + } + if (hash == VSID_VRMA) /* also special, avoid this too */ + continue; + KASSERT(!(moea64_vsid_bitmap[n] & mask), + ("Allocating in-use VSID %#zx\n", hash)); + moea64_vsid_bitmap[n] |= mask; + mtx_unlock(&moea64_slb_mutex); + return (hash); + } + + mtx_unlock(&moea64_slb_mutex); + panic("%s: out of segments",__func__); +} + +#ifdef __powerpc64__ +int +moea64_pinit(pmap_t pmap) +{ + + RB_INIT(&pmap->pmap_pvo); + + pmap->pm_slb_tree_root = slb_alloc_tree(); + pmap->pm_slb = slb_alloc_user_cache(); + pmap->pm_slb_len = 0; + + return (1); +} +#else +int +moea64_pinit(pmap_t pmap) +{ + int i; + uint32_t hash; + + RB_INIT(&pmap->pmap_pvo); + + if (pmap_bootstrapped) + pmap->pmap_phys = (pmap_t)moea64_kextract((vm_offset_t)pmap); + else + pmap->pmap_phys = pmap; + + /* + * Allocate some segment registers for this pmap. + */ + hash = moea64_get_unique_vsid(); + + for (i = 0; i < 16; i++) + pmap->pm_sr[i] = VSID_MAKE(i, hash); + + KASSERT(pmap->pm_sr[0] != 0, ("moea64_pinit: pm_sr[0] = 0")); + + return (1); +} +#endif + +/* + * Initialize the pmap associated with process 0. 
+ */ +void +moea64_pinit0(pmap_t pm) +{ + + PMAP_LOCK_INIT(pm); + moea64_pinit(pm); + bzero(&pm->pm_stats, sizeof(pm->pm_stats)); +} + +/* + * Set the physical protection on the specified range of this map as requested. + */ +static void +moea64_pvo_protect( pmap_t pm, struct pvo_entry *pvo, vm_prot_t prot) +{ + struct vm_page *pg; + vm_prot_t oldprot; + int32_t refchg; + + PMAP_LOCK_ASSERT(pm, MA_OWNED); + + /* + * Change the protection of the page. + */ + oldprot = pvo->pvo_pte.prot; + pvo->pvo_pte.prot = prot; + pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo)); + + /* + * If the PVO is in the page table, update mapping + */ + refchg = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE); + if (refchg < 0) + refchg = (oldprot & VM_PROT_WRITE) ? LPTE_CHG : 0; + + if (pm != kernel_pmap && pg != NULL && + (pg->a.flags & PGA_EXECUTABLE) == 0 && + (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { + if ((pg->oflags & VPO_UNMANAGED) == 0) + vm_page_aflag_set(pg, PGA_EXECUTABLE); + moea64_syncicache(pm, PVO_VADDR(pvo), + PVO_PADDR(pvo), PAGE_SIZE); + } + + /* + * Update vm about the REF/CHG bits if the page is managed and we have + * removed write access. + */ + if (pg != NULL && (pvo->pvo_vaddr & PVO_MANAGED) && + (oldprot & VM_PROT_WRITE)) { + refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs); + if (refchg & LPTE_CHG) + vm_page_dirty(pg); + if (refchg & LPTE_REF) + vm_page_aflag_set(pg, PGA_REFERENCED); + } +} + +void +moea64_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva, + vm_prot_t prot) +{ + struct pvo_entry *pvo, key; + + CTR4(KTR_PMAP, "moea64_protect: pm=%p sva=%#x eva=%#x prot=%#x", pm, + sva, eva, prot); + + KASSERT(pm == &curproc->p_vmspace->vm_pmap || pm == kernel_pmap, + ("moea64_protect: non current pmap")); + + if ((prot & VM_PROT_READ) == VM_PROT_NONE) { + moea64_remove(pm, sva, eva); + return; + } + + PMAP_LOCK(pm); + key.pvo_vaddr = sva; + for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); + pvo != NULL && PVO_VADDR(pvo) < eva; + pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { + if (PVO_IS_SP(pvo)) { + if (moea64_sp_pvo_in_range(pvo, sva, eva)) { + pvo = moea64_sp_protect(pvo, prot); + continue; + } else { + CTR1(KTR_PMAP, "%s: demote before protect", + __func__); + moea64_sp_demote(pvo); + } + } + moea64_pvo_protect(pm, pvo, prot); + } + PMAP_UNLOCK(pm); +} + +/* + * Map a list of wired pages into kernel virtual address space. This is + * intended for temporary mappings which do not need page modification or + * references recorded. Existing mappings in the region are overwritten. + */ +void +moea64_qenter(vm_offset_t va, vm_page_t *m, int count) +{ + while (count-- > 0) { + moea64_kenter(va, VM_PAGE_TO_PHYS(*m)); + va += PAGE_SIZE; + m++; + } +} + +/* + * Remove page mappings from kernel virtual address space. Intended for + * temporary mappings entered by moea64_qenter. 
+ */ +void +moea64_qremove(vm_offset_t va, int count) +{ + while (count-- > 0) { + moea64_kremove(va); + va += PAGE_SIZE; + } +} + +void +moea64_release_vsid(uint64_t vsid) +{ + int idx, mask; + + mtx_lock(&moea64_slb_mutex); + idx = vsid & (NVSIDS-1); + mask = 1 << (idx % VSID_NBPW); + idx /= VSID_NBPW; + KASSERT(moea64_vsid_bitmap[idx] & mask, + ("Freeing unallocated VSID %#jx", vsid)); + moea64_vsid_bitmap[idx] &= ~mask; + mtx_unlock(&moea64_slb_mutex); +} + +void +moea64_release(pmap_t pmap) +{ + + /* + * Free segment registers' VSIDs + */ + #ifdef __powerpc64__ + slb_free_tree(pmap); + slb_free_user_cache(pmap->pm_slb); + #else + KASSERT(pmap->pm_sr[0] != 0, ("moea64_release: pm_sr[0] = 0")); + + moea64_release_vsid(VSID_TO_HASH(pmap->pm_sr[0])); + #endif +} + +/* + * Remove all pages mapped by the specified pmap + */ +void +moea64_remove_pages(pmap_t pm) +{ + struct pvo_entry *pvo, *tpvo; + struct pvo_dlist tofree; + + SLIST_INIT(&tofree); + + PMAP_LOCK(pm); + RB_FOREACH_SAFE(pvo, pvo_tree, &pm->pmap_pvo, tpvo) { + if (pvo->pvo_vaddr & PVO_WIRED) + continue; + + /* + * For locking reasons, remove this from the page table and + * pmap, but save delinking from the vm_page for a second + * pass + */ + moea64_pvo_remove_from_pmap(pvo); + SLIST_INSERT_HEAD(&tofree, pvo, pvo_dlink); + } + PMAP_UNLOCK(pm); + + while (!SLIST_EMPTY(&tofree)) { + pvo = SLIST_FIRST(&tofree); + SLIST_REMOVE_HEAD(&tofree, pvo_dlink); + moea64_pvo_remove_from_page(pvo); + free_pvo_entry(pvo); + } +} + +static void +moea64_remove_locked(pmap_t pm, vm_offset_t sva, vm_offset_t eva, + struct pvo_dlist *tofree) +{ + struct pvo_entry *pvo, *tpvo, key; + + PMAP_LOCK_ASSERT(pm, MA_OWNED); + + key.pvo_vaddr = sva; + for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); + pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { + if (PVO_IS_SP(pvo)) { + if (moea64_sp_pvo_in_range(pvo, sva, eva)) { + tpvo = moea64_sp_remove(pvo, tofree); + continue; + } else { + CTR1(KTR_PMAP, "%s: demote before remove", + __func__); + moea64_sp_demote(pvo); + } + } + tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); + + /* + * For locking reasons, remove this from the page table and + * pmap, but save delinking from the vm_page for a second + * pass + */ + moea64_pvo_remove_from_pmap(pvo); + SLIST_INSERT_HEAD(tofree, pvo, pvo_dlink); + } +} + +/* + * Remove the given range of addresses from the specified map. + */ +void +moea64_remove(pmap_t pm, vm_offset_t sva, vm_offset_t eva) +{ + struct pvo_entry *pvo; + struct pvo_dlist tofree; + + /* + * Perform an unsynchronized read. This is, however, safe. + */ + if (pm->pm_stats.resident_count == 0) + return; + + SLIST_INIT(&tofree); + PMAP_LOCK(pm); + moea64_remove_locked(pm, sva, eva, &tofree); + PMAP_UNLOCK(pm); + + while (!SLIST_EMPTY(&tofree)) { + pvo = SLIST_FIRST(&tofree); + SLIST_REMOVE_HEAD(&tofree, pvo_dlink); + moea64_pvo_remove_from_page(pvo); + free_pvo_entry(pvo); + } +} + +/* + * Remove physical page from all pmaps in which it resides. moea64_pvo_remove() + * will reflect changes in pte's back to the vm_page. 
+ */ +void +moea64_remove_all(vm_page_t m) +{ + struct pvo_entry *pvo, *next_pvo; + struct pvo_head freequeue; + int wasdead; + pmap_t pmap; + + LIST_INIT(&freequeue); + + PV_PAGE_LOCK(m); + LIST_FOREACH_SAFE(pvo, vm_page_to_pvoh(m), pvo_vlink, next_pvo) { + pmap = pvo->pvo_pmap; + PMAP_LOCK(pmap); + wasdead = (pvo->pvo_vaddr & PVO_DEAD); + if (!wasdead) { + if (PVO_IS_SP(pvo)) { + CTR1(KTR_PMAP, "%s: demote before remove_all", + __func__); + moea64_sp_demote(pvo); + } + moea64_pvo_remove_from_pmap(pvo); + } + moea64_pvo_remove_from_page_locked(pvo, m); + if (!wasdead) + LIST_INSERT_HEAD(&freequeue, pvo, pvo_vlink); + PMAP_UNLOCK(pmap); + + } + KASSERT(!pmap_page_is_mapped(m), ("Page still has mappings")); + KASSERT((m->a.flags & PGA_WRITEABLE) == 0, ("Page still writable")); + PV_PAGE_UNLOCK(m); + + /* Clean up UMA allocations */ + LIST_FOREACH_SAFE(pvo, &freequeue, pvo_vlink, next_pvo) + free_pvo_entry(pvo); +} + +/* + * Allocate a physical page of memory directly from the phys_avail map. + * Can only be called from moea64_bootstrap before avail start and end are + * calculated. + */ +vm_offset_t +moea64_bootstrap_alloc(vm_size_t size, vm_size_t align) +{ + vm_offset_t s, e; + int i, j; + + size = round_page(size); + for (i = 0; phys_avail[i + 1] != 0; i += 2) { + if (align != 0) + s = roundup2(phys_avail[i], align); + else + s = phys_avail[i]; + e = s + size; + + if (s < phys_avail[i] || e > phys_avail[i + 1]) + continue; + + if (s + size > platform_real_maxaddr()) + continue; + + if (s == phys_avail[i]) { + phys_avail[i] += size; + } else if (e == phys_avail[i + 1]) { + phys_avail[i + 1] -= size; + } else { + for (j = phys_avail_count * 2; j > i; j -= 2) { + phys_avail[j] = phys_avail[j - 2]; + phys_avail[j + 1] = phys_avail[j - 1]; + } + + phys_avail[i + 3] = phys_avail[i + 1]; + phys_avail[i + 1] = s; + phys_avail[i + 2] = e; + phys_avail_count++; + } + + return (s); + } + panic("moea64_bootstrap_alloc: could not allocate memory"); +} + +static int +moea64_pvo_enter(struct pvo_entry *pvo, struct pvo_head *pvo_head, + struct pvo_entry **oldpvop) +{ + struct pvo_entry *old_pvo; + int err; + + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); + + STAT_MOEA64(moea64_pvo_enter_calls++); + + /* + * Add to pmap list + */ + old_pvo = RB_INSERT(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo); + + if (old_pvo != NULL) { + if (oldpvop != NULL) + *oldpvop = old_pvo; + return (EEXIST); + } + + if (pvo_head != NULL) { + LIST_INSERT_HEAD(pvo_head, pvo, pvo_vlink); + } + + if (pvo->pvo_vaddr & PVO_WIRED) + pvo->pvo_pmap->pm_stats.wired_count++; + pvo->pvo_pmap->pm_stats.resident_count++; + + /* + * Insert it into the hardware page table + */ + err = moea64_pte_insert(pvo); + if (err != 0) { + panic("moea64_pvo_enter: overflow"); + } + + STAT_MOEA64(moea64_pvo_entries++); + + if (pvo->pvo_pmap == kernel_pmap) + isync(); + +#ifdef __powerpc64__ + /* + * Make sure all our bootstrap mappings are in the SLB as soon + * as virtual memory is switched on. 
+ */ + if (!pmap_bootstrapped) + moea64_bootstrap_slb_prefault(PVO_VADDR(pvo), + pvo->pvo_vaddr & PVO_LARGE); +#endif + + return (0); +} + +static void +moea64_pvo_remove_from_pmap(struct pvo_entry *pvo) +{ + struct vm_page *pg; + int32_t refchg; + + KASSERT(pvo->pvo_pmap != NULL, ("Trying to remove PVO with no pmap")); + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); + KASSERT(!(pvo->pvo_vaddr & PVO_DEAD), ("Trying to remove dead PVO")); + + /* + * If there is an active pte entry, we need to deactivate it + */ + refchg = moea64_pte_unset(pvo); + if (refchg < 0) { + /* + * If it was evicted from the page table, be pessimistic and + * dirty the page. + */ + if (pvo->pvo_pte.prot & VM_PROT_WRITE) + refchg = LPTE_CHG; + else + refchg = 0; + } + + /* + * Update our statistics. + */ + pvo->pvo_pmap->pm_stats.resident_count--; + if (pvo->pvo_vaddr & PVO_WIRED) + pvo->pvo_pmap->pm_stats.wired_count--; + + /* + * Remove this PVO from the pmap list. + */ + RB_REMOVE(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo); + + /* + * Mark this for the next sweep + */ + pvo->pvo_vaddr |= PVO_DEAD; + + /* Send RC bits to VM */ + if ((pvo->pvo_vaddr & PVO_MANAGED) && + (pvo->pvo_pte.prot & VM_PROT_WRITE)) { + pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo)); + if (pg != NULL) { + refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs); + if (refchg & LPTE_CHG) + vm_page_dirty(pg); + if (refchg & LPTE_REF) + vm_page_aflag_set(pg, PGA_REFERENCED); + } + } +} + +static inline void +moea64_pvo_remove_from_page_locked(struct pvo_entry *pvo, + vm_page_t m) +{ + + KASSERT(pvo->pvo_vaddr & PVO_DEAD, ("Trying to delink live page")); + + /* Use NULL pmaps as a sentinel for races in page deletion */ + if (pvo->pvo_pmap == NULL) + return; + pvo->pvo_pmap = NULL; + + /* + * Update vm about page writeability/executability if managed + */ + PV_LOCKASSERT(PVO_PADDR(pvo)); + if (pvo->pvo_vaddr & PVO_MANAGED) { + if (m != NULL) { + LIST_REMOVE(pvo, pvo_vlink); + if (LIST_EMPTY(vm_page_to_pvoh(m))) + vm_page_aflag_clear(m, + PGA_WRITEABLE | PGA_EXECUTABLE); + } + } + + STAT_MOEA64(moea64_pvo_entries--); + STAT_MOEA64(moea64_pvo_remove_calls++); +} + +static void +moea64_pvo_remove_from_page(struct pvo_entry *pvo) +{ + vm_page_t pg = NULL; + + if (pvo->pvo_vaddr & PVO_MANAGED) + pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo)); + + PV_LOCK(PVO_PADDR(pvo)); + moea64_pvo_remove_from_page_locked(pvo, pg); + PV_UNLOCK(PVO_PADDR(pvo)); +} + +static struct pvo_entry * +moea64_pvo_find_va(pmap_t pm, vm_offset_t va) +{ + struct pvo_entry key; + + PMAP_LOCK_ASSERT(pm, MA_OWNED); + + key.pvo_vaddr = va & ~ADDR_POFF; + return (RB_FIND(pvo_tree, &pm->pmap_pvo, &key)); +} + +static bool +moea64_query_bit(vm_page_t m, uint64_t ptebit) +{ + struct pvo_entry *pvo; + int64_t ret; + bool rv; + vm_page_t sp; + + /* + * See if this bit is stored in the page already. + * + * For superpages, the bit is stored in the first vm page. + */ + if ((m->md.mdpg_attrs & ptebit) != 0 || + ((sp = PHYS_TO_VM_PAGE(VM_PAGE_TO_PHYS(m) & ~HPT_SP_MASK)) != NULL && + (sp->md.mdpg_attrs & (ptebit | MDPG_ATTR_SP)) == + (ptebit | MDPG_ATTR_SP))) + return (true); + + /* + * Examine each PTE. Sync so that any pending REF/CHG bits are + * flushed to the PTEs. + */ + rv = false; + powerpc_sync(); + PV_PAGE_LOCK(m); + LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { + if (PVO_IS_SP(pvo)) { + ret = moea64_sp_query(pvo, ptebit); + /* + * If SP was not demoted, check its REF/CHG bits here. 
+ */ + if (ret != -1) { + if ((ret & ptebit) != 0) { + rv = true; + break; + } + continue; + } + /* else, fallthrough */ + } + + ret = 0; + + /* + * See if this pvo has a valid PTE. if so, fetch the + * REF/CHG bits from the valid PTE. If the appropriate + * ptebit is set, return success. + */ + PMAP_LOCK(pvo->pvo_pmap); + if (!(pvo->pvo_vaddr & PVO_DEAD)) + ret = moea64_pte_synch(pvo); + PMAP_UNLOCK(pvo->pvo_pmap); + + if (ret > 0) { + atomic_set_32(&m->md.mdpg_attrs, + ret & (LPTE_CHG | LPTE_REF)); + if (ret & ptebit) { + rv = true; + break; + } + } + } + PV_PAGE_UNLOCK(m); + + return (rv); +} + +static u_int +moea64_clear_bit(vm_page_t m, u_int64_t ptebit) +{ + u_int count; + struct pvo_entry *pvo; + int64_t ret; + + /* + * Sync so that any pending REF/CHG bits are flushed to the PTEs (so + * we can reset the right ones). + */ + powerpc_sync(); + + /* + * For each pvo entry, clear the pte's ptebit. + */ + count = 0; + PV_PAGE_LOCK(m); + LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { + if (PVO_IS_SP(pvo)) { + if ((ret = moea64_sp_clear(pvo, m, ptebit)) != -1) { + count += ret; + continue; + } + } + ret = 0; + + PMAP_LOCK(pvo->pvo_pmap); + if (!(pvo->pvo_vaddr & PVO_DEAD)) + ret = moea64_pte_clear(pvo, ptebit); + PMAP_UNLOCK(pvo->pvo_pmap); + + if (ret > 0 && (ret & ptebit)) + count++; + } + atomic_clear_32(&m->md.mdpg_attrs, ptebit); + PV_PAGE_UNLOCK(m); + + return (count); +} + +int +moea64_dev_direct_mapped(vm_paddr_t pa, vm_size_t size) +{ + struct pvo_entry *pvo, key; + vm_offset_t ppa; + int error = 0; + + if (hw_direct_map && mem_valid(pa, size) == 0) + return (0); + + PMAP_LOCK(kernel_pmap); + ppa = pa & ~ADDR_POFF; + key.pvo_vaddr = DMAP_BASE_ADDRESS + ppa; + for (pvo = RB_FIND(pvo_tree, &kernel_pmap->pmap_pvo, &key); + ppa < pa + size; ppa += PAGE_SIZE, + pvo = RB_NEXT(pvo_tree, &kernel_pmap->pmap_pvo, pvo)) { + if (pvo == NULL || PVO_PADDR(pvo) != ppa) { + error = EFAULT; + break; + } + } + PMAP_UNLOCK(kernel_pmap); + + return (error); +} + +/* + * Map a set of physical memory pages into the kernel virtual + * address space. Return a pointer to where it is mapped. This + * routine is intended to be used for mapping device memory, + * NOT real memory. 
+ */ +void * +moea64_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t ma) +{ + vm_offset_t va, tmpva, ppa, offset; + + ppa = trunc_page(pa); + offset = pa & PAGE_MASK; + size = roundup2(offset + size, PAGE_SIZE); + + va = kva_alloc(size); + + if (!va) + panic("moea64_mapdev: Couldn't alloc kernel virtual memory"); + + for (tmpva = va; size > 0;) { + moea64_kenter_attr(tmpva, ppa, ma); + size -= PAGE_SIZE; + tmpva += PAGE_SIZE; + ppa += PAGE_SIZE; + } + + return ((void *)(va + offset)); +} + +void * +moea64_mapdev(vm_paddr_t pa, vm_size_t size) +{ + + return moea64_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT); +} + +void +moea64_unmapdev(void *p, vm_size_t size) +{ + vm_offset_t base, offset, va; + + va = (vm_offset_t)p; + base = trunc_page(va); + offset = va & PAGE_MASK; + size = roundup2(offset + size, PAGE_SIZE); + + moea64_qremove(base, atop(size)); + kva_free(base, size); +} + +void +moea64_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) +{ + struct pvo_entry *pvo; + vm_offset_t lim; + vm_paddr_t pa; + vm_size_t len; + + if (__predict_false(pm == NULL)) + pm = &curthread->td_proc->p_vmspace->vm_pmap; + + PMAP_LOCK(pm); + while (sz > 0) { + lim = round_page(va+1); + len = MIN(lim - va, sz); + pvo = moea64_pvo_find_va(pm, va & ~ADDR_POFF); + if (pvo != NULL && !(pvo->pvo_pte.pa & LPTE_I)) { + pa = PVO_PADDR(pvo) | (va & ADDR_POFF); + moea64_syncicache(pm, va, pa, len); + } + va += len; + sz -= len; + } + PMAP_UNLOCK(pm); +} + +void +moea64_dumpsys_map(vm_paddr_t pa, size_t sz, void **va) +{ + + *va = (void *)(uintptr_t)pa; +} + +extern struct dump_pa dump_map[PHYS_AVAIL_SZ + 1]; + +void +moea64_scan_init(void) +{ + struct pvo_entry *pvo; + vm_offset_t va; + int i; + + if (!do_minidump) { + /* Initialize phys. segments for dumpsys(). */ + memset(&dump_map, 0, sizeof(dump_map)); + mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); + for (i = 0; i < pregions_sz; i++) { + dump_map[i].pa_start = pregions[i].mr_start; + dump_map[i].pa_size = pregions[i].mr_size; + } + return; + } + + /* Virtual segments for minidumps: */ + memset(&dump_map, 0, sizeof(dump_map)); + + /* 1st: kernel .data and .bss. */ + dump_map[0].pa_start = trunc_page((uintptr_t)_etext); + dump_map[0].pa_size = round_page((uintptr_t)_end) - + dump_map[0].pa_start; + + /* 2nd: msgbuf and tables (see pmap_bootstrap()). */ + dump_map[1].pa_start = (vm_paddr_t)(uintptr_t)msgbufp->msg_ptr; + dump_map[1].pa_size = round_page(msgbufp->msg_size); + + /* 3rd: kernel VM. */ + va = dump_map[1].pa_start + dump_map[1].pa_size; + /* Find start of next chunk (from va). */ + while (va < virtual_end) { + /* Don't dump the buffer cache. */ + if (va >= kmi.buffer_sva && va < kmi.buffer_eva) { + va = kmi.buffer_eva; + continue; + } + pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF); + if (pvo != NULL && !(pvo->pvo_vaddr & PVO_DEAD)) + break; + va += PAGE_SIZE; + } + if (va < virtual_end) { + dump_map[2].pa_start = va; + va += PAGE_SIZE; + /* Find last page in chunk. */ + while (va < virtual_end) { + /* Don't run into the buffer cache. 
*/ + if (va == kmi.buffer_sva) + break; + pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF); + if (pvo == NULL || (pvo->pvo_vaddr & PVO_DEAD)) + break; + va += PAGE_SIZE; + } + dump_map[2].pa_size = va - dump_map[2].pa_start; + } +} + +#ifdef __powerpc64__ + +static size_t +moea64_scan_pmap(struct bitset *dump_bitset) +{ + struct pvo_entry *pvo; + vm_paddr_t pa, pa_end; + vm_offset_t va, pgva, kstart, kend, kstart_lp, kend_lp; + uint64_t lpsize; + + lpsize = moea64_large_page_size; + kstart = trunc_page((vm_offset_t)_etext); + kend = round_page((vm_offset_t)_end); + kstart_lp = kstart & ~moea64_large_page_mask; + kend_lp = (kend + moea64_large_page_mask) & ~moea64_large_page_mask; + + CTR4(KTR_PMAP, "moea64_scan_pmap: kstart=0x%016lx, kend=0x%016lx, " + "kstart_lp=0x%016lx, kend_lp=0x%016lx", + kstart, kend, kstart_lp, kend_lp); + + PMAP_LOCK(kernel_pmap); + RB_FOREACH(pvo, pvo_tree, &kernel_pmap->pmap_pvo) { + va = pvo->pvo_vaddr; + + if (va & PVO_DEAD) + continue; + + /* Skip DMAP (except kernel area) */ + if (va >= DMAP_BASE_ADDRESS && va <= DMAP_MAX_ADDRESS) { + if (va & PVO_LARGE) { + pgva = va & ~moea64_large_page_mask; + if (pgva < kstart_lp || pgva >= kend_lp) + continue; + } else { + pgva = trunc_page(va); + if (pgva < kstart || pgva >= kend) + continue; + } + } + + pa = PVO_PADDR(pvo); + + if (va & PVO_LARGE) { + pa_end = pa + lpsize; + for (; pa < pa_end; pa += PAGE_SIZE) { + if (vm_phys_is_dumpable(pa)) + vm_page_dump_add(dump_bitset, pa); + } + } else { + if (vm_phys_is_dumpable(pa)) + vm_page_dump_add(dump_bitset, pa); + } + } + PMAP_UNLOCK(kernel_pmap); + + return (sizeof(struct lpte) * moea64_pteg_count * 8); +} + +static struct dump_context dump_ctx; + +static void * +moea64_dump_pmap_init(unsigned blkpgs) +{ + dump_ctx.ptex = 0; + dump_ctx.ptex_end = moea64_pteg_count * 8; + dump_ctx.blksz = blkpgs * PAGE_SIZE; + return (&dump_ctx); +} + +#else + +static size_t +moea64_scan_pmap(struct bitset *dump_bitset __unused) +{ + return (0); +} + +static void * +moea64_dump_pmap_init(unsigned blkpgs) +{ + return (NULL); +} + +#endif + +#ifdef __powerpc64__ +static void +moea64_map_range(vm_offset_t va, vm_paddr_t pa, vm_size_t npages) +{ + + for (; npages > 0; --npages) { + if (moea64_large_page_size != 0 && + (pa & moea64_large_page_mask) == 0 && + (va & moea64_large_page_mask) == 0 && + npages >= (moea64_large_page_size >> PAGE_SHIFT)) { + PMAP_LOCK(kernel_pmap); + moea64_kenter_large(va, pa, 0, 0); + PMAP_UNLOCK(kernel_pmap); + pa += moea64_large_page_size; + va += moea64_large_page_size; + npages -= (moea64_large_page_size >> PAGE_SHIFT) - 1; + } else { + moea64_kenter(va, pa); + pa += PAGE_SIZE; + va += PAGE_SIZE; + } + } +} + +static void +moea64_page_array_startup(long pages) +{ + long dom_pages[MAXMEMDOM]; + vm_paddr_t pa; + vm_offset_t va, vm_page_base; + vm_size_t needed, size; + int domain; + int i; + + vm_page_base = 0xd000000000000000ULL; + + /* Short-circuit single-domain systems. */ + if (vm_ndomains == 1) { + size = round_page(pages * sizeof(struct vm_page)); + pa = vm_phys_early_alloc(0, size); + vm_page_base = moea64_map(&vm_page_base, + pa, pa + size, VM_PROT_READ | VM_PROT_WRITE); + vm_page_array_size = pages; + vm_page_array = (vm_page_t)vm_page_base; + return; + } + + for (i = 0; i < MAXMEMDOM; i++) + dom_pages[i] = 0; + + /* Now get the number of pages required per domain. 
*/ + for (i = 0; i < vm_phys_nsegs; i++) { + domain = vm_phys_segs[i].domain; + KASSERT(domain < MAXMEMDOM, + ("Invalid vm_phys_segs NUMA domain %d!\n", domain)); + /* Get size of vm_page_array needed for this segment. */ + size = btoc(vm_phys_segs[i].end - vm_phys_segs[i].start); + dom_pages[domain] += size; + } + + for (i = 0; phys_avail[i + 1] != 0; i+= 2) { + domain = vm_phys_domain(phys_avail[i]); + KASSERT(domain < MAXMEMDOM, + ("Invalid phys_avail NUMA domain %d!\n", domain)); + size = btoc(phys_avail[i + 1] - phys_avail[i]); + dom_pages[domain] += size; + } + + /* + * Map in chunks that can get us all 16MB pages. There will be some + * overlap between domains, but that's acceptable for now. + */ + vm_page_array_size = 0; + va = vm_page_base; + for (i = 0; i < MAXMEMDOM && vm_page_array_size < pages; i++) { + if (dom_pages[i] == 0) + continue; + size = ulmin(pages - vm_page_array_size, dom_pages[i]); + size = round_page(size * sizeof(struct vm_page)); + needed = size; + size = roundup2(size, moea64_large_page_size); + pa = vm_phys_early_alloc(i, size); + vm_page_array_size += size / sizeof(struct vm_page); + moea64_map_range(va, pa, size >> PAGE_SHIFT); + /* Scoot up domain 0, to reduce the domain page overlap. */ + if (i == 0) + vm_page_base += size - needed; + va += size; + } + vm_page_array = (vm_page_t)vm_page_base; + vm_page_array_size = pages; +} +#endif + +static int64_t +moea64_null_method(void) +{ + return (0); +} + +static int64_t moea64_pte_replace_default(struct pvo_entry *pvo, int flags) +{ + int64_t refchg; + + refchg = moea64_pte_unset(pvo); + moea64_pte_insert(pvo); + + return (refchg); +} + +struct moea64_funcs *moea64_ops; + +#define DEFINE_OEA64_IFUNC(ret, func, args, def) \ + DEFINE_IFUNC(, ret, moea64_##func, args) { \ + moea64_##func##_t f; \ + if (moea64_ops == NULL) \ + return ((moea64_##func##_t)def); \ + f = moea64_ops->func; \ + return (f != NULL ? f : (moea64_##func##_t)def);\ + } + +void +moea64_install(void) +{ +#ifdef __powerpc64__ + if (hw_direct_map == -1) { + moea64_probe_large_page(); + + /* Use a direct map if we have large page support */ + if (moea64_large_page_size > 0) + hw_direct_map = 1; + else + hw_direct_map = 0; + } +#endif + + /* + * Default to non-DMAP, and switch over to DMAP functions once we know + * we have DMAP. 
+ */ + if (hw_direct_map) { + moea64_methods.quick_enter_page = moea64_quick_enter_page_dmap; + moea64_methods.quick_remove_page = NULL; + moea64_methods.copy_page = moea64_copy_page_dmap; + moea64_methods.zero_page = moea64_zero_page_dmap; + moea64_methods.copy_pages = moea64_copy_pages_dmap; + } +} + +DEFINE_OEA64_IFUNC(int64_t, pte_replace, (struct pvo_entry *, int), + moea64_pte_replace_default) +DEFINE_OEA64_IFUNC(int64_t, pte_insert, (struct pvo_entry *), moea64_null_method) +DEFINE_OEA64_IFUNC(int64_t, pte_unset, (struct pvo_entry *), moea64_null_method) +DEFINE_OEA64_IFUNC(int64_t, pte_clear, (struct pvo_entry *, uint64_t), + moea64_null_method) +DEFINE_OEA64_IFUNC(int64_t, pte_synch, (struct pvo_entry *), moea64_null_method) +DEFINE_OEA64_IFUNC(int64_t, pte_insert_sp, (struct pvo_entry *), moea64_null_method) +DEFINE_OEA64_IFUNC(int64_t, pte_unset_sp, (struct pvo_entry *), moea64_null_method) +DEFINE_OEA64_IFUNC(int64_t, pte_replace_sp, (struct pvo_entry *), moea64_null_method) + +/* Superpage functions */ + +/* MMU interface */ + +static bool +moea64_ps_enabled(pmap_t pmap) +{ + return (superpages_enabled); +} + +static void +moea64_align_superpage(vm_object_t object, vm_ooffset_t offset, + vm_offset_t *addr, vm_size_t size) +{ + vm_offset_t sp_offset; + + if (size < HPT_SP_SIZE) + return; + + CTR4(KTR_PMAP, "%s: offs=%#jx, addr=%p, size=%#jx", + __func__, (uintmax_t)offset, addr, (uintmax_t)size); + + if (object != NULL && (object->flags & OBJ_COLORED) != 0) + offset += ptoa(object->pg_color); + sp_offset = offset & HPT_SP_MASK; + if (size - ((HPT_SP_SIZE - sp_offset) & HPT_SP_MASK) < HPT_SP_SIZE || + (*addr & HPT_SP_MASK) == sp_offset) + return; + if ((*addr & HPT_SP_MASK) < sp_offset) + *addr = (*addr & ~HPT_SP_MASK) + sp_offset; + else + *addr = ((*addr + HPT_SP_MASK) & ~HPT_SP_MASK) + sp_offset; +} + +/* Helpers */ + +static __inline void +moea64_pvo_cleanup(struct pvo_dlist *tofree) +{ + struct pvo_entry *pvo; + + /* clean up */ + while (!SLIST_EMPTY(tofree)) { + pvo = SLIST_FIRST(tofree); + SLIST_REMOVE_HEAD(tofree, pvo_dlink); + if (pvo->pvo_vaddr & PVO_DEAD) + moea64_pvo_remove_from_page(pvo); + free_pvo_entry(pvo); + } +} + +static __inline uint16_t +pvo_to_vmpage_flags(struct pvo_entry *pvo) +{ + uint16_t flags; + + flags = 0; + if ((pvo->pvo_pte.prot & VM_PROT_WRITE) != 0) + flags |= PGA_WRITEABLE; + if ((pvo->pvo_pte.prot & VM_PROT_EXECUTE) != 0) + flags |= PGA_EXECUTABLE; + + return (flags); +} + +/* + * Check if the given pvo and its superpage are in sva-eva range. + */ +static __inline bool +moea64_sp_pvo_in_range(struct pvo_entry *pvo, vm_offset_t sva, vm_offset_t eva) +{ + vm_offset_t spva; + + spva = PVO_VADDR(pvo) & ~HPT_SP_MASK; + if (spva >= sva && spva + HPT_SP_SIZE <= eva) { + /* + * Because this function is intended to be called from loops + * that iterate over ordered pvo entries, if the condition + * above is true then the pvo must be the first of its + * superpage. + */ + KASSERT(PVO_VADDR(pvo) == spva, + ("%s: unexpected unaligned superpage pvo", __func__)); + return (true); + } + return (false); +} + +/* + * Update vm about the REF/CHG bits if the superpage is managed and + * has (or had) write access. 
+ */ +static void +moea64_sp_refchg_process(struct pvo_entry *sp, vm_page_t m, + int64_t sp_refchg, vm_prot_t prot) +{ + vm_page_t m_end; + int64_t refchg; + + if ((sp->pvo_vaddr & PVO_MANAGED) != 0 && (prot & VM_PROT_WRITE) != 0) { + for (m_end = &m[HPT_SP_PAGES]; m < m_end; m++) { + refchg = sp_refchg | + atomic_readandclear_32(&m->md.mdpg_attrs); + if (refchg & LPTE_CHG) + vm_page_dirty(m); + if (refchg & LPTE_REF) + vm_page_aflag_set(m, PGA_REFERENCED); + } + } +} + +/* Superpage ops */ + +static int +moea64_sp_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot, u_int flags, int8_t psind) +{ + struct pvo_entry *pvo, **pvos; + struct pvo_head *pvo_head; + vm_offset_t sva; + vm_page_t sm; + vm_paddr_t pa, spa; + bool sync; + struct pvo_dlist tofree; + int error __diagused, i; + uint16_t aflags; + + KASSERT((va & HPT_SP_MASK) == 0, ("%s: va %#jx unaligned", + __func__, (uintmax_t)va)); + KASSERT(psind == 1, ("%s: invalid psind: %d", __func__, psind)); + KASSERT(m->psind == 1, ("%s: invalid m->psind: %d", + __func__, m->psind)); + KASSERT(pmap != kernel_pmap, + ("%s: function called with kernel pmap", __func__)); + + CTR5(KTR_PMAP, "%s: va=%#jx, pa=%#jx, prot=%#x, flags=%#x, psind=1", + __func__, (uintmax_t)va, (uintmax_t)VM_PAGE_TO_PHYS(m), + prot, flags); + + SLIST_INIT(&tofree); + + sva = va; + sm = m; + spa = pa = VM_PAGE_TO_PHYS(sm); + + /* Try to allocate all PVOs first, to make failure handling easier. */ + pvos = malloc(HPT_SP_PAGES * sizeof(struct pvo_entry *), M_TEMP, + M_NOWAIT); + if (pvos == NULL) { + CTR1(KTR_PMAP, "%s: failed to alloc pvo array", __func__); + return (KERN_RESOURCE_SHORTAGE); + } + + for (i = 0; i < HPT_SP_PAGES; i++) { + pvos[i] = alloc_pvo_entry(0); + if (pvos[i] == NULL) { + CTR1(KTR_PMAP, "%s: failed to alloc pvo", __func__); + for (i = i - 1; i >= 0; i--) + free_pvo_entry(pvos[i]); + free(pvos, M_TEMP); + return (KERN_RESOURCE_SHORTAGE); + } + } + + SP_PV_LOCK_ALIGNED(spa); + PMAP_LOCK(pmap); + + /* Note: moea64_remove_locked() also clears cached REF/CHG bits. */ + moea64_remove_locked(pmap, va, va + HPT_SP_SIZE, &tofree); + + /* Enter pages */ + for (i = 0; i < HPT_SP_PAGES; + i++, va += PAGE_SIZE, pa += PAGE_SIZE, m++) { + pvo = pvos[i]; + + pvo->pvo_pte.prot = prot; + pvo->pvo_pte.pa = (pa & ~HPT_SP_MASK) | LPTE_LP_4K_16M | + moea64_calc_wimg(pa, pmap_page_get_memattr(m)); + + if ((flags & PMAP_ENTER_WIRED) != 0) + pvo->pvo_vaddr |= PVO_WIRED; + pvo->pvo_vaddr |= PVO_LARGE; + + if ((m->oflags & VPO_UNMANAGED) != 0) + pvo_head = NULL; + else { + pvo_head = &m->md.mdpg_pvoh; + pvo->pvo_vaddr |= PVO_MANAGED; + } + + init_pvo_entry(pvo, pmap, va); + + error = moea64_pvo_enter(pvo, pvo_head, NULL); + /* + * All superpage PVOs were previously removed, so no errors + * should occur while inserting the new ones. + */ + KASSERT(error == 0, ("%s: unexpected error " + "when inserting superpage PVO: %d", + __func__, error)); + } + + PMAP_UNLOCK(pmap); + SP_PV_UNLOCK_ALIGNED(spa); + + sync = (sm->a.flags & PGA_EXECUTABLE) == 0; + /* Note: moea64_pvo_cleanup() also clears page prot. flags. */ + moea64_pvo_cleanup(&tofree); + pvo = pvos[0]; + + /* Set vm page flags */ + aflags = pvo_to_vmpage_flags(pvo); + if (aflags != 0) + for (m = sm; m < &sm[HPT_SP_PAGES]; m++) + vm_page_aflag_set(m, aflags); + + /* + * Flush the page from the instruction cache if this page is + * mapped executable and cacheable. 
+ */ + if (sync && (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) + moea64_syncicache(pmap, sva, spa, HPT_SP_SIZE); + + atomic_add_long(&sp_mappings, 1); + CTR3(KTR_PMAP, "%s: SP success for va %#jx in pmap %p", + __func__, (uintmax_t)sva, pmap); + + free(pvos, M_TEMP); + return (KERN_SUCCESS); +} + +#if VM_NRESERVLEVEL > 0 +static void +moea64_sp_promote(pmap_t pmap, vm_offset_t va, vm_page_t m) +{ + struct pvo_entry *first, *pvo; + vm_paddr_t pa, pa_end; + vm_offset_t sva, va_end; + int64_t sp_refchg; + + /* This CTR may generate a lot of output. */ + /* CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)va); */ + + va &= ~HPT_SP_MASK; + sva = va; + /* Get superpage */ + pa = VM_PAGE_TO_PHYS(m) & ~HPT_SP_MASK; + m = PHYS_TO_VM_PAGE(pa); + + PMAP_LOCK(pmap); + + /* + * Check if all pages meet promotion criteria. + * + * XXX In some cases the loop below may be executed for each or most + * of the entered pages of a superpage, which can be expensive + * (although it was not profiled) and need some optimization. + * + * Some cases where this seems to happen are: + * - When a superpage is first entered read-only and later becomes + * read-write. + * - When some of the superpage's virtual addresses map to previously + * wired/cached pages while others map to pages allocated from a + * different physical address range. A common scenario where this + * happens is when mmap'ing a file that is already present in FS + * block cache and doesn't fill a superpage. + */ + first = pvo = moea64_pvo_find_va(pmap, sva); + for (pa_end = pa + HPT_SP_SIZE; + pa < pa_end; pa += PAGE_SIZE, va += PAGE_SIZE) { + if (pvo == NULL || (pvo->pvo_vaddr & PVO_DEAD) != 0) { + CTR3(KTR_PMAP, + "%s: NULL or dead PVO: pmap=%p, va=%#jx", + __func__, pmap, (uintmax_t)va); + goto error; + } + if (PVO_PADDR(pvo) != pa) { + CTR5(KTR_PMAP, "%s: PAs don't match: " + "pmap=%p, va=%#jx, pvo_pa=%#jx, exp_pa=%#jx", + __func__, pmap, (uintmax_t)va, + (uintmax_t)PVO_PADDR(pvo), (uintmax_t)pa); + atomic_add_long(&sp_p_fail_pa, 1); + goto error; + } + if ((first->pvo_vaddr & PVO_FLAGS_PROMOTE) != + (pvo->pvo_vaddr & PVO_FLAGS_PROMOTE)) { + CTR5(KTR_PMAP, "%s: PVO flags don't match: " + "pmap=%p, va=%#jx, pvo_flags=%#jx, exp_flags=%#jx", + __func__, pmap, (uintmax_t)va, + (uintmax_t)(pvo->pvo_vaddr & PVO_FLAGS_PROMOTE), + (uintmax_t)(first->pvo_vaddr & PVO_FLAGS_PROMOTE)); + atomic_add_long(&sp_p_fail_flags, 1); + goto error; + } + if (first->pvo_pte.prot != pvo->pvo_pte.prot) { + CTR5(KTR_PMAP, "%s: PVO protections don't match: " + "pmap=%p, va=%#jx, pvo_prot=%#x, exp_prot=%#x", + __func__, pmap, (uintmax_t)va, + pvo->pvo_pte.prot, first->pvo_pte.prot); + atomic_add_long(&sp_p_fail_prot, 1); + goto error; + } + if ((first->pvo_pte.pa & LPTE_WIMG) != + (pvo->pvo_pte.pa & LPTE_WIMG)) { + CTR5(KTR_PMAP, "%s: WIMG bits don't match: " + "pmap=%p, va=%#jx, pvo_wimg=%#jx, exp_wimg=%#jx", + __func__, pmap, (uintmax_t)va, + (uintmax_t)(pvo->pvo_pte.pa & LPTE_WIMG), + (uintmax_t)(first->pvo_pte.pa & LPTE_WIMG)); + atomic_add_long(&sp_p_fail_wimg, 1); + goto error; + } + + pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo); + } + + /* All OK, promote. */ + + /* + * Handle superpage REF/CHG bits. If REF or CHG is set in + * any page, then it must be set in the superpage. + * + * Instead of querying each page, we take advantage of two facts: + * 1- If a page is being promoted, it was referenced. + * 2- If promoted pages are writable, they were modified. + */ + sp_refchg = LPTE_REF | + ((first->pvo_pte.prot & VM_PROT_WRITE) != 0 ? 
LPTE_CHG : 0); + + /* Promote pages */ + + for (pvo = first, va_end = PVO_VADDR(pvo) + HPT_SP_SIZE; + pvo != NULL && PVO_VADDR(pvo) < va_end; + pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) { + pvo->pvo_pte.pa &= ADDR_POFF | ~HPT_SP_MASK; + pvo->pvo_pte.pa |= LPTE_LP_4K_16M; + pvo->pvo_vaddr |= PVO_LARGE; + } + moea64_pte_replace_sp(first); + + /* Send REF/CHG bits to VM */ + moea64_sp_refchg_process(first, m, sp_refchg, first->pvo_pte.prot); + + /* Use first page to cache REF/CHG bits */ + atomic_set_32(&m->md.mdpg_attrs, sp_refchg | MDPG_ATTR_SP); + + PMAP_UNLOCK(pmap); + + atomic_add_long(&sp_mappings, 1); + atomic_add_long(&sp_promotions, 1); + CTR3(KTR_PMAP, "%s: success for va %#jx in pmap %p", + __func__, (uintmax_t)sva, pmap); + return; + +error: + atomic_add_long(&sp_p_failures, 1); + PMAP_UNLOCK(pmap); +} +#endif + +static void +moea64_sp_demote_aligned(struct pvo_entry *sp) +{ + struct pvo_entry *pvo; + vm_offset_t va, va_end; + vm_paddr_t pa; + vm_page_t m; + pmap_t pmap __diagused; + int64_t refchg; + + CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp)); + + pmap = sp->pvo_pmap; + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + pvo = sp; + + /* Demote pages */ + + va = PVO_VADDR(pvo); + pa = PVO_PADDR(pvo); + m = PHYS_TO_VM_PAGE(pa); + + for (pvo = sp, va_end = va + HPT_SP_SIZE; + pvo != NULL && PVO_VADDR(pvo) < va_end; + pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo), + va += PAGE_SIZE, pa += PAGE_SIZE) { + KASSERT(pvo && PVO_VADDR(pvo) == va, + ("%s: missing PVO for va %#jx", __func__, (uintmax_t)va)); + + pvo->pvo_vaddr &= ~PVO_LARGE; + pvo->pvo_pte.pa &= ~LPTE_RPGN; + pvo->pvo_pte.pa |= pa; + + } + refchg = moea64_pte_replace_sp(sp); + + /* + * Clear SP flag + * + * XXX It is possible that another pmap has this page mapped as + * part of a superpage, but as the SP flag is used only for + * caching SP REF/CHG bits, that will be queried if not set + * in cache, it should be ok to clear it here. + */ + atomic_clear_32(&m->md.mdpg_attrs, MDPG_ATTR_SP); + + /* + * Handle superpage REF/CHG bits. A bit set in the superpage + * means all pages should consider it set. 
+ */ + moea64_sp_refchg_process(sp, m, refchg, sp->pvo_pte.prot); + + atomic_add_long(&sp_demotions, 1); + CTR3(KTR_PMAP, "%s: success for va %#jx in pmap %p", + __func__, (uintmax_t)PVO_VADDR(sp), pmap); +} + +static void +moea64_sp_demote(struct pvo_entry *pvo) +{ + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); + + if ((PVO_VADDR(pvo) & HPT_SP_MASK) != 0) { + pvo = moea64_pvo_find_va(pvo->pvo_pmap, + PVO_VADDR(pvo) & ~HPT_SP_MASK); + KASSERT(pvo != NULL, ("%s: missing PVO for va %#jx", + __func__, (uintmax_t)(PVO_VADDR(pvo) & ~HPT_SP_MASK))); + } + moea64_sp_demote_aligned(pvo); +} + +static struct pvo_entry * +moea64_sp_unwire(struct pvo_entry *sp) +{ + struct pvo_entry *pvo, *prev; + vm_offset_t eva; + pmap_t pm; + int64_t ret, refchg; + + CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp)); + + pm = sp->pvo_pmap; + PMAP_LOCK_ASSERT(pm, MA_OWNED); + + eva = PVO_VADDR(sp) + HPT_SP_SIZE; + refchg = 0; + for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; + prev = pvo, pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { + if ((pvo->pvo_vaddr & PVO_WIRED) == 0) + panic("%s: pvo %p is missing PVO_WIRED", + __func__, pvo); + pvo->pvo_vaddr &= ~PVO_WIRED; + + ret = moea64_pte_replace(pvo, 0 /* No invalidation */); + if (ret < 0) + refchg |= LPTE_CHG; + else + refchg |= ret; + + pm->pm_stats.wired_count--; + } + + /* Send REF/CHG bits to VM */ + moea64_sp_refchg_process(sp, PHYS_TO_VM_PAGE(PVO_PADDR(sp)), + refchg, sp->pvo_pte.prot); + + return (prev); +} + +static struct pvo_entry * +moea64_sp_protect(struct pvo_entry *sp, vm_prot_t prot) +{ + struct pvo_entry *pvo, *prev; + vm_offset_t eva; + pmap_t pm; + vm_page_t m, m_end; + int64_t ret, refchg; + vm_prot_t oldprot; + + CTR3(KTR_PMAP, "%s: va=%#jx, prot=%x", + __func__, (uintmax_t)PVO_VADDR(sp), prot); + + pm = sp->pvo_pmap; + PMAP_LOCK_ASSERT(pm, MA_OWNED); + + oldprot = sp->pvo_pte.prot; + m = PHYS_TO_VM_PAGE(PVO_PADDR(sp)); + KASSERT(m != NULL, ("%s: missing vm page for pa %#jx", + __func__, (uintmax_t)PVO_PADDR(sp))); + eva = PVO_VADDR(sp) + HPT_SP_SIZE; + refchg = 0; + + for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; + prev = pvo, pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { + pvo->pvo_pte.prot = prot; + /* + * If the PVO is in the page table, update mapping + */ + ret = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE); + if (ret < 0) + refchg |= LPTE_CHG; + else + refchg |= ret; + } + + /* Send REF/CHG bits to VM */ + moea64_sp_refchg_process(sp, m, refchg, oldprot); + + /* Handle pages that became executable */ + if ((m->a.flags & PGA_EXECUTABLE) == 0 && + (sp->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { + if ((m->oflags & VPO_UNMANAGED) == 0) + for (m_end = &m[HPT_SP_PAGES]; m < m_end; m++) + vm_page_aflag_set(m, PGA_EXECUTABLE); + moea64_syncicache(pm, PVO_VADDR(sp), PVO_PADDR(sp), + HPT_SP_SIZE); + } + + return (prev); +} + +static struct pvo_entry * +moea64_sp_remove(struct pvo_entry *sp, struct pvo_dlist *tofree) +{ + struct pvo_entry *pvo, *tpvo; + vm_offset_t eva; + pmap_t pm __diagused; + + CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp)); + + pm = sp->pvo_pmap; + PMAP_LOCK_ASSERT(pm, MA_OWNED); + + eva = PVO_VADDR(sp) + HPT_SP_SIZE; + for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { + tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); + + /* + * For locking reasons, remove this from the page table and + * pmap, but save delinking from the vm_page for a second + * pass + */ + moea64_pvo_remove_from_pmap(pvo); + SLIST_INSERT_HEAD(tofree, pvo, pvo_dlink); + } + + /* + * Clear 
SP bit + * + * XXX See comment in moea64_sp_demote_aligned() for why it's + * ok to always clear the SP bit on remove/demote. + */ + atomic_clear_32(&PHYS_TO_VM_PAGE(PVO_PADDR(sp))->md.mdpg_attrs, + MDPG_ATTR_SP); + + return (tpvo); +} + +static int64_t +moea64_sp_query_locked(struct pvo_entry *pvo, uint64_t ptebit) +{ + int64_t refchg, ret; + vm_offset_t eva; + vm_page_t m; + pmap_t pmap; + struct pvo_entry *sp; + + pmap = pvo->pvo_pmap; + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + /* Get first SP PVO */ + if ((PVO_VADDR(pvo) & HPT_SP_MASK) != 0) { + sp = moea64_pvo_find_va(pmap, PVO_VADDR(pvo) & ~HPT_SP_MASK); + KASSERT(sp != NULL, ("%s: missing PVO for va %#jx", + __func__, (uintmax_t)(PVO_VADDR(pvo) & ~HPT_SP_MASK))); + } else + sp = pvo; + eva = PVO_VADDR(sp) + HPT_SP_SIZE; + + refchg = 0; + for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; + pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) { + ret = moea64_pte_synch(pvo); + if (ret > 0) { + refchg |= ret & (LPTE_CHG | LPTE_REF); + if ((refchg & ptebit) != 0) + break; + } + } + + /* Save results */ + if (refchg != 0) { + m = PHYS_TO_VM_PAGE(PVO_PADDR(sp)); + atomic_set_32(&m->md.mdpg_attrs, refchg | MDPG_ATTR_SP); + } + + return (refchg); +} + +static int64_t +moea64_sp_query(struct pvo_entry *pvo, uint64_t ptebit) +{ + int64_t refchg; + pmap_t pmap; + + pmap = pvo->pvo_pmap; + PMAP_LOCK(pmap); + + /* + * Check if SP was demoted/removed before pmap lock was acquired. + */ + if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) { + CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx", + __func__, (uintmax_t)PVO_PADDR(pvo)); + PMAP_UNLOCK(pmap); + return (-1); + } + + refchg = moea64_sp_query_locked(pvo, ptebit); + PMAP_UNLOCK(pmap); + + CTR4(KTR_PMAP, "%s: va=%#jx, pa=%#jx: refchg=%#jx", + __func__, (uintmax_t)PVO_VADDR(pvo), + (uintmax_t)PVO_PADDR(pvo), (uintmax_t)refchg); + + return (refchg); +} + +static int64_t +moea64_sp_pvo_clear(struct pvo_entry *pvo, uint64_t ptebit) +{ + int64_t refchg, ret; + pmap_t pmap; + struct pvo_entry *sp; + vm_offset_t eva; + vm_page_t m; + + pmap = pvo->pvo_pmap; + PMAP_LOCK(pmap); + + /* + * Check if SP was demoted/removed before pmap lock was acquired. + */ + if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) { + CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx", + __func__, (uintmax_t)PVO_PADDR(pvo)); + PMAP_UNLOCK(pmap); + return (-1); + } + + /* Get first SP PVO */ + if ((PVO_VADDR(pvo) & HPT_SP_MASK) != 0) { + sp = moea64_pvo_find_va(pmap, PVO_VADDR(pvo) & ~HPT_SP_MASK); + KASSERT(sp != NULL, ("%s: missing PVO for va %#jx", + __func__, (uintmax_t)(PVO_VADDR(pvo) & ~HPT_SP_MASK))); + } else + sp = pvo; + eva = PVO_VADDR(sp) + HPT_SP_SIZE; + + refchg = 0; + for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; + pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) { + ret = moea64_pte_clear(pvo, ptebit); + if (ret > 0) + refchg |= ret & (LPTE_CHG | LPTE_REF); + } + + m = PHYS_TO_VM_PAGE(PVO_PADDR(sp)); + atomic_clear_32(&m->md.mdpg_attrs, ptebit); + PMAP_UNLOCK(pmap); + + CTR4(KTR_PMAP, "%s: va=%#jx, pa=%#jx: refchg=%#jx", + __func__, (uintmax_t)PVO_VADDR(sp), + (uintmax_t)PVO_PADDR(sp), (uintmax_t)refchg); + + return (refchg); +} + +static int64_t +moea64_sp_clear(struct pvo_entry *pvo, vm_page_t m, uint64_t ptebit) +{ + int64_t count, ret; + pmap_t pmap; + + count = 0; + pmap = pvo->pvo_pmap; + + /* + * Since this reference bit is shared by 4096 4KB pages, it + * should not be cleared every time it is tested. 
Apply a + * simple "hash" function on the physical page number, the + * virtual superpage number, and the pmap address to select + * one 4KB page out of the 4096 on which testing the + * reference bit will result in clearing that reference bit. + * This function is designed to avoid the selection of the + * same 4KB page for every 16MB page mapping. + * + * Always leave the reference bit of a wired mapping set, as + * the current state of its reference bit won't affect page + * replacement. + */ + if (ptebit == LPTE_REF && (((VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) ^ + (PVO_VADDR(pvo) >> HPT_SP_SHIFT) ^ (uintptr_t)pmap) & + (HPT_SP_PAGES - 1)) == 0 && (pvo->pvo_vaddr & PVO_WIRED) == 0) { + if ((ret = moea64_sp_pvo_clear(pvo, ptebit)) == -1) + return (-1); + + if ((ret & ptebit) != 0) + count++; + + /* + * If this page was not selected by the hash function, then assume + * its REF bit was set. + */ + } else if (ptebit == LPTE_REF) { + count++; + + /* + * To clear the CHG bit of a single SP page, first it must be demoted. + * But if no CHG bit is set, no bit clear and thus no SP demotion is + * needed. + */ + } else { + CTR4(KTR_PMAP, "%s: ptebit=%#jx, va=%#jx, pa=%#jx", + __func__, (uintmax_t)ptebit, (uintmax_t)PVO_VADDR(pvo), + (uintmax_t)PVO_PADDR(pvo)); + + PMAP_LOCK(pmap); + + /* + * Make sure SP wasn't demoted/removed before pmap lock + * was acquired. + */ + if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) { + CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx", + __func__, (uintmax_t)PVO_PADDR(pvo)); + PMAP_UNLOCK(pmap); + return (-1); + } + + ret = moea64_sp_query_locked(pvo, ptebit); + if ((ret & ptebit) != 0) + count++; + else { + PMAP_UNLOCK(pmap); + return (0); + } + + moea64_sp_demote(pvo); + moea64_pte_clear(pvo, ptebit); + + /* + * Write protect the mapping to a single page so that a + * subsequent write access may repromote. + */ + if ((pvo->pvo_vaddr & PVO_WIRED) == 0) + moea64_pvo_protect(pmap, pvo, + pvo->pvo_pte.prot & ~VM_PROT_WRITE); + + PMAP_UNLOCK(pmap); + } + + return (count); +} diff --git a/sys/powerpc/aim/mmu_oea64.h b/sys/powerpc/aim/mmu_oea64.h new file mode 100644 index 000000000000..38b743159980 --- /dev/null +++ b/sys/powerpc/aim/mmu_oea64.h @@ -0,0 +1,143 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (C) 2010 Nathan Whitehorn + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _POWERPC_AIM_MMU_OEA64_H +#define _POWERPC_AIM_MMU_OEA64_H + +#include "opt_pmap.h" + +#include <vm/vm_extern.h> +#include <machine/mmuvar.h> + +struct dump_context { + u_long ptex; + u_long ptex_end; + size_t blksz; +}; + +extern const struct mmu_kobj oea64_mmu; + +/* + * Helper routines + */ + +/* Allocate physical memory for use in moea64_bootstrap. */ +vm_offset_t moea64_bootstrap_alloc(vm_size_t size, vm_size_t align); +/* Set an LPTE structure to match the contents of a PVO */ +void moea64_pte_from_pvo(const struct pvo_entry *pvo, struct lpte *lpte); + +/* + * Flags + */ + +#define MOEA64_PTE_PROT_UPDATE 1 +#define MOEA64_PTE_INVALIDATE 2 + +/* + * Bootstrap subroutines + * + * An MMU_BOOTSTRAP() implementation looks like this: + * moea64_early_bootstrap(); + * Allocate Page Table + * moea64_mid_bootstrap(); + * Add mappings for MMU resources + * moea64_late_bootstrap(); + */ + +void moea64_early_bootstrap(vm_offset_t kernelstart, + vm_offset_t kernelend); +void moea64_mid_bootstrap(vm_offset_t kernelstart, + vm_offset_t kernelend); +void moea64_late_bootstrap(vm_offset_t kernelstart, + vm_offset_t kernelend); + +/* "base" install method for initializing moea64 pmap ifuncs */ +void moea64_install(void); + +int64_t moea64_pte_replace(struct pvo_entry *, int); +int64_t moea64_pte_insert(struct pvo_entry *); +int64_t moea64_pte_unset(struct pvo_entry *); +int64_t moea64_pte_clear(struct pvo_entry *, uint64_t); +int64_t moea64_pte_synch(struct pvo_entry *); +int64_t moea64_pte_insert_sp(struct pvo_entry *); +int64_t moea64_pte_unset_sp(struct pvo_entry *); +int64_t moea64_pte_replace_sp(struct pvo_entry *); + +typedef int64_t (*moea64_pte_replace_t)(struct pvo_entry *, int); +typedef int64_t (*moea64_pte_insert_t)(struct pvo_entry *); +typedef int64_t (*moea64_pte_unset_t)(struct pvo_entry *); +typedef int64_t (*moea64_pte_clear_t)(struct pvo_entry *, uint64_t); +typedef int64_t (*moea64_pte_synch_t)(struct pvo_entry *); +typedef int64_t (*moea64_pte_insert_sp_t)(struct pvo_entry *); +typedef int64_t (*moea64_pte_unset_sp_t)(struct pvo_entry *); +typedef int64_t (*moea64_pte_replace_sp_t)(struct pvo_entry *); + +struct moea64_funcs { + moea64_pte_replace_t pte_replace; + moea64_pte_insert_t pte_insert; + moea64_pte_unset_t pte_unset; + moea64_pte_clear_t pte_clear; + moea64_pte_synch_t pte_synch; + moea64_pte_insert_sp_t pte_insert_sp; + moea64_pte_unset_sp_t pte_unset_sp; + moea64_pte_replace_sp_t pte_replace_sp; +}; + +extern struct moea64_funcs *moea64_ops; + +static inline uint64_t +moea64_pte_vpn_from_pvo_vpn(const struct pvo_entry *pvo) +{ + return ((pvo->pvo_vpn >> (ADDR_API_SHFT64 - ADDR_PIDX_SHFT)) & + LPTE_AVPN_MASK); +} + +/* + * Statistics + */ + +#ifdef MOEA64_STATS +extern u_int moea64_pte_valid; +extern u_int moea64_pte_overflow; +#define STAT_MOEA64(x) x +#else +#define STAT_MOEA64(x) ((void)0) +#endif + +/* + * State variables + */ + +extern int moea64_large_page_shift; +extern uint64_t moea64_large_page_size; +extern uint64_t 
moea64_large_page_mask; +extern u_long moea64_pteg_count; +extern u_long moea64_pteg_mask; +extern int n_slbs; +extern bool moea64_has_lp_4k_16m; + +#endif /* _POWERPC_AIM_MMU_OEA64_H */ diff --git a/sys/powerpc/aim/mmu_radix.c b/sys/powerpc/aim/mmu_radix.c new file mode 100644 index 000000000000..a12142fc2d7b --- /dev/null +++ b/sys/powerpc/aim/mmu_radix.c @@ -0,0 +1,6552 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2018 Matthew Macy + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "opt_platform.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/bitstring.h> +#include <sys/queue.h> +#include <sys/cpuset.h> +#include <sys/endian.h> +#include <sys/kerneldump.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/syslog.h> +#include <sys/msgbuf.h> +#include <sys/malloc.h> +#include <sys/mman.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/sched.h> +#include <sys/sysctl.h> +#include <sys/systm.h> +#include <sys/vmem.h> +#include <sys/vmmeter.h> +#include <sys/smp.h> + +#include <sys/kdb.h> + +#include <dev/ofw/openfirm.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/vm_pageout.h> +#include <vm/vm_phys.h> +#include <vm/vm_radix.h> +#include <vm/vm_reserv.h> +#include <vm/vm_dumpset.h> +#include <vm/uma.h> + +#include <machine/_inttypes.h> +#include <machine/cpu.h> +#include <machine/platform.h> +#include <machine/frame.h> +#include <machine/md_var.h> +#include <machine/psl.h> +#include <machine/bat.h> +#include <machine/hid.h> +#include <machine/pte.h> +#include <machine/sr.h> +#include <machine/trap.h> +#include <machine/mmuvar.h> + +/* For pseries bit. 
*/ +#include <powerpc/pseries/phyp-hvcall.h> + +#ifdef INVARIANTS +#include <vm/uma_dbg.h> +#endif + +#define PPC_BITLSHIFT(bit) (sizeof(long)*NBBY - 1 - (bit)) +#define PPC_BIT(bit) (1UL << PPC_BITLSHIFT(bit)) +#define PPC_BITLSHIFT_VAL(val, bit) ((val) << PPC_BITLSHIFT(bit)) + +#include "opt_ddb.h" + +#ifdef DDB +static void pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va); +#endif + +#define PG_W RPTE_WIRED +#define PG_V RPTE_VALID +#define PG_MANAGED RPTE_MANAGED +#define PG_PROMOTED RPTE_PROMOTED +#define PG_M RPTE_C +#define PG_A RPTE_R +#define PG_X RPTE_EAA_X +#define PG_RW RPTE_EAA_W +#define PG_PTE_CACHE RPTE_ATTR_MASK + +#define RPTE_SHIFT 9 +#define NLS_MASK ((1UL<<5)-1) +#define RPTE_ENTRIES (1UL<<RPTE_SHIFT) +#define RPTE_MASK (RPTE_ENTRIES-1) + +#define NLB_SHIFT 0 +#define NLB_MASK (((1UL<<52)-1) << 8) + +extern int nkpt; +extern caddr_t crashdumpmap; + +#define RIC_FLUSH_TLB 0 +#define RIC_FLUSH_PWC 1 +#define RIC_FLUSH_ALL 2 + +#define POWER9_TLB_SETS_RADIX 128 /* # sets in POWER9 TLB Radix mode */ + +#define PPC_INST_TLBIE 0x7c000264 +#define PPC_INST_TLBIEL 0x7c000224 +#define PPC_INST_SLBIA 0x7c0003e4 + +#define ___PPC_RA(a) (((a) & 0x1f) << 16) +#define ___PPC_RB(b) (((b) & 0x1f) << 11) +#define ___PPC_RS(s) (((s) & 0x1f) << 21) +#define ___PPC_RT(t) ___PPC_RS(t) +#define ___PPC_R(r) (((r) & 0x1) << 16) +#define ___PPC_PRS(prs) (((prs) & 0x1) << 17) +#define ___PPC_RIC(ric) (((ric) & 0x3) << 18) + +#define PPC_SLBIA(IH) __XSTRING(.long PPC_INST_SLBIA | \ + ((IH & 0x7) << 21)) +#define PPC_TLBIE_5(rb,rs,ric,prs,r) \ + __XSTRING(.long PPC_INST_TLBIE | \ + ___PPC_RB(rb) | ___PPC_RS(rs) | \ + ___PPC_RIC(ric) | ___PPC_PRS(prs) | \ + ___PPC_R(r)) + +#define PPC_TLBIEL(rb,rs,ric,prs,r) \ + __XSTRING(.long PPC_INST_TLBIEL | \ + ___PPC_RB(rb) | ___PPC_RS(rs) | \ + ___PPC_RIC(ric) | ___PPC_PRS(prs) | \ + ___PPC_R(r)) + +#define PPC_INVALIDATE_ERAT PPC_SLBIA(7) + +static __inline void +ttusync(void) +{ + __asm __volatile("eieio; tlbsync; ptesync" ::: "memory"); +} + +#define TLBIEL_INVAL_SEL_MASK 0xc00 /* invalidation selector */ +#define TLBIEL_INVAL_PAGE 0x000 /* invalidate a single page */ +#define TLBIEL_INVAL_SET_PID 0x400 /* invalidate a set for the current PID */ +#define TLBIEL_INVAL_SET_LPID 0x800 /* invalidate a set for current LPID */ +#define TLBIEL_INVAL_SET 0xc00 /* invalidate a set for all LPIDs */ + +#define TLBIE_ACTUAL_PAGE_MASK 0xe0 +#define TLBIE_ACTUAL_PAGE_4K 0x00 +#define TLBIE_ACTUAL_PAGE_64K 0xa0 +#define TLBIE_ACTUAL_PAGE_2M 0x20 +#define TLBIE_ACTUAL_PAGE_1G 0x40 + +#define TLBIE_PRS_PARTITION_SCOPE 0x0 +#define TLBIE_PRS_PROCESS_SCOPE 0x1 + +#define TLBIE_RIC_INVALIDATE_TLB 0x0 /* Invalidate just TLB */ +#define TLBIE_RIC_INVALIDATE_PWC 0x1 /* Invalidate just PWC */ +#define TLBIE_RIC_INVALIDATE_ALL 0x2 /* Invalidate TLB, PWC, + * cached {proc, part}tab entries + */ +#define TLBIE_RIC_INVALIDATE_SEQ 0x3 /* HPT - only: + * Invalidate a range of translations + */ + +static __always_inline void +radix_tlbie(uint8_t ric, uint8_t prs, uint16_t is, uint32_t pid, uint32_t lpid, + vm_offset_t va, uint16_t ap) +{ + uint64_t rb, rs; + + MPASS((va & PAGE_MASK) == 0); + + rs = ((uint64_t)pid << 32) | lpid; + rb = va | is | ap; + __asm __volatile(PPC_TLBIE_5(%0, %1, %2, %3, 1) : : + "r" (rb), "r" (rs), "i" (ric), "i" (prs) : "memory"); +} + +static __inline void +radix_tlbie_fixup(uint32_t pid, vm_offset_t va, int ap) +{ + + __asm __volatile("ptesync" ::: "memory"); + radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE, + TLBIEL_INVAL_PAGE, 0, 0, va, ap); 
+ __asm __volatile("ptesync" ::: "memory"); + radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE, + TLBIEL_INVAL_PAGE, pid, 0, va, ap); +} + +static __inline void +radix_tlbie_invlpg_user_4k(uint32_t pid, vm_offset_t va) +{ + + radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE, + TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_4K); + radix_tlbie_fixup(pid, va, TLBIE_ACTUAL_PAGE_4K); +} + +static __inline void +radix_tlbie_invlpg_user_2m(uint32_t pid, vm_offset_t va) +{ + + radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE, + TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_2M); + radix_tlbie_fixup(pid, va, TLBIE_ACTUAL_PAGE_2M); +} + +static __inline void +radix_tlbie_invlpwc_user(uint32_t pid) +{ + + radix_tlbie(TLBIE_RIC_INVALIDATE_PWC, TLBIE_PRS_PROCESS_SCOPE, + TLBIEL_INVAL_SET_PID, pid, 0, 0, 0); +} + +static __inline void +radix_tlbie_flush_user(uint32_t pid) +{ + + radix_tlbie(TLBIE_RIC_INVALIDATE_ALL, TLBIE_PRS_PROCESS_SCOPE, + TLBIEL_INVAL_SET_PID, pid, 0, 0, 0); +} + +static __inline void +radix_tlbie_invlpg_kernel_4k(vm_offset_t va) +{ + + radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE, + TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_4K); + radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_4K); +} + +static __inline void +radix_tlbie_invlpg_kernel_2m(vm_offset_t va) +{ + + radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE, + TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_2M); + radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_2M); +} + +/* 1GB pages aren't currently supported. */ +static __inline __unused void +radix_tlbie_invlpg_kernel_1g(vm_offset_t va) +{ + + radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE, + TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_1G); + radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_1G); +} + +static __inline void +radix_tlbie_invlpwc_kernel(void) +{ + + radix_tlbie(TLBIE_RIC_INVALIDATE_PWC, TLBIE_PRS_PROCESS_SCOPE, + TLBIEL_INVAL_SET_LPID, 0, 0, 0, 0); +} + +static __inline void +radix_tlbie_flush_kernel(void) +{ + + radix_tlbie(TLBIE_RIC_INVALIDATE_ALL, TLBIE_PRS_PROCESS_SCOPE, + TLBIEL_INVAL_SET_LPID, 0, 0, 0, 0); +} + +static __inline vm_pindex_t +pmap_l3e_pindex(vm_offset_t va) +{ + return ((va & PG_FRAME) >> L3_PAGE_SIZE_SHIFT); +} + +static __inline vm_pindex_t +pmap_pml3e_index(vm_offset_t va) +{ + + return ((va >> L3_PAGE_SIZE_SHIFT) & RPTE_MASK); +} + +static __inline vm_pindex_t +pmap_pml2e_index(vm_offset_t va) +{ + return ((va >> L2_PAGE_SIZE_SHIFT) & RPTE_MASK); +} + +static __inline vm_pindex_t +pmap_pml1e_index(vm_offset_t va) +{ + return ((va & PG_FRAME) >> L1_PAGE_SIZE_SHIFT); +} + +/* Return various clipped indexes for a given VA */ +static __inline vm_pindex_t +pmap_pte_index(vm_offset_t va) +{ + + return ((va >> PAGE_SHIFT) & RPTE_MASK); +} + +/* Return a pointer to the PT slot that corresponds to a VA */ +static __inline pt_entry_t * +pmap_l3e_to_pte(pt_entry_t *l3e, vm_offset_t va) +{ + pt_entry_t *pte; + vm_paddr_t ptepa; + + ptepa = (be64toh(*l3e) & NLB_MASK); + pte = (pt_entry_t *)PHYS_TO_DMAP(ptepa); + return (&pte[pmap_pte_index(va)]); +} + +/* Return a pointer to the PD slot that corresponds to a VA */ +static __inline pt_entry_t * +pmap_l2e_to_l3e(pt_entry_t *l2e, vm_offset_t va) +{ + pt_entry_t *l3e; + vm_paddr_t l3pa; + + l3pa = (be64toh(*l2e) & NLB_MASK); + l3e = (pml3_entry_t *)PHYS_TO_DMAP(l3pa); + return (&l3e[pmap_pml3e_index(va)]); +} + +/* Return a pointer to the PD slot that corresponds to a VA */ +static __inline pt_entry_t * 
+pmap_l1e_to_l2e(pt_entry_t *l1e, vm_offset_t va) +{ + pt_entry_t *l2e; + vm_paddr_t l2pa; + + l2pa = (be64toh(*l1e) & NLB_MASK); + + l2e = (pml2_entry_t *)PHYS_TO_DMAP(l2pa); + return (&l2e[pmap_pml2e_index(va)]); +} + +static __inline pml1_entry_t * +pmap_pml1e(pmap_t pmap, vm_offset_t va) +{ + + return (&pmap->pm_pml1[pmap_pml1e_index(va)]); +} + +static pt_entry_t * +pmap_pml2e(pmap_t pmap, vm_offset_t va) +{ + pt_entry_t *l1e; + + l1e = pmap_pml1e(pmap, va); + if (l1e == NULL || (be64toh(*l1e) & RPTE_VALID) == 0) + return (NULL); + return (pmap_l1e_to_l2e(l1e, va)); +} + +static __inline pt_entry_t * +pmap_pml3e(pmap_t pmap, vm_offset_t va) +{ + pt_entry_t *l2e; + + l2e = pmap_pml2e(pmap, va); + if (l2e == NULL || (be64toh(*l2e) & RPTE_VALID) == 0) + return (NULL); + return (pmap_l2e_to_l3e(l2e, va)); +} + +static __inline pt_entry_t * +pmap_pte(pmap_t pmap, vm_offset_t va) +{ + pt_entry_t *l3e; + + l3e = pmap_pml3e(pmap, va); + if (l3e == NULL || (be64toh(*l3e) & RPTE_VALID) == 0) + return (NULL); + return (pmap_l3e_to_pte(l3e, va)); +} + +int nkpt = 64; +SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, + "Number of kernel page table pages allocated on bootup"); + +vm_paddr_t dmaplimit; + +SYSCTL_DECL(_vm_pmap); + +#ifdef INVARIANTS +#define VERBOSE_PMAP 0 +#define VERBOSE_PROTECT 0 +static int pmap_logging; +SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_logging, CTLFLAG_RWTUN, + &pmap_logging, 0, "verbose debug logging"); +#endif + +static u_int64_t KPTphys; /* phys addr of kernel level 1 */ + +//static vm_paddr_t KERNend; /* phys addr of end of bootstrap data */ + +static vm_offset_t qframe = 0; +static struct mtx qframe_mtx; + +void mmu_radix_activate(struct thread *); +void mmu_radix_advise(pmap_t, vm_offset_t, vm_offset_t, int); +void mmu_radix_align_superpage(vm_object_t, vm_ooffset_t, vm_offset_t *, + vm_size_t); +void mmu_radix_clear_modify(vm_page_t); +void mmu_radix_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t); +int mmu_radix_decode_kernel_ptr(vm_offset_t, int *, vm_offset_t *); +int mmu_radix_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int, int8_t); +void mmu_radix_enter_object(pmap_t, vm_offset_t, vm_offset_t, vm_page_t, + vm_prot_t); +void mmu_radix_enter_quick(pmap_t, vm_offset_t, vm_page_t, vm_prot_t); +vm_paddr_t mmu_radix_extract(pmap_t pmap, vm_offset_t va); +vm_page_t mmu_radix_extract_and_hold(pmap_t, vm_offset_t, vm_prot_t); +void mmu_radix_kenter(vm_offset_t, vm_paddr_t); +vm_paddr_t mmu_radix_kextract(vm_offset_t); +void mmu_radix_kremove(vm_offset_t); +bool mmu_radix_is_modified(vm_page_t); +bool mmu_radix_is_prefaultable(pmap_t, vm_offset_t); +bool mmu_radix_is_referenced(vm_page_t); +void mmu_radix_object_init_pt(pmap_t, vm_offset_t, vm_object_t, + vm_pindex_t, vm_size_t); +bool mmu_radix_page_exists_quick(pmap_t, vm_page_t); +void mmu_radix_page_init(vm_page_t); +bool mmu_radix_page_is_mapped(vm_page_t m); +void mmu_radix_page_set_memattr(vm_page_t, vm_memattr_t); +int mmu_radix_page_wired_mappings(vm_page_t); +int mmu_radix_pinit(pmap_t); +void mmu_radix_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); +bool mmu_radix_ps_enabled(pmap_t); +void mmu_radix_qenter(vm_offset_t, vm_page_t *, int); +void mmu_radix_qremove(vm_offset_t, int); +vm_offset_t mmu_radix_quick_enter_page(vm_page_t); +void mmu_radix_quick_remove_page(vm_offset_t); +int mmu_radix_ts_referenced(vm_page_t); +void mmu_radix_release(pmap_t); +void mmu_radix_remove(pmap_t, vm_offset_t, vm_offset_t); +void mmu_radix_remove_all(vm_page_t); +void 
mmu_radix_remove_pages(pmap_t); +void mmu_radix_remove_write(vm_page_t); +void mmu_radix_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz); +void mmu_radix_unwire(pmap_t, vm_offset_t, vm_offset_t); +void mmu_radix_zero_page(vm_page_t); +void mmu_radix_zero_page_area(vm_page_t, int, int); +int mmu_radix_change_attr(vm_offset_t, vm_size_t, vm_memattr_t); +void mmu_radix_page_array_startup(long pages); + +#include "mmu_oea64.h" + +/* + * Kernel MMU interface + */ + +static void mmu_radix_bootstrap(vm_offset_t, vm_offset_t); + +static void mmu_radix_copy_page(vm_page_t, vm_page_t); +static void mmu_radix_copy_pages(vm_page_t *ma, vm_offset_t a_offset, + vm_page_t *mb, vm_offset_t b_offset, int xfersize); +static int mmu_radix_growkernel(vm_offset_t); +static void mmu_radix_init(void); +static int mmu_radix_mincore(pmap_t, vm_offset_t, vm_paddr_t *); +static vm_offset_t mmu_radix_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int); +static void mmu_radix_pinit0(pmap_t); + +static void *mmu_radix_mapdev(vm_paddr_t, vm_size_t); +static void *mmu_radix_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t); +static void mmu_radix_unmapdev(void *, vm_size_t); +static void mmu_radix_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t ma); +static int mmu_radix_dev_direct_mapped(vm_paddr_t, vm_size_t); +static void mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz, void **va); +static void mmu_radix_scan_init(void); +static void mmu_radix_cpu_bootstrap(int ap); +static void mmu_radix_tlbie_all(void); + +static struct pmap_funcs mmu_radix_methods = { + .bootstrap = mmu_radix_bootstrap, + .copy_page = mmu_radix_copy_page, + .copy_pages = mmu_radix_copy_pages, + .cpu_bootstrap = mmu_radix_cpu_bootstrap, + .growkernel_nopanic = mmu_radix_growkernel, + .init = mmu_radix_init, + .map = mmu_radix_map, + .mincore = mmu_radix_mincore, + .pinit = mmu_radix_pinit, + .pinit0 = mmu_radix_pinit0, + + .mapdev = mmu_radix_mapdev, + .mapdev_attr = mmu_radix_mapdev_attr, + .unmapdev = mmu_radix_unmapdev, + .kenter_attr = mmu_radix_kenter_attr, + .dev_direct_mapped = mmu_radix_dev_direct_mapped, + .dumpsys_pa_init = mmu_radix_scan_init, + .dumpsys_map_chunk = mmu_radix_dumpsys_map, + .page_is_mapped = mmu_radix_page_is_mapped, + .ps_enabled = mmu_radix_ps_enabled, + .align_superpage = mmu_radix_align_superpage, + .object_init_pt = mmu_radix_object_init_pt, + .protect = mmu_radix_protect, + /* pmap dispatcher interface */ + .clear_modify = mmu_radix_clear_modify, + .copy = mmu_radix_copy, + .enter = mmu_radix_enter, + .enter_object = mmu_radix_enter_object, + .enter_quick = mmu_radix_enter_quick, + .extract = mmu_radix_extract, + .extract_and_hold = mmu_radix_extract_and_hold, + .is_modified = mmu_radix_is_modified, + .is_prefaultable = mmu_radix_is_prefaultable, + .is_referenced = mmu_radix_is_referenced, + .ts_referenced = mmu_radix_ts_referenced, + .page_exists_quick = mmu_radix_page_exists_quick, + .page_init = mmu_radix_page_init, + .page_wired_mappings = mmu_radix_page_wired_mappings, + .qenter = mmu_radix_qenter, + .qremove = mmu_radix_qremove, + .release = mmu_radix_release, + .remove = mmu_radix_remove, + .remove_all = mmu_radix_remove_all, + .remove_write = mmu_radix_remove_write, + .sync_icache = mmu_radix_sync_icache, + .unwire = mmu_radix_unwire, + .zero_page = mmu_radix_zero_page, + .zero_page_area = mmu_radix_zero_page_area, + .activate = mmu_radix_activate, + .quick_enter_page = mmu_radix_quick_enter_page, + .quick_remove_page = mmu_radix_quick_remove_page, + .page_set_memattr = mmu_radix_page_set_memattr, + 
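+	/*
+	 * This method table is registered by MMU_DEF() below and becomes the
+	 * active pmap implementation when the radix MMU is selected at boot.
+	 */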
.page_array_startup = mmu_radix_page_array_startup, + + /* Internal interfaces */ + .kenter = mmu_radix_kenter, + .kextract = mmu_radix_kextract, + .kremove = mmu_radix_kremove, + .change_attr = mmu_radix_change_attr, + .decode_kernel_ptr = mmu_radix_decode_kernel_ptr, + + .tlbie_all = mmu_radix_tlbie_all, +}; + +MMU_DEF(mmu_radix, MMU_TYPE_RADIX, mmu_radix_methods); + +static bool pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va, + struct rwlock **lockp); +static bool pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va); +static int pmap_unuse_pt(pmap_t, vm_offset_t, pml3_entry_t, struct spglist *); +static int pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva, + struct spglist *free, struct rwlock **lockp); +static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, + pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp); +static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); +static bool pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *pde, + struct spglist *free); +static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, + pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp); + +static bool pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e, + u_int flags, struct rwlock **lockp); +#if VM_NRESERVLEVEL > 0 +static void pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, + struct rwlock **lockp); +#endif +static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); +static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); +static vm_page_t mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate); + +static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot, struct rwlock **lockp); +static int pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde, + u_int flags, vm_page_t m, struct rwlock **lockp); + +static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); +static void free_pv_chunk(struct pv_chunk *pc); +static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); +static vm_page_t pmap_allocl3e(pmap_t pmap, vm_offset_t va, + struct rwlock **lockp); +static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, + struct rwlock **lockp); +static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, + struct spglist *free); +static bool pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); + +static void pmap_invalidate_page(pmap_t pmap, vm_offset_t start); +static void pmap_invalidate_all(pmap_t pmap); +static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush); +static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); + +/* + * Internal flags for pmap_enter()'s helper functions. + */ +#define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ +#define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ + +#define UNIMPLEMENTED() panic("%s not implemented", __func__) +#define UNTESTED() panic("%s not yet tested", __func__) + +/* Number of supported PID bits */ +static unsigned int isa3_pid_bits; + +/* PID to start allocating from */ +static unsigned int isa3_base_pid; + +#define PROCTAB_SIZE_SHIFT (isa3_pid_bits + 4) +#define PROCTAB_ENTRIES (1ul << isa3_pid_bits) + +/* + * Map of physical memory regions. 
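+ * regions[] and pregions[] below are populated from the firmware memory
+ * map by mem_regions() in mmu_radix_early_bootstrap().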
+ */ +static struct mem_region *regions, *pregions; +static struct numa_mem_region *numa_pregions; +static u_int phys_avail_count; +static int regions_sz, pregions_sz, numa_pregions_sz; +static struct pate *isa3_parttab; +static struct prte *isa3_proctab; +static vmem_t *asid_arena; + +extern void bs_remap_earlyboot(void); + +#define RADIX_PGD_SIZE_SHIFT 16 +#define RADIX_PGD_SIZE (1UL << RADIX_PGD_SIZE_SHIFT) + +#define RADIX_PGD_INDEX_SHIFT (RADIX_PGD_SIZE_SHIFT-3) +#define NL2EPG (PAGE_SIZE/sizeof(pml2_entry_t)) +#define NL3EPG (PAGE_SIZE/sizeof(pml3_entry_t)) + +#define NUPML1E (RADIX_PGD_SIZE/sizeof(uint64_t)) /* number of userland PML1 pages */ +#define NUPDPE (NUPML1E * NL2EPG)/* number of userland PDP pages */ +#define NUPDE (NUPDPE * NL3EPG) /* number of userland PD entries */ + +/* POWER9 only permits a 64k partition table size. */ +#define PARTTAB_SIZE_SHIFT 16 +#define PARTTAB_SIZE (1UL << PARTTAB_SIZE_SHIFT) + +#define PARTTAB_HR (1UL << 63) /* host uses radix */ +#define PARTTAB_GR (1UL << 63) /* guest uses radix must match host */ + +/* TLB flush actions. Used as argument to tlbiel_flush() */ +enum { + TLB_INVAL_SCOPE_LPID = 2, /* invalidate TLBs for current LPID */ + TLB_INVAL_SCOPE_GLOBAL = 3, /* invalidate all TLBs */ +}; + +#define NPV_LIST_LOCKS MAXCPU +static int pmap_initialized; +static vm_paddr_t proctab0pa; +static vm_paddr_t parttab_phys; +CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); + +/* + * Data for the pv entry allocation mechanism. + * Updates to pv_invl_gen are protected by the pv_list_locks[] + * elements, but reads are not. + */ +static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); +static struct mtx __exclusive_cache_line pv_chunks_mutex; +static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; +static struct md_page *pv_table; +static struct md_page pv_dummy; + +#ifdef PV_STATS +#define PV_STAT(x) do { x ; } while (0) +#else +#define PV_STAT(x) do { } while (0) +#endif + +#define pa_radix_index(pa) ((pa) >> L3_PAGE_SIZE_SHIFT) +#define pa_to_pvh(pa) (&pv_table[pa_radix_index(pa)]) + +#define PHYS_TO_PV_LIST_LOCK(pa) \ + (&pv_list_locks[pa_radix_index(pa) % NPV_LIST_LOCKS]) + +#define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ + struct rwlock **_lockp = (lockp); \ + struct rwlock *_new_lock; \ + \ + _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ + if (_new_lock != *_lockp) { \ + if (*_lockp != NULL) \ + rw_wunlock(*_lockp); \ + *_lockp = _new_lock; \ + rw_wlock(*_lockp); \ + } \ +} while (0) + +#define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) + +#define RELEASE_PV_LIST_LOCK(lockp) do { \ + struct rwlock **_lockp = (lockp); \ + \ + if (*_lockp != NULL) { \ + rw_wunlock(*_lockp); \ + *_lockp = NULL; \ + } \ +} while (0) + +#define VM_PAGE_TO_PV_LIST_LOCK(m) \ + PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) + +/* + * We support 52 bits, hence: + * bits 52 - 31 = 21, 0b10101 + * RTS encoding details + * bits 0 - 3 of rts -> bits 6 - 8 unsigned long + * bits 4 - 5 of rts -> bits 62 - 63 of unsigned long + */ +#define RTS_SIZE ((0x2UL << 61) | (0x5UL << 5)) + +static int powernv_enabled = 1; + +static __always_inline void +tlbiel_radix_set_isa300(uint32_t set, uint32_t is, + uint32_t pid, uint32_t ric, uint32_t prs) +{ + uint64_t rb; + uint64_t rs; + + rb = PPC_BITLSHIFT_VAL(set, 51) | PPC_BITLSHIFT_VAL(is, 53); + rs = PPC_BITLSHIFT_VAL((uint64_t)pid, 31); + + __asm __volatile(PPC_TLBIEL(%0, %1, %2, %3, 1) + : : "r"(rb), "r"(rs), "i"(ric), "i"(prs) + : "memory"); 
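+	/*
+	 * PPC_BITLSHIFT_VAL() uses IBM MSB-0 bit numbering (bit n of a 64-bit
+	 * long is shifted left by 63 - n), so the above is equivalent to
+	 * rb = ((uint64_t)set << 12) | ((uint64_t)is << 10) and
+	 * rs = (uint64_t)pid << 32.
+	 */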
+} + +static void +tlbiel_flush_isa3(uint32_t num_sets, uint32_t is) +{ + uint32_t set; + + __asm __volatile("ptesync": : :"memory"); + + /* + * Flush the first set of the TLB, and the entire Page Walk Cache + * and partition table entries. Then flush the remaining sets of the + * TLB. + */ + if (is == TLB_INVAL_SCOPE_GLOBAL) { + tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); + for (set = 1; set < num_sets; set++) + tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0); + } + + /* Do the same for process scoped entries. */ + tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1); + for (set = 1; set < num_sets; set++) + tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1); + + __asm __volatile("ptesync": : :"memory"); +} + +static void +mmu_radix_tlbiel_flush(int scope) +{ + MPASS(scope == TLB_INVAL_SCOPE_LPID || + scope == TLB_INVAL_SCOPE_GLOBAL); + + tlbiel_flush_isa3(POWER9_TLB_SETS_RADIX, scope); + __asm __volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); +} + +static void +mmu_radix_tlbie_all(void) +{ + if (powernv_enabled) + mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL); + else + mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_LPID); +} + +static void +mmu_radix_init_amor(void) +{ + /* + * In HV mode, we init AMOR (Authority Mask Override Register) so that + * the hypervisor and guest can setup IAMR (Instruction Authority Mask + * Register), enable key 0 and set it to 1. + * + * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11) + */ + mtspr(SPR_AMOR, (3ul << 62)); +} + +static void +mmu_radix_init_iamr(void) +{ + /* + * Radix always uses key0 of the IAMR to determine if an access is + * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction + * fetch. + */ + mtspr(SPR_IAMR, (1ul << 62)); +} + +static void +mmu_radix_pid_set(pmap_t pmap) +{ + + mtspr(SPR_PID, pmap->pm_pid); + isync(); +} + +/* Quick sort callout for comparing physical addresses. 
*/ +static int +pa_cmp(const void *a, const void *b) +{ + const vm_paddr_t *pa = a, *pb = b; + + if (*pa < *pb) + return (-1); + else if (*pa > *pb) + return (1); + else + return (0); +} + +#define pte_load_store(ptep, pte) atomic_swap_long(ptep, pte) +#define pte_load_clear(ptep) atomic_swap_long(ptep, 0) +#define pte_store(ptep, pte) do { \ + MPASS((pte) & (RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_X)); \ + *(u_long *)(ptep) = htobe64((u_long)((pte) | PG_V | RPTE_LEAF)); \ +} while (0) +/* + * NB: should only be used for adding directories - not for direct mappings + */ +#define pde_store(ptep, pa) do { \ + *(u_long *)(ptep) = htobe64((u_long)(pa|RPTE_VALID|RPTE_SHIFT)); \ +} while (0) + +#define pte_clear(ptep) do { \ + *(u_long *)(ptep) = (u_long)(0); \ +} while (0) + +#define PMAP_PDE_SUPERPAGE (1 << 8) /* supports 2MB superpages */ + +/* + * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB + * (PTE) page mappings have identical settings for the following fields: + */ +#define PG_PTE_PROMOTE (PG_X | PG_MANAGED | PG_W | PG_PTE_CACHE | \ + PG_M | PG_A | RPTE_EAA_MASK | PG_V) + +static __inline void +pmap_resident_count_inc(pmap_t pmap, int count) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + pmap->pm_stats.resident_count += count; +} + +static __inline void +pmap_resident_count_dec(pmap_t pmap, int count) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT(pmap->pm_stats.resident_count >= count, + ("pmap %p resident count underflow %ld %d", pmap, + pmap->pm_stats.resident_count, count)); + pmap->pm_stats.resident_count -= count; +} + +static void +pagezero(vm_offset_t va) +{ + va = trunc_page(va); + + bzero((void *)va, PAGE_SIZE); +} + +static uint64_t +allocpages(int n) +{ + u_int64_t ret; + + ret = moea64_bootstrap_alloc(n * PAGE_SIZE, PAGE_SIZE); + for (int i = 0; i < n; i++) + pagezero(PHYS_TO_DMAP(ret + i * PAGE_SIZE)); + return (ret); +} + +static pt_entry_t * +kvtopte(vm_offset_t va) +{ + pt_entry_t *l3e; + + l3e = pmap_pml3e(kernel_pmap, va); + if (l3e == NULL || (be64toh(*l3e) & RPTE_VALID) == 0) + return (NULL); + return (pmap_l3e_to_pte(l3e, va)); +} + +void +mmu_radix_kenter(vm_offset_t va, vm_paddr_t pa) +{ + pt_entry_t *pte; + + pte = kvtopte(va); + MPASS(pte != NULL); + *pte = htobe64(pa | RPTE_VALID | RPTE_LEAF | RPTE_EAA_R | \ + RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A); +} + +bool +mmu_radix_ps_enabled(pmap_t pmap) +{ + return (superpages_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); +} + +static pt_entry_t * +pmap_nofault_pte(pmap_t pmap, vm_offset_t va, int *is_l3e) +{ + pml3_entry_t *l3e; + pt_entry_t *pte; + + va &= PG_PS_FRAME; + l3e = pmap_pml3e(pmap, va); + if (l3e == NULL || (be64toh(*l3e) & PG_V) == 0) + return (NULL); + + if (be64toh(*l3e) & RPTE_LEAF) { + *is_l3e = 1; + return (l3e); + } + *is_l3e = 0; + va &= PG_FRAME; + pte = pmap_l3e_to_pte(l3e, va); + if (pte == NULL || (be64toh(*pte) & PG_V) == 0) + return (NULL); + return (pte); +} + +int +pmap_nofault(pmap_t pmap, vm_offset_t va, vm_prot_t flags) +{ + pt_entry_t *pte; + pt_entry_t startpte, origpte, newpte; + vm_page_t m; + int is_l3e; + + startpte = 0; + retry: + if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL) + return (KERN_INVALID_ADDRESS); + origpte = newpte = be64toh(*pte); + if (startpte == 0) { + startpte = origpte; + if (((flags & VM_PROT_WRITE) && (startpte & PG_M)) || + ((flags & VM_PROT_READ) && (startpte & PG_A))) { + pmap_invalidate_all(pmap); +#ifdef INVARIANTS + if (VERBOSE_PMAP || pmap_logging) + printf("%s(%p, %#lx, %#x) (%#lx) -- invalidate all\n", + 
__func__, pmap, va, flags, origpte); +#endif + return (KERN_FAILURE); + } + } +#ifdef INVARIANTS + if (VERBOSE_PMAP || pmap_logging) + printf("%s(%p, %#lx, %#x) (%#lx)\n", __func__, pmap, va, + flags, origpte); +#endif + PMAP_LOCK(pmap); + if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL || + be64toh(*pte) != origpte) { + PMAP_UNLOCK(pmap); + return (KERN_FAILURE); + } + m = PHYS_TO_VM_PAGE(newpte & PG_FRAME); + MPASS(m != NULL); + switch (flags) { + case VM_PROT_READ: + if ((newpte & (RPTE_EAA_R|RPTE_EAA_X)) == 0) + goto protfail; + newpte |= PG_A; + vm_page_aflag_set(m, PGA_REFERENCED); + break; + case VM_PROT_WRITE: + if ((newpte & RPTE_EAA_W) == 0) + goto protfail; + if (is_l3e) + goto protfail; + newpte |= PG_M; + vm_page_dirty(m); + break; + case VM_PROT_EXECUTE: + if ((newpte & RPTE_EAA_X) == 0) + goto protfail; + newpte |= PG_A; + vm_page_aflag_set(m, PGA_REFERENCED); + break; + } + + if (!atomic_cmpset_long(pte, htobe64(origpte), htobe64(newpte))) + goto retry; + ptesync(); + PMAP_UNLOCK(pmap); + if (startpte == newpte) + return (KERN_FAILURE); + return (0); + protfail: + PMAP_UNLOCK(pmap); + return (KERN_PROTECTION_FAILURE); +} + +/* + * Returns true if the given page is mapped individually or as part of + * a 2mpage. Otherwise, returns false. + */ +bool +mmu_radix_page_is_mapped(vm_page_t m) +{ + struct rwlock *lock; + bool rv; + + if ((m->oflags & VPO_UNMANAGED) != 0) + return (false); + lock = VM_PAGE_TO_PV_LIST_LOCK(m); + rw_rlock(lock); + rv = !TAILQ_EMPTY(&m->md.pv_list) || + ((m->flags & PG_FICTITIOUS) == 0 && + !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); + rw_runlock(lock); + return (rv); +} + +/* + * Determine the appropriate bits to set in a PTE or PDE for a specified + * caching mode. + */ +static int +pmap_cache_bits(vm_memattr_t ma) +{ + if (ma != VM_MEMATTR_DEFAULT) { + switch (ma) { + case VM_MEMATTR_UNCACHEABLE: + return (RPTE_ATTR_GUARDEDIO); + case VM_MEMATTR_CACHEABLE: + return (RPTE_ATTR_MEM); + case VM_MEMATTR_WRITE_BACK: + case VM_MEMATTR_PREFETCHABLE: + case VM_MEMATTR_WRITE_COMBINING: + return (RPTE_ATTR_UNGUARDEDIO); + } + } + return (0); +} + +static void +pmap_invalidate_page(pmap_t pmap, vm_offset_t start) +{ + ptesync(); + if (pmap == kernel_pmap) + radix_tlbie_invlpg_kernel_4k(start); + else + radix_tlbie_invlpg_user_4k(pmap->pm_pid, start); + ttusync(); +} + +static void +pmap_invalidate_page_2m(pmap_t pmap, vm_offset_t start) +{ + ptesync(); + if (pmap == kernel_pmap) + radix_tlbie_invlpg_kernel_2m(start); + else + radix_tlbie_invlpg_user_2m(pmap->pm_pid, start); + ttusync(); +} + +static void +pmap_invalidate_pwc(pmap_t pmap) +{ + ptesync(); + if (pmap == kernel_pmap) + radix_tlbie_invlpwc_kernel(); + else + radix_tlbie_invlpwc_user(pmap->pm_pid); + ttusync(); +} + +static void +pmap_invalidate_range(pmap_t pmap, vm_offset_t start, vm_offset_t end) +{ + if (((start - end) >> PAGE_SHIFT) > 8) { + pmap_invalidate_all(pmap); + return; + } + ptesync(); + if (pmap == kernel_pmap) { + while (start < end) { + radix_tlbie_invlpg_kernel_4k(start); + start += PAGE_SIZE; + } + } else { + while (start < end) { + radix_tlbie_invlpg_user_4k(pmap->pm_pid, start); + start += PAGE_SIZE; + } + } + ttusync(); +} + +static void +pmap_invalidate_all(pmap_t pmap) +{ + ptesync(); + if (pmap == kernel_pmap) + radix_tlbie_flush_kernel(); + else + radix_tlbie_flush_user(pmap->pm_pid); + ttusync(); +} + +static void +pmap_invalidate_l3e_page(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e) +{ + + /* + * When the PDE has PG_PROMOTED set, the 2MB page mapping 
was created + * by a promotion that did not invalidate the 512 4KB page mappings + * that might exist in the TLB. Consequently, at this point, the TLB + * may hold both 4KB and 2MB page mappings for the address range [va, + * va + L3_PAGE_SIZE). Therefore, the entire range must be invalidated here. + * In contrast, when PG_PROMOTED is clear, the TLB will not hold any + * 4KB page mappings for the address range [va, va + L3_PAGE_SIZE), and so a + * single INVLPG suffices to invalidate the 2MB page mapping from the + * TLB. + */ + ptesync(); + if ((l3e & PG_PROMOTED) != 0) + pmap_invalidate_range(pmap, va, va + L3_PAGE_SIZE - 1); + else + pmap_invalidate_page_2m(pmap, va); + + pmap_invalidate_pwc(pmap); +} + +static __inline struct pv_chunk * +pv_to_chunk(pv_entry_t pv) +{ + + return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); +} + +#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) + +#define PC_FREE0 0xfffffffffffffffful +#define PC_FREE1 ((1ul << (_NPCPV % 64)) - 1) + +static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1 }; + +/* + * Ensure that the number of spare PV entries in the specified pmap meets or + * exceeds the given count, "needed". + * + * The given PV list lock may be released. + */ +static void +reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) +{ + struct pch new_tail; + struct pv_chunk *pc; + vm_page_t m; + int avail, free; + bool reclaimed; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); + + /* + * Newly allocated PV chunks must be stored in a private list until + * the required number of PV chunks have been allocated. Otherwise, + * reclaim_pv_chunk() could recycle one of these chunks. In + * contrast, these chunks must be added to the pmap upon allocation. + */ + TAILQ_INIT(&new_tail); +retry: + avail = 0; + TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { + // if ((cpu_feature2 & CPUID2_POPCNT) == 0) + bit_count((bitstr_t *)pc->pc_map, 0, + sizeof(pc->pc_map) * NBBY, &free); +#if 0 + free = popcnt_pc_map_pq(pc->pc_map); +#endif + if (free == 0) + break; + avail += free; + if (avail >= needed) + break; + } + for (reclaimed = false; avail < needed; avail += _NPCPV) { + m = vm_page_alloc_noobj(VM_ALLOC_WIRED); + if (m == NULL) { + m = reclaim_pv_chunk(pmap, lockp); + if (m == NULL) + goto retry; + reclaimed = true; + } + PV_STAT(atomic_add_int(&pc_chunk_count, 1)); + PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); + dump_add_page(m->phys_addr); + pc = (void *)PHYS_TO_DMAP(m->phys_addr); + pc->pc_pmap = pmap; + pc->pc_map[0] = PC_FREE0; + pc->pc_map[1] = PC_FREE1; + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); + PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); + + /* + * The reclaim might have freed a chunk from the current pmap. + * If that chunk contained available entries, we need to + * re-count the number of available entries. + */ + if (reclaimed) + goto retry; + } + if (!TAILQ_EMPTY(&new_tail)) { + mtx_lock(&pv_chunks_mutex); + TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); + mtx_unlock(&pv_chunks_mutex); + } +} + +/* + * First find and then remove the pv entry for the specified pmap and virtual + * address from the specified pv list. Returns the pv entry if found and NULL + * otherwise. This operation can be performed on pv lists for either 4KB or + * 2MB page mappings. 
+ */ +static __inline pv_entry_t +pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) +{ + pv_entry_t pv; + + TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { +#ifdef INVARIANTS + if (PV_PMAP(pv) == NULL) { + printf("corrupted pv_chunk/pv %p\n", pv); + printf("pv_chunk: %64D\n", pv_to_chunk(pv), ":"); + } + MPASS(PV_PMAP(pv) != NULL); + MPASS(pv->pv_va != 0); +#endif + if (pmap == PV_PMAP(pv) && va == pv->pv_va) { + TAILQ_REMOVE(&pvh->pv_list, pv, pv_link); + pvh->pv_gen++; + break; + } + } + return (pv); +} + +/* + * After demotion from a 2MB page mapping to 512 4KB page mappings, + * destroy the pv entry for the 2MB page mapping and reinstantiate the pv + * entries for each of the 4KB page mappings. + */ +static void +pmap_pv_demote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, + struct rwlock **lockp) +{ + struct md_page *pvh; + struct pv_chunk *pc; + pv_entry_t pv; + vm_offset_t va_last; + vm_page_t m; + int bit, field; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT((pa & L3_PAGE_MASK) == 0, + ("pmap_pv_demote_pde: pa is not 2mpage aligned")); + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); + + /* + * Transfer the 2mpage's pv entry for this mapping to the first + * page's pv list. Once this transfer begins, the pv list lock + * must not be released until the last pv entry is reinstantiated. + */ + pvh = pa_to_pvh(pa); + va = trunc_2mpage(va); + pv = pmap_pvh_remove(pvh, pmap, va); + KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); + m = PHYS_TO_VM_PAGE(pa); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); + + m->md.pv_gen++; + /* Instantiate the remaining NPTEPG - 1 pv entries. */ + PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); + va_last = va + L3_PAGE_SIZE - PAGE_SIZE; + for (;;) { + pc = TAILQ_FIRST(&pmap->pm_pvchunk); + KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 + , ("pmap_pv_demote_pde: missing spare")); + for (field = 0; field < _NPCM; field++) { + while (pc->pc_map[field]) { + bit = cnttzd(pc->pc_map[field]); + pc->pc_map[field] &= ~(1ul << bit); + pv = &pc->pc_pventry[field * 64 + bit]; + va += PAGE_SIZE; + pv->pv_va = va; + m++; + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_pv_demote_pde: page %p is not managed", m)); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); + + m->md.pv_gen++; + if (va == va_last) + goto out; + } + } + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); + } +out: + if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) { + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); + } + PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); + PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); +} + +static void +reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap) +{ + + if (pmap == NULL) + return; + pmap_invalidate_all(pmap); + if (pmap != locked_pmap) + PMAP_UNLOCK(pmap); +} + +/* + * We are in a serious low memory condition. Resort to + * drastic measures to free some pages so we can allocate + * another pv entry chunk. + * + * Returns NULL if PV entries were reclaimed from the specified pmap. + * + * We do not, however, unmap 2mpages because subsequent accesses will + * allocate per-page pv entries until repromotion occurs, thereby + * exacerbating the shortage of free pv entries. 
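+ *
+ * Two stack-allocated markers (pc_marker and pc_marker_end below) are
+ * inserted into the pv_chunks LRU list so that pv_chunks_mutex can be
+ * dropped while a chunk is examined and the scan can resume where it left
+ * off; markers are recognized by their NULL pc_pmap field.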
+ */ +static int active_reclaims = 0; +static vm_page_t +reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) +{ + struct pv_chunk *pc, *pc_marker, *pc_marker_end; + struct pv_chunk_header pc_marker_b, pc_marker_end_b; + struct md_page *pvh; + pml3_entry_t *l3e; + pmap_t next_pmap, pmap; + pt_entry_t *pte, tpte; + pv_entry_t pv; + vm_offset_t va; + vm_page_t m, m_pc; + struct spglist free; + uint64_t inuse; + int bit, field, freed; + + PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); + KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); + pmap = NULL; + m_pc = NULL; + SLIST_INIT(&free); + bzero(&pc_marker_b, sizeof(pc_marker_b)); + bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); + pc_marker = (struct pv_chunk *)&pc_marker_b; + pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; + + mtx_lock(&pv_chunks_mutex); + active_reclaims++; + TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); + TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); + while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && + SLIST_EMPTY(&free)) { + next_pmap = pc->pc_pmap; + if (next_pmap == NULL) { + /* + * The next chunk is a marker. However, it is + * not our marker, so active_reclaims must be + * > 1. Consequently, the next_chunk code + * will not rotate the pv_chunks list. + */ + goto next_chunk; + } + mtx_unlock(&pv_chunks_mutex); + + /* + * A pv_chunk can only be removed from the pc_lru list + * when both pc_chunks_mutex is owned and the + * corresponding pmap is locked. + */ + if (pmap != next_pmap) { + reclaim_pv_chunk_leave_pmap(pmap, locked_pmap); + pmap = next_pmap; + /* Avoid deadlock and lock recursion. */ + if (pmap > locked_pmap) { + RELEASE_PV_LIST_LOCK(lockp); + PMAP_LOCK(pmap); + mtx_lock(&pv_chunks_mutex); + continue; + } else if (pmap != locked_pmap) { + if (PMAP_TRYLOCK(pmap)) { + mtx_lock(&pv_chunks_mutex); + continue; + } else { + pmap = NULL; /* pmap is not locked */ + mtx_lock(&pv_chunks_mutex); + pc = TAILQ_NEXT(pc_marker, pc_lru); + if (pc == NULL || + pc->pc_pmap != next_pmap) + continue; + goto next_chunk; + } + } + } + + /* + * Destroy every non-wired, 4 KB page mapping in the chunk. + */ + freed = 0; + for (field = 0; field < _NPCM; field++) { + for (inuse = ~pc->pc_map[field] & pc_freemask[field]; + inuse != 0; inuse &= ~(1UL << bit)) { + bit = cnttzd(inuse); + pv = &pc->pc_pventry[field * 64 + bit]; + va = pv->pv_va; + l3e = pmap_pml3e(pmap, va); + if ((be64toh(*l3e) & RPTE_LEAF) != 0) + continue; + pte = pmap_l3e_to_pte(l3e, va); + if ((be64toh(*pte) & PG_W) != 0) + continue; + tpte = be64toh(pte_load_clear(pte)); + m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); + if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(m); + if ((tpte & PG_A) != 0) + vm_page_aflag_set(m, PGA_REFERENCED); + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); + + m->md.pv_gen++; + if (TAILQ_EMPTY(&m->md.pv_list) && + (m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + if (TAILQ_EMPTY(&pvh->pv_list)) { + vm_page_aflag_clear(m, + PGA_WRITEABLE); + } + } + pc->pc_map[field] |= 1UL << bit; + pmap_unuse_pt(pmap, va, be64toh(*l3e), &free); + freed++; + } + } + if (freed == 0) { + mtx_lock(&pv_chunks_mutex); + goto next_chunk; + } + /* Every freed mapping is for a 4 KB page. 
*/ + pmap_resident_count_dec(pmap, freed); + PV_STAT(atomic_add_long(&pv_entry_frees, freed)); + PV_STAT(atomic_add_int(&pv_entry_spare, freed)); + PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1) { + PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); + PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); + PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); + /* Entire chunk is free; return it. */ + m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); + dump_drop_page(m_pc->phys_addr); + mtx_lock(&pv_chunks_mutex); + TAILQ_REMOVE(&pv_chunks, pc, pc_lru); + break; + } + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + mtx_lock(&pv_chunks_mutex); + /* One freed pv entry in locked_pmap is sufficient. */ + if (pmap == locked_pmap) + break; +next_chunk: + TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); + TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); + if (active_reclaims == 1 && pmap != NULL) { + /* + * Rotate the pv chunks list so that we do not + * scan the same pv chunks that could not be + * freed (because they contained a wired + * and/or superpage mapping) on every + * invocation of reclaim_pv_chunk(). + */ + while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { + MPASS(pc->pc_pmap != NULL); + TAILQ_REMOVE(&pv_chunks, pc, pc_lru); + TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); + } + } + } + TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); + TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); + active_reclaims--; + mtx_unlock(&pv_chunks_mutex); + reclaim_pv_chunk_leave_pmap(pmap, locked_pmap); + if (m_pc == NULL && !SLIST_EMPTY(&free)) { + m_pc = SLIST_FIRST(&free); + SLIST_REMOVE_HEAD(&free, plinks.s.ss); + /* Recycle a freed page table page. */ + m_pc->ref_count = 1; + } + vm_page_free_pages_toq(&free, true); + return (m_pc); +} + +/* + * free the pv_entry back to the free list + */ +static void +free_pv_entry(pmap_t pmap, pv_entry_t pv) +{ + struct pv_chunk *pc; + int idx, field, bit; + +#ifdef VERBOSE_PV + if (pmap != kernel_pmap) + printf("%s(%p, %p)\n", __func__, pmap, pv); +#endif + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + PV_STAT(atomic_add_long(&pv_entry_frees, 1)); + PV_STAT(atomic_add_int(&pv_entry_spare, 1)); + PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); + pc = pv_to_chunk(pv); + idx = pv - &pc->pc_pventry[0]; + field = idx / 64; + bit = idx % 64; + pc->pc_map[field] |= 1ul << bit; + if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1) { + /* 98% of the time, pc is already at the head of the list. */ + if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + } + return; + } + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + free_pv_chunk(pc); +} + +static void +free_pv_chunk(struct pv_chunk *pc) +{ + vm_page_t m; + + mtx_lock(&pv_chunks_mutex); + TAILQ_REMOVE(&pv_chunks, pc, pc_lru); + mtx_unlock(&pv_chunks_mutex); + PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); + PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); + PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); + /* entire chunk is free, return it */ + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); + dump_drop_page(m->phys_addr); + vm_page_unwire_noq(m); + vm_page_free(m); +} + +/* + * Returns a new PV entry, allocating a new PV chunk from the system when + * needed. If this PV chunk allocation fails and a PV list lock pointer was + * given, a PV chunk is reclaimed from an arbitrary pmap. 
Otherwise, NULL is + * returned. + * + * The given PV list lock may be released. + */ +static pv_entry_t +get_pv_entry(pmap_t pmap, struct rwlock **lockp) +{ + int bit, field; + pv_entry_t pv; + struct pv_chunk *pc; + vm_page_t m; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); +retry: + pc = TAILQ_FIRST(&pmap->pm_pvchunk); + if (pc != NULL) { + for (field = 0; field < _NPCM; field++) { + if (pc->pc_map[field]) { + bit = cnttzd(pc->pc_map[field]); + break; + } + } + if (field < _NPCM) { + pv = &pc->pc_pventry[field * 64 + bit]; + pc->pc_map[field] &= ~(1ul << bit); + /* If this was the last item, move it to tail */ + if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) { + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, + pc_list); + } + PV_STAT(atomic_add_long(&pv_entry_count, 1)); + PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); + MPASS(PV_PMAP(pv) != NULL); + return (pv); + } + } + /* No free items, allocate another chunk */ + m = vm_page_alloc_noobj(VM_ALLOC_WIRED); + if (m == NULL) { + if (lockp == NULL) { + PV_STAT(pc_chunk_tryfail++); + return (NULL); + } + m = reclaim_pv_chunk(pmap, lockp); + if (m == NULL) + goto retry; + } + PV_STAT(atomic_add_int(&pc_chunk_count, 1)); + PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); + dump_add_page(m->phys_addr); + pc = (void *)PHYS_TO_DMAP(m->phys_addr); + pc->pc_pmap = pmap; + pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ + pc->pc_map[1] = PC_FREE1; + mtx_lock(&pv_chunks_mutex); + TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); + mtx_unlock(&pv_chunks_mutex); + pv = &pc->pc_pventry[0]; + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + PV_STAT(atomic_add_long(&pv_entry_count, 1)); + PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); + MPASS(PV_PMAP(pv) != NULL); + return (pv); +} + +#if VM_NRESERVLEVEL > 0 +/* + * After promotion from 512 4KB page mappings to a single 2MB page mapping, + * replace the many pv entries for the 4KB page mappings by a single pv entry + * for the 2MB page mapping. + */ +static void +pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, + struct rwlock **lockp) +{ + struct md_page *pvh; + pv_entry_t pv; + vm_offset_t va_last; + vm_page_t m; + + KASSERT((pa & L3_PAGE_MASK) == 0, + ("pmap_pv_promote_pde: pa is not 2mpage aligned")); + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); + + /* + * Transfer the first page's pv entry for this mapping to the 2mpage's + * pv list. Aside from avoiding the cost of a call to get_pv_entry(), + * a transfer avoids the possibility that get_pv_entry() calls + * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the + * mappings that is being promoted. + */ + m = PHYS_TO_VM_PAGE(pa); + va = trunc_2mpage(va); + pv = pmap_pvh_remove(&m->md, pmap, va); + KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); + pvh = pa_to_pvh(pa); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link); + pvh->pv_gen++; + /* Free the remaining NPTEPG - 1 pv entries. */ + va_last = va + L3_PAGE_SIZE - PAGE_SIZE; + do { + m++; + va += PAGE_SIZE; + pmap_pvh_free(&m->md, pmap, va); + } while (va < va_last); +} +#endif /* VM_NRESERVLEVEL > 0 */ + +/* + * First find and then destroy the pv entry for the specified pmap and virtual + * address. This operation can be performed on pv lists for either 4KB or 2MB + * page mappings. 
+ */ +static void +pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) +{ + pv_entry_t pv; + + pv = pmap_pvh_remove(pvh, pmap, va); + KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); + free_pv_entry(pmap, pv); +} + +/* + * Conditionally create the PV entry for a 4KB page mapping if the required + * memory can be allocated without resorting to reclamation. + */ +static bool +pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, + struct rwlock **lockp) +{ + pv_entry_t pv; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + /* Pass NULL instead of the lock pointer to disable reclamation. */ + if ((pv = get_pv_entry(pmap, NULL)) != NULL) { + pv->pv_va = va; + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); + m->md.pv_gen++; + return (true); + } else + return (false); +} + +vm_paddr_t phys_avail_debug[2 * VM_PHYSSEG_MAX]; +#ifdef INVARIANTS +static void +validate_addr(vm_paddr_t addr, vm_size_t size) +{ + vm_paddr_t end = addr + size; + bool found = false; + + for (int i = 0; i < 2 * phys_avail_count; i += 2) { + if (addr >= phys_avail_debug[i] && + end <= phys_avail_debug[i + 1]) { + found = true; + break; + } + } + KASSERT(found, ("%#lx-%#lx outside of initial phys_avail array", + addr, end)); +} +#else +static void validate_addr(vm_paddr_t addr, vm_size_t size) {} +#endif +#define DMAP_PAGE_BITS (RPTE_VALID | RPTE_LEAF | RPTE_EAA_MASK | PG_M | PG_A) + +static vm_paddr_t +alloc_pt_page(void) +{ + vm_paddr_t page; + + page = allocpages(1); + pagezero(PHYS_TO_DMAP(page)); + return (page); +} + +static void +mmu_radix_dmap_range(vm_paddr_t start, vm_paddr_t end) +{ + pt_entry_t *pte, pteval; + vm_paddr_t page; + + if (bootverbose) + printf("%s %lx -> %lx\n", __func__, start, end); + while (start < end) { + pteval = start | DMAP_PAGE_BITS; + pte = pmap_pml1e(kernel_pmap, PHYS_TO_DMAP(start)); + if ((be64toh(*pte) & RPTE_VALID) == 0) { + page = alloc_pt_page(); + pde_store(pte, page); + } + pte = pmap_l1e_to_l2e(pte, PHYS_TO_DMAP(start)); + if ((start & L2_PAGE_MASK) == 0 && + end - start >= L2_PAGE_SIZE) { + start += L2_PAGE_SIZE; + goto done; + } else if ((be64toh(*pte) & RPTE_VALID) == 0) { + page = alloc_pt_page(); + pde_store(pte, page); + } + + pte = pmap_l2e_to_l3e(pte, PHYS_TO_DMAP(start)); + if ((start & L3_PAGE_MASK) == 0 && + end - start >= L3_PAGE_SIZE) { + start += L3_PAGE_SIZE; + goto done; + } else if ((be64toh(*pte) & RPTE_VALID) == 0) { + page = alloc_pt_page(); + pde_store(pte, page); + } + pte = pmap_l3e_to_pte(pte, PHYS_TO_DMAP(start)); + start += PAGE_SIZE; + done: + pte_store(pte, pteval); + } +} + +static void +mmu_radix_dmap_populate(vm_size_t hwphyssz) +{ + vm_paddr_t start, end; + + for (int i = 0; i < pregions_sz; i++) { + start = pregions[i].mr_start; + end = start + pregions[i].mr_size; + if (hwphyssz && start >= hwphyssz) + break; + if (hwphyssz && hwphyssz < end) + end = hwphyssz; + mmu_radix_dmap_range(start, end); + } +} + +static void +mmu_radix_setup_pagetables(vm_size_t hwphyssz) +{ + vm_paddr_t ptpages, pages; + pt_entry_t *pte; + vm_paddr_t l1phys; + + bzero(kernel_pmap, sizeof(struct pmap)); + PMAP_LOCK_INIT(kernel_pmap); + vm_radix_init(&kernel_pmap->pm_radix); + + ptpages = allocpages(3); + l1phys = moea64_bootstrap_alloc(RADIX_PGD_SIZE, RADIX_PGD_SIZE); + validate_addr(l1phys, RADIX_PGD_SIZE); + if (bootverbose) + printf("l1phys=%lx\n", l1phys); + MPASS((l1phys & (RADIX_PGD_SIZE-1)) == 0); + for (int i = 0; i < RADIX_PGD_SIZE/PAGE_SIZE; i++) + pagezero(PHYS_TO_DMAP(l1phys + i * PAGE_SIZE)); + 
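+	/*
+	 * The 64KB root table holds 2^13 eight-byte entries, so a virtual
+	 * address decomposes into 13 (L1) + 9 (L2) + 9 (L3) + 9 (PTE) index
+	 * bits plus a 12-bit page offset, i.e. the 52-bit address space
+	 * encoded by RTS_SIZE.
+	 */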
kernel_pmap->pm_pml1 = (pml1_entry_t *)PHYS_TO_DMAP(l1phys); + + mmu_radix_dmap_populate(hwphyssz); + + /* + * Create page tables for first 128MB of KVA + */ + pages = ptpages; + pte = pmap_pml1e(kernel_pmap, VM_MIN_KERNEL_ADDRESS); + *pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT); + pages += PAGE_SIZE; + pte = pmap_l1e_to_l2e(pte, VM_MIN_KERNEL_ADDRESS); + *pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT); + pages += PAGE_SIZE; + pte = pmap_l2e_to_l3e(pte, VM_MIN_KERNEL_ADDRESS); + /* + * the kernel page table pages need to be preserved in + * phys_avail and not overlap with previous allocations + */ + pages = allocpages(nkpt); + if (bootverbose) { + printf("phys_avail after dmap populate and nkpt allocation\n"); + for (int j = 0; j < 2 * phys_avail_count; j+=2) + printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n", + j, phys_avail[j], j + 1, phys_avail[j + 1]); + } + KPTphys = pages; + for (int i = 0; i < nkpt; i++, pte++, pages += PAGE_SIZE) + *pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT); + kernel_vm_end = VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE; + if (bootverbose) + printf("kernel_pmap pml1 %p\n", kernel_pmap->pm_pml1); + /* + * Add a physical memory segment (vm_phys_seg) corresponding to the + * preallocated kernel page table pages so that vm_page structures + * representing these pages will be created. The vm_page structures + * are required for promotion of the corresponding kernel virtual + * addresses to superpage mappings. + */ + vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); +} + +static void +mmu_radix_early_bootstrap(vm_offset_t start, vm_offset_t end) +{ + vm_paddr_t kpstart, kpend; + vm_size_t physsz, hwphyssz; + //uint64_t l2virt; + int rm_pavail, proctab_size; + int i, j; + + kpstart = start & ~DMAP_BASE_ADDRESS; + kpend = end & ~DMAP_BASE_ADDRESS; + + /* Get physical memory regions from firmware */ + mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); + CTR0(KTR_PMAP, "mmu_radix_early_bootstrap: physical memory"); + + if (2 * VM_PHYSSEG_MAX < regions_sz) + panic("mmu_radix_early_bootstrap: phys_avail too small"); + + if (bootverbose) + for (int i = 0; i < regions_sz; i++) + printf("regions[%d].mr_start=%lx regions[%d].mr_size=%lx\n", + i, regions[i].mr_start, i, regions[i].mr_size); + /* + * XXX workaround a simulator bug + */ + for (int i = 0; i < regions_sz; i++) + if (regions[i].mr_start & PAGE_MASK) { + regions[i].mr_start += PAGE_MASK; + regions[i].mr_start &= ~PAGE_MASK; + regions[i].mr_size &= ~PAGE_MASK; + } + if (bootverbose) + for (int i = 0; i < pregions_sz; i++) + printf("pregions[%d].mr_start=%lx pregions[%d].mr_size=%lx\n", + i, pregions[i].mr_start, i, pregions[i].mr_size); + + phys_avail_count = 0; + physsz = 0; + hwphyssz = 0; + TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz); + for (i = 0, j = 0; i < regions_sz; i++) { + if (bootverbose) + printf("regions[%d].mr_start=%016lx regions[%d].mr_size=%016lx\n", + i, regions[i].mr_start, i, regions[i].mr_size); + + if (regions[i].mr_size < PAGE_SIZE) + continue; + + if (hwphyssz != 0 && + (physsz + regions[i].mr_size) >= hwphyssz) { + if (physsz < hwphyssz) { + phys_avail[j] = regions[i].mr_start; + phys_avail[j + 1] = regions[i].mr_start + + (hwphyssz - physsz); + physsz = hwphyssz; + phys_avail_count++; + dump_avail[j] = phys_avail[j]; + dump_avail[j + 1] = phys_avail[j + 1]; + } + break; + } + phys_avail[j] = regions[i].mr_start; + phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size; + dump_avail[j] = phys_avail[j]; + dump_avail[j + 1] = phys_avail[j + 1]; + + 
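+		/*
+		 * phys_avail[] and dump_avail[] are kept as (start, end)
+		 * address pairs: j indexes the pair being filled and
+		 * phys_avail_count counts completed pairs.
+		 */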
phys_avail_count++; + physsz += regions[i].mr_size; + j += 2; + } + + /* Check for overlap with the kernel and exception vectors */ + rm_pavail = 0; + for (j = 0; j < 2 * phys_avail_count; j+=2) { + if (phys_avail[j] < EXC_LAST) + phys_avail[j] += EXC_LAST; + + if (phys_avail[j] >= kpstart && + phys_avail[j + 1] <= kpend) { + phys_avail[j] = phys_avail[j + 1] = ~0; + rm_pavail++; + continue; + } + + if (kpstart >= phys_avail[j] && + kpstart < phys_avail[j + 1]) { + if (kpend < phys_avail[j + 1]) { + phys_avail[2 * phys_avail_count] = + (kpend & ~PAGE_MASK) + PAGE_SIZE; + phys_avail[2 * phys_avail_count + 1] = + phys_avail[j + 1]; + phys_avail_count++; + } + + phys_avail[j + 1] = kpstart & ~PAGE_MASK; + } + + if (kpend >= phys_avail[j] && + kpend < phys_avail[j + 1]) { + if (kpstart > phys_avail[j]) { + phys_avail[2 * phys_avail_count] = phys_avail[j]; + phys_avail[2 * phys_avail_count + 1] = + kpstart & ~PAGE_MASK; + phys_avail_count++; + } + + phys_avail[j] = (kpend & ~PAGE_MASK) + + PAGE_SIZE; + } + } + qsort(phys_avail, 2 * phys_avail_count, sizeof(phys_avail[0]), pa_cmp); + for (i = 0; i < 2 * phys_avail_count; i++) + phys_avail_debug[i] = phys_avail[i]; + + /* Remove physical available regions marked for removal (~0) */ + if (rm_pavail) { + phys_avail_count -= rm_pavail; + for (i = 2 * phys_avail_count; + i < 2*(phys_avail_count + rm_pavail); i+=2) + phys_avail[i] = phys_avail[i + 1] = 0; + } + if (bootverbose) { + printf("phys_avail ranges after filtering:\n"); + for (j = 0; j < 2 * phys_avail_count; j+=2) + printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n", + j, phys_avail[j], j + 1, phys_avail[j + 1]); + } + physmem = btoc(physsz); + + /* XXX assume we're running non-virtualized and + * we don't support BHYVE + */ + if (isa3_pid_bits == 0) + isa3_pid_bits = 20; + if (powernv_enabled) { + parttab_phys = + moea64_bootstrap_alloc(PARTTAB_SIZE, PARTTAB_SIZE); + validate_addr(parttab_phys, PARTTAB_SIZE); + for (int i = 0; i < PARTTAB_SIZE/PAGE_SIZE; i++) + pagezero(PHYS_TO_DMAP(parttab_phys + i * PAGE_SIZE)); + + } + proctab_size = 1UL << PROCTAB_SIZE_SHIFT; + proctab0pa = moea64_bootstrap_alloc(proctab_size, proctab_size); + validate_addr(proctab0pa, proctab_size); + for (int i = 0; i < proctab_size/PAGE_SIZE; i++) + pagezero(PHYS_TO_DMAP(proctab0pa + i * PAGE_SIZE)); + + mmu_radix_setup_pagetables(hwphyssz); +} + +static void +mmu_radix_late_bootstrap(vm_offset_t start, vm_offset_t end) +{ + int i; + vm_paddr_t pa; + void *dpcpu; + vm_offset_t va; + + /* + * Set up the Open Firmware pmap and add its mappings if not in real + * mode. + */ + if (bootverbose) + printf("%s enter\n", __func__); + + /* + * Calculate the last available physical address, and reserve the + * vm_page_array (upper bound). + */ + Maxmem = 0; + for (i = 0; phys_avail[i + 1] != 0; i += 2) + Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1])); + + /* + * Remap any early IO mappings (console framebuffer, etc.) + */ + bs_remap_earlyboot(); + + /* + * Allocate a kernel stack with a guard page for thread0 and map it + * into the kernel page map. + */ + pa = allocpages(kstack_pages); + va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE; + virtual_avail = va + kstack_pages * PAGE_SIZE; + CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va); + thread0.td_kstack = va; + for (i = 0; i < kstack_pages; i++) { + mmu_radix_kenter(va, pa); + pa += PAGE_SIZE; + va += PAGE_SIZE; + } + thread0.td_kstack_pages = kstack_pages; + + /* + * Allocate virtual address space for the message buffer. 
+ */ + pa = msgbuf_phys = allocpages((msgbufsize + PAGE_MASK) >> PAGE_SHIFT); + msgbufp = (struct msgbuf *)PHYS_TO_DMAP(pa); + + /* + * Allocate virtual address space for the dynamic percpu area. + */ + pa = allocpages(DPCPU_SIZE >> PAGE_SHIFT); + dpcpu = (void *)PHYS_TO_DMAP(pa); + dpcpu_init(dpcpu, curcpu); + + crashdumpmap = (caddr_t)virtual_avail; + virtual_avail += MAXDUMPPGS * PAGE_SIZE; + + /* + * Reserve some special page table entries/VA space for temporary + * mapping of pages. + */ +} + +static void +mmu_parttab_init(void) +{ + uint64_t ptcr; + + isa3_parttab = (struct pate *)PHYS_TO_DMAP(parttab_phys); + + if (bootverbose) + printf("%s parttab: %p\n", __func__, isa3_parttab); + ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12); + if (bootverbose) + printf("setting ptcr %lx\n", ptcr); + mtspr(SPR_PTCR, ptcr); +} + +static void +mmu_parttab_update(uint64_t lpid, uint64_t pagetab, uint64_t proctab) +{ + uint64_t prev; + + if (bootverbose) + printf("%s isa3_parttab %p lpid %lx pagetab %lx proctab %lx\n", __func__, isa3_parttab, + lpid, pagetab, proctab); + prev = be64toh(isa3_parttab[lpid].pagetab); + isa3_parttab[lpid].pagetab = htobe64(pagetab); + isa3_parttab[lpid].proctab = htobe64(proctab); + + if (prev & PARTTAB_HR) { + __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + } else { + __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + } + ttusync(); +} + +static void +mmu_radix_parttab_init(void) +{ + uint64_t pagetab; + + mmu_parttab_init(); + pagetab = RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) | \ + RADIX_PGD_INDEX_SHIFT | PARTTAB_HR; + mmu_parttab_update(0, pagetab, 0); +} + +static void +mmu_radix_proctab_register(vm_paddr_t proctabpa, uint64_t table_size) +{ + uint64_t pagetab, proctab; + + pagetab = be64toh(isa3_parttab[0].pagetab); + proctab = proctabpa | table_size | PARTTAB_GR; + mmu_parttab_update(0, pagetab, proctab); +} + +static void +mmu_radix_proctab_init(void) +{ + + isa3_base_pid = 1; + + isa3_proctab = (void*)PHYS_TO_DMAP(proctab0pa); + isa3_proctab->proctab0 = + htobe64(RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) | + RADIX_PGD_INDEX_SHIFT); + + if (powernv_enabled) { + mmu_radix_proctab_register(proctab0pa, PROCTAB_SIZE_SHIFT - 12); + __asm __volatile("ptesync" : : : "memory"); + __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (0)); + __asm __volatile("eieio; tlbsync; ptesync" : : : "memory"); +#ifdef PSERIES + } else { + int64_t rc; + + rc = phyp_hcall(H_REGISTER_PROC_TBL, + PROC_TABLE_NEW | PROC_TABLE_RADIX | PROC_TABLE_GTSE, + proctab0pa, 0, PROCTAB_SIZE_SHIFT - 12); + if (rc != H_SUCCESS) + panic("mmu_radix_proctab_init: " + "failed to register process table: rc=%jd", + (intmax_t)rc); +#endif + } + + if (bootverbose) + printf("process table %p and kernel radix PDE: %p\n", + isa3_proctab, kernel_pmap->pm_pml1); + mtmsr(mfmsr() | PSL_DR ); + mtmsr(mfmsr() & ~PSL_DR); + kernel_pmap->pm_pid = isa3_base_pid; + isa3_base_pid++; +} + +void +mmu_radix_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, + int advice) +{ + struct rwlock *lock; + pml1_entry_t *l1e; + pml2_entry_t *l2e; + pml3_entry_t oldl3e, *l3e; + pt_entry_t *pte; + vm_offset_t va, va_next; + vm_page_t m; + bool anychanged; + + if (advice != MADV_DONTNEED && advice != MADV_FREE) + return; + anychanged = false; + PMAP_LOCK(pmap); + for (; sva < eva; sva = va_next) { + 
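+		/*
+		 * Each iteration covers at most one 2MB L3 slot.  If the L1
+		 * or L2 entry is invalid, va_next is advanced to the next
+		 * 512GB or 1GB boundary so the whole unmapped range is
+		 * skipped in one step.
+		 */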
l1e = pmap_pml1e(pmap, sva); + if ((be64toh(*l1e) & PG_V) == 0) { + va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; + if (va_next < sva) + va_next = eva; + continue; + } + l2e = pmap_l1e_to_l2e(l1e, sva); + if ((be64toh(*l2e) & PG_V) == 0) { + va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; + if (va_next < sva) + va_next = eva; + continue; + } + va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; + if (va_next < sva) + va_next = eva; + l3e = pmap_l2e_to_l3e(l2e, sva); + oldl3e = be64toh(*l3e); + if ((oldl3e & PG_V) == 0) + continue; + else if ((oldl3e & RPTE_LEAF) != 0) { + if ((oldl3e & PG_MANAGED) == 0) + continue; + lock = NULL; + if (!pmap_demote_l3e_locked(pmap, l3e, sva, &lock)) { + if (lock != NULL) + rw_wunlock(lock); + + /* + * The large page mapping was destroyed. + */ + continue; + } + + /* + * Unless the page mappings are wired, remove the + * mapping to a single page so that a subsequent + * access may repromote. Choosing the last page + * within the address range [sva, min(va_next, eva)) + * generally results in more repromotions. Since the + * underlying page table page is fully populated, this + * removal never frees a page table page. + */ + if ((oldl3e & PG_W) == 0) { + va = eva; + if (va > va_next) + va = va_next; + va -= PAGE_SIZE; + KASSERT(va >= sva, + ("mmu_radix_advise: no address gap")); + pte = pmap_l3e_to_pte(l3e, va); + KASSERT((be64toh(*pte) & PG_V) != 0, + ("pmap_advise: invalid PTE")); + pmap_remove_pte(pmap, pte, va, be64toh(*l3e), NULL, + &lock); + anychanged = true; + } + if (lock != NULL) + rw_wunlock(lock); + } + if (va_next > eva) + va_next = eva; + va = va_next; + for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; + pte++, sva += PAGE_SIZE) { + MPASS(pte == pmap_pte(pmap, sva)); + + if ((be64toh(*pte) & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) + goto maybe_invlrng; + else if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) { + if (advice == MADV_DONTNEED) { + /* + * Future calls to pmap_is_modified() + * can be avoided by making the page + * dirty now. + */ + m = PHYS_TO_VM_PAGE(be64toh(*pte) & PG_FRAME); + vm_page_dirty(m); + } + atomic_clear_long(pte, htobe64(PG_M | PG_A)); + } else if ((be64toh(*pte) & PG_A) != 0) + atomic_clear_long(pte, htobe64(PG_A)); + else + goto maybe_invlrng; + anychanged = true; + continue; +maybe_invlrng: + if (va != va_next) { + anychanged = true; + va = va_next; + } + } + if (va != va_next) + anychanged = true; + } + if (anychanged) + pmap_invalidate_all(pmap); + PMAP_UNLOCK(pmap); +} + +/* + * Routines used in machine-dependent code + */ +static void +mmu_radix_bootstrap(vm_offset_t start, vm_offset_t end) +{ + uint64_t lpcr; + + if (bootverbose) + printf("%s\n", __func__); + hw_direct_map = 1; + powernv_enabled = (mfmsr() & PSL_HV) ? 
1 : 0; + mmu_radix_early_bootstrap(start, end); + if (bootverbose) + printf("early bootstrap complete\n"); + if (powernv_enabled) { + lpcr = mfspr(SPR_LPCR); + mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR); + mmu_radix_parttab_init(); + mmu_radix_init_amor(); + if (bootverbose) + printf("powernv init complete\n"); + } + mmu_radix_init_iamr(); + mmu_radix_proctab_init(); + mmu_radix_pid_set(kernel_pmap); + if (powernv_enabled) + mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL); + else + mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_LPID); + + mmu_radix_late_bootstrap(start, end); + numa_mem_regions(&numa_pregions, &numa_pregions_sz); + if (bootverbose) + printf("%s done\n", __func__); + pmap_bootstrapped = 1; + dmaplimit = roundup2(powerpc_ptob(Maxmem), L2_PAGE_SIZE); + PCPU_SET(flags, PCPU_GET(flags) | PC_FLAG_NOSRS); +} + +static void +mmu_radix_cpu_bootstrap(int ap) +{ + uint64_t lpcr; + uint64_t ptcr; + + if (powernv_enabled) { + lpcr = mfspr(SPR_LPCR); + mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR); + + ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12); + mtspr(SPR_PTCR, ptcr); + mmu_radix_init_amor(); + } + mmu_radix_init_iamr(); + mmu_radix_pid_set(kernel_pmap); + if (powernv_enabled) + mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL); + else + mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_LPID); +} + +static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3e, CTLFLAG_RD, 0, + "2MB page mapping counters"); + +static COUNTER_U64_DEFINE_EARLY(pmap_l3e_demotions); +SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, demotions, CTLFLAG_RD, + &pmap_l3e_demotions, "2MB page demotions"); + +static COUNTER_U64_DEFINE_EARLY(pmap_l3e_mappings); +SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, mappings, CTLFLAG_RD, + &pmap_l3e_mappings, "2MB page mappings"); + +static COUNTER_U64_DEFINE_EARLY(pmap_l3e_p_failures); +SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, p_failures, CTLFLAG_RD, + &pmap_l3e_p_failures, "2MB page promotion failures"); + +static COUNTER_U64_DEFINE_EARLY(pmap_l3e_promotions); +SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, promotions, CTLFLAG_RD, + &pmap_l3e_promotions, "2MB page promotions"); + +static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2e, CTLFLAG_RD, 0, + "1GB page mapping counters"); + +static COUNTER_U64_DEFINE_EARLY(pmap_l2e_demotions); +SYSCTL_COUNTER_U64(_vm_pmap_l2e, OID_AUTO, demotions, CTLFLAG_RD, + &pmap_l2e_demotions, "1GB page demotions"); + +void +mmu_radix_clear_modify(vm_page_t m) +{ + struct md_page *pvh; + pmap_t pmap; + pv_entry_t next_pv, pv; + pml3_entry_t oldl3e, *l3e; + pt_entry_t oldpte, *pte; + struct rwlock *lock; + vm_offset_t va; + int md_gen, pvh_gen; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_clear_modify: page %p is not managed", m)); + vm_page_assert_busied(m); + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + + /* + * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. + * If the object containing the page is locked and the page is not + * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. + */ + if ((m->a.flags & PGA_WRITEABLE) == 0) + return; + pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : + pa_to_pvh(VM_PAGE_TO_PHYS(m)); + lock = VM_PAGE_TO_PV_LIST_LOCK(m); + rw_wlock(lock); +restart: + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + va = pv->pv_va; + l3e = pmap_pml3e(pmap, va); + oldl3e = be64toh(*l3e); + if ((oldl3e & PG_RW) != 0 && + pmap_demote_l3e_locked(pmap, l3e, va, &lock) && + (oldl3e & PG_W) == 0) { + /* + * Write protect the mapping to a + * single page so that a subsequent + * write access may repromote. + */ + va += VM_PAGE_TO_PHYS(m) - (oldl3e & + PG_PS_FRAME); + pte = pmap_l3e_to_pte(l3e, va); + oldpte = be64toh(*pte); + while (!atomic_cmpset_long(pte, + htobe64(oldpte), + htobe64((oldpte | RPTE_EAA_R) & ~(PG_M | PG_RW)))) + oldpte = be64toh(*pte); + vm_page_dirty(m); + pmap_invalidate_page(pmap, va); + } + PMAP_UNLOCK(pmap); + } + TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + md_gen = m->md.pv_gen; + pvh_gen = pvh->pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + l3e = pmap_pml3e(pmap, pv->pv_va); + KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, ("pmap_clear_modify: found" + " a 2mpage in page %p's pv list", m)); + pte = pmap_l3e_to_pte(l3e, pv->pv_va); + if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) { + atomic_clear_long(pte, htobe64(PG_M)); + pmap_invalidate_page(pmap, pv->pv_va); + } + PMAP_UNLOCK(pmap); + } + rw_wunlock(lock); +} + +void +mmu_radix_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, + vm_size_t len, vm_offset_t src_addr) +{ + struct rwlock *lock; + struct spglist free; + vm_offset_t addr; + vm_offset_t end_addr = src_addr + len; + vm_offset_t va_next; + vm_page_t dst_pdpg, dstmpte, srcmpte; + bool invalidate_all; + + CTR6(KTR_PMAP, + "%s(dst_pmap=%p, src_pmap=%p, dst_addr=%lx, len=%lu, src_addr=%lx)\n", + __func__, dst_pmap, src_pmap, dst_addr, len, src_addr); + + if (dst_addr != src_addr) + return; + lock = NULL; + invalidate_all = false; + if (dst_pmap < src_pmap) { + PMAP_LOCK(dst_pmap); + PMAP_LOCK(src_pmap); + } else { + PMAP_LOCK(src_pmap); + PMAP_LOCK(dst_pmap); + } + + for (addr = src_addr; addr < end_addr; addr = va_next) { + pml1_entry_t *l1e; + pml2_entry_t *l2e; + pml3_entry_t srcptepaddr, *l3e; + pt_entry_t *src_pte, *dst_pte; + + l1e = pmap_pml1e(src_pmap, addr); + if ((be64toh(*l1e) & PG_V) == 0) { + va_next = (addr + L1_PAGE_SIZE) & ~L1_PAGE_MASK; + if (va_next < addr) + va_next = end_addr; + continue; + } + + l2e = pmap_l1e_to_l2e(l1e, addr); + if ((be64toh(*l2e) & PG_V) == 0) { + va_next = (addr + L2_PAGE_SIZE) & ~L2_PAGE_MASK; + if (va_next < addr) + va_next = end_addr; + continue; + } + + va_next = (addr + L3_PAGE_SIZE) & ~L3_PAGE_MASK; + if (va_next < addr) + va_next = end_addr; + + l3e = pmap_l2e_to_l3e(l2e, addr); + srcptepaddr = be64toh(*l3e); + if (srcptepaddr == 0) + continue; + + if (srcptepaddr & RPTE_LEAF) { + if ((addr & L3_PAGE_MASK) != 0 || + addr + L3_PAGE_SIZE > end_addr) + continue; + dst_pdpg = pmap_allocl3e(dst_pmap, addr, NULL); + if (dst_pdpg == NULL) + break; + l3e = (pml3_entry_t *) + PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg)); + l3e = &l3e[pmap_pml3e_index(addr)]; + if (be64toh(*l3e) == 0 && ((srcptepaddr & PG_MANAGED) == 0 || + pmap_pv_insert_l3e(dst_pmap, addr, srcptepaddr, + 
PMAP_ENTER_NORECLAIM, &lock))) { + *l3e = htobe64(srcptepaddr & ~PG_W); + pmap_resident_count_inc(dst_pmap, + L3_PAGE_SIZE / PAGE_SIZE); + counter_u64_add(pmap_l3e_mappings, 1); + } else + dst_pdpg->ref_count--; + continue; + } + + srcptepaddr &= PG_FRAME; + srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); + KASSERT(srcmpte->ref_count > 0, + ("pmap_copy: source page table page is unused")); + + if (va_next > end_addr) + va_next = end_addr; + + src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); + src_pte = &src_pte[pmap_pte_index(addr)]; + dstmpte = NULL; + while (addr < va_next) { + pt_entry_t ptetemp; + ptetemp = be64toh(*src_pte); + /* + * we only virtual copy managed pages + */ + if ((ptetemp & PG_MANAGED) != 0) { + if (dstmpte != NULL && + dstmpte->pindex == pmap_l3e_pindex(addr)) + dstmpte->ref_count++; + else if ((dstmpte = pmap_allocpte(dst_pmap, + addr, NULL)) == NULL) + goto out; + dst_pte = (pt_entry_t *) + PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); + dst_pte = &dst_pte[pmap_pte_index(addr)]; + if (be64toh(*dst_pte) == 0 && + pmap_try_insert_pv_entry(dst_pmap, addr, + PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), + &lock)) { + /* + * Clear the wired, modified, and + * accessed (referenced) bits + * during the copy. + */ + *dst_pte = htobe64(ptetemp & ~(PG_W | PG_M | + PG_A)); + pmap_resident_count_inc(dst_pmap, 1); + } else { + SLIST_INIT(&free); + if (pmap_unwire_ptp(dst_pmap, addr, + dstmpte, &free)) { + /* + * Although "addr" is not + * mapped, paging-structure + * caches could nonetheless + * have entries that refer to + * the freed page table pages. + * Invalidate those entries. + */ + invalidate_all = true; + vm_page_free_pages_toq(&free, + true); + } + goto out; + } + if (dstmpte->ref_count >= srcmpte->ref_count) + break; + } + addr += PAGE_SIZE; + if (__predict_false((addr & L3_PAGE_MASK) == 0)) + src_pte = pmap_pte(src_pmap, addr); + else + src_pte++; + } + } +out: + if (invalidate_all) + pmap_invalidate_all(dst_pmap); + if (lock != NULL) + rw_wunlock(lock); + PMAP_UNLOCK(src_pmap); + PMAP_UNLOCK(dst_pmap); +} + +static void +mmu_radix_copy_page(vm_page_t msrc, vm_page_t mdst) +{ + vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); + vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); + + CTR3(KTR_PMAP, "%s(%p, %p)", __func__, src, dst); + /* + * XXX slow + */ + bcopy((void *)src, (void *)dst, PAGE_SIZE); +} + +static void +mmu_radix_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], + vm_offset_t b_offset, int xfersize) +{ + void *a_cp, *b_cp; + vm_offset_t a_pg_offset, b_pg_offset; + int cnt; + + CTR6(KTR_PMAP, "%s(%p, %#x, %p, %#x, %#x)", __func__, ma, + a_offset, mb, b_offset, xfersize); + + while (xfersize > 0) { + a_pg_offset = a_offset & PAGE_MASK; + cnt = min(xfersize, PAGE_SIZE - a_pg_offset); + a_cp = (char *)(uintptr_t)PHYS_TO_DMAP( + VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])) + + a_pg_offset; + b_pg_offset = b_offset & PAGE_MASK; + cnt = min(cnt, PAGE_SIZE - b_pg_offset); + b_cp = (char *)(uintptr_t)PHYS_TO_DMAP( + VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])) + + b_pg_offset; + bcopy(a_cp, b_cp, cnt); + a_offset += cnt; + b_offset += cnt; + xfersize -= cnt; + } +} + +#if VM_NRESERVLEVEL > 0 +/* + * Tries to promote the 512, contiguous 4KB page mappings that are within a + * single page table page (PTP) to a single 2MB page mapping. For promotion + * to occur, two conditions must be met: (1) the 4KB page mappings must map + * aligned, contiguous physical memory and (2) the 4KB page mappings must have + * identical characteristics. 
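+ *
+ * "Identical characteristics" are compared with the PG_PTE_PROMOTE mask
+ * below, after any clean, writeable 4KB mappings have been downgraded to
+ * read-only.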
+ */ +static int +pmap_promote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va, + struct rwlock **lockp) +{ + pml3_entry_t newpde; + pt_entry_t *firstpte, oldpte, pa, *pte; + vm_page_t mpte; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + /* + * Examine the first PTE in the specified PTP. Abort if this PTE is + * either invalid, unused, or does not map the first 4KB physical page + * within a 2MB page. + */ + firstpte = (pt_entry_t *)PHYS_TO_DMAP(be64toh(*pde) & PG_FRAME); +setpde: + newpde = be64toh(*firstpte); + if ((newpde & ((PG_FRAME & L3_PAGE_MASK) | PG_A | PG_V)) != (PG_A | PG_V)) { + CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" + " in pmap %p", va, pmap); + goto fail; + } + if ((newpde & (PG_M | PG_RW)) == PG_RW) { + /* + * When PG_M is already clear, PG_RW can be cleared without + * a TLB invalidation. + */ + if (!atomic_cmpset_long(firstpte, htobe64(newpde), htobe64((newpde | RPTE_EAA_R) & ~RPTE_EAA_W))) + goto setpde; + newpde &= ~RPTE_EAA_W; + } + + /* + * Examine each of the other PTEs in the specified PTP. Abort if this + * PTE maps an unexpected 4KB physical page or does not have identical + * characteristics to the first PTE. + */ + pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + L3_PAGE_SIZE - PAGE_SIZE; + for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { +setpte: + oldpte = be64toh(*pte); + if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { + CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" + " in pmap %p", va, pmap); + goto fail; + } + if ((oldpte & (PG_M | PG_RW)) == PG_RW) { + /* + * When PG_M is already clear, PG_RW can be cleared + * without a TLB invalidation. + */ + if (!atomic_cmpset_long(pte, htobe64(oldpte), htobe64((oldpte | RPTE_EAA_R) & ~RPTE_EAA_W))) + goto setpte; + oldpte &= ~RPTE_EAA_W; + CTR2(KTR_PMAP, "pmap_promote_l3e: protect for va %#lx" + " in pmap %p", (oldpte & PG_FRAME & L3_PAGE_MASK) | + (va & ~L3_PAGE_MASK), pmap); + } + if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { + CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx" + " in pmap %p", va, pmap); + goto fail; + } + pa -= PAGE_SIZE; + } + + /* + * Save the page table page in its current state until the PDE + * mapping the superpage is demoted by pmap_demote_pde() or + * destroyed by pmap_remove_pde(). + */ + mpte = PHYS_TO_VM_PAGE(be64toh(*pde) & PG_FRAME); + KASSERT(mpte >= vm_page_array && + mpte < &vm_page_array[vm_page_array_size], + ("pmap_promote_l3e: page table page is out of range")); + KASSERT(mpte->pindex == pmap_l3e_pindex(va), + ("pmap_promote_l3e: page table page's pindex is wrong")); + if (pmap_insert_pt_page(pmap, mpte)) { + CTR2(KTR_PMAP, + "pmap_promote_l3e: failure for va %#lx in pmap %p", va, + pmap); + goto fail; + } + + /* + * Promote the pv entries. 
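+ *
+ * The PDE is stored with PG_PROMOTED set so that later operations on it
+ * know that 4KB TLB entries for this range may still be cached and must
+ * be invalidated.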
+ */ + if ((newpde & PG_MANAGED) != 0) + pmap_pv_promote_l3e(pmap, va, newpde & PG_PS_FRAME, lockp); + + pte_store(pde, PG_PROMOTED | newpde); + ptesync(); + counter_u64_add(pmap_l3e_promotions, 1); + CTR2(KTR_PMAP, "pmap_promote_l3e: success for va %#lx" + " in pmap %p", va, pmap); + return (0); + fail: + counter_u64_add(pmap_l3e_p_failures, 1); + return (KERN_FAILURE); +} +#endif /* VM_NRESERVLEVEL > 0 */ + +int +mmu_radix_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot, u_int flags, int8_t psind) +{ + struct rwlock *lock; + pml3_entry_t *l3e; + pt_entry_t *pte; + pt_entry_t newpte, origpte; + pv_entry_t pv; + vm_paddr_t opa, pa; + vm_page_t mpte, om; + int rv, retrycount; + bool nosleep, invalidate_all, invalidate_page; + + va = trunc_page(va); + retrycount = 0; + invalidate_page = invalidate_all = false; + CTR6(KTR_PMAP, "pmap_enter(%p, %#lx, %p, %#x, %#x, %d)", pmap, va, + m, prot, flags, psind); + KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); + KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va), + ("pmap_enter: managed mapping within the clean submap")); + if ((m->oflags & VPO_UNMANAGED) == 0) + VM_PAGE_OBJECT_BUSY_ASSERT(m); + + KASSERT((flags & PMAP_ENTER_RESERVED) == 0, + ("pmap_enter: flags %u has reserved bits set", flags)); + pa = VM_PAGE_TO_PHYS(m); + newpte = (pt_entry_t)(pa | PG_A | PG_V | RPTE_LEAF); + if ((flags & VM_PROT_WRITE) != 0) + newpte |= PG_M; + if ((flags & VM_PROT_READ) != 0) + newpte |= PG_A; + if (prot & VM_PROT_READ) + newpte |= RPTE_EAA_R; + if ((prot & VM_PROT_WRITE) != 0) + newpte |= RPTE_EAA_W; + KASSERT((newpte & (PG_M | PG_RW)) != PG_M, + ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); + + if (prot & VM_PROT_EXECUTE) + newpte |= PG_X; + if ((flags & PMAP_ENTER_WIRED) != 0) + newpte |= PG_W; + if (va >= DMAP_MIN_ADDRESS) + newpte |= RPTE_EAA_P; + newpte |= pmap_cache_bits(m->md.mdpg_cache_attrs); + /* + * Set modified bit gratuitously for writeable mappings if + * the page is unmanaged. We do not want to take a fault + * to do the dirty bit accounting for these mappings. + */ + if ((m->oflags & VPO_UNMANAGED) != 0) { + if ((newpte & PG_RW) != 0) + newpte |= PG_M; + } else + newpte |= PG_MANAGED; + + lock = NULL; + PMAP_LOCK(pmap); + if (psind == 1) { + /* Assert the required virtual and physical alignment. */ + KASSERT((va & L3_PAGE_MASK) == 0, ("pmap_enter: va unaligned")); + KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); + rv = pmap_enter_l3e(pmap, va, newpte | RPTE_LEAF, flags, m, &lock); + goto out; + } + mpte = NULL; + + /* + * In the case that a page table page is not + * resident, we are creating it here. + */ +retry: + l3e = pmap_pml3e(pmap, va); + if (l3e != NULL && (be64toh(*l3e) & PG_V) != 0 && ((be64toh(*l3e) & RPTE_LEAF) == 0 || + pmap_demote_l3e_locked(pmap, l3e, va, &lock))) { + pte = pmap_l3e_to_pte(l3e, va); + if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { + mpte = PHYS_TO_VM_PAGE(be64toh(*l3e) & PG_FRAME); + mpte->ref_count++; + } + } else if (va < VM_MAXUSER_ADDRESS) { + /* + * Here if the pte page isn't mapped, or if it has been + * deallocated. + */ + nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; + mpte = _pmap_allocpte(pmap, pmap_l3e_pindex(va), + nosleep ? 
NULL : &lock); + if (mpte == NULL && nosleep) { + rv = KERN_RESOURCE_SHORTAGE; + goto out; + } + if (__predict_false(retrycount++ == 6)) + panic("too many retries"); + invalidate_all = true; + goto retry; + } else + panic("pmap_enter: invalid page directory va=%#lx", va); + + origpte = be64toh(*pte); + pv = NULL; + + /* + * Is the specified virtual address already mapped? + */ + if ((origpte & PG_V) != 0) { +#ifdef INVARIANTS + if (VERBOSE_PMAP || pmap_logging) { + printf("cow fault pmap_enter(%p, %#lx, %p, %#x, %x, %d) --" + " asid=%lu curpid=%d name=%s origpte0x%lx\n", + pmap, va, m, prot, flags, psind, pmap->pm_pid, + curproc->p_pid, curproc->p_comm, origpte); +#ifdef DDB + pmap_pte_walk(pmap->pm_pml1, va); +#endif + } +#endif + /* + * Wiring change, just update stats. We don't worry about + * wiring PT pages as they remain resident as long as there + * are valid mappings in them. Hence, if a user page is wired, + * the PT page will be also. + */ + if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) + pmap->pm_stats.wired_count++; + else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) + pmap->pm_stats.wired_count--; + + /* + * Remove the extra PT page reference. + */ + if (mpte != NULL) { + mpte->ref_count--; + KASSERT(mpte->ref_count > 0, + ("pmap_enter: missing reference to page table page," + " va: 0x%lx", va)); + } + + /* + * Has the physical page changed? + */ + opa = origpte & PG_FRAME; + if (opa == pa) { + /* + * No, might be a protection or wiring change. + */ + if ((origpte & PG_MANAGED) != 0 && + (newpte & PG_RW) != 0) + vm_page_aflag_set(m, PGA_WRITEABLE); + if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) { + if ((newpte & (PG_A|PG_M)) != (origpte & (PG_A|PG_M))) { + if (!atomic_cmpset_long(pte, htobe64(origpte), htobe64(newpte))) + goto retry; + if ((newpte & PG_M) != (origpte & PG_M)) + vm_page_dirty(m); + if ((newpte & PG_A) != (origpte & PG_A)) + vm_page_aflag_set(m, PGA_REFERENCED); + ptesync(); + } else + invalidate_all = true; + if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) + goto unchanged; + } + goto validate; + } + + /* + * The physical page has changed. Temporarily invalidate + * the mapping. This ensures that all threads sharing the + * pmap keep a consistent view of the mapping, which is + * necessary for the correct handling of COW faults. It + * also permits reuse of the old mapping's PV entry, + * avoiding an allocation. + * + * For consistency, handle unmanaged mappings the same way. + */ + origpte = be64toh(pte_load_clear(pte)); + KASSERT((origpte & PG_FRAME) == opa, + ("pmap_enter: unexpected pa update for %#lx", va)); + if ((origpte & PG_MANAGED) != 0) { + om = PHYS_TO_VM_PAGE(opa); + + /* + * The pmap lock is sufficient to synchronize with + * concurrent calls to pmap_page_test_mappings() and + * pmap_ts_referenced(). 
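+ *
+ * The old PTE was already cleared above, so origpte is a stable
+ * snapshot of the outgoing mapping's accessed and modified bits.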
+ */ + if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(om); + if ((origpte & PG_A) != 0) + vm_page_aflag_set(om, PGA_REFERENCED); + CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); + pv = pmap_pvh_remove(&om->md, pmap, va); + if ((newpte & PG_MANAGED) == 0) + free_pv_entry(pmap, pv); +#ifdef INVARIANTS + else if (origpte & PG_MANAGED) { + if (pv == NULL) { +#ifdef DDB + pmap_page_print_mappings(om); +#endif + MPASS(pv != NULL); + } + } +#endif + if ((om->a.flags & PGA_WRITEABLE) != 0 && + TAILQ_EMPTY(&om->md.pv_list) && + ((om->flags & PG_FICTITIOUS) != 0 || + TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) + vm_page_aflag_clear(om, PGA_WRITEABLE); + } + if ((origpte & PG_A) != 0) + invalidate_page = true; + origpte = 0; + } else { + if (pmap != kernel_pmap) { +#ifdef INVARIANTS + if (VERBOSE_PMAP || pmap_logging) + printf("pmap_enter(%p, %#lx, %p, %#x, %x, %d) -- asid=%lu curpid=%d name=%s\n", + pmap, va, m, prot, flags, psind, + pmap->pm_pid, curproc->p_pid, + curproc->p_comm); +#endif + } + + /* + * Increment the counters. + */ + if ((newpte & PG_W) != 0) + pmap->pm_stats.wired_count++; + pmap_resident_count_inc(pmap, 1); + } + + /* + * Enter on the PV list if part of our managed memory. + */ + if ((newpte & PG_MANAGED) != 0) { + if (pv == NULL) { + pv = get_pv_entry(pmap, &lock); + pv->pv_va = va; + } +#ifdef VERBOSE_PV + else + printf("reassigning pv: %p to pmap: %p\n", + pv, pmap); +#endif + CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); + m->md.pv_gen++; + if ((newpte & PG_RW) != 0) + vm_page_aflag_set(m, PGA_WRITEABLE); + } + + /* + * Update the PTE. + */ + if ((origpte & PG_V) != 0) { +validate: + origpte = be64toh(pte_load_store(pte, htobe64(newpte))); + KASSERT((origpte & PG_FRAME) == pa, + ("pmap_enter: unexpected pa update for %#lx", va)); + if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == + (PG_M | PG_RW)) { + if ((origpte & PG_MANAGED) != 0) + vm_page_dirty(m); + invalidate_page = true; + + /* + * Although the PTE may still have PG_RW set, TLB + * invalidation may nonetheless be required because + * the PTE no longer has PG_M set. + */ + } else if ((origpte & PG_X) != 0 || (newpte & PG_X) == 0) { + /* + * Removing capabilities requires invalidation on POWER + */ + invalidate_page = true; + goto unchanged; + } + if ((origpte & PG_A) != 0) + invalidate_page = true; + } else { + pte_store(pte, newpte); + ptesync(); + } +unchanged: + +#if VM_NRESERVLEVEL > 0 + /* + * If both the page table page and the reservation are fully + * populated, then attempt promotion. + */ + if ((mpte == NULL || mpte->ref_count == NPTEPG) && + mmu_radix_ps_enabled(pmap) && + (m->flags & PG_FICTITIOUS) == 0 && + vm_reserv_level_iffullpop(m) == 0 && + pmap_promote_l3e(pmap, l3e, va, &lock) == 0) + invalidate_all = true; +#endif + if (invalidate_all) + pmap_invalidate_all(pmap); + else if (invalidate_page) + pmap_invalidate_page(pmap, va); + + rv = KERN_SUCCESS; +out: + if (lock != NULL) + rw_wunlock(lock); + PMAP_UNLOCK(pmap); + + return (rv); +} + +/* + * Release a page table page reference after a failed attempt to create a + * mapping. + */ +static void +pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t pdpg) +{ + struct spglist free; + + SLIST_INIT(&free); + if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { + /* + * Although "va" is not mapped, paging- + * structure caches could nonetheless have + * entries that refer to the freed page table + * pages. Invalidate those entries. 
+ */ + pmap_invalidate_page(pmap, va); + vm_page_free_pages_toq(&free, true); + } +} + +/* + * Tries to create a read- and/or execute-only 2MB page mapping. Returns true + * if successful. Returns false if (1) a page table page cannot be allocated + * without sleeping, (2) a mapping already exists at the specified virtual + * address, or (3) a PV entry cannot be allocated without reclaiming another + * PV entry. + */ +static bool +pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, + struct rwlock **lockp) +{ + pml3_entry_t newpde; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs) | + RPTE_LEAF | PG_V; + if ((m->oflags & VPO_UNMANAGED) == 0) + newpde |= PG_MANAGED; + if (prot & VM_PROT_EXECUTE) + newpde |= PG_X; + if (prot & VM_PROT_READ) + newpde |= RPTE_EAA_R; + if (va >= DMAP_MIN_ADDRESS) + newpde |= RPTE_EAA_P; + return (pmap_enter_l3e(pmap, va, newpde, PMAP_ENTER_NOSLEEP | + PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == + KERN_SUCCESS); +} + +/* + * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if + * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE + * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and + * a mapping already exists at the specified virtual address. Returns + * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table + * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if + * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. + * + * The parameter "m" is only used when creating a managed, writeable mapping. + */ +static int +pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde, u_int flags, + vm_page_t m, struct rwlock **lockp) +{ + struct spglist free; + pml3_entry_t oldl3e, *l3e; + vm_page_t mt, pdpg; + vm_page_t uwptpg; + + KASSERT((newpde & (PG_M | PG_RW)) != PG_RW, + ("pmap_enter_pde: newpde is missing PG_M")); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + if ((pdpg = pmap_allocl3e(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? + NULL : lockp)) == NULL) { + CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" + " in pmap %p", va, pmap); + return (KERN_RESOURCE_SHORTAGE); + } + l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); + l3e = &l3e[pmap_pml3e_index(va)]; + oldl3e = be64toh(*l3e); + if ((oldl3e & PG_V) != 0) { + KASSERT(pdpg->ref_count > 1, + ("pmap_enter_pde: pdpg's wire count is too low")); + if ((flags & PMAP_ENTER_NOREPLACE) != 0) { + pdpg->ref_count--; + CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" + " in pmap %p", va, pmap); + return (KERN_FAILURE); + } + /* Break the existing mapping(s). */ + SLIST_INIT(&free); + if ((oldl3e & RPTE_LEAF) != 0) { + /* + * The reference to the PD page that was acquired by + * pmap_allocl3e() ensures that it won't be freed. + * However, if the PDE resulted from a promotion, then + * a reserved PT page could be freed. + */ + (void)pmap_remove_l3e(pmap, l3e, va, &free, lockp); + pmap_invalidate_l3e_page(pmap, va, oldl3e); + } else { + if (pmap_remove_ptes(pmap, va, va + L3_PAGE_SIZE, l3e, + &free, lockp)) + pmap_invalidate_all(pmap); + } + vm_page_free_pages_toq(&free, true); + if (va >= VM_MAXUSER_ADDRESS) { + mt = PHYS_TO_VM_PAGE(be64toh(*l3e) & PG_FRAME); + if (pmap_insert_pt_page(pmap, mt)) { + /* + * XXX Currently, this can't happen because + * we do not perform pmap_enter(psind == 1) + * on the kernel pmap. 
+ */ + panic("pmap_enter_pde: trie insert failed"); + } + } else + KASSERT(be64toh(*l3e) == 0, ("pmap_enter_pde: non-zero pde %p", + l3e)); + } + + /* + * Allocate leaf ptpage for wired userspace pages. + */ + uwptpg = NULL; + if ((newpde & PG_W) != 0 && pmap != kernel_pmap) { + uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED); + if (uwptpg == NULL) { + pmap_abort_ptp(pmap, va, pdpg); + return (KERN_RESOURCE_SHORTAGE); + } + uwptpg->pindex = pmap_l3e_pindex(va); + if (pmap_insert_pt_page(pmap, uwptpg)) { + vm_page_unwire_noq(uwptpg); + vm_page_free(uwptpg); + pmap_abort_ptp(pmap, va, pdpg); + return (KERN_RESOURCE_SHORTAGE); + } + pmap_resident_count_inc(pmap, 1); + uwptpg->ref_count = NPTEPG; + pmap_fill_ptp((pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(uwptpg)), + newpde); + } + if ((newpde & PG_MANAGED) != 0) { + /* + * Abort this mapping if its PV entry could not be created. + */ + if (!pmap_pv_insert_l3e(pmap, va, newpde, flags, lockp)) { + pmap_abort_ptp(pmap, va, pdpg); + if (uwptpg != NULL) { + mt = pmap_remove_pt_page(pmap, va); + KASSERT(mt == uwptpg, + ("removed pt page %p, expected %p", mt, + uwptpg)); + pmap_resident_count_dec(pmap, 1); + uwptpg->ref_count = 1; + vm_page_unwire_noq(uwptpg); + vm_page_free(uwptpg); + } + CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" + " in pmap %p", va, pmap); + return (KERN_RESOURCE_SHORTAGE); + } + if ((newpde & PG_RW) != 0) { + for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) + vm_page_aflag_set(mt, PGA_WRITEABLE); + } + } + + /* + * Increment counters. + */ + if ((newpde & PG_W) != 0) + pmap->pm_stats.wired_count += L3_PAGE_SIZE / PAGE_SIZE; + pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE); + + /* + * Map the superpage. (This is not a promoted mapping; there will not + * be any lingering 4KB page mappings in the TLB.) 
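+ *
+ * The ptesync following the store makes the new PDE visible to the
+ * hardware table walker before the mapping is relied upon.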
+ */ + pte_store(l3e, newpde); + ptesync(); + + counter_u64_add(pmap_l3e_mappings, 1); + CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" + " in pmap %p", va, pmap); + return (KERN_SUCCESS); +} + +void +mmu_radix_enter_object(pmap_t pmap, vm_offset_t start, + vm_offset_t end, vm_page_t m_start, vm_prot_t prot) +{ + struct pctrie_iter pages; + struct rwlock *lock; + vm_offset_t va; + vm_page_t m, mpte; + bool invalidate; + + VM_OBJECT_ASSERT_LOCKED(m_start->object); + + CTR6(KTR_PMAP, "%s(%p, %#x, %#x, %p, %#x)", __func__, pmap, start, + end, m_start, prot); + invalidate = false; + mpte = NULL; + vm_page_iter_limit_init(&pages, m_start->object, + m_start->pindex + atop(end - start)); + m = vm_radix_iter_lookup(&pages, m_start->pindex); + lock = NULL; + PMAP_LOCK(pmap); + while (m != NULL) { + va = start + ptoa(m->pindex - m_start->pindex); + if ((va & L3_PAGE_MASK) == 0 && va + L3_PAGE_SIZE <= end && + m->psind == 1 && mmu_radix_ps_enabled(pmap) && + pmap_enter_2mpage(pmap, va, m, prot, &lock)) { + m = vm_radix_iter_jump(&pages, L3_PAGE_SIZE / PAGE_SIZE); + } else { + mpte = mmu_radix_enter_quick_locked(pmap, va, m, prot, + mpte, &lock, &invalidate); + m = vm_radix_iter_step(&pages); + } + } + ptesync(); + if (lock != NULL) + rw_wunlock(lock); + if (invalidate) + pmap_invalidate_all(pmap); + PMAP_UNLOCK(pmap); +} + +static vm_page_t +mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate) +{ + struct spglist free; + pt_entry_t *pte; + vm_paddr_t pa; + + KASSERT(!VA_IS_CLEANMAP(va) || + (m->oflags & VPO_UNMANAGED) != 0, + ("mmu_radix_enter_quick_locked: managed mapping within the clean submap")); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + /* + * In the case that a page table page is not + * resident, we are creating it here. + */ + if (va < VM_MAXUSER_ADDRESS) { + vm_pindex_t ptepindex; + pml3_entry_t *ptepa; + + /* + * Calculate pagetable page index + */ + ptepindex = pmap_l3e_pindex(va); + if (mpte && (mpte->pindex == ptepindex)) { + mpte->ref_count++; + } else { + /* + * Get the page directory entry + */ + ptepa = pmap_pml3e(pmap, va); + + /* + * If the page table page is mapped, we just increment + * the hold count, and activate it. Otherwise, we + * attempt to allocate a page table page. If this + * attempt fails, we don't retry. Instead, we give up. + */ + if (ptepa && (be64toh(*ptepa) & PG_V) != 0) { + if (be64toh(*ptepa) & RPTE_LEAF) + return (NULL); + mpte = PHYS_TO_VM_PAGE(be64toh(*ptepa) & PG_FRAME); + mpte->ref_count++; + } else { + /* + * Pass NULL instead of the PV list lock + * pointer, because we don't intend to sleep. + */ + mpte = _pmap_allocpte(pmap, ptepindex, NULL); + if (mpte == NULL) + return (mpte); + } + } + pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); + pte = &pte[pmap_pte_index(va)]; + } else { + mpte = NULL; + pte = pmap_pte(pmap, va); + } + if (be64toh(*pte)) { + if (mpte != NULL) { + mpte->ref_count--; + mpte = NULL; + } + return (mpte); + } + + /* + * Enter on the PV list if part of our managed memory. + */ + if ((m->oflags & VPO_UNMANAGED) == 0 && + !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { + if (mpte != NULL) { + SLIST_INIT(&free); + if (pmap_unwire_ptp(pmap, va, mpte, &free)) { + /* + * Although "va" is not mapped, paging- + * structure caches could nonetheless have + * entries that refer to the freed page table + * pages. Invalidate those entries. 
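+ *
+ * The flush is deferred: setting *invalidate here causes the caller to
+ * issue a single pmap_invalidate_all() once it has finished.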
+ */ + *invalidate = true; + vm_page_free_pages_toq(&free, true); + } + mpte = NULL; + } + return (mpte); + } + + /* + * Increment counters + */ + pmap_resident_count_inc(pmap, 1); + + pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs); + if (prot & VM_PROT_EXECUTE) + pa |= PG_X; + else + pa |= RPTE_EAA_R; + if ((m->oflags & VPO_UNMANAGED) == 0) + pa |= PG_MANAGED; + + pte_store(pte, pa); + return (mpte); +} + +void +mmu_radix_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, + vm_prot_t prot) +{ + struct rwlock *lock; + bool invalidate; + + lock = NULL; + invalidate = false; + PMAP_LOCK(pmap); + mmu_radix_enter_quick_locked(pmap, va, m, prot, NULL, &lock, + &invalidate); + ptesync(); + if (lock != NULL) + rw_wunlock(lock); + if (invalidate) + pmap_invalidate_all(pmap); + PMAP_UNLOCK(pmap); +} + +vm_paddr_t +mmu_radix_extract(pmap_t pmap, vm_offset_t va) +{ + pml3_entry_t *l3e; + pt_entry_t *pte; + vm_paddr_t pa; + + l3e = pmap_pml3e(pmap, va); + if (__predict_false(l3e == NULL)) + return (0); + if (be64toh(*l3e) & RPTE_LEAF) { + pa = (be64toh(*l3e) & PG_PS_FRAME) | (va & L3_PAGE_MASK); + pa |= (va & L3_PAGE_MASK); + } else { + /* + * Beware of a concurrent promotion that changes the + * PDE at this point! For example, vtopte() must not + * be used to access the PTE because it would use the + * new PDE. It is, however, safe to use the old PDE + * because the page table page is preserved by the + * promotion. + */ + pte = pmap_l3e_to_pte(l3e, va); + if (__predict_false(pte == NULL)) + return (0); + pa = be64toh(*pte); + pa = (pa & PG_FRAME) | (va & PAGE_MASK); + pa |= (va & PAGE_MASK); + } + return (pa); +} + +vm_page_t +mmu_radix_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) +{ + pml3_entry_t l3e, *l3ep; + pt_entry_t pte; + vm_page_t m; + + m = NULL; + CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, va, prot); + PMAP_LOCK(pmap); + l3ep = pmap_pml3e(pmap, va); + if (l3ep != NULL && (l3e = be64toh(*l3ep))) { + if (l3e & RPTE_LEAF) { + if ((l3e & PG_RW) || (prot & VM_PROT_WRITE) == 0) + m = PHYS_TO_VM_PAGE((l3e & PG_PS_FRAME) | + (va & L3_PAGE_MASK)); + } else { + /* Native endian PTE, do not pass to pmap functions */ + pte = be64toh(*pmap_l3e_to_pte(l3ep, va)); + if ((pte & PG_V) && + ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) + m = PHYS_TO_VM_PAGE(pte & PG_FRAME); + } + if (m != NULL && !vm_page_wire_mapped(m)) + m = NULL; + } + PMAP_UNLOCK(pmap); + return (m); +} + +static int +mmu_radix_growkernel(vm_offset_t addr) +{ + vm_paddr_t paddr; + vm_page_t nkpg; + pml3_entry_t *l3e; + pml2_entry_t *l2e; + + CTR2(KTR_PMAP, "%s(%#x)", __func__, addr); + if (VM_MIN_KERNEL_ADDRESS < addr && + addr < (VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE)) + return (KERN_SUCCESS); + + addr = roundup2(addr, L3_PAGE_SIZE); + if (addr - 1 >= vm_map_max(kernel_map)) + addr = vm_map_max(kernel_map); + while (kernel_vm_end < addr) { + l2e = pmap_pml2e(kernel_pmap, kernel_vm_end); + if ((be64toh(*l2e) & PG_V) == 0) { + /* We need a new PDP entry */ + nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | + VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO); + if (nkpg == NULL) + return (KERN_RESOURCE_SHORTAGE); + nkpg->pindex = kernel_vm_end >> L2_PAGE_SIZE_SHIFT; + paddr = VM_PAGE_TO_PHYS(nkpg); + pde_store(l2e, paddr); + continue; /* try again */ + } + l3e = pmap_l2e_to_l3e(l2e, kernel_vm_end); + if ((be64toh(*l3e) & PG_V) != 0) { + kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK; + if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { + kernel_vm_end = 
vm_map_max(kernel_map); + break; + } + continue; + } + + nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | + VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO); + if (nkpg == NULL) + return (KERN_RESOURCE_SHORTAGE); + nkpg->pindex = pmap_l3e_pindex(kernel_vm_end); + paddr = VM_PAGE_TO_PHYS(nkpg); + pde_store(l3e, paddr); + + kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK; + if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { + kernel_vm_end = vm_map_max(kernel_map); + break; + } + } + ptesync(); + return (KERN_SUCCESS); +} + +static MALLOC_DEFINE(M_RADIX_PGD, "radix_pgd", "radix page table root directory"); +static uma_zone_t zone_radix_pgd; + +static int +radix_pgd_import(void *arg __unused, void **store, int count, int domain __unused, + int flags) +{ + int req; + + req = VM_ALLOC_WIRED | malloc2vm_flags(flags); + for (int i = 0; i < count; i++) { + vm_page_t m = vm_page_alloc_noobj_contig(req, + RADIX_PGD_SIZE / PAGE_SIZE, + 0, (vm_paddr_t)-1, RADIX_PGD_SIZE, L1_PAGE_SIZE, + VM_MEMATTR_DEFAULT); + store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); + } + return (count); +} + +static void +radix_pgd_release(void *arg __unused, void **store, int count) +{ + vm_page_t m; + struct spglist free; + int page_count; + + SLIST_INIT(&free); + page_count = RADIX_PGD_SIZE/PAGE_SIZE; + + for (int i = 0; i < count; i++) { + /* + * XXX selectively remove dmap and KVA entries so we don't + * need to bzero + */ + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i])); + for (int j = page_count-1; j >= 0; j--) { + vm_page_unwire_noq(&m[j]); + SLIST_INSERT_HEAD(&free, &m[j], plinks.s.ss); + } + vm_page_free_pages_toq(&free, false); + } +} + +static void +mmu_radix_init(void) +{ + vm_page_t mpte; + vm_size_t s; + int error, i, pv_npg; + + /* XXX is this really needed for POWER? */ + /* L1TF, reserve page @0 unconditionally */ + vm_page_blacklist_add(0, bootverbose); + + zone_radix_pgd = uma_zcache_create("radix_pgd_cache", + RADIX_PGD_SIZE, NULL, NULL, +#ifdef INVARIANTS + trash_init, trash_fini, +#else + NULL, NULL, +#endif + radix_pgd_import, radix_pgd_release, + NULL, UMA_ZONE_NOBUCKET); + + /* + * Initialize the vm page array entries for the kernel pmap's + * page table pages. + */ + PMAP_LOCK(kernel_pmap); + for (i = 0; i < nkpt; i++) { + mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); + KASSERT(mpte >= vm_page_array && + mpte < &vm_page_array[vm_page_array_size], + ("pmap_init: page table page is out of range size: %lu", + vm_page_array_size)); + mpte->pindex = pmap_l3e_pindex(VM_MIN_KERNEL_ADDRESS) + i; + mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); + MPASS(PHYS_TO_VM_PAGE(mpte->phys_addr) == mpte); + //pmap_insert_pt_page(kernel_pmap, mpte); + mpte->ref_count = 1; + } + PMAP_UNLOCK(kernel_pmap); + vm_wire_add(nkpt); + + CTR1(KTR_PMAP, "%s()", __func__); + TAILQ_INIT(&pv_dummy.pv_list); + + /* + * Are large page mappings enabled? + */ + TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled); + if (superpages_enabled) { + KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, + ("pmap_init: can't assign to pagesizes[1]")); + pagesizes[1] = L3_PAGE_SIZE; + } + + /* + * Initialize the pv chunk list mutex. + */ + mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); + + /* + * Initialize the pool of pv list locks. + */ + for (i = 0; i < NPV_LIST_LOCKS; i++) + rw_init(&pv_list_locks[i], "pmap pv list"); + + /* + * Calculate the size of the pv head table for superpages. 
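+ * One pv head entry is needed for every L3_PAGE_SIZE (2MB) chunk of the
+ * physical address range, up to the end of the highest physical segment.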
+ */ + pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L3_PAGE_SIZE); + + /* + * Allocate memory for the pv head table for superpages. + */ + s = (vm_size_t)(pv_npg * sizeof(struct md_page)); + s = round_page(s); + pv_table = kmem_malloc(s, M_WAITOK | M_ZERO); + for (i = 0; i < pv_npg; i++) + TAILQ_INIT(&pv_table[i].pv_list); + TAILQ_INIT(&pv_dummy.pv_list); + + pmap_initialized = 1; + mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); + error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, + (vmem_addr_t *)&qframe); + + if (error != 0) + panic("qframe allocation failed"); + asid_arena = vmem_create("ASID", isa3_base_pid + 1, (1<<isa3_pid_bits), + 1, 1, M_WAITOK); +} + +static bool +pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified) +{ + struct rwlock *lock; + pv_entry_t pv; + struct md_page *pvh; + pt_entry_t *pte, mask; + pmap_t pmap; + int md_gen, pvh_gen; + bool rv; + + rv = false; + lock = VM_PAGE_TO_PV_LIST_LOCK(m); + rw_rlock(lock); +restart: + TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + md_gen = m->md.pv_gen; + rw_runlock(lock); + PMAP_LOCK(pmap); + rw_rlock(lock); + if (md_gen != m->md.pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + pte = pmap_pte(pmap, pv->pv_va); + mask = 0; + if (modified) + mask |= PG_RW | PG_M; + if (accessed) + mask |= PG_V | PG_A; + rv = (be64toh(*pte) & mask) == mask; + PMAP_UNLOCK(pmap); + if (rv) + goto out; + } + if ((m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + md_gen = m->md.pv_gen; + pvh_gen = pvh->pv_gen; + rw_runlock(lock); + PMAP_LOCK(pmap); + rw_rlock(lock); + if (md_gen != m->md.pv_gen || + pvh_gen != pvh->pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + pte = pmap_pml3e(pmap, pv->pv_va); + mask = 0; + if (modified) + mask |= PG_RW | PG_M; + if (accessed) + mask |= PG_V | PG_A; + rv = (be64toh(*pte) & mask) == mask; + PMAP_UNLOCK(pmap); + if (rv) + goto out; + } + } +out: + rw_runlock(lock); + return (rv); +} + +/* + * pmap_is_modified: + * + * Return whether or not the specified physical page was modified + * in any physical maps. + */ +bool +mmu_radix_is_modified(vm_page_t m) +{ + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_is_modified: page %p is not managed", m)); + + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + /* + * If the page is not busied then this check is racy. + */ + if (!pmap_page_is_write_mapped(m)) + return (false); + return (pmap_page_test_mappings(m, false, true)); +} + +bool +mmu_radix_is_prefaultable(pmap_t pmap, vm_offset_t addr) +{ + pml3_entry_t *l3e; + pt_entry_t *pte; + bool rv; + + CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr); + rv = false; + PMAP_LOCK(pmap); + l3e = pmap_pml3e(pmap, addr); + if (l3e != NULL && (be64toh(*l3e) & (RPTE_LEAF | PG_V)) == PG_V) { + pte = pmap_l3e_to_pte(l3e, addr); + rv = (be64toh(*pte) & PG_V) == 0; + } + PMAP_UNLOCK(pmap); + return (rv); +} + +bool +mmu_radix_is_referenced(vm_page_t m) +{ + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_is_referenced: page %p is not managed", m)); + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + return (pmap_page_test_mappings(m, true, false)); +} + +/* + * pmap_ts_referenced: + * + * Return a count of reference bits for a page, clearing those bits. + * It is not necessary for every reference bit to be cleared, but it + * is necessary that 0 only be returned when there are truly no + * reference bits set. 
+ * + * As an optimization, update the page's dirty field if a modified bit is + * found while counting reference bits. This opportunistic update can be + * performed at low cost and can eliminate the need for some future calls + * to pmap_is_modified(). However, since this function stops after + * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some + * dirty pages. Those dirty pages will only be detected by a future call + * to pmap_is_modified(). + * + * A DI block is not needed within this function, because + * invalidations are performed before the PV list lock is + * released. + */ +int +mmu_radix_ts_referenced(vm_page_t m) +{ + struct md_page *pvh; + pv_entry_t pv, pvf; + pmap_t pmap; + struct rwlock *lock; + pml3_entry_t oldl3e, *l3e; + pt_entry_t *pte; + vm_paddr_t pa; + int cleared, md_gen, not_cleared, pvh_gen; + struct spglist free; + + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_ts_referenced: page %p is not managed", m)); + SLIST_INIT(&free); + cleared = 0; + pa = VM_PAGE_TO_PHYS(m); + lock = PHYS_TO_PV_LIST_LOCK(pa); + pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); + rw_wlock(lock); +retry: + not_cleared = 0; + if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) + goto small_mappings; + pv = pvf; + do { + if (pvf == NULL) + pvf = pv; + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen) { + PMAP_UNLOCK(pmap); + goto retry; + } + } + l3e = pmap_pml3e(pmap, pv->pv_va); + oldl3e = be64toh(*l3e); + if ((oldl3e & (PG_M | PG_RW)) == (PG_M | PG_RW)) { + /* + * Although "oldpde" is mapping a 2MB page, because + * this function is called at a 4KB page granularity, + * we only update the 4KB page under test. + */ + vm_page_dirty(m); + } + if ((oldl3e & PG_A) != 0) { + /* + * Since this reference bit is shared by 512 4KB + * pages, it should not be cleared every time it is + * tested. Apply a simple "hash" function on the + * physical page number, the virtual superpage number, + * and the pmap address to select one 4KB page out of + * the 512 on which testing the reference bit will + * result in clearing that reference bit. This + * function is designed to avoid the selection of the + * same 4KB page for every 2MB page mapping. + * + * On demotion, a mapping that hasn't been referenced + * is simply destroyed. To avoid the possibility of a + * subsequent page fault on a demoted wired mapping, + * always leave its reference bit set. Moreover, + * since the superpage is wired, the current state of + * its reference bit won't affect page replacement. + */ + if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L3_PAGE_SIZE_SHIFT) ^ + (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && + (oldl3e & PG_W) == 0) { + atomic_clear_long(l3e, htobe64(PG_A)); + pmap_invalidate_page(pmap, pv->pv_va); + cleared++; + KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), + ("inconsistent pv lock %p %p for page %p", + lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); + } else + not_cleared++; + } + PMAP_UNLOCK(pmap); + /* Rotate the PV list if it has more than one entry. 
*/ + if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) { + TAILQ_REMOVE(&pvh->pv_list, pv, pv_link); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link); + pvh->pv_gen++; + } + if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) + goto out; + } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); +small_mappings: + if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) + goto out; + pv = pvf; + do { + if (pvf == NULL) + pvf = pv; + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + md_gen = m->md.pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { + PMAP_UNLOCK(pmap); + goto retry; + } + } + l3e = pmap_pml3e(pmap, pv->pv_va); + KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, + ("pmap_ts_referenced: found a 2mpage in page %p's pv list", + m)); + pte = pmap_l3e_to_pte(l3e, pv->pv_va); + if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(m); + if ((be64toh(*pte) & PG_A) != 0) { + atomic_clear_long(pte, htobe64(PG_A)); + pmap_invalidate_page(pmap, pv->pv_va); + cleared++; + } + PMAP_UNLOCK(pmap); + /* Rotate the PV list if it has more than one entry. */ + if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) { + TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link); + m->md.pv_gen++; + } + } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + + not_cleared < PMAP_TS_REFERENCED_MAX); +out: + rw_wunlock(lock); + vm_page_free_pages_toq(&free, true); + return (cleared + not_cleared); +} + +static vm_offset_t +mmu_radix_map(vm_offset_t *virt __unused, vm_paddr_t start, + vm_paddr_t end, int prot __unused) +{ + + CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, virt, start, end, + prot); + return (PHYS_TO_DMAP(start)); +} + +void +mmu_radix_object_init_pt(pmap_t pmap, vm_offset_t addr, + vm_object_t object, vm_pindex_t pindex, vm_size_t size) +{ + struct pctrie_iter pages; + pml3_entry_t *l3e; + vm_paddr_t pa, ptepa; + vm_page_t p, pdpg; + vm_memattr_t ma; + + CTR6(KTR_PMAP, "%s(%p, %#x, %p, %u, %#x)", __func__, pmap, addr, + object, pindex, size); + VM_OBJECT_ASSERT_WLOCKED(object); + KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, + ("pmap_object_init_pt: non-device object")); + /* NB: size can be logically ored with addr here */ + if ((addr & L3_PAGE_MASK) == 0 && (size & L3_PAGE_MASK) == 0) { + if (!mmu_radix_ps_enabled(pmap)) + return; + if (!vm_object_populate(object, pindex, pindex + atop(size))) + return; + vm_page_iter_init(&pages, object); + p = vm_radix_iter_lookup(&pages, pindex); + + KASSERT(p->valid == VM_PAGE_BITS_ALL, + ("pmap_object_init_pt: invalid page %p", p)); + ma = p->md.mdpg_cache_attrs; + + /* + * Abort the mapping if the first page is not physically + * aligned to a 2MB page boundary. + */ + ptepa = VM_PAGE_TO_PHYS(p); + if (ptepa & L3_PAGE_MASK) + return; + + /* + * Skip the first page. Abort the mapping if the rest of + * the pages are not physically contiguous or have differing + * memory attributes. + */ + for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; + pa += PAGE_SIZE) { + p = vm_radix_iter_next(&pages); + KASSERT(p->valid == VM_PAGE_BITS_ALL, + ("pmap_object_init_pt: invalid page %p", p)); + if (pa != VM_PAGE_TO_PHYS(p) || + ma != p->md.mdpg_cache_attrs) + return; + } + + PMAP_LOCK(pmap); + for (pa = ptepa | pmap_cache_bits(ma); + pa < ptepa + size; pa += L3_PAGE_SIZE) { + pdpg = pmap_allocl3e(pmap, addr, NULL); + if (pdpg == NULL) { + /* + * The creation of mappings below is only an + * optimization. 
If a page directory page + * cannot be allocated without blocking, + * continue on to the next mapping rather than + * blocking. + */ + addr += L3_PAGE_SIZE; + continue; + } + l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); + l3e = &l3e[pmap_pml3e_index(addr)]; + if ((be64toh(*l3e) & PG_V) == 0) { + pa |= PG_M | PG_A | PG_RW; + pte_store(l3e, pa); + pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE); + counter_u64_add(pmap_l3e_mappings, 1); + } else { + /* Continue on if the PDE is already valid. */ + pdpg->ref_count--; + KASSERT(pdpg->ref_count > 0, + ("pmap_object_init_pt: missing reference " + "to page directory page, va: 0x%lx", addr)); + } + addr += L3_PAGE_SIZE; + } + ptesync(); + PMAP_UNLOCK(pmap); + } +} + +bool +mmu_radix_page_exists_quick(pmap_t pmap, vm_page_t m) +{ + struct md_page *pvh; + struct rwlock *lock; + pv_entry_t pv; + int loops = 0; + bool rv; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_page_exists_quick: page %p is not managed", m)); + CTR3(KTR_PMAP, "%s(%p, %p)", __func__, pmap, m); + rv = false; + lock = VM_PAGE_TO_PV_LIST_LOCK(m); + rw_rlock(lock); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { + if (PV_PMAP(pv) == pmap) { + rv = true; + break; + } + loops++; + if (loops >= 16) + break; + } + if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { + if (PV_PMAP(pv) == pmap) { + rv = true; + break; + } + loops++; + if (loops >= 16) + break; + } + } + rw_runlock(lock); + return (rv); +} + +void +mmu_radix_page_init(vm_page_t m) +{ + + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + TAILQ_INIT(&m->md.pv_list); + m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT; +} + +int +mmu_radix_page_wired_mappings(vm_page_t m) +{ + struct rwlock *lock; + struct md_page *pvh; + pmap_t pmap; + pt_entry_t *pte; + pv_entry_t pv; + int count, md_gen, pvh_gen; + + if ((m->oflags & VPO_UNMANAGED) != 0) + return (0); + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + lock = VM_PAGE_TO_PV_LIST_LOCK(m); + rw_rlock(lock); +restart: + count = 0; + TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + md_gen = m->md.pv_gen; + rw_runlock(lock); + PMAP_LOCK(pmap); + rw_rlock(lock); + if (md_gen != m->md.pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + pte = pmap_pte(pmap, pv->pv_va); + if ((be64toh(*pte) & PG_W) != 0) + count++; + PMAP_UNLOCK(pmap); + } + if ((m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + md_gen = m->md.pv_gen; + pvh_gen = pvh->pv_gen; + rw_runlock(lock); + PMAP_LOCK(pmap); + rw_rlock(lock); + if (md_gen != m->md.pv_gen || + pvh_gen != pvh->pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + pte = pmap_pml3e(pmap, pv->pv_va); + if ((be64toh(*pte) & PG_W) != 0) + count++; + PMAP_UNLOCK(pmap); + } + } + rw_runlock(lock); + return (count); +} + +static void +mmu_radix_update_proctab(int pid, pml1_entry_t l1pa) +{ + isa3_proctab[pid].proctab0 = htobe64(RTS_SIZE | l1pa | RADIX_PGD_INDEX_SHIFT); +} + +int +mmu_radix_pinit(pmap_t pmap) +{ + vmem_addr_t pid; + vm_paddr_t l1pa; + + CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); + + /* + * allocate the page directory page + */ + pmap->pm_pml1 = uma_zalloc(zone_radix_pgd, M_WAITOK); + + for (int j = 0; j < RADIX_PGD_SIZE_SHIFT; j++) + pagezero((vm_offset_t)pmap->pm_pml1 + j * PAGE_SIZE); + vm_radix_init(&pmap->pm_radix); + TAILQ_INIT(&pmap->pm_pvchunk); + 
bzero(&pmap->pm_stats, sizeof pmap->pm_stats); + pmap->pm_flags = PMAP_PDE_SUPERPAGE; + vmem_alloc(asid_arena, 1, M_FIRSTFIT|M_WAITOK, &pid); + + pmap->pm_pid = pid; + l1pa = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml1); + mmu_radix_update_proctab(pid, l1pa); + __asm __volatile("ptesync;isync" : : : "memory"); + + return (1); +} + +/* + * This routine is called if the desired page table page does not exist. + * + * If page table page allocation fails, this routine may sleep before + * returning NULL. It sleeps only if a lock pointer was given. + * + * Note: If a page allocation fails at page table level two or three, + * one or two pages may be held during the wait, only to be released + * afterwards. This conservative approach is easily argued to avoid + * race conditions. + */ +static vm_page_t +_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) +{ + vm_page_t m, pdppg, pdpg; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + /* + * Allocate a page table page. + */ + if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { + if (lockp != NULL) { + RELEASE_PV_LIST_LOCK(lockp); + PMAP_UNLOCK(pmap); + vm_wait(NULL); + PMAP_LOCK(pmap); + } + /* + * Indicate the need to retry. While waiting, the page table + * page may have been allocated. + */ + return (NULL); + } + m->pindex = ptepindex; + + /* + * Map the pagetable page into the process address space, if + * it isn't already there. + */ + + if (ptepindex >= (NUPDE + NUPDPE)) { + pml1_entry_t *l1e; + vm_pindex_t pml1index; + + /* Wire up a new PDPE page */ + pml1index = ptepindex - (NUPDE + NUPDPE); + l1e = &pmap->pm_pml1[pml1index]; + KASSERT((be64toh(*l1e) & PG_V) == 0, + ("%s: L1 entry %#lx is valid", __func__, *l1e)); + pde_store(l1e, VM_PAGE_TO_PHYS(m)); + } else if (ptepindex >= NUPDE) { + vm_pindex_t pml1index; + vm_pindex_t pdpindex; + pml1_entry_t *l1e; + pml2_entry_t *l2e; + + /* Wire up a new l2e page */ + pdpindex = ptepindex - NUPDE; + pml1index = pdpindex >> RPTE_SHIFT; + + l1e = &pmap->pm_pml1[pml1index]; + if ((be64toh(*l1e) & PG_V) == 0) { + /* Have to allocate a new pdp, recurse */ + if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml1index, + lockp) == NULL) { + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); + } + } else { + /* Add reference to l2e page */ + pdppg = PHYS_TO_VM_PAGE(be64toh(*l1e) & PG_FRAME); + pdppg->ref_count++; + } + l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME); + + /* Now find the pdp page */ + l2e = &l2e[pdpindex & RPTE_MASK]; + KASSERT((be64toh(*l2e) & PG_V) == 0, + ("%s: L2 entry %#lx is valid", __func__, *l2e)); + pde_store(l2e, VM_PAGE_TO_PHYS(m)); + } else { + vm_pindex_t pml1index; + vm_pindex_t pdpindex; + pml1_entry_t *l1e; + pml2_entry_t *l2e; + pml3_entry_t *l3e; + + /* Wire up a new PTE page */ + pdpindex = ptepindex >> RPTE_SHIFT; + pml1index = pdpindex >> RPTE_SHIFT; + + /* First, find the pdp and check that its valid. 
*/ + l1e = &pmap->pm_pml1[pml1index]; + if ((be64toh(*l1e) & PG_V) == 0) { + /* Have to allocate a new pd, recurse */ + if (_pmap_allocpte(pmap, NUPDE + pdpindex, + lockp) == NULL) { + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); + } + l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME); + l2e = &l2e[pdpindex & RPTE_MASK]; + } else { + l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME); + l2e = &l2e[pdpindex & RPTE_MASK]; + if ((be64toh(*l2e) & PG_V) == 0) { + /* Have to allocate a new pd, recurse */ + if (_pmap_allocpte(pmap, NUPDE + pdpindex, + lockp) == NULL) { + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); + } + } else { + /* Add reference to the pd page */ + pdpg = PHYS_TO_VM_PAGE(be64toh(*l2e) & PG_FRAME); + pdpg->ref_count++; + } + } + l3e = (pml3_entry_t *)PHYS_TO_DMAP(be64toh(*l2e) & PG_FRAME); + + /* Now we know where the page directory page is */ + l3e = &l3e[ptepindex & RPTE_MASK]; + KASSERT((be64toh(*l3e) & PG_V) == 0, + ("%s: L3 entry %#lx is valid", __func__, *l3e)); + pde_store(l3e, VM_PAGE_TO_PHYS(m)); + } + + pmap_resident_count_inc(pmap, 1); + return (m); +} +static vm_page_t +pmap_allocl3e(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) +{ + vm_pindex_t pdpindex, ptepindex; + pml2_entry_t *pdpe; + vm_page_t pdpg; + +retry: + pdpe = pmap_pml2e(pmap, va); + if (pdpe != NULL && (be64toh(*pdpe) & PG_V) != 0) { + /* Add a reference to the pd page. */ + pdpg = PHYS_TO_VM_PAGE(be64toh(*pdpe) & PG_FRAME); + pdpg->ref_count++; + } else { + /* Allocate a pd page. */ + ptepindex = pmap_l3e_pindex(va); + pdpindex = ptepindex >> RPTE_SHIFT; + pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); + if (pdpg == NULL && lockp != NULL) + goto retry; + } + return (pdpg); +} + +static vm_page_t +pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) +{ + vm_pindex_t ptepindex; + pml3_entry_t *pd; + vm_page_t m; + + /* + * Calculate pagetable page index + */ + ptepindex = pmap_l3e_pindex(va); +retry: + /* + * Get the page directory entry + */ + pd = pmap_pml3e(pmap, va); + + /* + * This supports switching from a 2MB page to a + * normal 4K page. + */ + if (pd != NULL && (be64toh(*pd) & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V)) { + if (!pmap_demote_l3e_locked(pmap, pd, va, lockp)) { + /* + * Invalidation of the 2MB page mapping may have caused + * the deallocation of the underlying PD page. + */ + pd = NULL; + } + } + + /* + * If the page table page is mapped, we just increment the + * hold count, and activate it. + */ + if (pd != NULL && (be64toh(*pd) & PG_V) != 0) { + m = PHYS_TO_VM_PAGE(be64toh(*pd) & PG_FRAME); + m->ref_count++; + } else { + /* + * Here if the pte page isn't mapped, or if it has been + * deallocated. 
+ */ + m = _pmap_allocpte(pmap, ptepindex, lockp); + if (m == NULL && lockp != NULL) + goto retry; + } + return (m); +} + +static void +mmu_radix_pinit0(pmap_t pmap) +{ + + CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); + PMAP_LOCK_INIT(pmap); + pmap->pm_pml1 = kernel_pmap->pm_pml1; + pmap->pm_pid = kernel_pmap->pm_pid; + + vm_radix_init(&pmap->pm_radix); + TAILQ_INIT(&pmap->pm_pvchunk); + bzero(&pmap->pm_stats, sizeof pmap->pm_stats); + kernel_pmap->pm_flags = + pmap->pm_flags = PMAP_PDE_SUPERPAGE; +} +/* + * pmap_protect_l3e: do the things to protect a 2mpage in a process + */ +static bool +pmap_protect_l3e(pmap_t pmap, pt_entry_t *l3e, vm_offset_t sva, vm_prot_t prot) +{ + pt_entry_t newpde, oldpde; + vm_offset_t eva, va; + vm_page_t m; + bool anychanged; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT((sva & L3_PAGE_MASK) == 0, + ("pmap_protect_l3e: sva is not 2mpage aligned")); + anychanged = false; +retry: + oldpde = newpde = be64toh(*l3e); + if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == + (PG_MANAGED | PG_M | PG_RW)) { + eva = sva + L3_PAGE_SIZE; + for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); + va < eva; va += PAGE_SIZE, m++) + vm_page_dirty(m); + } + if ((prot & VM_PROT_WRITE) == 0) { + newpde &= ~(PG_RW | PG_M); + newpde |= RPTE_EAA_R; + } + if (prot & VM_PROT_EXECUTE) + newpde |= PG_X; + if (newpde != oldpde) { + /* + * As an optimization to future operations on this PDE, clear + * PG_PROMOTED. The impending invalidation will remove any + * lingering 4KB page mappings from the TLB. + */ + if (!atomic_cmpset_long(l3e, htobe64(oldpde), htobe64(newpde & ~PG_PROMOTED))) + goto retry; + anychanged = true; + } + return (anychanged); +} + +void +mmu_radix_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, + vm_prot_t prot) +{ + vm_offset_t va_next; + pml1_entry_t *l1e; + pml2_entry_t *l2e; + pml3_entry_t ptpaddr, *l3e; + pt_entry_t *pte; + bool anychanged; + + CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, pmap, sva, eva, + prot); + + KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); + if (prot == VM_PROT_NONE) { + mmu_radix_remove(pmap, sva, eva); + return; + } + + if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == + (VM_PROT_WRITE|VM_PROT_EXECUTE)) + return; + +#ifdef INVARIANTS + if (VERBOSE_PROTECT || pmap_logging) + printf("pmap_protect(%p, %#lx, %#lx, %x) - asid: %lu\n", + pmap, sva, eva, prot, pmap->pm_pid); +#endif + anychanged = false; + + PMAP_LOCK(pmap); + for (; sva < eva; sva = va_next) { + l1e = pmap_pml1e(pmap, sva); + if ((be64toh(*l1e) & PG_V) == 0) { + va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; + if (va_next < sva) + va_next = eva; + continue; + } + + l2e = pmap_l1e_to_l2e(l1e, sva); + if ((be64toh(*l2e) & PG_V) == 0) { + va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; + if (va_next < sva) + va_next = eva; + continue; + } + + va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; + if (va_next < sva) + va_next = eva; + + l3e = pmap_l2e_to_l3e(l2e, sva); + ptpaddr = be64toh(*l3e); + + /* + * Weed out invalid mappings. + */ + if (ptpaddr == 0) + continue; + + /* + * Check for large page. + */ + if ((ptpaddr & RPTE_LEAF) != 0) { + /* + * Are we protecting the entire large page? If not, + * demote the mapping and fall through. + */ + if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { + if (pmap_protect_l3e(pmap, l3e, sva, prot)) + anychanged = true; + continue; + } else if (!pmap_demote_l3e(pmap, l3e, sva)) { + /* + * The large page mapping was destroyed. 
+ */ + continue; + } + } + + if (va_next > eva) + va_next = eva; + + for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++, + sva += PAGE_SIZE) { + pt_entry_t obits, pbits; + vm_page_t m; + +retry: + MPASS(pte == pmap_pte(pmap, sva)); + obits = pbits = be64toh(*pte); + if ((pbits & PG_V) == 0) + continue; + + if ((prot & VM_PROT_WRITE) == 0) { + if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == + (PG_MANAGED | PG_M | PG_RW)) { + m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); + vm_page_dirty(m); + } + pbits &= ~(PG_RW | PG_M); + pbits |= RPTE_EAA_R; + } + if (prot & VM_PROT_EXECUTE) + pbits |= PG_X; + + if (pbits != obits) { + if (!atomic_cmpset_long(pte, htobe64(obits), htobe64(pbits))) + goto retry; + if (obits & (PG_A|PG_M)) { + anychanged = true; +#ifdef INVARIANTS + if (VERBOSE_PROTECT || pmap_logging) + printf("%#lx %#lx -> %#lx\n", + sva, obits, pbits); +#endif + } + } + } + } + if (anychanged) + pmap_invalidate_all(pmap); + PMAP_UNLOCK(pmap); +} + +void +mmu_radix_qenter(vm_offset_t sva, vm_page_t *ma, int count) +{ + + CTR4(KTR_PMAP, "%s(%#x, %p, %d)", __func__, sva, ma, count); + pt_entry_t oldpte, pa, *pte; + vm_page_t m; + uint64_t cache_bits, attr_bits; + vm_offset_t va; + + oldpte = 0; + attr_bits = RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A; + va = sva; + pte = kvtopte(va); + while (va < sva + PAGE_SIZE * count) { + if (__predict_false((va & L3_PAGE_MASK) == 0)) + pte = kvtopte(va); + MPASS(pte == pmap_pte(kernel_pmap, va)); + + /* + * XXX there has to be a more efficient way than traversing + * the page table every time - but go for correctness for + * today + */ + + m = *ma++; + cache_bits = pmap_cache_bits(m->md.mdpg_cache_attrs); + pa = VM_PAGE_TO_PHYS(m) | cache_bits | attr_bits; + if (be64toh(*pte) != pa) { + oldpte |= be64toh(*pte); + pte_store(pte, pa); + } + va += PAGE_SIZE; + pte++; + } + if (__predict_false((oldpte & RPTE_VALID) != 0)) + pmap_invalidate_range(kernel_pmap, sva, sva + count * + PAGE_SIZE); + else + ptesync(); +} + +void +mmu_radix_qremove(vm_offset_t sva, int count) +{ + vm_offset_t va; + pt_entry_t *pte; + + CTR3(KTR_PMAP, "%s(%#x, %d)", __func__, sva, count); + KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode or dmap va %lx", sva)); + + va = sva; + pte = kvtopte(va); + while (va < sva + PAGE_SIZE * count) { + if (__predict_false((va & L3_PAGE_MASK) == 0)) + pte = kvtopte(va); + pte_clear(pte); + pte++; + va += PAGE_SIZE; + } + pmap_invalidate_range(kernel_pmap, sva, va); +} + +/*************************************************** + * Page table page management routines..... + ***************************************************/ +/* + * Schedule the specified unused page table page to be freed. Specifically, + * add the page to the specified list of pages that will be released to the + * physical memory manager after the TLB has been updated. + */ +static __inline void +pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO) +{ + + if (set_PG_ZERO) + m->flags |= PG_ZERO; + else + m->flags &= ~PG_ZERO; + SLIST_INSERT_HEAD(free, m, plinks.s.ss); +} + +/* + * Inserts the specified page table page into the specified pmap's collection + * of idle page table pages. Each of a pmap's page table pages is responsible + * for mapping a distinct range of virtual addresses. The pmap's collection is + * ordered by this virtual address range. 
+ */ +static __inline int +pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + return (vm_radix_insert(&pmap->pm_radix, mpte)); +} + +/* + * Removes the page table page mapping the specified virtual address from the + * specified pmap's collection of idle page table pages, and returns it. + * Otherwise, returns NULL if there is no page table page corresponding to the + * specified virtual address. + */ +static __inline vm_page_t +pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + return (vm_radix_remove(&pmap->pm_radix, pmap_l3e_pindex(va))); +} + +/* + * Decrements a page table page's wire count, which is used to record the + * number of valid page table entries within the page. If the wire count + * drops to zero, then the page table page is unmapped. Returns true if the + * page table page was unmapped and false otherwise. + */ +static inline bool +pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) +{ + + --m->ref_count; + if (m->ref_count == 0) { + _pmap_unwire_ptp(pmap, va, m, free); + return (true); + } else + return (false); +} + +static void +_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + /* + * unmap the page table page + */ + if (m->pindex >= NUPDE + NUPDPE) { + /* PDP page */ + pml1_entry_t *pml1; + pml1 = pmap_pml1e(pmap, va); + *pml1 = 0; + } else if (m->pindex >= NUPDE) { + /* PD page */ + pml2_entry_t *l2e; + l2e = pmap_pml2e(pmap, va); + *l2e = 0; + } else { + /* PTE page */ + pml3_entry_t *l3e; + l3e = pmap_pml3e(pmap, va); + *l3e = 0; + } + pmap_resident_count_dec(pmap, 1); + if (m->pindex < NUPDE) { + /* We just released a PT, unhold the matching PD */ + vm_page_t pdpg; + + pdpg = PHYS_TO_VM_PAGE(be64toh(*pmap_pml2e(pmap, va)) & PG_FRAME); + pmap_unwire_ptp(pmap, va, pdpg, free); + } + else if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { + /* We just released a PD, unhold the matching PDP */ + vm_page_t pdppg; + + pdppg = PHYS_TO_VM_PAGE(be64toh(*pmap_pml1e(pmap, va)) & PG_FRAME); + pmap_unwire_ptp(pmap, va, pdppg, free); + } + + /* + * Put page on a list so that it is released after + * *ALL* TLB shootdown is done + */ + pmap_add_delayed_free_list(m, free, true); +} + +/* + * After removing a page table entry, this routine is used to + * conditionally free the page, and manage the hold/wire counts. + */ +static int +pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pml3_entry_t ptepde, + struct spglist *free) +{ + vm_page_t mpte; + + if (va >= VM_MAXUSER_ADDRESS) + return (0); + KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); + mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); + return (pmap_unwire_ptp(pmap, va, mpte, free)); +} + +void +mmu_radix_release(pmap_t pmap) +{ + + CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); + KASSERT(pmap->pm_stats.resident_count == 0, + ("pmap_release: pmap resident count %ld != 0", + pmap->pm_stats.resident_count)); + KASSERT(vm_radix_is_empty(&pmap->pm_radix), + ("pmap_release: pmap has reserved page table page(s)")); + + pmap_invalidate_all(pmap); + isa3_proctab[pmap->pm_pid].proctab0 = 0; + uma_zfree(zone_radix_pgd, pmap->pm_pml1); + vmem_free(asid_arena, pmap->pm_pid, 1); +} + +/* + * Create the PV entry for a 2MB page mapping. Always returns true unless the + * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns + * false if the PV entry cannot be allocated without resorting to reclamation. 
+ */ +static bool +pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t pde, u_int flags, + struct rwlock **lockp) +{ + struct md_page *pvh; + pv_entry_t pv; + vm_paddr_t pa; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + /* Pass NULL instead of the lock pointer to disable reclamation. */ + if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? + NULL : lockp)) == NULL) + return (false); + pv->pv_va = va; + pa = pde & PG_PS_FRAME; + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); + pvh = pa_to_pvh(pa); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link); + pvh->pv_gen++; + return (true); +} + +/* + * Fills a page table page with mappings to consecutive physical pages. + */ +static void +pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) +{ + pt_entry_t *pte; + + for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { + *pte = htobe64(newpte); + newpte += PAGE_SIZE; + } +} + +static bool +pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va) +{ + struct rwlock *lock; + bool rv; + + lock = NULL; + rv = pmap_demote_l3e_locked(pmap, pde, va, &lock); + if (lock != NULL) + rw_wunlock(lock); + return (rv); +} + +static bool +pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va, + struct rwlock **lockp) +{ + pml3_entry_t oldpde; + pt_entry_t *firstpte; + vm_paddr_t mptepa; + vm_page_t mpte; + struct spglist free; + vm_offset_t sva; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + oldpde = be64toh(*l3e); + KASSERT((oldpde & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V), + ("pmap_demote_l3e: oldpde is missing RPTE_LEAF and/or PG_V %lx", + oldpde)); + if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == + NULL) { + KASSERT((oldpde & PG_W) == 0, + ("pmap_demote_l3e: page table page for a wired mapping" + " is missing")); + + /* + * Invalidate the 2MB page mapping and return "failure" if the + * mapping was never accessed or the allocation of the new + * page table page fails. If the 2MB page mapping belongs to + * the direct map region of the kernel's address space, then + * the page allocation request specifies the highest possible + * priority (VM_ALLOC_INTERRUPT). Otherwise, the priority is + * normal. Page table pages are preallocated for every other + * part of the kernel address space, so the direct map region + * is the only part of the kernel address space that must be + * handled here. + */ + if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc_noobj( + (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS ? + VM_ALLOC_INTERRUPT : 0) | VM_ALLOC_WIRED)) == NULL) { + SLIST_INIT(&free); + sva = trunc_2mpage(va); + pmap_remove_l3e(pmap, l3e, sva, &free, lockp); + pmap_invalidate_l3e_page(pmap, sva, oldpde); + vm_page_free_pages_toq(&free, true); + CTR2(KTR_PMAP, "pmap_demote_l3e: failure for va %#lx" + " in pmap %p", va, pmap); + return (false); + } + mpte->pindex = pmap_l3e_pindex(va); + if (va < VM_MAXUSER_ADDRESS) + pmap_resident_count_inc(pmap, 1); + } + mptepa = VM_PAGE_TO_PHYS(mpte); + firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); + KASSERT((oldpde & PG_A) != 0, + ("pmap_demote_l3e: oldpde is missing PG_A")); + KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, + ("pmap_demote_l3e: oldpde is missing PG_M")); + + /* + * If the page table page is new, initialize it. 
+ */ + if (mpte->ref_count == 1) { + mpte->ref_count = NPTEPG; + pmap_fill_ptp(firstpte, oldpde); + } + + KASSERT((be64toh(*firstpte) & PG_FRAME) == (oldpde & PG_FRAME), + ("pmap_demote_l3e: firstpte and newpte map different physical" + " addresses")); + + /* + * If the mapping has changed attributes, update the page table + * entries. + */ + if ((be64toh(*firstpte) & PG_PTE_PROMOTE) != (oldpde & PG_PTE_PROMOTE)) + pmap_fill_ptp(firstpte, oldpde); + + /* + * The spare PV entries must be reserved prior to demoting the + * mapping, that is, prior to changing the PDE. Otherwise, the state + * of the PDE and the PV lists will be inconsistent, which can result + * in reclaim_pv_chunk() attempting to remove a PV entry from the + * wrong PV list and pmap_pv_demote_l3e() failing to find the expected + * PV entry for the 2MB page mapping that is being demoted. + */ + if ((oldpde & PG_MANAGED) != 0) + reserve_pv_entries(pmap, NPTEPG - 1, lockp); + + /* + * Demote the mapping. This pmap is locked. The old PDE has + * PG_A set. If the old PDE has PG_RW set, it also has PG_M + * set. Thus, there is no danger of a race with another + * processor changing the setting of PG_A and/or PG_M between + * the read above and the store below. + */ + pde_store(l3e, mptepa); + pmap_invalidate_l3e_page(pmap, trunc_2mpage(va), oldpde); + /* + * Demote the PV entry. + */ + if ((oldpde & PG_MANAGED) != 0) + pmap_pv_demote_l3e(pmap, va, oldpde & PG_PS_FRAME, lockp); + + counter_u64_add(pmap_l3e_demotions, 1); + CTR2(KTR_PMAP, "pmap_demote_l3e: success for va %#lx" + " in pmap %p", va, pmap); + return (true); +} + +/* + * pmap_remove_kernel_pde: Remove a kernel superpage mapping. + */ +static void +pmap_remove_kernel_l3e(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va) +{ + vm_paddr_t mptepa; + vm_page_t mpte; + + KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + mpte = pmap_remove_pt_page(pmap, va); + if (mpte == NULL) + panic("pmap_remove_kernel_pde: Missing pt page."); + + mptepa = VM_PAGE_TO_PHYS(mpte); + + /* + * Initialize the page table page. + */ + pagezero(PHYS_TO_DMAP(mptepa)); + + /* + * Demote the mapping. 
+ */ + pde_store(l3e, mptepa); + ptesync(); +} + +/* + * pmap_remove_l3e: do the things to unmap a superpage in a process + */ +static int +pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva, + struct spglist *free, struct rwlock **lockp) +{ + struct md_page *pvh; + pml3_entry_t oldpde; + vm_offset_t eva, va; + vm_page_t m, mpte; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT((sva & L3_PAGE_MASK) == 0, + ("pmap_remove_l3e: sva is not 2mpage aligned")); + oldpde = be64toh(pte_load_clear(pdq)); + if (oldpde & PG_W) + pmap->pm_stats.wired_count -= (L3_PAGE_SIZE / PAGE_SIZE); + pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE); + if (oldpde & PG_MANAGED) { + CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); + pvh = pa_to_pvh(oldpde & PG_PS_FRAME); + pmap_pvh_free(pvh, pmap, sva); + eva = sva + L3_PAGE_SIZE; + for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); + va < eva; va += PAGE_SIZE, m++) { + if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(m); + if (oldpde & PG_A) + vm_page_aflag_set(m, PGA_REFERENCED); + if (TAILQ_EMPTY(&m->md.pv_list) && + TAILQ_EMPTY(&pvh->pv_list)) + vm_page_aflag_clear(m, PGA_WRITEABLE); + } + } + if (pmap == kernel_pmap) { + pmap_remove_kernel_l3e(pmap, pdq, sva); + } else { + mpte = pmap_remove_pt_page(pmap, sva); + if (mpte != NULL) { + pmap_resident_count_dec(pmap, 1); + KASSERT(mpte->ref_count == NPTEPG, + ("pmap_remove_l3e: pte page wire count error")); + mpte->ref_count = 0; + pmap_add_delayed_free_list(mpte, free, false); + } + } + return (pmap_unuse_pt(pmap, sva, be64toh(*pmap_pml2e(pmap, sva)), free)); +} + +/* + * pmap_remove_pte: do the things to unmap a page in a process + */ +static int +pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, + pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp) +{ + struct md_page *pvh; + pt_entry_t oldpte; + vm_page_t m; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + oldpte = be64toh(pte_load_clear(ptq)); + if (oldpte & RPTE_WIRED) + pmap->pm_stats.wired_count -= 1; + pmap_resident_count_dec(pmap, 1); + if (oldpte & RPTE_MANAGED) { + m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); + if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(m); + if (oldpte & PG_A) + vm_page_aflag_set(m, PGA_REFERENCED); + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); + pmap_pvh_free(&m->md, pmap, va); + if (TAILQ_EMPTY(&m->md.pv_list) && + (m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + if (TAILQ_EMPTY(&pvh->pv_list)) + vm_page_aflag_clear(m, PGA_WRITEABLE); + } + } + return (pmap_unuse_pt(pmap, va, ptepde, free)); +} + +/* + * Remove a single page from a process address space + */ +static bool +pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *l3e, + struct spglist *free) +{ + struct rwlock *lock; + pt_entry_t *pte; + bool invalidate_all; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + if ((be64toh(*l3e) & RPTE_VALID) == 0) { + return (false); + } + pte = pmap_l3e_to_pte(l3e, va); + if ((be64toh(*pte) & RPTE_VALID) == 0) { + return (false); + } + lock = NULL; + + invalidate_all = pmap_remove_pte(pmap, pte, va, be64toh(*l3e), free, &lock); + if (lock != NULL) + rw_wunlock(lock); + if (!invalidate_all) + pmap_invalidate_page(pmap, va); + return (invalidate_all); +} + +/* + * Removes the specified range of addresses from the page table page. 
+ */ +static bool +pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, + pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp) +{ + pt_entry_t *pte; + vm_offset_t va; + bool anyvalid; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + anyvalid = false; + va = eva; + for (pte = pmap_l3e_to_pte(l3e, sva); sva != eva; pte++, + sva += PAGE_SIZE) { + MPASS(pte == pmap_pte(pmap, sva)); + if (*pte == 0) { + if (va != eva) { + anyvalid = true; + va = eva; + } + continue; + } + if (va == eva) + va = sva; + if (pmap_remove_pte(pmap, pte, sva, be64toh(*l3e), free, lockp)) { + anyvalid = true; + sva += PAGE_SIZE; + break; + } + } + if (anyvalid) + pmap_invalidate_all(pmap); + else if (va != eva) + pmap_invalidate_range(pmap, va, sva); + return (anyvalid); +} + +void +mmu_radix_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + struct rwlock *lock; + vm_offset_t va_next; + pml1_entry_t *l1e; + pml2_entry_t *l2e; + pml3_entry_t ptpaddr, *l3e; + struct spglist free; + bool anyvalid; + + CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva); + + /* + * Perform an unsynchronized read. This is, however, safe. + */ + if (pmap->pm_stats.resident_count == 0) + return; + + anyvalid = false; + SLIST_INIT(&free); + + /* XXX something fishy here */ + sva = (sva + PAGE_MASK) & ~PAGE_MASK; + eva = (eva + PAGE_MASK) & ~PAGE_MASK; + + PMAP_LOCK(pmap); + + /* + * special handling of removing one page. a very + * common operation and easy to short circuit some + * code. + */ + if (sva + PAGE_SIZE == eva) { + l3e = pmap_pml3e(pmap, sva); + if (l3e && (be64toh(*l3e) & RPTE_LEAF) == 0) { + anyvalid = pmap_remove_page(pmap, sva, l3e, &free); + goto out; + } + } + + lock = NULL; + for (; sva < eva; sva = va_next) { + if (pmap->pm_stats.resident_count == 0) + break; + l1e = pmap_pml1e(pmap, sva); + if (l1e == NULL || (be64toh(*l1e) & PG_V) == 0) { + va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; + if (va_next < sva) + va_next = eva; + continue; + } + + l2e = pmap_l1e_to_l2e(l1e, sva); + if (l2e == NULL || (be64toh(*l2e) & PG_V) == 0) { + va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; + if (va_next < sva) + va_next = eva; + continue; + } + + /* + * Calculate index for next page table. + */ + va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; + if (va_next < sva) + va_next = eva; + + l3e = pmap_l2e_to_l3e(l2e, sva); + ptpaddr = be64toh(*l3e); + + /* + * Weed out invalid mappings. + */ + if (ptpaddr == 0) + continue; + + /* + * Check for large page. + */ + if ((ptpaddr & RPTE_LEAF) != 0) { + /* + * Are we removing the entire large page? If not, + * demote the mapping and fall through. + */ + if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { + pmap_remove_l3e(pmap, l3e, sva, &free, &lock); + anyvalid = true; + continue; + } else if (!pmap_demote_l3e_locked(pmap, l3e, sva, + &lock)) { + /* The large page mapping was destroyed. */ + continue; + } else + ptpaddr = be64toh(*l3e); + } + + /* + * Limit our scan to either the end of the va represented + * by the current page table page, or to the end of the + * range being removed. 
+ */ + if (va_next > eva) + va_next = eva; + + if (pmap_remove_ptes(pmap, sva, va_next, l3e, &free, &lock)) + anyvalid = true; + } + if (lock != NULL) + rw_wunlock(lock); +out: + if (anyvalid) + pmap_invalidate_all(pmap); + PMAP_UNLOCK(pmap); + vm_page_free_pages_toq(&free, true); +} + +void +mmu_radix_remove_all(vm_page_t m) +{ + struct md_page *pvh; + pv_entry_t pv; + pmap_t pmap; + struct rwlock *lock; + pt_entry_t *pte, tpte; + pml3_entry_t *l3e; + vm_offset_t va; + struct spglist free; + int pvh_gen, md_gen; + + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_remove_all: page %p is not managed", m)); + SLIST_INIT(&free); + lock = VM_PAGE_TO_PV_LIST_LOCK(m); + pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : + pa_to_pvh(VM_PAGE_TO_PHYS(m)); +retry: + rw_wlock(lock); + while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen) { + rw_wunlock(lock); + PMAP_UNLOCK(pmap); + goto retry; + } + } + va = pv->pv_va; + l3e = pmap_pml3e(pmap, va); + (void)pmap_demote_l3e_locked(pmap, l3e, va, &lock); + PMAP_UNLOCK(pmap); + } + while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + md_gen = m->md.pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { + rw_wunlock(lock); + PMAP_UNLOCK(pmap); + goto retry; + } + } + pmap_resident_count_dec(pmap, 1); + l3e = pmap_pml3e(pmap, pv->pv_va); + KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, ("pmap_remove_all: found" + " a 2mpage in page %p's pv list", m)); + pte = pmap_l3e_to_pte(l3e, pv->pv_va); + tpte = be64toh(pte_load_clear(pte)); + if (tpte & PG_W) + pmap->pm_stats.wired_count--; + if (tpte & PG_A) + vm_page_aflag_set(m, PGA_REFERENCED); + + /* + * Update the vm_page_t clean and reference bits. + */ + if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) + vm_page_dirty(m); + pmap_unuse_pt(pmap, pv->pv_va, be64toh(*l3e), &free); + pmap_invalidate_page(pmap, pv->pv_va); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); + m->md.pv_gen++; + free_pv_entry(pmap, pv); + PMAP_UNLOCK(pmap); + } + vm_page_aflag_clear(m, PGA_WRITEABLE); + rw_wunlock(lock); + vm_page_free_pages_toq(&free, true); +} + +/* + * Destroy all managed, non-wired mappings in the given user-space + * pmap. This pmap cannot be active on any processor besides the + * caller. + * + * This function cannot be applied to the kernel pmap. Moreover, it + * is not intended for general use. It is only to be used during + * process termination. Consequently, it can be implemented in ways + * that make it faster than pmap_remove(). First, it can more quickly + * destroy mappings by iterating over the pmap's collection of PV + * entries, rather than searching the page table. Second, it doesn't + * have to test and clear the page table entries atomically, because + * no processor is currently accessing the user address space. In + * particular, a page table entry's dirty bit won't change state once + * this function starts. + * + * Although this function destroys all of the pmap's managed, + * non-wired mappings, it can delay and batch the invalidation of TLB + * entries without calling pmap_delayed_invl_started() and + * pmap_delayed_invl_finished(). 
Because the pmap is not active on + * any other processor, none of these TLB entries will ever be used + * before their eventual invalidation. Consequently, there is no need + * for either pmap_remove_all() or pmap_remove_write() to wait for + * that eventual TLB invalidation. + */ + +void +mmu_radix_remove_pages(pmap_t pmap) +{ + + CTR2(KTR_PMAP, "%s(%p)", __func__, pmap); + pml3_entry_t ptel3e; + pt_entry_t *pte, tpte; + struct spglist free; + vm_page_t m, mpte, mt; + pv_entry_t pv; + struct md_page *pvh; + struct pv_chunk *pc, *npc; + struct rwlock *lock; + int64_t bit; + uint64_t inuse, bitmask; + int allfree, field, idx; +#ifdef PV_STATS + int freed; +#endif + bool superpage; + vm_paddr_t pa; + + /* + * Assert that the given pmap is only active on the current + * CPU. Unfortunately, we cannot block another CPU from + * activating the pmap while this function is executing. + */ + KASSERT(pmap->pm_pid == mfspr(SPR_PID), + ("non-current asid %lu - expected %lu", pmap->pm_pid, + mfspr(SPR_PID))); + + lock = NULL; + + SLIST_INIT(&free); + PMAP_LOCK(pmap); + TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { + allfree = 1; +#ifdef PV_STATS + freed = 0; +#endif + for (field = 0; field < _NPCM; field++) { + inuse = ~pc->pc_map[field] & pc_freemask[field]; + while (inuse != 0) { + bit = cnttzd(inuse); + bitmask = 1UL << bit; + idx = field * 64 + bit; + pv = &pc->pc_pventry[idx]; + inuse &= ~bitmask; + + pte = pmap_pml2e(pmap, pv->pv_va); + ptel3e = be64toh(*pte); + pte = pmap_l2e_to_l3e(pte, pv->pv_va); + tpte = be64toh(*pte); + if ((tpte & (RPTE_LEAF | PG_V)) == PG_V) { + superpage = false; + ptel3e = tpte; + pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & + PG_FRAME); + pte = &pte[pmap_pte_index(pv->pv_va)]; + tpte = be64toh(*pte); + } else { + /* + * Keep track whether 'tpte' is a + * superpage explicitly instead of + * relying on RPTE_LEAF being set. + * + * This is because RPTE_LEAF is numerically + * identical to PG_PTE_PAT and thus a + * regular page could be mistaken for + * a superpage. + */ + superpage = true; + } + + if ((tpte & PG_V) == 0) { + panic("bad pte va %lx pte %lx", + pv->pv_va, tpte); + } + +/* + * We cannot remove wired pages from a process' mapping at this time + */ + if (tpte & PG_W) { + allfree = 0; + continue; + } + + if (superpage) + pa = tpte & PG_PS_FRAME; + else + pa = tpte & PG_FRAME; + + m = PHYS_TO_VM_PAGE(pa); + KASSERT(m->phys_addr == pa, + ("vm_page_t %p phys_addr mismatch %016jx %016jx", + m, (uintmax_t)m->phys_addr, + (uintmax_t)tpte)); + + KASSERT((m->flags & PG_FICTITIOUS) != 0 || + m < &vm_page_array[vm_page_array_size], + ("pmap_remove_pages: bad tpte %#jx", + (uintmax_t)tpte)); + + pte_clear(pte); + + /* + * Update the vm_page_t clean/reference bits. 
+ */ + if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { + if (superpage) { + for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) + vm_page_dirty(mt); + } else + vm_page_dirty(m); + } + + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); + + /* Mark free */ + pc->pc_map[field] |= bitmask; + if (superpage) { + pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE); + pvh = pa_to_pvh(tpte & PG_PS_FRAME); + TAILQ_REMOVE(&pvh->pv_list, pv, pv_link); + pvh->pv_gen++; + if (TAILQ_EMPTY(&pvh->pv_list)) { + for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++) + if ((mt->a.flags & PGA_WRITEABLE) != 0 && + TAILQ_EMPTY(&mt->md.pv_list)) + vm_page_aflag_clear(mt, PGA_WRITEABLE); + } + mpte = pmap_remove_pt_page(pmap, pv->pv_va); + if (mpte != NULL) { + pmap_resident_count_dec(pmap, 1); + KASSERT(mpte->ref_count == NPTEPG, + ("pmap_remove_pages: pte page wire count error")); + mpte->ref_count = 0; + pmap_add_delayed_free_list(mpte, &free, false); + } + } else { + pmap_resident_count_dec(pmap, 1); +#ifdef VERBOSE_PV + printf("freeing pv (%p, %p)\n", + pmap, pv); +#endif + TAILQ_REMOVE(&m->md.pv_list, pv, pv_link); + m->md.pv_gen++; + if ((m->a.flags & PGA_WRITEABLE) != 0 && + TAILQ_EMPTY(&m->md.pv_list) && + (m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + if (TAILQ_EMPTY(&pvh->pv_list)) + vm_page_aflag_clear(m, PGA_WRITEABLE); + } + } + pmap_unuse_pt(pmap, pv->pv_va, ptel3e, &free); +#ifdef PV_STATS + freed++; +#endif + } + } + PV_STAT(atomic_add_long(&pv_entry_frees, freed)); + PV_STAT(atomic_add_int(&pv_entry_spare, freed)); + PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); + if (allfree) { + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + free_pv_chunk(pc); + } + } + if (lock != NULL) + rw_wunlock(lock); + pmap_invalidate_all(pmap); + PMAP_UNLOCK(pmap); + vm_page_free_pages_toq(&free, true); +} + +void +mmu_radix_remove_write(vm_page_t m) +{ + struct md_page *pvh; + pmap_t pmap; + struct rwlock *lock; + pv_entry_t next_pv, pv; + pml3_entry_t *l3e; + pt_entry_t oldpte, *pte; + int pvh_gen, md_gen; + + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_remove_write: page %p is not managed", m)); + vm_page_assert_busied(m); + + if (!pmap_page_is_write_mapped(m)) + return; + lock = VM_PAGE_TO_PV_LIST_LOCK(m); + pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : + pa_to_pvh(VM_PAGE_TO_PHYS(m)); +retry_pv_loop: + rw_wlock(lock); + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen) { + PMAP_UNLOCK(pmap); + rw_wunlock(lock); + goto retry_pv_loop; + } + } + l3e = pmap_pml3e(pmap, pv->pv_va); + if ((be64toh(*l3e) & PG_RW) != 0) + (void)pmap_demote_l3e_locked(pmap, l3e, pv->pv_va, &lock); + KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), + ("inconsistent pv lock %p %p for page %p", + lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); + PMAP_UNLOCK(pmap); + } + TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + md_gen = m->md.pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen || + md_gen != m->md.pv_gen) { + PMAP_UNLOCK(pmap); + rw_wunlock(lock); + goto retry_pv_loop; + } + } + l3e = pmap_pml3e(pmap, pv->pv_va); + KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, + ("pmap_remove_write: found a 2mpage in page %p's pv list", + m)); + pte = pmap_l3e_to_pte(l3e, pv->pv_va); +retry: + oldpte = be64toh(*pte); + if (oldpte & PG_RW) { + if (!atomic_cmpset_long(pte, htobe64(oldpte), + htobe64((oldpte | RPTE_EAA_R) & ~(PG_RW | PG_M)))) + goto retry; + if ((oldpte & PG_M) != 0) + vm_page_dirty(m); + pmap_invalidate_page(pmap, pv->pv_va); + } + PMAP_UNLOCK(pmap); + } + rw_wunlock(lock); + vm_page_aflag_clear(m, PGA_WRITEABLE); +} + +/* + * Clear the wired attribute from the mappings for the specified range of + * addresses in the given pmap. Every valid mapping within that range + * must have the wired attribute set. In contrast, invalid mappings + * cannot have the wired attribute set, so they are ignored. + * + * The wired attribute of the page table entry is not a hardware + * feature, so there is no need to invalidate any TLB entries. + * Since pmap_demote_l3e() for the wired entry must never fail, + * pmap_delayed_invl_started()/finished() calls around the + * function are not needed. + */ +void +mmu_radix_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + vm_offset_t va_next; + pml1_entry_t *l1e; + pml2_entry_t *l2e; + pml3_entry_t *l3e; + pt_entry_t *pte; + + CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva); + PMAP_LOCK(pmap); + for (; sva < eva; sva = va_next) { + l1e = pmap_pml1e(pmap, sva); + if ((be64toh(*l1e) & PG_V) == 0) { + va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK; + if (va_next < sva) + va_next = eva; + continue; + } + l2e = pmap_l1e_to_l2e(l1e, sva); + if ((be64toh(*l2e) & PG_V) == 0) { + va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK; + if (va_next < sva) + va_next = eva; + continue; + } + va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK; + if (va_next < sva) + va_next = eva; + l3e = pmap_l2e_to_l3e(l2e, sva); + if ((be64toh(*l3e) & PG_V) == 0) + continue; + if ((be64toh(*l3e) & RPTE_LEAF) != 0) { + if ((be64toh(*l3e) & PG_W) == 0) + panic("pmap_unwire: pde %#jx is missing PG_W", + (uintmax_t)(be64toh(*l3e))); + + /* + * Are we unwiring the entire large page? If not, + * demote the mapping and fall through. 
+ */ + if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) { + atomic_clear_long(l3e, htobe64(PG_W)); + pmap->pm_stats.wired_count -= L3_PAGE_SIZE / + PAGE_SIZE; + continue; + } else if (!pmap_demote_l3e(pmap, l3e, sva)) + panic("pmap_unwire: demotion failed"); + } + if (va_next > eva) + va_next = eva; + for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++, + sva += PAGE_SIZE) { + MPASS(pte == pmap_pte(pmap, sva)); + if ((be64toh(*pte) & PG_V) == 0) + continue; + if ((be64toh(*pte) & PG_W) == 0) + panic("pmap_unwire: pte %#jx is missing PG_W", + (uintmax_t)(be64toh(*pte))); + + /* + * PG_W must be cleared atomically. Although the pmap + * lock synchronizes access to PG_W, another processor + * could be setting PG_M and/or PG_A concurrently. + */ + atomic_clear_long(pte, htobe64(PG_W)); + pmap->pm_stats.wired_count--; + } + } + PMAP_UNLOCK(pmap); +} + +void +mmu_radix_zero_page(vm_page_t m) +{ + vm_offset_t addr; + + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + addr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); + pagezero(addr); +} + +void +mmu_radix_zero_page_area(vm_page_t m, int off, int size) +{ + caddr_t addr; + + CTR4(KTR_PMAP, "%s(%p, %d, %d)", __func__, m, off, size); + MPASS(off + size <= PAGE_SIZE); + addr = (caddr_t)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); + memset(addr + off, 0, size); +} + +static int +mmu_radix_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) +{ + pml3_entry_t *l3ep; + pt_entry_t pte; + vm_paddr_t pa; + int val; + + CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr); + PMAP_LOCK(pmap); + + l3ep = pmap_pml3e(pmap, addr); + if (l3ep != NULL && (be64toh(*l3ep) & PG_V)) { + if (be64toh(*l3ep) & RPTE_LEAF) { + pte = be64toh(*l3ep); + /* Compute the physical address of the 4KB page. */ + pa = ((be64toh(*l3ep) & PG_PS_FRAME) | (addr & L3_PAGE_MASK)) & + PG_FRAME; + val = MINCORE_PSIND(1); + } else { + /* Native endian PTE, do not pass to functions */ + pte = be64toh(*pmap_l3e_to_pte(l3ep, addr)); + pa = pte & PG_FRAME; + val = 0; + } + } else { + pte = 0; + pa = 0; + val = 0; + } + if ((pte & PG_V) != 0) { + val |= MINCORE_INCORE; + if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) + val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; + if ((pte & PG_A) != 0) + val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; + } + if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != + (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && + (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { + *locked_pa = pa; + } + PMAP_UNLOCK(pmap); + return (val); +} + +void +mmu_radix_activate(struct thread *td) +{ + pmap_t pmap; + uint32_t curpid; + + CTR2(KTR_PMAP, "%s(%p)", __func__, td); + critical_enter(); + pmap = vmspace_pmap(td->td_proc->p_vmspace); + curpid = mfspr(SPR_PID); + if (pmap->pm_pid > isa3_base_pid && + curpid != pmap->pm_pid) { + mmu_radix_pid_set(pmap); + } + critical_exit(); +} + +/* + * Increase the starting virtual address of the given mapping if a + * different alignment might result in more superpage mappings. 
+ */ +void +mmu_radix_align_superpage(vm_object_t object, vm_ooffset_t offset, + vm_offset_t *addr, vm_size_t size) +{ + + CTR5(KTR_PMAP, "%s(%p, %#x, %p, %#x)", __func__, object, offset, addr, + size); + vm_offset_t superpage_offset; + + if (size < L3_PAGE_SIZE) + return; + if (object != NULL && (object->flags & OBJ_COLORED) != 0) + offset += ptoa(object->pg_color); + superpage_offset = offset & L3_PAGE_MASK; + if (size - ((L3_PAGE_SIZE - superpage_offset) & L3_PAGE_MASK) < L3_PAGE_SIZE || + (*addr & L3_PAGE_MASK) == superpage_offset) + return; + if ((*addr & L3_PAGE_MASK) < superpage_offset) + *addr = (*addr & ~L3_PAGE_MASK) + superpage_offset; + else + *addr = ((*addr + L3_PAGE_MASK) & ~L3_PAGE_MASK) + superpage_offset; +} + +static void * +mmu_radix_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t attr) +{ + vm_offset_t va, tmpva, ppa, offset; + + ppa = trunc_page(pa); + offset = pa & PAGE_MASK; + size = roundup2(offset + size, PAGE_SIZE); + if (pa < powerpc_ptob(Maxmem)) + panic("bad pa: %#lx less than Maxmem %#lx\n", + pa, powerpc_ptob(Maxmem)); + va = kva_alloc(size); + if (bootverbose) + printf("%s(%#lx, %lu, %d)\n", __func__, pa, size, attr); + KASSERT(size > 0, ("%s(%#lx, %lu, %d)", __func__, pa, size, attr)); + + if (!va) + panic("%s: Couldn't alloc kernel virtual memory", __func__); + + for (tmpva = va; size > 0;) { + mmu_radix_kenter_attr(tmpva, ppa, attr); + size -= PAGE_SIZE; + tmpva += PAGE_SIZE; + ppa += PAGE_SIZE; + } + ptesync(); + + return ((void *)(va + offset)); +} + +static void * +mmu_radix_mapdev(vm_paddr_t pa, vm_size_t size) +{ + + CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size); + + return (mmu_radix_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT)); +} + +void +mmu_radix_page_set_memattr(vm_page_t m, vm_memattr_t ma) +{ + + CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, m, ma); + + if (m->md.mdpg_cache_attrs == ma) + return; + + m->md.mdpg_cache_attrs = ma; + + /* + * If "m" is a normal page, update its direct mapping. This update + * can be relied upon to perform any cache operations that are + * required for data coherence. + */ + if ((m->flags & PG_FICTITIOUS) == 0 && + mmu_radix_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), + PAGE_SIZE, m->md.mdpg_cache_attrs)) + panic("memory attribute change on the direct map failed"); +} + +static void +mmu_radix_unmapdev(void *p, vm_size_t size) +{ + vm_offset_t offset, va; + + CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, p, size); + + /* If we gave a direct map region in pmap_mapdev, do nothing */ + va = (vm_offset_t)p; + if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) + return; + + offset = va & PAGE_MASK; + size = round_page(offset + size); + va = trunc_page(va); + + if (pmap_initialized) { + mmu_radix_qremove(va, atop(size)); + kva_free(va, size); + } +} + +void +mmu_radix_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) +{ + vm_paddr_t pa = 0; + int sync_sz; + + if (__predict_false(pm == NULL)) + pm = &curthread->td_proc->p_vmspace->vm_pmap; + + while (sz > 0) { + pa = pmap_extract(pm, va); + sync_sz = PAGE_SIZE - (va & PAGE_MASK); + sync_sz = min(sync_sz, sz); + if (pa != 0) { + pa += (va & PAGE_MASK); + __syncicache((void *)PHYS_TO_DMAP(pa), sync_sz); + } + va += sync_sz; + sz -= sync_sz; + } +} + +static __inline void +pmap_pte_attr(pt_entry_t *pte, uint64_t cache_bits, uint64_t mask) +{ + uint64_t opte, npte; + + /* + * The cache mode bits are all in the low 32-bits of the + * PTE, so we can just spin on updating the low 32-bits. 
+ */ + do { + opte = be64toh(*pte); + npte = opte & ~mask; + npte |= cache_bits; + } while (npte != opte && !atomic_cmpset_long(pte, htobe64(opte), htobe64(npte))); +} + +/* + * Tries to demote a 1GB page mapping. + */ +static bool +pmap_demote_l2e(pmap_t pmap, pml2_entry_t *l2e, vm_offset_t va) +{ + pml2_entry_t oldpdpe; + pml3_entry_t *firstpde, newpde, *pde; + vm_paddr_t pdpgpa; + vm_page_t pdpg; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + oldpdpe = be64toh(*l2e); + KASSERT((oldpdpe & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V), + ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); + pdpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED); + if (pdpg == NULL) { + CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" + " in pmap %p", va, pmap); + return (false); + } + pdpg->pindex = va >> L2_PAGE_SIZE_SHIFT; + pdpgpa = VM_PAGE_TO_PHYS(pdpg); + firstpde = (pml3_entry_t *)PHYS_TO_DMAP(pdpgpa); + KASSERT((oldpdpe & PG_A) != 0, + ("pmap_demote_pdpe: oldpdpe is missing PG_A")); + KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, + ("pmap_demote_pdpe: oldpdpe is missing PG_M")); + newpde = oldpdpe; + + /* + * Initialize the page directory page. + */ + for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { + *pde = htobe64(newpde); + newpde += L3_PAGE_SIZE; + } + + /* + * Demote the mapping. + */ + pde_store(l2e, pdpgpa); + + /* + * Flush PWC --- XXX revisit + */ + pmap_invalidate_all(pmap); + + counter_u64_add(pmap_l2e_demotions, 1); + CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" + " in pmap %p", va, pmap); + return (true); +} + +vm_paddr_t +mmu_radix_kextract(vm_offset_t va) +{ + pml3_entry_t l3e; + vm_paddr_t pa; + + CTR2(KTR_PMAP, "%s(%#x)", __func__, va); + if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { + pa = DMAP_TO_PHYS(va); + } else { + /* Big-endian PTE on stack */ + l3e = *pmap_pml3e(kernel_pmap, va); + if (be64toh(l3e) & RPTE_LEAF) { + pa = (be64toh(l3e) & PG_PS_FRAME) | (va & L3_PAGE_MASK); + pa |= (va & L3_PAGE_MASK); + } else { + /* + * Beware of a concurrent promotion that changes the + * PDE at this point! For example, vtopte() must not + * be used to access the PTE because it would use the + * new PDE. It is, however, safe to use the old PDE + * because the page table page is preserved by the + * promotion. + */ + pa = be64toh(*pmap_l3e_to_pte(&l3e, va)); + pa = (pa & PG_FRAME) | (va & PAGE_MASK); + pa |= (va & PAGE_MASK); + } + } + return (pa); +} + +static pt_entry_t +mmu_radix_calc_wimg(vm_paddr_t pa, vm_memattr_t ma) +{ + + if (ma != VM_MEMATTR_DEFAULT) { + return pmap_cache_bits(ma); + } + + /* + * Assume the page is cache inhibited and access is guarded unless + * it's in our available memory array. 
+ */ + for (int i = 0; i < pregions_sz; i++) { + if ((pa >= pregions[i].mr_start) && + (pa < (pregions[i].mr_start + pregions[i].mr_size))) + return (RPTE_ATTR_MEM); + } + return (RPTE_ATTR_GUARDEDIO); +} + +static void +mmu_radix_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) +{ + pt_entry_t *pte, pteval; + uint64_t cache_bits; + + pte = kvtopte(va); + MPASS(pte != NULL); + pteval = pa | RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A; + cache_bits = mmu_radix_calc_wimg(pa, ma); + pte_store(pte, pteval | cache_bits); +} + +void +mmu_radix_kremove(vm_offset_t va) +{ + pt_entry_t *pte; + + CTR2(KTR_PMAP, "%s(%#x)", __func__, va); + + pte = kvtopte(va); + pte_clear(pte); +} + +int +mmu_radix_decode_kernel_ptr(vm_offset_t addr, + int *is_user, vm_offset_t *decoded) +{ + + CTR2(KTR_PMAP, "%s(%#jx)", __func__, (uintmax_t)addr); + *decoded = addr; + *is_user = (addr < VM_MAXUSER_ADDRESS); + return (0); +} + +static int +mmu_radix_dev_direct_mapped(vm_paddr_t pa, vm_size_t size) +{ + + CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size); + return (mem_valid(pa, size)); +} + +static void +mmu_radix_scan_init(void) +{ + + CTR1(KTR_PMAP, "%s()", __func__); + UNIMPLEMENTED(); +} + +static void +mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz, + void **va) +{ + CTR4(KTR_PMAP, "%s(%#jx, %#zx, %p)", __func__, (uintmax_t)pa, sz, va); + UNIMPLEMENTED(); +} + +vm_offset_t +mmu_radix_quick_enter_page(vm_page_t m) +{ + vm_paddr_t paddr; + + CTR2(KTR_PMAP, "%s(%p)", __func__, m); + paddr = VM_PAGE_TO_PHYS(m); + return (PHYS_TO_DMAP(paddr)); +} + +void +mmu_radix_quick_remove_page(vm_offset_t addr __unused) +{ + /* no work to do here */ + CTR2(KTR_PMAP, "%s(%#x)", __func__, addr); +} + +static void +pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) +{ + cpu_flush_dcache((void *)sva, eva - sva); +} + +int +mmu_radix_change_attr(vm_offset_t va, vm_size_t size, + vm_memattr_t mode) +{ + int error; + + CTR4(KTR_PMAP, "%s(%#x, %#zx, %d)", __func__, va, size, mode); + PMAP_LOCK(kernel_pmap); + error = pmap_change_attr_locked(va, size, mode, true); + PMAP_UNLOCK(kernel_pmap); + return (error); +} + +static int +pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush) +{ + vm_offset_t base, offset, tmpva; + vm_paddr_t pa_start, pa_end, pa_end1; + pml2_entry_t *l2e; + pml3_entry_t *l3e; + pt_entry_t *pte; + int cache_bits, error; + bool changed; + + PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); + base = trunc_page(va); + offset = va & PAGE_MASK; + size = round_page(offset + size); + + /* + * Only supported on kernel virtual addresses, including the direct + * map but excluding the recursive map. + */ + if (base < DMAP_MIN_ADDRESS) + return (EINVAL); + + cache_bits = pmap_cache_bits(mode); + changed = false; + + /* + * Pages that aren't mapped aren't supported. Also break down 2MB pages + * into 4KB pages if required. + */ + for (tmpva = base; tmpva < base + size; ) { + l2e = pmap_pml2e(kernel_pmap, tmpva); + if (l2e == NULL || *l2e == 0) + return (EINVAL); + if (be64toh(*l2e) & RPTE_LEAF) { + /* + * If the current 1GB page already has the required + * memory type, then we need not demote this page. Just + * increment tmpva to the next 1GB page frame. + */ + if ((be64toh(*l2e) & RPTE_ATTR_MASK) == cache_bits) { + tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE; + continue; + } + + /* + * If the current offset aligns with a 1GB page frame + * and there is at least 1GB left within the range, then + * we need not break down this page into 2MB pages. 
+ */ + if ((tmpva & L2_PAGE_MASK) == 0 && + tmpva + L2_PAGE_MASK < base + size) { + tmpva += L2_PAGE_MASK; + continue; + } + if (!pmap_demote_l2e(kernel_pmap, l2e, tmpva)) + return (ENOMEM); + } + l3e = pmap_l2e_to_l3e(l2e, tmpva); + KASSERT(l3e != NULL, ("no l3e entry for %#lx in %p\n", + tmpva, l2e)); + if (*l3e == 0) + return (EINVAL); + if (be64toh(*l3e) & RPTE_LEAF) { + /* + * If the current 2MB page already has the required + * memory type, then we need not demote this page. Just + * increment tmpva to the next 2MB page frame. + */ + if ((be64toh(*l3e) & RPTE_ATTR_MASK) == cache_bits) { + tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE; + continue; + } + + /* + * If the current offset aligns with a 2MB page frame + * and there is at least 2MB left within the range, then + * we need not break down this page into 4KB pages. + */ + if ((tmpva & L3_PAGE_MASK) == 0 && + tmpva + L3_PAGE_MASK < base + size) { + tmpva += L3_PAGE_SIZE; + continue; + } + if (!pmap_demote_l3e(kernel_pmap, l3e, tmpva)) + return (ENOMEM); + } + pte = pmap_l3e_to_pte(l3e, tmpva); + if (*pte == 0) + return (EINVAL); + tmpva += PAGE_SIZE; + } + error = 0; + + /* + * Ok, all the pages exist, so run through them updating their + * cache mode if required. + */ + pa_start = pa_end = 0; + for (tmpva = base; tmpva < base + size; ) { + l2e = pmap_pml2e(kernel_pmap, tmpva); + if (be64toh(*l2e) & RPTE_LEAF) { + if ((be64toh(*l2e) & RPTE_ATTR_MASK) != cache_bits) { + pmap_pte_attr(l2e, cache_bits, + RPTE_ATTR_MASK); + changed = true; + } + if (tmpva >= VM_MIN_KERNEL_ADDRESS && + (*l2e & PG_PS_FRAME) < dmaplimit) { + if (pa_start == pa_end) { + /* Start physical address run. */ + pa_start = be64toh(*l2e) & PG_PS_FRAME; + pa_end = pa_start + L2_PAGE_SIZE; + } else if (pa_end == (be64toh(*l2e) & PG_PS_FRAME)) + pa_end += L2_PAGE_SIZE; + else { + /* Run ended, update direct map. */ + error = pmap_change_attr_locked( + PHYS_TO_DMAP(pa_start), + pa_end - pa_start, mode, flush); + if (error != 0) + break; + /* Start physical address run. */ + pa_start = be64toh(*l2e) & PG_PS_FRAME; + pa_end = pa_start + L2_PAGE_SIZE; + } + } + tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE; + continue; + } + l3e = pmap_l2e_to_l3e(l2e, tmpva); + if (be64toh(*l3e) & RPTE_LEAF) { + if ((be64toh(*l3e) & RPTE_ATTR_MASK) != cache_bits) { + pmap_pte_attr(l3e, cache_bits, + RPTE_ATTR_MASK); + changed = true; + } + if (tmpva >= VM_MIN_KERNEL_ADDRESS && + (be64toh(*l3e) & PG_PS_FRAME) < dmaplimit) { + if (pa_start == pa_end) { + /* Start physical address run. */ + pa_start = be64toh(*l3e) & PG_PS_FRAME; + pa_end = pa_start + L3_PAGE_SIZE; + } else if (pa_end == (be64toh(*l3e) & PG_PS_FRAME)) + pa_end += L3_PAGE_SIZE; + else { + /* Run ended, update direct map. */ + error = pmap_change_attr_locked( + PHYS_TO_DMAP(pa_start), + pa_end - pa_start, mode, flush); + if (error != 0) + break; + /* Start physical address run. */ + pa_start = be64toh(*l3e) & PG_PS_FRAME; + pa_end = pa_start + L3_PAGE_SIZE; + } + } + tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE; + } else { + pte = pmap_l3e_to_pte(l3e, tmpva); + if ((be64toh(*pte) & RPTE_ATTR_MASK) != cache_bits) { + pmap_pte_attr(pte, cache_bits, + RPTE_ATTR_MASK); + changed = true; + } + if (tmpva >= VM_MIN_KERNEL_ADDRESS && + (be64toh(*pte) & PG_FRAME) < dmaplimit) { + if (pa_start == pa_end) { + /* Start physical address run. */ + pa_start = be64toh(*pte) & PG_FRAME; + pa_end = pa_start + PAGE_SIZE; + } else if (pa_end == (be64toh(*pte) & PG_FRAME)) + pa_end += PAGE_SIZE; + else { + /* Run ended, update direct map. 
*/ + error = pmap_change_attr_locked( + PHYS_TO_DMAP(pa_start), + pa_end - pa_start, mode, flush); + if (error != 0) + break; + /* Start physical address run. */ + pa_start = be64toh(*pte) & PG_FRAME; + pa_end = pa_start + PAGE_SIZE; + } + } + tmpva += PAGE_SIZE; + } + } + if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { + pa_end1 = MIN(pa_end, dmaplimit); + if (pa_start != pa_end1) + error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), + pa_end1 - pa_start, mode, flush); + } + + /* + * Flush CPU caches if required to make sure any data isn't cached that + * shouldn't be, etc. + */ + if (changed) { + pmap_invalidate_all(kernel_pmap); + + if (flush) + pmap_invalidate_cache_range(base, tmpva); + } + return (error); +} + +/* + * Allocate physical memory for the vm_page array and map it into KVA, + * attempting to back the vm_pages with domain-local memory. + */ +void +mmu_radix_page_array_startup(long pages) +{ +#ifdef notyet + pml2_entry_t *l2e; + pml3_entry_t *pde; + pml3_entry_t newl3; + vm_offset_t va; + long pfn; + int domain, i; +#endif + vm_paddr_t pa; + vm_offset_t start, end; + + vm_page_array_size = pages; + + start = VM_MIN_KERNEL_ADDRESS; + end = start + pages * sizeof(struct vm_page); + + pa = vm_phys_early_alloc(-1, end - start); + + start = mmu_radix_map(&start, pa, end - start, VM_MEMATTR_DEFAULT); +#ifdef notyet + /* TODO: NUMA vm_page_array. Blocked out until then (copied from amd64). */ + for (va = start; va < end; va += L3_PAGE_SIZE) { + pfn = first_page + (va - start) / sizeof(struct vm_page); + domain = vm_phys_domain(ptoa(pfn)); + l2e = pmap_pml2e(kernel_pmap, va); + if ((be64toh(*l2e) & PG_V) == 0) { + pa = vm_phys_early_alloc(domain, PAGE_SIZE); + dump_add_page(pa); + pagezero(PHYS_TO_DMAP(pa)); + pde_store(l2e, (pml2_entry_t)pa); + } + pde = pmap_l2e_to_l3e(l2e, va); + if ((be64toh(*pde) & PG_V) != 0) + panic("Unexpected pde %p", pde); + pa = vm_phys_early_alloc(domain, L3_PAGE_SIZE); + for (i = 0; i < NPDEPG; i++) + dump_add_page(pa + i * PAGE_SIZE); + newl3 = (pml3_entry_t)(pa | RPTE_EAA_P | RPTE_EAA_R | RPTE_EAA_W); + pte_store(pde, newl3); + } +#endif + vm_page_array = (vm_page_t)start; +} + +#ifdef DDB +#include <sys/kdb.h> +#include <ddb/ddb.h> + +static void +pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va) +{ + pml1_entry_t *l1e; + pml2_entry_t *l2e; + pml3_entry_t *l3e; + pt_entry_t *pte; + + l1e = &l1[pmap_pml1e_index(va)]; + db_printf("VA %#016lx l1e %#016lx", va, be64toh(*l1e)); + if ((be64toh(*l1e) & PG_V) == 0) { + db_printf("\n"); + return; + } + l2e = pmap_l1e_to_l2e(l1e, va); + db_printf(" l2e %#016lx", be64toh(*l2e)); + if ((be64toh(*l2e) & PG_V) == 0 || (be64toh(*l2e) & RPTE_LEAF) != 0) { + db_printf("\n"); + return; + } + l3e = pmap_l2e_to_l3e(l2e, va); + db_printf(" l3e %#016lx", be64toh(*l3e)); + if ((be64toh(*l3e) & PG_V) == 0 || (be64toh(*l3e) & RPTE_LEAF) != 0) { + db_printf("\n"); + return; + } + pte = pmap_l3e_to_pte(l3e, va); + db_printf(" pte %#016lx\n", be64toh(*pte)); +} + +void +pmap_page_print_mappings(vm_page_t m) +{ + pmap_t pmap; + pv_entry_t pv; + + db_printf("page %p(%lx)\n", m, m->phys_addr); + /* need to elide locks if running in ddb */ + TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { + db_printf("pv: %p ", pv); + db_printf("va: %#016lx ", pv->pv_va); + pmap = PV_PMAP(pv); + db_printf("pmap %p ", pmap); + if (pmap != NULL) { + db_printf("asid: %lu\n", pmap->pm_pid); + pmap_pte_walk(pmap->pm_pml1, pv->pv_va); + } + } +} + +DB_SHOW_COMMAND(pte, pmap_print_pte) +{ + vm_offset_t va; + pmap_t pmap; + + if (!have_addr) 
{ + db_printf("show pte addr\n"); + return; + } + va = (vm_offset_t)addr; + + if (va >= DMAP_MIN_ADDRESS) + pmap = kernel_pmap; + else if (kdb_thread != NULL) + pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace); + else + pmap = vmspace_pmap(curthread->td_proc->p_vmspace); + + pmap_pte_walk(pmap->pm_pml1, va); +} + +#endif diff --git a/sys/powerpc/aim/moea64_native.c b/sys/powerpc/aim/moea64_native.c new file mode 100644 index 000000000000..bf254e1f466c --- /dev/null +++ b/sys/powerpc/aim/moea64_native.c @@ -0,0 +1,1031 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause AND BSD-4-Clause + * + * Copyright (c) 2001 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Matt Thomas <matt@3am-software.com> of Allegro Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/*- + * Copyright (C) 1995, 1996 Wolfgang Solfrank. + * Copyright (C) 1995, 1996 TooLs GmbH. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by TooLs GmbH. + * 4. The name of TooLs GmbH may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $NetBSD: pmap.c,v 1.28 2000/03/26 20:42:36 kleink Exp $ + */ +/*- + * Copyright (C) 2001 Benno Rice. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +/* + * Native 64-bit page table operations for running without a hypervisor. + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/sched.h> +#include <sys/sysctl.h> +#include <sys/systm.h> +#include <sys/rwlock.h> +#include <sys/endian.h> + +#include <sys/kdb.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/vm_pageout.h> + +#include <machine/cpu.h> +#include <machine/hid.h> +#include <machine/md_var.h> +#include <machine/mmuvar.h> + +#include "mmu_oea64.h" + +#define PTESYNC() __asm __volatile("ptesync"); +#define TLBSYNC() __asm __volatile("tlbsync; ptesync"); +#define SYNC() __asm __volatile("sync"); +#define EIEIO() __asm __volatile("eieio"); + +#define VSID_HASH_MASK 0x0000007fffffffffULL + +/* POWER9 only permits a 64k partition table size. 
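+ * The size is passed to PTCR below as flsl((PART_SIZE >> 12) - 1) = 4,
+ * i.e. a 64 KB table holding 4096 16-byte partition table entries.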
*/ +#define PART_SIZE 0x10000 + +/* Actual page sizes (to be used with tlbie, when L=0) */ +#define AP_4K 0x00 +#define AP_16M 0x80 + +#define LPTE_KERNEL_VSID_BIT (KERNEL_VSID_BIT << \ + (16 - (ADDR_API_SHFT64 - ADDR_PIDX_SHFT))) + +/* Abbreviated Virtual Address Page - high bits */ +#define LPTE_AVA_PGNHI_MASK 0x0000000000000F80ULL +#define LPTE_AVA_PGNHI_SHIFT 7 + +/* Effective Address Page - low bits */ +#define EA_PAGELO_MASK 0x7ffULL +#define EA_PAGELO_SHIFT 11 + +static bool moea64_crop_tlbie; +static bool moea64_need_lock; + +/* + * The tlbie instruction has two forms: an old one used by PowerISA + * 2.03 and prior, and a newer one used by PowerISA 2.06 and later. + * We need to support both. + */ +static __inline void +TLBIE(uint64_t vpn, uint64_t oldptehi) +{ +#ifndef __powerpc64__ + register_t vpn_hi, vpn_lo; + register_t msr; + register_t scratch, intr; +#endif + + static volatile u_int tlbie_lock = 0; + bool need_lock = moea64_need_lock; + + vpn <<= ADDR_PIDX_SHFT; + + /* Hobo spinlock: we need stronger guarantees than mutexes provide */ + if (need_lock) { + while (!atomic_cmpset_int(&tlbie_lock, 0, 1)); + isync(); /* Flush instruction queue once lock acquired */ + + if (moea64_crop_tlbie) { + vpn &= ~(0xffffULL << 48); +#ifdef __powerpc64__ + if ((oldptehi & LPTE_BIG) != 0) + __asm __volatile("tlbie %0, 1" :: "r"(vpn) : + "memory"); + else + __asm __volatile("tlbie %0, 0" :: "r"(vpn) : + "memory"); + __asm __volatile("eieio; tlbsync; ptesync" ::: + "memory"); + goto done; +#endif + } + } + +#ifdef __powerpc64__ + /* + * If this page has LPTE_BIG set and is from userspace, then + * it must be a superpage with 4KB base/16MB actual page size. + */ + if ((oldptehi & LPTE_BIG) != 0 && + (oldptehi & LPTE_KERNEL_VSID_BIT) == 0) + vpn |= AP_16M; + + /* + * Explicitly clobber r0. The tlbie instruction has two forms: an old + * one used by PowerISA 2.03 and prior, and a newer one used by PowerISA + * 2.06 (maybe 2.05?) and later. We need to support both, and it just + * so happens that since we use 4k pages we can simply zero out r0, and + * clobber it, and the assembler will interpret the single-operand form + * of tlbie as having RB set, and everything else as 0. The RS operand + * in the newer form is in the same position as the L(page size) bit of + * the old form, so a slong as RS is 0, we're good on both sides. + */ + __asm __volatile("li 0, 0 \n tlbie %0, 0" :: "r"(vpn) : "r0", "memory"); + __asm __volatile("eieio; tlbsync; ptesync" ::: "memory"); +done: + +#else + vpn_hi = (uint32_t)(vpn >> 32); + vpn_lo = (uint32_t)vpn; + + intr = intr_disable(); + __asm __volatile("\ + mfmsr %0; \ + mr %1, %0; \ + insrdi %1,%5,1,0; \ + mtmsrd %1; isync; \ + \ + sld %1,%2,%4; \ + or %1,%1,%3; \ + tlbie %1; \ + \ + mtmsrd %0; isync; \ + eieio; \ + tlbsync; \ + ptesync;" + : "=r"(msr), "=r"(scratch) : "r"(vpn_hi), "r"(vpn_lo), "r"(32), "r"(1) + : "memory"); + intr_restore(intr); +#endif + + /* No barriers or special ops -- taken care of by ptesync above */ + if (need_lock) + tlbie_lock = 0; +} + +#define DISABLE_TRANS(msr) msr = mfmsr(); mtmsr(msr & ~PSL_DR) +#define ENABLE_TRANS(msr) mtmsr(msr) + +/* + * PTEG data. + */ +static volatile struct lpte *moea64_pteg_table; +static struct rwlock moea64_eviction_lock; + +static volatile struct pate *moea64_part_table; + +/* + * Dump function. + */ +static void *moea64_dump_pmap_native(void *ctx, void *buf, + u_long *nbytes); + +/* + * PTE calls. 
+ */ +static int64_t moea64_pte_insert_native(struct pvo_entry *); +static int64_t moea64_pte_synch_native(struct pvo_entry *); +static int64_t moea64_pte_clear_native(struct pvo_entry *, uint64_t); +static int64_t moea64_pte_replace_native(struct pvo_entry *, int); +static int64_t moea64_pte_unset_native(struct pvo_entry *); +static int64_t moea64_pte_insert_sp_native(struct pvo_entry *); +static int64_t moea64_pte_unset_sp_native(struct pvo_entry *); +static int64_t moea64_pte_replace_sp_native(struct pvo_entry *); + +/* + * Utility routines. + */ +static void moea64_bootstrap_native( + vm_offset_t kernelstart, vm_offset_t kernelend); +static void moea64_cpu_bootstrap_native(int ap); +static void tlbia(void); +static void moea64_install_native(void); + +static struct pmap_funcs moea64_native_methods = { + .install = moea64_install_native, + + /* Internal interfaces */ + .bootstrap = moea64_bootstrap_native, + .cpu_bootstrap = moea64_cpu_bootstrap_native, + .dumpsys_dump_pmap = moea64_dump_pmap_native, +}; + +static struct moea64_funcs moea64_native_funcs = { + .pte_synch = moea64_pte_synch_native, + .pte_clear = moea64_pte_clear_native, + .pte_unset = moea64_pte_unset_native, + .pte_replace = moea64_pte_replace_native, + .pte_insert = moea64_pte_insert_native, + .pte_insert_sp = moea64_pte_insert_sp_native, + .pte_unset_sp = moea64_pte_unset_sp_native, + .pte_replace_sp = moea64_pte_replace_sp_native, +}; + +MMU_DEF_INHERIT(oea64_mmu_native, MMU_TYPE_G5, moea64_native_methods, oea64_mmu); + +static void +moea64_install_native(void) +{ + + /* Install the MOEA64 ops. */ + moea64_ops = &moea64_native_funcs; + + moea64_install(); +} + +static int64_t +moea64_pte_synch_native(struct pvo_entry *pvo) +{ + volatile struct lpte *pt = moea64_pteg_table + pvo->pvo_pte.slot; + uint64_t ptelo, pvo_ptevpn; + + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); + + pvo_ptevpn = moea64_pte_vpn_from_pvo_vpn(pvo); + + rw_rlock(&moea64_eviction_lock); + if ((be64toh(pt->pte_hi) & LPTE_AVPN_MASK) != pvo_ptevpn) { + /* Evicted */ + rw_runlock(&moea64_eviction_lock); + return (-1); + } + + PTESYNC(); + ptelo = be64toh(pt->pte_lo); + + rw_runlock(&moea64_eviction_lock); + + return (ptelo & (LPTE_REF | LPTE_CHG)); +} + +static int64_t +moea64_pte_clear_native(struct pvo_entry *pvo, uint64_t ptebit) +{ + volatile struct lpte *pt = moea64_pteg_table + pvo->pvo_pte.slot; + struct lpte properpt; + uint64_t ptelo; + + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); + + moea64_pte_from_pvo(pvo, &properpt); + + rw_rlock(&moea64_eviction_lock); + if ((be64toh(pt->pte_hi) & LPTE_AVPN_MASK) != + (properpt.pte_hi & LPTE_AVPN_MASK)) { + /* Evicted */ + rw_runlock(&moea64_eviction_lock); + return (-1); + } + + if (ptebit == LPTE_REF) { + /* See "Resetting the Reference Bit" in arch manual */ + PTESYNC(); + /* 2-step here safe: precision is not guaranteed */ + ptelo = be64toh(pt->pte_lo); + + /* One-byte store to avoid touching the C bit */ + ((volatile uint8_t *)(&pt->pte_lo))[6] = +#if BYTE_ORDER == BIG_ENDIAN + ((uint8_t *)(&properpt.pte_lo))[6]; +#else + ((uint8_t *)(&properpt.pte_lo))[1]; +#endif + rw_runlock(&moea64_eviction_lock); + + critical_enter(); + TLBIE(pvo->pvo_vpn, properpt.pte_hi); + critical_exit(); + } else { + rw_runlock(&moea64_eviction_lock); + ptelo = moea64_pte_unset_native(pvo); + moea64_pte_insert_native(pvo); + } + + return (ptelo & (LPTE_REF | LPTE_CHG)); +} + +static __always_inline int64_t +moea64_pte_unset_locked(volatile struct lpte *pt, uint64_t vpn) +{ + uint64_t ptelo, ptehi; + + /* + * Invalidate the 
pte, briefly locking it to collect RC bits. No + * atomics needed since this is protected against eviction by the lock. + */ + isync(); + critical_enter(); + ptehi = (be64toh(pt->pte_hi) & ~LPTE_VALID) | LPTE_LOCKED; + pt->pte_hi = htobe64(ptehi); + PTESYNC(); + TLBIE(vpn, ptehi); + ptelo = be64toh(pt->pte_lo); + *((volatile int32_t *)(&pt->pte_hi) + 1) = 0; /* Release lock */ + critical_exit(); + + /* Keep statistics */ + STAT_MOEA64(moea64_pte_valid--); + + return (ptelo & (LPTE_CHG | LPTE_REF)); +} + +static int64_t +moea64_pte_unset_native(struct pvo_entry *pvo) +{ + volatile struct lpte *pt = moea64_pteg_table + pvo->pvo_pte.slot; + int64_t ret; + uint64_t pvo_ptevpn; + + pvo_ptevpn = moea64_pte_vpn_from_pvo_vpn(pvo); + + rw_rlock(&moea64_eviction_lock); + + if ((be64toh(pt->pte_hi) & LPTE_AVPN_MASK) != pvo_ptevpn) { + /* Evicted */ + STAT_MOEA64(moea64_pte_overflow--); + ret = -1; + } else + ret = moea64_pte_unset_locked(pt, pvo->pvo_vpn); + + rw_runlock(&moea64_eviction_lock); + + return (ret); +} + +static int64_t +moea64_pte_replace_inval_native(struct pvo_entry *pvo, + volatile struct lpte *pt) +{ + struct lpte properpt; + uint64_t ptelo, ptehi; + + moea64_pte_from_pvo(pvo, &properpt); + + rw_rlock(&moea64_eviction_lock); + if ((be64toh(pt->pte_hi) & LPTE_AVPN_MASK) != + (properpt.pte_hi & LPTE_AVPN_MASK)) { + /* Evicted */ + STAT_MOEA64(moea64_pte_overflow--); + rw_runlock(&moea64_eviction_lock); + return (-1); + } + + /* + * Replace the pte, briefly locking it to collect RC bits. No + * atomics needed since this is protected against eviction by the lock. + */ + isync(); + critical_enter(); + ptehi = (be64toh(pt->pte_hi) & ~LPTE_VALID) | LPTE_LOCKED; + pt->pte_hi = htobe64(ptehi); + PTESYNC(); + TLBIE(pvo->pvo_vpn, ptehi); + ptelo = be64toh(pt->pte_lo); + EIEIO(); + pt->pte_lo = htobe64(properpt.pte_lo); + EIEIO(); + pt->pte_hi = htobe64(properpt.pte_hi); /* Release lock */ + PTESYNC(); + critical_exit(); + rw_runlock(&moea64_eviction_lock); + + return (ptelo & (LPTE_CHG | LPTE_REF)); +} + +static int64_t +moea64_pte_replace_native(struct pvo_entry *pvo, int flags) +{ + volatile struct lpte *pt = moea64_pteg_table + pvo->pvo_pte.slot; + struct lpte properpt; + int64_t ptelo; + + if (flags == 0) { + /* Just some software bits changing. 
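+		 * Only software-maintained bits differ from the current entry,
+		 * so pte_hi can be rewritten in place with no TLB invalidation.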
*/ + moea64_pte_from_pvo(pvo, &properpt); + + rw_rlock(&moea64_eviction_lock); + if ((be64toh(pt->pte_hi) & LPTE_AVPN_MASK) != + (properpt.pte_hi & LPTE_AVPN_MASK)) { + rw_runlock(&moea64_eviction_lock); + return (-1); + } + pt->pte_hi = htobe64(properpt.pte_hi); + ptelo = be64toh(pt->pte_lo); + rw_runlock(&moea64_eviction_lock); + } else { + /* Otherwise, need reinsertion and deletion */ + ptelo = moea64_pte_replace_inval_native(pvo, pt); + } + + return (ptelo); +} + +static void +moea64_cpu_bootstrap_native(int ap) +{ + int i = 0; + #ifdef __powerpc64__ + struct slb *slb = PCPU_GET(aim.slb); + register_t seg0; + #endif + + /* + * Initialize segment registers and MMU + */ + + mtmsr(mfmsr() & ~PSL_DR & ~PSL_IR); + + switch(mfpvr() >> 16) { + case IBMPOWER9: + mtspr(SPR_HID0, mfspr(SPR_HID0) & ~HID0_RADIX); + break; + } + + /* + * Install kernel SLB entries + */ + + #ifdef __powerpc64__ + __asm __volatile ("slbia"); + __asm __volatile ("slbmfee %0,%1; slbie %0;" : "=r"(seg0) : + "r"(0)); + + for (i = 0; i < n_slbs; i++) { + if (!(slb[i].slbe & SLBE_VALID)) + continue; + + __asm __volatile ("slbmte %0, %1" :: + "r"(slb[i].slbv), "r"(slb[i].slbe)); + } + #else + for (i = 0; i < 16; i++) + mtsrin(i << ADDR_SR_SHFT, kernel_pmap->pm_sr[i]); + #endif + + /* + * Install page table + */ + + if (cpu_features2 & PPC_FEATURE2_ARCH_3_00) + mtspr(SPR_PTCR, + ((uintptr_t)moea64_part_table & ~DMAP_BASE_ADDRESS) | + flsl((PART_SIZE >> 12) - 1)); + else + __asm __volatile ("ptesync; mtsdr1 %0; isync" + :: "r"(((uintptr_t)moea64_pteg_table & ~DMAP_BASE_ADDRESS) + | (uintptr_t)(flsl(moea64_pteg_mask >> 11)))); + tlbia(); +} + +static void +moea64_bootstrap_native(vm_offset_t kernelstart, vm_offset_t kernelend) +{ + vm_size_t size; + vm_offset_t off; + vm_paddr_t pa; + register_t msr; + + moea64_early_bootstrap(kernelstart, kernelend); + + switch (mfpvr() >> 16) { + case IBMPOWER9: + moea64_need_lock = false; + break; + case IBMPOWER4: + case IBMPOWER4PLUS: + case IBM970: + case IBM970FX: + case IBM970GX: + case IBM970MP: + moea64_crop_tlbie = true; + default: + moea64_need_lock = true; + } + /* + * Allocate PTEG table. + */ + + size = moea64_pteg_count * sizeof(struct lpteg); + CTR2(KTR_PMAP, "moea64_bootstrap: %lu PTEGs, %lu bytes", + moea64_pteg_count, size); + rw_init(&moea64_eviction_lock, "pte eviction"); + + /* + * We now need to allocate memory. This memory, to be allocated, + * has to reside in a page table. The page table we are about to + * allocate. We don't have BAT. So drop to data real mode for a minute + * as a measure of last resort. We do this a couple times. + */ + /* + * PTEG table must be aligned on a 256k boundary, but can be placed + * anywhere with that alignment on POWER ISA 3+ systems. On earlier + * systems, offset addition is done by the CPU with bitwise OR rather + * than addition, so the table must also be aligned on a boundary of + * its own size. Pick the larger of the two, which works on all + * systems. + */ + moea64_pteg_table = (struct lpte *)moea64_bootstrap_alloc(size, + MAX(256*1024, size)); + if (hw_direct_map) + moea64_pteg_table = + (struct lpte *)PHYS_TO_DMAP((vm_offset_t)moea64_pteg_table); + /* Allocate partition table (ISA 3.0). 
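+	 * On ISA 3.0 CPUs, entry 0 of this table supplies the hashed page
+	 * table base and size in place of SDR1 (see the pagetab setup below).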
*/ + if (cpu_features2 & PPC_FEATURE2_ARCH_3_00) { + moea64_part_table = + (struct pate *)moea64_bootstrap_alloc(PART_SIZE, PART_SIZE); + moea64_part_table = + (struct pate *)PHYS_TO_DMAP((vm_offset_t)moea64_part_table); + } + DISABLE_TRANS(msr); + bzero(__DEVOLATILE(void *, moea64_pteg_table), moea64_pteg_count * + sizeof(struct lpteg)); + if (cpu_features2 & PPC_FEATURE2_ARCH_3_00) { + bzero(__DEVOLATILE(void *, moea64_part_table), PART_SIZE); + moea64_part_table[0].pagetab = htobe64( + (DMAP_TO_PHYS((vm_offset_t)moea64_pteg_table)) | + (uintptr_t)(flsl((moea64_pteg_count - 1) >> 11))); + } + ENABLE_TRANS(msr); + + CTR1(KTR_PMAP, "moea64_bootstrap: PTEG table at %p", moea64_pteg_table); + + moea64_mid_bootstrap(kernelstart, kernelend); + + /* + * Add a mapping for the page table itself if there is no direct map. + */ + if (!hw_direct_map) { + size = moea64_pteg_count * sizeof(struct lpteg); + off = (vm_offset_t)(moea64_pteg_table); + DISABLE_TRANS(msr); + for (pa = off; pa < off + size; pa += PAGE_SIZE) + pmap_kenter(pa, pa); + ENABLE_TRANS(msr); + } + + /* Bring up virtual memory */ + moea64_late_bootstrap(kernelstart, kernelend); +} + +static void +tlbia(void) +{ + vm_offset_t i; + #ifndef __powerpc64__ + register_t msr, scratch; + #endif + + i = 0xc00; /* IS = 11 */ + switch (mfpvr() >> 16) { + case IBM970: + case IBM970FX: + case IBM970MP: + case IBM970GX: + case IBMPOWER4: + case IBMPOWER4PLUS: + case IBMPOWER5: + case IBMPOWER5PLUS: + i = 0; /* IS not supported */ + break; + } + + TLBSYNC(); + + for (; i < 0x400000; i += 0x00001000) { + #ifdef __powerpc64__ + __asm __volatile("tlbiel %0" :: "r"(i)); + #else + __asm __volatile("\ + mfmsr %0; \ + mr %1, %0; \ + insrdi %1,%3,1,0; \ + mtmsrd %1; \ + isync; \ + \ + tlbiel %2; \ + \ + mtmsrd %0; \ + isync;" + : "=r"(msr), "=r"(scratch) : "r"(i), "r"(1)); + #endif + } + + EIEIO(); + TLBSYNC(); +} + +static int +atomic_pte_lock(volatile struct lpte *pte, uint64_t bitmask, uint64_t *oldhi) +{ + int ret; +#ifdef __powerpc64__ + uint64_t temp; +#else + uint32_t oldhihalf; +#endif + + /* + * Note: in principle, if just the locked bit were set here, we + * could avoid needing the eviction lock. However, eviction occurs + * so rarely that it isn't worth bothering about in practice. + */ +#ifdef __powerpc64__ + /* + * Note: Success of this sequence has the side effect of invalidating + * the PTE, as we are setting it to LPTE_LOCKED and discarding the + * other bits, including LPTE_V. + */ + __asm __volatile ( + "1:\tldarx %1, 0, %3\n\t" /* load old value */ + "and. %0,%1,%4\n\t" /* check if any bits set */ + "bne 2f\n\t" /* exit if any set */ + "stdcx. %5, 0, %3\n\t" /* attempt to store */ + "bne- 1b\n\t" /* spin if failed */ + "li %0, 1\n\t" /* success - retval = 1 */ + "b 3f\n\t" /* we've succeeded */ + "2:\n\t" + "stdcx. %1, 0, %3\n\t" /* clear reservation (74xx) */ + "li %0, 0\n\t" /* failure - retval = 0 */ + "3:\n\t" + : "=&r" (ret), "=&r"(temp), "=m" (pte->pte_hi) + : "r" ((volatile char *)&pte->pte_hi), + "r" (htobe64(bitmask)), "r" (htobe64(LPTE_LOCKED)), + "m" (pte->pte_hi) + : "cr0", "cr1", "cr2", "memory"); + *oldhi = be64toh(temp); +#else + /* + * This code is used on bridge mode only. + */ + __asm __volatile ( + "1:\tlwarx %1, 0, %3\n\t" /* load old value */ + "and. %0,%1,%4\n\t" /* check if any bits set */ + "bne 2f\n\t" /* exit if any set */ + "stwcx. %5, 0, %3\n\t" /* attempt to store */ + "bne- 1b\n\t" /* spin if failed */ + "li %0, 1\n\t" /* success - retval = 1 */ + "b 3f\n\t" /* we've succeeded */ + "2:\n\t" + "stwcx. 
%1, 0, %3\n\t" /* clear reservation (74xx) */ + "li %0, 0\n\t" /* failure - retval = 0 */ + "3:\n\t" + : "=&r" (ret), "=&r"(oldhihalf), "=m" (pte->pte_hi) + : "r" ((volatile char *)&pte->pte_hi + 4), + "r" ((uint32_t)bitmask), "r" ((uint32_t)LPTE_LOCKED), + "m" (pte->pte_hi) + : "cr0", "cr1", "cr2", "memory"); + + *oldhi = (pte->pte_hi & 0xffffffff00000000ULL) | oldhihalf; +#endif + + return (ret); +} + +static uintptr_t +moea64_insert_to_pteg_native(struct lpte *pvo_pt, uintptr_t slotbase, + uint64_t mask) +{ + volatile struct lpte *pt; + uint64_t oldptehi, va; + uintptr_t k; + int i, j; + + /* Start at a random slot */ + i = mftb() % 8; + for (j = 0; j < 8; j++) { + k = slotbase + (i + j) % 8; + pt = &moea64_pteg_table[k]; + /* Invalidate and seize lock only if no bits in mask set */ + if (atomic_pte_lock(pt, mask, &oldptehi)) /* Lock obtained */ + break; + } + + if (j == 8) + return (-1); + + if (oldptehi & LPTE_VALID) { + KASSERT(!(oldptehi & LPTE_WIRED), ("Unmapped wired entry")); + /* + * Need to invalidate old entry completely: see + * "Modifying a Page Table Entry". Need to reconstruct + * the virtual address for the outgoing entry to do that. + */ + va = oldptehi >> (ADDR_SR_SHFT - ADDR_API_SHFT64); + if (oldptehi & LPTE_HID) + va = (((k >> 3) ^ moea64_pteg_mask) ^ va) & + (ADDR_PIDX >> ADDR_PIDX_SHFT); + else + va = ((k >> 3) ^ va) & (ADDR_PIDX >> ADDR_PIDX_SHFT); + va |= (oldptehi & LPTE_AVPN_MASK) << + (ADDR_API_SHFT64 - ADDR_PIDX_SHFT); + PTESYNC(); + TLBIE(va, oldptehi); + STAT_MOEA64(moea64_pte_valid--); + STAT_MOEA64(moea64_pte_overflow++); + } + + /* + * Update the PTE as per "Adding a Page Table Entry". Lock is released + * by setting the high doubleworld. + */ + pt->pte_lo = htobe64(pvo_pt->pte_lo); + EIEIO(); + pt->pte_hi = htobe64(pvo_pt->pte_hi); + PTESYNC(); + + /* Keep statistics */ + STAT_MOEA64(moea64_pte_valid++); + + return (k); +} + +static __always_inline int64_t +moea64_pte_insert_locked(struct pvo_entry *pvo, struct lpte *insertpt, + uint64_t mask) +{ + uintptr_t slot; + + /* + * First try primary hash. + */ + slot = moea64_insert_to_pteg_native(insertpt, pvo->pvo_pte.slot, + mask | LPTE_WIRED | LPTE_LOCKED); + if (slot != -1) { + pvo->pvo_pte.slot = slot; + return (0); + } + + /* + * Now try secondary hash. + */ + pvo->pvo_vaddr ^= PVO_HID; + insertpt->pte_hi ^= LPTE_HID; + pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3); + slot = moea64_insert_to_pteg_native(insertpt, pvo->pvo_pte.slot, + mask | LPTE_WIRED | LPTE_LOCKED); + if (slot != -1) { + pvo->pvo_pte.slot = slot; + return (0); + } + + return (-1); +} + +static int64_t +moea64_pte_insert_native(struct pvo_entry *pvo) +{ + struct lpte insertpt; + int64_t ret; + + /* Initialize PTE */ + moea64_pte_from_pvo(pvo, &insertpt); + + /* Make sure further insertion is locked out during evictions */ + rw_rlock(&moea64_eviction_lock); + + pvo->pvo_pte.slot &= ~7ULL; /* Base slot address */ + ret = moea64_pte_insert_locked(pvo, &insertpt, LPTE_VALID); + if (ret == -1) { + /* + * Out of luck. Find a PTE to sacrifice. + */ + + /* Lock out all insertions for a bit */ + if (!rw_try_upgrade(&moea64_eviction_lock)) { + rw_runlock(&moea64_eviction_lock); + rw_wlock(&moea64_eviction_lock); + } + /* Don't evict large pages */ + ret = moea64_pte_insert_locked(pvo, &insertpt, LPTE_BIG); + rw_wunlock(&moea64_eviction_lock); + /* No freeable slots in either PTEG? We're hosed. 
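+	 * (Eviction skips entries that are wired, locked or large, so a
+	 * primary and secondary PTEG full of those leaves nothing to steal.)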
*/ + if (ret == -1) + panic("moea64_pte_insert: overflow"); + } else + rw_runlock(&moea64_eviction_lock); + + return (0); +} + +static void * +moea64_dump_pmap_native(void *ctx, void *buf, u_long *nbytes) +{ + struct dump_context *dctx; + u_long ptex, ptex_end; + + dctx = (struct dump_context *)ctx; + ptex = dctx->ptex; + ptex_end = ptex + dctx->blksz / sizeof(struct lpte); + ptex_end = MIN(ptex_end, dctx->ptex_end); + *nbytes = (ptex_end - ptex) * sizeof(struct lpte); + + if (*nbytes == 0) + return (NULL); + + dctx->ptex = ptex_end; + return (__DEVOLATILE(struct lpte *, moea64_pteg_table) + ptex); +} + +static __always_inline uint64_t +moea64_vpn_from_pte(uint64_t ptehi, uintptr_t slot) +{ + uint64_t pgn, pgnlo, vsid; + + vsid = (ptehi & LPTE_AVA_MASK) >> LPTE_VSID_SHIFT; + if ((ptehi & LPTE_HID) != 0) + slot ^= (moea64_pteg_mask << 3); + pgnlo = ((vsid & VSID_HASH_MASK) ^ (slot >> 3)) & EA_PAGELO_MASK; + pgn = ((ptehi & LPTE_AVA_PGNHI_MASK) << (EA_PAGELO_SHIFT - + LPTE_AVA_PGNHI_SHIFT)) | pgnlo; + return ((vsid << 16) | pgn); +} + +static __always_inline int64_t +moea64_pte_unset_sp_locked(struct pvo_entry *pvo) +{ + volatile struct lpte *pt; + uint64_t ptehi, refchg, vpn; + vm_offset_t eva; + + refchg = 0; + eva = PVO_VADDR(pvo) + HPT_SP_SIZE; + + for (; pvo != NULL && PVO_VADDR(pvo) < eva; + pvo = RB_NEXT(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo)) { + pt = moea64_pteg_table + pvo->pvo_pte.slot; + ptehi = be64toh(pt->pte_hi); + if ((ptehi & LPTE_AVPN_MASK) != + moea64_pte_vpn_from_pvo_vpn(pvo)) { + /* Evicted: invalidate new entry */ + STAT_MOEA64(moea64_pte_overflow--); + vpn = moea64_vpn_from_pte(ptehi, pvo->pvo_pte.slot); + CTR1(KTR_PMAP, "Evicted page in pte_unset_sp: vpn=%jx", + (uintmax_t)vpn); + /* Assume evicted page was modified */ + refchg |= LPTE_CHG; + } else + vpn = pvo->pvo_vpn; + + refchg |= moea64_pte_unset_locked(pt, vpn); + } + + return (refchg); +} + +static int64_t +moea64_pte_unset_sp_native(struct pvo_entry *pvo) +{ + uint64_t refchg; + + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); + KASSERT((PVO_VADDR(pvo) & HPT_SP_MASK) == 0, + ("%s: va %#jx unaligned", __func__, (uintmax_t)PVO_VADDR(pvo))); + + rw_rlock(&moea64_eviction_lock); + refchg = moea64_pte_unset_sp_locked(pvo); + rw_runlock(&moea64_eviction_lock); + + return (refchg); +} + +static __always_inline int64_t +moea64_pte_insert_sp_locked(struct pvo_entry *pvo) +{ + struct lpte insertpt; + int64_t ret; + vm_offset_t eva; + + eva = PVO_VADDR(pvo) + HPT_SP_SIZE; + + for (; pvo != NULL && PVO_VADDR(pvo) < eva; + pvo = RB_NEXT(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo)) { + moea64_pte_from_pvo(pvo, &insertpt); + pvo->pvo_pte.slot &= ~7ULL; /* Base slot address */ + + ret = moea64_pte_insert_locked(pvo, &insertpt, LPTE_VALID); + if (ret == -1) { + /* Lock out all insertions for a bit */ + if (!rw_try_upgrade(&moea64_eviction_lock)) { + rw_runlock(&moea64_eviction_lock); + rw_wlock(&moea64_eviction_lock); + } + /* Don't evict large pages */ + ret = moea64_pte_insert_locked(pvo, &insertpt, + LPTE_BIG); + rw_downgrade(&moea64_eviction_lock); + /* No freeable slots in either PTEG? We're hosed. 
*/ + if (ret == -1) + panic("moea64_pte_insert_sp: overflow"); + } + } + + return (0); +} + +static int64_t +moea64_pte_insert_sp_native(struct pvo_entry *pvo) +{ + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); + KASSERT((PVO_VADDR(pvo) & HPT_SP_MASK) == 0, + ("%s: va %#jx unaligned", __func__, (uintmax_t)PVO_VADDR(pvo))); + + rw_rlock(&moea64_eviction_lock); + moea64_pte_insert_sp_locked(pvo); + rw_runlock(&moea64_eviction_lock); + + return (0); +} + +static int64_t +moea64_pte_replace_sp_native(struct pvo_entry *pvo) +{ + uint64_t refchg; + + PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); + KASSERT((PVO_VADDR(pvo) & HPT_SP_MASK) == 0, + ("%s: va %#jx unaligned", __func__, (uintmax_t)PVO_VADDR(pvo))); + + rw_rlock(&moea64_eviction_lock); + refchg = moea64_pte_unset_sp_locked(pvo); + moea64_pte_insert_sp_locked(pvo); + rw_runlock(&moea64_eviction_lock); + + return (refchg); +} diff --git a/sys/powerpc/aim/mp_cpudep.c b/sys/powerpc/aim/mp_cpudep.c new file mode 100644 index 000000000000..98acfc1a5c37 --- /dev/null +++ b/sys/powerpc/aim/mp_cpudep.c @@ -0,0 +1,425 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2008 Marcel Moolenaar + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/pcpu.h> +#include <sys/proc.h> +#include <sys/sched.h> +#include <sys/smp.h> + +#include <machine/bus.h> +#include <machine/cpu.h> +#include <machine/hid.h> +#include <machine/intr_machdep.h> +#include <machine/pcb.h> +#include <machine/psl.h> +#include <machine/smp.h> +#include <machine/spr.h> +#include <machine/trap.h> + +#include <dev/ofw/openfirm.h> +#include <machine/ofw_machdep.h> + +void *ap_pcpu; + +static register_t bsp_state[8] __aligned(8); + +static void cpudep_save_config(void *dummy); +SYSINIT(cpu_save_config, SI_SUB_CPU, SI_ORDER_ANY, cpudep_save_config, NULL); + +void +cpudep_ap_early_bootstrap(void) +{ +#ifndef __powerpc64__ + register_t reg; +#endif + + switch (mfpvr() >> 16) { + case IBM970: + case IBM970FX: + case IBM970MP: + /* Set HIOR to 0 */ + __asm __volatile("mtspr 311,%0" :: "r"(0)); + powerpc_sync(); + + /* Restore HID4 and HID5, which are necessary for the MMU */ + +#ifdef __powerpc64__ + mtspr(SPR_HID4, bsp_state[2]); powerpc_sync(); isync(); + mtspr(SPR_HID5, bsp_state[3]); powerpc_sync(); isync(); +#else + __asm __volatile("ld %0, 16(%2); sync; isync; \ + mtspr %1, %0; sync; isync;" + : "=r"(reg) : "K"(SPR_HID4), "b"(bsp_state)); + __asm __volatile("ld %0, 24(%2); sync; isync; \ + mtspr %1, %0; sync; isync;" + : "=r"(reg) : "K"(SPR_HID5), "b"(bsp_state)); +#endif + powerpc_sync(); + break; + case IBMPOWER8: + case IBMPOWER8E: + case IBMPOWER8NVL: + case IBMPOWER9: +#ifdef __powerpc64__ + if (mfmsr() & PSL_HV) { + isync(); + /* + * Direct interrupts to SRR instead of HSRR and + * reset LPCR otherwise + */ + mtspr(SPR_LPID, 0); + isync(); + + mtspr(SPR_LPCR, lpcr); + isync(); + + /* + * Nuke FSCR, to be managed on a per-process basis + * later. + */ + mtspr(SPR_FSCR, 0); + } +#endif + break; + } + + __asm __volatile("mtsprg 0, %0" :: "r"(ap_pcpu)); + powerpc_sync(); +} + +uintptr_t +cpudep_ap_bootstrap(void) +{ + register_t msr, sp; + + msr = psl_kernset & ~PSL_EE; + mtmsr(msr); + + pcpup->pc_curthread = pcpup->pc_idlethread; +#ifdef __powerpc64__ + __asm __volatile("mr 13,%0" :: "r"(pcpup->pc_curthread)); +#else + __asm __volatile("mr 2,%0" :: "r"(pcpup->pc_curthread)); +#endif + pcpup->pc_curpcb = pcpup->pc_curthread->td_pcb; + sp = pcpup->pc_curpcb->pcb_sp; + schedinit_ap(); + + return (sp); +} + +static register_t +mpc74xx_l2_enable(register_t l2cr_config) +{ + register_t ccr, bit; + uint16_t vers; + + vers = mfpvr() >> 16; + switch (vers) { + case MPC7400: + case MPC7410: + bit = L2CR_L2IP; + break; + default: + bit = L2CR_L2I; + break; + } + + ccr = mfspr(SPR_L2CR); + if (ccr & L2CR_L2E) + return (ccr); + + /* Configure L2 cache. */ + ccr = l2cr_config & ~L2CR_L2E; + mtspr(SPR_L2CR, ccr | L2CR_L2I); + do { + ccr = mfspr(SPR_L2CR); + } while (ccr & bit); + powerpc_sync(); + mtspr(SPR_L2CR, l2cr_config); + powerpc_sync(); + + return (l2cr_config); +} + +static register_t +mpc745x_l3_enable(register_t l3cr_config) +{ + register_t ccr; + + ccr = mfspr(SPR_L3CR); + if (ccr & L3CR_L3E) + return (ccr); + + /* Configure L3 cache. */ + ccr = l3cr_config & ~(L3CR_L3E | L3CR_L3I | L3CR_L3PE | L3CR_L3CLKEN); + mtspr(SPR_L3CR, ccr); + ccr |= 0x4000000; /* Magic, but documented. 
*/ + mtspr(SPR_L3CR, ccr); + ccr |= L3CR_L3CLKEN; + mtspr(SPR_L3CR, ccr); + mtspr(SPR_L3CR, ccr | L3CR_L3I); + while (mfspr(SPR_L3CR) & L3CR_L3I) + ; + mtspr(SPR_L3CR, ccr & ~L3CR_L3CLKEN); + powerpc_sync(); + DELAY(100); + mtspr(SPR_L3CR, ccr); + powerpc_sync(); + DELAY(100); + ccr |= L3CR_L3E; + mtspr(SPR_L3CR, ccr); + powerpc_sync(); + + return(ccr); +} + +static register_t +mpc74xx_l1d_enable(void) +{ + register_t hid; + + hid = mfspr(SPR_HID0); + if (hid & HID0_DCE) + return (hid); + + /* Enable L1 D-cache */ + hid |= HID0_DCE; + powerpc_sync(); + mtspr(SPR_HID0, hid | HID0_DCFI); + powerpc_sync(); + + return (hid); +} + +static register_t +mpc74xx_l1i_enable(void) +{ + register_t hid; + + hid = mfspr(SPR_HID0); + if (hid & HID0_ICE) + return (hid); + + /* Enable L1 I-cache */ + hid |= HID0_ICE; + isync(); + mtspr(SPR_HID0, hid | HID0_ICFI); + isync(); + + return (hid); +} + +static void +cpudep_save_config(void *dummy) +{ + uint16_t vers; + + vers = mfpvr() >> 16; + + switch(vers) { + case IBM970: + case IBM970FX: + case IBM970MP: + #ifdef __powerpc64__ + bsp_state[0] = mfspr(SPR_HID0); + bsp_state[1] = mfspr(SPR_HID1); + bsp_state[2] = mfspr(SPR_HID4); + bsp_state[3] = mfspr(SPR_HID5); + #else + __asm __volatile ("mfspr %0,%2; mr %1,%0; srdi %0,%0,32" + : "=r" (bsp_state[0]),"=r" (bsp_state[1]) : "K" (SPR_HID0)); + __asm __volatile ("mfspr %0,%2; mr %1,%0; srdi %0,%0,32" + : "=r" (bsp_state[2]),"=r" (bsp_state[3]) : "K" (SPR_HID1)); + __asm __volatile ("mfspr %0,%2; mr %1,%0; srdi %0,%0,32" + : "=r" (bsp_state[4]),"=r" (bsp_state[5]) : "K" (SPR_HID4)); + __asm __volatile ("mfspr %0,%2; mr %1,%0; srdi %0,%0,32" + : "=r" (bsp_state[6]),"=r" (bsp_state[7]) : "K" (SPR_HID5)); + #endif + + powerpc_sync(); + + break; + case IBMCELLBE: + #ifdef NOTYET /* Causes problems if in instruction stream on 970 */ + if (mfmsr() & PSL_HV) { + bsp_state[0] = mfspr(SPR_HID0); + bsp_state[1] = mfspr(SPR_HID1); + bsp_state[2] = mfspr(SPR_HID4); + bsp_state[3] = mfspr(SPR_HID6); + + bsp_state[4] = mfspr(SPR_CELL_TSCR); + } + #endif + + bsp_state[5] = mfspr(SPR_CELL_TSRL); + + break; + case MPC7450: + case MPC7455: + case MPC7457: + /* Only MPC745x CPUs have an L3 cache. */ + bsp_state[3] = mfspr(SPR_L3CR); + + /* Fallthrough */ + case MPC7400: + case MPC7410: + case MPC7447A: + case MPC7448: + bsp_state[2] = mfspr(SPR_L2CR); + bsp_state[1] = mfspr(SPR_HID1); + bsp_state[0] = mfspr(SPR_HID0); + break; + } +} + +void +cpudep_ap_setup(void) +{ +#ifndef __powerpc64__ + register_t reg; +#endif + uint16_t vers; + + vers = mfpvr() >> 16; + + switch(vers) { + case IBM970: + case IBM970FX: + case IBM970MP: + /* + * The 970 has strange rules about how to update HID registers. 
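+	 * (each mtspr to HID0 below is followed by a string of dependent
+	 * mfspr reads of the same register before the sync/isync pair).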
+ * See Table 2-3, 970MP manual + * + * Note: HID4 and HID5 restored already in + * cpudep_ap_early_bootstrap() + */ + + __asm __volatile("mtasr %0; sync" :: "r"(0)); + #ifdef __powerpc64__ + __asm __volatile(" \ + sync; isync; \ + mtspr %1, %0; \ + mfspr %0, %1; mfspr %0, %1; mfspr %0, %1; \ + mfspr %0, %1; mfspr %0, %1; mfspr %0, %1; \ + sync; isync" + :: "r"(bsp_state[0]), "K"(SPR_HID0)); + __asm __volatile("sync; isync; \ + mtspr %1, %0; mtspr %1, %0; sync; isync" + :: "r"(bsp_state[1]), "K"(SPR_HID1)); + #else + __asm __volatile(" \ + ld %0,0(%2); \ + sync; isync; \ + mtspr %1, %0; \ + mfspr %0, %1; mfspr %0, %1; mfspr %0, %1; \ + mfspr %0, %1; mfspr %0, %1; mfspr %0, %1; \ + sync; isync" + : "=r"(reg) : "K"(SPR_HID0), "b"(bsp_state)); + __asm __volatile("ld %0, 8(%2); sync; isync; \ + mtspr %1, %0; mtspr %1, %0; sync; isync" + : "=r"(reg) : "K"(SPR_HID1), "b"(bsp_state)); + #endif + + powerpc_sync(); + break; + case IBMCELLBE: + #ifdef NOTYET /* Causes problems if in instruction stream on 970 */ + if (mfmsr() & PSL_HV) { + mtspr(SPR_HID0, bsp_state[0]); + mtspr(SPR_HID1, bsp_state[1]); + mtspr(SPR_HID4, bsp_state[2]); + mtspr(SPR_HID6, bsp_state[3]); + + mtspr(SPR_CELL_TSCR, bsp_state[4]); + } + #endif + + mtspr(SPR_CELL_TSRL, bsp_state[5]); + + break; + case MPC7400: + case MPC7410: + case MPC7447A: + case MPC7448: + case MPC7450: + case MPC7455: + case MPC7457: + /* XXX: Program the CPU ID into PIR */ + __asm __volatile("mtspr 1023,%0" :: "r"(PCPU_GET(cpuid))); + + powerpc_sync(); + isync(); + + mtspr(SPR_HID0, bsp_state[0]); isync(); + mtspr(SPR_HID1, bsp_state[1]); isync(); + + /* Now enable the L3 cache. */ + switch (vers) { + case MPC7450: + case MPC7455: + case MPC7457: + /* Only MPC745x CPUs have an L3 cache. */ + mpc745x_l3_enable(bsp_state[3]); + default: + break; + } + + mpc74xx_l2_enable(bsp_state[2]); + mpc74xx_l1d_enable(); + mpc74xx_l1i_enable(); + + break; + case IBMPOWER7: + case IBMPOWER7PLUS: + case IBMPOWER8: + case IBMPOWER8E: + case IBMPOWER8NVL: + case IBMPOWER9: +#ifdef __powerpc64__ + if (mfmsr() & PSL_HV) { + mtspr(SPR_LPCR, mfspr(SPR_LPCR) | lpcr | + LPCR_PECE_WAKESET); + isync(); + } +#endif + break; + default: +#ifdef __powerpc64__ + if (!(mfmsr() & PSL_HV)) /* Rely on HV to have set things up */ + break; +#endif + printf("WARNING: Unknown CPU type. Cache performace may be " + "suboptimal.\n"); + break; + } +} diff --git a/sys/powerpc/aim/slb.c b/sys/powerpc/aim/slb.c new file mode 100644 index 000000000000..bd008234c40f --- /dev/null +++ b/sys/powerpc/aim/slb.c @@ -0,0 +1,624 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2010 Nathan Whitehorn + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/systm.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/uma.h> +#include <vm/vm.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> + +#include <machine/md_var.h> +#include <machine/platform.h> +#include <machine/vmparam.h> +#include <machine/trap.h> + +#include "mmu_oea64.h" + +uintptr_t moea64_get_unique_vsid(void); +void moea64_release_vsid(uint64_t vsid); +static void slb_zone_init(void *); + +static uma_zone_t slbt_zone; +static uma_zone_t slb_cache_zone; +int n_slbs = 64; + +SYSINIT(slb_zone_init, SI_SUB_KMEM, SI_ORDER_ANY, slb_zone_init, NULL); + +struct slbtnode { + uint16_t ua_alloc; + uint8_t ua_level; + /* Only 36 bits needed for full 64-bit address space. */ + uint64_t ua_base; + union { + struct slbtnode *ua_child[16]; + struct slb slb_entries[16]; + } u; +}; + +/* + * For a full 64-bit address space, there are 36 bits in play in an + * esid, so 8 levels, with the leaf being at level 0. + * + * |3333|3322|2222|2222|1111|1111|11 | | | esid + * |5432|1098|7654|3210|9876|5432|1098|7654|3210| bits + * +----+----+----+----+----+----+----+----+----+-------- + * | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | level + */ +#define UAD_ROOT_LEVEL 8 +#define UAD_LEAF_LEVEL 0 + +static inline int +esid2idx(uint64_t esid, int level) +{ + int shift; + + shift = level * 4; + return ((esid >> shift) & 0xF); +} + +/* + * The ua_base field should have 0 bits after the first 4*(level+1) + * bits; i.e. only + */ +#define uad_baseok(ua) \ + (esid2base(ua->ua_base, ua->ua_level) == ua->ua_base) + +static inline uint64_t +esid2base(uint64_t esid, int level) +{ + uint64_t mask; + int shift; + + shift = (level + 1) * 4; + mask = ~((1ULL << shift) - 1); + return (esid & mask); +} + +/* + * Allocate a new leaf node for the specified esid/vmhandle from the + * parent node. + */ +static struct slb * +make_new_leaf(uint64_t esid, uint64_t slbv, struct slbtnode *parent) +{ + struct slbtnode *child; + struct slb *retval; + int idx; + + idx = esid2idx(esid, parent->ua_level); + KASSERT(parent->u.ua_child[idx] == NULL, ("Child already exists!")); + + /* unlock and M_WAITOK and loop? */ + child = uma_zalloc(slbt_zone, M_NOWAIT | M_ZERO); + KASSERT(child != NULL, ("unhandled NULL case")); + + child->ua_level = UAD_LEAF_LEVEL; + child->ua_base = esid2base(esid, child->ua_level); + idx = esid2idx(esid, child->ua_level); + child->u.slb_entries[idx].slbv = slbv; + child->u.slb_entries[idx].slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID; + setbit(&child->ua_alloc, idx); + + retval = &child->u.slb_entries[idx]; + + /* + * The above stores must be visible before the next one, so + * that a lockless searcher always sees a valid path through + * the tree. 
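+	 * The powerpc_lwsync() below provides that ordering: the fully
+	 * initialized child becomes visible before the parent pointer store.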
+ */ + powerpc_lwsync(); + + idx = esid2idx(esid, parent->ua_level); + parent->u.ua_child[idx] = child; + setbit(&parent->ua_alloc, idx); + + return (retval); +} + +/* + * Allocate a new intermediate node to fit between the parent and + * esid. + */ +static struct slbtnode* +make_intermediate(uint64_t esid, struct slbtnode *parent) +{ + struct slbtnode *child, *inter; + int idx, level; + + idx = esid2idx(esid, parent->ua_level); + child = parent->u.ua_child[idx]; + KASSERT(esid2base(esid, child->ua_level) != child->ua_base, + ("No need for an intermediate node?")); + + /* + * Find the level where the existing child and our new esid + * meet. It must be lower than parent->ua_level or we would + * have chosen a different index in parent. + */ + level = child->ua_level + 1; + while (esid2base(esid, level) != + esid2base(child->ua_base, level)) + level++; + KASSERT(level < parent->ua_level, + ("Found splitting level %d for %09jx and %09jx, " + "but it's the same as %p's", + level, esid, child->ua_base, parent)); + + /* unlock and M_WAITOK and loop? */ + inter = uma_zalloc(slbt_zone, M_NOWAIT | M_ZERO); + KASSERT(inter != NULL, ("unhandled NULL case")); + + /* Set up intermediate node to point to child ... */ + inter->ua_level = level; + inter->ua_base = esid2base(esid, inter->ua_level); + idx = esid2idx(child->ua_base, inter->ua_level); + inter->u.ua_child[idx] = child; + setbit(&inter->ua_alloc, idx); + powerpc_lwsync(); + + /* Set up parent to point to intermediate node ... */ + idx = esid2idx(inter->ua_base, parent->ua_level); + parent->u.ua_child[idx] = inter; + setbit(&parent->ua_alloc, idx); + + return (inter); +} + +uint64_t +kernel_va_to_slbv(vm_offset_t va) +{ + uint64_t slbv; + + /* Set kernel VSID to deterministic value */ + slbv = (KERNEL_VSID((uintptr_t)va >> ADDR_SR_SHFT)) << SLBV_VSID_SHIFT; + + /* + * Figure out if this is a large-page mapping. + */ + if (hw_direct_map && va > DMAP_BASE_ADDRESS && va < DMAP_MAX_ADDRESS) { + /* + * XXX: If we have set up a direct map, assumes + * all physical memory is mapped with large pages. + */ + + if (mem_valid(DMAP_TO_PHYS(va), 0) == 0) + slbv |= SLBV_L; + } else if (moea64_large_page_size != 0 && + va >= (vm_offset_t)vm_page_array && + va <= (uintptr_t)(&vm_page_array[vm_page_array_size])) + slbv |= SLBV_L; + + return (slbv); +} + +struct slb * +user_va_to_slb_entry(pmap_t pm, vm_offset_t va) +{ + uint64_t esid = va >> ADDR_SR_SHFT; + struct slbtnode *ua; + int idx; + + ua = pm->pm_slb_tree_root; + + for (;;) { + KASSERT(uad_baseok(ua), ("uad base %016jx level %d bad!", + ua->ua_base, ua->ua_level)); + idx = esid2idx(esid, ua->ua_level); + + /* + * This code is specific to ppc64 where a load is + * atomic, so no need for atomic_load macro. + */ + if (ua->ua_level == UAD_LEAF_LEVEL) + return ((ua->u.slb_entries[idx].slbe & SLBE_VALID) ? + &ua->u.slb_entries[idx] : NULL); + + /* + * The following accesses are implicitly ordered under the POWER + * ISA by load dependencies (the store ordering is provided by + * the powerpc_lwsync() calls elsewhere) and so are run without + * barriers. + */ + ua = ua->u.ua_child[idx]; + if (ua == NULL || + esid2base(esid, ua->ua_level) != ua->ua_base) + return (NULL); + } + + return (NULL); +} + +uint64_t +va_to_vsid(pmap_t pm, vm_offset_t va) +{ + struct slb *entry; + + /* Shortcut kernel case */ + if (pm == kernel_pmap) + return (KERNEL_VSID((uintptr_t)va >> ADDR_SR_SHFT)); + + /* + * If there is no vsid for this VA, we need to add a new entry + * to the PMAP's segment table. 
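+	 * allocate_user_vsid() below creates that entry and also pre-spills
+	 * it into the process's SLB cache.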
+ */ + + entry = user_va_to_slb_entry(pm, va); + + if (entry == NULL) + return (allocate_user_vsid(pm, + (uintptr_t)va >> ADDR_SR_SHFT, 0)); + + return ((entry->slbv & SLBV_VSID_MASK) >> SLBV_VSID_SHIFT); +} + +uint64_t +allocate_user_vsid(pmap_t pm, uint64_t esid, int large) +{ + uint64_t vsid, slbv; + struct slbtnode *ua, *next, *inter; + struct slb *slb; + int idx; + + KASSERT(pm != kernel_pmap, ("Attempting to allocate a kernel VSID")); + + PMAP_LOCK_ASSERT(pm, MA_OWNED); + vsid = moea64_get_unique_vsid(); + + slbv = vsid << SLBV_VSID_SHIFT; + if (large) + slbv |= SLBV_L; + + ua = pm->pm_slb_tree_root; + + /* Descend to the correct leaf or NULL pointer. */ + for (;;) { + KASSERT(uad_baseok(ua), + ("uad base %09jx level %d bad!", ua->ua_base, ua->ua_level)); + idx = esid2idx(esid, ua->ua_level); + + if (ua->ua_level == UAD_LEAF_LEVEL) { + ua->u.slb_entries[idx].slbv = slbv; + eieio(); + ua->u.slb_entries[idx].slbe = (esid << SLBE_ESID_SHIFT) + | SLBE_VALID; + setbit(&ua->ua_alloc, idx); + slb = &ua->u.slb_entries[idx]; + break; + } + + next = ua->u.ua_child[idx]; + if (next == NULL) { + slb = make_new_leaf(esid, slbv, ua); + break; + } + + /* + * Check if the next item down has an okay ua_base. + * If not, we need to allocate an intermediate node. + */ + if (esid2base(esid, next->ua_level) != next->ua_base) { + inter = make_intermediate(esid, ua); + slb = make_new_leaf(esid, slbv, inter); + break; + } + + ua = next; + } + + /* + * Someone probably wants this soon, and it may be a wired + * SLB mapping, so pre-spill this entry. + */ + eieio(); + slb_insert_user(pm, slb); + + return (vsid); +} + +void +free_vsid(pmap_t pm, uint64_t esid, int large) +{ + struct slbtnode *ua; + int idx; + + PMAP_LOCK_ASSERT(pm, MA_OWNED); + + ua = pm->pm_slb_tree_root; + /* Descend to the correct leaf. */ + for (;;) { + KASSERT(uad_baseok(ua), + ("uad base %09jx level %d bad!", ua->ua_base, ua->ua_level)); + + idx = esid2idx(esid, ua->ua_level); + if (ua->ua_level == UAD_LEAF_LEVEL) { + ua->u.slb_entries[idx].slbv = 0; + eieio(); + ua->u.slb_entries[idx].slbe = 0; + clrbit(&ua->ua_alloc, idx); + return; + } + + ua = ua->u.ua_child[idx]; + if (ua == NULL || + esid2base(esid, ua->ua_level) != ua->ua_base) { + /* Perhaps just return instead of assert? 
*/ + KASSERT(0, + ("Asked to remove an entry that was never inserted!")); + return; + } + } +} + +static void +free_slb_tree_node(struct slbtnode *ua) +{ + int idx; + + for (idx = 0; idx < 16; idx++) { + if (ua->ua_level != UAD_LEAF_LEVEL) { + if (ua->u.ua_child[idx] != NULL) + free_slb_tree_node(ua->u.ua_child[idx]); + } else { + if (ua->u.slb_entries[idx].slbv != 0) + moea64_release_vsid(ua->u.slb_entries[idx].slbv + >> SLBV_VSID_SHIFT); + } + } + + uma_zfree(slbt_zone, ua); +} + +void +slb_free_tree(pmap_t pm) +{ + + free_slb_tree_node(pm->pm_slb_tree_root); +} + +struct slbtnode * +slb_alloc_tree(void) +{ + struct slbtnode *root; + + root = uma_zalloc(slbt_zone, M_NOWAIT | M_ZERO); + KASSERT(root != NULL, ("unhandled NULL case")); + root->ua_level = UAD_ROOT_LEVEL; + + return (root); +} + +/* Lock entries mapping kernel text and stacks */ + +void +slb_insert_kernel(uint64_t slbe, uint64_t slbv) +{ + struct slb *slbcache; + int i; + + /* We don't want to be preempted while modifying the kernel map */ + critical_enter(); + + slbcache = PCPU_GET(aim.slb); + + /* Check for an unused slot, abusing the user slot as a full flag */ + if (slbcache[USER_SLB_SLOT].slbe == 0) { + for (i = 0; i < n_slbs; i++) { + if (i == USER_SLB_SLOT) + continue; + if (!(slbcache[i].slbe & SLBE_VALID)) + goto fillkernslb; + } + + if (i == n_slbs) + slbcache[USER_SLB_SLOT].slbe = 1; + } + + i = mftb() % n_slbs; + if (i == USER_SLB_SLOT) + i = (i+1) % n_slbs; + +fillkernslb: + KASSERT(i != USER_SLB_SLOT, + ("Filling user SLB slot with a kernel mapping")); + slbcache[i].slbv = slbv; + slbcache[i].slbe = slbe | (uint64_t)i; + + /* If it is for this CPU, put it in the SLB right away */ + if (pmap_bootstrapped) { + /* slbie not required */ + __asm __volatile ("slbmte %0, %1" :: + "r"(slbcache[i].slbv), "r"(slbcache[i].slbe)); + } + + critical_exit(); +} + +void +slb_insert_user(pmap_t pm, struct slb *slb) +{ + int i; + + PMAP_LOCK_ASSERT(pm, MA_OWNED); + + if (pm->pm_slb_len < n_slbs) { + i = pm->pm_slb_len; + pm->pm_slb_len++; + } else { + i = mftb() % n_slbs; + } + + /* Note that this replacement is atomic with respect to trap_subr */ + pm->pm_slb[i] = slb; +} + +static void * +slb_uma_real_alloc(uma_zone_t zone, vm_size_t bytes, int domain, + u_int8_t *flags, int wait) +{ + static vm_offset_t realmax = 0; + void *va; + vm_page_t m; + + if (realmax == 0) + realmax = platform_real_maxaddr(); + + *flags = UMA_SLAB_PRIV; + m = vm_page_alloc_noobj_contig_domain(domain, malloc2vm_flags(wait) | + VM_ALLOC_WIRED, 1, 0, realmax, PAGE_SIZE, PAGE_SIZE, + VM_MEMATTR_DEFAULT); + if (m == NULL) + return (NULL); + + if (hw_direct_map) + va = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); + else { + va = (void *)(VM_PAGE_TO_PHYS(m) | DMAP_BASE_ADDRESS); + pmap_kenter((vm_offset_t)va, VM_PAGE_TO_PHYS(m)); + } + + return (va); +} + +static void +slb_zone_init(void *dummy) +{ + slbt_zone = uma_zcreate("SLB tree node", sizeof(struct slbtnode), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, + UMA_ZONE_CONTIG | UMA_ZONE_VM); + slb_cache_zone = uma_zcreate("SLB cache", + (n_slbs + 1)*sizeof(struct slb *), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, UMA_ZONE_CONTIG | UMA_ZONE_VM); + + if (platform_real_maxaddr() != VM_MAX_ADDRESS) { + uma_zone_set_allocf(slb_cache_zone, slb_uma_real_alloc); + uma_zone_set_allocf(slbt_zone, slb_uma_real_alloc); + } +} + +struct slb ** +slb_alloc_user_cache(void) +{ + return (uma_zalloc(slb_cache_zone, M_WAITOK | M_ZERO)); +} + +void +slb_free_user_cache(struct slb **slb) +{ + uma_zfree(slb_cache_zone, slb); +} + +/* 
Handle kernel SLB faults -- runs in real mode, all seat belts off */ +void +handle_kernel_slb_spill(int type, register_t dar, register_t srr0) +{ + struct slb *slbcache; + uint64_t slbe, slbv; + uint64_t esid, addr; + int i; + + addr = (type == EXC_ISE) ? srr0 : dar; + slbcache = PCPU_GET(aim.slb); + esid = (uintptr_t)addr >> ADDR_SR_SHFT; + slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID; + + /* See if the hardware flushed this somehow (can happen in LPARs) */ + for (i = 0; i < n_slbs; i++) + if (slbcache[i].slbe == (slbe | (uint64_t)i)) + return; + + /* Not in the map, needs to actually be added */ + slbv = kernel_va_to_slbv(addr); + if (slbcache[USER_SLB_SLOT].slbe == 0) { + for (i = 0; i < n_slbs; i++) { + if (i == USER_SLB_SLOT) + continue; + if (!(slbcache[i].slbe & SLBE_VALID)) + goto fillkernslb; + } + + if (i == n_slbs) + slbcache[USER_SLB_SLOT].slbe = 1; + } + + /* Sacrifice a random SLB entry that is not the user entry */ + i = mftb() % n_slbs; + if (i == USER_SLB_SLOT) + i = (i+1) % n_slbs; + +fillkernslb: + /* Write new entry */ + slbcache[i].slbv = slbv; + slbcache[i].slbe = slbe | (uint64_t)i; + + /* Trap handler will restore from cache on exit */ +} + +int +handle_user_slb_spill(pmap_t pm, vm_offset_t addr) +{ + struct slb *user_entry; + uint64_t esid; + int i; + + if (pm->pm_slb == NULL) + return (-1); + + esid = (uintptr_t)addr >> ADDR_SR_SHFT; + + PMAP_LOCK(pm); + user_entry = user_va_to_slb_entry(pm, addr); + + if (user_entry == NULL) { + /* allocate_vsid auto-spills it */ + (void)allocate_user_vsid(pm, esid, 0); + } else { + /* + * Check that another CPU has not already mapped this. + * XXX: Per-thread SLB caches would be better. + */ + for (i = 0; i < pm->pm_slb_len; i++) + if (pm->pm_slb[i] == user_entry) + break; + + if (i == pm->pm_slb_len) + slb_insert_user(pm, user_entry); + } + PMAP_UNLOCK(pm); + + return (0); +} diff --git a/sys/powerpc/aim/trap_subr32.S b/sys/powerpc/aim/trap_subr32.S new file mode 100644 index 000000000000..95e1d53360e2 --- /dev/null +++ b/sys/powerpc/aim/trap_subr32.S @@ -0,0 +1,929 @@ +/* $NetBSD: trap_subr.S,v 1.20 2002/04/22 23:20:08 kleink Exp $ */ + +/*- + * Copyright (C) 1995, 1996 Wolfgang Solfrank. + * Copyright (C) 1995, 1996 TooLs GmbH. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by TooLs GmbH. + * 4. The name of TooLs GmbH may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * NOTICE: This is not a standalone file. to use it, #include it in + * your port's locore.S, like so: + * + * #include <powerpc/aim/trap_subr.S> + */ + +/* + * Save/restore segment registers + */ +#define RESTORE_SRS(pmap,sr) mtsr 0,sr; \ + lwz sr,1*4(pmap); mtsr 1,sr; \ + lwz sr,2*4(pmap); mtsr 2,sr; \ + lwz sr,3*4(pmap); mtsr 3,sr; \ + lwz sr,4*4(pmap); mtsr 4,sr; \ + lwz sr,5*4(pmap); mtsr 5,sr; \ + lwz sr,6*4(pmap); mtsr 6,sr; \ + lwz sr,7*4(pmap); mtsr 7,sr; \ + lwz sr,8*4(pmap); mtsr 8,sr; \ + lwz sr,9*4(pmap); mtsr 9,sr; \ + lwz sr,10*4(pmap); mtsr 10,sr; \ + lwz sr,11*4(pmap); mtsr 11,sr; \ + /* Skip segment 12 (USER_SR), which is restored differently */ \ + lwz sr,13*4(pmap); mtsr 13,sr; \ + lwz sr,14*4(pmap); mtsr 14,sr; \ + lwz sr,15*4(pmap); mtsr 15,sr; isync; + +/* + * User SRs are loaded through a pointer to the current pmap. + */ +#define RESTORE_USER_SRS(pmap,sr) \ + GET_CPUINFO(pmap); \ + lwz pmap,PC_CURPMAP(pmap); \ + lwzu sr,PM_SR(pmap); \ + RESTORE_SRS(pmap,sr) \ + /* Restore SR 12 */ \ + lwz sr,12*4(pmap); mtsr 12,sr; isync + +/* + * Kernel SRs are loaded directly from kernel_pmap_ + */ +#define RESTORE_KERN_SRS(pmap,sr) \ + lwz pmap,TRAP_TOCBASE(0); \ + lwz pmap,CNAME(kernel_pmap_store)@got(pmap); \ + lwzu sr,PM_SR(pmap); \ + RESTORE_SRS(pmap,sr) + +/* + * FRAME_SETUP assumes: + * SPRG1 SP (1) + * SPRG3 trap type + * savearea r28-r31,DAR,DSISR (DAR & DSISR only for DSI traps) + * r28 LR + * r29 CR + * r30 scratch + * r31 scratch + * r1 kernel stack + * SRR0/1 as at start of trap + */ +#define FRAME_SETUP(savearea) \ +/* Have to enable translation to allow access of kernel stack: */ \ + GET_CPUINFO(%r31); \ + mfsrr0 %r30; \ + stw %r30,(savearea+CPUSAVE_SRR0)(%r31); /* save SRR0 */ \ + mfsrr1 %r30; \ + stw %r30,(savearea+CPUSAVE_SRR1)(%r31); /* save SRR1 */ \ + mfmsr %r30; \ + ori %r30,%r30,(PSL_DR|PSL_IR|PSL_RI)@l; /* relocation on */ \ + mtmsr %r30; /* stack can now be accessed */ \ + isync; \ + mfsprg1 %r31; /* get saved SP */ \ + stwu %r31,-FRAMELEN(%r1); /* save it in the callframe */ \ + stw %r0, FRAME_0+8(%r1); /* save r0 in the trapframe */ \ + stw %r31,FRAME_1+8(%r1); /* save SP " " */ \ + stw %r2, FRAME_2+8(%r1); /* save r2 " " */ \ + stw %r28,FRAME_LR+8(%r1); /* save LR " " */ \ + stw %r29,FRAME_CR+8(%r1); /* save CR " " */ \ + GET_CPUINFO(%r2); \ + lwz %r28,(savearea+CPUSAVE_R28)(%r2); /* get saved r28 */ \ + lwz %r29,(savearea+CPUSAVE_R29)(%r2); /* get saved r29 */ \ + lwz %r30,(savearea+CPUSAVE_R30)(%r2); /* get saved r30 */ \ + lwz %r31,(savearea+CPUSAVE_R31)(%r2); /* get saved r31 */ \ + stw %r3, FRAME_3+8(%r1); /* save r3-r31 */ \ + stw %r4, FRAME_4+8(%r1); \ + stw %r5, FRAME_5+8(%r1); \ + stw %r6, FRAME_6+8(%r1); \ + stw %r7, FRAME_7+8(%r1); \ + stw %r8, FRAME_8+8(%r1); \ + stw %r9, FRAME_9+8(%r1); \ + stw %r10, FRAME_10+8(%r1); \ + stw %r11, FRAME_11+8(%r1); \ + stw %r12, FRAME_12+8(%r1); \ + stw %r13, FRAME_13+8(%r1); \ + stw %r14, FRAME_14+8(%r1); \ + stw %r15, FRAME_15+8(%r1); \ + stw %r16, FRAME_16+8(%r1); \ + stw %r17, 
FRAME_17+8(%r1); \ + stw %r18, FRAME_18+8(%r1); \ + stw %r19, FRAME_19+8(%r1); \ + stw %r20, FRAME_20+8(%r1); \ + stw %r21, FRAME_21+8(%r1); \ + stw %r22, FRAME_22+8(%r1); \ + stw %r23, FRAME_23+8(%r1); \ + stw %r24, FRAME_24+8(%r1); \ + stw %r25, FRAME_25+8(%r1); \ + stw %r26, FRAME_26+8(%r1); \ + stw %r27, FRAME_27+8(%r1); \ + stw %r28, FRAME_28+8(%r1); \ + stw %r29, FRAME_29+8(%r1); \ + stw %r30, FRAME_30+8(%r1); \ + stw %r31, FRAME_31+8(%r1); \ + lwz %r28,(savearea+CPUSAVE_AIM_DAR)(%r2); /* saved DAR */ \ + lwz %r29,(savearea+CPUSAVE_AIM_DSISR)(%r2);/* saved DSISR */\ + lwz %r30,(savearea+CPUSAVE_SRR0)(%r2); /* saved SRR0 */ \ + lwz %r31,(savearea+CPUSAVE_SRR1)(%r2); /* saved SRR1 */ \ + mfxer %r3; \ + mfctr %r4; \ + mfsprg3 %r5; \ + stw %r3, FRAME_XER+8(1); /* save xer/ctr/exc */ \ + stw %r4, FRAME_CTR+8(1); \ + stw %r5, FRAME_EXC+8(1); \ + stw %r28,FRAME_AIM_DAR+8(1); \ + stw %r29,FRAME_AIM_DSISR+8(1); /* save dsisr/srr0/srr1 */ \ + stw %r30,FRAME_SRR0+8(1); \ + stw %r31,FRAME_SRR1+8(1); \ + lwz %r2,PC_CURTHREAD(%r2) /* set curthread pointer */ + +#define FRAME_LEAVE(savearea) \ +/* Disable exceptions: */ \ + mfmsr %r2; \ + andi. %r2,%r2,~PSL_EE@l; \ + mtmsr %r2; \ + isync; \ +/* Now restore regs: */ \ + lwz %r2,FRAME_SRR0+8(%r1); \ + lwz %r3,FRAME_SRR1+8(%r1); \ + lwz %r4,FRAME_CTR+8(%r1); \ + lwz %r5,FRAME_XER+8(%r1); \ + lwz %r6,FRAME_LR+8(%r1); \ + GET_CPUINFO(%r7); \ + stw %r2,(savearea+CPUSAVE_SRR0)(%r7); /* save SRR0 */ \ + stw %r3,(savearea+CPUSAVE_SRR1)(%r7); /* save SRR1 */ \ + lwz %r7,FRAME_CR+8(%r1); \ + mtctr %r4; \ + mtxer %r5; \ + mtlr %r6; \ + mtsprg1 %r7; /* save cr */ \ + lwz %r31,FRAME_31+8(%r1); /* restore r0-31 */ \ + lwz %r30,FRAME_30+8(%r1); \ + lwz %r29,FRAME_29+8(%r1); \ + lwz %r28,FRAME_28+8(%r1); \ + lwz %r27,FRAME_27+8(%r1); \ + lwz %r26,FRAME_26+8(%r1); \ + lwz %r25,FRAME_25+8(%r1); \ + lwz %r24,FRAME_24+8(%r1); \ + lwz %r23,FRAME_23+8(%r1); \ + lwz %r22,FRAME_22+8(%r1); \ + lwz %r21,FRAME_21+8(%r1); \ + lwz %r20,FRAME_20+8(%r1); \ + lwz %r19,FRAME_19+8(%r1); \ + lwz %r18,FRAME_18+8(%r1); \ + lwz %r17,FRAME_17+8(%r1); \ + lwz %r16,FRAME_16+8(%r1); \ + lwz %r15,FRAME_15+8(%r1); \ + lwz %r14,FRAME_14+8(%r1); \ + lwz %r13,FRAME_13+8(%r1); \ + lwz %r12,FRAME_12+8(%r1); \ + lwz %r11,FRAME_11+8(%r1); \ + lwz %r10,FRAME_10+8(%r1); \ + lwz %r9, FRAME_9+8(%r1); \ + lwz %r8, FRAME_8+8(%r1); \ + lwz %r7, FRAME_7+8(%r1); \ + lwz %r6, FRAME_6+8(%r1); \ + lwz %r5, FRAME_5+8(%r1); \ + lwz %r4, FRAME_4+8(%r1); \ + lwz %r3, FRAME_3+8(%r1); \ + lwz %r2, FRAME_2+8(%r1); \ + lwz %r0, FRAME_0+8(%r1); \ + lwz %r1, FRAME_1+8(%r1); \ +/* Can't touch %r1 from here on */ \ + mtsprg2 %r2; /* save r2 & r3 */ \ + mtsprg3 %r3; \ +/* Disable translation, machine check and recoverability: */ \ + mfmsr %r2; \ + andi. 
%r2,%r2,~(PSL_DR|PSL_IR|PSL_ME|PSL_RI)@l; \ + mtmsr %r2; \ + isync; \ +/* Decide whether we return to user mode: */ \ + GET_CPUINFO(%r2); \ + lwz %r3,(savearea+CPUSAVE_SRR1)(%r2); \ + mtcr %r3; \ + bf 17,1f; /* branch if PSL_PR is false */ \ +/* Restore user SRs */ \ + RESTORE_USER_SRS(%r2,%r3); \ +1: mfsprg1 %r2; /* restore cr */ \ + mtcr %r2; \ + GET_CPUINFO(%r2); \ + lwz %r3,(savearea+CPUSAVE_SRR0)(%r2); /* restore srr0 */ \ + mtsrr0 %r3; \ + lwz %r3,(savearea+CPUSAVE_SRR1)(%r2); /* restore srr1 */ \ + \ + /* Make sure HV bit of MSR propagated to SRR1 */ \ + mfmsr %r2; \ + or %r3,%r2,%r3; \ + \ + mtsrr1 %r3; \ + mfsprg2 %r2; /* restore r2 & r3 */ \ + mfsprg3 %r3 + +#ifdef KDTRACE_HOOKS + .data + .globl dtrace_invop_calltrap_addr + .align 4 + .type dtrace_invop_calltrap_addr, @object + .size dtrace_invop_calltrap_addr, 4 +dtrace_invop_calltrap_addr: + .word 0 + .word 0 + + .text +#endif + +/* + * The next two routines are 64-bit glue code. The first is used to test if + * we are on a 64-bit system. By copying it to the illegal instruction + * handler, we can test for 64-bit mode by trying to execute a 64-bit + * instruction and seeing what happens. The second gets copied in front + * of all the other handlers to restore 32-bit bridge mode when traps + * are taken. + */ + +/* 64-bit test code. Sets SPRG2 to 0 if an illegal instruction is executed */ + + .globl CNAME(testppc64),CNAME(testppc64size) +CNAME(testppc64): + mtsprg1 %r31 + mfsrr0 %r31 + addi %r31, %r31, 4 + mtsrr0 %r31 + + li %r31, 0 + mtsprg2 %r31 + mfsprg1 %r31 + + rfi +CNAME(testppc64size) = .-CNAME(testppc64) + + +/* 64-bit bridge mode restore snippet. Gets copied in front of everything else + * on 64-bit systems. */ + + .globl CNAME(restorebridge),CNAME(restorebridgesize) +CNAME(restorebridge): + mtsprg1 %r31 + mfmsr %r31 + clrldi %r31,%r31,1 + mtmsrd %r31 + mfsprg1 %r31 + isync +CNAME(restorebridgesize) = .-CNAME(restorebridge) + +/* + * Processor reset exception handler. These are typically + * the first instructions the processor executes after a + * software reset. We do this in two bits so that we are + * not still hanging around in the trap handling region + * once the MMU is turned on. 
+ */ + .globl CNAME(rstcode), CNAME(rstcodeend) +CNAME(rstcode): + lwz %r31, TRAP_GENTRAP(0) + addi %r31, %r31, (cpu_reset - generictrap) + mtlr %r31 + blrl +CNAME(rstcodeend): + +cpu_reset: + bl 1f + + .space 124 + +1: + mflr %r1 + addi %r1,%r1,(124-16)@l + lwz %r30,TRAP_TOCBASE(0) + + bl CNAME(cpudep_ap_early_bootstrap) + lis %r3,1@l + bl CNAME(pmap_cpu_bootstrap) + bl CNAME(cpudep_ap_bootstrap) + mr %r1,%r3 + bl CNAME(cpudep_ap_setup) + GET_CPUINFO(%r5) + lwz %r3,(PC_RESTORE)(%r5) + cmplwi %cr0,%r3,0 + beq %cr0,2f + li %r4, 1 + b CNAME(longjmp) +2: +#ifdef SMP + bl CNAME(machdep_ap_bootstrap) +#endif + + /* Should not be reached */ +9: + b 9b + +/* + * This code gets copied to all the trap vectors + * (except ISI/DSI, ALI, and the interrupts) + */ + + .globl CNAME(trapcode),CNAME(trapcodeend) +CNAME(trapcode): + mtsprg1 %r1 /* save SP */ + mflr %r1 /* Save the old LR in r1 */ + mtsprg2 %r1 /* And then in SPRG2 */ + lwz %r1, TRAP_ENTRY(0) /* Get branch address */ + mtlr %r1 + li %r1, 0xe0 /* How to get the vector from LR */ + blrl /* LR & (0xff00 | r1) is exception # */ +CNAME(trapcodeend): + +/* + * For ALI: has to save DSISR and DAR + */ + .globl CNAME(alitrap),CNAME(aliend) +CNAME(alitrap): + mtsprg1 %r1 /* save SP */ + GET_CPUINFO(%r1) + stw %r28,(PC_TEMPSAVE+CPUSAVE_R28)(%r1) /* free r28-r31 */ + stw %r29,(PC_TEMPSAVE+CPUSAVE_R29)(%r1) + stw %r30,(PC_TEMPSAVE+CPUSAVE_R30)(%r1) + stw %r31,(PC_TEMPSAVE+CPUSAVE_R31)(%r1) + mfdar %r30 + mfdsisr %r31 + stw %r30,(PC_TEMPSAVE+CPUSAVE_AIM_DAR)(%r1) + stw %r31,(PC_TEMPSAVE+CPUSAVE_AIM_DSISR)(%r1) + mfsprg1 %r1 /* restore SP, in case of branch */ + mflr %r28 /* save LR */ + mfcr %r29 /* save CR */ + + /* Put our exception vector in SPRG3 */ + li %r31, EXC_ALI + mtsprg3 %r31 + + /* Test whether we already had PR set */ + mfsrr1 %r31 + mtcr %r31 + + /* Jump to s_trap */ + lwz %r31, TRAP_GENTRAP(0) + addi %r31, %r31, (s_trap - generictrap) + mtlr %r31 + blrl +CNAME(aliend): + +/* + * G2 specific: instuction TLB miss. + */ + .globl CNAME(imisstrap),CNAME(imisssize) +CNAME(imisstrap): + mfspr %r2, SPR_HASH1 /* get first pointer */ + addi %r1, 0, 8 /* load 8 for counter */ + mfctr %r0 /* save counter */ + mfspr %r3, SPR_ICMP /* get first compare value */ + addi %r2, %r2, -8 /* pre dec the pointer */ +im0: + mtctr %r1 /* load counter */ +im1: + lwzu %r1, 8(%r2) /* get next pte */ + cmp 0, 0, %r1, %r3 /* see if found pte */ + bdnzf 2, im1 /* dec count br if cmp ne and if + * count not zero */ + bne instr_sec_hash /* if not found set up second hash + * or exit */ + lwz %r1, +4(%r2) /* load tlb entry lower-word */ + andi. %r3, %r1, 8 /* check G bit */ + bne do_isi_prot /* if guarded, take an ISI */ + mtctr %r0 /* restore counter */ + mfspr %r0, SPR_IMISS /* get the miss address for the tlbli */ + mfspr %r3, SPR_SRR1 /* get the saved cr0 bits */ + mtcrf 0x80, %r3 /* restore CR0 */ + mtspr SPR_RPA, %r1 /* set the pte */ + ori %r1, %r1, 0x100 /* set reference bit */ + srwi %r1, %r1, 8 /* get byte 7 of pte */ + tlbli %r0 /* load the itlb */ + stb %r1, +6(%r2) /* update page table */ + rfi /* return to executing program */ + +instr_sec_hash: + andi. 
%r1, %r3, 0x0040 /* see if we have done second hash */ + bne do_isi /* if so, go to ISI interrupt */ + mfspr %r2, SPR_HASH2 /* get the second pointer */ + ori %r3, %r3, 0x0040 /* change the compare value */ + addi %r1, 0, 8 /* load 8 for counter */ + addi %r2, %r2, -8 /* pre dec for update on load */ + b im0 /* try second hash */ + +/* Create a faked ISI interrupt as the address was not found */ +do_isi_prot: + mfspr %r3, SPR_SRR1 /* get srr1 */ + andi. %r2, %r3, 0xffff /* clean upper srr1 */ + addis %r2, %r2, 0x0800 /* or in srr<4> = 1 to flag prot + * violation */ + b isi1 +do_isi: + mfspr %r3, SPR_SRR1 /* get srr1 */ + andi. %r2, %r3, 0xffff /* clean srr1 */ + addis %r2, %r2, 0x4000 /* or in srr1<1> = 1 to flag pte + * not found */ +isi1: + mtctr %r0 /* restore counter */ + mtspr SPR_SRR1, %r2 /* set srr1 */ + mfmsr %r0 /* get msr */ + xoris %r0, %r0, 0x2 /* flip the msr<tgpr> bit */ + mtcrf 0x80, %r3 /* restore CR0 */ + mtmsr %r0 /* flip back to the native gprs */ + ba EXC_ISI /* go to instr. access interrupt */ + +CNAME(imisssize) = .-CNAME(imisstrap) + +/* + * G2 specific: data load TLB miss. + */ + .globl CNAME(dlmisstrap),CNAME(dlmisssize) +CNAME(dlmisstrap): + mfspr %r2, SPR_HASH1 /* get first pointer */ + addi %r1, 0, 8 /* load 8 for counter */ + mfctr %r0 /* save counter */ + mfspr %r3, SPR_DCMP /* get first compare value */ + addi %r2, %r2, -8 /* pre dec the pointer */ +dm0: + mtctr %r1 /* load counter */ +dm1: + lwzu %r1, 8(%r2) /* get next pte */ + cmp 0, 0, %r1, %r3 /* see if found pte */ + bdnzf 2, dm1 /* dec count br if cmp ne and if + * count not zero */ + bne data_sec_hash /* if not found set up second hash + * or exit */ + lwz %r1, +4(%r2) /* load tlb entry lower-word */ + mtctr %r0 /* restore counter */ + mfspr %r0, SPR_DMISS /* get the miss address for the tlbld */ + mfspr %r3, SPR_SRR1 /* get the saved cr0 bits */ + mtcrf 0x80, %r3 /* restore CR0 */ + mtspr SPR_RPA, %r1 /* set the pte */ + ori %r1, %r1, 0x100 /* set reference bit */ + srwi %r1, %r1, 8 /* get byte 7 of pte */ + tlbld %r0 /* load the dtlb */ + stb %r1, +6(%r2) /* update page table */ + rfi /* return to executing program */ + +data_sec_hash: + andi. %r1, %r3, 0x0040 /* see if we have done second hash */ + bne do_dsi /* if so, go to DSI interrupt */ + mfspr %r2, SPR_HASH2 /* get the second pointer */ + ori %r3, %r3, 0x0040 /* change the compare value */ + addi %r1, 0, 8 /* load 8 for counter */ + addi %r2, %r2, -8 /* pre dec for update on load */ + b dm0 /* try second hash */ + +CNAME(dlmisssize) = .-CNAME(dlmisstrap) + +/* + * G2 specific: data store TLB miss. + */ + .globl CNAME(dsmisstrap),CNAME(dsmisssize) +CNAME(dsmisstrap): + mfspr %r2, SPR_HASH1 /* get first pointer */ + addi %r1, 0, 8 /* load 8 for counter */ + mfctr %r0 /* save counter */ + mfspr %r3, SPR_DCMP /* get first compare value */ + addi %r2, %r2, -8 /* pre dec the pointer */ +ds0: + mtctr %r1 /* load counter */ +ds1: + lwzu %r1, 8(%r2) /* get next pte */ + cmp 0, 0, %r1, %r3 /* see if found pte */ + bdnzf 2, ds1 /* dec count br if cmp ne and if + * count not zero */ + bne data_store_sec_hash /* if not found set up second hash + * or exit */ + lwz %r1, +4(%r2) /* load tlb entry lower-word */ + andi. 
%r3, %r1, 0x80 /* check the C-bit */ + beq data_store_chk_prot /* if (C==0) + * go check protection modes */ +ds2: + mtctr %r0 /* restore counter */ + mfspr %r0, SPR_DMISS /* get the miss address for the tlbld */ + mfspr %r3, SPR_SRR1 /* get the saved cr0 bits */ + mtcrf 0x80, %r3 /* restore CR0 */ + mtspr SPR_RPA, %r1 /* set the pte */ + tlbld %r0 /* load the dtlb */ + rfi /* return to executing program */ + +data_store_sec_hash: + andi. %r1, %r3, 0x0040 /* see if we have done second hash */ + bne do_dsi /* if so, go to DSI interrupt */ + mfspr %r2, SPR_HASH2 /* get the second pointer */ + ori %r3, %r3, 0x0040 /* change the compare value */ + addi %r1, 0, 8 /* load 8 for counter */ + addi %r2, %r2, -8 /* pre dec for update on load */ + b ds0 /* try second hash */ + +/* Check the protection before setting PTE(c-bit) */ +data_store_chk_prot: + rlwinm. %r3,%r1,30,0,1 /* test PP */ + bge- chk0 /* if (PP == 00 or PP == 01) + * goto chk0: */ + andi. %r3, %r1, 1 /* test PP[0] */ + beq+ chk2 /* return if PP[0] == 0 */ + b do_dsi_prot /* else DSIp */ +chk0: + mfspr %r3,SPR_SRR1 /* get old msr */ + andis. %r3,%r3,0x0008 /* test the KEY bit (SRR1-bit 12) */ + beq chk2 /* if (KEY==0) goto chk2: */ + b do_dsi_prot /* else do_dsi_prot */ +chk2: + ori %r1, %r1, 0x180 /* set reference and change bit */ + sth %r1, 6(%r2) /* update page table */ + b ds2 /* and back we go */ + +/* Create a faked DSI interrupt as the address was not found */ +do_dsi: + mfspr %r3, SPR_SRR1 /* get srr1 */ + rlwinm %r1,%r3,9,6,6 /* get srr1<flag> to bit 6 for + * load/store, zero rest */ + addis %r1, %r1, 0x4000 /* or in dsisr<1> = 1 to flag pte + * not found */ + b dsi1 + +do_dsi_prot: + mfspr %r3, SPR_SRR1 /* get srr1 */ + rlwinm %r1,%r3,9,6,6 /* get srr1<flag> to bit 6 for + *load/store, zero rest */ + addis %r1, %r1, 0x0800 /* or in dsisr<4> = 1 to flag prot + * violation */ + +dsi1: + mtctr %r0 /* restore counter */ + andi. %r2, %r3, 0xffff /* clear upper bits of srr1 */ + mtspr SPR_SRR1, %r2 /* set srr1 */ + mtspr SPR_DSISR, %r1 /* load the dsisr */ + mfspr %r1, SPR_DMISS /* get miss address */ + rlwinm. %r2,%r2,0,31,31 /* test LE bit */ + beq dsi2 /* if little endian then: */ + xor %r1, %r1, 0x07 /* de-mung the data address */ +dsi2: + mtspr SPR_DAR, %r1 /* put in dar */ + mfmsr %r0 /* get msr */ + xoris %r0, %r0, 0x2 /* flip the msr<tgpr> bit */ + mtcrf 0x80, %r3 /* restore CR0 */ + mtmsr %r0 /* flip back to the native gprs */ + ba EXC_DSI /* branch to DSI interrupt */ + +CNAME(dsmisssize) = .-CNAME(dsmisstrap) + +/* + * Similar to the above for DSI + * Has to handle BAT spills + * and standard pagetable spills + */ + .globl CNAME(dsitrap),CNAME(dsiend) +CNAME(dsitrap): + mtsprg1 %r1 /* save SP */ + GET_CPUINFO(%r1) + stw %r28,(PC_DISISAVE+CPUSAVE_R28)(%r1) /* free r28-r31 */ + stw %r29,(PC_DISISAVE+CPUSAVE_R29)(%r1) + stw %r30,(PC_DISISAVE+CPUSAVE_R30)(%r1) + stw %r31,(PC_DISISAVE+CPUSAVE_R31)(%r1) + mfsprg1 %r1 /* restore SP */ + mfcr %r29 /* save CR */ + mfxer %r30 /* save XER */ + mtsprg2 %r30 /* in SPRG2 */ + mfsrr1 %r31 /* test kernel mode */ + mtcr %r31 + bt 17,1f /* branch if PSL_PR is set */ + mfdar %r31 /* get fault address */ + rlwinm %r31,%r31,7,25,28 /* get segment * 8 */ + + /* get batu */ + lwz %r30,TRAP_TOCBASE(0) + lwz %r30,CNAME(battable)@got(%r30) + add %r31,%r30,%r31 + lwz %r30,0(%r31) + mtcr %r30 + bf 30,1f /* branch if supervisor valid is + false */ + /* get batl */ + lwz %r31,4(%r31) +/* We randomly use the highest two bat registers here */ + mftb %r28 + andi. 
%r28,%r28,1 + bne 2f + mtdbatu 2,%r30 + mtdbatl 2,%r31 + b 3f +2: + mtdbatu 3,%r30 + mtdbatl 3,%r31 +3: + mfsprg2 %r30 /* restore XER */ + mtxer %r30 + mtcr %r29 /* restore CR */ + mtsprg1 %r1 + GET_CPUINFO(%r1) + lwz %r28,(PC_DISISAVE+CPUSAVE_R28)(%r1) /* restore r28-r31 */ + lwz %r29,(PC_DISISAVE+CPUSAVE_R29)(%r1) + lwz %r30,(PC_DISISAVE+CPUSAVE_R30)(%r1) + lwz %r31,(PC_DISISAVE+CPUSAVE_R31)(%r1) + mfsprg1 %r1 + rfi /* return to trapped code */ +1: + mflr %r28 /* save LR (SP already saved) */ + + /* Jump to disitrap */ + lwz %r1, TRAP_GENTRAP(0) + addi %r1, %r1, (disitrap - generictrap) + mtlr %r1 + blrl +CNAME(dsiend): + +/* + * Preamble code for DSI/ISI traps + */ +disitrap: + /* Write the trap vector to SPRG3 by computing LR & 0xff00 */ + mflr %r1 + andi. %r1,%r1,0xff00 + mtsprg3 %r1 + + GET_CPUINFO(%r1) + lwz %r30,(PC_DISISAVE+CPUSAVE_R28)(%r1) + stw %r30,(PC_TEMPSAVE+CPUSAVE_R28)(%r1) + lwz %r31,(PC_DISISAVE+CPUSAVE_R29)(%r1) + stw %r31,(PC_TEMPSAVE+CPUSAVE_R29)(%r1) + lwz %r30,(PC_DISISAVE+CPUSAVE_R30)(%r1) + stw %r30,(PC_TEMPSAVE+CPUSAVE_R30)(%r1) + lwz %r31,(PC_DISISAVE+CPUSAVE_R31)(%r1) + stw %r31,(PC_TEMPSAVE+CPUSAVE_R31)(%r1) + mfdar %r30 + mfdsisr %r31 + stw %r30,(PC_TEMPSAVE+CPUSAVE_AIM_DAR)(%r1) + stw %r31,(PC_TEMPSAVE+CPUSAVE_AIM_DSISR)(%r1) + +#ifdef KDB + /* Try to detect a kernel stack overflow */ + mfsrr1 %r31 + mtcr %r31 + bt 17,realtrap /* branch is user mode */ + mfsprg1 %r31 /* get old SP */ + clrrwi %r31,%r31,12 /* Round SP down to nearest page */ + sub. %r30,%r31,%r30 /* SP - DAR */ + bge 1f + neg %r30,%r30 /* modulo value */ +1: cmplwi %cr0,%r30,4096 /* is DAR within a page of SP? */ + bge %cr0,realtrap /* no, too far away. */ + + /* Now convert this DSI into a DDB trap. */ + GET_CPUINFO(%r1) + lwz %r30,(PC_TEMPSAVE+CPUSAVE_AIM_DAR)(%r1) /* get DAR */ + stw %r30,(PC_DBSAVE +CPUSAVE_AIM_DAR)(%r1) /* save DAR */ + lwz %r31,(PC_TEMPSAVE+CPUSAVE_AIM_DSISR)(%r1) /* get DSISR */ + stw %r31,(PC_DBSAVE +CPUSAVE_AIM_DSISR)(%r1) /* save DSISR */ + lwz %r30,(PC_DISISAVE+CPUSAVE_R28)(%r1) /* get r28 */ + stw %r30,(PC_DBSAVE +CPUSAVE_R28)(%r1) /* save r28 */ + lwz %r31,(PC_DISISAVE+CPUSAVE_R29)(%r1) /* get r29 */ + stw %r31,(PC_DBSAVE +CPUSAVE_R29)(%r1) /* save r29 */ + lwz %r30,(PC_DISISAVE+CPUSAVE_R30)(%r1) /* get r30 */ + stw %r30,(PC_DBSAVE +CPUSAVE_R30)(%r1) /* save r30 */ + lwz %r31,(PC_DISISAVE+CPUSAVE_R31)(%r1) /* get r31 */ + stw %r31,(PC_DBSAVE +CPUSAVE_R31)(%r1) /* save r31 */ + b dbtrap +#endif + + /* XXX need stack probe here */ +realtrap: +/* Test whether we already had PR set */ + mfsrr1 %r1 + mtcr %r1 + mfsprg1 %r1 /* restore SP (might have been + overwritten) */ + bf 17,k_trap /* branch if PSL_PR is false */ + GET_CPUINFO(%r1) + lwz %r1,PC_CURPCB(%r1) + RESTORE_KERN_SRS(%r30,%r31) /* enable kernel mapping */ + b s_trap + +/* + * generictrap does some standard setup for trap handling to minimize + * the code that need be installed in the actual vectors. It expects + * the following conditions. 
+ * + * R1 - Trap vector = LR & (0xff00 | R1) + * SPRG1 - Original R1 contents + * SPRG2 - Original LR + */ + + .globl CNAME(generictrap64) +generictrap64: + mtsprg3 %r31 + mfmsr %r31 + clrldi %r31,%r31,1 + mtmsrd %r31 + mfsprg3 %r31 + isync + + .globl CNAME(generictrap) +generictrap: + /* Save R1 for computing the exception vector */ + mtsprg3 %r1 + + /* Save interesting registers */ + GET_CPUINFO(%r1) + stw %r28,(PC_TEMPSAVE+CPUSAVE_R28)(%r1) /* free r28-r31 */ + stw %r29,(PC_TEMPSAVE+CPUSAVE_R29)(%r1) + stw %r30,(PC_TEMPSAVE+CPUSAVE_R30)(%r1) + stw %r31,(PC_TEMPSAVE+CPUSAVE_R31)(%r1) + mfsprg1 %r1 /* restore SP, in case of branch */ + mfsprg2 %r28 /* save LR */ + mfcr %r29 /* save CR */ + + /* Compute the exception vector from the link register */ + mfsprg3 %r31 + ori %r31,%r31,0xff00 + mflr %r30 + and %r30,%r30,%r31 + mtsprg3 %r30 + + /* Test whether we already had PR set */ + mfsrr1 %r31 + mtcr %r31 + +s_trap: + bf 17,k_trap /* branch if PSL_PR is false */ + GET_CPUINFO(%r1) +u_trap: + lwz %r1,PC_CURPCB(%r1) + RESTORE_KERN_SRS(%r30,%r31) /* enable kernel mapping */ + +/* + * Now the common trap catching code. + */ +k_trap: + FRAME_SETUP(PC_TEMPSAVE) + /* Restore USER_SR */ + GET_CPUINFO(%r30) + lwz %r30,PC_CURPCB(%r30) + lwz %r30,PCB_AIM_USR_VSID(%r30) + mtsr USER_SR,%r30; sync; isync +/* Call C interrupt dispatcher: */ +trapagain: + addi %r3,%r1,8 + bl CNAME(powerpc_interrupt) + .globl CNAME(trapexit) /* backtrace code sentinel */ +CNAME(trapexit): + +/* Disable interrupts: */ + mfmsr %r3 + andi. %r3,%r3,~PSL_EE@l + mtmsr %r3 + isync +/* Test AST pending: */ + lwz %r5,FRAME_SRR1+8(%r1) + mtcr %r5 + bf 17,1f /* branch if PSL_PR is false */ + + GET_CPUINFO(%r3) /* get per-CPU pointer */ + lwz %r4, TD_AST(%r2) /* get thread ast value + * (r2 is curthread) */ + cmpwi %r4, 0 + beq 1f + mfmsr %r3 /* re-enable interrupts */ + ori %r3,%r3,PSL_EE@l + mtmsr %r3 + isync + addi %r3,%r1,8 + bl CNAME(ast) + .globl CNAME(asttrapexit) /* backtrace code sentinel #2 */ +CNAME(asttrapexit): + b trapexit /* test ast ret value ? */ +1: + FRAME_LEAVE(PC_TEMPSAVE) + + .globl CNAME(rfi_patch1) /* replace rfi with rfid on ppc64 */ +CNAME(rfi_patch1): + rfi + + .globl CNAME(rfid_patch) +CNAME(rfid_patch): + rfid + +#if defined(KDB) +/* + * Deliberate entry to dbtrap + */ + .globl CNAME(breakpoint) +CNAME(breakpoint): + mtsprg1 %r1 + mfmsr %r3 + mtsrr1 %r3 + andi. %r3,%r3,~(PSL_EE|PSL_ME)@l + mtmsr %r3 /* disable interrupts */ + isync + GET_CPUINFO(%r3) + stw %r28,(PC_DBSAVE+CPUSAVE_R28)(%r3) + stw %r29,(PC_DBSAVE+CPUSAVE_R29)(%r3) + stw %r30,(PC_DBSAVE+CPUSAVE_R30)(%r3) + stw %r31,(PC_DBSAVE+CPUSAVE_R31)(%r3) + mflr %r28 + li %r29,EXC_BPT + mtlr %r29 + mfcr %r29 + mtsrr0 %r28 + +/* + * Now the kdb trap catching code. + */ +dbtrap: + /* Write the trap vector to SPRG3 by computing LR & 0xff00 */ + mflr %r1 + andi. %r1,%r1,0xff00 + mtsprg3 %r1 + + lwz %r1,TRAP_TOCBASE(0) /* get new SP */ + lwz %r1,trapstk@got(%r1) + addi %r1,%r1,TRAPSTKSZ-16 + + FRAME_SETUP(PC_DBSAVE) +/* Call C trap code: */ + addi %r3,%r1,8 + bl CNAME(db_trap_glue) + or. 
%r3,%r3,%r3 + bne dbleave +/* This wasn't for KDB, so switch to real trap: */ + lwz %r3,FRAME_EXC+8(%r1) /* save exception */ + GET_CPUINFO(%r4) + stw %r3,(PC_DBSAVE+CPUSAVE_R31)(%r4) + FRAME_LEAVE(PC_DBSAVE) + mtsprg1 %r1 /* prepare for entrance to realtrap */ + GET_CPUINFO(%r1) + stw %r28,(PC_TEMPSAVE+CPUSAVE_R28)(%r1) + stw %r29,(PC_TEMPSAVE+CPUSAVE_R29)(%r1) + stw %r30,(PC_TEMPSAVE+CPUSAVE_R30)(%r1) + stw %r31,(PC_TEMPSAVE+CPUSAVE_R31)(%r1) + mflr %r28 + mfcr %r29 + lwz %r31,(PC_DBSAVE+CPUSAVE_R31)(%r1) + mtsprg3 %r31 /* SPRG3 was clobbered by FRAME_LEAVE */ + mfsprg1 %r1 + b realtrap +dbleave: + FRAME_LEAVE(PC_DBSAVE) + .globl CNAME(rfi_patch2) /* replace rfi with rfid on ppc64 */ +CNAME(rfi_patch2): + rfi + +/* + * In case of KDB we want a separate trap catcher for it + */ + .globl CNAME(dblow),CNAME(dbend) +CNAME(dblow): + mtsprg1 %r1 /* save SP */ + mtsprg2 %r29 /* save r29 */ + mfcr %r29 /* save CR in r29 */ + mfsrr1 %r1 + mtcr %r1 + bf 17,1f /* branch if privileged */ + /* Unprivileged case */ + mtcr %r29 /* put the condition register back */ + mfsprg2 %r29 /* ... and r29 */ + mflr %r1 /* save LR */ + mtsprg2 %r1 /* And then in SPRG2 */ + + lwz %r1, TRAP_ENTRY(0) /* Get branch address */ + mtlr %r1 + li %r1, 0 /* How to get the vector from LR */ + blrl /* LR & (0xff00 | r1) is exception # */ +1: + /* Privileged, so drop to KDB */ + GET_CPUINFO(%r1) + stw %r28,(PC_DBSAVE+CPUSAVE_R28)(%r1) /* free r28 */ + mfsprg2 %r28 /* r29 holds cr... */ + stw %r28,(PC_DBSAVE+CPUSAVE_R29)(%r1) /* free r29 */ + stw %r30,(PC_DBSAVE+CPUSAVE_R30)(%r1) /* free r30 */ + stw %r31,(PC_DBSAVE+CPUSAVE_R31)(%r1) /* free r31 */ + mflr %r28 /* save LR */ + + /* Jump to dbtrap */ + lwz %r1, TRAP_GENTRAP(0) + addi %r1, %r1, (dbtrap - generictrap) + mtlr %r1 + blrl +CNAME(dbend): +#endif /* KDB */ diff --git a/sys/powerpc/aim/trap_subr64.S b/sys/powerpc/aim/trap_subr64.S new file mode 100644 index 000000000000..56291337dd5b --- /dev/null +++ b/sys/powerpc/aim/trap_subr64.S @@ -0,0 +1,997 @@ +/* $NetBSD: trap_subr.S,v 1.20 2002/04/22 23:20:08 kleink Exp $ */ + +/*- + * Copyright (C) 1995, 1996 Wolfgang Solfrank. + * Copyright (C) 1995, 1996 TooLs GmbH. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by TooLs GmbH. + * 4. The name of TooLs GmbH may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * NOTICE: This is not a standalone file. to use it, #include it in + * your port's locore.S, like so: + * + * #include <powerpc/aim/trap_subr.S> + */ + +/* Locate the per-CPU data structure */ +#define GET_CPUINFO(r) \ + mfsprg0 r +#define GET_TOCBASE(r) \ + lis r,DMAP_BASE_ADDRESS@highesta; /* To real-mode alias/dmap */ \ + sldi r,r,32; \ + ori r,r,TRAP_TOCBASE; /* Magic address for TOC */ \ + ld r,0(r) + +/* + * Restore SRs for a pmap + * + * Requires that r28-r31 be scratch, with r28 initialized to the SLB cache + */ + +/* + * User SRs are loaded through a pointer to the current pmap. + * PCPU already in %r3 + */ +restore_usersrs: + ld %r28,PC_USERSLB(%r3) + cmpdi %r28, 0 /* If user SLB pointer NULL, exit */ + beqlr + + li %r29, 0 /* Set the counter to zero */ + + slbia + slbmfee %r31,%r29 + clrrdi %r31,%r31,28 + slbie %r31 +1: ld %r31, 0(%r28) /* Load SLB entry pointer */ + cmpdi %r31, 0 /* If NULL, stop */ + beqlr + + ld %r30, 0(%r31) /* Load SLBV */ + ld %r31, 8(%r31) /* Load SLBE */ + or %r31, %r31, %r29 /* Set SLBE slot */ + slbmte %r30, %r31 /* Install SLB entry */ + + addi %r28, %r28, 8 /* Advance pointer */ + addi %r29, %r29, 1 + b 1b /* Repeat */ + +/* + * Kernel SRs are loaded directly from the PCPU fields + * PCPU in %r1 + */ +restore_kernsrs: + lwz %r29, PC_FLAGS(%r1) + mtcr %r29 + btlr 0 + addi %r28,%r1,PC_KERNSLB + ld %r29,16(%r28) /* One past USER_SLB_SLOT */ + cmpdi %r29,0 + beqlr /* If first kernel entry is invalid, + * SLBs not in use, so exit early */ + + /* Otherwise, set up SLBs */ + li %r29, 0 /* Set the counter to zero */ + + slbia + slbmfee %r31,%r29 + clrrdi %r31,%r31,28 + slbie %r31 +1: cmpdi %r29, USER_SLB_SLOT /* Skip the user slot */ + beq- 2f + + ld %r31, 8(%r28) /* Load SLBE */ + cmpdi %r31, 0 /* If SLBE is not valid, stop */ + beqlr + ld %r30, 0(%r28) /* Load SLBV */ + slbmte %r30, %r31 /* Install SLB entry */ + +2: addi %r28, %r28, 16 /* Advance pointer */ + addi %r29, %r29, 1 + cmpdi %r29, 64 /* Repeat if we are not at the end */ + blt 1b + blr + +/* + * FRAME_SETUP assumes: + * SPRG1 SP (1) + * SPRG3 trap type + * savearea r27-r31,DAR,DSISR (DAR & DSISR only for DSI traps) + * r28 LR + * r29 CR + * r30 scratch + * r31 scratch + * r1 kernel stack + * SRR0/1 as at start of trap + * + * NOTE: SPRG1 is never used while the MMU is on, making it safe to reuse + * in any real-mode fault handler, including those handling double faults. 
+ */ +#define FRAME_SETUP(savearea) \ +/* Have to enable translation to allow access of kernel stack: */ \ + GET_CPUINFO(%r31); \ + mfsrr0 %r30; \ + std %r30,(savearea+CPUSAVE_SRR0)(%r31); /* save SRR0 */ \ + mfsrr1 %r30; \ + std %r30,(savearea+CPUSAVE_SRR1)(%r31); /* save SRR1 */ \ + mfsprg1 %r31; /* get saved SP (clears SPRG1) */ \ + mfmsr %r30; \ + ori %r30,%r30,(PSL_DR|PSL_IR|PSL_RI)@l; /* relocation on */ \ + mtmsr %r30; /* stack can now be accessed */ \ + isync; \ + stdu %r31,-(FRAMELEN+288)(%r1); /* save it in the callframe */ \ + std %r0, FRAME_0+48(%r1); /* save r0 in the trapframe */ \ + std %r31,FRAME_1+48(%r1); /* save SP " " */ \ + std %r2, FRAME_2+48(%r1); /* save r2 " " */ \ + std %r28,FRAME_LR+48(%r1); /* save LR " " */ \ + std %r29,FRAME_CR+48(%r1); /* save CR " " */ \ + GET_CPUINFO(%r2); \ + ld %r27,(savearea+CPUSAVE_R27)(%r2); /* get saved r27 */ \ + ld %r28,(savearea+CPUSAVE_R28)(%r2); /* get saved r28 */ \ + ld %r29,(savearea+CPUSAVE_R29)(%r2); /* get saved r29 */ \ + ld %r30,(savearea+CPUSAVE_R30)(%r2); /* get saved r30 */ \ + ld %r31,(savearea+CPUSAVE_R31)(%r2); /* get saved r31 */ \ + std %r3, FRAME_3+48(%r1); /* save r3-r31 */ \ + std %r4, FRAME_4+48(%r1); \ + std %r5, FRAME_5+48(%r1); \ + std %r6, FRAME_6+48(%r1); \ + std %r7, FRAME_7+48(%r1); \ + std %r8, FRAME_8+48(%r1); \ + std %r9, FRAME_9+48(%r1); \ + std %r10, FRAME_10+48(%r1); \ + std %r11, FRAME_11+48(%r1); \ + std %r12, FRAME_12+48(%r1); \ + std %r13, FRAME_13+48(%r1); \ + std %r14, FRAME_14+48(%r1); \ + std %r15, FRAME_15+48(%r1); \ + std %r16, FRAME_16+48(%r1); \ + std %r17, FRAME_17+48(%r1); \ + std %r18, FRAME_18+48(%r1); \ + std %r19, FRAME_19+48(%r1); \ + std %r20, FRAME_20+48(%r1); \ + std %r21, FRAME_21+48(%r1); \ + std %r22, FRAME_22+48(%r1); \ + std %r23, FRAME_23+48(%r1); \ + std %r24, FRAME_24+48(%r1); \ + std %r25, FRAME_25+48(%r1); \ + std %r26, FRAME_26+48(%r1); \ + std %r27, FRAME_27+48(%r1); \ + std %r28, FRAME_28+48(%r1); \ + std %r29, FRAME_29+48(%r1); \ + std %r30, FRAME_30+48(%r1); \ + std %r31, FRAME_31+48(%r1); \ + ld %r28,(savearea+CPUSAVE_AIM_DAR)(%r2); /* saved DAR */ \ + ld %r29,(savearea+CPUSAVE_AIM_DSISR)(%r2);/* saved DSISR */\ + ld %r30,(savearea+CPUSAVE_SRR0)(%r2); /* saved SRR0 */ \ + ld %r31,(savearea+CPUSAVE_SRR1)(%r2); /* saved SRR1 */ \ + mfxer %r3; \ + mfctr %r4; \ + mfsprg3 %r5; \ + std %r3, FRAME_XER+48(1); /* save xer/ctr/exc */ \ + std %r4, FRAME_CTR+48(1); \ + std %r5, FRAME_EXC+48(1); \ + std %r28,FRAME_AIM_DAR+48(1); \ + std %r29,FRAME_AIM_DSISR+48(1); /* save dsisr/srr0/srr1 */ \ + std %r30,FRAME_SRR0+48(1); \ + std %r31,FRAME_SRR1+48(1); \ + ld %r13,PC_CURTHREAD(%r2) /* set kernel curthread */ + +#define FRAME_LEAVE(savearea) \ +/* Disable exceptions: */ \ + mfmsr %r2; \ + andi. 
%r2,%r2,~PSL_EE@l; \ + mtmsr %r2; \ + isync; \ +/* Now restore regs: */ \ + ld %r2,FRAME_SRR0+48(%r1); \ + ld %r3,FRAME_SRR1+48(%r1); \ + ld %r4,FRAME_CTR+48(%r1); \ + ld %r5,FRAME_XER+48(%r1); \ + ld %r6,FRAME_LR+48(%r1); \ + GET_CPUINFO(%r7); \ + std %r2,(savearea+CPUSAVE_SRR0)(%r7); /* save SRR0 */ \ + std %r3,(savearea+CPUSAVE_SRR1)(%r7); /* save SRR1 */ \ + ld %r7,FRAME_CR+48(%r1); \ + mtctr %r4; \ + mtxer %r5; \ + mtlr %r6; \ + mtsprg2 %r7; /* save cr */ \ + ld %r31,FRAME_31+48(%r1); /* restore r0-31 */ \ + ld %r30,FRAME_30+48(%r1); \ + ld %r29,FRAME_29+48(%r1); \ + ld %r28,FRAME_28+48(%r1); \ + ld %r27,FRAME_27+48(%r1); \ + ld %r26,FRAME_26+48(%r1); \ + ld %r25,FRAME_25+48(%r1); \ + ld %r24,FRAME_24+48(%r1); \ + ld %r23,FRAME_23+48(%r1); \ + ld %r22,FRAME_22+48(%r1); \ + ld %r21,FRAME_21+48(%r1); \ + ld %r20,FRAME_20+48(%r1); \ + ld %r19,FRAME_19+48(%r1); \ + ld %r18,FRAME_18+48(%r1); \ + ld %r17,FRAME_17+48(%r1); \ + ld %r16,FRAME_16+48(%r1); \ + ld %r15,FRAME_15+48(%r1); \ + ld %r14,FRAME_14+48(%r1); \ + ld %r13,FRAME_13+48(%r1); \ + ld %r12,FRAME_12+48(%r1); \ + ld %r11,FRAME_11+48(%r1); \ + ld %r10,FRAME_10+48(%r1); \ + ld %r9, FRAME_9+48(%r1); \ + ld %r8, FRAME_8+48(%r1); \ + ld %r7, FRAME_7+48(%r1); \ + ld %r6, FRAME_6+48(%r1); \ + ld %r5, FRAME_5+48(%r1); \ + ld %r4, FRAME_4+48(%r1); \ + ld %r3, FRAME_3+48(%r1); \ + ld %r2, FRAME_2+48(%r1); \ + ld %r0, FRAME_0+48(%r1); \ + ld %r1, FRAME_1+48(%r1); \ +/* Can't touch %r1 from here on */ \ + mtsprg3 %r3; /* save r3 */ \ +/* Disable translation, machine check and recoverability: */ \ + mfmsr %r3; \ + andi. %r3,%r3,~(PSL_DR|PSL_IR|PSL_ME|PSL_RI)@l; \ + mtmsr %r3; \ + isync; \ +/* Decide whether we return to user mode: */ \ + GET_CPUINFO(%r3); \ + ld %r3,(savearea+CPUSAVE_SRR1)(%r3); \ + mtcr %r3; \ + bf 17,1f; /* branch if PSL_PR is false */ \ +/* Restore user SRs */ \ + GET_CPUINFO(%r3); \ + std %r27,(savearea+CPUSAVE_R27)(%r3); \ + lwz %r27,PC_FLAGS(%r3); \ + mtcr %r27; \ + bt 0, 0f; /* Check to skip restoring SRs. */ \ + std %r28,(savearea+CPUSAVE_R28)(%r3); \ + std %r29,(savearea+CPUSAVE_R29)(%r3); \ + std %r30,(savearea+CPUSAVE_R30)(%r3); \ + std %r31,(savearea+CPUSAVE_R31)(%r3); \ + mflr %r27; /* preserve LR */ \ + bl restore_usersrs; /* uses r28-r31 */ \ + mtlr %r27; \ + ld %r31,(savearea+CPUSAVE_R31)(%r3); \ + ld %r30,(savearea+CPUSAVE_R30)(%r3); \ + ld %r29,(savearea+CPUSAVE_R29)(%r3); \ + ld %r28,(savearea+CPUSAVE_R28)(%r3); \ +0: \ + ld %r27,(savearea+CPUSAVE_R27)(%r3); \ +1: mfsprg2 %r3; /* restore cr */ \ + mtcr %r3; \ + GET_CPUINFO(%r3); \ + ld %r3,(savearea+CPUSAVE_SRR0)(%r3); /* restore srr0 */ \ + mtsrr0 %r3; \ + GET_CPUINFO(%r3); \ + ld %r3,(savearea+CPUSAVE_SRR1)(%r3); /* restore srr1 */ \ + mtsrr1 %r3; \ + mfsprg3 %r3 /* restore r3 */ + +#ifdef KDTRACE_HOOKS + .data + .globl dtrace_invop_calltrap_addr + .align 8 + .type dtrace_invop_calltrap_addr, @object + .size dtrace_invop_calltrap_addr, 8 +dtrace_invop_calltrap_addr: + .word 0 + .word 0 + + .text +#endif + +/* + * Processor reset exception handler. These are typically + * the first instructions the processor executes after a + * software reset. We do this in two bits so that we are + * not still hanging around in the trap handling region + * once the MMU is turned on. + */ + .globl CNAME(rstcode), CNAME(rstcodeend), CNAME(cpu_reset_handler) + .globl CNAME(cpu_wakeup_handler) + .p2align 3 +CNAME(rstcode): +#ifdef __LITTLE_ENDIAN__ + /* + * XXX This shouldn't be necessary. 
+ * + * According to the ISA documentation, LE should be set from HILE + * or the LPCR ILE bit automatically. However, the entry into this + * vector from OPAL_START_CPU does not honor this correctly. + * + * We should be able to define an alternate entry for opal's + * start_kernel_secondary asm code to branch to. + */ + RETURN_TO_NATIVE_ENDIAN +#endif + /* + * Check if this is software reset or + * processor is waking up from power saving mode + * It is software reset when 46:47 = 0b00 + */ + /* 0x00 */ + ld %r2,TRAP_GENTRAP(0) /* Real-mode &generictrap */ + mfsrr1 %r9 /* Load SRR1 into r9 */ + andis. %r9,%r9,0x3 /* Logic AND with 46:47 bits */ + + beq 2f /* Branch if software reset */ + /* 0x10 */ + /* Reset was wakeup */ + addi %r9,%r2,(cpu_wakeup_handler-generictrap) + b 1f /* Was power save, do the wakeup */ + + /* Reset was software reset */ + /* Explicitly set MSR[SF] */ +2: mfmsr %r9 + li %r8,1 + /* 0x20 */ + insrdi %r9,%r8,1,0 + mtmsrd %r9 + isync + + addi %r9,%r2,(cpu_reset_handler-generictrap) + + /* 0x30 */ +1: mtlr %r9 + blr /* Branch to either cpu_reset_handler + * or cpu_wakeup_handler. + */ +CNAME(rstcodeend): + +cpu_reset_handler: + GET_TOCBASE(%r2) + + addis %r1,%r2,TOC_REF(tmpstk)@ha + ld %r1,TOC_REF(tmpstk)@l(%r1) /* get new SP */ + addi %r1,%r1,(TMPSTKSZ-48) + + bl CNAME(cpudep_ap_early_bootstrap) /* Set PCPU */ + nop + lis %r3,1@l + bl CNAME(pmap_cpu_bootstrap) /* Turn on virtual memory */ + nop + bl CNAME(cpudep_ap_bootstrap) /* Set up PCPU and stack */ + nop + mr %r1,%r3 /* Use new stack */ + bl CNAME(cpudep_ap_setup) + nop + GET_CPUINFO(%r5) + ld %r3,(PC_RESTORE)(%r5) + cmpldi %cr0,%r3,0 + beq %cr0,2f + nop + li %r4,1 + bl CNAME(longjmp) + nop +2: +#ifdef SMP + bl CNAME(machdep_ap_bootstrap) /* And away! */ + nop +#endif + + /* Should not be reached */ +9: + b 9b + +cpu_wakeup_handler: + GET_TOCBASE(%r2) + + /* Check for false wake up due to badly SRR1 set (eg. by OPAL) */ + addis %r3,%r2,TOC_REF(can_wakeup)@ha + ld %r3,TOC_REF(can_wakeup)@l(%r3) + ld %r3,0(%r3) + cmpdi %r3,0 + beq cpu_reset_handler + + /* Turn on MMU after return from interrupt */ + mfsrr1 %r3 + ori %r3,%r3,(PSL_IR | PSL_DR) + mtsrr1 %r3 + + /* Turn on MMU (needed to access PCB) */ + mfmsr %r3 + ori %r3,%r3,(PSL_IR | PSL_DR) + mtmsr %r3 + isync + + mfsprg0 %r3 + + ld %r3,PC_CURTHREAD(%r3) /* Get current thread */ + ld %r3,TD_PCB(%r3) /* Get PCB of current thread */ + ld %r12,PCB_CONTEXT(%r3) /* Load the non-volatile GP regs. */ + ld %r13,PCB_CONTEXT+1*8(%r3) + ld %r14,PCB_CONTEXT+2*8(%r3) + ld %r15,PCB_CONTEXT+3*8(%r3) + ld %r16,PCB_CONTEXT+4*8(%r3) + ld %r17,PCB_CONTEXT+5*8(%r3) + ld %r18,PCB_CONTEXT+6*8(%r3) + ld %r19,PCB_CONTEXT+7*8(%r3) + ld %r20,PCB_CONTEXT+8*8(%r3) + ld %r21,PCB_CONTEXT+9*8(%r3) + ld %r22,PCB_CONTEXT+10*8(%r3) + ld %r23,PCB_CONTEXT+11*8(%r3) + ld %r24,PCB_CONTEXT+12*8(%r3) + ld %r25,PCB_CONTEXT+13*8(%r3) + ld %r26,PCB_CONTEXT+14*8(%r3) + ld %r27,PCB_CONTEXT+15*8(%r3) + ld %r28,PCB_CONTEXT+16*8(%r3) + ld %r29,PCB_CONTEXT+17*8(%r3) + ld %r30,PCB_CONTEXT+18*8(%r3) + ld %r31,PCB_CONTEXT+19*8(%r3) + ld %r5,PCB_CR(%r3) /* Load the condition register */ + mtcr %r5 + ld %r5,PCB_LR(%r3) /* Load the link register */ + mtsrr0 %r5 + ld %r1,PCB_SP(%r3) /* Load the stack pointer */ + ld %r2,PCB_TOC(%r3) /* Load the TOC pointer */ + + rfid + +/* + * This code gets copied to all the trap vectors + * (except ISI/DSI, ALI, and the interrupts). Has to fit in 8 instructions! 
+ */ + + .globl CNAME(trapcode),CNAME(trapcodeend) + .p2align 3 +CNAME(trapcode): + mtsprg1 %r1 /* save SP */ + mflr %r1 /* Save the old LR in r1 */ + mtsprg2 %r1 /* And then in SPRG2 */ + ld %r1,TRAP_ENTRY(0) + mtlr %r1 + li %r1, 0xe0 /* How to get the vector from LR */ + blrl /* Branch to generictrap */ +CNAME(trapcodeend): + +/* Same thing for traps setting HSRR0/HSRR1 */ + .globl CNAME(hypertrapcode),CNAME(hypertrapcodeend) + .p2align 3 +CNAME(hypertrapcode): + mtsprg1 %r1 /* save SP */ + mflr %r1 /* Save the old LR in r1 */ + mtsprg2 %r1 /* And then in SPRG2 */ + ld %r1,TRAP_GENTRAP(0) + addi %r1,%r1,(generichypertrap-generictrap) + mtlr %r1 + li %r1, 0xe0 /* How to get the vector from LR */ + blrl /* Branch to generichypertrap */ +CNAME(hypertrapcodeend): + +/* + * For SLB misses: do special things for the kernel + * + * Note: SPRG1 is always safe to overwrite any time the MMU was on, which is + * the only time this can be called. + */ + .globl CNAME(slbtrap),CNAME(slbtrapend) + .p2align 3 +CNAME(slbtrap): + /* 0x00 */ + mtsprg1 %r1 /* save SP */ + GET_CPUINFO(%r1) + std %r2,(PC_SLBSAVE+16)(%r1) /* save r2 */ + mfcr %r2 + /* 0x10 */ + std %r2,(PC_SLBSAVE+104)(%r1) /* save CR */ + mfsrr1 %r2 /* test kernel mode */ + mtcr %r2 + bf 17,2f /* branch if PSL_PR is false */ + /* 0x20 */ + /* User mode */ + ld %r2,(PC_SLBSAVE+104)(%r1) + mtcr %r2 /* restore CR */ + ld %r2,(PC_SLBSAVE+16)(%r1) /* restore r2 */ + mflr %r1 + /* 0x30 */ + mtsprg2 %r1 /* save LR in SPRG2 */ + ld %r1,TRAP_ENTRY(0) /* real-mode &generictrap */ + mtlr %r1 + li %r1, 0x80 /* How to get the vector from LR */ + /* 0x40 */ + blrl /* Branch to generictrap */ +2: mflr %r2 /* Save the old LR in r2 */ + /* Kernel mode */ + ld %r1,TRAP_GENTRAP(0) /* Real-mode &generictrap */ + addi %r1,%r1,(kern_slbtrap-generictrap) + /* 0x50 */ + mtlr %r1 + GET_CPUINFO(%r1) + blrl /* Branch to kern_slbtrap */ +/* must fit in 128 bytes! */ +CNAME(slbtrapend): + +/* + * On entry: + * SPRG1: SP + * r1: pcpu + * r2: LR + * LR: branch address in trap region + */ +kern_slbtrap: + std %r2,(PC_SLBSAVE+136)(%r1) /* old LR */ + std %r3,(PC_SLBSAVE+24)(%r1) /* save R3 */ + + /* Check if this needs to be handled as a regular trap (userseg miss) */ + mflr %r2 + andi. 
%r2,%r2,0xff80 + cmpwi %r2,EXC_DSE + bne 1f + mfdar %r2 + b 2f +1: mfsrr0 %r2 +2: /* r2 now contains the fault address */ + lis %r3,SEGMENT_MASK@highesta + ori %r3,%r3,SEGMENT_MASK@highera + sldi %r3,%r3,32 + oris %r3,%r3,SEGMENT_MASK@ha + ori %r3,%r3,SEGMENT_MASK@l + and %r2,%r2,%r3 /* R2 = segment base address */ + lis %r3,USER_ADDR@highesta + ori %r3,%r3,USER_ADDR@highera + sldi %r3,%r3,32 + oris %r3,%r3,USER_ADDR@ha + ori %r3,%r3,USER_ADDR@l + cmpd %r2,%r3 /* Compare fault base to USER_ADDR */ + bne 3f + + /* User seg miss, handle as a regular trap */ + ld %r2,(PC_SLBSAVE+104)(%r1) /* Restore CR */ + mtcr %r2 + ld %r2,(PC_SLBSAVE+16)(%r1) /* Restore R2,R3 */ + ld %r3,(PC_SLBSAVE+24)(%r1) + ld %r1,(PC_SLBSAVE+136)(%r1) /* Save the old LR in r1 */ + mtsprg2 %r1 /* And then in SPRG2 */ + li %r1, 0x80 /* How to get the vector from LR */ + b generictrap /* Retain old LR using b */ + +3: /* Real kernel SLB miss */ + std %r0,(PC_SLBSAVE+0)(%r1) /* free all volatile regs */ + mfsprg1 %r2 /* Old R1 */ + std %r2,(PC_SLBSAVE+8)(%r1) + /* R2,R3 already saved */ + std %r4,(PC_SLBSAVE+32)(%r1) + std %r5,(PC_SLBSAVE+40)(%r1) + std %r6,(PC_SLBSAVE+48)(%r1) + std %r7,(PC_SLBSAVE+56)(%r1) + std %r8,(PC_SLBSAVE+64)(%r1) + std %r9,(PC_SLBSAVE+72)(%r1) + std %r10,(PC_SLBSAVE+80)(%r1) + std %r11,(PC_SLBSAVE+88)(%r1) + std %r12,(PC_SLBSAVE+96)(%r1) + /* CR already saved */ + mfxer %r2 /* save XER */ + std %r2,(PC_SLBSAVE+112)(%r1) + mflr %r2 /* save LR (SP already saved) */ + std %r2,(PC_SLBSAVE+120)(%r1) + mfctr %r2 /* save CTR */ + std %r2,(PC_SLBSAVE+128)(%r1) + + /* Call handler */ + addi %r1,%r1,PC_SLBSTACK-48+1024 + li %r2,~15 + and %r1,%r1,%r2 + GET_TOCBASE(%r2) + mflr %r3 + andi. %r3,%r3,0xff80 + mfdar %r4 + mfsrr0 %r5 + bl handle_kernel_slb_spill + nop + + /* Save r28-31, restore r4-r12 */ + GET_CPUINFO(%r1) + ld %r4,(PC_SLBSAVE+32)(%r1) + ld %r5,(PC_SLBSAVE+40)(%r1) + ld %r6,(PC_SLBSAVE+48)(%r1) + ld %r7,(PC_SLBSAVE+56)(%r1) + ld %r8,(PC_SLBSAVE+64)(%r1) + ld %r9,(PC_SLBSAVE+72)(%r1) + ld %r10,(PC_SLBSAVE+80)(%r1) + ld %r11,(PC_SLBSAVE+88)(%r1) + ld %r12,(PC_SLBSAVE+96)(%r1) + std %r28,(PC_SLBSAVE+64)(%r1) + std %r29,(PC_SLBSAVE+72)(%r1) + std %r30,(PC_SLBSAVE+80)(%r1) + std %r31,(PC_SLBSAVE+88)(%r1) + + /* Restore kernel mapping */ + bl restore_kernsrs + + /* Restore remaining registers */ + ld %r28,(PC_SLBSAVE+64)(%r1) + ld %r29,(PC_SLBSAVE+72)(%r1) + ld %r30,(PC_SLBSAVE+80)(%r1) + ld %r31,(PC_SLBSAVE+88)(%r1) + + ld %r2,(PC_SLBSAVE+104)(%r1) + mtcr %r2 + ld %r2,(PC_SLBSAVE+112)(%r1) + mtxer %r2 + ld %r2,(PC_SLBSAVE+120)(%r1) + mtlr %r2 + ld %r2,(PC_SLBSAVE+128)(%r1) + mtctr %r2 + ld %r2,(PC_SLBSAVE+136)(%r1) + mtlr %r2 + + /* Restore r0-r3 */ + ld %r0,(PC_SLBSAVE+0)(%r1) + ld %r2,(PC_SLBSAVE+16)(%r1) + ld %r3,(PC_SLBSAVE+24)(%r1) + mfsprg1 %r1 + + /* Back to whatever we were doing */ + rfid + +/* + * For ALI: has to save DSISR and DAR + */ + .globl CNAME(alitrap),CNAME(aliend) +CNAME(alitrap): + mtsprg1 %r1 /* save SP */ + GET_CPUINFO(%r1) + std %r27,(PC_TEMPSAVE+CPUSAVE_R27)(%r1) /* free r27-r31 */ + std %r28,(PC_TEMPSAVE+CPUSAVE_R28)(%r1) + std %r29,(PC_TEMPSAVE+CPUSAVE_R29)(%r1) + std %r30,(PC_TEMPSAVE+CPUSAVE_R30)(%r1) + std %r31,(PC_TEMPSAVE+CPUSAVE_R31)(%r1) + mfdar %r30 + mfdsisr %r31 + std %r30,(PC_TEMPSAVE+CPUSAVE_AIM_DAR)(%r1) + std %r31,(PC_TEMPSAVE+CPUSAVE_AIM_DSISR)(%r1) + mfsprg1 %r1 /* restore SP, in case of branch */ + mflr %r28 /* save LR */ + mfcr %r29 /* save CR */ + + ld %r31,TRAP_GENTRAP(0) + addi %r31,%r31,(s_trap - generictrap) + mtlr %r31 + + /* Put our exception vector 
in SPRG3 */ + li %r31, EXC_ALI + mtsprg3 %r31 + + /* Test whether we already had PR set */ + mfsrr1 %r31 + mtcr %r31 + blrl /* Branch to s_trap */ +CNAME(aliend): + +/* + * Similar to the above for DSI + * Has to handle standard pagetable spills + */ + .globl CNAME(dsitrap),CNAME(dsiend) + .p2align 3 +CNAME(dsitrap): + mtsprg1 %r1 /* save SP */ + GET_CPUINFO(%r1) + std %r27,(PC_DISISAVE+CPUSAVE_R27)(%r1) /* free r27-r31 */ + std %r28,(PC_DISISAVE+CPUSAVE_R28)(%r1) + std %r29,(PC_DISISAVE+CPUSAVE_R29)(%r1) + std %r30,(PC_DISISAVE+CPUSAVE_R30)(%r1) + std %r31,(PC_DISISAVE+CPUSAVE_R31)(%r1) + mfcr %r29 /* save CR */ + mfxer %r30 /* save XER */ + mtsprg2 %r30 /* in SPRG2 */ + mfsrr1 %r31 /* test kernel mode */ + mtcr %r31 + mflr %r28 /* save LR (SP already saved) */ + ld %r1,TRAP_GENTRAP(0) + addi %r1,%r1,(disitrap-generictrap) + mtlr %r1 + blrl /* Branch to disitrap */ +CNAME(dsiend): + +/* + * Preamble code for DSI/ISI traps + */ +disitrap: + /* Write the trap vector to SPRG3 by computing LR & 0xff00 */ + mflr %r1 + andi. %r1,%r1,0xff00 + mtsprg3 %r1 + + GET_CPUINFO(%r1) + ld %r31,(PC_DISISAVE+CPUSAVE_R27)(%r1) + std %r31,(PC_TEMPSAVE+CPUSAVE_R27)(%r1) + ld %r30,(PC_DISISAVE+CPUSAVE_R28)(%r1) + std %r30,(PC_TEMPSAVE+CPUSAVE_R28)(%r1) + ld %r31,(PC_DISISAVE+CPUSAVE_R29)(%r1) + std %r31,(PC_TEMPSAVE+CPUSAVE_R29)(%r1) + ld %r30,(PC_DISISAVE+CPUSAVE_R30)(%r1) + std %r30,(PC_TEMPSAVE+CPUSAVE_R30)(%r1) + ld %r31,(PC_DISISAVE+CPUSAVE_R31)(%r1) + std %r31,(PC_TEMPSAVE+CPUSAVE_R31)(%r1) + mfdar %r30 + mfdsisr %r31 + std %r30,(PC_TEMPSAVE+CPUSAVE_AIM_DAR)(%r1) + std %r31,(PC_TEMPSAVE+CPUSAVE_AIM_DSISR)(%r1) + +#ifdef KDB + /* Try to detect a kernel stack overflow */ + mfsrr1 %r31 + mtcr %r31 + bt 17,realtrap /* branch is user mode */ + mfsprg1 %r31 /* get old SP */ + clrrdi %r31,%r31,12 /* Round SP down to nearest page */ + sub. %r30,%r31,%r30 /* SP - DAR */ + bge 1f + neg %r30,%r30 /* modulo value */ +1: cmpldi %cr0,%r30,4096 /* is DAR within a page of SP? */ + bge %cr0,realtrap /* no, too far away. */ + + /* Now convert this DSI into a DDB trap. */ + GET_CPUINFO(%r1) + ld %r30,(PC_TEMPSAVE+CPUSAVE_AIM_DAR)(%r1) /* get DAR */ + std %r30,(PC_DBSAVE +CPUSAVE_AIM_DAR)(%r1) /* save DAR */ + ld %r30,(PC_TEMPSAVE+CPUSAVE_AIM_DSISR)(%r1) /* get DSISR */ + std %r30,(PC_DBSAVE +CPUSAVE_AIM_DSISR)(%r1) /* save DSISR */ + ld %r31,(PC_DISISAVE+CPUSAVE_R27)(%r1) /* get r27 */ + std %r31,(PC_DBSAVE +CPUSAVE_R27)(%r1) /* save r27 */ + ld %r30,(PC_DISISAVE+CPUSAVE_R28)(%r1) /* get r28 */ + std %r30,(PC_DBSAVE +CPUSAVE_R28)(%r1) /* save r28 */ + ld %r31,(PC_DISISAVE+CPUSAVE_R29)(%r1) /* get r29 */ + std %r31,(PC_DBSAVE +CPUSAVE_R29)(%r1) /* save r29 */ + ld %r30,(PC_DISISAVE+CPUSAVE_R30)(%r1) /* get r30 */ + std %r30,(PC_DBSAVE +CPUSAVE_R30)(%r1) /* save r30 */ + ld %r31,(PC_DISISAVE+CPUSAVE_R31)(%r1) /* get r31 */ + std %r31,(PC_DBSAVE +CPUSAVE_R31)(%r1) /* save r31 */ + b dbtrap +#endif + + /* XXX need stack probe here */ +realtrap: +/* Test whether we already had PR set */ + mfsrr1 %r1 + mtcr %r1 + mfsprg1 %r1 /* restore SP (might have been + overwritten) */ + bf 17,k_trap /* branch if PSL_PR is false */ + GET_CPUINFO(%r1) + mr %r27,%r28 /* Save LR, r29 */ + mtsprg2 %r29 + bl restore_kernsrs /* enable kernel mapping */ + mfsprg2 %r29 + mr %r28,%r27 + ld %r1,PC_CURPCB(%r1) + b s_trap + +/* + * generictrap does some standard setup for trap handling to minimize + * the code that need be installed in the actual vectors. It expects + * the following conditions. 
+ * + * R1 - Trap vector = LR & (0xff00 | R1) + * SPRG1 - Original R1 contents + * SPRG2 - Original LR + */ + +generichypertrap: + mtsprg3 %r1 + mfspr %r1, SPR_HSRR0 + mtsrr0 %r1 + mfspr %r1, SPR_HSRR1 + mtsrr1 %r1 + mfsprg3 %r1 + .globl CNAME(generictrap) +generictrap: + /* Save R1 for computing the exception vector */ + mtsprg3 %r1 + + /* Save interesting registers */ + GET_CPUINFO(%r1) + std %r27,(PC_TEMPSAVE+CPUSAVE_R27)(%r1) /* free r27-r31 */ + std %r28,(PC_TEMPSAVE+CPUSAVE_R28)(%r1) + std %r29,(PC_TEMPSAVE+CPUSAVE_R29)(%r1) + std %r30,(PC_TEMPSAVE+CPUSAVE_R30)(%r1) + std %r31,(PC_TEMPSAVE+CPUSAVE_R31)(%r1) + mfdar %r30 + std %r30,(PC_TEMPSAVE+CPUSAVE_AIM_DAR)(%r1) + mfdsisr %r30 + std %r30,(PC_TEMPSAVE+CPUSAVE_AIM_DSISR)(%r1) + mfsprg1 %r1 /* restore SP, in case of branch */ + mfsprg2 %r28 /* save LR */ + mfcr %r29 /* save CR */ + + /* Compute the exception vector from the link register */ + mfsprg3 %r31 + ori %r31,%r31,0xff00 + mflr %r30 + addi %r30,%r30,-4 /* The branch instruction, not the next */ + and %r30,%r30,%r31 + mtsprg3 %r30 + + /* Test whether we already had PR set */ + mfsrr1 %r31 + mtcr %r31 + +s_trap: + bf 17,k_trap /* branch if PSL_PR is false */ + GET_CPUINFO(%r1) +u_trap: + mr %r27,%r28 /* Save LR, r29 */ + mtsprg2 %r29 + bl restore_kernsrs /* enable kernel mapping */ + mfsprg2 %r29 + mr %r28,%r27 + ld %r1,PC_CURPCB(%r1) + +/* + * Now the common trap catching code. + */ +k_trap: + FRAME_SETUP(PC_TEMPSAVE) +/* Call C interrupt dispatcher: */ +trapagain: + GET_TOCBASE(%r2) + addi %r3,%r1,48 + bl CNAME(powerpc_interrupt) + nop + + .globl CNAME(trapexit) /* backtrace code sentinel */ +CNAME(trapexit): +/* Disable interrupts: */ + mfmsr %r3 + andi. %r3,%r3,~PSL_EE@l + mtmsr %r3 + isync +/* Test AST pending: */ + ld %r5,FRAME_SRR1+48(%r1) + mtcr %r5 + bf 17,1f /* branch if PSL_PR is false */ + + GET_CPUINFO(%r3) /* get per-CPU pointer */ + lwz %r4,TD_AST(%r13) /* get thread ast value */ + cmpwi %r4,0 + beq 1f + mfmsr %r3 /* re-enable interrupts */ + ori %r3,%r3,PSL_EE@l + mtmsr %r3 + isync + GET_TOCBASE(%r2) + addi %r3,%r1,48 + bl CNAME(ast) + nop + .globl CNAME(asttrapexit) /* backtrace code sentinel #2 */ +CNAME(asttrapexit): + b trapexit /* test ast ret value ? */ +1: + FRAME_LEAVE(PC_TEMPSAVE) + rfid + +#if defined(KDB) +/* + * Deliberate entry to dbtrap + */ +ASENTRY_NOPROF(breakpoint) + mtsprg1 %r1 + mfmsr %r3 + mtsrr1 %r3 + andi. %r3,%r3,~(PSL_EE|PSL_ME)@l + mtmsr %r3 /* disable interrupts */ + isync + GET_CPUINFO(%r3) + std %r27,(PC_DBSAVE+CPUSAVE_R27)(%r3) + std %r28,(PC_DBSAVE+CPUSAVE_R28)(%r3) + std %r29,(PC_DBSAVE+CPUSAVE_R29)(%r3) + std %r30,(PC_DBSAVE+CPUSAVE_R30)(%r3) + std %r31,(PC_DBSAVE+CPUSAVE_R31)(%r3) + mflr %r28 + li %r29,EXC_BPT + mtlr %r29 + mfcr %r29 + mtsrr0 %r28 + +/* + * Now the kdb trap catching code. + */ +dbtrap: + /* Write the trap vector to SPRG3 by computing LR & 0xff00 */ + mflr %r1 + andi. %r1,%r1,0xff00 + mtsprg3 %r1 + + GET_TOCBASE(%r1) /* get new SP */ + addis %r1,%r1,TOC_REF(trapstk)@ha + ld %r1,TOC_REF(trapstk)@l(%r1) + addi %r1,%r1,(TRAPSTKSZ-48) + + FRAME_SETUP(PC_DBSAVE) +/* Call C trap code: */ + GET_TOCBASE(%r2) + addi %r3,%r1,48 + bl CNAME(db_trap_glue) + nop + or. 
%r3,%r3,%r3 + bne dbleave +/* This wasn't for KDB, so switch to real trap: */ + ld %r3,FRAME_EXC+48(%r1) /* save exception */ + GET_CPUINFO(%r4) + std %r3,(PC_DBSAVE+CPUSAVE_R31)(%r4) + FRAME_LEAVE(PC_DBSAVE) + mtsprg1 %r1 /* prepare for entrance to realtrap */ + GET_CPUINFO(%r1) + std %r27,(PC_TEMPSAVE+CPUSAVE_R27)(%r1) + std %r28,(PC_TEMPSAVE+CPUSAVE_R28)(%r1) + std %r29,(PC_TEMPSAVE+CPUSAVE_R29)(%r1) + std %r30,(PC_TEMPSAVE+CPUSAVE_R30)(%r1) + std %r31,(PC_TEMPSAVE+CPUSAVE_R31)(%r1) + mflr %r28 + mfcr %r29 + ld %r31,(PC_DBSAVE+CPUSAVE_R31)(%r1) + mtsprg3 %r31 /* SPRG3 was clobbered by FRAME_LEAVE */ + mfsprg1 %r1 + b realtrap +dbleave: + FRAME_LEAVE(PC_DBSAVE) + rfid +ASEND(breakpoint) + +/* + * In case of KDB we want a separate trap catcher for it + */ + .globl CNAME(dblow),CNAME(dbend) + .p2align 3 +CNAME(dblow): + mtsprg1 %r1 /* save SP */ + mtsprg2 %r29 /* save r29 */ + mfcr %r29 /* save CR in r29 */ + mfsrr1 %r1 + mtcr %r1 + bf 17,1f /* branch if privileged */ + + /* Unprivileged case */ + mtcr %r29 /* put the condition register back */ + mfsprg2 %r29 /* ... and r29 */ + mflr %r1 /* save LR */ + mtsprg2 %r1 /* And then in SPRG2 */ + + ld %r1, TRAP_ENTRY(0) /* Get branch address */ + mtlr %r1 + li %r1, 0 /* How to get the vector from LR */ + blrl /* Branch to generictrap */ + /* No fallthrough */ +1: + GET_CPUINFO(%r1) + std %r27,(PC_DBSAVE+CPUSAVE_R27)(%r1) /* free r27 */ + std %r28,(PC_DBSAVE+CPUSAVE_R28)(%r1) /* free r28 */ + mfsprg2 %r28 /* r29 holds cr... */ + std %r28,(PC_DBSAVE+CPUSAVE_R29)(%r1) /* free r29 */ + std %r30,(PC_DBSAVE+CPUSAVE_R30)(%r1) /* free r30 */ + std %r31,(PC_DBSAVE+CPUSAVE_R31)(%r1) /* free r31 */ + mflr %r28 /* save LR */ + ld %r1,TRAP_GENTRAP(0) + addi %r1,%r1,(dbtrap-generictrap) + mtlr %r1 + blrl /* Branch to dbtrap */ +CNAME(dbend): +#endif /* KDB */ |
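
Illustrative note (not part of the diff above): the per-vector trap stubs in both trap_subr32.S and trap_subr64.S recover the exception number from the link register. Each stub loads a small mask into r1 (0xe0 for the common trapcode stub, 0x80 for the SLB path, 0 for dblow), branches with blrl, and generictrap then computes LR & (0xff00 | r1); the 64-bit variant first backs LR up by 4 to point at the blrl itself. The C sketch below mirrors that masking step only; the helper name is hypothetical and exists purely to show the arithmetic.

#include <stdint.h>

/*
 * Hypothetical helper mirroring the generictrap vector computation.
 * The stub is copied to each vector base (a multiple of 0x100 in low
 * memory), so the link register set by blrl points a few instructions
 * past that base; masking with (0xff00 | r1_mask) strips the in-stub
 * offset and leaves the vector itself, e.g. 0x700 for a program
 * exception handled through the common stub.
 */
static inline uint32_t
trap_vector_from_lr(uint32_t lr, uint32_t r1_mask)
{

	return (lr & (0xff00u | r1_mask));
}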
