Diffstat (limited to 'lib/Target')
-rw-r--r--  lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 12
-rw-r--r--  lib/Target/AArch64/AArch64FrameLowering.cpp     | 52
-rw-r--r--  lib/Target/AArch64/AArch64ISelLowering.cpp      |  4
-rw-r--r--  lib/Target/AArch64/AArch64InstrInfo.td          | 28
-rw-r--r--  lib/Target/AArch64/AArch64MacroFusion.cpp       |  4
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp              | 11
-rw-r--r--  lib/Target/X86/X86InstrSSE.td                   |  3
7 files changed, 85 insertions, 29 deletions
diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 160107cd7e2b..d52cd84246a1 100644
--- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -946,6 +946,18 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
   case AArch64::CMP_SWAP_128:
     return expandCMP_SWAP_128(MBB, MBBI, NextMBBI);
 
+  case AArch64::AESMCrrTied:
+  case AArch64::AESIMCrrTied: {
+    MachineInstrBuilder MIB =
+    BuildMI(MBB, MBBI, MI.getDebugLoc(),
+            TII->get(Opcode == AArch64::AESMCrrTied ? AArch64::AESMCrr :
+                                                      AArch64::AESIMCrr))
+      .add(MI.getOperand(0))
+      .add(MI.getOperand(1));
+    transferImpOps(MI, MIB, MIB);
+    MI.eraseFromParent();
+    return true;
+  }
   }
   return false;
 }
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index 4907d082eda0..7c6a99990406 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -506,19 +506,23 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     return;
   }
 
-  auto CSStackSize = AFI->getCalleeSavedStackSize();
+  bool IsWin64 =
+      Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv());
+  unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+
+  auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
   // All of the remaining stack allocations are for locals.
-  AFI->setLocalStackSize(NumBytes - CSStackSize);
+  AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
 
   bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
   if (CombineSPBump) {
     emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
                     MachineInstr::FrameSetup);
     NumBytes = 0;
-  } else if (CSStackSize != 0) {
+  } else if (PrologueSaveSize != 0) {
     MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(MBB, MBBI, DL, TII,
-                                                     -CSStackSize);
-    NumBytes -= CSStackSize;
+                                                     -PrologueSaveSize);
+    NumBytes -= PrologueSaveSize;
   }
   assert(NumBytes >= 0 && "Negative stack allocation size!?");
 
@@ -532,8 +536,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     ++MBBI;
   }
   if (HasFP) {
-    // Only set up FP if we actually need to. Frame pointer is fp = sp - 16.
-    int FPOffset = CSStackSize - 16;
+    // Only set up FP if we actually need to. Frame pointer is fp =
+    // sp - fixedobject - 16.
+    int FPOffset = AFI->getCalleeSavedStackSize() - 16;
     if (CombineSPBump)
       FPOffset += AFI->getLocalStackSize();
 
@@ -672,8 +677,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   if (HasFP) {
     // Define the current CFA rule to use the provided FP.
     unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
-    unsigned CFIIndex = MF.addFrameInst(
-        MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth));
+    unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
+        nullptr, Reg, 2 * StackGrowth - FixedObject));
     BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
         .addCFIIndex(CFIIndex)
         .setMIFlags(MachineInstr::FrameSetup);
@@ -759,12 +764,16 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
   // it as the 2nd argument of AArch64ISD::TC_RETURN.
 
-  auto CSStackSize = AFI->getCalleeSavedStackSize();
+  bool IsWin64 =
+      Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv());
+  unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+
+  auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
   bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
 
-  if (!CombineSPBump && CSStackSize != 0)
+  if (!CombineSPBump && PrologueSaveSize != 0)
     convertCalleeSaveRestoreToSPPrePostIncDec(
-        MBB, std::prev(MBB.getFirstTerminator()), DL, TII, CSStackSize);
+        MBB, std::prev(MBB.getFirstTerminator()), DL, TII, PrologueSaveSize);
 
   // Move past the restores of the callee-saved registers.
   MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
@@ -786,7 +795,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
     return;
   }
 
-  NumBytes -= CSStackSize;
+  NumBytes -= PrologueSaveSize;
   assert(NumBytes >= 0 && "Negative stack allocation size!?");
 
   if (!hasFP(MF)) {
@@ -796,7 +805,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
     if (RedZone && ArgumentPopSize == 0)
       return;
 
-    bool NoCalleeSaveRestore = CSStackSize == 0;
+    bool NoCalleeSaveRestore = PrologueSaveSize == 0;
     int StackRestoreBytes = RedZone ? 0 : NumBytes;
     if (NoCalleeSaveRestore)
       StackRestoreBytes += ArgumentPopSize;
@@ -815,7 +824,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   // be able to save any instructions.
   if (MFI.hasVarSizedObjects() || AFI->isStackRealigned())
     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
-                    -CSStackSize + 16, TII, MachineInstr::FrameDestroy);
+                    -AFI->getCalleeSavedStackSize() + 16, TII,
+                    MachineInstr::FrameDestroy);
   else if (NumBytes)
     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII,
                     MachineInstr::FrameDestroy);
@@ -845,7 +855,11 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
   const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
       MF.getSubtarget().getRegisterInfo());
   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
-  int FPOffset = MFI.getObjectOffset(FI) + 16;
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  bool IsWin64 =
+      Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv());
+  unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+  int FPOffset = MFI.getObjectOffset(FI) + FixedObject + 16;
   int Offset = MFI.getObjectOffset(FI) + MFI.getStackSize();
   bool isFixed = MFI.isFixedObjectIndex(FI);
 
@@ -956,12 +970,6 @@ static void computeCalleeSaveRegisterPairs(
          "Odd number of callee-saved regs to spill!");
   int Offset = AFI->getCalleeSavedStackSize();
 
-  unsigned GPRSaveSize = AFI->getVarArgsGPRSize();
-  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
-  bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv());
-  if (IsWin64)
-    Offset -= alignTo(GPRSaveSize, 16);
-
   for (unsigned i = 0; i < Count; ++i) {
     RegPairInfo RPI;
     RPI.Reg1 = CSI[i].getReg();
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8c30c4410c09..9d879886d39d 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9586,8 +9586,8 @@ static bool performTBISimplification(SDValue Addr,
                                      SelectionDAG &DAG) {
   APInt DemandedMask = APInt::getLowBitsSet(64, 56);
   KnownBits Known;
-  TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
-                                        DCI.isBeforeLegalizeOps());
+  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                        !DCI.isBeforeLegalizeOps());
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
     DCI.CommitTargetLoweringOpt(TLO);
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 0dcf07f98412..5049a39814f1 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -37,6 +37,9 @@ def HasFullFP16      : Predicate<"Subtarget->hasFullFP16()">,
                        AssemblerPredicate<"FeatureFullFP16", "fullfp16">;
 def HasSPE           : Predicate<"Subtarget->hasSPE()">,
                        AssemblerPredicate<"FeatureSPE", "spe">;
+def HasFuseAES       : Predicate<"Subtarget->hasFuseAES()">,
+                       AssemblerPredicate<"FeatureFuseAES",
+                       "fuse-aes">;
 def HasSVE           : Predicate<"Subtarget->hasSVE()">,
                        AssemblerPredicate<"FeatureSVE", "sve">;
 
@@ -5304,6 +5307,31 @@ def AESDrr   : AESTiedInst<0b0101, "aesd",   int_aarch64_crypto_aesd>;
 def AESMCrr  : AESInst<    0b0110, "aesmc",  int_aarch64_crypto_aesmc>;
 def AESIMCrr : AESInst<    0b0111, "aesimc", int_aarch64_crypto_aesimc>;
 
+// Pseudo instructions for AESMCrr/AESIMCrr with a register constraint required
+// for AES fusion on some CPUs.
+let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in {
+def AESMCrrTied: Pseudo<(outs V128:$Rd), (ins V128:$Rn), [], "$Rn = $Rd">,
+                        Sched<[WriteV]>;
+def AESIMCrrTied: Pseudo<(outs V128:$Rd), (ins V128:$Rn), [], "$Rn = $Rd">,
+                         Sched<[WriteV]>;
+}
+
+// Only use constrained versions of AES(I)MC instructions if they are paired with
+// AESE/AESD.
+def : Pat<(v16i8 (int_aarch64_crypto_aesmc
+            (v16i8 (int_aarch64_crypto_aese (v16i8 V128:$src1),
+                                            (v16i8 V128:$src2))))),
+          (v16i8 (AESMCrrTied (v16i8 (AESErr (v16i8 V128:$src1),
+                                             (v16i8 V128:$src2)))))>,
+          Requires<[HasFuseAES]>;
+
+def : Pat<(v16i8 (int_aarch64_crypto_aesimc
+            (v16i8 (int_aarch64_crypto_aesd (v16i8 V128:$src1),
+                                            (v16i8 V128:$src2))))),
+          (v16i8 (AESIMCrrTied (v16i8 (AESDrr (v16i8 V128:$src1),
+                                              (v16i8 V128:$src2)))))>,
+          Requires<[HasFuseAES]>;
+
 def SHA1Crrr  : SHATiedInstQSV<0b000, "sha1c",   int_aarch64_crypto_sha1c>;
 def SHA1Prrr  : SHATiedInstQSV<0b001, "sha1p",   int_aarch64_crypto_sha1p>;
 def SHA1Mrrr  : SHATiedInstQSV<0b010, "sha1m",   int_aarch64_crypto_sha1m>;
diff --git a/lib/Target/AArch64/AArch64MacroFusion.cpp b/lib/Target/AArch64/AArch64MacroFusion.cpp
index ccc9d2ad1b48..963cfadc54fd 100644
--- a/lib/Target/AArch64/AArch64MacroFusion.cpp
+++ b/lib/Target/AArch64/AArch64MacroFusion.cpp
@@ -118,11 +118,13 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
   // Fuse AES crypto operations.
   switch(SecondOpcode) {
   // AES encode.
-  case AArch64::AESMCrr :
+  case AArch64::AESMCrr:
+  case AArch64::AESMCrrTied:
     return FirstOpcode == AArch64::AESErr ||
            FirstOpcode == AArch64::INSTRUCTION_LIST_END;
   // AES decode.
   case AArch64::AESIMCrr:
+  case AArch64::AESIMCrrTied:
     return FirstOpcode == AArch64::AESDrr ||
            FirstOpcode == AArch64::INSTRUCTION_LIST_END;
   }
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index ba8eb8656585..7563bffd8f87 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -3984,6 +3984,13 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
     if (Offset != MFI.getObjectOffset(FI))
       return false;
 
+    // If this is not byval, check that the argument stack object is immutable.
+    // inalloca and argument copy elision can create mutable argument stack
+    // objects. Byval objects can be mutated, but a byval call intends to pass the
+    // mutated memory.
+    if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
+      return false;
+
     if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
       // If the argument location is wider than the argument type, check that any
       // extension flags match.
@@ -30605,8 +30612,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
     APInt DemandedMask(APInt::getSignMask(BitWidth));
     KnownBits Known;
-    TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
-                                          DCI.isBeforeLegalizeOps());
+    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                          !DCI.isBeforeLegalizeOps());
     if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
         TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
       // If we changed the computation somewhere in the DAG, this change will
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index fe87bbd99473..650e4fc8716c 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -3697,8 +3697,7 @@ let SchedRW = [WriteNop] in {
 // Pause. This "instruction" is encoded as "rep; nop", so even though it
 // was introduced with SSE2, it's backward compatible.
 def PAUSE : I<0x90, RawFrm, (outs), (ins),
-              "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>,
-              OBXS, Requires<[HasSSE2]>;
+              "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>, OBXS;
 }
 
 let SchedRW = [WriteFence] in {