Diffstat (limited to 'lib/Target')
89 files changed, 1521 insertions, 828 deletions
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 67138f41dda8..2ff2ee347f56 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -583,6 +583,20 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
   switch (MI->getOpcode()) {
   default:
     break;
+  case AArch64::MOVIv2d_ns:
+    // If the target has <rdar://problem/16473581>, lower this
+    // instruction to movi.16b instead.
+    if (STI->hasZeroCycleZeroingFPWorkaround() &&
+        MI->getOperand(1).getImm() == 0) {
+      MCInst TmpInst;
+      TmpInst.setOpcode(AArch64::MOVIv16b_ns);
+      TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
+      TmpInst.addOperand(MCOperand::createImm(MI->getOperand(1).getImm()));
+      EmitToStreamer(*OutStreamer, TmpInst);
+      return;
+    }
+    break;
+
   case AArch64::DBG_VALUE: {
     if (isVerbose() && OutStreamer->hasRawTextSupport()) {
       SmallString<128> TmpStr;
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index fd1699fd363d..022200986d2b 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -5135,11 +5135,12 @@ bool AArch64FastISel::fastSelectInstruction(const Instruction *I) {
     return selectAtomicCmpXchg(cast<AtomicCmpXchgInst>(I));
   }
 
-  // fall-back to target-independent instruction selection.
-  return selectOperator(I, I->getOpcode());
   // Silence warnings.
   (void)&CC_AArch64_DarwinPCS_VarArg;
   (void)&CC_AArch64_Win64_VarArg;
+
+  // fall-back to target-independent instruction selection.
+  return selectOperator(I, I->getOpcode());
 }
 
 namespace llvm {
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index 73944359223a..d66f7b59a4b5 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -97,6 +97,7 @@
 #include "AArch64RegisterInfo.h"
 #include "AArch64Subtarget.h"
 #include "AArch64TargetMachine.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
@@ -335,6 +336,22 @@ bool AArch64FrameLowering::canUseAsPrologue(
   return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
 }
 
+static bool windowsRequiresStackProbe(MachineFunction &MF,
+                                      unsigned StackSizeInBytes) {
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  if (!Subtarget.isTargetWindows())
+    return false;
+  const Function &F = MF.getFunction();
+  // TODO: When implementing stack protectors, take that into account
+  // for the probe threshold.
+  unsigned StackProbeSize = 4096;
+  if (F.hasFnAttribute("stack-probe-size"))
+    F.getFnAttribute("stack-probe-size")
+        .getValueAsString()
+        .getAsInteger(0, StackProbeSize);
+  return StackSizeInBytes >= StackProbeSize;
+}
+
 bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
     MachineFunction &MF, unsigned StackBumpBytes) const {
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
@@ -347,7 +364,7 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
 
   // 512 is the maximum immediate for stp/ldp that will be used for
   // callee-save save/restores
-  if (StackBumpBytes >= 512)
+  if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes))
     return false;
 
   if (MFI.hasVarSizedObjects())
@@ -478,7 +495,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     return;
 
   int NumBytes = (int)MFI.getStackSize();
-  if (!AFI->hasStackFrame()) {
+  if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
     assert(!HasFP && "unexpected function without stack frame but with FP");
 
     // All of the stack allocation is for locals.
@@ -550,6 +567,44 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
         MachineInstr::FrameSetup);
   }
 
+  if (windowsRequiresStackProbe(MF, NumBytes)) {
+    uint32_t NumWords = NumBytes >> 4;
+
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
+        .addImm(NumWords)
+        .setMIFlags(MachineInstr::FrameSetup);
+
+    switch (MF.getTarget().getCodeModel()) {
+    case CodeModel::Small:
+    case CodeModel::Medium:
+    case CodeModel::Kernel:
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
+          .addExternalSymbol("__chkstk")
+          .addReg(AArch64::X15, RegState::Implicit)
+          .setMIFlags(MachineInstr::FrameSetup);
+      break;
+    case CodeModel::Large:
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
+          .addReg(AArch64::X16, RegState::Define)
+          .addExternalSymbol("__chkstk")
+          .addExternalSymbol("__chkstk")
+          .setMIFlags(MachineInstr::FrameSetup);
+
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::BLR))
+          .addReg(AArch64::X16, RegState::Kill)
+          .addReg(AArch64::X15, RegState::Implicit | RegState::Define)
+          .setMIFlags(MachineInstr::FrameSetup);
+      break;
+    }
+
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
+        .addReg(AArch64::SP, RegState::Kill)
+        .addReg(AArch64::X15, RegState::Kill)
+        .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
+        .setMIFlags(MachineInstr::FrameSetup);
+    NumBytes = 0;
+  }
+
   // Allocate space for the rest of the frame.
   if (NumBytes) {
     const bool NeedsRealignment = RegInfo->needsStackRealignment(MF);
@@ -1164,18 +1219,32 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
   unsigned UnspilledCSGPR = AArch64::NoRegister;
   unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
 
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+
+  unsigned BasePointerReg = RegInfo->hasBasePointer(MF)
+                                ? RegInfo->getBaseRegister()
+                                : (unsigned)AArch64::NoRegister;
+
+  unsigned SpillEstimate = SavedRegs.count();
+  for (unsigned i = 0; CSRegs[i]; ++i) {
+    unsigned Reg = CSRegs[i];
+    unsigned PairedReg = CSRegs[i ^ 1];
+    if (Reg == BasePointerReg)
+      SpillEstimate++;
+    if (produceCompactUnwindFrame(MF) && !SavedRegs.test(PairedReg))
+      SpillEstimate++;
+  }
+  SpillEstimate += 2; // Conservatively include FP+LR in the estimate
+  unsigned StackEstimate = MFI.estimateStackSize(MF) + 8 * SpillEstimate;
+
   // The frame record needs to be created by saving the appropriate registers
-  if (hasFP(MF)) {
+  if (hasFP(MF) || windowsRequiresStackProbe(MF, StackEstimate)) {
     SavedRegs.set(AArch64::FP);
     SavedRegs.set(AArch64::LR);
   }
 
-  unsigned BasePointerReg = AArch64::NoRegister;
-  if (RegInfo->hasBasePointer(MF))
-    BasePointerReg = RegInfo->getBaseRegister();
-
   unsigned ExtraCSSpill = 0;
-  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
   // Figure out which callee-saved registers to save/restore.
   for (unsigned i = 0; CSRegs[i]; ++i) {
     const unsigned Reg = CSRegs[i];
@@ -1217,7 +1286,6 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
 
   // The CSR spill slots have not been allocated yet, so estimateStackSize
   // won't include them.
-  MachineFrameInfo &MFI = MF.getFrameInfo();
   unsigned CFSize = MFI.estimateStackSize(MF) + 8 * NumRegsSpilled;
   DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");
   unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1242cf5be188..6f7b2b6fd5b5 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -470,10 +470,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   if (Subtarget->hasPerfMon())
     setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
 
-  if (Subtarget->isTargetMachO()) {
-    // For iOS, we don't want to the normal expansion of a libcall to
-    // sincos. We want to issue a libcall to __sincos_stret to avoid memory
-    // traffic.
+  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
+      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
+    // Issue __sincos_stret if available.
     setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
     setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
   } else {
@@ -2328,8 +2327,9 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
   Entry.IsZExt = false;
   Args.push_back(Entry);
 
-  const char *LibcallName =
-      (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
+  RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
+                                        : RTLIB::SINCOS_STRET_F32;
+  const char *LibcallName = getLibcallName(LC);
   SDValue Callee =
       DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index c7c560a81328..abbba7d1d5a9 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4963,16 +4963,9 @@ void AArch64InstrInfo::insertOutlinerEpilogue(
     MachineBasicBlock &MBB, MachineFunction &MF,
     const MachineOutlinerInfo &MInfo) const {
-  bool ContainsCalls = false;
-
-  for (MachineInstr &MI : MBB) {
-    if (MI.isCall()) {
-      ContainsCalls = true;
-      break;
-    }
-  }
-
-  if (ContainsCalls) {
+  // Is there a call in the outlined range?
+  if (std::any_of(MBB.instr_begin(), MBB.instr_end(),
+                  [](MachineInstr &MI) { return MI.isCall(); })) {
     // Fix up the instructions in the range, since we're going to modify the
     // stack.
     fixupPostOutline(MBB);
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 7f5507371fa0..a719d47618e5 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -25,11 +25,11 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
   ConstantSDNode *SizeValue = dyn_cast<ConstantSDNode>(Size);
   const AArch64Subtarget &STI =
       DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
-  const char *bzeroEntry =
-      (V && V->isNullValue()) ? STI.getBZeroEntry() : nullptr;
+  const char *bzeroName = (V && V->isNullValue())
+      ? DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO) : nullptr;
   // For small size (< 256), it is not beneficial to use bzero
   // instead of memset.
-  if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) {
+  if (bzeroName && (!SizeValue || SizeValue->getZExtValue() > 256)) {
     const AArch64TargetLowering &TLI = *STI.getTargetLowering();
 
     EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
@@ -45,7 +45,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
     CLI.setDebugLoc(dl)
         .setChain(Chain)
         .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
-                      DAG.getExternalSymbol(bzeroEntry, IntPtr),
+                      DAG.getExternalSymbol(bzeroName, IntPtr),
                       std::move(Args))
         .setDiscardResult();
     std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index e397d585ae77..688bb936d0ca 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -217,19 +217,6 @@ unsigned char AArch64Subtarget::classifyGlobalFunctionReference(
   return AArch64II::MO_NO_FLAG;
 }
 
-/// This function returns the name of a function which has an interface
-/// like the non-standard bzero function, if such a function exists on
-/// the current subtarget and it is considered prefereable over
-/// memset with zero passed as the second argument. Otherwise it
-/// returns null.
-const char *AArch64Subtarget::getBZeroEntry() const {
-  // Prefer bzero on Darwin only.
-  if(isTargetDarwin())
-    return "bzero";
-
-  return nullptr;
-}
-
 void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                            unsigned NumRegionInstrs) const {
   // LNT run (at least on Cyclone) showed reasonably significant gains for
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 5d9759d363dd..9245b2f396b7 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -309,13 +309,6 @@ public:
   unsigned char classifyGlobalFunctionReference(const GlobalValue *GV,
                                                 const TargetMachine &TM) const;
 
-  /// This function returns the name of a function which has an interface
-  /// like the non-standard bzero function, if such a function exists on
-  /// the current subtarget and it is considered prefereable over
-  /// memset with zero passed as the second argument. Otherwise it
-  /// returns null.
-  const char *getBZeroEntry() const;
-
   void overrideSchedPolicy(MachineSchedPolicy &Policy,
                            unsigned NumRegionInstrs) const override;
diff --git a/lib/Target/AArch64/AArch64SystemOperands.td b/lib/Target/AArch64/AArch64SystemOperands.td
index df939add70fa..66b7e02ceb99 100644
--- a/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/lib/Target/AArch64/AArch64SystemOperands.td
@@ -322,6 +322,9 @@ def : ROSysReg<"PMCEID0_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b110>;
 def : ROSysReg<"PMCEID1_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b111>;
 def : ROSysReg<"MIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b000>;
 def : ROSysReg<"CCSIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b000>;
+def : ROSysReg<"CCSIDR2_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b010> {
+  let Requires = [{ {AArch64::HasV8_3aOps} }];
+}
 def : ROSysReg<"CLIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b001>;
 def : ROSysReg<"CTR_EL0", 0b11, 0b011, 0b0000, 0b0000, 0b001>;
 def : ROSysReg<"MPIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b101>;
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index 64583ead73f2..0e6ad944c141 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -346,10 +346,9 @@ public:
 
 } // end anonymous namespace
 
-TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() {
-  return TargetIRAnalysis([this](const Function &F) {
-    return TargetTransformInfo(AArch64TTIImpl(this, F));
-  });
+TargetTransformInfo
+AArch64TargetMachine::getTargetTransformInfo(const Function &F) {
+  return TargetTransformInfo(AArch64TTIImpl(this, F));
 }
 
 TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) {
diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h
index 2bbfb2da3db6..8d28a5e30ebf 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/lib/Target/AArch64/AArch64TargetMachine.h
@@ -44,8 +44,7 @@ public:
   // Pass Pipeline Configuration
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 
-  /// \brief Get the TargetIRAnalysis for this target.
-  TargetIRAnalysis getTargetIRAnalysis() override;
+  TargetTransformInfo getTargetTransformInfo(const Function &F) override;
 
   TargetLoweringObjectFile* getObjFileLowering() const override {
     return TLOF.get();
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index aeffbd70fc81..6e63783e5646 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -1975,10 +1975,6 @@ static bool isValidSVEKind(StringRef Name) {
       .Default(false);
 }
 
-static bool isSVERegister(StringRef Name) {
-  return Name[0] == 'z' || Name[0] == 'p';
-}
-
 static void parseValidVectorKind(StringRef Name, unsigned &NumElements,
                                  char &ElementKind) {
   assert(isValidVectorKind(Name));
@@ -2008,21 +2004,19 @@ bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
 
 // Matches a register name or register alias previously defined by '.req'
 unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name,
                                                   RegKind Kind) {
-  unsigned RegNum;
-  switch (Kind) {
-  case RegKind::Scalar:
-    RegNum = MatchRegisterName(Name);
-    break;
-  case RegKind::NeonVector:
-    RegNum = MatchNeonVectorRegName(Name);
-    break;
-  case RegKind::SVEDataVector:
-    RegNum = matchSVEDataVectorRegName(Name);
-    break;
-  case RegKind::SVEPredicateVector:
-    RegNum = matchSVEPredicateVectorRegName(Name);
-    break;
-  }
+  unsigned RegNum = 0;
+  if ((RegNum = matchSVEDataVectorRegName(Name)))
+    return Kind == RegKind::SVEDataVector ? RegNum : 0;
+
+  if ((RegNum = matchSVEPredicateVectorRegName(Name)))
+    return Kind == RegKind::SVEPredicateVector ? RegNum : 0;
+
+  if ((RegNum = MatchNeonVectorRegName(Name)))
+    return Kind == RegKind::NeonVector ? RegNum : 0;
+
+  // The parsed register must be of RegKind Scalar
+  if ((RegNum = MatchRegisterName(Name)))
+    return Kind == RegKind::Scalar ? RegNum : 0;
 
   if (!RegNum) {
     // Check for aliases registered via .req. Canonicalize to lower case.
@@ -2049,10 +2043,8 @@ int AArch64AsmParser::tryParseRegister() {
     return -1;
 
   std::string lowerCase = Tok.getString().lower();
-  if (isSVERegister(lowerCase))
-    return -1;
-
   unsigned RegNum = matchRegisterNameAlias(lowerCase, RegKind::Scalar);
+
   // Also handle a few aliases of registers.
   if (RegNum == 0)
     RegNum = StringSwitch<unsigned>(lowerCase)
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index bb628b8c558f..fda6252f46e3 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -695,18 +695,24 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
         IsSGPR = false;
         Width = 3;
       } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
+        assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
+               "trap handler registers should not be used");
        IsSGPR = true;
         Width = 4;
       } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
         IsSGPR = false;
         Width = 4;
       } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
+        assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
+               "trap handler registers should not be used");
         IsSGPR = true;
         Width = 8;
       } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
         IsSGPR = false;
         Width = 8;
       } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
+        assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
+               "trap handler registers should not be used");
         IsSGPR = true;
         Width = 16;
       } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 3f8a9b1964ca..5c31bddd9b1a 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -202,6 +202,16 @@ public:
 
   const char* getTargetNodeName(unsigned Opcode) const override;
 
+  // FIXME: Turn off MergeConsecutiveStores() before Instruction Selection
+  // for AMDGPU.
+  // A commit ( git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319036
+  // 91177308-0d34-0410-b5e6-96231b3b80d8 ) turned on
+  // MergeConsecutiveStores() before Instruction Selection for all targets.
+  // Enough AMDGPU compiles go into an infinite loop ( MergeConsecutiveStores()
+  // merges two stores; LegalizeStoreOps() un-merges; MergeConsecutiveStores()
+  // re-merges, etc. ) to warrant turning it off for now.
+  bool mergeStoresAfterLegalization() const override { return false; }
+
   bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override {
     return true;
   }
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 6984f4e71613..2042dbf6d5e2 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -571,10 +571,9 @@ public:
 
 } // end anonymous namespace
 
-TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
-  return TargetIRAnalysis([this](const Function &F) {
-    return TargetTransformInfo(AMDGPUTTIImpl(this, F));
-  });
+TargetTransformInfo
+AMDGPUTargetMachine::getTargetTransformInfo(const Function &F) {
+  return TargetTransformInfo(AMDGPUTTIImpl(this, F));
 }
 
 void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
@@ -898,4 +897,3 @@ void GCNPassConfig::addPreEmitPass() {
 TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
   return new GCNPassConfig(*this, PM);
 }
-
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 5043e31f6f5b..5f9b2a7fca20 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -55,7 +55,7 @@ public:
   const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
     return &IntrinsicInfo;
   }
-  TargetIRAnalysis getTargetIRAnalysis() override;
+  TargetTransformInfo getTargetTransformInfo(const Function &F) override;
 
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 2acd7f78faea..ebf656c549ec 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -536,6 +536,10 @@ public:
     return EndLoc;
   }
 
+  SMRange getLocRange() const {
+    return SMRange(StartLoc, EndLoc);
+  }
+
   Modifiers getModifiers() const {
     assert(isRegKind() || isImmTy(ImmTyNone));
     return isRegKind() ? Reg.Mods : Imm.Mods;
   }
@@ -1491,6 +1495,8 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
       case 1: return AMDGPU::TTMP_32RegClassID;
       case 2: return AMDGPU::TTMP_64RegClassID;
      case 4: return AMDGPU::TTMP_128RegClassID;
+      case 8: return AMDGPU::TTMP_256RegClassID;
+      case 16: return AMDGPU::TTMP_512RegClassID;
     }
   } else if (Is == IS_SGPR) {
     switch (RegWidth) {
@@ -1498,8 +1504,8 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
       case 1: return AMDGPU::SGPR_32RegClassID;
       case 2: return AMDGPU::SGPR_64RegClassID;
       case 4: return AMDGPU::SGPR_128RegClassID;
-      case 8: return AMDGPU::SReg_256RegClassID;
-      case 16: return AMDGPU::SReg_512RegClassID;
+      case 8: return AMDGPU::SGPR_256RegClassID;
+      case 16: return AMDGPU::SGPR_512RegClassID;
     }
   }
   return -1;
@@ -1754,6 +1760,11 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands, bool AbsMod) {
   // TODO: add syntactic sugar for 1/(2*PI)
   bool Minus = false;
   if (getLexer().getKind() == AsmToken::Minus) {
+    const AsmToken NextToken = getLexer().peekTok();
+    if (!NextToken.is(AsmToken::Integer) &&
+        !NextToken.is(AsmToken::Real)) {
+      return MatchOperand_NoMatch;
+    }
     Minus = true;
     Parser.Lex();
   }
@@ -1783,7 +1794,7 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands, bool AbsMod) {
     return MatchOperand_Success;
   }
   default:
-    return Minus ? MatchOperand_ParseFail : MatchOperand_NoMatch;
+    return MatchOperand_NoMatch;
   }
 }
@@ -2244,6 +2255,9 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
   return true;
 }
 
+static std::string AMDGPUMnemonicSpellCheck(StringRef S, uint64_t FBS,
+                                            unsigned VariantID = 0);
+
 bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                               OperandVector &Operands,
                                               MCStreamer &Out,
@@ -2286,8 +2300,13 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
 
   case Match_MissingFeature:
     return Error(IDLoc, "instruction not supported on this GPU");
 
-  case Match_MnemonicFail:
-    return Error(IDLoc, "unrecognized instruction mnemonic");
+  case Match_MnemonicFail: {
+    uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
+    std::string Suggestion = AMDGPUMnemonicSpellCheck(
+        ((AMDGPUOperand &)*Operands[0]).getToken(), FBS);
+    return Error(IDLoc, "invalid instruction" + Suggestion,
+                 ((AMDGPUOperand &)*Operands[0]).getLocRange());
+  }
 
   case Match_InvalidOperand: {
     SMLoc ErrorLoc = IDLoc;
@@ -3838,7 +3857,9 @@ AMDGPUAsmParser::parseSwizzleOp(OperandVector &Operands) {
     return Ok? MatchOperand_Success : MatchOperand_ParseFail;
   } else {
-    return MatchOperand_NoMatch;
+    // Swizzle "offset" operand is optional.
+    // If it is omitted, try parsing other optional operands.
+    return parseOptionalOperand(Operands);
   }
 }
 
@@ -4786,6 +4807,7 @@ extern "C" void LLVMInitializeAMDGPUAsmParser() {
 
 #define GET_REGISTER_MATCHER
 #define GET_MATCHER_IMPLEMENTATION
+#define GET_MNEMONIC_SPELL_CHECKER
 #include "AMDGPUGenAsmMatcher.inc"
 
 // This fuction should be defined after auto-generated include so that we have
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 4a3f2c975179..47a2d3f2fdc5 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -348,10 +348,12 @@ MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID,
   case AMDGPU::TTMP_128RegClassID:
   // ToDo: unclear if s[100:104] is available on VI. Can we use VCC as SGPR in
   // this bundle?
-  case AMDGPU::SReg_256RegClassID:
-  // ToDo: unclear if s[96:104] is available on VI. Can we use VCC as SGPR in
+  case AMDGPU::SGPR_256RegClassID:
+  case AMDGPU::TTMP_256RegClassID:
+  // ToDo: unclear if s[96:104] is available on VI. Can we use VCC as SGPR in
   // this bundle?
-  case AMDGPU::SReg_512RegClassID:
+  case AMDGPU::SGPR_512RegClassID:
+  case AMDGPU::TTMP_512RegClassID:
     shift = 2;
     break;
   // ToDo: unclear if s[88:104] is available on VI. Can we use VCC as SGPR in
@@ -441,11 +443,11 @@ MCOperand AMDGPUDisassembler::decodeOperand_SReg_128(unsigned Val) const {
 }
 
 MCOperand AMDGPUDisassembler::decodeOperand_SReg_256(unsigned Val) const {
-  return createSRegOperand(AMDGPU::SReg_256RegClassID, Val);
+  return decodeDstOp(OPW256, Val);
 }
 
 MCOperand AMDGPUDisassembler::decodeOperand_SReg_512(unsigned Val) const {
-  return createSRegOperand(AMDGPU::SReg_512RegClassID, Val);
+  return decodeDstOp(OPW512, Val);
 }
 
 MCOperand AMDGPUDisassembler::decodeLiteralConstant() const {
@@ -593,6 +595,8 @@ unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {
     return SGPR_32RegClassID;
   case OPW64: return SGPR_64RegClassID;
   case OPW128: return SGPR_128RegClassID;
+  case OPW256: return SGPR_256RegClassID;
+  case OPW512: return SGPR_512RegClassID;
  }
 }
 
@@ -608,6 +612,8 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
     return TTMP_32RegClassID;
   case OPW64: return TTMP_64RegClassID;
   case OPW128: return TTMP_128RegClassID;
+  case OPW256: return TTMP_256RegClassID;
+  case OPW512: return TTMP_512RegClassID;
   }
 }
 
@@ -659,6 +665,25 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c
   }
 }
 
+MCOperand AMDGPUDisassembler::decodeDstOp(const OpWidthTy Width, unsigned Val) const {
+  using namespace AMDGPU::EncValues;
+
+  assert(Val < 128);
+  assert(Width == OPW256 || Width == OPW512);
+
+  if (Val <= SGPR_MAX) {
+    assert(SGPR_MIN == 0); // "SGPR_MIN <= Val" is always true and causes compilation warning.
+    return createSRegOperand(getSgprClassId(Width), Val - SGPR_MIN);
+  }
+
+  int TTmpIdx = getTTmpIdx(Val);
+  if (TTmpIdx >= 0) {
+    return createSRegOperand(getTtmpClassId(Width), TTmpIdx);
+  }
+
+  llvm_unreachable("unknown dst register");
+}
+
 MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
   using namespace AMDGPU;
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index ce396eb68c4c..75cfc5e11282 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -95,6 +95,8 @@ public:
     OPW32,
     OPW64,
     OPW128,
+    OPW256,
+    OPW512,
     OPW16,
     OPWV216,
     OPW_LAST_,
@@ -110,6 +112,7 @@ public:
 
   MCOperand decodeLiteralConstant() const;
   MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const;
+  MCOperand decodeDstOp(const OpWidthTy Width, unsigned Val) const;
   MCOperand decodeSpecialReg32(unsigned Val) const;
   MCOperand decodeSpecialReg64(unsigned Val) const;
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
index 67663d39967c..bf57f88bef91 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -335,13 +335,13 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
   } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(RegNo)) {
     O << 'v';
     NumRegs = 8;
-  } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(RegNo)) {
+  } else if (MRI.getRegClass(AMDGPU::SGPR_256RegClassID).contains(RegNo)) {
     O << 's';
     NumRegs = 8;
   } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(RegNo)) {
     O << 'v';
     NumRegs = 16;
-  } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(RegNo)) {
+  } else if (MRI.getRegClass(AMDGPU::SGPR_512RegClassID).contains(RegNo)) {
     O << 's';
     NumRegs = 16;
   } else {
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td
index 6b7c3ffb7bb8..dd0efef7f91b 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -8,6 +8,26 @@
 //===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
+// Helpers
+//===----------------------------------------------------------------------===//
+
+class getSubRegs<int size> {
+  list<SubRegIndex> ret2 = [sub0, sub1];
+  list<SubRegIndex> ret3 = [sub0, sub1, sub2];
+  list<SubRegIndex> ret4 = [sub0, sub1, sub2, sub3];
+  list<SubRegIndex> ret8 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7];
+  list<SubRegIndex> ret16 = [sub0, sub1, sub2, sub3,
+                             sub4, sub5, sub6, sub7,
+                             sub8, sub9, sub10, sub11,
+                             sub12, sub13, sub14, sub15];
+
+  list<SubRegIndex> ret = !if(!eq(size, 2), ret2,
+                              !if(!eq(size, 3), ret3,
+                                  !if(!eq(size, 4), ret4,
+                                      !if(!eq(size, 8), ret8, ret16))));
+}
+
+//===----------------------------------------------------------------------===//
 // Declarations that describe the SI registers
 //===----------------------------------------------------------------------===//
 class SIReg <string n, bits<16> regIdx = 0> : Register<n>,
@@ -141,19 +161,19 @@ def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
 }
 
 // SGPR 64-bit registers
-def SGPR_64Regs : RegisterTuples<[sub0, sub1],
+def SGPR_64Regs : RegisterTuples<getSubRegs<2>.ret,
                              [(add (decimate SGPR_32, 2)),
                               (add (decimate (shl SGPR_32, 1), 2))]>;
 
 // SGPR 128-bit registers
-def SGPR_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3],
+def SGPR_128Regs : RegisterTuples<getSubRegs<4>.ret,
                               [(add (decimate SGPR_32, 4)),
                                (add (decimate (shl SGPR_32, 1), 4)),
                                (add (decimate (shl SGPR_32, 2), 4)),
                                (add (decimate (shl SGPR_32, 3), 4))]>;
 
 // SGPR 256-bit registers
-def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
+def SGPR_256Regs : RegisterTuples<getSubRegs<8>.ret,
                               [(add (decimate SGPR_32, 4)),
                                (add (decimate (shl SGPR_32, 1), 4)),
                                (add (decimate (shl SGPR_32, 2), 4)),
@@ -164,8 +184,7 @@ def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
                                (add (decimate (shl SGPR_32, 7), 4))]>;
 
 // SGPR 512-bit registers
-def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
-                               sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15],
+def SGPR_512Regs : RegisterTuples<getSubRegs<16>.ret,
                               [(add (decimate SGPR_32, 4)),
                                (add (decimate (shl SGPR_32, 1), 4)),
                                (add (decimate (shl SGPR_32, 2), 4)),
@@ -190,47 +209,125 @@ def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
 }
 
 // Trap handler TMP 64-bit registers
-def TTMP_64Regs : RegisterTuples<[sub0, sub1],
+def TTMP_64Regs : RegisterTuples<getSubRegs<2>.ret,
                              [(add (decimate TTMP_32, 2)),
                               (add (decimate (shl TTMP_32, 1), 2))]>;
 
 // Trap handler TMP 128-bit registers
-def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3],
+def TTMP_128Regs : RegisterTuples<getSubRegs<4>.ret,
                               [(add (decimate TTMP_32, 4)),
                                (add (decimate (shl TTMP_32, 1), 4)),
                                (add (decimate (shl TTMP_32, 2), 4)),
                                (add (decimate (shl TTMP_32, 3), 4))]>;
 
-class TmpRegTuples <string tgt,
-                    bit Is64Bit,
-                    int Index0,
-                    int Index1 = !add(Index0, 1),
-                    int Index2 = !add(Index0, !if(Is64Bit, 1, 2)),
-                    int Index3 = !add(Index0, !if(Is64Bit, 1, 3)),
-                    string name = "ttmp["#Index0#":"#Index3#"]",
-                    Register r0 = !cast<Register>("TTMP"#Index0#tgt),
-                    Register r1 = !cast<Register>("TTMP"#Index1#tgt),
-                    Register r2 = !cast<Register>("TTMP"#Index2#tgt),
-                    Register r3 = !cast<Register>("TTMP"#Index3#tgt)> :
-  RegisterWithSubRegs<name, !if(Is64Bit, [r0, r1], [r0, r1, r2, r3])> {
-  let SubRegIndices = !if(Is64Bit, [sub0, sub1], [sub0, sub1, sub2, sub3]);
-  let HWEncoding = r0.HWEncoding;
-}
+def TTMP_256Regs : RegisterTuples<getSubRegs<8>.ret,
+                              [(add (decimate TTMP_32, 4)),
+                               (add (decimate (shl TTMP_32, 1), 4)),
+                               (add (decimate (shl TTMP_32, 2), 4)),
+                               (add (decimate (shl TTMP_32, 3), 4)),
+                               (add (decimate (shl TTMP_32, 4), 4)),
+                               (add (decimate (shl TTMP_32, 5), 4)),
+                               (add (decimate (shl TTMP_32, 6), 4)),
+                               (add (decimate (shl TTMP_32, 7), 4))]>;
+
+def TTMP_512Regs : RegisterTuples<getSubRegs<16>.ret,
+                              [(add (decimate TTMP_32, 4)),
+                               (add (decimate (shl TTMP_32, 1), 4)),
+                               (add (decimate (shl TTMP_32, 2), 4)),
+                               (add (decimate (shl TTMP_32, 3), 4)),
+                               (add (decimate (shl TTMP_32, 4), 4)),
+                               (add (decimate (shl TTMP_32, 5), 4)),
+                               (add (decimate (shl TTMP_32, 6), 4)),
+                               (add (decimate (shl TTMP_32, 7), 4)),
+                               (add (decimate (shl TTMP_32, 8), 4)),
+                               (add (decimate (shl TTMP_32, 9), 4)),
+                               (add (decimate (shl TTMP_32, 10), 4)),
+                               (add (decimate (shl TTMP_32, 11), 4)),
+                               (add (decimate (shl TTMP_32, 12), 4)),
+                               (add (decimate (shl TTMP_32, 13), 4)),
+                               (add (decimate (shl TTMP_32, 14), 4)),
+                               (add (decimate (shl TTMP_32, 15), 4))]>;
+
+class TmpRegTuplesBase<int index, int size,
+                       list<Register> subRegs,
+                       list<SubRegIndex> indices = getSubRegs<size>.ret,
+                       int index1 = !add(index, !add(size, -1)),
+                       string name = "ttmp["#index#":"#index1#"]"> :
+  RegisterWithSubRegs<name, subRegs> {
+  let HWEncoding = subRegs[0].HWEncoding;
+  let SubRegIndices = indices;
+}
+
+class TmpRegTuples<string tgt,
+                   int size,
+                   int index0,
+                   int index1 = !add(index0, 1),
+                   int index2 = !add(index0, !if(!eq(size, 2), 1, 2)),
+                   int index3 = !add(index0, !if(!eq(size, 2), 1, 3)),
+                   int index4 = !add(index0, !if(!eq(size, 8), 4, 1)),
+                   int index5 = !add(index0, !if(!eq(size, 8), 5, 1)),
+                   int index6 = !add(index0, !if(!eq(size, 8), 6, 1)),
+                   int index7 = !add(index0, !if(!eq(size, 8), 7, 1)),
+                   Register r0 = !cast<Register>("TTMP"#index0#tgt),
+                   Register r1 = !cast<Register>("TTMP"#index1#tgt),
+                   Register r2 = !cast<Register>("TTMP"#index2#tgt),
+                   Register r3 = !cast<Register>("TTMP"#index3#tgt),
+                   Register r4 = !cast<Register>("TTMP"#index4#tgt),
+                   Register r5 = !cast<Register>("TTMP"#index5#tgt),
+                   Register r6 = !cast<Register>("TTMP"#index6#tgt),
+                   Register r7 = !cast<Register>("TTMP"#index7#tgt)> :
+  TmpRegTuplesBase<index0, size,
+                   !if(!eq(size, 2), [r0, r1],
+                       !if(!eq(size, 4), [r0, r1, r2, r3],
+                           [r0, r1, r2, r3, r4, r5, r6, r7])),
+                   getSubRegs<size>.ret>;
 
 foreach Index = {0, 2, 4, 6, 8, 10, 12, 14} in {
-  def TTMP#Index#_TTMP#!add(Index,1)#_vi   : TmpRegTuples<"_vi",   1, Index>;
-  def TTMP#Index#_TTMP#!add(Index,1)#_gfx9 : TmpRegTuples<"_gfx9", 1, Index>;
+  def TTMP#Index#_TTMP#!add(Index,1)#_vi   : TmpRegTuples<"_vi",   2, Index>;
+  def TTMP#Index#_TTMP#!add(Index,1)#_gfx9 : TmpRegTuples<"_gfx9", 2, Index>;
 }
 
 foreach Index = {0, 4, 8, 12} in {
   def TTMP#Index#_TTMP#!add(Index,1)#
                  _TTMP#!add(Index,2)#
-                 _TTMP#!add(Index,3)#_vi   : TmpRegTuples<"_vi",   0, Index>;
+                 _TTMP#!add(Index,3)#_vi   : TmpRegTuples<"_vi",   4, Index>;
   def TTMP#Index#_TTMP#!add(Index,1)#
                  _TTMP#!add(Index,2)#
-                 _TTMP#!add(Index,3)#_gfx9 : TmpRegTuples<"_gfx9", 0, Index>;
+                 _TTMP#!add(Index,3)#_gfx9 : TmpRegTuples<"_gfx9", 4, Index>;
 }
 
+foreach Index = {0, 4, 8} in {
+  def TTMP#Index#_TTMP#!add(Index,1)#
+                 _TTMP#!add(Index,2)#
+                 _TTMP#!add(Index,3)#
+                 _TTMP#!add(Index,4)#
+                 _TTMP#!add(Index,5)#
+                 _TTMP#!add(Index,6)#
+                 _TTMP#!add(Index,7)#_vi   : TmpRegTuples<"_vi",   8, Index>;
  def TTMP#Index#_TTMP#!add(Index,1)#
+                 _TTMP#!add(Index,2)#
+                 _TTMP#!add(Index,3)#
+                 _TTMP#!add(Index,4)#
+                 _TTMP#!add(Index,5)#
+                 _TTMP#!add(Index,6)#
+                 _TTMP#!add(Index,7)#_gfx9 : TmpRegTuples<"_gfx9", 8, Index>;
+}
+
+def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_vi :
+  TmpRegTuplesBase<0, 16,
+                   [TTMP0_vi, TTMP1_vi, TTMP2_vi, TTMP3_vi,
+                    TTMP4_vi, TTMP5_vi, TTMP6_vi, TTMP7_vi,
+                    TTMP8_vi, TTMP9_vi, TTMP10_vi, TTMP11_vi,
+                    TTMP12_vi, TTMP13_vi, TTMP14_vi, TTMP15_vi]>;
+
+def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_gfx9 :
+  TmpRegTuplesBase<0, 16,
+                   [TTMP0_gfx9, TTMP1_gfx9, TTMP2_gfx9, TTMP3_gfx9,
+                    TTMP4_gfx9, TTMP5_gfx9, TTMP6_gfx9, TTMP7_gfx9,
+                    TTMP8_gfx9, TTMP9_gfx9, TTMP10_gfx9, TTMP11_gfx9,
+                    TTMP12_gfx9, TTMP13_gfx9, TTMP14_gfx9, TTMP15_gfx9]>;
+
+
 // VGPR 32-bit registers
 // i16/f16 only on VI+
 def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
@@ -240,25 +337,25 @@ def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
 }
 
 // VGPR 64-bit registers
-def VGPR_64 : RegisterTuples<[sub0, sub1],
+def VGPR_64 : RegisterTuples<getSubRegs<2>.ret,
                              [(add (trunc VGPR_32, 255)),
                               (add (shl VGPR_32, 1))]>;
 
 // VGPR 96-bit registers
-def VGPR_96 : RegisterTuples<[sub0, sub1, sub2],
+def VGPR_96 : RegisterTuples<getSubRegs<3>.ret,
                              [(add (trunc VGPR_32, 254)),
                               (add (shl VGPR_32, 1)),
                               (add (shl VGPR_32, 2))]>;
 
 // VGPR 128-bit registers
-def VGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3],
+def VGPR_128 : RegisterTuples<getSubRegs<4>.ret,
                               [(add (trunc VGPR_32, 253)),
                                (add (shl VGPR_32, 1)),
                                (add (shl VGPR_32, 2)),
                                (add (shl VGPR_32, 3))]>;
 
 // VGPR 256-bit registers
-def VGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
+def VGPR_256 : RegisterTuples<getSubRegs<8>.ret,
                               [(add (trunc VGPR_32, 249)),
                                (add (shl VGPR_32, 1)),
                                (add (shl VGPR_32, 2)),
@@ -269,8 +366,7 @@ def VGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
                                (add (shl VGPR_32, 7))]>;
 
 // VGPR 512-bit registers
-def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
-                               sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15],
+def VGPR_512 : RegisterTuples<getSubRegs<16>.ret,
                               [(add (trunc VGPR_32, 241)),
                                (add (shl VGPR_32, 1)),
                                (add (shl VGPR_32, 2)),
@@ -368,13 +464,31 @@ def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32,
 
 } // End CopyCost = 2
 
-def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256)> {
+def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs)> {
+  let AllocationPriority = 11;
+}
+
+def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add TTMP_256Regs)> {
+  let isAllocatable = 0;
+}
+
+def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32,
+  (add SGPR_256, TTMP_256)> {
   // Requires 4 s_mov_b64 to copy
   let CopyCost = 4;
   let AllocationPriority = 11;
 }
 
-def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add SGPR_512)> {
+def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add SGPR_512Regs)> {
+  let AllocationPriority = 12;
+}
+
+def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add TTMP_512Regs)> {
+  let isAllocatable = 0;
+}
+
+def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
+  (add SGPR_512, TTMP_512)> {
   // Requires 8 s_mov_b64 to copy
   let CopyCost = 8;
   let AllocationPriority = 12;
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 819a7add0be4..125a3b22d0cf 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -667,6 +667,10 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) {
   CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7) \
   CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11) \
   CASE_VI_GFX9(TTMP12_TTMP13_TTMP14_TTMP15) \
+  CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7) \
+  CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \
+  CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
+  CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
 }
 
 #define CASE_CI_VI(node) \
diff --git a/lib/Target/ARC/ARCTargetMachine.cpp b/lib/Target/ARC/ARCTargetMachine.cpp
index d2512c281a61..1acae3a88870 100644
--- a/lib/Target/ARC/ARCTargetMachine.cpp
+++ b/lib/Target/ARC/ARCTargetMachine.cpp
@@ -88,8 +88,7 @@ extern "C" void LLVMInitializeARCTarget() {
   RegisterTargetMachine<ARCTargetMachine> X(getTheARCTarget());
 }
 
-TargetIRAnalysis ARCTargetMachine::getTargetIRAnalysis() {
-  return TargetIRAnalysis([this](const Function &F) {
-    return TargetTransformInfo(ARCTTIImpl(this, F));
-  });
+TargetTransformInfo
+ARCTargetMachine::getTargetTransformInfo(const Function &F) {
+  return TargetTransformInfo(ARCTTIImpl(this, F));
 }
diff --git a/lib/Target/ARC/ARCTargetMachine.h b/lib/Target/ARC/ARCTargetMachine.h
index 98021b3dc1d5..18117e3409af 100644
--- a/lib/Target/ARC/ARCTargetMachine.h
+++ b/lib/Target/ARC/ARCTargetMachine.h
@@ -40,7 +40,7 @@ public:
 
   // Pass Pipeline Configuration
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
-  TargetIRAnalysis getTargetIRAnalysis() override;
+  TargetTransformInfo getTargetTransformInfo(const Function &F) override;
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
   }
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
index 3aac689c6310..9ffb4c2055f9 100644
--- a/lib/Target/ARM/ARM.h
+++ b/lib/Target/ARM/ARM.h
@@ -61,6 +61,7 @@ void initializeARMLoadStoreOptPass(PassRegistry &);
 void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
 void initializeARMConstantIslandsPass(PassRegistry &);
 void initializeARMExpandPseudoPass(PassRegistry &);
+void initializeThumb2SizeReducePass(PassRegistry &);
 
 } // end namespace llvm
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index c1a3f639461d..c9766aa2161a 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -83,6 +83,9 @@ def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true",
 def FeatureV7Clrex : SubtargetFeature<"v7clrex", "HasV7Clrex", "true",
                                       "Has v7 clrex instruction">;
 
+def FeatureDFB : SubtargetFeature<"dfb", "HasFullDataBarrier", "true",
+                                  "Has full data barrier (dfb) instruction">;
+
 def FeatureAcquireRelease : SubtargetFeature<"acquire-release",
                                              "HasAcquireRelease", "true",
                                              "Has v8 acquire/release (lda/ldaex "
@@ -617,6 +620,7 @@ def ARMv83a : Architecture<"armv8.3-a", "ARMv83a", [HasV8_3aOps,
 def ARMv8r : Architecture<"armv8-r", "ARMv8r", [HasV8Ops,
                                                 FeatureRClass,
                                                 FeatureDB,
+                                                FeatureDFB,
                                                 FeatureDSP,
                                                 FeatureCRC,
                                                 FeatureMP,
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 0ea435062ec0..60048d4453d8 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -1416,7 +1416,7 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
   case MVT::i8:
   case MVT::i16:
     needsExt = true;
-    // Intentional fall-through.
+    LLVM_FALLTHROUGH;
   case MVT::i32:
     if (isThumb2) {
       if (!UseImm)
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 1b4d7ff50848..aeda7c06a27a 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -1041,7 +1041,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
   if (!Subtarget->isThumb1Only())
     setOperationAction(ISD::SETCCE, MVT::i32, Custom);
 
-  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
   setOperationAction(ISD::BR_CC, MVT::i32, Custom);
   setOperationAction(ISD::BR_CC, MVT::f32, Custom);
   setOperationAction(ISD::BR_CC, MVT::f64, Custom);
@@ -1084,20 +1084,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     }
   }
 
-  // Combine sin / cos into one node or libcall if possible.
-  if (Subtarget->hasSinCos()) {
-    setLibcallName(RTLIB::SINCOS_F32, "sincosf");
-    setLibcallName(RTLIB::SINCOS_F64, "sincos");
-    if (Subtarget->isTargetWatchABI()) {
-      setLibcallCallingConv(RTLIB::SINCOS_F32, CallingConv::ARM_AAPCS_VFP);
-      setLibcallCallingConv(RTLIB::SINCOS_F64, CallingConv::ARM_AAPCS_VFP);
-    }
-    if (Subtarget->isTargetIOS() || Subtarget->isTargetWatchOS()) {
-      // For iOS, we don't want to the normal expansion of a libcall to
-      // sincos. We want to issue a libcall to __sincos_stret.
-      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
-      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
-    }
+  // Use __sincos_stret if available.
+  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
+      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
+    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
   }
 
   // FP-ARMv8 implements a lot of rounding-like FP operations.
@@ -1255,6 +1246,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
 
   case ARMISD::CMOV: return "ARMISD::CMOV";
   case ARMISD::SSAT: return "ARMISD::SSAT";
+  case ARMISD::USAT: return "ARMISD::USAT";
 
   case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG";
   case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG";
@@ -3902,6 +3894,10 @@ ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
   return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
 }
 
+// This function returns three things: the arithmetic computation itself
+// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
+// comparison and the condition code define the case in which the arithmetic
+// computation *does not* overflow.
 std::pair<SDValue, SDValue>
 ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
                                  SDValue &ARMcc) const {
@@ -3927,7 +3923,11 @@ ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
     break;
   case ISD::UADDO:
     ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
-    Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
+    // We use ADDC here to correspond to its use in LowerUnsignedALUO.
+    // We do not use it in the USUBO case as Value may not be used.
+    Value = DAG.getNode(ARMISD::ADDC, dl,
+                        DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
+                .getValue(0);
     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
     break;
   case ISD::SSUBO:
@@ -4205,7 +4205,7 @@ static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
          ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal)));
 }
 
-// Check if two chained conditionals could be converted into SSAT.
+// Check if two chained conditionals could be converted into SSAT or USAT.
 //
 // SSAT can replace a set of two conditional selectors that bound a number to an
 // interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
 //
 //     x < k ? (x < -k ? -k : x) : k
 //     etc.
 //
+// USAT works similarily to SSAT but bounds on the interval [0, k] where k + 1 is
+// a power of 2.
+//
 // It returns true if the conversion can be done, false otherwise.
-// Additionally, the variable is returned in parameter V and the constant in K.
+// Additionally, the variable is returned in parameter V, the constant in K and
+// usat is set to true if the conditional represents an unsigned saturation
 static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
-                                    uint64_t &K) {
+                                    uint64_t &K, bool &usat) {
   SDValue LHS1 = Op.getOperand(0);
   SDValue RHS1 = Op.getOperand(1);
   SDValue TrueVal1 = Op.getOperand(2);
@@ -4286,13 +4290,23 @@ static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
   int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue();
   int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue();
   int64_t PosVal = std::max(Val1, Val2);
+  int64_t NegVal = std::min(Val1, Val2);
 
   if (((Val1 > Val2 && UpperCheckOp == &Op) ||
        (Val1 < Val2 && UpperCheckOp == &Op2)) &&
-      Val1 == ~Val2 && isPowerOf2_64(PosVal + 1)) {
+      isPowerOf2_64(PosVal + 1)) {
+
+    // Handle the difference between USAT (unsigned) and SSAT (signed) saturation
+    if (Val1 == ~Val2)
+      usat = false;
+    else if (NegVal == 0)
+      usat = true;
+    else
+      return false;
 
     V = V2;
     K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive
+
     return true;
   }
 
@@ -4306,10 +4320,16 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   // Try to convert two saturating conditional selects into a single SSAT
   SDValue SatValue;
   uint64_t SatConstant;
+  bool SatUSat;
   if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) &&
-      isSaturatingConditional(Op, SatValue, SatConstant))
-    return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
-                       DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
+      isSaturatingConditional(Op, SatValue, SatConstant, SatUSat)) {
+    if (SatUSat)
+      return DAG.getNode(ARMISD::USAT, dl, VT, SatValue,
+                         DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
+    else
+      return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
+                         DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
+  }
 
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
@@ -4506,6 +4526,39 @@ ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
   return SDValue();
 }
 
+SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Cond = Op.getOperand(1);
+  SDValue Dest = Op.getOperand(2);
+  SDLoc dl(Op);
+
+  // Optimize {s|u}{add|sub}.with.overflow feeding into a branch instruction.
+  unsigned Opc = Cond.getOpcode();
+  if (Cond.getResNo() == 1 && (Opc == ISD::SADDO || Opc == ISD::UADDO ||
+                               Opc == ISD::SSUBO || Opc == ISD::USUBO)) {
+    // Only lower legal XALUO ops.
+    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
+      return SDValue();
+
+    // The actual operation with overflow check.
+    SDValue Value, OverflowCmp;
+    SDValue ARMcc;
+    std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
+
+    // Reverse the condition code.
+    ARMCC::CondCodes CondCode =
+        (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
+    CondCode = ARMCC::getOppositeCondition(CondCode);
+    ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
+    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+
+    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
+                       OverflowCmp);
+  }
+
+  return SDValue();
+}
+
 SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
   SDValue Chain = Op.getOperand(0);
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
@@ -4526,6 +4579,33 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
     }
   }
 
+  // Optimize {s|u}{add|sub}.with.overflow feeding into a branch instruction.
+  unsigned Opc = LHS.getOpcode();
+  if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
+      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
+       Opc == ISD::USUBO) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+    // Only lower legal XALUO ops.
+    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
+      return SDValue();
+
+    // The actual operation with overflow check.
+    SDValue Value, OverflowCmp;
+    SDValue ARMcc;
+    std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);
+
+    if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
+      // Reverse the condition code.
+      ARMCC::CondCodes CondCode =
+          (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
+      CondCode = ARMCC::getOppositeCondition(CondCode);
+      ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
+    }
+    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+
+    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
+                       OverflowCmp);
+  }
+
   if (LHS.getValueType() == MVT::i32) {
     SDValue ARMcc;
     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
@@ -7523,10 +7603,9 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
   Entry.IsZExt = false;
   Args.push_back(Entry);
 
-  const char *LibcallName =
-      (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
   RTLIB::Libcall LC =
-      (ArgVT == MVT::f64) ? RTLIB::SINCOS_F64 : RTLIB::SINCOS_F32;
+      (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
+  const char *LibcallName = getLibcallName(LC);
   CallingConv::ID CC = getLibcallCallingConv(LC);
   SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
@@ -7782,6 +7861,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
   case ISD::SELECT: return LowerSELECT(Op, DAG);
   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
   case ISD::BR_CC: return LowerBR_CC(Op, DAG);
   case ISD::BR_JT: return LowerBR_JT(Op, DAG);
   case ISD::VASTART: return LowerVASTART(Op, DAG);
@@ -13751,7 +13831,7 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
   case AtomicOrdering::SequentiallyConsistent:
     if (!Inst->hasAtomicStore())
       return nullptr; // Nothing to do
-    /*FALLTHROUGH*/
+    LLVM_FALLTHROUGH;
   case AtomicOrdering::Release:
   case AtomicOrdering::AcquireRelease:
     if (Subtarget->preferISHSTBarriers())
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 0a1af8d89f9b..bf63dfae4407 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -87,6 +87,7 @@ class VectorType;
 
       CMOV,         // ARM conditional move instructions.
      SSAT,         // Signed saturation
+      USAT,         // Unsigned saturation
 
       BCC_i64,
@@ -643,6 +644,7 @@ class VectorType;
     SDValue LowerUnsignedALUO(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 4e13af596300..eb8526bfeadf 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -139,6 +139,8 @@ def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov,
 
 def ARMssatnoshift : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>;
 
+def ARMusatnoshift : SDNode<"ARMISD::USAT", SDTIntSatNoShOp, []>;
+
 def ARMbrcond : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond,
                        [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
 
@@ -278,6 +280,9 @@ def HasDSP : Predicate<"Subtarget->hasDSP()">,
 def HasDB : Predicate<"Subtarget->hasDataBarrier()">,
             AssemblerPredicate<"FeatureDB", "data-barriers">;
 
+def HasDFB : Predicate<"Subtarget->hasFullDataBarrier()">,
+             AssemblerPredicate<"FeatureDFB",
+                                "full-data-barrier">;
 def HasV7Clrex : Predicate<"Subtarget->hasV7Clrex()">,
                  AssemblerPredicate<"FeatureV7Clrex", "v7 clrex">;
 
@@ -3832,6 +3837,8 @@ def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm0_31:$pos),
               (USAT imm0_31:$pos, GPRnopc:$a, 0)>;
 def : ARMPat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm),
              (SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>;
+def : ARMPat<(ARMusatnoshift GPRnopc:$Rn, imm0_31:$imm),
+             (USAT imm0_31:$imm, GPRnopc:$Rn, 0)>;
 def : ARMV6Pat<(int_arm_ssat16 GPRnopc:$a, imm1_16:$pos),
                (SSAT16 imm1_16:$pos, GPRnopc:$a)>;
 def : ARMV6Pat<(int_arm_usat16 GPRnopc:$a, imm0_15:$pos),
@@ -5846,6 +5853,8 @@ include "ARMInstrNEON.td"
 def : InstAlias<"dmb", (DMB 0xf), 0>, Requires<[IsARM, HasDB]>;
 def : InstAlias<"dsb", (DSB 0xf), 0>, Requires<[IsARM, HasDB]>;
 def : InstAlias<"isb", (ISB 0xf), 0>, Requires<[IsARM, HasDB]>;
+// Armv8-R 'Data Full Barrier'
+def : InstAlias<"dfb", (DSB 0xc), 1>, Requires<[IsARM, HasDFB]>;
 
 // System instructions
 def : MnemonicAlias<"swi", "svc">;
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 670ed127da7e..4592249f5795 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -2336,6 +2336,8 @@ def t2USAT16: T2SatI<(ins imm0_15:$sat_imm, rGPR:$Rn),
 
 def : T2Pat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm),
             (t2SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>;
+def : T2Pat<(ARMusatnoshift GPRnopc:$Rn, imm0_31:$imm),
+            (t2USAT imm0_31:$imm, GPRnopc:$Rn, 0)>;
 def : T2Pat<(int_arm_ssat GPR:$a, imm1_32:$pos),
             (t2SSAT imm1_32:$pos, GPR:$a, 0)>;
 def : T2Pat<(int_arm_usat GPR:$a, imm0_31:$pos),
@@ -4506,6 +4508,8 @@ def : t2InstAlias<"tst${p} $Rn, $Rm",
 def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p), 0>, Requires<[HasDB]>;
 def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p), 0>, Requires<[HasDB]>;
 def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p), 0>, Requires<[HasDB]>;
+// Armv8-R 'Data Full Barrier'
+def : InstAlias<"dfb${p}", (t2DSB 0xc, pred:$p), 1>, Requires<[HasDFB]>;
 
 // Alias for LDR, LDRB, LDRH, LDRSB, and LDRSH without the ".w" optional
 // width specifier.
diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp
index 6bbeae2e1151..b0fd0b476920 100644
--- a/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -669,13 +669,22 @@ bool ARMInstructionSelector::select(MachineInstr &I,
     return true;
   }
 
+  using namespace TargetOpcode;
+
+  if (I.getOpcode() == G_CONSTANT) {
+    // Pointer constants should be treated the same as 32-bit integer constants.
+    // Change the type and let TableGen handle it.
+    unsigned ResultReg = I.getOperand(0).getReg();
+    LLT Ty = MRI.getType(ResultReg);
+    if (Ty.isPointer())
+      MRI.setType(ResultReg, LLT::scalar(32));
+  }
+
   if (selectImpl(I, CoverageInfo))
     return true;
 
   MachineInstrBuilder MIB{MF, I};
   bool isSExt = false;
 
-  using namespace TargetOpcode;
   switch (I.getOpcode()) {
   case G_SEXT:
     isSExt = true;
@@ -741,6 +750,31 @@ bool ARMInstructionSelector::select(MachineInstr &I,
     const auto &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
     const auto &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
 
+    if (SrcRegBank.getID() == ARM::FPRRegBankID) {
+      // This should only happen in the obscure case where we have put a 64-bit
+      // integer into a D register. Get it out of there and keep only the
+      // interesting part.
+      assert(I.getOpcode() == G_TRUNC && "Unsupported operand for G_ANYEXT");
+      assert(DstRegBank.getID() == ARM::GPRRegBankID &&
+             "Unsupported combination of register banks");
+      assert(MRI.getType(SrcReg).getSizeInBits() == 64 && "Unsupported size");
+      assert(MRI.getType(DstReg).getSizeInBits() <= 32 && "Unsupported size");
+
+      unsigned IgnoredBits = MRI.createVirtualRegister(&ARM::GPRRegClass);
+      auto InsertBefore = std::next(I.getIterator());
+      auto MovI =
+          BuildMI(MBB, InsertBefore, I.getDebugLoc(), TII.get(ARM::VMOVRRD))
+              .addDef(DstReg)
+              .addDef(IgnoredBits)
+              .addUse(SrcReg)
+              .add(predOps(ARMCC::AL));
+      if (!constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI))
+        return false;
+
+      MIB->eraseFromParent();
+      return true;
+    }
+
     if (SrcRegBank.getID() != DstRegBank.getID()) {
       DEBUG(dbgs() << "G_TRUNC/G_ANYEXT operands on different register banks\n");
      return false;
@@ -754,6 +788,28 @@ bool ARMInstructionSelector::select(MachineInstr &I,
     I.setDesc(TII.get(COPY));
     return selectCopy(I, TII, MRI, TRI, RBI);
   }
+  case G_INTTOPTR:
+  case G_PTRTOINT: {
+    auto SrcReg = I.getOperand(1).getReg();
+    auto DstReg = I.getOperand(0).getReg();
+
+    const auto &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
+    const auto &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
+
+    if (SrcRegBank.getID() != DstRegBank.getID()) {
+      DEBUG(dbgs()
+            << "G_INTTOPTR/G_PTRTOINT operands on different register banks\n");
+      return false;
+    }
+
+    if (SrcRegBank.getID() != ARM::GPRRegBankID) {
+      DEBUG(dbgs() << "G_INTTOPTR/G_PTRTOINT on non-GPR not supported yet\n");
+      return false;
+    }
+
+    I.setDesc(TII.get(COPY));
+    return selectCopy(I, TII, MRI, TRI, RBI);
+  }
   case G_SELECT:
     return selectSelect(MIB, MRI);
   case G_ICMP: {
diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp
index 2dd1dff64e87..8cff1f0869d0 100644
--- a/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -126,6 +126,12 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
     setAction({Op, s32}, Legal);
   }
 
+  setAction({G_INTTOPTR, p0}, Legal);
+  setAction({G_INTTOPTR, 1, s32}, Legal);
+
+  setAction({G_PTRTOINT, s32}, Legal);
+  setAction({G_PTRTOINT, 1, p0}, Legal);
+
   for (unsigned Op : {G_ASHR, G_LSHR, G_SHL})
     setAction({Op, s32}, Legal);
@@ -139,6 +145,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { setAction({G_BRCOND, s1}, Legal); setAction({G_CONSTANT, s32}, Legal); + setAction({G_CONSTANT, p0}, Legal); setLegalizeScalarToDifferentSizeStrategy(G_CONSTANT, 0, widen_1_8_16); setAction({G_ICMP, s1}, Legal); diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp index b32bfd449544..fad0e98285e6 100644 --- a/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -226,12 +226,30 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case G_SEXT: case G_ZEXT: case G_ANYEXT: - case G_TRUNC: case G_GEP: + case G_INTTOPTR: + case G_PTRTOINT: // FIXME: We're abusing the fact that everything lives in a GPR for now; in // the real world we would use different mappings. OperandsMapping = &ARM::ValueMappings[ARM::GPR3OpsIdx]; break; + case G_TRUNC: { + // In some cases we may end up with a G_TRUNC from a 64-bit value to a + // 32-bit value. This isn't a real floating point trunc (that would be a + // G_FPTRUNC). Instead it is an integer trunc in disguise, which can appear + // because the legalizer doesn't distinguish between integer and floating + // point values so it may leave some 64-bit integers un-narrowed. Until we + // have a more principled solution that doesn't let such things sneak all + // the way to this point, just map the source to a DPR and the destination + // to a GPR. + LLT LargeTy = MRI.getType(MI.getOperand(1).getReg()); + OperandsMapping = + LargeTy.getSizeInBits() <= 32 + ? &ARM::ValueMappings[ARM::GPR3OpsIdx] + : getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], + &ARM::ValueMappings[ARM::DPR3OpsIdx]}); + break; + } case G_LOAD: case G_STORE: { LLT Ty = MRI.getType(MI.getOperand(0).getReg()); diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 4d4a88126ce6..23027e92481f 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -348,11 +348,6 @@ unsigned ARMSubtarget::getMispredictionPenalty() const { return SchedModel.MispredictPenalty; } -bool ARMSubtarget::hasSinCos() const { - return isTargetWatchOS() || - (isTargetIOS() && !getTargetTriple().isOSVersionLT(7, 0)); -} - bool ARMSubtarget::enableMachineScheduler() const { // Enable the MachineScheduler before register allocation for subtargets // with the use-misched feature. diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 9301197e1387..eedb675a3304 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -236,6 +236,10 @@ protected: /// instructions. bool HasDataBarrier = false; + /// HasFullDataBarrier - True if the subtarget supports DFB data barrier + /// instruction. + bool HasFullDataBarrier = false; + /// HasV7Clrex - True if the subtarget supports CLREX instructions bool HasV7Clrex = false; @@ -544,6 +548,7 @@ public: bool hasDivideInThumbMode() const { return HasHardwareDivideInThumb; } bool hasDivideInARMMode() const { return HasHardwareDivideInARM; } bool hasDataBarrier() const { return HasDataBarrier; } + bool hasFullDataBarrier() const { return HasFullDataBarrier; } bool hasV7Clrex() const { return HasV7Clrex; } bool hasAcquireRelease() const { return HasAcquireRelease; } @@ -712,10 +717,6 @@ public: unsigned getMispredictionPenalty() const; - /// This function returns true if the target has sincos() routine in its - /// compiler runtime or math libraries. 
- bool hasSinCos() const; - /// Returns true if machine scheduler should be enabled. bool enableMachineScheduler() const override; diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 51982b2dab14..0f6d1eddc985 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -92,6 +92,7 @@ extern "C" void LLVMInitializeARMTarget() { initializeARMConstantIslandsPass(Registry); initializeARMExecutionDepsFixPass(Registry); initializeARMExpandPseudoPass(Registry); + initializeThumb2SizeReducePass(Registry); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -282,10 +283,9 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const { return I.get(); } -TargetIRAnalysis ARMBaseTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](const Function &F) { - return TargetTransformInfo(ARMTTIImpl(this, F)); - }); +TargetTransformInfo +ARMBaseTargetMachine::getTargetTransformInfo(const Function &F) { + return TargetTransformInfo(ARMTTIImpl(this, F)); } ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT, diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index 655ec3202bfb..2072bb731f0a 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -53,8 +53,7 @@ public: const ARMSubtarget *getSubtargetImpl() const = delete; bool isLittleEndian() const { return isLittle; } - /// \brief Get the TargetIRAnalysis for this target. - TargetIRAnalysis getTargetIRAnalysis() override; + TargetTransformInfo getTargetTransformInfo(const Function &F) override; // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index cae01e415eff..43d7888075b5 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -394,25 +394,6 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, return 1; } -int ARMTTIImpl::getFPOpCost(Type *Ty) { - // Use similar logic that's in ARMISelLowering: - // Any ARM CPU with VFP2 has floating point, but Thumb1 didn't have access - // to VFP. - - if (ST->hasVFP2() && !ST->isThumb1Only()) { - if (Ty->isFloatTy()) { - return TargetTransformInfo::TCC_Basic; - } - - if (Ty->isDoubleTy()) { - return ST->isFPOnlySP() ? TargetTransformInfo::TCC_Expensive : - TargetTransformInfo::TCC_Basic; - } - } - - return TargetTransformInfo::TCC_Expensive; -} - int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { // We only handle costs of reverse and alternate shuffles for now. 
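For reference, the getFPOpCost hook deleted above encoded a small, self-contained policy. A stand-alone restatement of that policy (plain C++ with illustrative cost values standing in for TCC_Basic/TCC_Expensive; it models only the f32/f64 cases the hook distinguished):

    enum FPCost { Basic = 1, Expensive = 4 };  // illustrative stand-ins

    // Mirrors the removed ARMTTIImpl::getFPOpCost logic: any ARM CPU with
    // VFP2 has hardware floating point, but Thumb1 cannot access VFP, and
    // a single-precision-only FPU makes f64 operations expensive.
    FPCost fpOpCost(bool HasVFP2, bool IsThumb1Only, bool IsFPOnlySP,
                    bool IsDouble) {
      if (!HasVFP2 || IsThumb1Only)
        return Expensive;          // no usable FPU: expect library calls
      if (IsDouble && IsFPOnlySP)
        return Expensive;          // f64 must be emulated
      return Basic;
    }

With the override gone, callers fall back to the target-independent default cost.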
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h index 99353a3219a0..cd9fa0709020 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/lib/Target/ARM/ARMTargetTransformInfo.h @@ -156,8 +156,6 @@ public: int getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr); - int getFPOpCost(Type *Ty); - int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info = TTI::OK_AnyValue, diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 26fda5f22b4f..97b642c99f80 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -5581,11 +5581,11 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst, CanAcceptPredicationCode = Mnemonic != "cdp2" && Mnemonic != "clrex" && Mnemonic != "mcr2" && Mnemonic != "mcrr2" && Mnemonic != "mrc2" && Mnemonic != "mrrc2" && - Mnemonic != "dmb" && Mnemonic != "dsb" && Mnemonic != "isb" && - Mnemonic != "pld" && Mnemonic != "pli" && Mnemonic != "pldw" && - Mnemonic != "ldc2" && Mnemonic != "ldc2l" && Mnemonic != "stc2" && - Mnemonic != "stc2l" && !Mnemonic.startswith("rfe") && - !Mnemonic.startswith("srs"); + Mnemonic != "dmb" && Mnemonic != "dfb" && Mnemonic != "dsb" && + Mnemonic != "isb" && Mnemonic != "pld" && Mnemonic != "pli" && + Mnemonic != "pldw" && Mnemonic != "ldc2" && Mnemonic != "ldc2l" && + Mnemonic != "stc2" && Mnemonic != "stc2l" && + !Mnemonic.startswith("rfe") && !Mnemonic.startswith("srs"); } else if (isThumbOne()) { if (hasV6MOps()) CanAcceptPredicationCode = Mnemonic != "movs"; diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index a29a2eeccfe8..53c635877675 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -2386,6 +2386,7 @@ static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Insn, case ARM::VLD4q32_UPD: if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder))) return MCDisassembler::Fail; + break; default: break; } @@ -3326,6 +3327,7 @@ static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val, case ARM::t2STRs: if (Rn == 15) return MCDisassembler::Fail; + break; default: break; } @@ -3391,6 +3393,7 @@ static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Insn, break; case ARM::t2LDRSBs: Inst.setOpcode(ARM::t2PLIs); + break; default: break; } @@ -3854,6 +3857,7 @@ static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val, case ARM::t2STRHi12: if (Rn == 15) return MCDisassembler::Fail; + break; default: break; } diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index 3920c73fba6a..5357e26856ea 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -45,6 +45,7 @@ using namespace llvm; #define DEBUG_TYPE "t2-reduce-size" +#define THUMB2_SIZE_REDUCE_NAME "Thumb2 instruction size reduce pass" STATISTIC(NumNarrows, "Number of 32-bit instrs reduced to 16-bit ones"); STATISTIC(Num2Addrs, "Number of 32-bit instrs reduced to 2addr 16-bit ones"); @@ -162,7 +163,7 @@ namespace { const Thumb2InstrInfo *TII; const ARMSubtarget *STI; - Thumb2SizeReduce(std::function<bool(const Function &)> Ftor); + Thumb2SizeReduce(std::function<bool(const Function &)> Ftor = nullptr); bool runOnMachineFunction(MachineFunction &MF) override; @@ -172,7 +173,7 @@ namespace { } StringRef 
getPassName() const override { - return "Thumb2 instruction size reduction pass"; + return THUMB2_SIZE_REDUCE_NAME; } private: @@ -237,6 +238,9 @@ namespace { } // end anonymous namespace +INITIALIZE_PASS(Thumb2SizeReduce, DEBUG_TYPE, THUMB2_SIZE_REDUCE_NAME, false, + false) + Thumb2SizeReduce::Thumb2SizeReduce(std::function<bool(const Function &)> Ftor) : MachineFunctionPass(ID), PredicateFtor(std::move(Ftor)) { OptimizeSize = MinimizeSize = false; diff --git a/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp b/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp index 6f81e020b996..1f4ef098403d 100644 --- a/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp +++ b/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp @@ -56,7 +56,7 @@ void BPFInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, if (Op.isReg()) { O << getRegisterName(Op.getReg()); } else if (Op.isImm()) { - O << (int32_t)Op.getImm(); + O << formatImm((int32_t)Op.getImm()); } else { assert(Op.isExpr() && "Expected an expression"); printExpr(Op.getExpr(), O); @@ -76,9 +76,9 @@ void BPFInstPrinter::printMemOperand(const MCInst *MI, int OpNo, raw_ostream &O, if (OffsetOp.isImm()) { auto Imm = OffsetOp.getImm(); if (Imm >= 0) - O << " + " << formatDec(Imm); + O << " + " << formatImm(Imm); else - O << " - " << formatDec(-Imm); + O << " - " << formatImm(-Imm); } else { assert(0 && "Expected an immediate"); } @@ -88,7 +88,7 @@ void BPFInstPrinter::printImm64Operand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); if (Op.isImm()) - O << (uint64_t)Op.getImm(); + O << formatImm(Op.getImm()); else if (Op.isExpr()) printExpr(Op.getExpr(), O); else @@ -100,7 +100,7 @@ void BPFInstPrinter::printBrTargetOperand(const MCInst *MI, unsigned OpNo, const MCOperand &Op = MI->getOperand(OpNo); if (Op.isImm()) { int16_t Imm = Op.getImm(); - O << ((Imm >= 0) ? "+" : "") << Imm; + O << ((Imm >= 0) ? "+" : "") << formatImm(Imm); } else if (Op.isExpr()) { printExpr(Op.getExpr(), O); } else { diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp index 537f97c9a987..8b6c571dee02 100644 --- a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +++ b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp @@ -756,11 +756,11 @@ struct ShuffleMask { ShuffleMask lo() const { size_t H = Mask.size()/2; - return ShuffleMask({Mask.data(), H}); + return ShuffleMask(Mask.take_front(H)); } ShuffleMask hi() const { size_t H = Mask.size()/2; - return ShuffleMask({Mask.data()+H, H}); + return ShuffleMask(Mask.take_back(H)); } }; @@ -836,15 +836,6 @@ namespace llvm { }; } -// Return a submask of A that is shorter than A by |C| elements: -// - if C > 0, return a submask of A that starts at position C, -// - if C <= 0, return a submask of A that starts at 0 (reduce A by |C|). -static ArrayRef<int> subm(ArrayRef<int> A, int C) { - if (C > 0) - return { A.data()+C, A.size()-C }; - return { A.data(), A.size()+C }; -} - static void splitMask(ArrayRef<int> Mask, MutableArrayRef<int> MaskL, MutableArrayRef<int> MaskR) { unsigned VecLen = Mask.size(); @@ -910,21 +901,38 @@ bool HvxSelector::selectVectorConstants(SDNode *N) { // Since they are generated during the selection process, the main // selection algorithm is not aware of them. Select them directly // here. 
- if (!N->isMachineOpcode() && N->getOpcode() == ISD::LOAD) { - SDValue Addr = cast<LoadSDNode>(N)->getBasePtr(); - unsigned AddrOpc = Addr.getOpcode(); - if (AddrOpc == HexagonISD::AT_PCREL || AddrOpc == HexagonISD::CP) { - if (Addr.getOperand(0).getOpcode() == ISD::TargetConstantPool) { - ISel.Select(N); - return true; - } + SmallVector<SDNode*,4> Loads; + SmallVector<SDNode*,16> WorkQ; + + // The DAG can change (due to CSE) during selection, so cache all the + // unselected nodes first to avoid traversing a mutating DAG. + + auto IsLoadToSelect = [] (SDNode *N) { + if (!N->isMachineOpcode() && N->getOpcode() == ISD::LOAD) { + SDValue Addr = cast<LoadSDNode>(N)->getBasePtr(); + unsigned AddrOpc = Addr.getOpcode(); + if (AddrOpc == HexagonISD::AT_PCREL || AddrOpc == HexagonISD::CP) + if (Addr.getOperand(0).getOpcode() == ISD::TargetConstantPool) + return true; } + return false; + }; + + WorkQ.push_back(N); + for (unsigned i = 0; i != WorkQ.size(); ++i) { + SDNode *W = WorkQ[i]; + if (IsLoadToSelect(W)) { + Loads.push_back(W); + continue; + } + for (unsigned j = 0, f = W->getNumOperands(); j != f; ++j) + WorkQ.push_back(W->getOperand(j).getNode()); } - bool Selected = false; - for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I) - Selected = selectVectorConstants(N->getOperand(I).getNode()) || Selected; - return Selected; + for (SDNode *L : Loads) + ISel.Select(L); + + return !Loads.empty(); } void HvxSelector::materialize(const ResultStack &Results) { @@ -1159,8 +1167,8 @@ OpRef HvxSelector::vmuxp(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb, ResultStack &Results) { DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';}); size_t S = Bytes.size() / 2; - OpRef L = vmuxs({Bytes.data(), S}, OpRef::lo(Va), OpRef::lo(Vb), Results); - OpRef H = vmuxs({Bytes.data()+S, S}, OpRef::hi(Va), OpRef::hi(Vb), Results); + OpRef L = vmuxs(Bytes.take_front(S), OpRef::lo(Va), OpRef::lo(Vb), Results); + OpRef H = vmuxs(Bytes.drop_front(S), OpRef::hi(Va), OpRef::hi(Vb), Results); return concat(L, H, Results); } @@ -1435,7 +1443,7 @@ OpRef HvxSelector::contracting(ShuffleMask SM, OpRef Va, OpRef Vb, return OpRef::fail(); // Examine the rest of the mask. for (int I = L; I < N; I += L) { - auto S = findStrip(subm(SM.Mask,I), 1, N-I); + auto S = findStrip(SM.Mask.drop_front(I), 1, N-I); // Check whether the mask element at the beginning of each strip // increases by 2L each time. if (S.first - Strip.first != 2*I) @@ -1465,7 +1473,7 @@ OpRef HvxSelector::contracting(ShuffleMask SM, OpRef Va, OpRef Vb, std::pair<int,unsigned> PrevS = Strip; bool Flip = false; for (int I = L; I < N; I += L) { - auto S = findStrip(subm(SM.Mask,I), 1, N-I); + auto S = findStrip(SM.Mask.drop_front(I), 1, N-I); if (S.second != PrevS.second) return OpRef::fail(); int Diff = Flip ? PrevS.first - S.first + 2*L @@ -1524,7 +1532,7 @@ OpRef HvxSelector::expanding(ShuffleMask SM, OpRef Va, ResultStack &Results) { // First, check the non-ignored strips. for (int I = 2*L; I < 2*N; I += 2*L) { - auto S = findStrip(subm(SM.Mask,I), 1, N-I); + auto S = findStrip(SM.Mask.drop_front(I), 1, N-I); if (S.second != unsigned(L)) return OpRef::fail(); if (2*S.first != I) @@ -1532,7 +1540,7 @@ OpRef HvxSelector::expanding(ShuffleMask SM, OpRef Va, ResultStack &Results) { } // Check the -1s. 
for (int I = L; I < 2*N; I += 2*L) { - auto S = findStrip(subm(SM.Mask,I), 0, N-I); + auto S = findStrip(SM.Mask.drop_front(I), 0, N-I); if (S.first != -1 || S.second != unsigned(L)) return OpRef::fail(); } @@ -1666,8 +1674,8 @@ OpRef HvxSelector::perfect(ShuffleMask SM, OpRef Va, ResultStack &Results) { if (!isPowerOf2_32(X)) return OpRef::fail(); // Check the other segments of Mask. - for (int J = 0; J < VecLen; J += I) { - if (XorPow2(subm(SM.Mask, -J), I) != X) + for (int J = I; J < VecLen; J += I) { + if (XorPow2(SM.Mask.slice(J, I), I) != X) return OpRef::fail(); } Perm[Log2_32(X)] = Log2_32(I)-1; diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 586363335df1..0e0da2ddc400 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -761,11 +761,13 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Promote the value if needed. switch (VA.getLocInfo()) { default: - // Loc info must be one of Full, SExt, ZExt, or AExt. + // Loc info must be one of Full, BCvt, SExt, ZExt, or AExt. llvm_unreachable("Unknown loc info!"); - case CCValAssign::BCvt: case CCValAssign::Full: break; + case CCValAssign::BCvt: + Arg = DAG.getBitcast(VA.getLocVT(), Arg); + break; case CCValAssign::SExt: Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); break; @@ -1135,6 +1137,8 @@ SDValue HexagonTargetLowering::LowerFormalArguments( unsigned VReg = RegInfo.createVirtualRegister(&Hexagon::IntRegsRegClass); RegInfo.addLiveIn(VA.getLocReg(), VReg); + if (VA.getLocInfo() == CCValAssign::BCvt) + RegVT = VA.getValVT(); SDValue Copy = DAG.getCopyFromReg(Chain, dl, VReg, RegVT); // Treat values of type MVT::i1 specially: they are passed in // registers of type i32, but they need to remain as values of @@ -1155,6 +1159,8 @@ SDValue HexagonTargetLowering::LowerFormalArguments( unsigned VReg = RegInfo.createVirtualRegister(&Hexagon::DoubleRegsRegClass); RegInfo.addLiveIn(VA.getLocReg(), VReg); + if (VA.getLocInfo() == CCValAssign::BCvt) + RegVT = VA.getValVT(); InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); // Single Vector @@ -1715,8 +1721,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v4i1, &Hexagon::PredRegsRegClass); // ddccbbaa addRegisterClass(MVT::v8i1, &Hexagon::PredRegsRegClass); // hgfedcba addRegisterClass(MVT::i32, &Hexagon::IntRegsRegClass); - addRegisterClass(MVT::v4i8, &Hexagon::IntRegsRegClass); addRegisterClass(MVT::v2i16, &Hexagon::IntRegsRegClass); + addRegisterClass(MVT::v4i8, &Hexagon::IntRegsRegClass); addRegisterClass(MVT::i64, &Hexagon::DoubleRegsRegClass); addRegisterClass(MVT::v8i8, &Hexagon::DoubleRegsRegClass); addRegisterClass(MVT::v4i16, &Hexagon::DoubleRegsRegClass); @@ -1735,6 +1741,14 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v128i8, &Hexagon::HvxWRRegClass); addRegisterClass(MVT::v64i16, &Hexagon::HvxWRRegClass); addRegisterClass(MVT::v32i32, &Hexagon::HvxWRRegClass); + // These "short" boolean vector types should be legal because + // they will appear as results of vector compares. If they were + // not legal, type legalization would try to make them legal + // and that would require using operations that do not use or + // produce such types. That, in turn, would imply using custom + // nodes, which would be unoptimizable by the DAG combiner. + // The idea is to rely on target-independent operations as much + // as possible. 
addRegisterClass(MVT::v16i1, &Hexagon::HvxQRRegClass); addRegisterClass(MVT::v32i1, &Hexagon::HvxQRRegClass); addRegisterClass(MVT::v64i1, &Hexagon::HvxQRRegClass); @@ -1964,9 +1978,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Legal); // Types natively supported: - for (MVT NativeVT : {MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v32i1, MVT::v64i1, - MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16, MVT::v1i32, - MVT::v2i32, MVT::v1i64}) { + for (MVT NativeVT : {MVT::v32i1, MVT::v64i1, MVT::v4i8, MVT::v8i8, MVT::v2i16, + MVT::v4i16, MVT::v1i32, MVT::v2i32, MVT::v1i64}) { setOperationAction(ISD::BUILD_VECTOR, NativeVT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, NativeVT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, NativeVT, Custom); @@ -1992,63 +2005,6 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, AddPromotedToType(Opc, FromTy, ToTy); }; - if (Subtarget.useHVXOps()) { - bool Use64b = Subtarget.useHVX64BOps(); - ArrayRef<MVT> LegalV = Use64b ? LegalV64 : LegalV128; - ArrayRef<MVT> LegalW = Use64b ? LegalW64 : LegalW128; - MVT ByteV = Use64b ? MVT::v64i8 : MVT::v128i8; - MVT ByteW = Use64b ? MVT::v128i8 : MVT::v256i8; - - setOperationAction(ISD::VECTOR_SHUFFLE, ByteV, Legal); - setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal); - setOperationAction(ISD::CONCAT_VECTORS, ByteW, Legal); - setOperationAction(ISD::AND, ByteV, Legal); - setOperationAction(ISD::OR, ByteV, Legal); - setOperationAction(ISD::XOR, ByteV, Legal); - - for (MVT T : LegalV) { - setIndexedLoadAction(ISD::POST_INC, T, Legal); - setIndexedStoreAction(ISD::POST_INC, T, Legal); - - setOperationAction(ISD::ADD, T, Legal); - setOperationAction(ISD::SUB, T, Legal); - setOperationAction(ISD::VSELECT, T, Legal); - if (T != ByteV) { - setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Legal); - setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Legal); - } - - setOperationAction(ISD::MUL, T, Custom); - setOperationAction(ISD::SETCC, T, Custom); - setOperationAction(ISD::BUILD_VECTOR, T, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, T, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, T, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, T, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, T, Custom); - if (T != ByteV) - setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, T, Custom); - } - - for (MVT T : LegalV) { - if (T == ByteV) - continue; - // Promote all shuffles and concats to operate on vectors of bytes. - setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteV); - setPromoteTo(ISD::CONCAT_VECTORS, T, ByteV); - setPromoteTo(ISD::AND, T, ByteV); - setPromoteTo(ISD::OR, T, ByteV); - setPromoteTo(ISD::XOR, T, ByteV); - } - - for (MVT T : LegalW) { - if (T == ByteW) - continue; - // Promote all shuffles and concats to operate on vectors of bytes. - setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteW); - setPromoteTo(ISD::CONCAT_VECTORS, T, ByteW); - } - } - // Subtarget-specific operation actions. // if (Subtarget.hasV5TOps()) { @@ -2110,6 +2066,67 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setIndexedStoreAction(ISD::POST_INC, VT, Legal); } + if (Subtarget.useHVXOps()) { + bool Use64b = Subtarget.useHVX64BOps(); + ArrayRef<MVT> LegalV = Use64b ? LegalV64 : LegalV128; + ArrayRef<MVT> LegalW = Use64b ? LegalW64 : LegalW128; + MVT ByteV = Use64b ? MVT::v64i8 : MVT::v128i8; + MVT ByteW = Use64b ? 
MVT::v128i8 : MVT::v256i8; + + setOperationAction(ISD::VECTOR_SHUFFLE, ByteV, Legal); + setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal); + setOperationAction(ISD::CONCAT_VECTORS, ByteW, Legal); + setOperationAction(ISD::AND, ByteV, Legal); + setOperationAction(ISD::OR, ByteV, Legal); + setOperationAction(ISD::XOR, ByteV, Legal); + + for (MVT T : LegalV) { + setIndexedLoadAction(ISD::POST_INC, T, Legal); + setIndexedStoreAction(ISD::POST_INC, T, Legal); + + setOperationAction(ISD::ADD, T, Legal); + setOperationAction(ISD::SUB, T, Legal); + if (T != ByteV) { + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Legal); + setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Legal); + } + + setOperationAction(ISD::MUL, T, Custom); + setOperationAction(ISD::SETCC, T, Custom); + setOperationAction(ISD::BUILD_VECTOR, T, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, T, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, T, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, T, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, T, Custom); + if (T != ByteV) + setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, T, Custom); + } + + for (MVT T : LegalV) { + if (T == ByteV) + continue; + // Promote all shuffles and concats to operate on vectors of bytes. + setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteV); + setPromoteTo(ISD::CONCAT_VECTORS, T, ByteV); + setPromoteTo(ISD::AND, T, ByteV); + setPromoteTo(ISD::OR, T, ByteV); + setPromoteTo(ISD::XOR, T, ByteV); + } + + for (MVT T : LegalW) { + // Custom-lower BUILD_VECTOR for vector pairs. The standard (target- + // independent) handling of it would convert it to a load, which is + // not always the optimal choice. + setOperationAction(ISD::BUILD_VECTOR, T, Custom); + + if (T == ByteW) + continue; + // Promote all shuffles and concats to operate on vectors of bytes. + setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteW); + setPromoteTo(ISD::CONCAT_VECTORS, T, ByteW); + } + } + computeRegisterProperties(&HRI); // @@ -2256,6 +2273,7 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::VINSERTW0: return "HexagonISD::VINSERTW0"; case HexagonISD::VROR: return "HexagonISD::VROR"; case HexagonISD::READCYCLE: return "HexagonISD::READCYCLE"; + case HexagonISD::VZERO: return "HexagonISD::VZERO"; case HexagonISD::OP_END: break; } return nullptr; @@ -2331,14 +2349,27 @@ bool HexagonTargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, TargetLoweringBase::LegalizeTypeAction HexagonTargetLowering::getPreferredVectorAction(EVT VT) const { + if (VT.getVectorNumElements() == 1) + return TargetLoweringBase::TypeScalarizeVector; + + // Always widen vectors of i1. + MVT ElemTy = VT.getSimpleVT().getVectorElementType(); + if (ElemTy == MVT::i1) + return TargetLoweringBase::TypeWidenVector; + if (Subtarget.useHVXOps()) { // If the size of VT is at least half of the vector length, // widen the vector. Note: the threshold was not selected in // any scientific way. - if (VT.getSizeInBits() >= Subtarget.getVectorLength()*8/2) - return TargetLoweringBase::TypeWidenVector; + ArrayRef<MVT> Tys = Subtarget.getHVXElementTypes(); + if (llvm::find(Tys, ElemTy) != Tys.end()) { + unsigned HwWidth = 8*Subtarget.getVectorLength(); + unsigned VecWidth = VT.getSizeInBits(); + if (VecWidth >= HwWidth/2 && VecWidth < HwWidth) + return TargetLoweringBase::TypeWidenVector; + } } - return TargetLowering::getPreferredVectorAction(VT); + return TargetLoweringBase::TypeSplitVector; } // Lower a vector shuffle (V1, V2, V3). 
V1 and V2 are the two vectors @@ -2463,21 +2494,43 @@ HexagonTargetLowering::LowerVECTOR_SHIFT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::BITCAST, dl, VT, Result); } +bool +HexagonTargetLowering::getBuildVectorConstInts(ArrayRef<SDValue> Values, + MVT VecTy, SelectionDAG &DAG, + MutableArrayRef<ConstantInt*> Consts) const { + MVT ElemTy = VecTy.getVectorElementType(); + unsigned ElemWidth = ElemTy.getSizeInBits(); + IntegerType *IntTy = IntegerType::get(*DAG.getContext(), ElemWidth); + bool AllConst = true; + + for (unsigned i = 0, e = Values.size(); i != e; ++i) { + SDValue V = Values[i]; + if (V.isUndef()) { + Consts[i] = ConstantInt::get(IntTy, 0); + continue; + } + if (auto *CN = dyn_cast<ConstantSDNode>(V.getNode())) { + const ConstantInt *CI = CN->getConstantIntValue(); + Consts[i] = const_cast<ConstantInt*>(CI); + } else if (auto *CN = dyn_cast<ConstantFPSDNode>(V.getNode())) { + const ConstantFP *CF = CN->getConstantFPValue(); + APInt A = CF->getValueAPF().bitcastToAPInt(); + Consts[i] = ConstantInt::get(IntTy, A.getZExtValue()); + } else { + AllConst = false; + } + } + return AllConst; +} + SDValue HexagonTargetLowering::buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl, MVT VecTy, SelectionDAG &DAG) const { MVT ElemTy = VecTy.getVectorElementType(); assert(VecTy.getVectorNumElements() == Elem.size()); - SmallVector<ConstantSDNode*,4> Consts; - bool AllConst = true; - for (SDValue V : Elem) { - if (isUndef(V)) - V = DAG.getConstant(0, dl, ElemTy); - auto *C = dyn_cast<ConstantSDNode>(V.getNode()); - Consts.push_back(C); - AllConst = AllConst && C != nullptr; - } + SmallVector<ConstantInt*,4> Consts(Elem.size()); + bool AllConst = getBuildVectorConstInts(Elem, VecTy, DAG, Consts); unsigned First, Num = Elem.size(); for (First = 0; First != Num; ++First) @@ -2486,6 +2539,10 @@ HexagonTargetLowering::buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl, if (First == Num) return DAG.getUNDEF(VecTy); + if (AllConst && + llvm::all_of(Consts, [](ConstantInt *CI) { return CI->isZero(); })) + return getZero(dl, VecTy, DAG); + if (ElemTy == MVT::i16) { assert(Elem.size() == 2); if (AllConst) { @@ -2498,45 +2555,55 @@ HexagonTargetLowering::buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl, return DAG.getBitcast(MVT::v2i16, N); } - // First try generating a constant. - assert(ElemTy == MVT::i8 && Num == 4); - if (AllConst) { - int32_t V = (Consts[0]->getZExtValue() & 0xFF) | - (Consts[1]->getZExtValue() & 0xFF) << 8 | - (Consts[1]->getZExtValue() & 0xFF) << 16 | - Consts[2]->getZExtValue() << 24; - return DAG.getBitcast(MVT::v4i8, DAG.getConstant(V, dl, MVT::i32)); - } + if (ElemTy == MVT::i8) { + // First try generating a constant. + if (AllConst) { + int32_t V = (Consts[0]->getZExtValue() & 0xFF) | + (Consts[1]->getZExtValue() & 0xFF) << 8 | + (Consts[2]->getZExtValue() & 0xFF) << 16 | + Consts[3]->getZExtValue() << 24; + return DAG.getBitcast(MVT::v4i8, DAG.getConstant(V, dl, MVT::i32)); + } - // Then try splat. - bool IsSplat = true; - for (unsigned i = 0; i != Num; ++i) { - if (i == First) - continue; - if (Elem[i] == Elem[First] || isUndef(Elem[i])) - continue; - IsSplat = false; - break; - } - if (IsSplat) - return DAG.getNode(HexagonISD::VSPLAT, dl, VecTy, Elem[First]); + // Then try splat. + bool IsSplat = true; + for (unsigned i = 0; i != Num; ++i) { + if (i == First) + continue; + if (Elem[i] == Elem[First] || isUndef(Elem[i])) + continue; + IsSplat = false; + break; + } + if (IsSplat) { + // Legalize the operand to VSPLAT.
+ SDValue Ext = DAG.getZExtOrTrunc(Elem[First], dl, MVT::i32); + return DAG.getNode(HexagonISD::VSPLAT, dl, VecTy, Ext); + } - // Generate - // (zxtb(Elem[0]) | (zxtb(Elem[1]) << 8)) | - // (zxtb(Elem[2]) | (zxtb(Elem[3]) << 8)) << 16 - SDValue S8 = DAG.getConstant(8, dl, MVT::i32); - SDValue V0 = DAG.getZeroExtendInReg(Elem[0], dl, MVT::i8); - SDValue V1 = DAG.getZeroExtendInReg(Elem[1], dl, MVT::i8); - SDValue V2 = DAG.getZeroExtendInReg(Elem[2], dl, MVT::i8); - SDValue V3 = DAG.getZeroExtendInReg(Elem[3], dl, MVT::i8); + // Generate + // (zxtb(Elem[0]) | (zxtb(Elem[1]) << 8)) | + // (zxtb(Elem[2]) | (zxtb(Elem[3]) << 8)) << 16 + assert(Elem.size() == 4); + SDValue Vs[4]; + for (unsigned i = 0; i != 4; ++i) { + Vs[i] = DAG.getZExtOrTrunc(Elem[i], dl, MVT::i32); + Vs[i] = DAG.getZeroExtendInReg(Vs[i], dl, MVT::i8); + } + SDValue S8 = DAG.getConstant(8, dl, MVT::i32); + SDValue T0 = DAG.getNode(ISD::SHL, dl, MVT::i32, {Vs[1], S8}); + SDValue T1 = DAG.getNode(ISD::SHL, dl, MVT::i32, {Vs[3], S8}); + SDValue B0 = DAG.getNode(ISD::OR, dl, MVT::i32, {Vs[0], T0}); + SDValue B1 = DAG.getNode(ISD::OR, dl, MVT::i32, {Vs[2], T1}); - SDValue V4 = DAG.getNode(ISD::SHL, dl, MVT::i32, {V1, S8}); - SDValue V5 = DAG.getNode(ISD::SHL, dl, MVT::i32, {V3, S8}); - SDValue V6 = DAG.getNode(ISD::OR, dl, MVT::i32, {V0, V4}); - SDValue V7 = DAG.getNode(ISD::OR, dl, MVT::i32, {V2, V5}); + SDValue R = getNode(Hexagon::A2_combine_ll, dl, MVT::i32, {B1, B0}, DAG); + return DAG.getBitcast(MVT::v4i8, R); + } - SDValue T0 = getNode(Hexagon::A2_combine_ll, dl, MVT::i32, {V7, V6}, DAG); - return DAG.getBitcast(MVT::v4i8, T0); +#ifndef NDEBUG + dbgs() << "VecTy: " << EVT(VecTy).getEVTString() << '\n'; +#endif + llvm_unreachable("Unexpected vector element type"); } SDValue @@ -2545,15 +2612,8 @@ HexagonTargetLowering::buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl, MVT ElemTy = VecTy.getVectorElementType(); assert(VecTy.getVectorNumElements() == Elem.size()); - SmallVector<ConstantSDNode*,8> Consts; - bool AllConst = true; - for (SDValue V : Elem) { - if (isUndef(V)) - V = DAG.getConstant(0, dl, ElemTy); - auto *C = dyn_cast<ConstantSDNode>(V.getNode()); - Consts.push_back(C); - AllConst = AllConst && C != nullptr; - } + SmallVector<ConstantInt*,8> Consts(Elem.size()); + bool AllConst = getBuildVectorConstInts(Elem, VecTy, DAG, Consts); unsigned First, Num = Elem.size(); for (First = 0; First != Num; ++First) @@ -2562,6 +2622,10 @@ HexagonTargetLowering::buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl, if (First == Num) return DAG.getUNDEF(VecTy); + if (AllConst && + llvm::all_of(Consts, [](ConstantInt *CI) { return CI->isZero(); })) + return getZero(dl, VecTy, DAG); + // First try splat if possible. if (ElemTy == MVT::i16) { bool IsSplat = true; @@ -2573,8 +2637,11 @@ HexagonTargetLowering::buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl, IsSplat = false; break; } - if (IsSplat) - return DAG.getNode(HexagonISD::VSPLAT, dl, VecTy, Elem[First]); + if (IsSplat) { + // Legalize the operand to VSPLAT. + SDValue Ext = DAG.getZExtOrTrunc(Elem[First], dl, MVT::i32); + return DAG.getNode(HexagonISD::VSPLAT, dl, VecTy, Ext); + } } // Then try constant. @@ -2593,10 +2660,10 @@ HexagonTargetLowering::buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl, MVT HalfTy = MVT::getVectorVT(ElemTy, Num/2); SDValue L = (ElemTy == MVT::i32) ? Elem[0] - : buildVector32({Elem.data(), Num/2}, dl, HalfTy, DAG); + : buildVector32(Elem.take_front(Num/2), dl, HalfTy, DAG); SDValue H = (ElemTy == MVT::i32) ? 
Elem[1] - : buildVector32({Elem.data()+Num/2, Num/2}, dl, HalfTy, DAG); + : buildVector32(Elem.drop_front(Num/2), dl, HalfTy, DAG); return DAG.getNode(HexagonISD::COMBINE, dl, VecTy, {H, L}); } @@ -2696,21 +2763,41 @@ HexagonTargetLowering::insertVector(SDValue VecV, SDValue ValV, SDValue IdxV, } SDValue +HexagonTargetLowering::getZero(const SDLoc &dl, MVT Ty, SelectionDAG &DAG) + const { + if (Ty.isVector()) { + assert(Ty.isInteger() && "Only integer vectors are supported here"); + unsigned W = Ty.getSizeInBits(); + if (W <= 64) + return DAG.getBitcast(Ty, DAG.getConstant(0, dl, MVT::getIntegerVT(W))); + return DAG.getNode(HexagonISD::VZERO, dl, Ty); + } + + if (Ty.isInteger()) + return DAG.getConstant(0, dl, Ty); + if (Ty.isFloatingPoint()) + return DAG.getConstantFP(0.0, dl, Ty); + llvm_unreachable("Invalid type for zero"); +} + +SDValue HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { MVT VecTy = ty(Op); unsigned BW = VecTy.getSizeInBits(); + + if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(VecTy, true)) + return LowerHvxBuildVector(Op, DAG); + if (BW == 32 || BW == 64) { + const SDLoc &dl(Op); SmallVector<SDValue,8> Ops; for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) Ops.push_back(Op.getOperand(i)); if (BW == 32) - return buildVector32(Ops, SDLoc(Op), VecTy, DAG); - return buildVector64(Ops, SDLoc(Op), VecTy, DAG); + return buildVector32(Ops, dl, VecTy, DAG); + return buildVector64(Ops, dl, VecTy, DAG); } - if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(VecTy)) - return LowerHvxBuildVector(Op, DAG); - return SDValue(); } @@ -2822,7 +2909,7 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { #ifndef NDEBUG Op.getNode()->dumpr(&DAG); if (Opc > HexagonISD::OP_BEGIN && Opc < HexagonISD::OP_END) - errs() << "Check for a non-legal type in this operation\n"; + errs() << "Error: check for a non-legal type in this operation\n"; #endif llvm_unreachable("Should not custom lower this!"); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h index 0619e2e4e7f9..732834b464b4 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.h +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ -70,6 +70,7 @@ namespace HexagonISD { EH_RETURN, DCFETCH, READCYCLE, + VZERO, OP_END }; @@ -283,6 +284,9 @@ namespace HexagonISD { } private: + bool getBuildVectorConstInts(ArrayRef<SDValue> Values, MVT VecTy, + SelectionDAG &DAG, + MutableArrayRef<ConstantInt*> Consts) const; SDValue buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl, MVT VecTy, SelectionDAG &DAG) const; SDValue buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl, MVT VecTy, @@ -301,6 +305,7 @@ namespace HexagonISD { SDNode *N = DAG.getMachineNode(MachineOpc, dl, Ty, Ops); return SDValue(N, 0); } + SDValue getZero(const SDLoc &dl, MVT Ty, SelectionDAG &DAG) const; using VectorPair = std::pair<SDValue, SDValue>; using TypePair = std::pair<MVT, MVT>; @@ -344,6 +349,13 @@ namespace HexagonISD { SDValue getByteShuffle(const SDLoc &dl, SDValue Op0, SDValue Op1, ArrayRef<int> Mask, SelectionDAG &DAG) const; + MVT getVecBoolVT() const; + + SDValue buildHvxVectorSingle(ArrayRef<SDValue> Values, const SDLoc &dl, + MVT VecTy, SelectionDAG &DAG) const; + SDValue buildHvxVectorPred(ArrayRef<SDValue> Values, const SDLoc &dl, + MVT VecTy, SelectionDAG &DAG) const; + SDValue LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxExtractElement(SDValue Op, 
SelectionDAG &DAG) const; SDValue LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index c1d44cb0e7de..51480d09d734 100644 --- a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -141,49 +141,50 @@ HexagonTargetLowering::getByteShuffle(const SDLoc &dl, SDValue Op0, opCastElem(Op1, MVT::i8, DAG), ByteMask); } +MVT +HexagonTargetLowering::getVecBoolVT() const { + return MVT::getVectorVT(MVT::i1, 8*Subtarget.getVectorLength()); +} + SDValue -HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) - const { - const SDLoc &dl(Op); - BuildVectorSDNode *BN = cast<BuildVectorSDNode>(Op.getNode()); - bool IsConst = BN->isConstant(); +HexagonTargetLowering::buildHvxVectorSingle(ArrayRef<SDValue> Values, + const SDLoc &dl, MVT VecTy, + SelectionDAG &DAG) const { + unsigned VecLen = Values.size(); MachineFunction &MF = DAG.getMachineFunction(); - MVT VecTy = ty(Op); + MVT ElemTy = VecTy.getVectorElementType(); + unsigned ElemWidth = ElemTy.getSizeInBits(); + unsigned HwLen = Subtarget.getVectorLength(); - if (IsConst) { - SmallVector<Constant*, 128> Elems; - for (SDValue V : BN->op_values()) { - if (auto *C = dyn_cast<ConstantSDNode>(V.getNode())) - Elems.push_back(const_cast<ConstantInt*>(C->getConstantIntValue())); - } - Constant *CV = ConstantVector::get(Elems); - unsigned Align = VecTy.getSizeInBits() / 8; + SmallVector<ConstantInt*, 128> Consts(VecLen); + bool AllConst = getBuildVectorConstInts(Values, VecTy, DAG, Consts); + if (AllConst) { + if (llvm::all_of(Consts, [](ConstantInt *CI) { return CI->isZero(); })) + return getZero(dl, VecTy, DAG); + + ArrayRef<Constant*> Tmp((Constant**)Consts.begin(), + (Constant**)Consts.end()); + Constant *CV = ConstantVector::get(Tmp); + unsigned Align = HwLen; SDValue CP = LowerConstantPool(DAG.getConstantPool(CV, VecTy, Align), DAG); return DAG.getLoad(VecTy, dl, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(MF), Align); } - unsigned NumOps = Op.getNumOperands(); - unsigned HwLen = Subtarget.getVectorLength(); - unsigned ElemSize = VecTy.getVectorElementType().getSizeInBits() / 8; - assert(ElemSize*NumOps == HwLen); - + unsigned ElemSize = ElemWidth / 8; + assert(ElemSize*VecLen == HwLen); SmallVector<SDValue,32> Words; - SmallVector<SDValue,32> Ops; - for (unsigned i = 0; i != NumOps; ++i) - Ops.push_back(Op.getOperand(i)); if (VecTy.getVectorElementType() != MVT::i32) { - assert(ElemSize < 4 && "vNi64 should have been promoted to vNi32"); assert((ElemSize == 1 || ElemSize == 2) && "Invalid element size"); unsigned OpsPerWord = (ElemSize == 1) ? 4 : 2; MVT PartVT = MVT::getVectorVT(VecTy.getVectorElementType(), OpsPerWord); - for (unsigned i = 0; i != NumOps; i += OpsPerWord) { - SDValue W = buildVector32({&Ops[i], OpsPerWord}, dl, PartVT, DAG); + for (unsigned i = 0; i != VecLen; i += OpsPerWord) { + SDValue W = buildVector32(Values.slice(i, OpsPerWord), dl, PartVT, DAG); Words.push_back(DAG.getBitcast(MVT::i32, W)); } } else { - Words.assign(Ops.begin(), Ops.end()); + Words.assign(Values.begin(), Values.end()); } // Construct two halves in parallel, then or them together. 
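The code above packs sub-word lanes into 32-bit words before the two register halves are combined. A scalar model of the byte case (illustrative only; assumes lane 0 lands in the least significant byte, matching little-endian lane order on Hexagon):

    #include <cstdint>
    #include <vector>

    // Pack byte lanes into 32-bit words, lane 0 in the least significant
    // byte of word 0, mirroring the OpsPerWord == 4 path above.
    std::vector<uint32_t> packLanes(const std::vector<uint8_t> &Lanes) {
      std::vector<uint32_t> Words;
      for (std::size_t I = 0; I < Lanes.size(); I += 4) {
        uint32_t W = 0;
        for (std::size_t B = 0; B < 4 && I + B < Lanes.size(); ++B)
          W |= uint32_t(Lanes[I + B]) << (8 * B);
        Words.push_back(W);
      }
      return Words;
    }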
@@ -208,6 +209,83 @@ HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) } SDValue +HexagonTargetLowering::buildHvxVectorPred(ArrayRef<SDValue> Values, + const SDLoc &dl, MVT VecTy, + SelectionDAG &DAG) const { + // Construct a vector V of bytes, such that a comparison V >u 0 would + // produce the required vector predicate. + unsigned VecLen = Values.size(); + unsigned HwLen = Subtarget.getVectorLength(); + assert(VecLen <= HwLen || VecLen == 8*HwLen); + SmallVector<SDValue,128> Bytes; + + if (VecLen <= HwLen) { + // In the hardware, each bit of a vector predicate corresponds to a byte + // of a vector register. Calculate how many bytes a bit of VecTy + // corresponds to. + assert(HwLen % VecLen == 0); + unsigned BitBytes = HwLen / VecLen; + for (SDValue V : Values) { + SDValue Ext = !V.isUndef() ? DAG.getZExtOrTrunc(V, dl, MVT::i8) + : DAG.getConstant(0, dl, MVT::i8); + for (unsigned B = 0; B != BitBytes; ++B) + Bytes.push_back(Ext); + } + } else { + // There are as many i1 values as there are bits in a vector register. + // Divide the values into groups of 8 and check that each group consists + // of the same value (ignoring undefs). + for (unsigned I = 0; I != VecLen; I += 8) { + unsigned B = 0; + // Find the first non-undef value in this group. + for (; B != 8; ++B) { + if (!Values[I+B].isUndef()) + break; + } + SDValue F = Values[I+B]; + SDValue Ext = (B < 8) ? DAG.getZExtOrTrunc(F, dl, MVT::i8) + : DAG.getConstant(0, dl, MVT::i8); + Bytes.push_back(Ext); + // Verify that the rest of the values in the group are the same as the + // first. + for (; B != 8; ++B) + assert(Values[I+B].isUndef() || Values[I+B] == F); + } + } + + MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen); + SDValue ByteVec = buildHvxVectorSingle(Bytes, dl, ByteTy, DAG); + SDValue Cmp = DAG.getSetCC(dl, VecTy, ByteVec, getZero(dl, ByteTy, DAG), + ISD::SETUGT); + return Cmp; +} + +SDValue +HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) + const { + const SDLoc &dl(Op); + MVT VecTy = ty(Op); + + unsigned Size = Op.getNumOperands(); + SmallVector<SDValue,128> Ops; + for (unsigned i = 0; i != Size; ++i) + Ops.push_back(Op.getOperand(i)); + + if (VecTy.getVectorElementType() == MVT::i1) + return buildHvxVectorPred(Ops, dl, VecTy, DAG); + + if (VecTy.getSizeInBits() == 16*Subtarget.getVectorLength()) { + ArrayRef<SDValue> A(Ops); + MVT SingleTy = typeSplit(VecTy).first; + SDValue V0 = buildHvxVectorSingle(A.take_front(Size/2), dl, SingleTy, DAG); + SDValue V1 = buildHvxVectorSingle(A.drop_front(Size/2), dl, SingleTy, DAG); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VecTy, V0, V1); + } + + return buildHvxVectorSingle(Ops, dl, VecTy, DAG); +} + +SDValue HexagonTargetLowering::LowerHvxExtractElement(SDValue Op, SelectionDAG &DAG) const { // Change the type of the extracted element to i32. @@ -399,6 +477,10 @@ HexagonTargetLowering::LowerHvxSetCC(SDValue Op, SelectionDAG &DAG) const { // (negate (swap-op NewCmp)), // the condition code for the NewCmp should be calculated from the original // CC by applying these operations in the reverse order. + // + // This could also be done through setCondCodeAction, but for negation it + // uses an xor with a vector of -1s, which it obtains from BUILD_VECTOR. + // That is far too expensive for what can be done with a single instruction.
switch (CC) { case ISD::SETNE: // !eq diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td index e2120d3de2ef..cdc2085986a5 100644 --- a/lib/Target/Hexagon/HexagonPatterns.td +++ b/lib/Target/Hexagon/HexagonPatterns.td @@ -2899,6 +2899,8 @@ def HexagonREADCYCLE: SDNode<"HexagonISD::READCYCLE", SDTInt64Leaf, def: Pat<(HexagonREADCYCLE), (A4_tfrcpp UPCYCLE)>; +def SDTVecLeaf: SDTypeProfile<1, 0, [SDTCisVec<0>]>; + def SDTHexagonVEXTRACTW: SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVec<1>, SDTCisVT<2, i32>]>; def HexagonVEXTRACTW : SDNode<"HexagonISD::VEXTRACTW", SDTHexagonVEXTRACTW>; @@ -2920,7 +2922,14 @@ let Predicates = [UseHVX] in { def: OpR_RR_pat<V6_vpackoh, pf2<HexagonVPACKO>, VecI16, HVI16>; } +def HexagonVZERO: SDNode<"HexagonISD::VZERO", SDTVecLeaf>; +def vzero: PatFrag<(ops), (HexagonVZERO)>; + let Predicates = [UseHVX] in { + def: Pat<(VecI8 vzero), (V6_vd0)>; + def: Pat<(VecI16 vzero), (V6_vd0)>; + def: Pat<(VecI32 vzero), (V6_vd0)>; + def: Pat<(VecPI8 (concat_vectors HVI8:$Vs, HVI8:$Vt)), (Combinev HvxVR:$Vt, HvxVR:$Vs)>; def: Pat<(VecPI16 (concat_vectors HVI16:$Vs, HVI16:$Vt)), diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.td b/lib/Target/Hexagon/HexagonRegisterInfo.td index 2ceed70c2497..1d1e85e7ac7e 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.td +++ b/lib/Target/Hexagon/HexagonRegisterInfo.td @@ -242,7 +242,7 @@ def VecQ32 // FIXME: the register order should be defined in terms of the preferred // allocation order... // -def IntRegs : RegisterClass<"Hexagon", [i32, f32, v4i8, v2i16], 32, +def IntRegs : RegisterClass<"Hexagon", [i32, f32, v32i1, v4i8, v2i16], 32, (add (sequence "R%u", 0, 9), (sequence "R%u", 12, 28), R10, R11, R29, R30, R31)>; @@ -254,7 +254,8 @@ def GeneralSubRegs : RegisterClass<"Hexagon", [i32], 32, def IntRegsLow8 : RegisterClass<"Hexagon", [i32], 32, (add R7, R6, R5, R4, R3, R2, R1, R0)> ; -def DoubleRegs : RegisterClass<"Hexagon", [i64, f64, v8i8, v4i16, v2i32], 64, +def DoubleRegs : RegisterClass<"Hexagon", + [i64, f64, v64i1, v8i8, v4i16, v2i32], 64, (add (sequence "D%u", 0, 4), (sequence "D%u", 6, 13), D5, D14, D15)>; def GeneralDoubleLow8Regs : RegisterClass<"Hexagon", [i64], 64, diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h index 678ef210d0ae..af93f20d97fc 100644 --- a/lib/Target/Hexagon/HexagonSubtarget.h +++ b/lib/Target/Hexagon/HexagonSubtarget.h @@ -204,14 +204,38 @@ public: llvm_unreachable("Invalid HVX vector length settings"); } - bool isHVXVectorType(MVT VecTy) const { + ArrayRef<MVT> getHVXElementTypes() const { + static MVT Types[] = { MVT::i8, MVT::i16, MVT::i32 }; + return makeArrayRef(Types); + } + + bool isHVXVectorType(MVT VecTy, bool IncludeBool = false) const { if (!VecTy.isVector() || !useHVXOps()) return false; - unsigned ElemWidth = VecTy.getVectorElementType().getSizeInBits(); - if (ElemWidth < 8 || ElemWidth > 64) + MVT ElemTy = VecTy.getVectorElementType(); + if (!IncludeBool && ElemTy == MVT::i1) + return false; + + unsigned HwLen = getVectorLength(); + unsigned NumElems = VecTy.getVectorNumElements(); + ArrayRef<MVT> ElemTypes = getHVXElementTypes(); + + if (IncludeBool && ElemTy == MVT::i1) { + // Special case for the v512i1, etc. + if (8*HwLen == NumElems) + return true; + // Boolean HVX vector types are formed from regular HVX vector types + // by replacing the element type with i1. 
+ for (MVT T : ElemTypes) + if (NumElems * T.getSizeInBits() == 8*HwLen) + return true; return false; + } + unsigned VecWidth = VecTy.getSizeInBits(); - return VecWidth == 8*getVectorLength() || VecWidth == 16*getVectorLength(); + if (VecWidth != 8*HwLen && VecWidth != 16*HwLen) + return false; + return llvm::any_of(ElemTypes, [ElemTy] (MVT T) { return ElemTy == T; }); } unsigned getL1CacheLineSize() const; diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index 0c40a7b8f382..363b703fef28 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -258,10 +258,9 @@ void HexagonTargetMachine::adjustPassManager(PassManagerBuilder &PMB) { }); } -TargetIRAnalysis HexagonTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](const Function &F) { - return TargetTransformInfo(HexagonTTIImpl(this, F)); - }); +TargetTransformInfo +HexagonTargetMachine::getTargetTransformInfo(const Function &F) { + return TargetTransformInfo(HexagonTTIImpl(this, F)); } diff --git a/lib/Target/Hexagon/HexagonTargetMachine.h b/lib/Target/Hexagon/HexagonTargetMachine.h index acd41f920b53..a7c6a3437fbc 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.h +++ b/lib/Target/Hexagon/HexagonTargetMachine.h @@ -39,7 +39,7 @@ public: void adjustPassManager(PassManagerBuilder &PMB) override; TargetPassConfig *createPassConfig(PassManagerBase &PM) override; - TargetIRAnalysis getTargetIRAnalysis() override; + TargetTransformInfo getTargetTransformInfo(const Function &F) override; HexagonTargetObjectFile *getObjFileLowering() const override { return static_cast<HexagonTargetObjectFile*>(TLOF.get()); diff --git a/lib/Target/Lanai/LanaiTargetMachine.cpp b/lib/Target/Lanai/LanaiTargetMachine.cpp index 9a73c95d6516..2c21a53b13bb 100644 --- a/lib/Target/Lanai/LanaiTargetMachine.cpp +++ b/lib/Target/Lanai/LanaiTargetMachine.cpp @@ -74,10 +74,9 @@ LanaiTargetMachine::LanaiTargetMachine(const Target &T, const Triple &TT, initAsmInfo(); } -TargetIRAnalysis LanaiTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](const Function &F) { - return TargetTransformInfo(LanaiTTIImpl(this, F)); - }); +TargetTransformInfo +LanaiTargetMachine::getTargetTransformInfo(const Function &F) { + return TargetTransformInfo(LanaiTTIImpl(this, F)); } namespace { diff --git a/lib/Target/Lanai/LanaiTargetMachine.h b/lib/Target/Lanai/LanaiTargetMachine.h index 2fb1a0536104..0db286ec13e7 100644 --- a/lib/Target/Lanai/LanaiTargetMachine.h +++ b/lib/Target/Lanai/LanaiTargetMachine.h @@ -42,7 +42,7 @@ public: return &Subtarget; } - TargetIRAnalysis getTargetIRAnalysis() override; + TargetTransformInfo getTargetTransformInfo(const Function &F) override; // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &pass_manager) override; diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index ac81e6207456..2f6dd0035de3 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -188,7 +188,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, // so we have to special check for them. 
unsigned Opcode = TmpInst.getOpcode(); if ((Opcode != Mips::NOP) && (Opcode != Mips::SLL) && - (Opcode != Mips::SLL_MM) && !Binary) + (Opcode != Mips::SLL_MM) && (Opcode != Mips::SLL_MMR6) && !Binary) llvm_unreachable("unimplemented opcode in encodeInstruction()"); int NewOpcode = -1; diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td index 50537bed8ff0..c85ee20273c0 100644 --- a/lib/Target/Mips/MipsRegisterInfo.td +++ b/lib/Target/Mips/MipsRegisterInfo.td @@ -38,7 +38,7 @@ class MipsRegWithSubRegs<bits<16> Enc, string n, list<Register> subregs> let Namespace = "Mips"; } -// Mips CPU Registers +// Mips CPU Registers. class MipsGPRReg<bits<16> Enc, string n> : MipsReg<Enc, n>; // Mips 64-bit CPU Registers diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index 85193bffef56..fb79a4bf40c5 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -259,17 +259,16 @@ void MipsPassConfig::addPreRegAlloc() { addPass(createMipsOptimizePICCallPass()); } -TargetIRAnalysis MipsTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](const Function &F) { - if (Subtarget->allowMixed16_32()) { - DEBUG(errs() << "No Target Transform Info Pass Added\n"); - // FIXME: This is no longer necessary as the TTI returned is per-function. - return TargetTransformInfo(F.getParent()->getDataLayout()); - } - - DEBUG(errs() << "Target Transform Info Pass Added\n"); - return TargetTransformInfo(BasicTTIImpl(this, F)); - }); +TargetTransformInfo +MipsTargetMachine::getTargetTransformInfo(const Function &F) { + if (Subtarget->allowMixed16_32()) { + DEBUG(errs() << "No Target Transform Info Pass Added\n"); + // FIXME: This is no longer necessary as the TTI returned is per-function. 
+ return TargetTransformInfo(F.getParent()->getDataLayout()); + } + + DEBUG(errs() << "Target Transform Info Pass Added\n"); + return TargetTransformInfo(BasicTTIImpl(this, F)); } // Implemented by targets that want to run passes immediately before diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h index ccfc9a938d9c..56e6e5d8daa2 100644 --- a/lib/Target/Mips/MipsTargetMachine.h +++ b/lib/Target/Mips/MipsTargetMachine.h @@ -44,7 +44,7 @@ public: CodeGenOpt::Level OL, bool JIT, bool isLittle); ~MipsTargetMachine() override; - TargetIRAnalysis getTargetIRAnalysis() override; + TargetTransformInfo getTargetTransformInfo(const Function &F) override; const MipsSubtarget *getSubtargetImpl() const { if (Subtarget) diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 85f757878f94..d31e1cb5047b 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -180,10 +180,9 @@ void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { }); } -TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](const Function &F) { - return TargetTransformInfo(NVPTXTTIImpl(this, F)); - }); +TargetTransformInfo +NVPTXTargetMachine::getTargetTransformInfo(const Function &F) { + return TargetTransformInfo(NVPTXTTIImpl(this, F)); } void NVPTXPassConfig::addEarlyCSEOrGVNPass() { diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h index 54a72a688ee3..eeebf64d39c3 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.h +++ b/lib/Target/NVPTX/NVPTXTargetMachine.h @@ -63,7 +63,7 @@ public: void adjustPassManager(PassManagerBuilder &) override; - TargetIRAnalysis getTargetIRAnalysis() override; + TargetTransformInfo getTargetTransformInfo(const Function &F) override; bool isMachineVerifierClean() const override { return false; diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index c870a2256691..7902da20a010 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -1531,11 +1531,11 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, void PPCFrameLowering::createTailCallBranchInstr(MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); - DebugLoc dl; - if (MBBI != MBB.end()) - dl = MBBI->getDebugLoc(); + // If we got this far a first terminator should exist. + assert(MBBI != MBB.end() && "Failed to find the first terminator."); + DebugLoc dl = MBBI->getDebugLoc(); const PPCInstrInfo &TII = *Subtarget.getInstrInfo(); // Create branch instruction for pseudo tail call return instruction diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 18e567fa589c..cea59de3e8a9 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -11882,6 +11882,12 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N, SDLoc dl(N); SDValue Op(N, 0); + // Don't handle ppc_fp128 here or i1 conversions. + if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) + return SDValue(); + if (Op.getOperand(0).getValueType() == MVT::i1) + return SDValue(); + SDValue FirstOperand(Op.getOperand(0)); bool SubWordLoad = FirstOperand.getOpcode() == ISD::LOAD && (FirstOperand.getValueType() == MVT::i8 || @@ -11910,11 +11916,6 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N, return DAG.getNode(ConvOp, dl, DstDouble ? 
MVT::f64 : MVT::f32, Ld); } - // Don't handle ppc_fp128 here or i1 conversions. - if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) - return SDValue(); - if (Op.getOperand(0).getValueType() == MVT::i1) - return SDValue(); // For i32 intermediate values, unfortunately, the conversion functions // leave the upper 32 bits of the value are undefined. Within the set of diff --git a/lib/Target/PowerPC/PPCMIPeephole.cpp b/lib/Target/PowerPC/PPCMIPeephole.cpp index a2640727f813..474661aaaee8 100644 --- a/lib/Target/PowerPC/PPCMIPeephole.cpp +++ b/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -1025,9 +1025,6 @@ bool PPCMIPeephole::eliminateRedundantTOCSaves( // bge 0, .LBB0_4 bool PPCMIPeephole::eliminateRedundantCompare(void) { - // FIXME: this transformation is causing miscompiles. Disabling it for now - // until we can resolve the issue. - return false; bool Simplified = false; for (MachineBasicBlock &MBB2 : *MF) { @@ -1087,10 +1084,21 @@ bool PPCMIPeephole::eliminateRedundantCompare(void) { // we replace it with a signed comparison if the comparison // to be merged is a signed comparison. // In other cases of opcode mismatch, we cannot optimize this. - if (isEqOrNe(BI2) && + + // We cannot change opcode when comparing against an immediate + // if the most significant bit of the immediate is one + // due to the difference in sign extension. + auto CmpAgainstImmWithSignBit = [](MachineInstr *I) { + if (!I->getOperand(2).isImm()) + return false; + int16_t Imm = (int16_t)I->getOperand(2).getImm(); + return Imm < 0; + }; + + if (isEqOrNe(BI2) && !CmpAgainstImmWithSignBit(CMPI2) && CMPI1->getOpcode() == getSignedCmpOpCode(CMPI2->getOpcode())) NewOpCode = CMPI1->getOpcode(); - else if (isEqOrNe(BI1) && + else if (isEqOrNe(BI1) && !CmpAgainstImmWithSignBit(CMPI1) && getSignedCmpOpCode(CMPI1->getOpcode()) == CMPI2->getOpcode()) NewOpCode = CMPI2->getOpcode(); else continue; diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index 491f25ca2c64..20a83c973026 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -451,8 +451,7 @@ void PPCPassConfig::addPreEmitPass() { addPass(createPPCBranchSelectionPass(), false); } -TargetIRAnalysis PPCTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](const Function &F) { - return TargetTransformInfo(PPCTTIImpl(this, F)); - }); +TargetTransformInfo +PPCTargetMachine::getTargetTransformInfo(const Function &F) { + return TargetTransformInfo(PPCTTIImpl(this, F)); } diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index 102bf7ca59c2..75b98a815ab4 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -49,7 +49,7 @@ public: // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; - TargetIRAnalysis getTargetIRAnalysis() override; + TargetTransformInfo getTargetTransformInfo(const Function &F) override; TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp index e74d68182949..3a167a6d452a 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -257,8 +257,7 @@ TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) { return new SystemZPassConfig(*this, PM); } -TargetIRAnalysis SystemZTargetMachine::getTargetIRAnalysis() { - 
return TargetIRAnalysis([this](const Function &F) { - return TargetTransformInfo(SystemZTTIImpl(this, F)); - }); +TargetTransformInfo +SystemZTargetMachine::getTargetTransformInfo(const Function &F) { + return TargetTransformInfo(SystemZTTIImpl(this, F)); } diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h index 95ad5e339e0b..52bf8bba55de 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.h +++ b/lib/Target/SystemZ/SystemZTargetMachine.h @@ -44,7 +44,7 @@ public: // Override LLVMTargetMachine TargetPassConfig *createPassConfig(PassManagerBase &PM) override; - TargetIRAnalysis getTargetIRAnalysis() override; + TargetTransformInfo getTargetTransformInfo(const Function &F) override; TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index ad63c7a9cb30..c4c0dd22ee0c 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -219,10 +219,8 @@ CodeGenOpt::Level TargetMachine::getOptLevel() const { return OptLevel; } void TargetMachine::setOptLevel(CodeGenOpt::Level Level) { OptLevel = Level; } -TargetIRAnalysis TargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([](const Function &F) { - return TargetTransformInfo(F.getParent()->getDataLayout()); - }); +TargetTransformInfo TargetMachine::getTargetTransformInfo(const Function &F) { + return TargetTransformInfo(F.getParent()->getDataLayout()); } void TargetMachine::getNameWithPrefix(SmallVectorImpl<char> &Name, @@ -244,3 +242,10 @@ MCSymbol *TargetMachine::getSymbol(const GlobalValue *GV) const { getNameWithPrefix(NameStr, GV, TLOF->getMangler()); return TLOF->getContext().getOrCreateSymbol(NameStr); } + +TargetIRAnalysis TargetMachine::getTargetIRAnalysis() { + // Since Analysis can't depend on Target, use a std::function to invert the + // dependency. + return TargetIRAnalysis( + [this](const Function &F) { return this->getTargetTransformInfo(F); }); +} diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index 2bdba96ab674..a4bb967f36f6 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -746,6 +746,14 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>(); LiveIntervals &LIS = getAnalysis<LiveIntervals>(); + // Disable the TEE optimization if we aren't doing direct wasm object + // emission, because lowering TEE to TEE_LOCAL is done in the ExplicitLocals + // pass, which is also disabled. + bool UseTee = true; + if (MF.getSubtarget<WebAssemblySubtarget>() + .getTargetTriple().isOSBinFormatELF()) + UseTee = false; + // Walk the instructions from the bottom up. Currently we don't look past // block boundaries, and the blocks aren't ordered so the block visitation // order isn't significant, but we may want to change this in the future. 
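A minimal standalone sketch of the dependency-inversion pattern used in the TargetMachine.cpp hunk above: getTargetIRAnalysis() becomes a thin wrapper that forwards to the new per-function getTargetTransformInfo() hook through a std::function, so the Analysis layer never links against Target. The type names below (Machine, IRAnalysis, TTIResult, FunctionIR) are illustrative stand-ins, not the real LLVM classes.

#include <functional>
#include <utility>

struct FunctionIR {}; // stand-in for llvm::Function
struct TTIResult {};  // stand-in for llvm::TargetTransformInfo

// Analysis layer: knows nothing about concrete targets, only a callback.
class IRAnalysis {
  std::function<TTIResult(const FunctionIR &)> Callback;
public:
  explicit IRAnalysis(std::function<TTIResult(const FunctionIR &)> CB)
      : Callback(std::move(CB)) {}
  TTIResult run(const FunctionIR &F) const { return Callback(F); }
};

// Target layer: supplies the callback, inverting the layering so the
// analysis side never includes target headers.
class Machine {
public:
  TTIResult getTargetTransformInfo(const FunctionIR &) { return {}; }
  IRAnalysis getTargetIRAnalysis() {
    return IRAnalysis(
        [this](const FunctionIR &F) { return getTargetTransformInfo(F); });
  }
};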
@@ -811,7 +819,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
Insert = RematerializeCheapDef(Reg, Op, *Def, MBB, Insert->getIterator(),
LIS, MFI, MRI, TII, TRI);
- } else if (CanMove &&
+ } else if (UseTee && CanMove &&
OneUseDominatesOtherUses(Reg, Op, MBB, MRI, MDT, LIS, MFI)) {
Insert = MoveAndTeeForMultiUse(Reg, Op, Def, MBB, Insert, LIS,
MFI, MRI, TII);
diff --git a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index 2599064334ee..f808c063d7e4 100644
--- a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -223,6 +223,8 @@ RuntimeLibcallSignatures[RTLIB::UNKNOWN_LIBCALL] = {
/* SINCOS_F80 */ unsupported,
/* SINCOS_F128 */ func_i64_i64_iPTR_iPTR,
/* SINCOS_PPCF128 */ unsupported,
+/* SINCOS_STRET_F32 */ unsupported,
+/* SINCOS_STRET_F64 */ unsupported,
/* POW_F32 */ f32_func_f32_f32,
/* POW_F64 */ f64_func_f64_f64,
/* POW_F80 */ unsupported,
@@ -390,8 +392,9 @@ RuntimeLibcallSignatures[RTLIB::UNKNOWN_LIBCALL] = {
// MEMORY
/* MEMCPY */ iPTR_func_iPTR_iPTR_iPTR,
-/* MEMSET */ iPTR_func_iPTR_i32_iPTR,
/* MEMMOVE */ iPTR_func_iPTR_iPTR_iPTR,
+/* MEMSET */ iPTR_func_iPTR_i32_iPTR,
+/* BZERO */ unsupported,
// ELEMENT-WISE ATOMIC MEMORY
/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_1 */ unsupported,
@@ -687,6 +690,8 @@ RuntimeLibcallNames[RTLIB::UNKNOWN_LIBCALL] = {
/* SINCOS_F80 */ nullptr,
/* SINCOS_F128 */ "sincosl",
/* SINCOS_PPCF128 */ nullptr,
+/* SINCOS_STRET_F32 */ nullptr,
+/* SINCOS_STRET_F64 */ nullptr,
/* POW_F32 */ "powf",
/* POW_F64 */ "pow",
/* POW_F80 */ nullptr,
@@ -850,6 +855,7 @@ RuntimeLibcallNames[RTLIB::UNKNOWN_LIBCALL] = {
/* MEMCPY */ "memcpy",
/* MEMMOVE */ "memmove",
/* MEMSET */ "memset",
+/* BZERO */ nullptr,
/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_1 */ nullptr,
/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_2 */ nullptr,
/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_4 */ nullptr,
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index 3cc19ef5fbab..d38cde74d2ec 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -146,10 +146,9 @@ public:
};
} // end anonymous namespace
-TargetIRAnalysis WebAssemblyTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](const Function &F) {
- return TargetTransformInfo(WebAssemblyTTIImpl(this, F));
- });
+TargetTransformInfo
+WebAssemblyTargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(WebAssemblyTTIImpl(this, F));
}
TargetPassConfig *
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.h b/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
index 224849526514..dd826befd117 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
+++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
@@ -43,8 +43,7 @@ public:
return TLOF.get();
}
- /// \brief Get the TargetIRAnalysis for this target.
- TargetIRAnalysis getTargetIRAnalysis() override;
+ TargetTransformInfo getTargetTransformInfo(const Function &F) override;
bool usesPhysRegsForPEI() const override { return false; }
};
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 78385ae1877b..239db2a74b24 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -78,7 +78,7 @@ public:
CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" &&
CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" &&
CPU != "geode" && CPU != "winchip-c6" && CPU != "winchip2" &&
- CPU != "c3" && CPU != "c3-2" && CPU != "lakemont";
+ CPU != "c3" && CPU != "c3-2" && CPU != "lakemont" && CPU != "";
}
unsigned getNumFixupKinds() const override {
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 08731cd0204c..7e7c35569093 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -137,7 +137,7 @@ def FeatureVPOPCNTDQ : SubtargetFeature<"avx512vpopcntdq", "HasVPOPCNTDQ",
def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true",
"Enable AVX-512 PreFetch Instructions", [FeatureAVX512]>;
-def FeaturePREFETCHWT1 : SubtargetFeature<"prefetchwt1", "HasPFPREFETCHWT1",
+def FeaturePREFETCHWT1 : SubtargetFeature<"prefetchwt1", "HasPREFETCHWT1",
"true", "Prefetch with Intent to Write and T1 Hint">;
def FeatureDQI : SubtargetFeature<"avx512dq", "HasDQI", "true",
@@ -263,6 +263,12 @@ def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
"Use software floating point features.">;
+// On recent X86 (port bound) processors, it's preferable to combine to a single shuffle
+// using a variable mask over multiple fixed shuffles.
+def FeatureFastVariableShuffle
+ : SubtargetFeature<"fast-variable-shuffle",
+ "HasFastVariableShuffle",
+ "true", "Shuffles with variable masks are fast">;
// On some X86 processors, there is no performance hazard to writing only the
// lower parts of a YMM or ZMM register without clearing the upper part.
def FeatureFastPartialYMMorZMMWrite
@@ -620,7 +626,8 @@ def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [
FeatureERMSB,
FeatureFMA,
FeatureLZCNT,
- FeatureMOVBE
+ FeatureMOVBE,
+ FeatureFastVariableShuffle
]>;
class HaswellProc<string Name> : ProcModel<Name, HaswellModel,
@@ -632,7 +639,8 @@ def : HaswellProc<"core-avx2">; // Legacy alias.
def BDWFeatures : ProcessorFeatures<HSWFeatures.Value, [
FeatureADX,
- FeatureRDSEED
+ FeatureRDSEED,
+ FeaturePRFCHW
]>;
class BroadwellProc<string Name> : ProcModel<Name, BroadwellModel,
BDWFeatures.Value, [
@@ -669,7 +677,8 @@ def KNLFeatures : ProcessorFeatures<IVBFeatures.Value, [
FeatureLZCNT,
FeatureBMI,
FeatureBMI2,
- FeatureFMA
+ FeatureFMA,
+ FeaturePRFCHW
]>;
// FIXME: define KNL model
diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp
index 0a87fb4533c2..ba7280c29cc9 100644
--- a/lib/Target/X86/X86DomainReassignment.cpp
+++ b/lib/Target/X86/X86DomainReassignment.cpp
@@ -301,60 +301,21 @@ typedef DenseMap<InstrConverterBaseKeyTy, InstrConverterBase *>
/// different closure that manipulates the loaded or stored value.
class Closure {
private:
- const TargetInstrInfo *TII;
- MachineRegisterInfo *MRI;
-
/// Virtual registers in the closure.
DenseSet<unsigned> Edges;
/// Instructions in the closure.
SmallVector<MachineInstr *, 8> Instrs;
- /// A map of available Instruction Converters.
- const InstrConverterBaseMap &Converters;
-
- /// The register domain of this closure.
- RegDomain Domain;
-
/// Domains which this closure can legally be reassigned to.
std::bitset<NumDomains> LegalDstDomains;
- /// Enqueue \p Reg to be considered for addition to the closure.
- void visitRegister(unsigned Reg, SmallVectorImpl<unsigned> &Worklist);
-
- /// Add \p MI to this closure.
- void encloseInstr(MachineInstr *MI);
-
- /// Calculate the total cost of reassigning the closure to \p Domain.
- double calculateCost(RegDomain Domain) const;
-
- /// All edges that are included in some closure.
- DenseSet<unsigned> &EnclosedEdges;
-
- /// All instructions that are included in some closure.
- DenseMap<MachineInstr *, Closure *> &EnclosedInstrs;
-
public:
- Closure(const TargetInstrInfo *TII, MachineRegisterInfo *MRI,
- const InstrConverterBaseMap &Converters,
- std::initializer_list<RegDomain> LegalDstDomainList,
- DenseSet<unsigned> &EnclosedEdges,
- DenseMap<MachineInstr *, Closure *> &EnclosedInstrs)
- : TII(TII), MRI(MRI), Converters(Converters), Domain(NoDomain),
- EnclosedEdges(EnclosedEdges), EnclosedInstrs(EnclosedInstrs) {
+ Closure(std::initializer_list<RegDomain> LegalDstDomainList) {
for (RegDomain D : LegalDstDomainList)
LegalDstDomains.set(D);
}
- /// Starting from \Reg, expand the closure as much as possible.
- void buildClosure(unsigned E);
-
- /// /returns true if it is profitable to reassign the closure to \p Domain.
- bool isReassignmentProfitable(RegDomain Domain) const;
-
- /// Reassign the closure to \p Domain.
- void Reassign(RegDomain Domain) const;
-
/// Mark this closure as illegal for reassignment to all domains.
void setAllIllegal() { LegalDstDomains.reset(); }
@@ -364,10 +325,41 @@ public:
/// \returns true if is legal to reassign this closure to domain \p RD.
bool isLegal(RegDomain RD) const { return LegalDstDomains[RD]; }
+ /// Mark this closure as illegal for reassignment to domain \p RD.
+ void setIllegal(RegDomain RD) { LegalDstDomains[RD] = false; }
+
bool empty() const { return Edges.empty(); }
+
+ bool insertEdge(unsigned Reg) {
+ return Edges.insert(Reg).second;
+ }
+
+ using const_edge_iterator = DenseSet<unsigned>::const_iterator;
+ iterator_range<const_edge_iterator> edges() const {
+ return iterator_range<const_edge_iterator>(Edges.begin(), Edges.end());
+ }
+
+ void addInstruction(MachineInstr *I) {
+ Instrs.push_back(I);
+ }
+
+ ArrayRef<MachineInstr *> instructions() const {
+ return Instrs;
+ }
+
};
class X86DomainReassignment : public MachineFunctionPass {
+ const X86Subtarget *STI;
+ MachineRegisterInfo *MRI;
+ const X86InstrInfo *TII;
+
+ /// All edges that are included in some closure.
+ DenseSet<unsigned> EnclosedEdges;
+
+ /// All instructions that are included in some closure.
+ DenseMap<MachineInstr *, Closure *> EnclosedInstrs;
+
public:
static char ID;
@@ -387,22 +379,39 @@ public:
}
private:
- const X86Subtarget *STI;
- MachineRegisterInfo *MRI;
- const X86InstrInfo *TII;
-
/// A map of available Instruction Converters.
InstrConverterBaseMap Converters;
/// Initialize Converters map.
void initConverters();
+
+ /// Starting from \p Reg, expand the closure as much as possible.
+ void buildClosure(Closure &, unsigned Reg);
+
+ /// Enqueue \p Reg to be considered for addition to the closure.
+ void visitRegister(Closure &, unsigned Reg, RegDomain &Domain,
+ SmallVectorImpl<unsigned> &Worklist);
+
+ /// Reassign the closure to \p Domain.
+ void reassign(const Closure &C, RegDomain Domain) const;
+
+ /// Add \p MI to the closure.
+ void encloseInstr(Closure &C, MachineInstr *MI);
+
+ /// \returns true if it is profitable to reassign the closure to \p Domain.
+ bool isReassignmentProfitable(const Closure &C, RegDomain Domain) const;
+
+ /// Calculate the total cost of reassigning the closure to \p Domain.
+ double calculateCost(const Closure &C, RegDomain Domain) const;
};
char X86DomainReassignment::ID = 0;
} // End anonymous namespace.
-void Closure::visitRegister(unsigned Reg, SmallVectorImpl<unsigned> &Worklist) {
+void X86DomainReassignment::visitRegister(Closure &C, unsigned Reg,
+ RegDomain &Domain,
+ SmallVectorImpl<unsigned> &Worklist) {
if (EnclosedEdges.count(Reg))
return;
@@ -423,59 +432,61 @@ void Closure::visitRegister(unsigned Reg, SmallVectorImpl<unsigned> &Worklist) {
Worklist.push_back(Reg);
}
-void Closure::encloseInstr(MachineInstr *MI) {
+void X86DomainReassignment::encloseInstr(Closure &C, MachineInstr *MI) {
auto I = EnclosedInstrs.find(MI);
if (I != EnclosedInstrs.end()) {
- if (I->second != this)
+ if (I->second != &C)
// Instruction already belongs to another closure, avoid conflicts between
// closures and mark this closure as illegal.
- setAllIllegal();
+ C.setAllIllegal();
return;
}
- EnclosedInstrs[MI] = this;
- Instrs.push_back(MI);
+ EnclosedInstrs[MI] = &C;
+ C.addInstruction(MI);
// Mark closure as illegal for reassignment to domains, if there is no
// converter for the instruction or if the converter cannot convert the
// instruction.
- for (unsigned i = 0; i != LegalDstDomains.size(); ++i) {
- if (LegalDstDomains[i]) {
+ for (int i = 0; i != NumDomains; ++i) {
+ if (C.isLegal((RegDomain)i)) {
InstrConverterBase *IC = Converters.lookup({i, MI->getOpcode()});
if (!IC || !IC->isLegal(MI, TII))
- LegalDstDomains[i] = false;
+ C.setIllegal((RegDomain)i);
}
}
}
-double Closure::calculateCost(RegDomain DstDomain) const {
- assert(isLegal(DstDomain) && "Cannot calculate cost for illegal closure");
+double X86DomainReassignment::calculateCost(const Closure &C,
+ RegDomain DstDomain) const {
+ assert(C.isLegal(DstDomain) && "Cannot calculate cost for illegal closure");
double Cost = 0.0;
- for (auto MI : Instrs)
+ for (auto *MI : C.instructions())
Cost += Converters.lookup({DstDomain, MI->getOpcode()})->getExtraCost(MI, MRI);
return Cost;
}
-bool Closure::isReassignmentProfitable(RegDomain Domain) const {
- return calculateCost(Domain) < 0.0;
+bool X86DomainReassignment::isReassignmentProfitable(const Closure &C,
+ RegDomain Domain) const {
+ return calculateCost(C, Domain) < 0.0;
}
-void Closure::Reassign(RegDomain Domain) const {
- assert(isLegal(Domain) && "Cannot convert illegal closure");
+void X86DomainReassignment::reassign(const Closure &C, RegDomain Domain) const {
+ assert(C.isLegal(Domain) && "Cannot convert illegal closure");
// Iterate all instructions in the closure, convert each one using the
// appropriate converter.
SmallVector<MachineInstr *, 8> ToErase;
- for (auto MI : Instrs)
+ for (auto *MI : C.instructions())
if (Converters.lookup({Domain, MI->getOpcode()})
->convertInstr(MI, TII, MRI))
ToErase.push_back(MI);
// Iterate all registers in the closure, replace them with registers in the
// destination domain.
- for (unsigned Reg : Edges) {
+ for (unsigned Reg : C.edges()) {
MRI->setRegClass(Reg, getDstRC(MRI->getRegClass(Reg), Domain));
for (auto &MO : MRI->use_operands(Reg)) {
if (MO.isReg())
@@ -512,18 +523,19 @@ static bool usedAsAddr(const MachineInstr &MI, unsigned Reg,
return false;
}
-void Closure::buildClosure(unsigned Reg) {
+void X86DomainReassignment::buildClosure(Closure &C, unsigned Reg) {
SmallVector<unsigned, 4> Worklist;
- visitRegister(Reg, Worklist);
+ RegDomain Domain = NoDomain;
+ visitRegister(C, Reg, Domain, Worklist);
while (!Worklist.empty()) {
unsigned CurReg = Worklist.pop_back_val();
// Register already in this closure.
- if (!Edges.insert(CurReg).second)
+ if (!C.insertEdge(CurReg))
continue;
MachineInstr *DefMI = MRI->getVRegDef(CurReg);
- encloseInstr(DefMI);
+ encloseInstr(C, DefMI);
// Add register used by the defining MI to the worklist.
// Do not add registers which are used in address calculation, they will be
@@ -542,7 +554,7 @@ void Closure::buildClosure(unsigned Reg) {
auto &Op = DefMI->getOperand(OpIdx);
if (!Op.isReg() || !Op.isUse())
continue;
- visitRegister(Op.getReg(), Worklist);
+ visitRegister(C, Op.getReg(), Domain, Worklist);
}
// Expand closure through register uses.
@@ -550,10 +562,10 @@
// We would like to avoid converting closures which calculate addresses,
// as this should remain in GPRs.
if (usedAsAddr(UseMI, CurReg, TII)) {
- setAllIllegal();
+ C.setAllIllegal();
continue;
}
- encloseInstr(&UseMI);
+ encloseInstr(C, &UseMI);
for (auto &DefOp : UseMI.defs()) {
if (!DefOp.isReg())
continue;
unsigned DefReg = DefOp.getReg();
if (!TargetRegisterInfo::isVirtualRegister(DefReg)) {
- setAllIllegal();
+ C.setAllIllegal();
continue;
}
- visitRegister(DefReg, Worklist);
+ visitRegister(C, DefReg, Domain, Worklist);
}
}
}
@@ -701,8 +713,8 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
initConverters();
bool Changed = false;
- DenseSet<unsigned> EnclosedEdges;
- DenseMap<MachineInstr *, Closure *> EnclosedInstrs;
+ EnclosedEdges.clear();
+ EnclosedInstrs.clear();
std::vector<Closure> Closures;
@@ -719,9 +731,8 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
continue;
// Calculate closure starting with Reg.
- Closure C(TII, MRI, Converters, {MaskDomain}, EnclosedEdges,
- EnclosedInstrs);
- C.buildClosure(Reg);
+ Closure C({MaskDomain});
+ buildClosure(C, Reg);
// Collect all closures that can potentially be converted.
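The buildClosure() move above keeps the original worklist algorithm intact while relocating its state onto the pass object. A rough sketch of that pattern with simplified stand-in types (the real pass walks def and use chains through MachineRegisterInfo):

#include <unordered_set>
#include <vector>

using Reg = unsigned;

struct SimpleClosure {
  std::unordered_set<Reg> Edges;
  // Returns true only the first time a register is added.
  bool insertEdge(Reg R) { return Edges.insert(R).second; }
};

// NeighborsOf stands in for "registers connected through defining and
// using instructions" in the real pass.
template <typename NeighborFn>
void buildClosure(SimpleClosure &C, Reg Seed, NeighborFn NeighborsOf) {
  std::vector<Reg> Worklist{Seed};
  while (!Worklist.empty()) {
    Reg Cur = Worklist.back();
    Worklist.pop_back();
    if (!C.insertEdge(Cur)) // already enclosed, nothing new to expand
      continue;
    for (Reg Next : NeighborsOf(Cur))
      Worklist.push_back(Next);
  }
}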
if (!C.empty() && C.isLegal(MaskDomain))
@@ -729,8 +740,8 @@
for (Closure &C : Closures)
- if (C.isReassignmentProfitable(MaskDomain)) {
- C.Reassign(MaskDomain);
+ if (isReassignmentProfitable(C, MaskDomain)) {
+ reassign(C, MaskDomain);
++NumClosuresConverted;
Changed = true;
}
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index a6c7c5f22a3a..660c1eff3c4b 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -106,14 +106,15 @@ namespace {
if (Base_Reg.getNode())
Base_Reg.getNode()->dump();
else
- dbgs() << "nul";
- dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'
- << " Scale" << Scale << '\n'
+ dbgs() << "nul\n";
+ if (BaseType == FrameIndexBase)
+ dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
+ dbgs() << " Scale " << Scale << '\n'
<< "IndexReg ";
if (IndexReg.getNode())
IndexReg.getNode()->dump();
else
- dbgs() << "nul";
+ dbgs() << "nul\n";
dbgs() << " Disp " << Disp << '\n'
<< "GV ";
if (GV)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index a72f4daa5e11..5ac5d0348f8a 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -461,7 +461,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SRL_PARTS, VT, Custom);
}
- if (Subtarget.hasSSE1())
+ if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
@@ -1622,16 +1622,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLibcallName(RTLIB::MUL_I128, nullptr);
}
- // Combine sin / cos into one node or libcall if possible.
- if (Subtarget.hasSinCos()) {
- setLibcallName(RTLIB::SINCOS_F32, "sincosf");
- setLibcallName(RTLIB::SINCOS_F64, "sincos");
- if (Subtarget.isTargetDarwin()) {
- // For MacOSX, we don't want the normal expansion of a libcall to sincos.
- // We want to issue a libcall to __sincos_stret to avoid memory traffic.
- setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
- setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
- }
+ // Combine sin / cos into __sincos_stret if it is available.
+ if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
+ getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
+ setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
}
if (Subtarget.isTargetWin64()) {
@@ -7480,9 +7475,9 @@ static bool isAddSub(const BuildVectorSDNode *BV,
}
/// Returns true if it is possible to fold MUL and an idiom that has already been
-/// recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
-/// If (and only if) true is returned, the operands of FMADDSUB are written to
-/// parameters \p Opnd0, \p Opnd1, \p Opnd2.
+/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
+/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
+/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
///
/// Prior to calling this function it should be known that there is some
/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
@@ -7505,12 +7500,12 @@ static bool isAddSub(const BuildVectorSDNode *BV,
/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
/// FMADDSUB is.
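For reference, the element-wise semantics behind the rename just below, written as a scalar sketch (a reading of the combine, not code from the patch): ADDSUB subtracts in even lanes and adds in odd lanes, so when its first operand is a single-use FMUL the whole idiom collapses into FMADDSUB; SUBADD swaps the parities, giving FMSUBADD.

#include <cstddef>

// Reference behavior of ADDSUB(a, b): subtract in even lanes, add in odd.
void addsub(const float *A, const float *B, float *R, std::size_t N) {
  for (std::size_t I = 0; I < N; ++I)
    R[I] = (I % 2 == 0) ? A[I] - B[I] : A[I] + B[I];
}

// Folding a multiply into the first operand yields FMADDSUB(x, y, z),
// i.e. ADDSUB(x * y, z) computed in one fused step.
void fmaddsub(const float *X, const float *Y, const float *Z, float *R,
              std::size_t N) {
  for (std::size_t I = 0; I < N; ++I)
    R[I] = (I % 2 == 0) ? X[I] * Y[I] - Z[I] : X[I] * Y[I] + Z[I];
}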
-static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG, - SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, - unsigned ExpectedUses) { +static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, + SelectionDAG &DAG, + SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, + unsigned ExpectedUses) { if (Opnd0.getOpcode() != ISD::FMUL || - !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || - !Subtarget.hasAnyFMA()) + !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA()) return false; // FIXME: These checks must match the similar ones in @@ -7547,7 +7542,7 @@ static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, SDValue Opnd2; // TODO: According to coverage reports, the FMADDSUB transform is not // triggered by any tests. - if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) + if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2); // Do not generate X86ISD::ADDSUB node for 512-bit types even though @@ -11958,6 +11953,19 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask, return 0; } +static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { + MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); + MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); + + SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true); + if (V2.isUndef()) + return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); + + return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); +} + /// \brief Generic lowering of v16i8 shuffles. /// /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to @@ -12148,6 +12156,10 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( DL, MVT::v16i8, V1, V2, Mask, DAG)) return Unpack; + + // If we have VBMI we can use one VPERM instead of multiple PSHUFBs. + if (Subtarget.hasVBMI() && Subtarget.hasVLX()) + return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG); } return PSHUFB; @@ -13048,19 +13060,6 @@ static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT, DAG.getConstant(Immediate, DL, MVT::i8)); } -static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT, - ArrayRef<int> Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG) { - MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); - MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); - - SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true); - if (V2.isUndef()) - return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); - - return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); -} - /// \brief Handle lowering of 4-lane 64-bit floating point shuffles. /// /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 @@ -13615,6 +13614,10 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; + // AVX512VBMIVL can lower to VPERMB. + if (Subtarget.hasVBMI() && Subtarget.hasVLX()) + return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG); + // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. 
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( @@ -14077,6 +14080,10 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, Zeroable, Subtarget, DAG)) return Blend; + if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( + DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG)) + return PSHUFB; + return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); } @@ -14212,7 +14219,9 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, ExtVT = MVT::v4i32; break; case MVT::v8i1: - ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL + // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit + // shuffle. + ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64; break; case MVT::v16i1: ExtVT = MVT::v16i32; @@ -14569,11 +14578,10 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, unsigned NumElts = VecVT.getVectorNumElements(); // Extending v8i1/v16i1 to 512-bit get better performance on KNL // than extending to 128/256bit. - unsigned VecSize = (NumElts <= 4 ? 128 : 512); - MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize / NumElts), NumElts); - SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec); - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, - ExtVT.getVectorElementType(), Ext, Idx); + MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8; + MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts); + SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec); + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx); return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); } @@ -14768,12 +14776,11 @@ static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, // Non constant index. Extend source and destination, // insert element and then truncate the result. unsigned NumElts = VecVT.getVectorNumElements(); - unsigned VecSize = (NumElts <= 4 ? 128 : 512); - MVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts); - MVT ExtEltVT = ExtVecVT.getVectorElementType(); + MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8; + MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts); SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, - DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec), - DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx); + DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec), + DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx); return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp); } @@ -16287,21 +16294,6 @@ static SDValue LowerZERO_EXTEND_Mask(SDValue Op, return SelectedVal; } -static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - SDValue In = Op->getOperand(0); - MVT InVT = In.getSimpleValueType(); - - if (InVT.getVectorElementType() == MVT::i1) - return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG); - - if (Subtarget.hasFp256()) - if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) - return Res; - - return SDValue(); -} - static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = Op.getOperand(0); @@ -16440,7 +16432,8 @@ static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, assert((InVT.is256BitVector() || InVT.is128BitVector()) && "Unexpected vector type."); unsigned NumElts = InVT.getVectorNumElements(); - MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); + MVT EltVT = Subtarget.hasVLX() ? 
MVT::i32 : MVT::getIntegerVT(512/NumElts); + MVT ExtVT = MVT::getVectorVT(EltVT, NumElts); In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); InVT = ExtVT; ShiftInx = InVT.getScalarSizeInBits() - 1; @@ -18446,6 +18439,21 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, return V; } +static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + SDValue In = Op->getOperand(0); + MVT InVT = In.getSimpleValueType(); + + if (InVT.getVectorElementType() == MVT::i1) + return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG); + + if (Subtarget.hasFp256()) + if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) + return Res; + + return SDValue(); +} + // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG. // For sign extend this needs to handle all vector sizes and SSE4.1 and // non-SSE4.1 targets. For zero extend this should only handle inputs of @@ -21128,7 +21136,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, // ADC/ADCX/SBB case ADX: { SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32); - SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::i32); + SDVTList VTs = DAG.getVTList(Op.getOperand(3).getValueType(), MVT::i32); SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2), DAG.getConstant(-1, dl, MVT::i8)); SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3), @@ -22231,6 +22239,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask)); } + assert(VT == MVT::v16i8 && "Unexpected VT"); + SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A); SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B); SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB); @@ -22989,12 +22999,14 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, (Subtarget.hasAVX512() && VT == MVT::v16i16) || (Subtarget.hasAVX512() && VT == MVT::v16i8) || (Subtarget.hasBWI() && VT == MVT::v32i8)) { - MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32); + assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) && + "Unexpected vector type"); + MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32; MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements()); unsigned ExtOpc = Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; R = DAG.getNode(ExtOpc, dl, ExtVT, R); - Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt); + Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt); return DAG.getNode(ISD::TRUNCATE, dl, VT, DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt)); } @@ -24101,8 +24113,9 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, // Only optimize x86_64 for now. i386 is a bit messy. For f32, // the small struct {f32, f32} is returned in (eax, edx). For f64, // the results are returned via SRet in memory. - const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret"; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + RTLIB::Libcall LC = isF64 ? 
RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; + const char *LibcallName = TLI.getLibcallName(LC); SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); @@ -24928,7 +24941,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::BITCAST: { assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT DstVT = N->getValueType(0); - EVT SrcVT = N->getOperand(0)->getValueType(0); + EVT SrcVT = N->getOperand(0).getValueType(); if (SrcVT != MVT::f64 || (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8)) @@ -28407,8 +28420,6 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // TODO - attempt to narrow Mask back to writemask size. bool IsEVEXShuffle = RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128); - if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits)) - return SDValue(); // TODO - handle 128/256-bit lane shuffles of 512-bit vectors. @@ -28491,11 +28502,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, - ShuffleVT)) { + ShuffleVT) && + (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! - if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) - return SDValue(); // AVX512 Writemask clash. Res = DAG.getBitcast(ShuffleSrcVT, V1); DCI.AddToWorklist(Res.getNode()); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res); @@ -28505,11 +28515,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, Subtarget, Shuffle, - ShuffleVT, PermuteImm)) { + ShuffleVT, PermuteImm) && + (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! - if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) - return SDValue(); // AVX512 Writemask clash. Res = DAG.getBitcast(ShuffleVT, V1); DCI.AddToWorklist(Res.getNode()); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, @@ -28520,12 +28529,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, } if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, - V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, - ShuffleVT, UnaryShuffle)) { + V1, V2, DL, DAG, Subtarget, Shuffle, + ShuffleSrcVT, ShuffleVT, UnaryShuffle) && + (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! - if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) - return SDValue(); // AVX512 Writemask clash. V1 = DAG.getBitcast(ShuffleSrcVT, V1); DCI.AddToWorklist(V1.getNode()); V2 = DAG.getBitcast(ShuffleSrcVT, V2); @@ -28538,11 +28546,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT, - PermuteImm)) { + PermuteImm) && + (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! 
- if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) - return SDValue(); // AVX512 Writemask clash. V1 = DAG.getBitcast(ShuffleVT, V1); DCI.AddToWorklist(V1.getNode()); V2 = DAG.getBitcast(ShuffleVT, V2); @@ -28594,8 +28601,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return SDValue(); // Depth threshold above which we can efficiently use variable mask shuffles. - // TODO This should probably be target specific. - bool AllowVariableMask = (Depth >= 3) || HasVariableMask; + int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3; + bool AllowVariableMask = (Depth >= VariableShuffleDepth) || HasVariableMask; bool MaskContainsZeros = any_of(Mask, [](int M) { return M == SM_SentinelZero; }); @@ -29698,17 +29705,18 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, return SDValue(); } -/// Returns true iff the shuffle node \p N can be replaced with ADDSUB -/// operation. If true is returned then the operands of ADDSUB operation +/// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD) +/// operation. If true is returned then the operands of ADDSUB(SUBADD) operation /// are written to the parameters \p Opnd0 and \p Opnd1. /// -/// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes +/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes /// so it is easier to generically match. We also insert dummy vector shuffle /// nodes for the operands which explicitly discard the lanes which are unused /// by this operation to try to flow through the rest of the combiner /// the fact that they're unused. -static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget, - SDValue &Opnd0, SDValue &Opnd1) { +static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget, + SDValue &Opnd0, SDValue &Opnd1, + bool matchSubAdd = false) { EVT VT = N->getValueType(0); if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && @@ -29728,12 +29736,15 @@ static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget, SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); - // We require the first shuffle operand to be the FSUB node, and the second to - // be the FADD node. - if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) { + unsigned ExpectedOpcode = matchSubAdd ? ISD::FADD : ISD::FSUB; + unsigned NextExpectedOpcode = matchSubAdd ? ISD::FSUB : ISD::FADD; + + // We require the first shuffle operand to be the ExpectedOpcode node, + // and the second to be the NextExpectedOpcode node. + if (V1.getOpcode() == NextExpectedOpcode && V2.getOpcode() == ExpectedOpcode) { ShuffleVectorSDNode::commuteMask(Mask); std::swap(V1, V2); - } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD) + } else if (V1.getOpcode() != ExpectedOpcode || V2.getOpcode() != NextExpectedOpcode) return false; // If there are other uses of these operations we can't fold them. @@ -29767,7 +29778,7 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Opnd0, Opnd1; - if (!isAddSub(N, Subtarget, Opnd0, Opnd1)) + if (!isAddSubOrSubAdd(N, Subtarget, Opnd0, Opnd1)) return SDValue(); EVT VT = N->getValueType(0); @@ -29775,7 +29786,7 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, // Try to generate X86ISD::FMADDSUB node here. 
SDValue Opnd2;
- if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2))
+ if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2))
return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
// Do not generate X86ISD::ADDSUB node for 512-bit types even though
@@ -29787,6 +29798,26 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
+/// \brief Try to combine a shuffle into a target-specific
+/// mul-sub-add node.
+static SDValue combineShuffleToFMSubAdd(SDNode *N,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue Opnd0, Opnd1;
+ if (!isAddSubOrSubAdd(N, Subtarget, Opnd0, Opnd1, true))
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // Try to generate X86ISD::FMSUBADD node here.
+ SDValue Opnd2;
+ if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2))
+ return DAG.getNode(X86ISD::FMSUBADD, DL, VT, Opnd0, Opnd1, Opnd2);
+
+ return SDValue();
+}
+
// We are looking for a shuffle where both sources are concatenated with undef
// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
// if we can express this as a single-source shuffle, that's preferable.
@@ -29873,11 +29904,14 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// If we have legalized the vector types, look for blends of FADD and FSUB
- // nodes that we can fuse into an ADDSUB node.
+ // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
if (TLI.isTypeLegal(VT)) {
if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
return AddSub;
+ if (SDValue FMSubAdd = combineShuffleToFMSubAdd(N, Subtarget, DAG))
+ return FMSubAdd;
+
if (SDValue HAddSub = foldShuffleOfHorizOp(N))
return HAddSub;
}
@@ -30181,7 +30215,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
// For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
// sign-extend to a 256-bit operation to avoid truncation.
if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
- N0->getOperand(0)->getValueType(0).is256BitVector()) {
+ N0->getOperand(0).getValueType().is256BitVector()) {
SExtVT = MVT::v4i64;
FPCastVT = MVT::v4f64;
}
@@ -30194,8 +30228,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
// 256-bit because the shuffle is cheaper than sign extending the result of
// the compare.
if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
- (N0->getOperand(0)->getValueType(0).is256BitVector() ||
- N0->getOperand(0)->getValueType(0).is512BitVector())) {
+ (N0->getOperand(0).getValueType().is256BitVector() ||
+ N0->getOperand(0).getValueType().is512BitVector())) {
SExtVT = MVT::v8i32;
FPCastVT = MVT::v8f32;
}
@@ -30484,7 +30518,8 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
}
-// Attempt to replace a min/max v8i16 horizontal reduction with PHMINPOSUW.
+// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
+// PHMINPOSUW.
static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Bail without SSE41.
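A scalar sketch of why the XOR masks in the hunk below work (a summary, not part of the patch): PHMINPOSUW only computes an unsigned minimum, so the other three reductions are mapped onto it by an order-reversing flip that is its own inverse: all bits for UMAX, the sign bit for SMIN, and everything but the sign bit for SMAX.

#include <cstdint>

// The one reduction the hardware provides: unsigned minimum of 8 lanes.
static uint16_t umin8(const uint16_t V[8]) {
  uint16_t M = V[0];
  for (int I = 1; I < 8; ++I)
    if (V[I] < M)
      M = V[I];
  return M;
}

// SMAX via unsigned min: XOR with 0x7fff reverses signed order into
// unsigned order, and XOR-ing the result again undoes the flip.
int16_t smax8_via_umin(const int16_t V[8]) {
  uint16_t Flipped[8];
  for (int I = 0; I < 8; ++I)
    Flipped[I] = static_cast<uint16_t>(V[I]) ^ 0x7fff;
  return static_cast<int16_t>(umin8(Flipped) ^ 0x7fff);
}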
@@ -30492,7 +30527,7 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
return SDValue();
EVT ExtractVT = Extract->getValueType(0);
- if (ExtractVT != MVT::i16)
+ if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
return SDValue();
// Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
@@ -30504,7 +30539,7 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
EVT SrcVT = Src.getValueType();
EVT SrcSVT = SrcVT.getScalarType();
- if (SrcSVT != MVT::i16 || (SrcVT.getSizeInBits() % 128) != 0)
+ if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
return SDValue();
SDLoc DL(Extract);
@@ -30520,22 +30555,39 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
}
- assert(SrcVT == MVT::v8i16 && "Unexpected value type");
+ assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
+ (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
+ "Unexpected value type");
// PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
// to flip the value accordingly.
SDValue Mask;
+ unsigned MaskEltsBits = ExtractVT.getSizeInBits();
if (BinOp == ISD::SMAX)
- Mask = DAG.getConstant(APInt::getSignedMaxValue(16), DL, SrcVT);
+ Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
else if (BinOp == ISD::SMIN)
- Mask = DAG.getConstant(APInt::getSignedMinValue(16), DL, SrcVT);
+ Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
else if (BinOp == ISD::UMAX)
- Mask = DAG.getConstant(APInt::getAllOnesValue(16), DL, SrcVT);
+ Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
if (Mask)
MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
- MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, SrcVT, MinPos);
+ // For v16i8 cases we need to perform UMIN on pairs of byte elements,
+ // shuffling each upper element down and inserting zeros. This means that the
+ // v16i8 UMIN will leave the upper element as zero, performing zero-extension
+ // ready for the PHMINPOS.
+ if (ExtractVT == MVT::i8) {
+ SDValue Upper = DAG.getVectorShuffle(
+ SrcVT, DL, MinPos, getZeroVector(MVT::v16i8, Subtarget, DAG, DL),
+ {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
+ MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
+ }
+
+ // Perform the PHMINPOS on a v8i16 vector.
+ MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
+ MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
+ MinPos = DAG.getBitcast(SrcVT, MinPos);
if (Mask)
MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
@@ -30851,7 +30903,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
return Cmp;
- // Attempt to replace min/max v8i16 reductions with PHMINPOSUW.
+ // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
return MinMax;
@@ -32555,7 +32607,7 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
// 1. MOVs can write to a register that differs from source
// 2.
MOVs accept memory operands - if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant || + if (VT.isVector() || N1.getOpcode() != ISD::Constant || N0.getOpcode() != ISD::SHL || !N0.hasOneUse() || N0.getOperand(1).getOpcode() != ISD::Constant) return SDValue(); @@ -32569,11 +32621,11 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) { if (SarConst.isNegative()) return SDValue(); - for (MVT SVT : MVT::integer_valuetypes()) { + for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) { unsigned ShiftSize = SVT.getSizeInBits(); // skipping types without corresponding sext/zext and // ShlConst that is not one of [56,48,32,24,16] - if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize) + if (ShiftSize >= Size || ShlConst != Size - ShiftSize) continue; SDLoc DL(N); SDValue NN = @@ -32626,37 +32678,6 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -/// \brief Returns a vector of 0s if the node in input is a vector logical -/// shift by a constant amount which is known to be bigger than or equal -/// to the vector element size in bits. -static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - EVT VT = N->getValueType(0); - - if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && - (!Subtarget.hasInt256() || - (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) - return SDValue(); - - SDValue Amt = N->getOperand(1); - SDLoc DL(N); - if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt)) - if (auto *AmtSplat = AmtBV->getConstantSplatNode()) { - const APInt &ShiftAmt = AmtSplat->getAPIntValue(); - unsigned MaxAmount = - VT.getSimpleVT().getScalarSizeInBits(); - - // SSE2/AVX2 logical shifts always return a vector of 0s - // if the shift amount is bigger than or equal to - // the element size. The constant shift amount will be - // encoded as a 8-bit immediate. - if (ShiftAmt.trunc(8).uge(MaxAmount)) - return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL); - } - - return SDValue(); -} - static SDValue combineShift(SDNode* N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -32672,11 +32693,6 @@ static SDValue combineShift(SDNode* N, SelectionDAG &DAG, if (SDValue V = combineShiftRightLogical(N, DAG)) return V; - // Try to fold this logical shift into a zero vector. - if (N->getOpcode() != ISD::SRA) - if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget)) - return V; - return SDValue(); } @@ -32996,21 +33012,20 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { // register. In most cases we actually compare or select YMM-sized registers // and mixing the two types creates horrible code. This method optimizes // some of the transition sequences. +// Even with AVX-512 this is still useful for removing casts around logical +// operations on vXi1 mask types. 
static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); - if (!VT.is256BitVector()) - return SDValue(); + assert(VT.isVector() && "Expected vector type"); assert((N->getOpcode() == ISD::ANY_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node"); SDValue Narrow = N->getOperand(0); - EVT NarrowVT = Narrow->getValueType(0); - if (!NarrowVT.is128BitVector()) - return SDValue(); + EVT NarrowVT = Narrow.getValueType(); if (Narrow->getOpcode() != ISD::XOR && Narrow->getOpcode() != ISD::AND && @@ -33026,12 +33041,12 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, return SDValue(); // The type of the truncated inputs. - EVT WideVT = N0->getOperand(0)->getValueType(0); - if (WideVT != VT) + if (N0->getOperand(0).getValueType() != VT) return SDValue(); // The right side has to be a 'trunc' or a constant vector. - bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE; + bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE && + N1.getOperand(0).getValueType() == VT; ConstantSDNode *RHSConstSplat = nullptr; if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1)) RHSConstSplat = RHSBV->getConstantSplatNode(); @@ -33040,37 +33055,31 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT)) + if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT)) return SDValue(); // Set N0 and N1 to hold the inputs to the new wide operation. N0 = N0->getOperand(0); if (RHSConstSplat) { - N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(), + N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT.getVectorElementType(), SDValue(RHSConstSplat, 0)); - N1 = DAG.getSplatBuildVector(WideVT, DL, N1); + N1 = DAG.getSplatBuildVector(VT, DL, N1); } else if (RHSTrunc) { N1 = N1->getOperand(0); } // Generate the wide operation. - SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1); + SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1); unsigned Opcode = N->getOpcode(); switch (Opcode) { + default: llvm_unreachable("Unexpected opcode"); case ISD::ANY_EXTEND: return Op; - case ISD::ZERO_EXTEND: { - unsigned InBits = NarrowVT.getScalarSizeInBits(); - APInt Mask = APInt::getAllOnesValue(InBits); - Mask = Mask.zext(VT.getScalarSizeInBits()); - return DAG.getNode(ISD::AND, DL, VT, - Op, DAG.getConstant(Mask, DL, VT)); - } + case ISD::ZERO_EXTEND: + return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType()); case ISD::SIGN_EXTEND: return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op, DAG.getValueType(NarrowVT)); - default: - llvm_unreachable("Unexpected opcode"); } } @@ -33882,16 +33891,6 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, if (!Subtarget.hasSSE2()) return SDValue(); - if (Subtarget.hasBWI()) { - if (VT.getSizeInBits() > 512) - return SDValue(); - } else if (Subtarget.hasAVX2()) { - if (VT.getSizeInBits() > 256) - return SDValue(); - } else { - if (VT.getSizeInBits() > 128) - return SDValue(); - } // Detect the following pattern: // @@ -33903,7 +33902,6 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, // %6 = trunc <N x i32> %5 to <N x i8> // // In AVX512, the last instruction can also be a trunc store. 
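The LowerToAVG helper introduced just below handles vectors wider than the widest legal AVG by processing them in legal-sized slices and concatenating the results. The same splitting idea on plain arrays, as a hedged sketch (avg_rounded mirrors the rounding-up average that X86ISD::AVG, i.e. PAVGB/PAVGW, computes):

#include <cstddef>
#include <cstdint>

// Rounding-up average without intermediate overflow: (a + b + 1) >> 1.
static inline uint8_t avg_rounded(uint8_t A, uint8_t B) {
  return static_cast<uint8_t>((unsigned(A) + unsigned(B) + 1) >> 1);
}

// Process a wide "vector" in slices no larger than the legal width.
void avg_in_slices(const uint8_t *A, const uint8_t *B, uint8_t *R,
                   std::size_t Total, std::size_t LegalWidth) {
  for (std::size_t Base = 0; Base < Total; Base += LegalWidth) // one subvector
    for (std::size_t I = Base; I < Base + LegalWidth && I < Total; ++I)
      R[I] = avg_rounded(A[I], B[I]);
}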
- if (In.getOpcode() != ISD::SRL) return SDValue(); @@ -33924,6 +33922,35 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, return true; }; + // Split vectors to legal target size and apply AVG. + auto LowerToAVG = [&](SDValue Op0, SDValue Op1) { + unsigned NumSubs = 1; + if (Subtarget.hasBWI()) { + if (VT.getSizeInBits() > 512) + NumSubs = VT.getSizeInBits() / 512; + } else if (Subtarget.hasAVX2()) { + if (VT.getSizeInBits() > 256) + NumSubs = VT.getSizeInBits() / 256; + } else { + if (VT.getSizeInBits() > 128) + NumSubs = VT.getSizeInBits() / 128; + } + + if (NumSubs == 1) + return DAG.getNode(X86ISD::AVG, DL, VT, Op0, Op1); + + SmallVector<SDValue, 4> Subs; + EVT SubVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), + VT.getVectorNumElements() / NumSubs); + for (unsigned i = 0; i != NumSubs; ++i) { + unsigned Idx = i * SubVT.getVectorNumElements(); + SDValue LHS = extractSubVector(Op0, Idx, DAG, DL, SubVT.getSizeInBits()); + SDValue RHS = extractSubVector(Op1, Idx, DAG, DL, SubVT.getSizeInBits()); + Subs.push_back(DAG.getNode(X86ISD::AVG, DL, SubVT, LHS, RHS)); + } + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs); + }; + // Check if each element of the vector is left-shifted by one. auto LHS = In.getOperand(0); auto RHS = In.getOperand(1); @@ -33947,8 +33974,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, SDValue VecOnes = DAG.getConstant(1, DL, InVT); Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes); Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]); - return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), - Operands[1]); + return LowerToAVG(Operands[0].getOperand(0), Operands[1]); } if (Operands[0].getOpcode() == ISD::ADD) @@ -33972,8 +33998,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, return SDValue(); // The pattern is detected, emit X86ISD::AVG instruction. - return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), - Operands[1].getOperand(0)); + return LowerToAVG(Operands[0].getOperand(0), Operands[1].getOperand(0)); } return SDValue(); @@ -35872,14 +35897,8 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, if (SDValue NewCMov = combineToExtendCMOV(N, DAG)) return NewCMov; - if (!DCI.isBeforeLegalizeOps()) { - if (InVT == MVT::i1) { - SDValue Zero = DAG.getConstant(0, DL, VT); - SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); - return DAG.getSelect(DL, VT, N0, AllOnes, Zero); - } + if (!DCI.isBeforeLegalizeOps()) return SDValue(); - } if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR && isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) { @@ -35897,7 +35916,7 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) return V; - if (Subtarget.hasAVX() && VT.is256BitVector()) + if (VT.isVector()) if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; @@ -36089,7 +36108,7 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) return V; - if (VT.is256BitVector()) + if (VT.isVector()) if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; @@ -36244,39 +36263,54 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(N); - // Pre-shrink oversized index elements to avoid triggering scalarization. 
- if (DCI.isBeforeLegalize()) {
+ if (DCI.isBeforeLegalizeOps()) {
SDValue Index = N->getOperand(4);
- if (Index.getScalarValueSizeInBits() > 64) {
- EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), MVT::i64,
+ // Remove any sign extends from 32 or smaller to larger than 32.
+ // Only do this before LegalizeOps in case we need the sign extend for
+ // legalization.
+ if (Index.getOpcode() == ISD::SIGN_EXTEND) {
+ if (Index.getScalarValueSizeInBits() > 32 &&
+ Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
+ SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+ NewOps[4] = Index.getOperand(0);
+ DAG.UpdateNodeOperands(N, NewOps);
+ // The original sign extend has fewer users, add back to worklist in case
+ // it needs to be removed.
+ DCI.AddToWorklist(Index.getNode());
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+ }
+
+ // Make sure the index is either i32 or i64.
+ unsigned ScalarSize = Index.getScalarValueSizeInBits();
+ if (ScalarSize != 32 && ScalarSize != 64) {
+ MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32;
+ EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
Index.getValueType().getVectorNumElements());
- SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IndexVT, Index);
+ Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
- NewOps[4] = Trunc;
+ NewOps[4] = Index;
DAG.UpdateNodeOperands(N, NewOps);
DCI.AddToWorklist(N);
return SDValue(N, 0);
}
- }
- // Try to remove sign extends from i32 to i64 on the index.
- // Only do this before legalize in case we are relying on it for
- // legalization.
- // TODO: We should maybe remove any sign extend once we learn how to sign
- // extend narrow index during lowering.
- if (DCI.isBeforeLegalizeOps()) {
- SDValue Index = N->getOperand(4);
- if (Index.getScalarValueSizeInBits() == 64 &&
- Index.getOpcode() == ISD::SIGN_EXTEND &&
+ // Try to remove zero extends from 32->64 if we know the sign bit of
+ // the input is zero.
+ if (Index.getOpcode() == ISD::ZERO_EXTEND &&
+ Index.getScalarValueSizeInBits() == 64 &&
Index.getOperand(0).getScalarValueSizeInBits() == 32) {
- SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
- NewOps[4] = Index.getOperand(0);
- DAG.UpdateNodeOperands(N, NewOps);
- // The original sign extend has less users, add back to worklist in case
- // it needs to be removed.
- DCI.AddToWorklist(Index.getNode());
- DCI.AddToWorklist(N);
- return SDValue(N, 0);
+ if (DAG.SignBitIsZero(Index.getOperand(0))) {
+ SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+ NewOps[4] = Index.getOperand(0);
+ DAG.UpdateNodeOperands(N, NewOps);
+ // The original zero extend has fewer users, add back to worklist in case
+ // it needs to be removed.
+ DCI.AddToWorklist(Index.getNode());
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
}
}
@@ -36288,6 +36322,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
NewOps[2] = Mask.getOperand(0);
DAG.UpdateNodeOperands(N, NewOps);
+ return SDValue(N, 0);
}
// With AVX2 we only demand the upper bit of the mask.
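A one-function sketch of the invariant behind the SignBitIsZero guard added above (a summary, not part of the patch): a 32-to-64-bit zero extend and sign extend produce the same index exactly when the source's sign bit is clear, so the narrower operand can safely replace the extend.

#include <cstdint>

bool extends_agree(int32_t Idx) {
  uint64_t ZExt = static_cast<uint64_t>(static_cast<uint32_t>(Idx));
  int64_t SExt = static_cast<int64_t>(Idx);
  return static_cast<uint64_t>(SExt) == ZExt; // true iff Idx >= 0
}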
@@ -36356,7 +36391,7 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
   EVT VT = N->getValueType(0);
   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
-      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
+      VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
     return SDValue();
 
   // Now check that the other operand of the AND is a constant. We could
diff --git a/lib/Target/X86/X86Instr3DNow.td b/lib/Target/X86/X86Instr3DNow.td
index 2acd8d17beb2..0d30b7d47f3e 100644
--- a/lib/Target/X86/X86Instr3DNow.td
+++ b/lib/Target/X86/X86Instr3DNow.td
@@ -116,14 +116,30 @@ defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw", I3DNOW_MISC_FUNC_ITINS, 1>;
 def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms",
                    [(int_x86_mmx_femms)], IIC_MMX_EMMS>;
 
+// When PREFETCHWT1 is supported, we want to use it for everything but T0.
+def PrefetchWLevel : PatFrag<(ops), (i32 imm), [{
+  return N->getSExtValue() == 3 || !Subtarget->hasPREFETCHWT1();
+}]>;
+
+// Use PREFETCHWT1 for NTA, T2, T1.
+def PrefetchWT1Level : ImmLeaf<i32, [{
+  return Imm < 3;
+}]>;
+
 let SchedRW = [WriteLoad] in {
+let Predicates = [Has3DNow, NoSSEPrefetch] in
 def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i8mem:$addr),
                       "prefetch\t$addr",
-                      [(prefetch addr:$addr, (i32 0), imm, (i32 1))],
+                      [(prefetch addr:$addr, imm, imm, (i32 1))],
                       IIC_SSE_PREFETCH>;
+
 def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr",
-                  [(prefetch addr:$addr, (i32 1), (i32 3), (i32 1))],
+                  [(prefetch addr:$addr, (i32 1), (i32 PrefetchWLevel), (i32 1))],
                   IIC_SSE_PREFETCH>, TB, Requires<[HasPrefetchW]>;
+
+def PREFETCHWT1 : I<0x0D, MRM2m, (outs), (ins i8mem:$addr), "prefetchwt1\t$addr",
+                    [(prefetch addr:$addr, (i32 1), (i32 PrefetchWT1Level), (i32 1))],
+                    IIC_SSE_PREFETCH>, TB, Requires<[HasPREFETCHWT1]>;
 }
 
 // "3DNowA" instructions
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index 2a6ed02fadab..0b266e5591b4 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -349,8 +349,9 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
   let TSFlags{54} = hasEVEX_RC;
 }
 
-class PseudoI<dag oops, dag iops, list<dag> pattern>
-  : X86Inst<0, Pseudo, NoImm, oops, iops, "", NoItinerary> {
+class PseudoI<dag oops, dag iops, list<dag> pattern,
+              InstrItinClass itin = NoItinerary>
+  : X86Inst<0, Pseudo, NoImm, oops, iops, "", itin> {
   let Pattern = pattern;
 }
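The two predicates added to X86Instr3DNow.td choose between PREFETCHW and PREFETCHWT1 from the prefetch locality immediate: T0 (locality 3) keeps PREFETCHW, while NTA/T2/T1 (locality 0-2) route to PREFETCHWT1 when the subtarget has it. A hedged C++ sketch of that selection logic; the function and parameter names are invented for illustration, not part of the backend:

#include <cstdio>

const char *pickWritePrefetch(int Locality, bool HasPREFETCHWT1) {
  // Mirrors PrefetchWLevel: N->getSExtValue() == 3 ||
  // !Subtarget->hasPREFETCHWT1()
  if (Locality == 3 || !HasPREFETCHWT1)
    return "prefetchw";
  return "prefetchwt1"; // Mirrors PrefetchWT1Level: Imm < 3
}

int main() {
  std::printf("%s\n", pickWritePrefetch(3, true));  // prefetchw
  std::printf("%s\n", pickWritePrefetch(1, true));  // prefetchwt1
  std::printf("%s\n", pickWritePrefetch(1, false)); // prefetchw
  return 0;
}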
@@ -423,9 +424,8 @@ class FPI<bits<8> o, Format F, dag outs, dag ins, string asm,
 
 // FpI_ - Floating Point Pseudo Instruction template. Not Predicated.
 class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern,
            InstrItinClass itin = NoItinerary>
-  : X86Inst<0, Pseudo, NoImm, outs, ins, "", itin> {
+  : PseudoI<outs, ins, pattern, itin> {
   let FPForm = fp;
-  let Pattern = pattern;
 }
 
 // Templates for instructions that use a 16- or 32-bit segmented address as
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 42e89cb4831d..fdf3e73e4fcd 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -874,7 +874,10 @@ def HasADX : Predicate<"Subtarget->hasADX()">;
 def HasSHA : Predicate<"Subtarget->hasSHA()">;
 def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">;
 def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">;
+def HasSSEPrefetch : Predicate<"Subtarget->hasSSEPrefetch()">;
+def NoSSEPrefetch : Predicate<"!Subtarget->hasSSEPrefetch()">;
 def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">;
+def HasPREFETCHWT1 : Predicate<"Subtarget->hasPREFETCHWT1()">;
 def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">;
 def HasMWAITX : Predicate<"Subtarget->hasMWAITX()">;
 def HasCLZERO : Predicate<"Subtarget->hasCLZERO()">;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index a86a0bfc168d..b48fa1841979 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -3487,7 +3487,7 @@ let Predicates = [UseSSE2] in {
 
 //===----------------------------------------------------------------------===//
 // Prefetch intrinsic.
-let Predicates = [HasSSE1], SchedRW = [WriteLoad] in {
+let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
 def PREFETCHT0   : I<0x18, MRM1m, (outs), (ins i8mem:$src), "prefetcht0\t$src",
                      [(prefetch addr:$src, imm, (i32 3), (i32 1))],
                      IIC_SSE_PREFETCH>, TB;
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index 1e04997ad294..e131f1a1e4bd 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -89,8 +89,9 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
 
   // Check to see if there is a specialized entry-point for memory zeroing.
   ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);
 
-  if (const char *bzeroEntry = ValC &&
-      ValC->isNullValue() ? Subtarget.getBZeroEntry() : nullptr) {
+  if (const char *bzeroName = (ValC && ValC->isNullValue())
+          ? DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO)
+          : nullptr) {
     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
     EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
     Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
@@ -106,7 +107,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
     CLI.setDebugLoc(dl)
         .setChain(Chain)
         .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
-                      DAG.getExternalSymbol(bzeroEntry, IntPtr),
+                      DAG.getExternalSymbol(bzeroName, IntPtr),
                       std::move(Args))
         .setDiscardResult();
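The X86SelectionDAGInfo.cpp change above stops consulting a subtarget hook and instead asks the target lowering info for a registered RTLIB::BZERO libcall name, taking the bzero path only for a known zero-constant fill value. A minimal sketch of that decision, with invented stand-ins for the libcall lookup (the "__bzero" name assumes a Darwin-like target):

#include <cstdio>

// Stand-in for TLI.getLibcallName(RTLIB::BZERO): null means "no libcall
// registered on this target".
const char *getBZeroName(bool TargetHasBZero) {
  return TargetHasBZero ? "__bzero" : nullptr;
}

const char *lowerMemset(bool ValIsConstZero, bool TargetHasBZero) {
  // Mirrors the ternary in the patch: only a constant zero may use bzero.
  if (const char *BZeroName =
          ValIsConstZero ? getBZeroName(TargetHasBZero) : nullptr)
    return BZeroName; // emit a call to __bzero(dst, size)
  return "memset";    // otherwise fall back to the ordinary libcall
}

int main() {
  std::printf("%s\n", lowerMemset(true, true));   // __bzero
  std::printf("%s\n", lowerMemset(true, false));  // memset
  std::printf("%s\n", lowerMemset(false, true));  // memset
  return 0;
}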
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 8b08766b6171..ad023623142f 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -174,28 +174,6 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
   return X86II::MO_NO_FLAG;
 }
 
-/// This function returns the name of a function which has an interface like
-/// the non-standard bzero function, if such a function exists on the
-/// current subtarget and it is considered preferable over memset with zero
-/// passed as the second argument. Otherwise it returns null.
-const char *X86Subtarget::getBZeroEntry() const {
-  // Darwin 10 has a __bzero entry point for this purpose.
-  if (getTargetTriple().isMacOSX() &&
-      !getTargetTriple().isMacOSXVersionLT(10, 6))
-    return "__bzero";
-
-  return nullptr;
-}
-
-bool X86Subtarget::hasSinCos() const {
-  if (getTargetTriple().isMacOSX()) {
-    return !getTargetTriple().isMacOSXVersionLT(10, 9) && is64Bit();
-  } else if (getTargetTriple().isOSFuchsia()) {
-    return true;
-  }
-  return false;
-}
-
 /// Return true if the subtarget allows calls to immediate address.
 bool X86Subtarget::isLegalToCallImmediateAddr() const {
   // FIXME: I386 PE/COFF supports PC relative calls using IMAGE_REL_I386_REL32
@@ -324,6 +302,7 @@ void X86Subtarget::initializeEnvironment() {
   HasVNNI = false;
   HasBITALG = false;
   HasSHA = false;
+  HasPREFETCHWT1 = false;
   HasPRFCHW = false;
   HasRDSEED = false;
   HasLAHFSAHF = false;
@@ -342,6 +321,7 @@ void X86Subtarget::initializeEnvironment() {
   HasSSEUnalignedMem = false;
   HasCmpxchg16b = false;
   UseLeaForSP = false;
+  HasFastVariableShuffle = false;
   HasFastPartialYMMorZMMWrite = false;
   HasFastGather = false;
   HasFastScalarFSQRT = false;
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index be4d46c470de..c9435890fc1f 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -201,7 +201,7 @@ protected:
   bool HasCLZERO;
 
   /// Processor has Prefetch with intent to Write instruction
-  bool HasPFPREFETCHWT1;
+  bool HasPREFETCHWT1;
 
   /// True if SHLD instructions are slow.
   bool IsSHLDSlow;
@@ -228,6 +228,10 @@ protected:
   /// the stack pointer. This is an optimization for Intel Atom processors.
   bool UseLeaForSP;
 
+  /// True if it's preferable to combine to a single shuffle using a variable
+  /// mask over multiple fixed shuffles.
+  bool HasFastVariableShuffle;
+
   /// True if there is no performance penalty to writing only the lower parts
   /// of a YMM or ZMM register without clearing the upper part.
   bool HasFastPartialYMMorZMMWrite;
@@ -513,7 +517,14 @@ public:
   bool hasRTM() const { return HasRTM; }
   bool hasADX() const { return HasADX; }
   bool hasSHA() const { return HasSHA; }
-  bool hasPRFCHW() const { return HasPRFCHW; }
+  bool hasPRFCHW() const { return HasPRFCHW || HasPREFETCHWT1; }
+  bool hasPREFETCHWT1() const { return HasPREFETCHWT1; }
+  bool hasSSEPrefetch() const {
+    // We implicitly enable these when we have a write prefetch supporting a
+    // cache level, OR if we have prfchw but don't already have a read
+    // prefetch from 3dnow.
+    return hasSSE1() || (hasPRFCHW() && !has3DNow()) || hasPREFETCHWT1();
+  }
   bool hasRDSEED() const { return HasRDSEED; }
   bool hasLAHFSAHF() const { return HasLAHFSAHF; }
   bool hasMWAITX() const { return HasMWAITX; }
@@ -527,6 +538,9 @@ public:
   bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
   bool hasCmpxchg16b() const { return HasCmpxchg16b; }
   bool useLeaForSP() const { return UseLeaForSP; }
+  bool hasFastVariableShuffle() const {
+    return HasFastVariableShuffle;
+  }
   bool hasFastPartialYMMorZMMWrite() const {
     return HasFastPartialYMMorZMMWrite;
   }
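The new accessors encode feature implications: PREFETCHWT1 implies the PREFETCHW behavior, and SSE-style prefetch hints count as available with SSE1, with PRFCHW when 3DNow! does not already claim the read-prefetch opcode, or with PREFETCHWT1. A standalone sketch of those implications using plain bools (not the real Subtarget class), with a couple of spot checks:

#include <cassert>

struct Features {
  bool SSE1 = false, ThreeDNow = false, PRFCHW = false, PREFETCHWT1 = false;

  // Same logic as the two accessors added in the patch.
  bool hasPRFCHW() const { return PRFCHW || PREFETCHWT1; }
  bool hasSSEPrefetch() const {
    return SSE1 || (hasPRFCHW() && !ThreeDNow) || PREFETCHWT1;
  }
};

int main() {
  Features F;
  F.PREFETCHWT1 = true;
  assert(F.hasPRFCHW() && F.hasSSEPrefetch()); // WT1 implies both

  Features G;
  G.PRFCHW = true;
  G.ThreeDNow = true; // 3DNow! keeps its own prefetch instruction
  assert(G.hasPRFCHW() && !G.hasSSEPrefetch());
  return 0;
}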
@@ -682,17 +696,6 @@ public:
   /// Return true if the subtarget allows calls to immediate address.
   bool isLegalToCallImmediateAddr() const;
 
-  /// This function returns the name of a function which has an interface
-  /// like the non-standard bzero function, if such a function exists on
-  /// the current subtarget and it is considered prefereable over
-  /// memset with zero passed as the second argument. Otherwise it
-  /// returns null.
-  const char *getBZeroEntry() const;
-
-  /// This function returns true if the target has sincos() routine in its
-  /// compiler runtime or math libraries.
-  bool hasSinCos() const;
-
   /// Enable the MachineScheduler pass for all X86 subtargets.
   bool enableMachineScheduler() const override { return true; }
 
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index ea8c9862230e..e95e6ecae091 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -281,10 +281,9 @@ UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,
 // X86 TTI query.
 //===----------------------------------------------------------------------===//
 
-TargetIRAnalysis X86TargetMachine::getTargetIRAnalysis() {
-  return TargetIRAnalysis([this](const Function &F) {
-    return TargetTransformInfo(X86TTIImpl(this, F));
-  });
+TargetTransformInfo
+X86TargetMachine::getTargetTransformInfo(const Function &F) {
+  return TargetTransformInfo(X86TTIImpl(this, F));
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index 952bd1321ff9..5b21cd82b5b1 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -45,7 +45,7 @@ public:
   // attributes of each function.
   const X86Subtarget *getSubtargetImpl() const = delete;
 
-  TargetIRAnalysis getTargetIRAnalysis() override;
+  TargetTransformInfo getTargetTransformInfo(const Function &F) override;
 
   // Set up the pass pipeline.
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 3aa7187e0cd1..38925bfd51b0 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -108,8 +108,7 @@ extern "C" void LLVMInitializeXCoreTarget() {
   RegisterTargetMachine<XCoreTargetMachine> X(getTheXCoreTarget());
 }
 
-TargetIRAnalysis XCoreTargetMachine::getTargetIRAnalysis() {
-  return TargetIRAnalysis([this](const Function &F) {
-    return TargetTransformInfo(XCoreTTIImpl(this, F));
-  });
+TargetTransformInfo
+XCoreTargetMachine::getTargetTransformInfo(const Function &F) {
+  return TargetTransformInfo(XCoreTTIImpl(this, F));
 }
diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h
index 5baa3524d2a6..965b9b2c4d65 100644
--- a/lib/Target/XCore/XCoreTargetMachine.h
+++ b/lib/Target/XCore/XCoreTargetMachine.h
@@ -43,7 +43,7 @@ public:
   // Pass Pipeline Configuration
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 
-  TargetIRAnalysis getTargetIRAnalysis() override;
+  TargetTransformInfo getTargetTransformInfo(const Function &F) override;
 
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
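The last four files move targets from overriding getTargetIRAnalysis(), where each backend wrapped its TTI in its own callback, to overriding a plain getTargetTransformInfo(const Function &) hook so the callback is built once in common code. A mock-typed C++ sketch of the shape of that refactoring; every type below is an invented stand-in, not LLVM's:

#include <functional>
#include <string>

struct Function { std::string Name; };
struct TargetTransformInfo { int Cost = 0; };

struct TargetMachine {
  // New, narrower hook each target overrides.
  virtual TargetTransformInfo getTargetTransformInfo(const Function &F) {
    (void)F;
    return TargetTransformInfo{};
  }
  // The callback wrapper is now constructed once here instead of being
  // duplicated in every backend's getTargetIRAnalysis().
  std::function<TargetTransformInfo(const Function &)> getTargetIRAnalysis() {
    return [this](const Function &F) { return getTargetTransformInfo(F); };
  }
  virtual ~TargetMachine() = default;
};

struct X86LikeTargetMachine : TargetMachine {
  TargetTransformInfo getTargetTransformInfo(const Function &F) override {
    // Trivial stand-in for constructing a real TTI implementation.
    return TargetTransformInfo{static_cast<int>(F.Name.size())};
  }
};

int main() {
  X86LikeTargetMachine TM;
  auto Analysis = TM.getTargetIRAnalysis();
  return Analysis(Function{"f"}).Cost == 1 ? 0 : 1;
}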