From 581a6d8501ff5614297da837b81ed3b6956361ea Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Sat, 14 Jan 2017 15:37:50 +0000 Subject: Vendor import of llvm release_40 branch r292009: https://llvm.org/svn/llvm-project/llvm/branches/release_40@292009 --- lib/Target/AArch64/AArch64GenRegisterBankInfo.def | 129 ++++++- lib/Target/AArch64/AArch64ISelLowering.cpp | 2 + lib/Target/AArch64/AArch64InstrInfo.td | 5 +- lib/Target/AArch64/AArch64RegisterBankInfo.cpp | 33 +- lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 25 +- lib/Target/AArch64/AArch64TargetTransformInfo.h | 3 +- lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 236 ++++++++++++ lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 + lib/Target/AMDGPU/AMDGPUInstructions.td | 3 +- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 2 +- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 3 +- lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 116 ++++-- lib/Target/AMDGPU/EvergreenInstructions.td | 4 +- lib/Target/AMDGPU/SIFoldOperands.cpp | 399 +++++++++++++-------- lib/Target/AMDGPU/SIInstrInfo.td | 42 ++- lib/Target/AMDGPU/SIInstructions.td | 5 + lib/Target/AMDGPU/SIShrinkInstructions.cpp | 36 +- lib/Target/AMDGPU/VOP1Instructions.td | 2 +- lib/Target/AMDGPU/VOP2Instructions.td | 8 +- lib/Target/AMDGPU/VOPCInstructions.td | 4 +- lib/Target/ARM/ARMISelLowering.cpp | 198 ++++++++-- lib/Target/ARM/ARMISelLowering.h | 41 ++- lib/Target/ARM/ARMRegisterBankInfo.cpp | 35 +- lib/Target/ARM/ARMTargetTransformInfo.cpp | 3 +- lib/Target/ARM/ARMTargetTransformInfo.h | 3 +- lib/Target/Lanai/LanaiTargetTransformInfo.h | 3 +- lib/Target/Mips/MipsSEISelLowering.cpp | 162 ++++++++- lib/Target/NVPTX/ManagedStringPool.h | 7 +- lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 152 ++++---- lib/Target/NVPTX/NVPTXAsmPrinter.h | 58 ++- lib/Target/NVPTX/NVPTXISelLowering.cpp | 118 +++--- lib/Target/NVPTX/NVPTXInstrInfo.td | 2 +- lib/Target/NVPTX/NVPTXSection.h | 10 +- lib/Target/NVPTX/NVPTXTargetMachine.cpp | 37 +- lib/Target/NVPTX/NVPTXTargetObjectFile.h | 10 +- lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp | 2 +- lib/Target/NVPTX/NVPTXTargetTransformInfo.h | 3 +- lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 2 +- lib/Target/PowerPC/PPCTargetTransformInfo.h | 3 +- lib/Target/SystemZ/SystemZISelLowering.cpp | 22 +- lib/Target/TargetMachine.cpp | 9 +- lib/Target/WebAssembly/WebAssemblyFastISel.cpp | 3 + .../WebAssembly/WebAssemblyFixFunctionBitcasts.cpp | 17 +- .../WebAssembly/WebAssemblyTargetTransformInfo.cpp | 2 +- .../WebAssembly/WebAssemblyTargetTransformInfo.h | 3 +- lib/Target/X86/X86.td | 41 ++- lib/Target/X86/X86ISelDAGToDAG.cpp | 10 - lib/Target/X86/X86ISelLowering.cpp | 387 +++++++++++++++----- lib/Target/X86/X86InstrAVX512.td | 39 ++ lib/Target/X86/X86InstrSSE.td | 74 +++- lib/Target/X86/X86Subtarget.h | 2 +- lib/Target/X86/X86TargetTransformInfo.cpp | 57 ++- lib/Target/X86/X86TargetTransformInfo.h | 3 +- 53 files changed, 1991 insertions(+), 585 deletions(-) (limited to 'lib/Target') diff --git a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def index e927d58ad612b..d472a54d95437 100644 --- a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def +++ b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def @@ -18,9 +18,132 @@ namespace llvm { namespace AArch64 { -RegisterBank GPRRegBank; -RegisterBank FPRRegBank; -RegisterBank CCRRegBank; +const uint32_t GPRCoverageData[] = { + // Classes 0-31 + (1u << AArch64::GPR32allRegClassID) | (1u << AArch64::GPR32RegClassID) | + (1u << AArch64::GPR32spRegClassID) | + (1u << 
AArch64::GPR32commonRegClassID) | + (1u << AArch64::GPR32sponlyRegClassID) | + (1u << AArch64::GPR64allRegClassID) | (1u << AArch64::GPR64RegClassID) | + (1u << AArch64::GPR64spRegClassID) | + (1u << AArch64::GPR64commonRegClassID) | + (1u << AArch64::tcGPR64RegClassID) | + (1u << AArch64::GPR64sponlyRegClassID), + // Classes 32-63 + 0, + // FIXME: The entries below this point can be safely removed once this is + // tablegenerated. It's only needed because of the hardcoded register class + // limit. + // Classes 64-96 + 0, + // Classes 97-128 + 0, + // Classes 129-160 + 0, + // Classes 161-192 + 0, + // Classes 193-224 + 0, +}; + +const uint32_t FPRCoverageData[] = { + // Classes 0-31 + (1u << AArch64::FPR8RegClassID) | (1u << AArch64::FPR16RegClassID) | + (1u << AArch64::FPR32RegClassID) | (1u << AArch64::FPR64RegClassID) | + (1u << AArch64::DDRegClassID) | (1u << AArch64::FPR128RegClassID) | + (1u << AArch64::FPR128_loRegClassID) | (1u << AArch64::DDDRegClassID) | + (1u << AArch64::DDDDRegClassID), + // Classes 32-63 + (1u << (AArch64::QQRegClassID - 32)) | + (1u << (AArch64::QQ_with_qsub0_in_FPR128_loRegClassID - 32)) | + (1u << (AArch64::QQ_with_qsub1_in_FPR128_loRegClassID - 32)) | + (1u + << (AArch64:: + QQQ_with_qsub1_in_FPR128_lo_and_QQQ_with_qsub2_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQQ_with_qsub0_in_FPR128_lo_and_QQQ_with_qsub2_in_FPR128_loRegClassID - + 32)) | + (1u << (AArch64::QQQQRegClassID - 32)) | + (1u << (AArch64::QQQQ_with_qsub0_in_FPR128_loRegClassID - 32)) | + (1u << (AArch64::QQQQ_with_qsub1_in_FPR128_loRegClassID - 32)) | + (1u << (AArch64::QQQQ_with_qsub2_in_FPR128_loRegClassID - 32)) | + (1u << (AArch64::QQQQ_with_qsub3_in_FPR128_loRegClassID - 32)) | + (1u + << (AArch64:: + QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub1_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQQQ_with_qsub1_in_FPR128_lo_and_QQQQ_with_qsub2_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQQQ_with_qsub2_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub2_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQQQ_with_qsub1_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQ_with_qsub0_in_FPR128_lo_and_QQ_with_qsub1_in_FPR128_loRegClassID - + 32)) | + (1u << (AArch64::QQQRegClassID - 32)) | + (1u << (AArch64::QQQ_with_qsub0_in_FPR128_loRegClassID - 32)) | + (1u << (AArch64::QQQ_with_qsub1_in_FPR128_loRegClassID - 32)) | + (1u << (AArch64::QQQ_with_qsub2_in_FPR128_loRegClassID - 32)) | + (1u + << (AArch64:: + QQQ_with_qsub0_in_FPR128_lo_and_QQQ_with_qsub1_in_FPR128_loRegClassID - + 32)), + // FIXME: The entries below this point can be safely removed once this + // is tablegenerated. It's only needed because of the hardcoded register + // class limit. + // Classes 64-96 + 0, + // Classes 97-128 + 0, + // Classes 129-160 + 0, + // Classes 161-192 + 0, + // Classes 193-224 + 0, +}; + +const uint32_t CCRCoverageData[] = { + // Classes 0-31 + 1u << AArch64::CCRRegClassID, + // Classes 32-63 + 0, + // FIXME: The entries below this point can be safely removed once this + // is tablegenerated. It's only needed because of the hardcoded register + // class limit. 
+ // Classes 64-96 + 0, + // Classes 97-128 + 0, + // Classes 129-160 + 0, + // Classes 161-192 + 0, + // Classes 193-224 + 0, +}; + +RegisterBank GPRRegBank(AArch64::GPRRegBankID, "GPR", 64, GPRCoverageData); +RegisterBank FPRRegBank(AArch64::FPRRegBankID, "FPR", 512, FPRCoverageData); +RegisterBank CCRRegBank(AArch64::CCRRegBankID, "CCR", 32, CCRCoverageData); RegisterBank *RegBanks[] = {&GPRRegBank, &FPRRegBank, &CCRRegBank}; diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 74a01835171be..7b581a706fa22 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -159,6 +159,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SETCC, MVT::i64, Custom); setOperationAction(ISD::SETCC, MVT::f32, Custom); setOperationAction(ISD::SETCC, MVT::f64, Custom); + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); setOperationAction(ISD::BRCOND, MVT::Other, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Custom); setOperationAction(ISD::BR_CC, MVT::i64, Custom); diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index c5b95f282ea87..2244baacca17b 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -951,10 +951,7 @@ def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>; defm CLS : OneOperandData<0b101, "cls">; defm CLZ : OneOperandData<0b100, "clz", ctlz>; -defm RBIT : OneOperandData<0b000, "rbit">; - -def : Pat<(int_aarch64_rbit GPR32:$Rn), (RBITWr $Rn)>; -def : Pat<(int_aarch64_rbit GPR64:$Rn), (RBITXr $Rn)>; +defm RBIT : OneOperandData<0b000, "rbit", bitreverse>; def REV16Wr : OneWRegData<0b001, "rev16", UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>; diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp index a5fd2fbdde196..b292c9c87dcd6 100644 --- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -41,28 +41,30 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) if (AlreadyInit) return; AlreadyInit = true; - // Initialize the GPR bank. - createRegisterBank(AArch64::GPRRegBankID, "GPR"); - // The GPR register bank is fully defined by all the registers in - // GR64all + its subclasses. - addRegBankCoverage(AArch64::GPRRegBankID, AArch64::GPR64allRegClassID, TRI); + const RegisterBank &RBGPR = getRegBank(AArch64::GPRRegBankID); (void)RBGPR; assert(&AArch64::GPRRegBank == &RBGPR && "The order in RegBanks is messed up"); + + const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID); + (void)RBFPR; + assert(&AArch64::FPRRegBank == &RBFPR && + "The order in RegBanks is messed up"); + + const RegisterBank &RBCCR = getRegBank(AArch64::CCRRegBankID); + (void)RBCCR; + assert(&AArch64::CCRRegBank == &RBCCR && + "The order in RegBanks is messed up"); + + // The GPR register bank is fully defined by all the registers in + // GR64all + its subclasses. assert(RBGPR.covers(*TRI.getRegClass(AArch64::GPR32RegClassID)) && "Subclass not added?"); assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit"); - // Initialize the FPR bank. - createRegisterBank(AArch64::FPRRegBankID, "FPR"); // The FPR register bank is fully defined by all the registers in // GR64all + its subclasses. 
- addRegBankCoverage(AArch64::FPRRegBankID, AArch64::QQQQRegClassID, TRI);
- const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID);
- (void)RBFPR;
- assert(&AArch64::FPRRegBank == &RBFPR &&
- "The order in RegBanks is messed up");
 assert(RBFPR.covers(*TRI.getRegClass(AArch64::QQRegClassID)) &&
 "Subclass not added?");
 assert(RBFPR.covers(*TRI.getRegClass(AArch64::FPR64RegClassID)) &&
@@ -70,13 +72,6 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
 assert(RBFPR.getSize() == 512 &&
 "FPRs should hold up to 512-bit via QQQQ sequence");
 
- // Initialize the CCR bank.
- createRegisterBank(AArch64::CCRRegBankID, "CCR");
- addRegBankCoverage(AArch64::CCRRegBankID, AArch64::CCRRegClassID, TRI);
- const RegisterBank &RBCCR = getRegBank(AArch64::CCRRegBankID);
- (void)RBCCR;
- assert(&AArch64::CCRRegBank == &RBCCR &&
- "The order in RegBanks is messed up");
 assert(RBCCR.covers(*TRI.getRegClass(AArch64::CCRRegClassID)) &&
 "Class not added?");
 assert(RBCCR.getSize() == 32 && "CCR should hold up to 32-bit");
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 1a17691fc5846..b8833e5a5552d 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -374,7 +374,7 @@ int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
 int AArch64TTIImpl::getArithmeticInstrCost(
 unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
 TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo) {
+ TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
 // Legalize the type.
 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
@@ -466,28 +466,27 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
 }
-int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
 unsigned Alignment, unsigned AddressSpace) {
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+ auto LT = TLI->getTypeLegalizationCost(DL, Ty);
 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
- Src->isVectorTy() && Alignment != 16 &&
- Src->getVectorElementType()->isIntegerTy(64)) {
- // Unaligned stores are extremely inefficient. We don't split
- // unaligned v2i64 stores because the negative impact that has shown in
- // practice on inlined memcpy code.
- // We make v2i64 stores expensive so that we will only vectorize if there
+ LT.second.is128BitVector() && Alignment < 16) {
+ // Unaligned stores are extremely inefficient. We don't split all
+ // unaligned 128-bit stores because the negative impact that has shown in
+ // practice on inlined block copy code.
+ // We make such stores expensive so that we will only vectorize if there
 are 6 other instructions getting vectorized.
- int AmortizationCost = 6;
+ const int AmortizationCost = 6;
 return LT.first * 2 * AmortizationCost;
 }
- if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
- Src->getVectorNumElements() < 8) {
+ if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8) &&
+ Ty->getVectorNumElements() < 8) {
 // We scalarize the loads/stores because there is not v.4b register and we
 // have to promote the elements to v.4h. 
- unsigned NumVecElts = Src->getVectorNumElements();
+ unsigned NumVecElts = Ty->getVectorNumElements();
 unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
 // We generate 2 instructions per vector element.
 return NumVectorizableInstsToAmortize * NumVecElts * 2;
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 849fd3d9b44ae..18287ed6653f7 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -102,7 +102,8 @@ public:
 TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
 TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
 TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
- TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
 int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr);
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 730bcdcf7afa5..e48c1943cb016 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -434,6 +434,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 setSchedulingPreference(Sched::RegPressure);
 setJumpIsExpensive(true);
+
+ // FIXME: This is only partially true. If we have to do vector compares, any
+ // SGPR pair can be a condition register. If we have a uniform condition, we
+ // are better off doing SALU operations, where there is only one SCC. For now,
+ // we don't have a way of knowing during instruction selection if a condition
+ // will be uniform and we always use vector compares. Assume we are using
+ // vector compares until that is fixed.
 setHasMultipleConditionRegisters(true);
 // SI at least has hardware support for floating point exceptions, but no way
@@ -470,12 +477,31 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 setTargetDAGCombine(ISD::STORE);
 setTargetDAGCombine(ISD::FADD);
 setTargetDAGCombine(ISD::FSUB);
+ setTargetDAGCombine(ISD::FNEG);
 }
//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//
+static bool fnegFoldsIntoOp(unsigned Opc) {
+ switch (Opc) {
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FMA:
+ case ISD::FMAD:
+ case ISD::FSIN:
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RCP_LEGACY:
+ case AMDGPUISD::SIN_HW:
+ case AMDGPUISD::FMUL_LEGACY:
+ return true;
+ default:
+ return false;
+ }
+}
+
 MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
 return MVT::i32;
 }
@@ -2679,8 +2705,93 @@ SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
 return SDValue();
 }
+static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
+ unsigned Op,
+ const SDLoc &SL,
+ SDValue Cond,
+ SDValue N1,
+ SDValue N2) {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N1.getValueType();
+
+ SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
+ N1.getOperand(0), N2.getOperand(0));
+ DCI.AddToWorklist(NewSelect.getNode());
+ return DAG.getNode(Op, SL, VT, NewSelect);
+}
+
+// Pull a free FP operation out of a select so it may fold into uses. 
+//
+// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
+// select c, (fneg x), k -> fneg (select c, x, (fneg k))
+//
+// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
+// select c, (fabs x), +k -> fabs (select c, x, k)
+static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
+ SDValue N) {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue Cond = N.getOperand(0);
+ SDValue LHS = N.getOperand(1);
+ SDValue RHS = N.getOperand(2);
+
+ EVT VT = N.getValueType();
+ if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
+ (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
+ return distributeOpThroughSelect(DCI, LHS.getOpcode(),
+ SDLoc(N), Cond, LHS, RHS);
+ }
+
+ bool Inv = false;
+ if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
+ std::swap(LHS, RHS);
+ Inv = true;
+ }
+
+ // TODO: Support vector constants.
+ ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
+ if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
+ SDLoc SL(N);
+ // If one side is an fneg/fabs and the other is a constant, we can push the
+ // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
+ SDValue NewLHS = LHS.getOperand(0);
+ SDValue NewRHS = RHS;
+
+ // Careful: if the neg can be folded up, don't try to pull it back down.
+ bool ShouldFoldNeg = true;
+
+ if (NewLHS.hasOneUse()) {
+ unsigned Opc = NewLHS.getOpcode();
+ if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
+ ShouldFoldNeg = false;
+ if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
+ ShouldFoldNeg = false;
+ }
+
+ if (ShouldFoldNeg) {
+ if (LHS.getOpcode() == ISD::FNEG)
+ NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+ else if (CRHS->isNegative())
+ return SDValue();
+
+ if (Inv)
+ std::swap(NewLHS, NewRHS);
+
+ SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
+ Cond, NewLHS, NewRHS);
+ DCI.AddToWorklist(NewSelect.getNode());
+ return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
+ }
+ }
+
+ return SDValue();
+}
+
+
 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
 DAGCombinerInfo &DCI) const {
+ if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
+ return Folded;
+
 SDValue Cond = N->getOperand(0);
 if (Cond.getOpcode() != ISD::SETCC)
 return SDValue();
@@ -2724,6 +2835,129 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
 return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
 }
+SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ unsigned Opc = N0.getOpcode();
+
+ // If the input has multiple uses and we can either fold the negate down, or
+ // the other uses cannot, give up. This both prevents unprofitable
+ // transformations and infinite loops: we won't repeatedly try to fold around
+ // a negate that has no 'good' form. 
+ // + // TODO: Check users can fold + if (fnegFoldsIntoOp(Opc) && !N0.hasOneUse()) + return SDValue(); + + SDLoc SL(N); + switch (Opc) { + case ISD::FADD: { + // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y)) + SDValue LHS = N0.getOperand(0); + SDValue RHS = N0.getOperand(1); + + if (LHS.getOpcode() != ISD::FNEG) + LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS); + else + LHS = LHS.getOperand(0); + + if (RHS.getOpcode() != ISD::FNEG) + RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + else + RHS = RHS.getOperand(0); + + SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS); + if (!N0.hasOneUse()) + DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); + return Res; + } + case ISD::FMUL: + case AMDGPUISD::FMUL_LEGACY: { + // (fneg (fmul x, y)) -> (fmul x, (fneg y)) + // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y)) + SDValue LHS = N0.getOperand(0); + SDValue RHS = N0.getOperand(1); + + if (LHS.getOpcode() == ISD::FNEG) + LHS = LHS.getOperand(0); + else if (RHS.getOpcode() == ISD::FNEG) + RHS = RHS.getOperand(0); + else + RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + + SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS); + if (!N0.hasOneUse()) + DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); + return Res; + } + case ISD::FMA: + case ISD::FMAD: { + // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z)) + SDValue LHS = N0.getOperand(0); + SDValue MHS = N0.getOperand(1); + SDValue RHS = N0.getOperand(2); + + if (LHS.getOpcode() == ISD::FNEG) + LHS = LHS.getOperand(0); + else if (MHS.getOpcode() == ISD::FNEG) + MHS = MHS.getOperand(0); + else + MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS); + + if (RHS.getOpcode() != ISD::FNEG) + RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + else + RHS = RHS.getOperand(0); + + SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS); + if (!N0.hasOneUse()) + DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); + return Res; + } + case ISD::FP_EXTEND: + case AMDGPUISD::RCP: + case AMDGPUISD::RCP_LEGACY: + case ISD::FSIN: + case AMDGPUISD::SIN_HW: { + SDValue CvtSrc = N0.getOperand(0); + if (CvtSrc.getOpcode() == ISD::FNEG) { + // (fneg (fp_extend (fneg x))) -> (fp_extend x) + // (fneg (rcp (fneg x))) -> (rcp x) + return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0)); + } + + if (!N0.hasOneUse()) + return SDValue(); + + // (fneg (fp_extend x)) -> (fp_extend (fneg x)) + // (fneg (rcp x)) -> (rcp (fneg x)) + SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); + return DAG.getNode(Opc, SL, VT, Neg); + } + case ISD::FP_ROUND: { + SDValue CvtSrc = N0.getOperand(0); + + if (CvtSrc.getOpcode() == ISD::FNEG) { + // (fneg (fp_round (fneg x))) -> (fp_round x) + return DAG.getNode(ISD::FP_ROUND, SL, VT, + CvtSrc.getOperand(0), N0.getOperand(1)); + } + + if (!N0.hasOneUse()) + return SDValue(); + + // (fneg (fp_round x)) -> (fp_round (fneg x)) + SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); + return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1)); + } + default: + return SDValue(); + } +} + SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -2829,6 +3063,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, return performMulLoHi24Combine(N, DCI); case ISD::SELECT: return performSelectCombine(N, DCI); + case ISD::FNEG: + return performFNegCombine(N, DCI); case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { assert(!N->getValueType(0).isVector() && diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h 
b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 745c9923de2ee..69567aa5f713b 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -84,6 +84,7 @@ protected:
 SDValue performCtlzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
 SDValue RHS, DAGCombinerInfo &DCI) const;
 SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index 513df3a9cdf39..59cba636c5860 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -629,9 +629,10 @@ def smax_oneuse : HasOneUseBinOp<smax>;
 def smin_oneuse : HasOneUseBinOp<smin>;
 def umax_oneuse : HasOneUseBinOp<umax>;
 def umin_oneuse : HasOneUseBinOp<umin>;
-def sub_oneuse : HasOneUseBinOp<sub>;
 } // Properties = [SDNPCommutative, SDNPAssociative]
 
+def sub_oneuse : HasOneUseBinOp<sub>;
+
 def select_oneuse : HasOneUseTernaryOp<select>;
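
Note on the one-use PatFrags in the hunk above: HasOneUseBinOp and HasOneUseTernaryOp wrap an arbitrary SelectionDAG operator in a PatFrag whose C++ predicate rejects nodes with more than one use, so patterns built on sub_oneuse, select_oneuse, etc. match only when the intermediate value has a single consumer. A minimal sketch of such class definitions follows; the exact upstream code in AMDGPUInstructions.td may differ slightly.

  class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
    (ops node:$src0, node:$src1),
    (op $src0, $src1),
    [{ return N->hasOneUse(); }]   // match only if the result feeds one user
  >;

  class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag<
    (ops node:$src0, node:$src1, node:$src2),
    (op $src0, $src1, $src2),
    [{ return N->hasOneUse(); }]   // same single-use gate for 3-operand ops
  >;

The hunk also moves sub_oneuse out of the Properties = [SDNPCommutative, SDNPAssociative] block: sub is neither commutative nor associative, so leaving it inside would let the pattern matcher reorder its operands incorrectly.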