diff options
author | Dimitry Andric <dim@FreeBSD.org> | 2017-07-01 13:22:02 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2017-07-01 13:22:02 +0000 |
commit | 9df3605dea17e84f8183581f6103bd0c79e2a606 (patch) | |
tree | 70a2f36ce9eb9bb213603cd7f2f120af53fc176f /lib/Target/AMDGPU | |
parent | 08bbd35a80bf7765fe0d3043f9eb5a2f2786b649 (diff) |
Notes
Diffstat (limited to 'lib/Target/AMDGPU')
-rw-r--r-- | lib/Target/AMDGPU/AMDGPU.h | 1 | ||||
-rw-r--r-- | lib/Target/AMDGPU/AMDGPU.td | 6 | ||||
-rw-r--r-- | lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp | 4 | ||||
-rw-r--r-- | lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 2 | ||||
-rw-r--r-- | lib/Target/AMDGPU/AMDGPUSubtarget.h | 6 | ||||
-rw-r--r-- | lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 1 | ||||
-rw-r--r-- | lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 2 | ||||
-rw-r--r-- | lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 3 | ||||
-rw-r--r-- | lib/Target/AMDGPU/CMakeLists.txt | 1 | ||||
-rw-r--r-- | lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 4 | ||||
-rw-r--r-- | lib/Target/AMDGPU/SIISelLowering.cpp | 60 | ||||
-rw-r--r-- | lib/Target/AMDGPU/SIInstrInfo.cpp | 18 | ||||
-rw-r--r-- | lib/Target/AMDGPU/SIInstrInfo.td | 4 | ||||
-rw-r--r-- | lib/Target/AMDGPU/SIInstructions.td | 8 | ||||
-rw-r--r-- | lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 44 | ||||
-rw-r--r-- | lib/Target/AMDGPU/SITypeRewriter.cpp | 156 |
16 files changed, 119 insertions, 201 deletions
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 55d18c3f3646b..5a799b2d88d0d 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -36,7 +36,6 @@ FunctionPass *createR600ControlFlowFinalizer(); FunctionPass *createAMDGPUCFGStructurizerPass(); // SI Passes -FunctionPass *createSITypeRewriter(); FunctionPass *createSIAnnotateControlFlowPass(); FunctionPass *createSIFoldOperandsPass(); FunctionPass *createSIPeepholeSDWAPass(); diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 7494e5decd6f6..f1d899c4d0039 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -262,8 +262,8 @@ def FeatureSDWAMac : SubtargetFeature<"sdwa-mav", "Support v_mac_f32/f16 with SDWA (Sub-DWORD Addressing) extension" >; -def FeatureSDWAClampVOPC : SubtargetFeature<"sdwa-clamp-vopc", - "HasSDWAClampVOPC", +def FeatureSDWAOutModsVOPC : SubtargetFeature<"sdwa-out-mods-vopc", + "HasSDWAOutModsVOPC", "true", "Support clamp for VOPC with SDWA (Sub-DWORD Addressing) extension" >; @@ -452,7 +452,7 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel, FeatureScalarStores, FeatureInv2PiInlineImm, - FeatureSDWA, FeatureSDWAClampVOPC, FeatureSDWAMac, FeatureDPP + FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP ] >; diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp index 2071b6f157cd3..9a391d06c9ea9 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -1776,7 +1776,7 @@ static void removeExternalCFGEdges(MachineBasicBlock *StartMBB, E = EndMBB->succ_end(); PI != E; ++PI) { // Either we have a back-edge to the entry block, or a back-edge to the - // succesor of the entry block since the block may be split. + // successor of the entry block since the block may be split. if ((*PI) != StartMBB && !((*PI) == StartMBBSucc && StartMBB != EndMBB && SuccSize == 1)) { Succs.insert( @@ -1831,7 +1831,7 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfBlock( IfBB->addSuccessor(CodeBBStart); DEBUG(dbgs() << "Created If block: " << IfBB->getNumber() << "\n"); - // Ensure that the MergeBB is a succesor of the CodeEndBB. + // Ensure that the MergeBB is a successor of the CodeEndBB. if (!CodeBBEnd->isSuccessor(MergeBB)) CodeBBEnd->addSuccessor(MergeBB); diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index ab5abf2039a5b..be47b900c6f06 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -128,7 +128,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasSDWAScalar(false), HasSDWASdst(false), HasSDWAMac(false), - HasSDWAClampVOPC(false), + HasSDWAOutModsVOPC(false), HasDPP(false), FlatAddressSpace(false), FlatInstOffsets(false), diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 2b16289c723ef..22cede59086ab 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -153,7 +153,7 @@ protected: bool HasSDWAScalar; bool HasSDWASdst; bool HasSDWAMac; - bool HasSDWAClampVOPC; + bool HasSDWAOutModsVOPC; bool HasDPP; bool FlatAddressSpace; bool FlatInstOffsets; @@ -452,8 +452,8 @@ public: return HasSDWAMac; } - bool hasSDWAClampVOPC() const { - return HasSDWAClampVOPC; + bool hasSDWAOutModsVOPC() const { + return HasSDWAOutModsVOPC; } /// \brief Returns the offset in bytes from the start of the input buffer diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 04fe9f689806c..425fd35d47de6 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -720,7 +720,6 @@ bool GCNPassConfig::addPreISel() { addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions } addPass(createSinkingPass()); - addPass(createSITypeRewriter()); addPass(createAMDGPUAnnotateUniformValues()); if (!LateCFGStructurize) { addPass(createSIAnnotateControlFlowPass()); diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 88245b01683a5..89a03902dc69b 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -63,7 +63,7 @@ static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, return false; } -void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, +void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) { UP.Threshold = 300; // Twice the default. UP.MaxCount = UINT_MAX; diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 485e20411ab49..9a320bdfcc3d4 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -68,7 +68,8 @@ public: bool hasBranchDivergence() { return true; } - void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP); TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) { assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index e30844f082cdd..917d9cfa69054 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -96,7 +96,6 @@ add_llvm_target(AMDGPUCodeGen SIPeepholeSDWA.cpp SIRegisterInfo.cpp SIShrinkInstructions.cpp - SITypeRewriter.cpp SIWholeQuadMode.cpp GCNIterativeScheduler.cpp GCNMinRegStrategy.cpp diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 04308fb3aaf64..f26e49295e690 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -626,7 +626,9 @@ MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width, using namespace AMDGPU::SDWA; if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) { - if (SDWA9EncValues::SRC_VGPR_MIN <= Val && + // XXX: static_cast<int> is needed to avoid stupid warning: + // compare with unsigned is always true + if (SDWA9EncValues::SRC_VGPR_MIN <= static_cast<int>(Val) && Val <= SDWA9EncValues::SRC_VGPR_MAX) { return createRegOperand(getVgprClassId(Width), Val - SDWA9EncValues::SRC_VGPR_MIN); diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index d0f4e00994de1..d39b345bdf032 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4314,6 +4314,23 @@ SDValue SITargetLowering::splitBinaryBitConstantOp( return SDValue(); } +// Returns true if argument is a boolean value which is not serialized into +// memory or argument and does not require v_cmdmask_b32 to be deserialized. +static bool isBoolSGPR(SDValue V) { + if (V.getValueType() != MVT::i1) + return false; + switch (V.getOpcode()) { + default: break; + case ISD::SETCC: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + case AMDGPUISD::FP_CLASS: + return true; + } + return false; +} + SDValue SITargetLowering::performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (DCI.isBeforeLegalize()) @@ -4402,6 +4419,16 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, } } + if (VT == MVT::i32 && + (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) { + // and x, (sext cc from i1) => select cc, x, 0 + if (RHS.getOpcode() != ISD::SIGN_EXTEND) + std::swap(LHS, RHS); + if (isBoolSGPR(RHS.getOperand(0))) + return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), + LHS, DAG.getConstant(0, SDLoc(N), MVT::i32)); + } + return SDValue(); } @@ -4941,8 +4968,7 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, case ISD::SIGN_EXTEND: case ISD::ANY_EXTEND: { auto Cond = RHS.getOperand(0); - if (Cond.getOpcode() != ISD::SETCC && - Cond.getOpcode() != AMDGPUISD::FP_CLASS) + if (!isBoolSGPR(Cond)) break; SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1); SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond }; @@ -5109,6 +5135,35 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); EVT VT = LHS.getValueType(); + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + + auto CRHS = dyn_cast<ConstantSDNode>(RHS); + if (!CRHS) { + CRHS = dyn_cast<ConstantSDNode>(LHS); + if (CRHS) { + std::swap(LHS, RHS); + CC = getSetCCSwappedOperands(CC); + } + } + + if (CRHS && VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND && + isBoolSGPR(LHS.getOperand(0))) { + // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1 + // setcc (sext from i1 cc), -1, eq|sle|uge) => cc + // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1 + // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc + if ((CRHS->isAllOnesValue() && + (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) || + (CRHS->isNullValue() && + (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE))) + return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), + DAG.getConstant(-1, SL, MVT::i1)); + if ((CRHS->isAllOnesValue() && + (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) || + (CRHS->isNullValue() && + (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT))) + return LHS.getOperand(0); + } if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() && VT != MVT::f16)) @@ -5116,7 +5171,6 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, // Match isinf pattern // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) - ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) { const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); if (!CRHS) diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index c9b48fea7225e..b6784ec14e9f8 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -770,7 +770,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, if (ST.hasScalarStores()) { // m0 is used for offset to scalar stores if used to spill. - Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); + Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); } return; @@ -871,7 +871,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, if (ST.hasScalarStores()) { // m0 is used for offset to scalar stores if used to spill. - Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); + Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); } return; @@ -2444,8 +2444,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); - if ( DstIdx == -1) - DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::sdst); const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx }; @@ -2488,14 +2486,20 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; return false; } - } else if (!ST.hasSDWAClampVOPC()) { + } else if (!ST.hasSDWAOutModsVOPC()) { // No clamp allowed on GFX9 for VOPC const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); - if (Clamp != nullptr && - (!Clamp->isImm() || Clamp->getImm() != 0)) { + if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) { ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI"; return false; } + + // No omod allowed on GFX9 for VOPC + const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); + if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) { + ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI"; + return false; + } } } } diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 3b4a8b5d1e817..4a81fb3b463a9 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -336,6 +336,10 @@ def NegSubInlineConst16 : ImmLeaf<i16, [{ return Imm < -16 && Imm >= -64; }], NegateImm>; +def ShiftAmt32Imm : PatLeaf <(imm), [{ + return N->getZExtValue() < 32; +}]>; + //===----------------------------------------------------------------------===// // Custom Operands //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 3b4bdc864253c..bcc685015cf5e 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -929,6 +929,14 @@ def : UMad24Pat<V_MAD_U32_U24>; defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>; def : ROTRPattern <V_ALIGNBIT_B32>; +def : Pat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), + (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), + (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; + +def : Pat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), + (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), + (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; + /********** ====================== **********/ /********** Indirect addressing **********/ /********** ====================== **********/ diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 4ac23ef03cb32..e2ac6631d2f32 100644 --- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -627,10 +627,13 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI, return false; } - if (!ST.hasSDWAClampVOPC() && TII->hasModifiersSet(MI, AMDGPU::OpName::clamp)) + if (!ST.hasSDWAOutModsVOPC() && + (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) || + TII->hasModifiersSet(MI, AMDGPU::OpName::omod))) return false; - } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) { + } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) || + !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { return false; } @@ -649,25 +652,24 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(MI.getOpcode())); assert(SDWAOpcode != -1); - // Copy dst, if it is present in original then should also be present in SDWA - MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (!Dst && !TII->isVOPC(MI)) - return false; - const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode); // Create SDWA version of instruction MI and initialize its operands MachineInstrBuilder SDWAInst = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc); + // Copy dst, if it is present in original then should also be present in SDWA + MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); if (Dst) { assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1); SDWAInst.add(*Dst); - } else { - Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); + } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) { assert(Dst && AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1); SDWAInst.add(*Dst); + } else { + assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1); + SDWAInst.addReg(AMDGPU::VCC, RegState::Define); } // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and @@ -714,20 +716,22 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, } // Copy omod if present, initialize otherwise if needed - MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod); - if (OMod) { - assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1); - SDWAInst.add(*OMod); - } else if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) { - SDWAInst.addImm(0); + if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) { + MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod); + if (OMod) { + SDWAInst.add(*OMod); + } else { + SDWAInst.addImm(0); + } } - // Initialize dst_sel and dst_unused if present - if (Dst) { - assert( - AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1 && - AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1); + // Initialize dst_sel if present + if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) { SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); + } + + // Initialize dst_unused if present + if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) { SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD); } diff --git a/lib/Target/AMDGPU/SITypeRewriter.cpp b/lib/Target/AMDGPU/SITypeRewriter.cpp deleted file mode 100644 index aad68537f7791..0000000000000 --- a/lib/Target/AMDGPU/SITypeRewriter.cpp +++ /dev/null @@ -1,156 +0,0 @@ -//===-- SITypeRewriter.cpp - Remove unwanted types ------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass removes performs the following type substitution on all -/// non-compute shaders: -/// -/// v16i8 => i128 -/// - v16i8 is used for constant memory resource descriptors. This type is -/// legal for some compute APIs, and we don't want to declare it as legal -/// in the backend, because we want the legalizer to expand all v16i8 -/// operations. -/// v1* => * -/// - Having v1* types complicates the legalizer and we can easily replace -/// - them with the element type. -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "Utils/AMDGPUBaseInfo.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstVisitor.h" - -using namespace llvm; - -namespace { - -class SITypeRewriter : public FunctionPass, - public InstVisitor<SITypeRewriter> { - - static char ID; - Module *Mod; - Type *v16i8; - Type *v4i32; - -public: - SITypeRewriter() : FunctionPass(ID) { } - bool doInitialization(Module &M) override; - bool runOnFunction(Function &F) override; - StringRef getPassName() const override { return "SI Type Rewriter"; } - void visitLoadInst(LoadInst &I); - void visitCallInst(CallInst &I); - void visitBitCast(BitCastInst &I); -}; - -} // End anonymous namespace - -char SITypeRewriter::ID = 0; - -bool SITypeRewriter::doInitialization(Module &M) { - Mod = &M; - v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16); - v4i32 = VectorType::get(Type::getInt32Ty(M.getContext()), 4); - return false; -} - -bool SITypeRewriter::runOnFunction(Function &F) { - if (!AMDGPU::isShader(F.getCallingConv())) - return false; - - visit(F); - visit(F); - - return false; -} - -void SITypeRewriter::visitLoadInst(LoadInst &I) { - Value *Ptr = I.getPointerOperand(); - Type *PtrTy = Ptr->getType(); - Type *ElemTy = PtrTy->getPointerElementType(); - IRBuilder<> Builder(&I); - if (ElemTy == v16i8) { - Value *BitCast = Builder.CreateBitCast(Ptr, - PointerType::get(v4i32,PtrTy->getPointerAddressSpace())); - LoadInst *Load = Builder.CreateLoad(BitCast); - SmallVector<std::pair<unsigned, MDNode *>, 8> MD; - I.getAllMetadataOtherThanDebugLoc(MD); - for (unsigned i = 0, e = MD.size(); i != e; ++i) { - Load->setMetadata(MD[i].first, MD[i].second); - } - Value *BitCastLoad = Builder.CreateBitCast(Load, I.getType()); - I.replaceAllUsesWith(BitCastLoad); - I.eraseFromParent(); - } -} - -void SITypeRewriter::visitCallInst(CallInst &I) { - IRBuilder<> Builder(&I); - - SmallVector <Value*, 8> Args; - SmallVector <Type*, 8> Types; - bool NeedToReplace = false; - Function *F = I.getCalledFunction(); - if (!F) - return; - - std::string Name = F->getName(); - for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { - Value *Arg = I.getArgOperand(i); - if (Arg->getType() == v16i8) { - Args.push_back(Builder.CreateBitCast(Arg, v4i32)); - Types.push_back(v4i32); - NeedToReplace = true; - Name = Name + ".v4i32"; - } else if (Arg->getType()->isVectorTy() && - Arg->getType()->getVectorNumElements() == 1 && - Arg->getType()->getVectorElementType() == - Type::getInt32Ty(I.getContext())){ - Type *ElementTy = Arg->getType()->getVectorElementType(); - std::string TypeName = "i32"; - InsertElementInst *Def = cast<InsertElementInst>(Arg); - Args.push_back(Def->getOperand(1)); - Types.push_back(ElementTy); - std::string VecTypeName = "v1" + TypeName; - Name = Name.replace(Name.find(VecTypeName), VecTypeName.length(), TypeName); - NeedToReplace = true; - } else { - Args.push_back(Arg); - Types.push_back(Arg->getType()); - } - } - - if (!NeedToReplace) { - return; - } - Function *NewF = Mod->getFunction(Name); - if (!NewF) { - NewF = Function::Create(FunctionType::get(F->getReturnType(), Types, false), GlobalValue::ExternalLinkage, Name, Mod); - NewF->setAttributes(F->getAttributes()); - } - I.replaceAllUsesWith(Builder.CreateCall(NewF, Args)); - I.eraseFromParent(); -} - -void SITypeRewriter::visitBitCast(BitCastInst &I) { - IRBuilder<> Builder(&I); - if (I.getDestTy() != v4i32) { - return; - } - - if (BitCastInst *Op = dyn_cast<BitCastInst>(I.getOperand(0))) { - if (Op->getSrcTy() == v4i32) { - I.replaceAllUsesWith(Op->getOperand(0)); - I.eraseFromParent(); - } - } -} - -FunctionPass *llvm::createSITypeRewriter() { - return new SITypeRewriter(); -} |