author | Dimitry Andric <dim@FreeBSD.org> | 2022-07-24 15:03:44 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2022-07-24 15:03:44 +0000
commit | 4b4fe385e49bd883fd183b5f21c1ea486c722e61 (patch)
tree | c3d8fdb355c9c73e57723718c22103aaf7d15aa6 /llvm/lib/Target
parent | 1f917f69ff07f09b6dbb670971f57f8efe718b84 (diff)
download | src-4b4fe385e49bd883fd183b5f21c1ea486c722e61.tar.gz, src-4b4fe385e49bd883fd183b5f21c1ea486c722e61.zip
Diffstat (limited to 'llvm/lib/Target')
160 files changed, 5435 insertions, 1674 deletions
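For reference, the same change can also be inspected locally rather than through the web view. A minimal sketch, assuming a clone of the FreeBSD src repository (repository URL and clone options are illustrative, not taken from this page) and restricting output to the llvm/lib/Target subtree shown in the diffstat above:

    # partial (blobless) clone of the FreeBSD src repository
    git clone --filter=blob:none https://git.freebsd.org/src.git && cd src
    # per-file summary limited to llvm/lib/Target, matching the diffstat above
    git show --stat 4b4fe385e49bd883fd183b5f21c1ea486c722e61 -- llvm/lib/Target
    # full patch text for the same subtree
    git show 4b4fe385e49bd883fd183b5f21c1ea486c722e61 -- llvm/lib/Target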
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index b332e9dcb176..8fb5d49e2121 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -216,7 +216,7 @@ def FeatureSlowPaired128 : SubtargetFeature<"slow-paired-128", "IsPaired128Slow", "true", "Paired 128 bit loads and stores are slow">; def FeatureAscendStoreAddress : SubtargetFeature<"ascend-store-address", - "IsStoreAddressAscend", "false", + "IsStoreAddressAscend", "true", "Schedule vector stores by ascending address">; def FeatureSlowSTRQro : SubtargetFeature<"slow-strqro-store", "IsSTRQroSlow", diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 82fe5772c99d..00621b84d2f2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -69,6 +69,7 @@ public: bool tryMLAV64LaneV128(SDNode *N); bool tryMULLV64LaneV128(unsigned IntNo, SDNode *N); bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift); + bool SelectArithUXTXRegister(SDValue N, SDValue &Reg, SDValue &Shift); bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift); bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift); bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) { @@ -893,6 +894,30 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg, return isWorthFolding(N); } +/// SelectArithUXTXRegister - Select a "UXTX register" operand. This +/// operand is refered by the instructions have SP operand +bool AArch64DAGToDAGISel::SelectArithUXTXRegister(SDValue N, SDValue &Reg, + SDValue &Shift) { + unsigned ShiftVal = 0; + AArch64_AM::ShiftExtendType Ext; + + if (N.getOpcode() != ISD::SHL) + return false; + + ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1)); + if (!CSD) + return false; + ShiftVal = CSD->getZExtValue(); + if (ShiftVal > 4) + return false; + + Ext = AArch64_AM::UXTX; + Reg = N.getOperand(0); + Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), SDLoc(N), + MVT::i32); + return isWorthFolding(N); +} + /// If there's a use of this ADDlow that's not itself a load/store then we'll /// need to create a real ADD instruction from it anyway and there's no point in /// folding it into the mem op. 
Theoretically, it shouldn't matter, but there's @@ -4049,6 +4074,24 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } break; } + case Intrinsic::swift_async_context_addr: { + SDLoc DL(Node); + SDValue Chain = Node->getOperand(0); + SDValue CopyFP = CurDAG->getCopyFromReg(Chain, DL, AArch64::FP, MVT::i64); + SDValue Res = SDValue( + CurDAG->getMachineNode(AArch64::SUBXri, DL, MVT::i64, CopyFP, + CurDAG->getTargetConstant(8, DL, MVT::i32), + CurDAG->getTargetConstant(0, DL, MVT::i32)), + 0); + ReplaceUses(SDValue(Node, 0), Res); + ReplaceUses(SDValue(Node, 1), CopyFP.getValue(1)); + CurDAG->RemoveDeadNode(Node); + + auto &MF = CurDAG->getMachineFunction(); + MF.getFrameInfo().setFrameAddressIsTaken(true); + MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true); + return; + } } } break; case ISD::INTRINSIC_WO_CHAIN: { @@ -4094,18 +4137,6 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { if (tryMULLV64LaneV128(IntNo, Node)) return; break; - case Intrinsic::swift_async_context_addr: { - SDLoc DL(Node); - CurDAG->SelectNodeTo(Node, AArch64::SUBXri, MVT::i64, - CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, - AArch64::FP, MVT::i64), - CurDAG->getTargetConstant(8, DL, MVT::i32), - CurDAG->getTargetConstant(0, DL, MVT::i32)); - auto &MF = CurDAG->getMachineFunction(); - MF.getFrameInfo().setFrameAddressIsTaken(true); - MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true); - return; - } } break; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 447ad10ddf22..e070ce2efa6b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -521,6 +521,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTPOP, MVT::i64, Custom); setOperationAction(ISD::CTPOP, MVT::i128, Custom); + setOperationAction(ISD::PARITY, MVT::i64, Custom); + setOperationAction(ISD::PARITY, MVT::i128, Custom); + setOperationAction(ISD::ABS, MVT::i32, Custom); setOperationAction(ISD::ABS, MVT::i64, Custom); @@ -5463,7 +5466,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::SRA_PARTS: return LowerShiftParts(Op, DAG); case ISD::CTPOP: - return LowerCTPOP(Op, DAG); + case ISD::PARITY: + return LowerCTPOP_PARITY(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::OR: @@ -7783,7 +7787,8 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, return BitCast(VT, BSP, DAG); } -SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { +SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op, + SelectionDAG &DAG) const { if (DAG.getMachineFunction().getFunction().hasFnAttribute( Attribute::NoImplicitFloat)) return SDValue(); @@ -7791,6 +7796,8 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { if (!Subtarget->hasNEON()) return SDValue(); + bool IsParity = Op.getOpcode() == ISD::PARITY; + // While there is no integer popcount instruction, it can // be more efficiently lowered to the following sequence that uses // AdvSIMD registers/instructions as long as the copies to/from @@ -7813,6 +7820,10 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); + if (IsParity) + UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV, + DAG.getConstant(1, DL, MVT::i32)); + if (VT == MVT::i64) UaddLV = 
DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); return UaddLV; @@ -7824,9 +7835,15 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); + if (IsParity) + UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV, + DAG.getConstant(1, DL, MVT::i32)); + return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV); } + assert(!IsParity && "ISD::PARITY of vector types not supported"); + if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU); @@ -11811,6 +11828,12 @@ bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { isConcatMask(M, VT, VT.getSizeInBits() == 128)); } +bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M, + EVT VT) const { + // Just delegate to the generic legality, clear masks aren't special. + return isShuffleMaskLegal(M, VT); +} + /// getVShiftImm - Check if this is a valid build_vector for the immediate /// operand of a vector shift operation, where all the elements of the /// build_vector must have the same constant integer value. @@ -11969,6 +11992,11 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, if (IsZero) return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS); + case AArch64CC::LE: + if (!NoNans) + return SDValue(); + // If we ignore NaNs then we can use to the LS implementation. + LLVM_FALLTHROUGH; case AArch64CC::LS: if (IsZero) return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS); @@ -12073,7 +12101,7 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, bool ShouldInvert; changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert); - bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath; + bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs(); SDValue Cmp = EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG); if (!Cmp.getNode()) @@ -13587,21 +13615,50 @@ AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const { bool AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const { - N = N->getOperand(0).getNode(); + assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA || + N->getOpcode() == ISD::SRL) && + "Expected shift op"); + + SDValue ShiftLHS = N->getOperand(0); EVT VT = N->getValueType(0); - // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine - // it with shift to let it be lowered to UBFX. - if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) && - isa<ConstantSDNode>(N->getOperand(1))) { - uint64_t TruncMask = N->getConstantOperandVal(1); + + // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not combine + // it with shift 'N' to let it be lowered to UBFX. 
+ if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) && + isa<ConstantSDNode>(ShiftLHS.getOperand(1))) { + uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1); if (isMask_64(TruncMask) && - N->getOperand(0).getOpcode() == ISD::SRL && - isa<ConstantSDNode>(N->getOperand(0)->getOperand(1))) + ShiftLHS.getOperand(0).getOpcode() == ISD::SRL && + isa<ConstantSDNode>(ShiftLHS.getOperand(0).getOperand(1))) return false; } return true; } +bool AArch64TargetLowering::isDesirableToCommuteXorWithShift( + const SDNode *N) const { + assert(N->getOpcode() == ISD::XOR && + (N->getOperand(0).getOpcode() == ISD::SHL || + N->getOperand(0).getOpcode() == ISD::SRL) && + "Expected XOR(SHIFT) pattern"); + + // Only commute if the entire NOT mask is a hidden shifted mask. + auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1)); + auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1)); + if (XorC && ShiftC) { + unsigned MaskIdx, MaskLen; + if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) { + unsigned ShiftAmt = ShiftC->getZExtValue(); + unsigned BitWidth = N->getValueType(0).getScalarSizeInBits(); + if (N->getOperand(0).getOpcode() == ISD::SHL) + return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt); + return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt); + } + } + + return false; +} + bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask( const SDNode *N, CombineLevel Level) const { assert(((N->getOpcode() == ISD::SHL && @@ -19221,6 +19278,41 @@ static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv); } +static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + + SDValue Insert = N->getOperand(0); + if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR) + return SDValue(); + + if (!Insert.getOperand(0).isUndef()) + return SDValue(); + + uint64_t IdxInsert = Insert.getConstantOperandVal(2); + uint64_t IdxDupLane = N->getConstantOperandVal(1); + if (IdxInsert != IdxDupLane) + return SDValue(); + + SDValue Bitcast = Insert.getOperand(1); + if (Bitcast.getOpcode() != ISD::BITCAST) + return SDValue(); + + SDValue Subvec = Bitcast.getOperand(0); + EVT SubvecVT = Subvec.getValueType(); + if (!SubvecVT.is128BitVector()) + return SDValue(); + EVT NewSubvecVT = + getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType()); + + SDLoc DL(N); + SDValue NewInsert = + DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT, + DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2)); + SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT, + NewInsert, N->getOperand(1)); + return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -19307,6 +19399,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performCSELCombine(N, DCI, DAG); case AArch64ISD::DUP: return performDUPCombine(N, DCI); + case AArch64ISD::DUPLANE128: + return performDupLane128Combine(N, DAG); case AArch64ISD::NVCAST: return performNVCASTCombine(N); case AArch64ISD::SPLICE: @@ -19981,7 +20075,8 @@ void AArch64TargetLowering::ReplaceNodeResults( return; case ISD::CTPOP: - if (SDValue Result = LowerCTPOP(SDValue(N, 0), DAG)) + case ISD::PARITY: + if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG)) Results.push_back(Result); return; case AArch64ISD::SADDV: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h 
b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index e02b5e56fd2e..1ba2e2f315ec 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -549,6 +549,10 @@ public: /// should be stack expanded. bool isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override; + /// Similar to isShuffleMaskLegal. Return true is the given 'select with zero' + /// shuffle mask can be codegen'd directly. + bool isVectorClearMaskLegal(ArrayRef<int> M, EVT VT) const override; + /// Return the ISD::SETCC ValueType. EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; @@ -653,6 +657,9 @@ public: bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override; + /// Returns false if N is a bit extraction pattern of (X >> C) & Mask. + bool isDesirableToCommuteXorWithShift(const SDNode *N) const override; + /// Return true if it is profitable to fold a pair of shifts into a mask. bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override; @@ -995,7 +1002,7 @@ private: SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTPOP_PARITY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBitreverse(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMinMax(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 02fa36a1df4b..e70d304f37b9 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -1168,6 +1168,8 @@ def gi_arith_extended_reg32to64_i64 : GIComplexOperandMatcher<s64, "selectArithExtendedRegister">, GIComplexPatternEquiv<arith_extended_reg32to64_i64>; +def arith_uxtx : ComplexPattern<i64, 2, "SelectArithUXTXRegister", []>; + // Floating-point immediate. def fpimm16XForm : SDNodeXForm<fpimm, [{ @@ -1234,6 +1236,10 @@ def fpimm0 : FPImmLeaf<fAny, [{ return Imm.isExactlyValue(+0.0); }]>; +def fpimm_minus0 : FPImmLeaf<fAny, [{ + return Imm.isExactlyValue(-0.0); +}]>; + def fpimm_half : FPImmLeaf<fAny, [{ return Imm.isExactlyValue(+0.5); }]>; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index d444223e4494..a7b7e5270888 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1691,6 +1691,11 @@ def : InstAlias<"mov $dst, $src", defm ADDS : AddSubS<0, "adds", AArch64add_flag, "cmn", "subs", "cmp">; defm SUBS : AddSubS<1, "subs", AArch64sub_flag, "cmp", "adds", "cmn">; +def copyFromSP: PatLeaf<(i64 GPR64:$src), [{ + return N->getOpcode() == ISD::CopyFromReg && + cast<RegisterSDNode>(N->getOperand(1))->getReg() == AArch64::SP; +}]>; + // Use SUBS instead of SUB to enable CSE between SUBS and SUB. 
def : Pat<(sub GPR32sp:$Rn, addsub_shifted_imm32:$imm), (SUBSWri GPR32sp:$Rn, addsub_shifted_imm32:$imm)>; @@ -1709,6 +1714,8 @@ def : Pat<(sub GPR32sp:$R2, arith_extended_reg32_i32:$R3), (SUBSWrx GPR32sp:$R2, arith_extended_reg32_i32:$R3)>; def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64_i64:$R3), (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64_i64:$R3)>; +def : Pat<(sub copyFromSP:$R2, (arith_uxtx GPR64:$R3, arith_extendlsl64:$imm)), + (SUBXrx64 GPR64sp:$R2, GPR64:$R3, arith_extendlsl64:$imm)>; } // Because of the immediate format for add/sub-imm instructions, the @@ -5293,6 +5300,9 @@ def : Pat<(int_aarch64_neon_pmull64 (extractelt (v2i64 V128:$Rn), (i64 1)), // CodeGen patterns for addhn and subhn instructions, which can actually be // written in LLVM IR without too much difficulty. +// Prioritize ADDHN and SUBHN over UZP2. +let AddedComplexity = 10 in { + // ADDHN def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))), (ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>; @@ -5343,6 +5353,8 @@ def : Pat<(concat_vectors (v2i32 V64:$Rd), (SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), V128:$Rn, V128:$Rm)>; +} // AddedComplexity = 10 + //---------------------------------------------------------------------------- // AdvSIMD bitwise extract from vector instruction. //---------------------------------------------------------------------------- @@ -5409,6 +5421,19 @@ def : Pat<(v4i32 (concat_vectors (v2i32 (trunc (v2i64 V128:$Vn))), (v2i32 (trunc (v2i64 V128:$Vm))))), (UZP1v4i32 V128:$Vn, V128:$Vm)>; +def : Pat<(v16i8 (concat_vectors + (v8i8 (trunc (AArch64vlshr (v8i16 V128:$Vn), (i32 8)))), + (v8i8 (trunc (AArch64vlshr (v8i16 V128:$Vm), (i32 8)))))), + (UZP2v16i8 V128:$Vn, V128:$Vm)>; +def : Pat<(v8i16 (concat_vectors + (v4i16 (trunc (AArch64vlshr (v4i32 V128:$Vn), (i32 16)))), + (v4i16 (trunc (AArch64vlshr (v4i32 V128:$Vm), (i32 16)))))), + (UZP2v8i16 V128:$Vn, V128:$Vm)>; +def : Pat<(v4i32 (concat_vectors + (v2i32 (trunc (AArch64vlshr (v2i64 V128:$Vn), (i32 32)))), + (v2i32 (trunc (AArch64vlshr (v2i64 V128:$Vm), (i32 32)))))), + (UZP2v4i32 V128:$Vn, V128:$Vm)>; + //---------------------------------------------------------------------------- // AdvSIMD TBL/TBX instructions //---------------------------------------------------------------------------- diff --git a/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp b/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp index 6c8845ee8598..79866c9b0a05 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp +++ b/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp @@ -22,8 +22,8 @@ static bool needReorderStoreMI(const MachineInstr *MI) { return false; case AArch64::STURQi: case AArch64::STRQui: - if (MI->getMF()->getSubtarget<AArch64Subtarget>().isStoreAddressAscend()) - return false; + if (!MI->getMF()->getSubtarget<AArch64Subtarget>().isStoreAddressAscend()) + return false; LLVM_FALLTHROUGH; case AArch64::STPQi: return AArch64InstrInfo::getLdStOffsetOp(*MI).isImm(); diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index c66f9cfd9c22..4032c4667bc7 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -278,10 +278,18 @@ def AArch64scvtf_mt : SDNode<"AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU", SDT_AArch def AArch64fcvtzu_mt : SDNode<"AArch64ISD::FCVTZU_MERGE_PASSTHRU", SDT_AArch64FCVT>; def AArch64fcvtzs_mt : SDNode<"AArch64ISD::FCVTZS_MERGE_PASSTHRU", SDT_AArch64FCVT>; -def 
SDT_AArch64ReduceWithInit : SDTypeProfile<1, 3, [SDTCisVec<1>, SDTCisVec<3>]>; -def AArch64clasta_n : SDNode<"AArch64ISD::CLASTA_N", SDT_AArch64ReduceWithInit>; -def AArch64clastb_n : SDNode<"AArch64ISD::CLASTB_N", SDT_AArch64ReduceWithInit>; -def AArch64fadda_p : SDNode<"AArch64ISD::FADDA_PRED", SDT_AArch64ReduceWithInit>; +def SDT_AArch64ReduceWithInit : SDTypeProfile<1, 3, + [SDTCisVec<1>, SDTCVecEltisVT<1,i1>, SDTCisVec<3>, SDTCisSameNumEltsAs<1,3>]>; +def AArch64clasta_n : SDNode<"AArch64ISD::CLASTA_N", SDT_AArch64ReduceWithInit>; +def AArch64clastb_n : SDNode<"AArch64ISD::CLASTB_N", SDT_AArch64ReduceWithInit>; +def AArch64fadda_p_node : SDNode<"AArch64ISD::FADDA_PRED", SDT_AArch64ReduceWithInit>; + +def AArch64fadda_p : PatFrags<(ops node:$op1, node:$op2, node:$op3), + [(AArch64fadda_p_node node:$op1, node:$op2, node:$op3), + (AArch64fadda_p_node (SVEAllActive), node:$op2, + (vselect node:$op1, node:$op3, (splat_vector (f32 fpimm_minus0)))), + (AArch64fadda_p_node (SVEAllActive), node:$op2, + (vselect node:$op1, node:$op3, (splat_vector (f64 fpimm_minus0))))]>; def SDT_AArch64PTest : SDTypeProfile<0, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; def AArch64ptest : SDNode<"AArch64ISD::PTEST", SDT_AArch64PTest>; @@ -447,6 +455,16 @@ let Predicates = [HasSVEorSME] in { defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs", AArch64fabs_mt>; defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg", AArch64fneg_mt>; + // zext(cmpeq(x, splat(0))) -> cnot(x) + def : Pat<(nxv16i8 (zext (nxv16i1 (AArch64setcc_z (nxv16i1 (SVEAllActive):$Pg), nxv16i8:$Op2, (SVEDup0), SETEQ)))), + (CNOT_ZPmZ_B $Op2, $Pg, $Op2)>; + def : Pat<(nxv8i16 (zext (nxv8i1 (AArch64setcc_z (nxv8i1 (SVEAllActive):$Pg), nxv8i16:$Op2, (SVEDup0), SETEQ)))), + (CNOT_ZPmZ_H $Op2, $Pg, $Op2)>; + def : Pat<(nxv4i32 (zext (nxv4i1 (AArch64setcc_z (nxv4i1 (SVEAllActive):$Pg), nxv4i32:$Op2, (SVEDup0), SETEQ)))), + (CNOT_ZPmZ_S $Op2, $Pg, $Op2)>; + def : Pat<(nxv2i64 (zext (nxv2i1 (AArch64setcc_z (nxv2i1 (SVEAllActive):$Pg), nxv2i64:$Op2, (SVEDup0), SETEQ)))), + (CNOT_ZPmZ_D $Op2, $Pg, $Op2)>; + defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax", "SMAX_ZPZZ", int_aarch64_sve_smax, DestructiveBinaryComm>; defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax", "UMAX_ZPZZ", int_aarch64_sve_umax, DestructiveBinaryComm>; defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin", "SMIN_ZPZZ", int_aarch64_sve_smin, DestructiveBinaryComm>; @@ -857,6 +875,16 @@ let Predicates = [HasSVEorSME] in { defm LD1RQ_W : sve_mem_ldqr_ss<0b10, "ld1rqw", Z_s, ZPR32, GPR64NoXZRshifted32>; defm LD1RQ_D : sve_mem_ldqr_ss<0b11, "ld1rqd", Z_d, ZPR64, GPR64NoXZRshifted64>; + let AddedComplexity = 1 in { + class LD1RQPat<ValueType vt1, ValueType vt2, SDPatternOperator op, Instruction load_instr, Instruction ptrue> : + Pat<(vt1 (op (vt1 (vector_insert_subvec (vt1 undef), (vt2 (load GPR64sp:$Xn)), (i64 0))), (i64 0))), + (load_instr (ptrue 31), GPR64sp:$Xn, 0)>; + } + def : LD1RQPat<nxv16i8, v16i8, AArch64duplane128, LD1RQ_B_IMM, PTRUE_B>; + def : LD1RQPat<nxv8i16, v8i16, AArch64duplane128, LD1RQ_H_IMM, PTRUE_H>; + def : LD1RQPat<nxv4i32, v4i32, AArch64duplane128, LD1RQ_W_IMM, PTRUE_S>; + def : LD1RQPat<nxv2i64, v2i64, AArch64duplane128, LD1RQ_D_IMM, PTRUE_D>; + // continuous load with reg+reg addressing. 
defm LD1B : sve_mem_cld_ss<0b0000, "ld1b", Z_b, ZPR8, GPR64NoXZRshifted8>; defm LD1B_H : sve_mem_cld_ss<0b0001, "ld1b", Z_h, ZPR16, GPR64NoXZRshifted8>; diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 3f9795f5198b..47e4c6589c26 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -128,7 +128,7 @@ static cl::opt<bool> static cl::opt<bool> EnableGEPOpt("aarch64-enable-gep-opt", cl::Hidden, cl::desc("Enable optimizations on complex GEPs"), - cl::init(false)); + cl::init(true)); static cl::opt<bool> BranchRelaxation("aarch64-enable-branch-relax", cl::Hidden, cl::init(true), @@ -563,17 +563,6 @@ void AArch64PassConfig::addIRPasses() { addPass(createFalkorMarkStridedAccessesPass()); } - TargetPassConfig::addIRPasses(); - - addPass(createAArch64StackTaggingPass( - /*IsOptNone=*/TM->getOptLevel() == CodeGenOpt::None)); - - // Match interleaved memory accesses to ldN/stN intrinsics. - if (TM->getOptLevel() != CodeGenOpt::None) { - addPass(createInterleavedLoadCombinePass()); - addPass(createInterleavedAccessPass()); - } - if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) { // Call SeparateConstOffsetFromGEP pass to extract constants within indices // and lower a GEP with multiple indices to either arithmetic operations or @@ -587,6 +576,17 @@ void AArch64PassConfig::addIRPasses() { addPass(createLICMPass()); } + TargetPassConfig::addIRPasses(); + + addPass(createAArch64StackTaggingPass( + /*IsOptNone=*/TM->getOptLevel() == CodeGenOpt::None)); + + // Match interleaved memory accesses to ldN/stN intrinsics. + if (TM->getOptLevel() != CodeGenOpt::None) { + addPass(createInterleavedLoadCombinePass()); + addPass(createInterleavedAccessPass()); + } + // Add Control Flow Guard checks. if (TM->getTargetTriple().isOSWindows()) addPass(createCFGuardCheckPass()); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 274a025e82a0..66617393c9ae 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" +#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include <algorithm> using namespace llvm; using namespace llvm::PatternMatch; @@ -37,6 +38,74 @@ static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10), static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden); +class TailFoldingKind { +private: + uint8_t Bits = 0; // Currently defaults to disabled. + +public: + enum TailFoldingOpts { + TFDisabled = 0x0, + TFReductions = 0x01, + TFRecurrences = 0x02, + TFSimple = 0x80, + TFAll = TFReductions | TFRecurrences | TFSimple + }; + + void operator=(const std::string &Val) { + if (Val.empty()) + return; + SmallVector<StringRef, 6> TailFoldTypes; + StringRef(Val).split(TailFoldTypes, '+', -1, false); + for (auto TailFoldType : TailFoldTypes) { + if (TailFoldType == "disabled") + Bits = 0; + else if (TailFoldType == "all") + Bits = TFAll; + else if (TailFoldType == "default") + Bits = 0; // Currently defaults to never tail-folding. 
+ else if (TailFoldType == "simple") + add(TFSimple); + else if (TailFoldType == "reductions") + add(TFReductions); + else if (TailFoldType == "recurrences") + add(TFRecurrences); + else if (TailFoldType == "noreductions") + remove(TFReductions); + else if (TailFoldType == "norecurrences") + remove(TFRecurrences); + else { + errs() + << "invalid argument " << TailFoldType.str() + << " to -sve-tail-folding=; each element must be one of: disabled, " + "all, default, simple, reductions, noreductions, recurrences, " + "norecurrences\n"; + } + } + } + + operator uint8_t() const { return Bits; } + + void add(uint8_t Flag) { Bits |= Flag; } + void remove(uint8_t Flag) { Bits &= ~Flag; } +}; + +TailFoldingKind TailFoldingKindLoc; + +cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding( + "sve-tail-folding", + cl::desc( + "Control the use of vectorisation using tail-folding for SVE:" + "\ndisabled No loop types will vectorize using tail-folding" + "\ndefault Uses the default tail-folding settings for the target " + "CPU" + "\nall All legal loop types will vectorize using tail-folding" + "\nsimple Use tail-folding for simple loops (not reductions or " + "recurrences)" + "\nreductions Use tail-folding for loops containing reductions" + "\nrecurrences Use tail-folding for loops containing first order " + "recurrences"), + cl::location(TailFoldingKindLoc)); + bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { const TargetMachine &TM = getTLI()->getTargetMachine(); @@ -2955,3 +3024,20 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp); } + +bool AArch64TTIImpl::preferPredicateOverEpilogue( + Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, + TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL) { + if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled) + return false; + + TailFoldingKind Required; // Defaults to 0. 
+ if (LVL->getReductionVars().size()) + Required.add(TailFoldingKind::TFReductions); + if (LVL->getFirstOrderRecurrences().size()) + Required.add(TailFoldingKind::TFRecurrences); + if (!Required) + Required.add(TailFoldingKind::TFSimple); + + return (TailFoldingKindLoc & Required) == Required; +} diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 59ec91843266..2231f8705998 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -340,6 +340,11 @@ public: return PredicationStyle::None; } + bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, + AssumptionCache &AC, TargetLibraryInfo *TLI, + DominatorTree *DT, + LoopVectorizationLegality *LVL); + bool supportsScalableVectors() const { return ST->hasSVE(); } bool enableScalableVectorization() const { return ST->hasSVE(); } @@ -347,6 +352,11 @@ public: bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const; + bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const { + return ST->hasSVE(); + } + InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, Optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index f129bfe11e4d..3fe3b2a69855 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -231,7 +231,70 @@ void AArch64_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) { {codeview::RegisterId::ARM64_Q29, AArch64::Q29}, {codeview::RegisterId::ARM64_Q30, AArch64::Q30}, {codeview::RegisterId::ARM64_Q31, AArch64::Q31}, - + {codeview::RegisterId::ARM64_B0, AArch64::B0}, + {codeview::RegisterId::ARM64_B1, AArch64::B1}, + {codeview::RegisterId::ARM64_B2, AArch64::B2}, + {codeview::RegisterId::ARM64_B3, AArch64::B3}, + {codeview::RegisterId::ARM64_B4, AArch64::B4}, + {codeview::RegisterId::ARM64_B5, AArch64::B5}, + {codeview::RegisterId::ARM64_B6, AArch64::B6}, + {codeview::RegisterId::ARM64_B7, AArch64::B7}, + {codeview::RegisterId::ARM64_B8, AArch64::B8}, + {codeview::RegisterId::ARM64_B9, AArch64::B9}, + {codeview::RegisterId::ARM64_B10, AArch64::B10}, + {codeview::RegisterId::ARM64_B11, AArch64::B11}, + {codeview::RegisterId::ARM64_B12, AArch64::B12}, + {codeview::RegisterId::ARM64_B13, AArch64::B13}, + {codeview::RegisterId::ARM64_B14, AArch64::B14}, + {codeview::RegisterId::ARM64_B15, AArch64::B15}, + {codeview::RegisterId::ARM64_B16, AArch64::B16}, + {codeview::RegisterId::ARM64_B17, AArch64::B17}, + {codeview::RegisterId::ARM64_B18, AArch64::B18}, + {codeview::RegisterId::ARM64_B19, AArch64::B19}, + {codeview::RegisterId::ARM64_B20, AArch64::B20}, + {codeview::RegisterId::ARM64_B21, AArch64::B21}, + {codeview::RegisterId::ARM64_B22, AArch64::B22}, + {codeview::RegisterId::ARM64_B23, AArch64::B23}, + {codeview::RegisterId::ARM64_B24, AArch64::B24}, + {codeview::RegisterId::ARM64_B25, AArch64::B25}, + {codeview::RegisterId::ARM64_B26, AArch64::B26}, + {codeview::RegisterId::ARM64_B27, AArch64::B27}, + {codeview::RegisterId::ARM64_B28, AArch64::B28}, + {codeview::RegisterId::ARM64_B29, AArch64::B29}, + {codeview::RegisterId::ARM64_B30, AArch64::B30}, + {codeview::RegisterId::ARM64_B31, AArch64::B31}, + {codeview::RegisterId::ARM64_H0, AArch64::H0}, + 
{codeview::RegisterId::ARM64_H1, AArch64::H1}, + {codeview::RegisterId::ARM64_H2, AArch64::H2}, + {codeview::RegisterId::ARM64_H3, AArch64::H3}, + {codeview::RegisterId::ARM64_H4, AArch64::H4}, + {codeview::RegisterId::ARM64_H5, AArch64::H5}, + {codeview::RegisterId::ARM64_H6, AArch64::H6}, + {codeview::RegisterId::ARM64_H7, AArch64::H7}, + {codeview::RegisterId::ARM64_H8, AArch64::H8}, + {codeview::RegisterId::ARM64_H9, AArch64::H9}, + {codeview::RegisterId::ARM64_H10, AArch64::H10}, + {codeview::RegisterId::ARM64_H11, AArch64::H11}, + {codeview::RegisterId::ARM64_H12, AArch64::H12}, + {codeview::RegisterId::ARM64_H13, AArch64::H13}, + {codeview::RegisterId::ARM64_H14, AArch64::H14}, + {codeview::RegisterId::ARM64_H15, AArch64::H15}, + {codeview::RegisterId::ARM64_H16, AArch64::H16}, + {codeview::RegisterId::ARM64_H17, AArch64::H17}, + {codeview::RegisterId::ARM64_H18, AArch64::H18}, + {codeview::RegisterId::ARM64_H19, AArch64::H19}, + {codeview::RegisterId::ARM64_H20, AArch64::H20}, + {codeview::RegisterId::ARM64_H21, AArch64::H21}, + {codeview::RegisterId::ARM64_H22, AArch64::H22}, + {codeview::RegisterId::ARM64_H23, AArch64::H23}, + {codeview::RegisterId::ARM64_H24, AArch64::H24}, + {codeview::RegisterId::ARM64_H25, AArch64::H25}, + {codeview::RegisterId::ARM64_H26, AArch64::H26}, + {codeview::RegisterId::ARM64_H27, AArch64::H27}, + {codeview::RegisterId::ARM64_H28, AArch64::H28}, + {codeview::RegisterId::ARM64_H29, AArch64::H29}, + {codeview::RegisterId::ARM64_H30, AArch64::H30}, + {codeview::RegisterId::ARM64_H31, AArch64::H31}, }; for (const auto &I : RegMap) MRI->mapLLVMRegToCVReg(I.Reg, static_cast<int>(I.CVReg)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 48b5814cd482..2d6f1438e315 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -585,6 +585,12 @@ def FeatureMAIInsts : SubtargetFeature<"mai-insts", "Has mAI instructions" >; +def FeatureFP8Insts : SubtargetFeature<"fp8-insts", + "HasFP8Insts", + "true", + "Has fp8 and bf8 instructions" +>; + def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst", "HasPkFmacF16Inst", "true", @@ -1124,6 +1130,7 @@ def FeatureISAVersion9_4_0 : FeatureSet< Feature64BitDPP, FeaturePackedFP32Ops, FeatureMAIInsts, + FeatureFP8Insts, FeaturePkFmacF16Inst, FeatureAtomicFaddRtnInsts, FeatureAtomicFaddNoRtnInsts, @@ -1265,11 +1272,14 @@ def FeatureISAVersion11_Common : FeatureSet< FeaturePackedTID, FeatureVcmpxPermlaneHazard]>; -// Features for GFX 11.0.0 and 11.0.1 -def FeatureISAVersion11_0 : FeatureSet< +def FeatureISAVersion11_0_0 : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, [FeatureUserSGPRInit16Bug])>; +def FeatureISAVersion11_0_1 : FeatureSet< + !listconcat(FeatureISAVersion11_Common.Features, + [])>; + def FeatureISAVersion11_0_2 : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, [FeatureUserSGPRInit16Bug])>; @@ -1704,6 +1714,9 @@ def HasSMemTimeInst : Predicate<"Subtarget->hasSMemTimeInst()">, def HasShaderCyclesRegister : Predicate<"Subtarget->hasShaderCyclesRegister()">, AssemblerPredicate<(all_of FeatureShaderCyclesRegister)>; +def HasFP8Insts : Predicate<"Subtarget->hasFP8Insts()">, + AssemblerPredicate<(all_of FeatureFP8Insts)>; + def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">, AssemblerPredicate<(all_of FeaturePkFmacF16Inst)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp index d28f38e42430..d361e33995cf 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -74,6 +74,7 @@ void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const { << " WorkGroupIDY: " << FI.second.WorkGroupIDY << " WorkGroupIDZ: " << FI.second.WorkGroupIDZ << " WorkGroupInfo: " << FI.second.WorkGroupInfo + << " LDSKernelId: " << FI.second.LDSKernelId << " PrivateSegmentWaveByteOffset: " << FI.second.PrivateSegmentWaveByteOffset << " ImplicitBufferPtr: " << FI.second.ImplicitBufferPtr @@ -107,6 +108,9 @@ AMDGPUFunctionArgInfo::getPreloadedValue( case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: return std::make_tuple(WorkGroupIDZ ? &WorkGroupIDZ : nullptr, &AMDGPU::SGPR_32RegClass, LLT::scalar(32)); + case AMDGPUFunctionArgInfo::LDS_KERNEL_ID: + return std::make_tuple(LDSKernelId ? &LDSKernelId : nullptr, + &AMDGPU::SGPR_32RegClass, LLT::scalar(32)); case AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: return std::make_tuple( PrivateSegmentWaveByteOffset ? &PrivateSegmentWaveByteOffset : nullptr, @@ -162,6 +166,7 @@ constexpr AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() { AI.WorkGroupIDX = ArgDescriptor::createRegister(AMDGPU::SGPR12); AI.WorkGroupIDY = ArgDescriptor::createRegister(AMDGPU::SGPR13); AI.WorkGroupIDZ = ArgDescriptor::createRegister(AMDGPU::SGPR14); + AI.LDSKernelId = ArgDescriptor::createRegister(AMDGPU::SGPR15); const unsigned Mask = 0x3ff; AI.WorkItemIDX = ArgDescriptor::createRegister(AMDGPU::VGPR31, Mask); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index e9ed45d8cd14..f595e469f998 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -103,6 +103,7 @@ struct AMDGPUFunctionArgInfo { KERNARG_SEGMENT_PTR = 3, DISPATCH_ID = 4, FLAT_SCRATCH_INIT = 5, + LDS_KERNEL_ID = 6, // LLVM internal, not part of the ABI WORKGROUP_ID_X = 10, WORKGROUP_ID_Y = 11, WORKGROUP_ID_Z = 12, @@ -128,6 +129,7 @@ struct AMDGPUFunctionArgInfo { ArgDescriptor DispatchID; ArgDescriptor FlatScratchInit; ArgDescriptor PrivateSegmentSize; + ArgDescriptor LDSKernelId; // System SGPRs in kernels. 
ArgDescriptor WorkGroupIDX; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 57a4660bc1eb..13a65f1ad601 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -27,8 +27,10 @@ #include "SIMachineFunctionInfo.h" #include "TargetInfo/AMDGPUTargetInfo.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" @@ -415,6 +417,10 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; } + if (CurrentProgramInfo.DynamicCallStack) { + KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK; + } + return KernelCodeProperties; } @@ -506,6 +512,9 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { emitFunctionBody(); + emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(), + STM.hasMAIInsts()); + if (isVerbose()) { MCSectionELF *CommentSection = Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); @@ -875,6 +884,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, LDSAlignShift = 9; } + ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs(); + ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs(); + ProgInfo.LDSSize = MFI->getLDSSize(); ProgInfo.LDSBlocks = alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift; @@ -1180,3 +1192,58 @@ void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<AMDGPUResourceUsageAnalysis>(); AsmPrinter::getAnalysisUsage(AU); } + +void AMDGPUAsmPrinter::emitResourceUsageRemarks( + const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo, + bool isModuleEntryFunction, bool hasMAIInsts) { + if (!ORE) + return; + + const char *Name = "kernel-resource-usage"; + const char *Indent = " "; + + // If the remark is not specifically enabled, do not output to yaml + LLVMContext &Ctx = MF.getFunction().getContext(); + if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name)) + return; + + auto EmitResourceUsageRemark = [&](StringRef RemarkName, + StringRef RemarkLabel, auto Argument) { + // Add an indent for every line besides the line with the kernel name. This + // makes it easier to tell which resource usage go with which kernel since + // the kernel name will always be displayed first. + std::string LabelStr = RemarkLabel.str() + ": "; + if (!RemarkName.equals("FunctionName")) + LabelStr = Indent + LabelStr; + + ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis(Name, RemarkName, + MF.getFunction().getSubprogram(), + &MF.front()) + << LabelStr << ore::NV(RemarkName, Argument); + }); + }; + + // FIXME: Formatting here is pretty nasty because clang does not accept + // newlines from diagnostics. This forces us to emit multiple diagnostic + // remarks to simulate newlines. If and when clang does accept newlines, this + // formatting should be aggregated into one remark with newlines to avoid + // printing multiple diagnostic location and diag opts. 
+ EmitResourceUsageRemark("FunctionName", "Function Name", + MF.getFunction().getName()); + EmitResourceUsageRemark("NumSGPR", "SGPRs", CurrentProgramInfo.NumSGPR); + EmitResourceUsageRemark("NumVGPR", "VGPRs", CurrentProgramInfo.NumArchVGPR); + if (hasMAIInsts) + EmitResourceUsageRemark("NumAGPR", "AGPRs", CurrentProgramInfo.NumAccVGPR); + EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]", + CurrentProgramInfo.ScratchSize); + EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]", + CurrentProgramInfo.Occupancy); + EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill", + CurrentProgramInfo.SGPRSpill); + EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill", + CurrentProgramInfo.VGPRSpill); + if (isModuleEntryFunction) + EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]", + CurrentProgramInfo.LDSSize); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index ddda2cf107b1..2881b8d7bcca 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -69,6 +69,9 @@ private: uint64_t ScratchSize, uint64_t CodeSize, const AMDGPUMachineFunction* MFI); + void emitResourceUsageRemarks(const MachineFunction &MF, + const SIProgramInfo &CurrentProgramInfo, + bool isModuleEntryFunction, bool hasMAIInsts); uint16_t getAmdhsaKernelCodeProperties( const MachineFunction &MF) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def b/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def index 0a2cf3874245..c7a060c5db5b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def @@ -27,5 +27,6 @@ AMDGPU_ATTRIBUTE(WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z") AMDGPU_ATTRIBUTE(WORKITEM_ID_X, "amdgpu-no-workitem-id-x") AMDGPU_ATTRIBUTE(WORKITEM_ID_Y, "amdgpu-no-workitem-id-y") AMDGPU_ATTRIBUTE(WORKITEM_ID_Z, "amdgpu-no-workitem-id-z") +AMDGPU_ATTRIBUTE(LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id") #undef AMDGPU_ATTRIBUTE diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 8de0d7e6bff1..a3634d2440c3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -72,6 +72,8 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit, case Intrinsic::amdgcn_workgroup_id_z: case Intrinsic::r600_read_tgid_z: return WORKGROUP_ID_Z; + case Intrinsic::amdgcn_lds_kernel_id: + return LDS_KERNEL_ID; case Intrinsic::amdgcn_dispatch_ptr: return DISPATCH_PTR; case Intrinsic::amdgcn_dispatch_id: @@ -457,6 +459,10 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { removeAssumedBits(QUEUE_PTR); } + if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) { + removeAssumedBits(LDS_KERNEL_ID); + } + return getAssumed() != OrigAssumed ? 
ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; } @@ -591,6 +597,16 @@ private: return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this, UsedAssumedInformation); } + + bool funcRetrievesLDSKernelId(Attributor &A) { + auto DoesNotRetrieve = [&](Instruction &I) { + auto &Call = cast<CallBase>(I); + return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id; + }; + bool UsedAssumedInformation = false; + return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this, + UsedAssumedInformation); + } }; AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP, @@ -743,7 +759,8 @@ public: AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM); DenseSet<const char *> Allowed( {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID, - &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID, &AAPointerInfo::ID}); + &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID, + &AAPointerInfo::ID}); AttributorConfig AC(CGUpdater); AC.Allowed = &Allowed; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index fd812eb676ef..4550cfdcf883 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -764,7 +764,8 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, AMDGPUFunctionArgInfo::DISPATCH_ID, AMDGPUFunctionArgInfo::WORKGROUP_ID_X, AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, - AMDGPUFunctionArgInfo::WORKGROUP_ID_Z + AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, + AMDGPUFunctionArgInfo::LDS_KERNEL_ID, }; static constexpr StringLiteral ImplicitAttrNames[] = { @@ -774,7 +775,8 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, "amdgpu-no-dispatch-id", "amdgpu-no-workgroup-id-x", "amdgpu-no-workgroup-id-y", - "amdgpu-no-workgroup-id-z" + "amdgpu-no-workgroup-id-z", + "amdgpu-no-lds-kernel-id", }; MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -810,6 +812,14 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy); } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) { LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder); + } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) { + Optional<uint32_t> Id = + AMDGPUMachineFunction::getLDSKernelIdMetadata(MF.getFunction()); + if (Id.has_value()) { + MIRBuilder.buildConstant(InputReg, Id.value()); + } else { + MIRBuilder.buildUndef(InputReg); + } } else { // We may have proven the input wasn't needed, although the ABI is // requiring it. We just need to allocate the register appropriately. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 5747fc0ca8e6..229dfb62ef6e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -88,6 +88,10 @@ def gi_smrd_sgpr : GIComplexOperandMatcher<s64, "selectSmrdSgpr">, GIComplexPatternEquiv<SMRDSgpr>; +def gi_smrd_sgpr_imm : + GIComplexOperandMatcher<s64, "selectSmrdSgprImm">, + GIComplexPatternEquiv<SMRDSgprImm>; + def gi_flat_offset : GIComplexOperandMatcher<s64, "selectFlatOffset">, GIComplexPatternEquiv<FlatOffset>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index 6fa44ffcbfaa..632a76b32009 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -875,6 +875,8 @@ MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF, Kern.getDocument()->getNode(ProgramInfo.LDSSize); Kern[".private_segment_fixed_size"] = Kern.getDocument()->getNode(ProgramInfo.ScratchSize); + Kern[".uses_dynamic_stack"] = + Kern.getDocument()->getNode(ProgramInfo.DynamicCallStack); // FIXME: The metadata treats the minimum as 16? Kern[".kernarg_segment_align"] = diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 589992c7a7ec..147c8850587e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -33,7 +33,7 @@ #include "llvm/IR/Dominators.h" #endif -#define DEBUG_TYPE "isel" +#define DEBUG_TYPE "amdgpu-isel" using namespace llvm; @@ -1886,21 +1886,21 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, // Match an immediate (if Imm is true) or an SGPR (if Imm is false) // offset. If Imm32Only is true, match only 32-bit immediate offsets // available on CI. 
-bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, - SDValue &Offset, bool Imm, +bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue Addr, SDValue ByteOffsetNode, + SDValue *SOffset, SDValue *Offset, bool Imm32Only) const { ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode); if (!C) { - if (Imm) + if (!SOffset) return false; if (ByteOffsetNode.getValueType().isScalarInteger() && ByteOffsetNode.getValueType().getSizeInBits() == 32) { - Offset = ByteOffsetNode; + *SOffset = ByteOffsetNode; return true; } if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) { if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) { - Offset = ByteOffsetNode.getOperand(0); + *SOffset = ByteOffsetNode.getOperand(0); return true; } } @@ -1912,8 +1912,8 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, int64_t ByteOffset = C->getSExtValue(); Optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, false); - if (EncodedOffset && Imm && !Imm32Only) { - Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32); + if (EncodedOffset && Offset && !Imm32Only) { + *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32); return true; } @@ -1922,17 +1922,17 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, return false; EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset); - if (EncodedOffset && Imm32Only) { - Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32); + if (EncodedOffset && Offset && Imm32Only) { + *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32); return true; } if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset)) return false; - if (!Imm) { + if (SOffset) { SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32); - Offset = SDValue( + *SOffset = SDValue( CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0); return true; } @@ -1968,11 +1968,18 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const { // Match a base and an immediate (if Imm is true) or an SGPR // (if Imm is false) offset. If Imm32Only is true, match only 32-bit // immediate offsets available on CI. -bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, - SDValue &Offset, bool Imm, - bool Imm32Only) const { +bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, + SDValue *SOffset, SDValue *Offset, + bool Imm32Only) const { SDLoc SL(Addr); + if (SOffset && Offset) { + assert(!Imm32Only); + SDValue B; + return SelectSMRDBaseOffset(Addr, B, nullptr, Offset) && + SelectSMRDBaseOffset(B, SBase, SOffset, nullptr); + } + // A 32-bit (address + offset) should not cause unsigned 32-bit integer // wraparound, because s_load instructions perform the addition in 64 bits. 
if ((Addr.getValueType() != MVT::i32 || @@ -1987,34 +1994,55 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, assert(N0 && N1 && isa<ConstantSDNode>(N1)); } if (N0 && N1) { - if (SelectSMRDOffset(N1, Offset, Imm, Imm32Only)) { - SBase = Expand32BitAddress(N0); + if (SelectSMRDOffset(N0, N1, SOffset, Offset, Imm32Only)) { + SBase = N0; + return true; + } + if (SelectSMRDOffset(N1, N0, SOffset, Offset, Imm32Only)) { + SBase = N1; return true; } } return false; } - if (!Imm) + if (Offset && !SOffset) { + SBase = Addr; + *Offset = CurDAG->getTargetConstant(0, SL, MVT::i32); + return true; + } + return false; +} + +bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, + SDValue *SOffset, SDValue *Offset, + bool Imm32Only) const { + if (!SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) return false; - SBase = Expand32BitAddress(Addr); - Offset = CurDAG->getTargetConstant(0, SL, MVT::i32); + SBase = Expand32BitAddress(SBase); return true; } bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const { - return SelectSMRD(Addr, SBase, Offset, /* Imm */ true); + return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset); } bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const { assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); - return SelectSMRD(Addr, SBase, Offset, /* Imm */ true, /* Imm32Only */ true); + return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset, + /* Imm32Only */ true); } bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase, - SDValue &Offset) const { - return SelectSMRD(Addr, SBase, Offset, /* Imm */ false); + SDValue &SOffset) const { + return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr); +} + +bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase, + SDValue &SOffset, + SDValue &Offset) const { + return SelectSMRD(Addr, SBase, &SOffset, &Offset); } bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 7894b8eb5b67..fda2bfac71fc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -193,14 +193,18 @@ private: bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &SAddr, SDValue &Offset) const; - bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool Imm, - bool Imm32Only) const; + bool SelectSMRDOffset(SDValue Base, SDValue ByteOffsetNode, SDValue *SOffset, + SDValue *Offset, bool Imm32Only = false) const; SDValue Expand32BitAddress(SDValue Addr) const; - bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, bool Imm, - bool Imm32Only = false) const; + bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset, + SDValue *Offset, bool Imm32Only = false) const; + bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset, + SDValue *Offset, bool Imm32Only = false) const; bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const; bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const; - bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const; + bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &SOffset) const; + bool SelectSMRDSgprImm(SDValue Addr, SDValue &SBase, SDValue &SOffset, + SDValue &Offset) const; bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const; bool SelectSMRDBufferImm32(SDValue Addr, 
SDValue &Offset) const; bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 70fae9d784a2..f2e5c2fe00e8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1006,6 +1006,14 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: + case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8: + case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8: + case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8: + case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8: + case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8: + case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8: + case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8: + case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: return selectSMFMACIntrin(I); default: return selectImpl(I, *CoverageInfo); @@ -2361,7 +2369,7 @@ void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD) return; - GEPInfo GEPInfo(*PtrMI); + GEPInfo GEPInfo; for (unsigned i = 1; i != 3; ++i) { const MachineOperand &GEPOp = PtrMI->getOperand(i); @@ -3237,6 +3245,8 @@ static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) { if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) return Register(); + assert(Def->getNumOperands() == 3 && + MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64)); if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) { return Def->getOperand(1).getReg(); } @@ -3354,6 +3364,30 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const { case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64; break; + case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8: + Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64; + break; + case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8: + Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64; + break; + case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8: + Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64; + break; + case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8: + Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64; + break; + case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8: + Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64; + break; + case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8: + Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64; + break; + case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8: + Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64; + break; + case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: + Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64; + break; default: llvm_unreachable("unhandled smfmac intrinsic"); } @@ -3800,25 +3834,82 @@ AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const { }}; } -InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { +bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, + Register &Base, + Register *SOffset, + int64_t *Offset) const { + MachineInstr *MI = Root.getParent(); + MachineBasicBlock *MBB = MI->getParent(); + + // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits, + // then we can select all ptr + 32-bit offsets. 
SmallVector<GEPInfo, 4> AddrInfo; - getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); + getAddrModeInfo(*MI, *MRI, AddrInfo); - if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) - return None; + if (AddrInfo.empty()) + return false; - const GEPInfo &GEPInfo = AddrInfo[0]; + const GEPInfo &GEPI = AddrInfo[0]; Optional<int64_t> EncodedImm = - AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false); - if (!EncodedImm) + AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, false); + + if (SOffset && Offset) { + if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm && + AddrInfo.size() > 1) { + const GEPInfo &GEPI2 = AddrInfo[1]; + if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) { + if (Register OffsetReg = + matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) { + Base = GEPI2.SgprParts[0]; + *SOffset = OffsetReg; + *Offset = *EncodedImm; + return true; + } + } + } + return false; + } + + if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) { + Base = GEPI.SgprParts[0]; + *Offset = *EncodedImm; + return true; + } + + // SGPR offset is unsigned. + if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) && + GEPI.Imm != 0) { + // If we make it this far we have a load with an 32-bit immediate offset. + // It is OK to select this using a sgpr offset, because we have already + // failed trying to select this load into one of the _IMM variants since + // the _IMM Patterns are considered before the _SGPR patterns. + Base = GEPI.SgprParts[0]; + *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset) + .addImm(GEPI.Imm); + return true; + } + + if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) { + if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) { + Base = GEPI.SgprParts[0]; + *SOffset = OffsetReg; + return true; + } + } + + return false; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { + Register Base; + int64_t Offset; + if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset)) return None; - unsigned PtrReg = GEPInfo.SgprParts[0]; - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } - }}; + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}}; } InstructionSelector::ComplexRendererFns @@ -3844,43 +3935,24 @@ AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { - MachineInstr *MI = Root.getParent(); - MachineBasicBlock *MBB = MI->getParent(); - - SmallVector<GEPInfo, 4> AddrInfo; - getAddrModeInfo(*MI, *MRI, AddrInfo); - - // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits, - // then we can select all ptr + 32-bit offsets. - if (AddrInfo.empty()) + Register Base, SOffset; + if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr)) return None; - const GEPInfo &GEPInfo = AddrInfo[0]; - Register PtrReg = GEPInfo.SgprParts[0]; - - // SGPR offset is unsigned. - if (AddrInfo[0].SgprParts.size() == 1 && isUInt<32>(GEPInfo.Imm) && - GEPInfo.Imm != 0) { - // If we make it this far we have a load with an 32-bit immediate offset. 
- // It is OK to select this using a sgpr offset, because we have already - // failed trying to select this load into one of the _IMM variants since - // the _IMM Patterns are considered before the _SGPR patterns. - Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) - .addImm(GEPInfo.Imm); - return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, - [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }}}; - } + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); }, + [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}}; +} - if (AddrInfo[0].SgprParts.size() == 2 && GEPInfo.Imm == 0) { - if (Register OffsetReg = - matchZeroExtendFromS32(*MRI, GEPInfo.SgprParts[1])) { - return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, - [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }}}; - } - } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const { + Register Base, SOffset; + int64_t Offset; + if (!selectSmrdOffset(Root, Base, &SOffset, &Offset)) + return None; - return None; + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); }, + [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}}; } std::pair<Register, int> diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 22672ba59e76..5baf55d23480 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -63,11 +63,9 @@ public: private: struct GEPInfo { - const MachineInstr &GEP; SmallVector<unsigned, 2> SgprParts; SmallVector<unsigned, 2> VgprParts; - int64_t Imm; - GEPInfo(const MachineInstr &GEP) : GEP(GEP), Imm(0) { } + int64_t Imm = 0; }; bool isSGPR(Register Reg) const; @@ -200,12 +198,16 @@ private: InstructionSelector::ComplexRendererFns selectVINTERPModsHi(MachineOperand &Root) const; + bool selectSmrdOffset(MachineOperand &Root, Register &Base, Register *SOffset, + int64_t *Offset) const; InstructionSelector::ComplexRendererFns selectSmrdImm(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectSmrdImm32(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectSmrdSgpr(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectSmrdSgprImm(MachineOperand &Root) const; std::pair<Register, int> selectFlatOffsetImpl(MachineOperand &Root, uint64_t FlatVariant) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 01a3e78ea48c..0979debe9777 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4197,6 +4197,35 @@ bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, return true; } +bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + Function &F = B.getMF().getFunction(); + Optional<uint32_t> KnownSize = + AMDGPUMachineFunction::getLDSKernelIdMetadata(F); + if (KnownSize.has_value()) + B.buildConstant(DstReg, KnownSize.value()); + return false; +} + +bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + + const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); + if (!MFI->isEntryFunction()) { + 
return legalizePreloadedArgIntrin(MI, MRI, B, + AMDGPUFunctionArgInfo::LDS_KERNEL_ID); + } + + Register DstReg = MI.getOperand(0).getReg(); + if (!getLDSKernelId(DstReg, MRI, B)) + return false; + + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, @@ -5636,6 +5665,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_workgroup_id_z: return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); + case Intrinsic::amdgcn_lds_kernel_id: + return legalizePreloadedArgIntrin(MI, MRI, B, + AMDGPUFunctionArgInfo::LDS_KERNEL_ID); case Intrinsic::amdgcn_dispatch_ptr: return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::DISPATCH_PTR); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index cee533aa34ec..5e8111e22aad 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -155,6 +155,13 @@ public: bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; + + bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + + bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index 78e092b2e872..7e49a6117ebd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -376,15 +376,7 @@ static bool HasNative(AMDGPULibFunc::EFuncId id) { return false; } -struct TableRef { - size_t size; - const TableEntry *table; // variable size: from 0 to (size - 1) - - TableRef() : size(0), table(nullptr) {} - - template <size_t N> - TableRef(const TableEntry (&tbl)[N]) : size(N), table(&tbl[0]) {} -}; +using TableRef = ArrayRef<TableEntry>; static TableRef getOptTable(AMDGPULibFunc::EFuncId id) { switch(id) { @@ -698,11 +690,10 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) { bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) { // Table-Driven optimization const TableRef tr = getOptTable(FInfo.getId()); - if (tr.size==0) + if (tr.empty()) return false; - int const sz = (int)tr.size; - const TableEntry * const ftbl = tr.table; + int const sz = (int)tr.size(); Value *opr0 = CI->getArgOperand(0); if (getVecSize(FInfo) > 1) { @@ -714,8 +705,8 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) { assert(eltval && "Non-FP arguments in math function!"); bool found = false; for (int i=0; i < sz; ++i) { - if (eltval->isExactlyValue(ftbl[i].input)) { - DVal.push_back(ftbl[i].result); + if (eltval->isExactlyValue(tr[i].input)) { + DVal.push_back(tr[i].result); found = true; break; } @@ -746,8 +737,8 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) { // Scalar version if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) { for (int i = 0; i < sz; ++i) { - if (CF->isExactlyValue(ftbl[i].input)) { - Value *nval = ConstantFP::get(CF->getType(), ftbl[i].result); + if (CF->isExactlyValue(tr[i].input)) { + Value *nval = ConstantFP::get(CF->getType(), tr[i].result); LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n"); replaceCall(nval); return true; diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 35922341de26..b4a8766d682e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -55,21 +55,6 @@ static cl::opt<bool> SuperAlignLDSGlobals( cl::init(true), cl::Hidden); namespace { - -SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) { - SmallPtrSet<GlobalValue *, 32> UsedList; - - SmallVector<GlobalValue *, 32> TmpVec; - collectUsedGlobalVariables(M, TmpVec, true); - UsedList.insert(TmpVec.begin(), TmpVec.end()); - - TmpVec.clear(); - collectUsedGlobalVariables(M, TmpVec, false); - UsedList.insert(TmpVec.begin(), TmpVec.end()); - - return UsedList; -} - class AMDGPULowerModuleLDS : public ModulePass { static void removeFromUsedList(Module &M, StringRef Name, @@ -153,9 +138,6 @@ class AMDGPULowerModuleLDS : public ModulePass { ""); } -private: - SmallPtrSet<GlobalValue *, 32> UsedList; - public: static char ID; @@ -165,9 +147,10 @@ public: bool runOnModule(Module &M) override { CallGraph CG = CallGraph(M); - UsedList = getUsedList(M); bool Changed = superAlignLDSGlobals(M); - Changed |= processUsedLDS(CG, M); + std::vector<GlobalVariable *> ModuleScopeVariables = + AMDGPU::findVariablesToLower(M, nullptr); + Changed |= processUsedLDS(CG, M, ModuleScopeVariables); for (Function &F : M.functions()) { if (F.isDeclaration()) @@ -176,10 +159,11 @@ public: // Only lower compute kernels' LDS. if (!AMDGPU::isKernel(F.getCallingConv())) continue; - Changed |= processUsedLDS(CG, M, &F); + std::vector<GlobalVariable *> KernelUsedVariables = + AMDGPU::findVariablesToLower(M, &F); + Changed |= processUsedLDS(CG, M, KernelUsedVariables, &F); } - UsedList.clear(); return Changed; } @@ -228,22 +212,20 @@ private: return Changed; } - bool processUsedLDS(CallGraph const &CG, Module &M, Function *F = nullptr) { + bool processUsedLDS(CallGraph const &CG, Module &M, + std::vector<GlobalVariable *> const &LDSVarsToTransform, + Function *F = nullptr) { LLVMContext &Ctx = M.getContext(); const DataLayout &DL = M.getDataLayout(); - // Find variables to move into new struct instance - std::vector<GlobalVariable *> FoundLocalVars = - AMDGPU::findVariablesToLower(M, F); - - if (FoundLocalVars.empty()) { + if (LDSVarsToTransform.empty()) { // No variables to rewrite, no changes made. 
return false; } SmallVector<OptimizedStructLayoutField, 8> LayoutFields; - LayoutFields.reserve(FoundLocalVars.size()); - for (GlobalVariable *GV : FoundLocalVars) { + LayoutFields.reserve(LDSVarsToTransform.size()); + for (GlobalVariable *GV : LDSVarsToTransform) { OptimizedStructLayoutField F(GV, DL.getTypeAllocSize(GV->getValueType()), AMDGPU::getAlign(DL, GV)); LayoutFields.emplace_back(F); @@ -252,7 +234,7 @@ private: performOptimizedStructLayout(LayoutFields); std::vector<GlobalVariable *> LocalVars; - LocalVars.reserve(FoundLocalVars.size()); // will be at least this large + LocalVars.reserve(LDSVarsToTransform.size()); // will be at least this large { // This usually won't need to insert any padding, perhaps avoid the alloc uint64_t CurrentOffset = 0; @@ -352,7 +334,6 @@ private: GV->replaceAllUsesWith(GEP); } if (GV->use_empty()) { - UsedList.erase(GV); GV->eraseFromParent(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index b461c3c4bfdc..f5e12fd960d0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -11,6 +11,7 @@ #include "AMDGPUPerfHintAnalysis.h" #include "AMDGPUSubtarget.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/IR/Constants.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; @@ -101,6 +102,21 @@ void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Function &F) { } } +Optional<uint32_t> +AMDGPUMachineFunction::getLDSKernelIdMetadata(const Function &F) { + auto MD = F.getMetadata("llvm.amdgcn.lds.kernel.id"); + if (MD && MD->getNumOperands() == 1) { + ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(0)); + if (KnownSize) { + uint64_t V = KnownSize->getZExtValue(); + if (V <= UINT32_MAX) { + return V; + } + } + } + return {}; +} + void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV) { assert(DL.getTypeAllocSize(GV.getValueType()).isZero()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index df62c2314617..97db8b7eb8d6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -11,11 +11,12 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Optional.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/Function.h" namespace llvm { @@ -104,6 +105,8 @@ public: unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV); void allocateModuleLDSGlobal(const Function &F); + static Optional<uint32_t> getLDSKernelIdMetadata(const Function &F); + Align getDynLDSAlign() const { return DynLDSAlign; } void setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp index 09dbd2150db6..a9f1e9bd0996 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp @@ -74,10 +74,10 @@ public: private: struct MemAccessInfo { - const Value *V; - const Value *Base; - int64_t Offset; - MemAccessInfo() : V(nullptr), Base(nullptr), Offset(0) {} + const Value *V = nullptr; + const Value *Base = nullptr; + int64_t Offset = 0; + MemAccessInfo() = default; bool isLargeStride(MemAccessInfo 
&Reference) const; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) Printable print() const { @@ -116,6 +116,7 @@ private: bool isGlobalAddr(const Value *V) const; bool isLocalAddr(const Value *V) const; + bool isGlobalLoadUsedInBB(const Instruction &) const; }; static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType( @@ -196,6 +197,24 @@ bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const { return false; } +// Returns true if the global load `I` is used in its own basic block. +bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const { + const auto *Ld = dyn_cast<LoadInst>(&I); + if (!Ld) + return false; + if (!isGlobalAddr(Ld->getPointerOperand())) + return false; + + for (const User *Usr : Ld->users()) { + if (const Instruction *UsrInst = dyn_cast<Instruction>(Usr)) { + if (UsrInst->getParent() == I.getParent()) + return true; + } + } + + return false; +} + AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) { AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F]; @@ -203,9 +222,14 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) { for (auto &B : F) { LastAccess = MemAccessInfo(); + unsigned UsedGlobalLoadsInBB = 0; for (auto &I : B) { if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) { unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32); + // TODO: Check if the global load and its user are close to each other + // instead (Or do this analysis in GCNSchedStrategy?). + if (isGlobalLoadUsedInBB(I)) + UsedGlobalLoadsInBB += Size; if (isIndirectAccess(&I)) FI.IAMInstCost += Size; if (isLargeStride(&I)) @@ -245,6 +269,16 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) { ++FI.InstCost; } } + + if (!FI.HasDenseGlobalMemAcc) { + unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size(); + if (GlobalMemAccPercentage > 50) { + LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Set to true since " + << B.getName() << " has " << GlobalMemAccPercentage + << "% global memory access\n"); + FI.HasDenseGlobalMemAcc = true; + } + } } return &FI; @@ -286,6 +320,11 @@ bool AMDGPUPerfHint::runOnFunction(Function &F) { } bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) { + // Reverting optimal scheduling in favour of occupancy with basic block(s) + // having dense global memory access can potentially hurt performance. 
+ if (FI.HasDenseGlobalMemAcc) + return true; + return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h index 31ff80f5f431..2db8db6957ce 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h @@ -41,7 +41,11 @@ public: unsigned InstCost; unsigned IAMInstCost; // Indirect access memory instruction count unsigned LSMInstCost; // Large stride memory instruction count - FuncInfo() : MemInstCost(0), InstCost(0), IAMInstCost(0), LSMInstCost(0) {} + bool HasDenseGlobalMemAcc; // Set if at least 1 basic block has relatively + // high global memory access + FuncInfo() + : MemInstCost(0), InstCost(0), IAMInstCost(0), LSMInstCost(0), + HasDenseGlobalMemAcc(false) {} }; typedef ValueMap<const Function*, FuncInfo> FuncInfoMap; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index 0df6f4d45b06..bd8e568213b7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -153,7 +153,10 @@ bool AMDGPURegBankCombinerHelper::matchIntMinMaxToMed3( if (!isVgprRegBank(Dst)) return false; - if (MRI.getType(Dst).isVector()) + // med3 for i16 is only available on gfx9+, and not available for v2i16. + LLT Ty = MRI.getType(Dst); + if ((Ty != LLT::scalar(16) || !Subtarget.hasMed3_16()) && + Ty != LLT::scalar(32)) return false; MinMaxMedOpc OpcodeTriple = getMinMaxPair(MI.getOpcode()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 0830cbd919a0..887341e67454 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4426,7 +4426,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_mfma_i32_16x16x32_i8: case Intrinsic::amdgcn_mfma_i32_32x32x16_i8: case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32: - case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32: { + case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32: + case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8: + case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8: + case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8: + case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8: + case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8: + case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8: + case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8: + case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: { // Default for MAI intrinsics. // srcC can also be an immediate which can be folded later. 
// FIXME: Should we eventually add an alternative mapping with AGPR src @@ -4451,7 +4459,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: - case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: { + case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: + case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8: + case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8: + case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8: + case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8: + case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8: + case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8: + case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8: + case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: { // vdst, srcA, srcB, srcC, idx OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp index 4d7a3f4028e8..aa51c5d20bdc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp @@ -141,7 +141,7 @@ class ReplaceLDSUseImpl { std::vector<GlobalVariable *> collectLDSRequiringPointerReplace() { // Collect LDS which requires module lowering. std::vector<GlobalVariable *> LDSGlobals = - llvm::AMDGPU::findVariablesToLower(M); + llvm::AMDGPU::findVariablesToLower(M, nullptr); // Remove LDS which don't qualify for replacement. llvm::erase_if(LDSGlobals, [&](GlobalVariable *GV) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 8297635d7bb2..5d7bade00a3e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -340,12 +340,28 @@ def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x32_i8>; def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x16_i8>; def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8_xf32>; def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4_xf32>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_bf8_bf8>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_bf8_fp8>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_fp8_bf8>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_fp8_fp8>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_bf8_bf8>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_bf8_fp8>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_fp8_bf8>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_fp8_fp8>; def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_f16>; def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_f16>; def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_bf16>; def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_bf16>; def : SourceOfDivergence<int_amdgcn_smfmac_i32_16x16x64_i8>; def : SourceOfDivergence<int_amdgcn_smfmac_i32_32x32x32_i8>; +def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_bf8_bf8>; +def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_bf8_fp8>; +def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_fp8_bf8>; +def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_fp8_fp8>; +def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_bf8_bf8>; +def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_bf8_fp8>; +def : 
SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_fp8_bf8>; +def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_fp8_fp8>; // The dummy boolean output is divergent from the IR's perspective, // but the mask results are uniform. These produce a divergent and diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 6bd906439ee8..cf4826d81b4b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -736,13 +736,18 @@ static unsigned getMaxNumPreloadedSGPRs() { 2 + // dispatch ID 2 + // flat scratch init 2; // Implicit buffer ptr + // Max number of system SGPRs unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX 1 + // WorkGroupIDY 1 + // WorkGroupIDZ 1 + // WorkGroupInfo 1; // private segment wave byte offset - return MaxUserSGPRs + MaxSystemSGPRs; + + // Max number of synthetic SGPRs + unsigned SyntheticSGPRs = 1; // LDSKernelId + + return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs; } unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const { @@ -852,34 +857,6 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation { return MI && TII->isVALU(*MI); } - bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const { - if (Pred->NodeNum < Succ->NodeNum) - return true; - - SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred}); - - for (unsigned I = 0; I < Succs.size(); ++I) { - for (const SDep &SI : Succs[I]->Succs) { - const SUnit *SU = SI.getSUnit(); - if (SU != Succs[I] && !llvm::is_contained(Succs, SU)) - Succs.push_back(SU); - } - } - - SmallPtrSet<const SUnit*, 32> Visited; - while (!Preds.empty()) { - const SUnit *SU = Preds.pop_back_val(); - if (llvm::is_contained(Succs, SU)) - return false; - Visited.insert(SU); - for (const SDep &SI : SU->Preds) - if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit())) - Preds.push_back(SI.getSUnit()); - } - - return true; - } - // Link as many SALU instructions in chain as possible. Return the size // of the chain. Links up to MaxChain instructions. 
unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain, @@ -895,18 +872,20 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation { LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From); dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n'); - if (SU->addPred(SDep(From, SDep::Artificial), false)) - ++Linked; + if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From)) + if (DAG->addEdge(SU, SDep(From, SDep::Artificial))) + ++Linked; for (SDep &SI : From->Succs) { SUnit *SUv = SI.getSUnit(); - if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU)) - SUv->addPred(SDep(SU, SDep::Artificial), false); + if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) && + DAG->canAddEdge(SUv, SU)) + DAG->addEdge(SUv, SDep(SU, SDep::Artificial)); } for (SDep &SI : SU->Succs) { SUnit *Succ = SI.getSUnit(); - if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ)) + if (Succ != SU && isSALU(Succ)) Worklist.push_back(Succ); } } @@ -949,7 +928,8 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation { if (Visited.count(&*LastSALU)) continue; - if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU)) + if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) || + !DAG->canAddEdge(&*LastSALU, &SU)) continue; Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 971e44723758..dca926867300 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1584,6 +1584,9 @@ bool GCNTargetMachine::parseMachineFunctionInfo( parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize, AMDGPU::SGPR_32RegClass, MFI->ArgInfo.PrivateSegmentSize, 0, 0) || + parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId, + AMDGPU::SGPR_32RegClass, + MFI->ArgInfo.LDSKernelId, 0, 1) || parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX, AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX, 0, 1) || diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index e12d0ffef35c..2a9393fc1595 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1739,6 +1739,8 @@ public: void cvtVOP3(MCInst &Inst, const OperandVector &Operands); void cvtVOP3P(MCInst &Inst, const OperandVector &Operands); void cvtVOPD(MCInst &Inst, const OperandVector &Operands); + void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands, + OptionalImmIndexMap &OptionalIdx); void cvtVOP3P(MCInst &Inst, const OperandVector &Operands, OptionalImmIndexMap &OptionalIdx); @@ -1767,21 +1769,11 @@ public: void cvtDPP8(MCInst &Inst, const OperandVector &Operands) { cvtDPP(Inst, Operands, true); } - void cvtVOPCNoDstDPP(MCInst &Inst, const OperandVector &Operands, - bool IsDPP8 = false); - void cvtVOPCNoDstDPP8(MCInst &Inst, const OperandVector &Operands) { - cvtVOPCNoDstDPP(Inst, Operands, true); - } void cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8 = false); void cvtVOP3DPP8(MCInst &Inst, const OperandVector &Operands) { cvtVOP3DPP(Inst, Operands, true); } - void cvtVOPC64NoDstDPP(MCInst &Inst, const OperandVector &Operands, - bool IsDPP8 = false); - void cvtVOPC64NoDstDPP8(MCInst &Inst, const OperandVector &Operands) { - cvtVOPC64NoDstDPP(Inst, Operands, true); - } OperandMatchResultTy parseSDWASel(OperandVector &Operands, StringRef Prefix, AMDGPUOperand::ImmTy Type); @@ -4177,7 +4169,9 @@ bool 
AMDGPUAsmParser::validateOpSel(const MCInst &Inst) { return false; } - if (isGFX940() && (MII.get(Opc).TSFlags & SIInstrFlags::IsDOT)) { + uint64_t TSFlags = MII.get(Opc).TSFlags; + + if (isGFX940() && (TSFlags & SIInstrFlags::IsDOT)) { int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); if (OpSelIdx != -1) { if (Inst.getOperand(OpSelIdx).getImm() != 0) @@ -4190,6 +4184,15 @@ bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) { } } + // op_sel[0:1] must be 0 for v_dot2_bf16_bf16 and v_dot2_f16_f16 (VOP3 Dot). + if ((TSFlags & SIInstrFlags::IsDOT) && (TSFlags & SIInstrFlags::VOP3) && + !(TSFlags & SIInstrFlags::VOP3P)) { + int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); + unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); + if (OpSel & 3) + return false; + } + return true; } @@ -4636,9 +4639,6 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, Error(IDLoc, "ABS not allowed in VOP3B instructions"); return false; } - if (!validateCoherencyBits(Inst, Operands, IDLoc)) { - return false; - } if (!validateExeczVcczOperands(Operands)) { return false; } @@ -5004,6 +5004,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32, Val, ValRange); + } else if (ID == ".amdhsa_uses_dynamic_stack") { + PARSE_BITS_ENTRY(KD.kernel_code_properties, + KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK, Val, ValRange); } else if (ID == ".amdhsa_system_sgpr_private_segment_wavefront_offset") { if (hasArchitectedFlatScratch()) return Error(IDRange.Start, @@ -8024,10 +8027,13 @@ OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands) return MatchOperand_NoMatch; } -void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands) { - cvtVOP3P(Inst, Operands); - +// Determines which bit DST_OP_SEL occupies in the op_sel operand according to +// the number of src operands present, then copies that bit into src0_modifiers. +void cvtVOP3DstOpSelOnly(MCInst &Inst) { int Opc = Inst.getOpcode(); + int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); + if (OpSelIdx == -1) + return; int SrcNum; const int Ops[] = { AMDGPU::OpName::src0, @@ -8038,7 +8044,6 @@ void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands) ++SrcNum); assert(SrcNum > 0); - int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); if ((OpSel & (1 << SrcNum)) != 0) { @@ -8048,6 +8053,18 @@ void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands) } } +void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst, + const OperandVector &Operands) { + cvtVOP3P(Inst, Operands); + cvtVOP3DstOpSelOnly(Inst); +} + +void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands, + OptionalImmIndexMap &OptionalIdx) { + cvtVOP3P(Inst, Operands, OptionalIdx); + cvtVOP3DstOpSelOnly(Inst); +} + static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) { // 1. 
This operand is input modifiers return Desc.OpInfo[OpNum].OperandType == AMDGPU::OPERAND_INPUT_MODS @@ -8241,6 +8258,12 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, const bool IsPacked = (Desc.TSFlags & SIInstrFlags::IsPacked) != 0; + if (Opc == AMDGPU::V_CVT_SR_BF8_F32_vi || + Opc == AMDGPU::V_CVT_SR_FP8_F32_vi) { + Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods + Inst.addOperand(Inst.getOperand(0)); + } + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in) != -1) { assert(!IsPacked); Inst.addOperand(Inst.getOperand(0)); @@ -8747,14 +8770,6 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultFI() const { return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDppFi); } -// Add dummy $old operand -void AMDGPUAsmParser::cvtVOPC64NoDstDPP(MCInst &Inst, - const OperandVector &Operands, - bool IsDPP8) { - Inst.addOperand(MCOperand::createReg(0)); - cvtVOP3DPP(Inst, Operands, IsDPP8); -} - void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8) { OptionalImmIndexMap OptionalIdx; unsigned Opc = Inst.getOpcode(); @@ -8802,6 +8817,8 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, bo } if (Desc.TSFlags & SIInstrFlags::VOP3P) cvtVOP3P(Inst, Operands, OptionalIdx); + else if (Desc.TSFlags & SIInstrFlags::VOP3) + cvtVOP3OpSel(Inst, Operands, OptionalIdx); else if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1) { addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOpSel); } @@ -8821,14 +8838,6 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, bo } } -// Add dummy $old operand -void AMDGPUAsmParser::cvtVOPCNoDstDPP(MCInst &Inst, - const OperandVector &Operands, - bool IsDPP8) { - Inst.addOperand(MCOperand::createReg(0)); - cvtDPP(Inst, Operands, IsDPP8); -} - void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8) { OptionalImmIndexMap OptionalIdx; @@ -9043,12 +9052,27 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, // v_nop_sdwa_sdwa_vi/gfx9 has no optional sdwa arguments switch (BasicInstType) { case SIInstrFlags::VOP1: - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); - if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) { - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0); + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), + AMDGPU::OpName::clamp) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, + AMDGPUOperand::ImmTyClampSI, 0); + } + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), + AMDGPU::OpName::omod) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, + AMDGPUOperand::ImmTyOModSI, 0); + } + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), + AMDGPU::OpName::dst_sel) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, + AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD); + } + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), + AMDGPU::OpName::dst_unused) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, + AMDGPUOperand::ImmTySdwaDstUnused, + DstUnused::UNUSED_PRESERVE); } - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); break; 
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index ccaf646008b1..98ee720200b4 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -451,7 +451,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P) convertVOP3PDPPInst(MI); else if (AMDGPU::isVOPC64DPP(MI.getOpcode())) - convertVOPCDPPInst(MI); + convertVOPCDPPInst(MI); // Special VOP3 case + else { + assert(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3); + convertVOP3DPPInst(MI); // Regular VOP3 case + } break; } Res = tryDecodeInst(DecoderTableGFX1196, MI, DecW, Address); @@ -745,6 +749,43 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { return MCDisassembler::Success; } +struct VOPModifiers { + unsigned OpSel = 0; + unsigned OpSelHi = 0; + unsigned NegLo = 0; + unsigned NegHi = 0; +}; + +// Reconstruct values of VOP3/VOP3P operands such as op_sel. +// Note that these values do not affect disassembler output, +// so this is only necessary for consistency with src_modifiers. +static VOPModifiers collectVOPModifiers(const MCInst &MI, + bool IsVOP3P = false) { + VOPModifiers Modifiers; + unsigned Opc = MI.getOpcode(); + const int ModOps[] = {AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers}; + for (int J = 0; J < 3; ++J) { + int OpIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); + if (OpIdx == -1) + continue; + + unsigned Val = MI.getOperand(OpIdx).getImm(); + + Modifiers.OpSel |= !!(Val & SISrcMods::OP_SEL_0) << J; + if (IsVOP3P) { + Modifiers.OpSelHi |= !!(Val & SISrcMods::OP_SEL_1) << J; + Modifiers.NegLo |= !!(Val & SISrcMods::NEG) << J; + Modifiers.NegHi |= !!(Val & SISrcMods::NEG_HI) << J; + } else if (J == 0) { + Modifiers.OpSel |= !!(Val & SISrcMods::DST_OP_SEL) << 3; + } + } + + return Modifiers; +} + // We must check FI == literal to reject not genuine dpp8 insts, and we must // first add optional MI operands to check FI DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { @@ -755,6 +796,11 @@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { } else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) || AMDGPU::isVOPC64DPP(Opc)) { convertVOPCDPPInst(MI); + } else if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1) { + auto Mods = collectVOPModifiers(MI); + insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel), + AMDGPU::OpName::op_sel); } else { // Insert dummy unused src modifiers. if (MI.getNumOperands() < DescNumOps && @@ -770,6 +816,18 @@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { return isValidDPP8(MI) ? MCDisassembler::Success : MCDisassembler::SoftFail; } +DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { + unsigned Opc = MI.getOpcode(); + unsigned DescNumOps = MCII->get(Opc).getNumOperands(); + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1) { + auto Mods = collectVOPModifiers(MI); + insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel), + AMDGPU::OpName::op_sel); + } + return MCDisassembler::Success; +} + // Note that before gfx10, the MIMG encoding provided no information about // VADDR size. 
Consequently, decoded instructions always show address as if it // has 1 dword, which could be not really so. @@ -914,45 +972,27 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { DecodeStatus AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const { unsigned Opc = MI.getOpcode(); unsigned DescNumOps = MCII->get(Opc).getNumOperands(); + auto Mods = collectVOPModifiers(MI, true); if (MI.getNumOperands() < DescNumOps && AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in) != -1) insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vdst_in); - const int ModOps[] = {AMDGPU::OpName::src0_modifiers, - AMDGPU::OpName::src1_modifiers, - AMDGPU::OpName::src2_modifiers}; - unsigned OpSel = 0; - unsigned OpSelHi = 0; - unsigned NegLo = 0; - unsigned NegHi = 0; - for (int J = 0; J < 3; ++J) { - int OpIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); - if (OpIdx == -1) - break; - unsigned Val = MI.getOperand(OpIdx).getImm(); - - OpSel |= !!(Val & SISrcMods::OP_SEL_0) << J; - OpSelHi |= !!(Val & SISrcMods::OP_SEL_1) << J; - NegLo |= !!(Val & SISrcMods::NEG) << J; - NegHi |= !!(Val & SISrcMods::NEG_HI) << J; - } - if (MI.getNumOperands() < DescNumOps && AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1) - insertNamedMCOperand(MI, MCOperand::createImm(OpSel), + insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel), AMDGPU::OpName::op_sel); if (MI.getNumOperands() < DescNumOps && AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi) != -1) - insertNamedMCOperand(MI, MCOperand::createImm(OpSelHi), + insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSelHi), AMDGPU::OpName::op_sel_hi); if (MI.getNumOperands() < DescNumOps && AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo) != -1) - insertNamedMCOperand(MI, MCOperand::createImm(NegLo), + insertNamedMCOperand(MI, MCOperand::createImm(Mods.NegLo), AMDGPU::OpName::neg_lo); if (MI.getNumOperands() < DescNumOps && AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi) != -1) - insertNamedMCOperand(MI, MCOperand::createImm(NegHi), + insertNamedMCOperand(MI, MCOperand::createImm(Mods.NegHi), AMDGPU::OpName::neg_hi); return MCDisassembler::Success; @@ -2000,6 +2040,9 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective( KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32); } + PRINT_DIRECTIVE(".amdhsa_uses_dynamic_stack", + KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK); + if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED1) return MCDisassembler::Fail; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 31869f0917ae..d17e2d8d5082 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -162,6 +162,7 @@ public: DecodeStatus convertSDWAInst(MCInst &MI) const; DecodeStatus convertDPP8Inst(MCInst &MI) const; DecodeStatus convertMIMGInst(MCInst &MI) const; + DecodeStatus convertVOP3DPPInst(MCInst &MI) const; DecodeStatus convertVOP3PDPPInst(MCInst &MI) const; DecodeStatus convertVOPCDPPInst(MCInst &MI) const; diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index 5d254518c67a..4558ddf6dbfe 100644 --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -202,6 +202,19 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n"); return nullptr; } + int OrigOpE32 = AMDGPU::getVOPe32(OrigOp); + // Prior checks cover 
Mask with VOPC condition, but not on purpose + auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask); + assert(RowMaskOpnd && RowMaskOpnd->isImm()); + auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask); + assert(BankMaskOpnd && BankMaskOpnd->isImm()); + const bool MaskAllLanes = + RowMaskOpnd->getImm() == 0xF && BankMaskOpnd->getImm() == 0xF; + (void)MaskAllLanes; + assert(MaskAllLanes || + !(TII->isVOPC(DPPOp) || + (TII->isVOP3(DPPOp) && OrigOpE32 != -1 && TII->isVOPC(OrigOpE32))) && + "VOPC cannot form DPP unless mask is full"); auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI, OrigMI.getDebugLoc(), TII->get(DPPOp)) @@ -234,6 +247,10 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, DPPInst.addReg(CombOldVGPR.Reg, Def ? 0 : RegState::Undef, CombOldVGPR.SubReg); ++NumOperands; + } else if (TII->isVOPC(DPPOp) || (TII->isVOP3(DPPOp) && OrigOpE32 != -1 && + TII->isVOPC(OrigOpE32))) { + // VOPC DPP and VOPC promoted to VOP3 DPP do not have an old operand + // because they write to SGPRs not VGPRs } else { // TODO: this discards MAC/FMA instructions for now, let's add it later LLVM_DEBUG(dbgs() << " failed: no old operand in DPP instruction," diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td index 281474994bca..6ff349e31f22 100644 --- a/llvm/lib/Target/AMDGPU/GCNProcessors.td +++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -249,11 +249,11 @@ def : ProcessorModel<"gfx1036", GFX10SpeedModel, //===----------------------------------------------------------------------===// def : ProcessorModel<"gfx1100", GFX11SpeedModel, - FeatureISAVersion11_0.Features + FeatureISAVersion11_0_0.Features >; def : ProcessorModel<"gfx1101", GFX11SpeedModel, - FeatureISAVersion11_0.Features + FeatureISAVersion11_0_1.Features >; def : ProcessorModel<"gfx1102", GFX11SpeedModel, diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 100410bb7644..04da14cc4916 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -733,7 +733,7 @@ void GCNScheduleDAGMILive::collectRematerializableInstructions() { MachineOperand *Op = MRI.getOneDef(Reg); MachineInstr *Def = Op->getParent(); - if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def, AA)) + if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def)) continue; MachineInstr *UseI = &*MRI.use_instr_nodbg_begin(Reg); @@ -943,9 +943,8 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, } // Copied from MachineLICM -bool GCNScheduleDAGMILive::isTriviallyReMaterializable(const MachineInstr &MI, - AAResults *AA) { - if (!TII->isTriviallyReMaterializable(MI, AA)) +bool GCNScheduleDAGMILive::isTriviallyReMaterializable(const MachineInstr &MI) { + if (!TII->isTriviallyReMaterializable(MI)) return false; for (const MachineOperand &MO : MI.operands()) diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 97f94f69b70e..c3db849cf81a 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -142,7 +142,7 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { // and single use outside the defining block into RematerializableInsts. 
void collectRematerializableInstructions(); - bool isTriviallyReMaterializable(const MachineInstr &MI, AAResults *AA); + bool isTriviallyReMaterializable(const MachineInstr &MI); // TODO: Should also attempt to reduce RP of SGPRs and AGPRs // Attempt to reduce RP of VGPR by sinking trivially rematerializable diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index d269d0945f3b..d71f80c5f458 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -145,6 +145,7 @@ protected: bool HasDot7Insts = false; bool HasDot8Insts = false; bool HasMAIInsts = false; + bool HasFP8Insts = false; bool HasPkFmacF16Inst = false; bool HasAtomicFaddRtnInsts = false; bool HasAtomicFaddNoRtnInsts = false; @@ -721,6 +722,10 @@ public: return HasMAIInsts; } + bool hasFP8Insts() const { + return HasFP8Insts; + } + bool hasPkFmacF16Inst() const { return HasPkFmacF16Inst; } @@ -930,7 +935,7 @@ public: } bool hasUserSGPRInit16Bug() const { - return UserSGPRInit16Bug; + return UserSGPRInit16Bug && isWave32(); } bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index bd938d829953..21ff2744e5b4 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -627,7 +627,7 @@ void AMDGPUInstPrinter::printWaitEXP(const MCInst *MI, unsigned OpNo, bool AMDGPUInstPrinter::needsImpliedVcc(const MCInstrDesc &Desc, unsigned OpNo) const { - return OpNo == 1 && (Desc.TSFlags & SIInstrFlags::DPP) && + return OpNo == 0 && (Desc.TSFlags & SIInstrFlags::DPP) && (Desc.TSFlags & SIInstrFlags::VOPC) && (Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC) || Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC_LO)); @@ -644,8 +644,7 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, // If there are printed modifiers, printOperandAndFPInputMods or // printOperandAndIntInputMods will be called instead if ((OpNo == 0 || - (OpNo == 1 && (Desc.TSFlags & SIInstrFlags::DPP)) || - (OpNo == 2 && (Desc.TSFlags & SIInstrFlags::DPP) && ModIdx != -1)) && + (OpNo == 1 && (Desc.TSFlags & SIInstrFlags::DPP) && ModIdx != -1)) && (Desc.TSFlags & SIInstrFlags::VOPC) && (Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC) || Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC_LO))) diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 078133469549..0e71509cf2bd 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -367,6 +367,8 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( PRINT_FIELD(OS, ".amdhsa_wavefront_size32", KD, kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32); + PRINT_FIELD(OS, ".amdhsa_uses_dynamic_stack", KD, kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK); PRINT_FIELD(OS, (hasArchitectedFlatScratch(STI) ? 
".amdhsa_enable_private_segment" diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index f54778535b7c..3e95c55df57e 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -67,6 +67,7 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/SetOperations.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/InitializePasses.h" #include "llvm/Target/TargetMachine.h" @@ -81,9 +82,9 @@ static cl::opt<bool> EnableM0Merge( cl::init(true)); namespace { - class SIFixSGPRCopies : public MachineFunctionPass { MachineDominatorTree *MDT; + unsigned NextVGPRToSGPRCopyID; public: static char ID; @@ -92,9 +93,16 @@ public: const SIRegisterInfo *TRI; const SIInstrInfo *TII; - SIFixSGPRCopies() : MachineFunctionPass(ID) {} + SIFixSGPRCopies() : MachineFunctionPass(ID), NextVGPRToSGPRCopyID(0) {} bool runOnMachineFunction(MachineFunction &MF) override; + unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; } + void lowerVGPR2SGPRCopies(MachineFunction &MF); + // Handles copies which source register is: + // 1. Physical register + // 2. AGPR + // 3. Defined by the instruction the merely moves the immediate + bool lowerSpecialCase(MachineInstr &MI); MachineBasicBlock *processPHINode(MachineInstr &MI); @@ -569,6 +577,14 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { TII = ST.getInstrInfo(); MDT = &getAnalysis<MachineDominatorTree>(); + // We have to lower VGPR to SGPR copies before the main loop + // because the REG_SEQUENCE and PHI lowering in main loop + // convert the def-use chains to VALU and close the opportunities + // for keeping them scalar. + // TODO: REG_SEQENCE and PHIs are semantically copies. The next patch + // addresses their lowering and unify the processing in one main loop. + lowerVGPR2SGPRCopies(MF); + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { MachineBasicBlock *MBB = &*BI; @@ -640,42 +656,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { continue; } - if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) { - Register SrcReg = MI.getOperand(1).getReg(); - if (!SrcReg.isVirtual()) { - MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT); - if (NewBB && NewBB != MBB) { - MBB = NewBB; - E = MBB->end(); - BI = MachineFunction::iterator(MBB); - BE = MF.end(); - } - assert((!NewBB || NewBB == I->getParent()) && - "moveToVALU did not return the right basic block"); - break; - } - - MachineInstr *DefMI = MRI->getVRegDef(SrcReg); - unsigned SMovOp; - int64_t Imm; - // If we are just copying an immediate, we can replace the copy with - // s_mov_b32. 
- if (isSafeToFoldImmIntoCopy(&MI, DefMI, TII, SMovOp, Imm)) { - MI.getOperand(1).ChangeToImmediate(Imm); - MI.addImplicitDefUseOperands(MF); - MI.setDesc(TII->get(SMovOp)); - break; - } - MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT); - if (NewBB && NewBB != MBB) { - MBB = NewBB; - E = MBB->end(); - BI = MachineFunction::iterator(MBB); - BE = MF.end(); - } - assert((!NewBB || NewBB == I->getParent()) && - "moveToVALU did not return the right basic block"); - } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) { + if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) { tryChangeVGPRtoSGPRinCopy(MI, TRI, TII); } @@ -916,3 +897,269 @@ MachineBasicBlock *SIFixSGPRCopies::processPHINode(MachineInstr &MI) { } return CreatedBB; } + +bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI) { + MachineBasicBlock *MBB = MI.getParent(); + const TargetRegisterClass *SrcRC, *DstRC; + std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI); + + // We return true to indicate that no further processing needed + if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) + return true; + + Register SrcReg = MI.getOperand(1).getReg(); + if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) { + TII->moveToVALU(MI, MDT); + return true; + } + + unsigned SMovOp; + int64_t Imm; + // If we are just copying an immediate, we can replace the copy with + // s_mov_b32. + if (isSafeToFoldImmIntoCopy(&MI, MRI->getVRegDef(SrcReg), TII, SMovOp, Imm)) { + MI.getOperand(1).ChangeToImmediate(Imm); + MI.addImplicitDefUseOperands(*MBB->getParent()); + MI.setDesc(TII->get(SMovOp)); + return true; + } + return false; +} + +class V2SCopyInfo { +public: + // VGPR to SGPR copy being processed + MachineInstr *Copy; + // All SALU instructions reachable from this copy in SSA graph + DenseSet<MachineInstr *> SChain; + // Number of SGPR to VGPR copies that are used to put the SALU computation + // results back to VALU. + unsigned NumSVCopies; + + unsigned Score; + // Actual count of v_readfirstlane_b32 + // which need to be inserted to keep SChain SALU + unsigned NumReadfirstlanes; + // Current score state. To speedup selection V2SCopyInfos for processing + bool NeedToBeConvertedToVALU = false; + // Unique ID. Used as a key for mapping to keep permanent order. 
+ unsigned ID; + + // Count of another VGPR to SGPR copies that contribute to the + // current copy SChain + unsigned SiblingPenalty = 0; + SetVector<unsigned> Siblings; + V2SCopyInfo() : Copy(nullptr), ID(0){}; + V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width) + : Copy(C), NumSVCopies(0), NumReadfirstlanes(Width / 32), ID(Id){}; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump() { + dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size() + << "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty + << "\nScore: " << Score << "\n"; + } +#endif +}; + +void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { + + DenseMap<unsigned, V2SCopyInfo> Copies; + DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty; + + // The main function that computes the VGPR to SGPR copy score + // and determines copy further lowering way: v_readfirstlane_b32 or moveToVALU + auto needToBeConvertedToVALU = [&](V2SCopyInfo *I) -> bool { + if (I->SChain.empty()) + return true; + I->Siblings = SiblingPenalty[*std::max_element( + I->SChain.begin(), I->SChain.end(), + [&](MachineInstr *A, MachineInstr *B) -> bool { + return SiblingPenalty[A].size() < SiblingPenalty[B].size(); + })]; + I->Siblings.remove_if([&](unsigned ID) { return ID == I->ID; }); + // The loop below computes the number of another VGPR to SGPR copies + // which contribute to the current copy SALU chain. We assume that all the + // copies with the same source virtual register will be squashed to one by + // regalloc. Also we take careof the copies of the differnt subregs of the + // same register. + SmallSet<std::pair<Register, unsigned>, 4> SrcRegs; + for (auto J : I->Siblings) { + auto InfoIt = Copies.find(J); + if (InfoIt != Copies.end()) { + MachineInstr *SiblingCopy = InfoIt->getSecond().Copy; + if (SiblingCopy->isImplicitDef()) + // the COPY has already been MoveToVALUed + continue; + + SrcRegs.insert(std::make_pair(SiblingCopy->getOperand(1).getReg(), + SiblingCopy->getOperand(1).getSubReg())); + } + } + I->SiblingPenalty = SrcRegs.size(); + + unsigned Penalty = + I->NumSVCopies + I->SiblingPenalty + I->NumReadfirstlanes; + unsigned Profit = I->SChain.size(); + I->Score = Penalty > Profit ? 0 : Profit - Penalty; + I->NeedToBeConvertedToVALU = I->Score < 3; + return I->NeedToBeConvertedToVALU; + }; + + auto needProcessing = [](MachineInstr &MI) -> bool { + switch (MI.getOpcode()) { + case AMDGPU::COPY: + case AMDGPU::WQM: + case AMDGPU::STRICT_WQM: + case AMDGPU::SOFT_WQM: + case AMDGPU::STRICT_WWM: + return true; + default: + return false; + } + }; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; + ++BI) { + MachineBasicBlock *MBB = &*BI; + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; + ++I) { + MachineInstr &MI = *I; + if (!needProcessing(MI)) + continue; + if (lowerSpecialCase(MI)) + continue; + + // Compute the COPY width to pass it to V2SCopyInfo Ctor + Register DstReg = MI.getOperand(0).getReg(); + + const TargetRegisterClass *DstRC = TRI->getRegClassForReg(*MRI, DstReg); + + V2SCopyInfo In(getNextVGPRToSGPRCopyId(), &MI, + TRI->getRegSizeInBits(*DstRC)); + + SmallVector<MachineInstr *, 8> AnalysisWorklist; + // Needed because the SSA is not a tree but a graph and may have + // forks and joins. We should not then go same way twice. 
+ DenseSet<MachineInstr *> Visited; + AnalysisWorklist.push_back(&MI); + while (!AnalysisWorklist.empty()) { + + MachineInstr *Inst = AnalysisWorklist.pop_back_val(); + + if (!Visited.insert(Inst).second) + continue; + + // Copies and REG_SEQUENCE do not contribute to the final assembly + // So, skip them but take care of the SGPR to VGPR copies bookkeeping. + if (Inst->isCopy() || Inst->isRegSequence()) { + if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) { + if (!Inst->isCopy() || + !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) { + In.NumSVCopies++; + continue; + } + } + } + + SiblingPenalty[Inst].insert(In.ID); + + SmallVector<MachineInstr *, 4> Users; + if ((TII->isSALU(*Inst) && Inst->isCompare()) || + (Inst->isCopy() && Inst->getOperand(0).getReg() == AMDGPU::SCC)) { + auto I = Inst->getIterator(); + auto E = Inst->getParent()->end(); + while (++I != E && !I->findRegisterDefOperand(AMDGPU::SCC)) { + if (I->readsRegister(AMDGPU::SCC)) + Users.push_back(&*I); + } + } else if (Inst->getNumExplicitDefs() != 0) { + Register Reg = Inst->getOperand(0).getReg(); + if (TRI->isSGPRReg(*MRI, Reg)) + for (auto &U : MRI->use_instructions(Reg)) + Users.push_back(&U); + } + for (auto U : Users) { + if (TII->isSALU(*U)) + In.SChain.insert(U); + AnalysisWorklist.push_back(U); + } + } + Copies[In.ID] = In; + } + } + + SmallVector<unsigned, 8> LoweringWorklist; + for (auto &C : Copies) { + if (needToBeConvertedToVALU(&C.second)) + LoweringWorklist.push_back(C.second.ID); + } + + while (!LoweringWorklist.empty()) { + unsigned CurID = LoweringWorklist.pop_back_val(); + auto CurInfoIt = Copies.find(CurID); + if (CurInfoIt != Copies.end()) { + V2SCopyInfo C = CurInfoIt->getSecond(); + LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump()); + for (auto S : C.Siblings) { + auto SibInfoIt = Copies.find(S); + if (SibInfoIt != Copies.end()) { + V2SCopyInfo &SI = SibInfoIt->getSecond(); + LLVM_DEBUG(dbgs() << "Sibling:\n"; SI.dump()); + if (!SI.NeedToBeConvertedToVALU) { + set_subtract(SI.SChain, C.SChain); + if (needToBeConvertedToVALU(&SI)) + LoweringWorklist.push_back(SI.ID); + } + SI.Siblings.remove_if([&](unsigned ID) { return ID == C.ID; }); + } + } + LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy + << " is being turned to VALU\n"); + Copies.erase(C.ID); + TII->moveToVALU(*C.Copy, MDT); + } + } + + // Now do actual lowering + for (auto C : Copies) { + MachineInstr *MI = C.second.Copy; + MachineBasicBlock *MBB = MI->getParent(); + // We decide to turn V2S copy to v_readfirstlane_b32 + // remove it from the V2SCopies and remove it from all its siblings + LLVM_DEBUG(dbgs() << "V2S copy " << *MI + << " is being turned to v_readfirstlane_b32" + << " Score: " << C.second.Score << "\n"); + Register DstReg = MI->getOperand(0).getReg(); + Register SrcReg = MI->getOperand(1).getReg(); + unsigned SubReg = MI->getOperand(1).getSubReg(); + const TargetRegisterClass *SrcRC = TRI->getRegClassForReg(*MRI, SrcReg); + SrcRC = TRI->getSubRegClass(SrcRC, SubReg); + size_t SrcSize = TRI->getRegSizeInBits(*SrcRC); + if (SrcSize == 16) { + // HACK to handle possible 16bit VGPR source + auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), + TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg); + MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister); + } else if (SrcSize == 32) { + auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), + TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg); + MIB.addReg(SrcReg, 0, SubReg); + } else { + auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(), + TII->get(AMDGPU::REG_SEQUENCE), DstReg); + int N = TRI->getRegSizeInBits(*SrcRC) 
/ 32; + for (int i = 0; i < N; i++) { + Register PartialSrc = TII->buildExtractSubReg( + Result, *MRI, MI->getOperand(1), SrcRC, + TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass); + Register PartialDst = + MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, *Result, Result->getDebugLoc(), + TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst) + .addReg(PartialSrc); + Result.addReg(PartialDst).addImm(TRI->getSubRegFromChannel(i)); + } + } + MI->eraseFromParent(); + } +} diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index d16da2a8b86b..438e8b200ecc 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1664,6 +1664,17 @@ SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG, return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset); } +SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG, + const SDLoc &SL) const { + + Function &F = DAG.getMachineFunction().getFunction(); + Optional<uint32_t> KnownSize = + AMDGPUMachineFunction::getLDSKernelIdMetadata(F); + if (KnownSize.has_value()) + return DAG.getConstant(KnownSize.value(), SL, MVT::i32); + return SDValue(); +} + SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Val, bool Signed, @@ -2049,6 +2060,9 @@ void SITargetLowering::allocateSpecialInputSGPRs( if (Info.hasWorkGroupIDZ()) allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ); + + if (Info.hasLDSKernelId()) + allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId); } // Allocate special inputs passed in user SGPRs. @@ -2102,6 +2116,12 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo, CCInfo.AllocateReg(FlatScratchInitReg); } + if (Info.hasLDSKernelId()) { + Register Reg = Info.addLDSKernelId(); + MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } + // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read // these from the dispatch pointer. } @@ -2347,8 +2367,8 @@ SDValue SITargetLowering::LowerFormalArguments( (!Info->hasFlatScratchInit() || Subtarget->enableFlatScratch()) && !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && - !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && - !Info->hasWorkItemIDZ()); + !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() && + !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()); } if (CallConv == CallingConv::AMDGPU_PS) { @@ -2762,7 +2782,8 @@ void SITargetLowering::passSpecialInputs( {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"}, {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"}, {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"}, - {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"} + {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"}, + {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"}, }; for (auto Attr : ImplicitAttrs) { @@ -2798,6 +2819,13 @@ void SITargetLowering::passSpecialInputs( // The implicit arg ptr is special because it doesn't have a corresponding // input for kernels, and is computed from the kernarg segment pointer. 
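// The LDS_KERNEL_ID case added just below follows the same pattern as the
// other implicit inputs: if the callee's id is known from function metadata it
// is forwarded as a constant, otherwise the argument is passed as UNDEF. A
// rough standalone model, with std::optional standing in for the metadata
// query (the names here are illustrative):

#include <cstdint>
#include <optional>

struct LoweredKernelId {
  bool IsConstant;
  uint32_t Value; // meaningful only when IsConstant is true
};

LoweredKernelId lowerLDSKernelId(std::optional<uint32_t> MetadataId) {
  if (MetadataId)
    return {true, *MetadataId}; // DAG.getConstant(...) in the real lowering
  return {false, 0};            // forwarded as UNDEF instead
}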
InputReg = getImplicitArgPtr(DAG, DL); + } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) { + Optional<uint32_t> Id = AMDGPUMachineFunction::getLDSKernelIdMetadata(F); + if (Id.has_value()) { + InputReg = DAG.getConstant(Id.value(), DL, ArgVT); + } else { + InputReg = DAG.getUNDEF(ArgVT); + } } else { // We may have proven the input wasn't needed, although the ABI is // requiring it. We just need to allocate the register appropriately. @@ -6887,6 +6915,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_workgroup_id_z: return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); + case Intrinsic::amdgcn_lds_kernel_id: { + if (MFI->isEntryFunction()) + return getLDSKernelId(DAG, DL); + return getPreloadedValue(DAG, *MFI, VT, + AMDGPUFunctionArgInfo::LDS_KERNEL_ID); + } case Intrinsic::amdgcn_workitem_id_x: return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX); case Intrinsic::amdgcn_workitem_id_y: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 4fbccf0c5850..d1fecc1afc7f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -48,6 +48,7 @@ private: SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, uint64_t Offset) const; SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const; + SDValue getLDSKernelId(SelectionDAG &DAG, const SDLoc &SL) const; SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, uint64_t Offset, Align Alignment, diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td index b398e108bf62..7c1d8d32b624 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -85,7 +85,7 @@ class InstSI <dag outs, dag ins, string asm = "", field bit VOPAsmPrefer32Bit = 0; // This bit indicates that this is a VOP3 opcode which supports op_sel - // modifier (gfx9 only). + // modifier. field bit VOP3_OPSEL = 0; // Is it possible for this instruction to be atomic? diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 799d34e32d27..8916f06598c6 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -108,8 +108,8 @@ static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); } -bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, - AAResults *AA) const { +bool SIInstrInfo::isReallyTriviallyReMaterializable( + const MachineInstr &MI) const { if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI) || isSALU(MI)) { // Normally VALU use of exec would block the rematerialization, but that // is OK in this case to have an implicit exec read as all VALU do. @@ -220,16 +220,23 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1) return false; - assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); + unsigned NumOps = getNumOperandsNoGlue(Load0); + if (NumOps != getNumOperandsNoGlue(Load1)) + return false; // Check base reg. if (Load0->getOperand(0) != Load1->getOperand(0)) return false; + // Match register offsets, if both register and immediate offsets present. 
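// With the SGPR+immediate addressing mode the two loads compared here may
// carry either four or five operands, so the immediate offset is no longer at
// a fixed index. A small sketch of the index arithmetic; the operand layout
// in the comments is inferred from this hunk, not quoted from a spec:

#include <cassert>
#include <cstddef>

// NumOps == 4: sbase, imm-offset, ...          -> offset at index 1
// NumOps == 5: sbase, soffset, imm-offset, ... -> offset at index 2
size_t immOffsetIndex(size_t NumOps) {
  assert(NumOps == 4 || NumOps == 5);
  return NumOps - 3; // what the patch uses instead of hard-coded operand 1
}

// Loads are only comparable when their operand counts match; the five-operand
// form additionally requires the register offsets (operand 1) to match.
bool offsetsComparable(size_t NumOps0, size_t NumOps1, bool RegOffsetsEqual) {
  return NumOps0 == NumOps1 && (NumOps0 == 4 || RegOffsetsEqual);
}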
+ assert(NumOps == 4 || NumOps == 5); + if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1)) + return false; + const ConstantSDNode *Load0Offset = - dyn_cast<ConstantSDNode>(Load0->getOperand(1)); + dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3)); const ConstantSDNode *Load1Offset = - dyn_cast<ConstantSDNode>(Load1->getOperand(1)); + dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3)); if (!Load0Offset || !Load1Offset) return false; @@ -5011,10 +5018,8 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, } if (MO->isReg()) { - if (!DefinedRC) { - // This operand allows any register. - return true; - } + if (!DefinedRC) + return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN; if (!isLegalRegOperand(MRI, OpInfo, *MO)) return false; bool IsAGPR = RI.isAGPR(MRI, MO->getReg()); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 1b411eb83eb3..5840f45bdc5a 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -184,8 +184,7 @@ public: return ST; } - bool isReallyTriviallyReMaterializable(const MachineInstr &MI, - AAResults *AA) const override; + bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override; bool isIgnorableUse(const MachineOperand &MO) const override; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 23afd6556bc9..81f8dcc482da 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -324,7 +324,8 @@ class isFloatType<ValueType SrcVT> { // XXX - do v2i16 instructions? class isIntType<ValueType SrcVT> { - bit ret = !or(!eq(SrcVT.Value, i16.Value), + bit ret = !or(!eq(SrcVT.Value, i8.Value), + !eq(SrcVT.Value, i16.Value), !eq(SrcVT.Value, i32.Value), !eq(SrcVT.Value, i64.Value), !eq(SrcVT.Value, v4i16.Value), @@ -1411,6 +1412,10 @@ class IntSDWAInputModsMatchClass <int opSize> : AsmOperandClass { def Int16SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<16>; def Int32SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<32>; +def Bin32SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<32> { + let Name = "SDWAWithBin32InputMods"; + let ParserMethod = "parseRegOrImm"; +} class IntSDWAInputMods <IntSDWAInputModsMatchClass matchClass> : InputMods <matchClass> { @@ -1419,6 +1424,7 @@ class IntSDWAInputMods <IntSDWAInputModsMatchClass matchClass> : def Int16SDWAInputMods : IntSDWAInputMods<Int16SDWAInputModsMatchClass>; def Int32SDWAInputMods : IntSDWAInputMods<Int32SDWAInputModsMatchClass>; +def Bin32SDWAInputMods : IntSDWAInputMods<Bin32SDWAInputModsMatchClass>; def IntVRegInputModsMatchClass : AsmOperandClass { let Name = "VRegWithIntInputMods"; @@ -1897,94 +1903,94 @@ class getInsVOP3OpSel <RegisterOperand Src0RC, RegisterOperand Src1RC, class getInsDPPBase <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC, RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers, - Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> { + Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld> { - dag ret = !if (!eq(NumSrcArgs, 0), + dag ret = !if(!eq(NumSrcArgs, 0), // VOP1 without input operands (V_NOP) (ins ), - !if (!eq(NumSrcArgs, 1), - !if (HasModifiers, - // VOP1_DPP with modifiers - (ins OldRC:$old, Src0Mod:$src0_modifiers, - Src0RC:$src0) - /* else */, - // VOP1_DPP without modifiers - (ins OldRC:$old, Src0RC:$src0) - /* endif */), - !if (!eq(NumSrcArgs, 2), - !if (HasModifiers, - // VOP2_DPP with modifiers - (ins OldRC:$old, - 
Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1) - /* else */, - // VOP2_DPP without modifiers - (ins OldRC:$old, - Src0RC:$src0, Src1RC:$src1) - ) - /* NumSrcArgs == 3, VOP3 */, - !if (HasModifiers, - // VOP3_DPP with modifiers - (ins OldRC:$old, - Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - Src2Mod:$src2_modifiers, Src2RC:$src2) - /* else */, - // VOP3_DPP without modifiers - (ins OldRC:$old, - Src0RC:$src0, Src1RC:$src1, - Src2RC:$src2) + !con( + !if(HasOld ,(ins OldRC:$old), (ins)), + !if (!eq(NumSrcArgs, 1), + !if (HasModifiers, + // VOP1_DPP with modifiers + (ins Src0Mod:$src0_modifiers, Src0RC:$src0) + /* else */, + // VOP1_DPP without modifiers + (ins Src0RC:$src0) + /* endif */), + !if (!eq(NumSrcArgs, 2), + !if (HasModifiers, + // VOP2_DPP with modifiers + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1) + /* else */, + // VOP2_DPP without modifiers + (ins Src0RC:$src0, Src1RC:$src1) + ) + /* NumSrcArgs == 3, VOP3 */, + !if (HasModifiers, + // VOP3_DPP with modifiers + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + Src2Mod:$src2_modifiers, Src2RC:$src2) + /* else */, + // VOP3_DPP without modifiers + (ins Src0RC:$src0, Src1RC:$src1, + Src2RC:$src2) + ) + ) + ) ) - /* endif */))); + ); } class getInsDPP <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC, RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers, - Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> { + Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> { dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs, - HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret, + HasModifiers, Src0Mod, Src1Mod, Src2Mod, HasOld>.ret, (ins dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)); } class getInsDPP16 <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC, RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers, - Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> { + Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> { dag ret = !con(getInsDPP<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs, - HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret, + HasModifiers, Src0Mod, Src1Mod, Src2Mod, HasOld>.ret, (ins FI:$fi)); } class getInsDPP8 <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC, RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers, - Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> { + Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> { dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs, - HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret, + HasModifiers, Src0Mod, Src1Mod, Src2Mod, HasOld>.ret, (ins dpp8:$dpp8, FI:$fi)); } -class getInsVOP3DPPBase<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs> { +class getInsVOP3DPPBase<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld> { dag old = ( ins OldRC:$old ); dag base = VOP3Base; dag ret = !con( - !if(!ne(NumSrcArgs, 0), old, (ins)), + !if(!and(HasOld,!ne(NumSrcArgs, 0)), old, (ins)), base ); } -class getInsVOP3DPP<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs> { - dag ret = !con(getInsVOP3DPPBase<VOP3Base,OldRC,NumSrcArgs>.ret, +class getInsVOP3DPP<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld = 1> { + dag ret = !con(getInsVOP3DPPBase<VOP3Base,OldRC,NumSrcArgs,HasOld>.ret, (ins dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, 
bound_ctrl:$bound_ctrl)); } -class getInsVOP3DPP16<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs> { - dag ret = !con(getInsVOP3DPP<VOP3Base,OldRC,NumSrcArgs>.ret, +class getInsVOP3DPP16<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld = 1> { + dag ret = !con(getInsVOP3DPP<VOP3Base,OldRC,NumSrcArgs,HasOld>.ret, (ins FI:$fi)); } -class getInsVOP3DPP8<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs> { - dag ret = !con(getInsVOP3DPPBase<VOP3Base,OldRC,NumSrcArgs>.ret, +class getInsVOP3DPP8<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld = 1> { + dag ret = !con(getInsVOP3DPPBase<VOP3Base,OldRC,NumSrcArgs,HasOld>.ret, (ins dpp8:$dpp8, FI:$fi)); } @@ -2665,6 +2671,8 @@ def VOP_V4I32_I64_I64_V4I32 : VOPProfile <[v4i32, i64, i64, v4i32]>; def VOP_V16I32_I64_I64_V16I32 : VOPProfile <[v16i32, i64, i64, v16i32]>; def VOP_V4F32_V2F32_V2F32_V4F32 : VOPProfile <[v4f32, v2f32, v2f32, v4f32]>; def VOP_V16F32_V2F32_V2F32_V16F32 : VOPProfile <[v16f32, v2f32, v2f32, v16f32]>; +def VOP_V4F32_I64_I64_V4F32 : VOPProfile <[v4f32, i64, i64, v4f32]>; +def VOP_V16F32_I64_I64_V16F32 : VOPProfile <[v16f32, i64, i64, v16f32]>; def VOP_V4F32_V4F16_V8F16_I32 : VOPProfile <[v4f32, v4f16, v8f16, i32]>; def VOP_V16F32_V4F16_V8F16_I32 : VOPProfile <[v16f32, v4f16, v8f16, i32]>; @@ -2672,6 +2680,8 @@ def VOP_V4F32_V4I16_V8I16_I32 : VOPProfile <[v4f32, v4i16, v8i16, i32]>; def VOP_V16F32_V4I16_V8I16_I32 : VOPProfile <[v16f32, v4i16, v8i16, i32]>; def VOP_V4I32_V2I32_V4I32_I32 : VOPProfile <[v4i32, v2i32, v4i32, i32]>; def VOP_V16I32_V2I32_V4I32_I32 : VOPProfile <[v16i32, v2i32, v4i32, i32]>; +def VOP_V4F32_V2I32_V4I32_I32 : VOPProfile <[v4f32, v2i32, v4i32, i32]>; +def VOP_V16F32_V2I32_V4I32_I32 : VOPProfile <[v16f32, v2i32, v4i32, i32]>; class Commutable_REV <string revOp, bit isOrig> { string RevOp = revOp; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 0504c59ebd9e..9176e85568ee 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -44,6 +44,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkGroupIDY(false), WorkGroupIDZ(false), WorkGroupInfo(false), + LDSKernelId(false), PrivateSegmentWaveByteOffset(false), WorkItemIDX(false), WorkItemIDY(false), @@ -143,6 +144,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (!F.hasFnAttribute("amdgpu-no-dispatch-id")) DispatchID = true; + + if (!IsKernel && !F.hasFnAttribute("amdgpu-no-lds-kernel-id")) + LDSKernelId = true; } // FIXME: This attribute is a hack, we just need an analysis on the function @@ -261,6 +265,12 @@ Register SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) return ArgInfo.ImplicitBufferPtr.getRegister(); } +Register SIMachineFunctionInfo::addLDSKernelId() { + ArgInfo.LDSKernelId = ArgDescriptor::createRegister(getNextUserSGPR()); + NumUserSGPRs += 1; + return ArgInfo.LDSKernelId.getRegister(); +} + bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) { for (unsigned I = 0; CSRegs[I]; ++I) { @@ -561,6 +571,7 @@ convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo, Any |= convertArg(AI.KernargSegmentPtr, ArgInfo.KernargSegmentPtr); Any |= convertArg(AI.DispatchID, ArgInfo.DispatchID); Any |= convertArg(AI.FlatScratchInit, ArgInfo.FlatScratchInit); + Any |= convertArg(AI.LDSKernelId, ArgInfo.LDSKernelId); Any |= convertArg(AI.PrivateSegmentSize, ArgInfo.PrivateSegmentSize); 
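// addLDSKernelId above follows the same scheme as the other add*() helpers:
// each enabled preloaded value claims the next sequential user SGPR and bumps
// the running count. A toy model of that allocation (register numbering is
// purely illustrative):

class UserSGPRAllocator {
  unsigned NumUserSGPRs = 0;

public:
  // Stands in for addDispatchID, addFlatScratchInit, addLDSKernelId, ...
  unsigned addInput() {
    unsigned Reg = NumUserSGPRs; // "getNextUserSGPR()"
    NumUserSGPRs += 1;
    return Reg;
  }
  unsigned count() const { return NumUserSGPRs; }
};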
Any |= convertArg(AI.WorkGroupIDX, ArgInfo.WorkGroupIDX); Any |= convertArg(AI.WorkGroupIDY, ArgInfo.WorkGroupIDY); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index bebb13cbf09f..5105587617fd 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -191,6 +191,7 @@ struct SIArgumentInfo { Optional<SIArgument> WorkGroupIDY; Optional<SIArgument> WorkGroupIDZ; Optional<SIArgument> WorkGroupInfo; + Optional<SIArgument> LDSKernelId; Optional<SIArgument> PrivateSegmentWaveByteOffset; Optional<SIArgument> ImplicitArgPtr; @@ -215,6 +216,7 @@ template <> struct MappingTraits<SIArgumentInfo> { YamlIO.mapOptional("workGroupIDY", AI.WorkGroupIDY); YamlIO.mapOptional("workGroupIDZ", AI.WorkGroupIDZ); YamlIO.mapOptional("workGroupInfo", AI.WorkGroupInfo); + YamlIO.mapOptional("LDSKernelId", AI.LDSKernelId); YamlIO.mapOptional("privateSegmentWaveByteOffset", AI.PrivateSegmentWaveByteOffset); @@ -418,6 +420,7 @@ private: bool WorkGroupIDY : 1; bool WorkGroupIDZ : 1; bool WorkGroupInfo : 1; + bool LDSKernelId : 1; bool PrivateSegmentWaveByteOffset : 1; bool WorkItemIDX : 1; // Always initialized. @@ -608,6 +611,7 @@ public: Register addDispatchID(const SIRegisterInfo &TRI); Register addFlatScratchInit(const SIRegisterInfo &TRI); Register addImplicitBufferPtr(const SIRegisterInfo &TRI); + Register addLDSKernelId(); /// Increment user SGPRs used for padding the argument list only. Register addReservedUserSGPR() { @@ -705,6 +709,8 @@ public: return WorkGroupInfo; } + bool hasLDSKernelId() const { return LDSKernelId; } + bool hasPrivateSegmentWaveByteOffset() const { return PrivateSegmentWaveByteOffset; } diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index 66bc46aaefea..19a83ad53e2e 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -12,6 +12,8 @@ #include "SIRegisterInfo.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/InitializePasses.h" using namespace llvm; @@ -26,6 +28,10 @@ class SIOptimizeExecMasking : public MachineFunctionPass { const SIRegisterInfo *TRI = nullptr; const SIInstrInfo *TII = nullptr; const MachineRegisterInfo *MRI = nullptr; + MCRegister Exec; + + DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping; + SmallVector<std::pair<MachineInstr *, MachineInstr *>, 1> OrXors; Register isCopyFromExec(const MachineInstr &MI) const; Register isCopyToExec(const MachineInstr &MI) const; @@ -44,13 +50,13 @@ class SIOptimizeExecMasking : public MachineFunctionPass { std::function<bool(MachineInstr *)> Pred, ArrayRef<MCRegister> NonModifiableRegs, unsigned MaxInstructions = 20) const; - MachineInstr *findPossibleVCMPVCMPXOptimization(MachineInstr &SaveExec, - MCRegister Exec) const; - bool optimizeExecSequence() const; - bool optimizeVCmpxAndSaveexecSequence() const; - bool optimizeSingleVCMPSaveExecSequence(MachineInstr &SaveExecInstr, - MachineInstr &VCmp, - MCRegister Exec) const; + bool optimizeExecSequence(); + void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI); + bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr, + MachineInstr &VCmp, MCRegister Exec) const; + + void tryRecordOrSaveexecXorSequence(MachineInstr &MI); + bool optimizeOrSaveexecXorSequences(); public: static char 
ID; @@ -92,7 +98,7 @@ Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const { case AMDGPU::S_MOV_B32: case AMDGPU::S_MOV_B32_term: { const MachineOperand &Src = MI.getOperand(1); - if (Src.isReg() && Src.getReg() == TRI->getExec()) + if (Src.isReg() && Src.getReg() == Exec) return MI.getOperand(0).getReg(); } } @@ -107,8 +113,7 @@ Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const { case AMDGPU::S_MOV_B64: case AMDGPU::S_MOV_B32: { const MachineOperand &Dst = MI.getOperand(0); - if (Dst.isReg() && Dst.getReg() == TRI->getExec() && - MI.getOperand(1).isReg()) + if (Dst.isReg() && Dst.getReg() == Exec && MI.getOperand(1).isReg()) return MI.getOperand(1).getReg(); break; } @@ -394,9 +399,7 @@ bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop, // => // x = s_<op>_saveexec_b64 y // -bool SIOptimizeExecMasking::optimizeExecSequence() const { - MCRegister Exec = TRI->getExec(); - +bool SIOptimizeExecMasking::optimizeExecSequence() { bool Changed = false; for (MachineBasicBlock &MBB : *MF) { MachineBasicBlock::reverse_iterator I = fixTerminators(MBB); @@ -551,88 +554,9 @@ bool SIOptimizeExecMasking::optimizeExecSequence() const { return Changed; } -// Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec sequence -// by looking at an instance of a s_and_saveexec instruction. Returns a pointer -// to the v_cmp instruction if it is safe to replace the sequence (see the -// conditions in the function body). This is after register allocation, so some -// checks on operand dependencies need to be considered. -MachineInstr *SIOptimizeExecMasking::findPossibleVCMPVCMPXOptimization( - MachineInstr &SaveExec, MCRegister Exec) const { - - MachineInstr *VCmp = nullptr; - - Register SaveExecDest = SaveExec.getOperand(0).getReg(); - if (!TRI->isSGPRReg(*MRI, SaveExecDest)) - return nullptr; - - MachineOperand *SaveExecSrc0 = - TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0); - if (!SaveExecSrc0->isReg()) - return nullptr; - - // Try to find the last v_cmp instruction that defs the saveexec input - // operand without any write to Exec or the saveexec input operand inbetween. - VCmp = findInstrBackwards( - SaveExec, - [&](MachineInstr *Check) { - return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 && - Check->modifiesRegister(SaveExecSrc0->getReg(), TRI); - }, - {Exec, SaveExecSrc0->getReg()}); - - if (!VCmp) - return nullptr; - - MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst); - assert(VCmpDest && "Should have an sdst operand!"); - - // Check if any of the v_cmp source operands is written by the saveexec. - MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0); - if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) && - SaveExec.modifiesRegister(Src0->getReg(), TRI)) - return nullptr; - - MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1); - if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) && - SaveExec.modifiesRegister(Src1->getReg(), TRI)) - return nullptr; - - // Don't do the transformation if the destination operand is included in - // it's MBB Live-outs, meaning it's used in any of it's successors, leading - // to incorrect code if the v_cmp and therefore the def of - // the dest operand is removed. - if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg())) - return nullptr; - - // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the - // s_and_saveexec, skip the optimization. 
- if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), false, - true) || - isRegisterInUseAfter(SaveExec, VCmpDest->getReg())) - return nullptr; - - // Try to determine if there is a write to any of the VCmp - // operands between the saveexec and the vcmp. - // If yes, additional VGPR spilling might need to be inserted. In this case, - // it's not worth replacing the instruction sequence. - SmallVector<MCRegister, 2> NonDefRegs; - if (Src0->isReg()) - NonDefRegs.push_back(Src0->getReg()); - - if (Src1->isReg()) - NonDefRegs.push_back(Src1->getReg()); - - if (!findInstrBackwards( - SaveExec, [&](MachineInstr *Check) { return Check == VCmp; }, - NonDefRegs)) - return nullptr; - - return VCmp; -} - // Inserts the optimized s_mov_b32 / v_cmpx sequence based on the // operands extracted from a v_cmp ..., s_and_saveexec pattern. -bool SIOptimizeExecMasking::optimizeSingleVCMPSaveExecSequence( +bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence( MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const { const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode()); @@ -678,50 +602,164 @@ bool SIOptimizeExecMasking::optimizeSingleVCMPSaveExecSequence( if (Src1->isReg()) MRI->clearKillFlags(Src1->getReg()); + SaveExecInstr.eraseFromParent(); + VCmp.eraseFromParent(); + return true; } -// After all s_op_saveexec instructions are inserted, -// replace (on GFX10.3 and later) +// Record (on GFX10.3 and later) occurences of // v_cmp_* SGPR, IMM, VGPR // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR -// with +// to be replaced with // s_mov_b32 EXEC_SGPR_DEST, exec_lo // v_cmpx_* IMM, VGPR // to reduce pipeline stalls. -bool SIOptimizeExecMasking::optimizeVCmpxAndSaveexecSequence() const { +void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence( + MachineInstr &MI) { if (!ST->hasGFX10_3Insts()) - return false; + return; - bool Changed = false; - - DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping; - MCRegister Exec = TRI->getExec(); const unsigned AndSaveExecOpcode = ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; - for (MachineBasicBlock &MBB : *MF) { - for (MachineInstr &MI : MBB) { - // Record relevant v_cmp / s_and_saveexec instruction pairs for - // replacement. - if (MI.getOpcode() != AndSaveExecOpcode) - continue; + if (MI.getOpcode() != AndSaveExecOpcode) + return; + + Register SaveExecDest = MI.getOperand(0).getReg(); + if (!TRI->isSGPRReg(*MRI, SaveExecDest)) + return; - if (MachineInstr *VCmp = findPossibleVCMPVCMPXOptimization(MI, Exec)) - SaveExecVCmpMapping[&MI] = VCmp; + MachineOperand *SaveExecSrc0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + if (!SaveExecSrc0->isReg()) + return; + + // Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec + // sequence by looking at an instance of a s_and_saveexec instruction. Returns + // a pointer to the v_cmp instruction if it is safe to replace the sequence + // (see the conditions in the function body). This is after register + // allocation, so some checks on operand dependencies need to be considered. + MachineInstr *VCmp = nullptr; + + // Try to find the last v_cmp instruction that defs the saveexec input + // operand without any write to Exec or the saveexec input operand inbetween. 
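// The findInstrBackwards call right below is essentially a bounded reverse
// scan that gives up if anything clobbers a register it must preserve. A
// simplified standalone version over a plain vector (MiniInstr and the loop
// shape are illustrative; the 20-instruction default matches the declaration
// earlier in this file):

#include <cstddef>
#include <functional>
#include <vector>

struct MiniInstr {
  std::vector<int> Defs; // registers this instruction writes
  bool modifies(int Reg) const {
    for (int D : Defs)
      if (D == Reg)
        return true;
    return false;
  }
};

// Walk backwards from Start (exclusive); return the first instruction that
// satisfies Pred, or nullptr if a protected register is redefined first or
// the search window is exhausted.
const MiniInstr *findInstrBackwardsModel(
    const std::vector<MiniInstr> &Block, size_t Start,
    const std::function<bool(const MiniInstr &)> &Pred,
    const std::vector<int> &NonModifiableRegs, unsigned MaxInstructions = 20) {
  unsigned Seen = 0;
  for (size_t I = Start; I-- > 0;) {
    const MiniInstr &MI = Block[I];
    if (Pred(MI))
      return &MI;
    for (int Reg : NonModifiableRegs)
      if (MI.modifies(Reg))
        return nullptr;
    if (++Seen >= MaxInstructions)
      return nullptr;
  }
  return nullptr;
}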
+ VCmp = findInstrBackwards( + MI, + [&](MachineInstr *Check) { + return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 && + Check->modifiesRegister(SaveExecSrc0->getReg(), TRI); + }, + {Exec, SaveExecSrc0->getReg()}); + + if (!VCmp) + return; + + MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst); + assert(VCmpDest && "Should have an sdst operand!"); + + // Check if any of the v_cmp source operands is written by the saveexec. + MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0); + if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) && + MI.modifiesRegister(Src0->getReg(), TRI)) + return; + + MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1); + if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) && + MI.modifiesRegister(Src1->getReg(), TRI)) + return; + + // Don't do the transformation if the destination operand is included in + // it's MBB Live-outs, meaning it's used in any of it's successors, leading + // to incorrect code if the v_cmp and therefore the def of + // the dest operand is removed. + if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg())) + return; + + // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the + // s_and_saveexec, skip the optimization. + if (isRegisterInUseBetween(*VCmp, MI, VCmpDest->getReg(), false, true) || + isRegisterInUseAfter(MI, VCmpDest->getReg())) + return; + + // Try to determine if there is a write to any of the VCmp + // operands between the saveexec and the vcmp. + // If yes, additional VGPR spilling might need to be inserted. In this case, + // it's not worth replacing the instruction sequence. + SmallVector<MCRegister, 2> NonDefRegs; + if (Src0->isReg()) + NonDefRegs.push_back(Src0->getReg()); + + if (Src1->isReg()) + NonDefRegs.push_back(Src1->getReg()); + + if (!findInstrBackwards( + MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs)) + return; + + if (VCmp) + SaveExecVCmpMapping[&MI] = VCmp; +} + +// Record occurences of +// s_or_saveexec s_o, s_i +// s_xor exec, exec, s_o +// to be replaced with +// s_andn2_saveexec s_o, s_i. +void SIOptimizeExecMasking::tryRecordOrSaveexecXorSequence(MachineInstr &MI) { + const unsigned XorOpcode = + ST->isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64; + + if (MI.getOpcode() == XorOpcode && &MI != &MI.getParent()->front()) { + const MachineOperand &XorDst = MI.getOperand(0); + const MachineOperand &XorSrc0 = MI.getOperand(1); + const MachineOperand &XorSrc1 = MI.getOperand(2); + + if (XorDst.isReg() && XorDst.getReg() == Exec && XorSrc0.isReg() && + XorSrc1.isReg() && + (XorSrc0.getReg() == Exec || XorSrc1.getReg() == Exec)) { + const unsigned OrSaveexecOpcode = ST->isWave32() + ? AMDGPU::S_OR_SAVEEXEC_B32 + : AMDGPU::S_OR_SAVEEXEC_B64; + + // Peek at the previous instruction and check if this is a relevant + // s_or_saveexec instruction. 
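// Why this pair can later be fused into a single s_andn2_saveexec: with the
// usual exec-mask semantics (a saveexec copies EXEC into the destination
// before updating EXEC), both sequences leave the same values behind. A
// standalone check over plain 64-bit masks, written from those semantics:

#include <cassert>
#include <cstdint>
#include <initializer_list>

void checkEquivalence(uint64_t Exec, uint64_t SI) {
  // s_or_saveexec s_o, s_i  followed by  s_xor exec, exec, s_o
  uint64_t Saved1 = Exec;
  uint64_t Exec1 = (SI | Exec) ^ Saved1;

  // s_andn2_saveexec s_o, s_i
  uint64_t Saved2 = Exec;
  uint64_t Exec2 = SI & ~Exec;

  assert(Saved1 == Saved2 && Exec1 == Exec2);
}

int main() {
  for (uint64_t E : {0x0ull, ~0x0ull, 0x00FF00FF00FF00FFull})
    for (uint64_t S : {0x0ull, 0xF0F0F0F0F0F0F0F0ull, 0x123456789ABCDEF0ull})
      checkEquivalence(E, S);
  return 0;
}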
+ MachineInstr &PossibleOrSaveexec = *MI.getPrevNode(); + if (PossibleOrSaveexec.getOpcode() != OrSaveexecOpcode) + return; + + const MachineOperand &OrDst = PossibleOrSaveexec.getOperand(0); + const MachineOperand &OrSrc0 = PossibleOrSaveexec.getOperand(1); + if (OrDst.isReg() && OrSrc0.isReg()) { + if ((XorSrc0.getReg() == Exec && XorSrc1.getReg() == OrDst.getReg()) || + (XorSrc0.getReg() == OrDst.getReg() && XorSrc1.getReg() == Exec)) { + OrXors.emplace_back(&PossibleOrSaveexec, &MI); + } + } } } +} - for (const auto &Entry : SaveExecVCmpMapping) { - MachineInstr *SaveExecInstr = Entry.getFirst(); - MachineInstr *VCmpInstr = Entry.getSecond(); +bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() { + if (OrXors.empty()) { + return false; + } - if (optimizeSingleVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec)) { - SaveExecInstr->eraseFromParent(); - VCmpInstr->eraseFromParent(); + bool Changed = false; + const unsigned Andn2Opcode = ST->isWave32() ? AMDGPU::S_ANDN2_SAVEEXEC_B32 + : AMDGPU::S_ANDN2_SAVEEXEC_B64; - Changed = true; - } + for (const auto &Pair : OrXors) { + MachineInstr *Or = nullptr; + MachineInstr *Xor = nullptr; + std::tie(Or, Xor) = Pair; + BuildMI(*Or->getParent(), Or->getIterator(), Or->getDebugLoc(), + TII->get(Andn2Opcode), Or->getOperand(0).getReg()) + .addReg(Or->getOperand(1).getReg()); + + Or->eraseFromParent(); + Xor->eraseFromParent(); + + Changed = true; } return Changed; @@ -736,9 +774,42 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { TRI = ST->getRegisterInfo(); TII = ST->getInstrInfo(); MRI = &MF.getRegInfo(); + Exec = TRI->getExec(); bool Changed = optimizeExecSequence(); - Changed |= optimizeVCmpxAndSaveexecSequence(); + + OrXors.clear(); + SaveExecVCmpMapping.clear(); + static unsigned SearchWindow = 10; + for (MachineBasicBlock &MBB : MF) { + unsigned SearchCount = 0; + + for (auto &MI : llvm::reverse(MBB)) { + if (MI.isDebugInstr()) + continue; + + if (SearchCount >= SearchWindow) { + break; + } + + tryRecordOrSaveexecXorSequence(MI); + tryRecordVCmpxAndSaveexecSequence(MI); + + if (MI.modifiesRegister(Exec, TRI)) { + break; + } + + ++SearchCount; + } + } + + Changed |= optimizeOrSaveexecXorSequences(); + for (const auto &Entry : SaveExecVCmpMapping) { + MachineInstr *SaveExecInstr = Entry.getFirst(); + MachineInstr *VCmpInstr = Entry.getSecond(); + + Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec); + } return Changed; } diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index 57dbad468de8..aed84437b890 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -184,6 +184,16 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) { if (isDefBetween(*TRI, LIS, CCReg, *Sel, *And)) return false; + // Cannot safely mirror live intervals with PHI nodes, so check for these + // before optimization. + SlotIndex SelIdx = LIS->getInstructionIndex(*Sel); + LiveInterval *SelLI = &LIS->getInterval(SelReg); + if (llvm::any_of(SelLI->vnis(), + [](const VNInfo *VNI) { + return VNI->isPHIDef(); + })) + return false; + // TODO: Guard against implicit def operands? 
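// The applyLiveRanges lambda introduced further down in this hunk copies the
// live segments of the v_cndmask result onto the condition register, clamped
// to the window between the select and the s_and. A rough integer model of
// that clamping (slot indices are plain ints here; a segment that ends at a
// block boundary is kept as-is so the range can still cover loop back-edges):

#include <vector>

struct Seg {
  int Start;
  int End;
  bool EndIsBlock; // segment ends exactly at a basic-block boundary
};

std::vector<Seg> clampSegments(const std::vector<Seg> &SelSegments, int SelIdx,
                               int AndIdx) {
  std::vector<Seg> Out;
  for (const Seg &S : SelSegments) {
    if (S.End <= SelIdx)
      continue; // entirely before the def being mirrored
    int Start = S.Start < SelIdx ? SelIdx : S.Start;
    int End = (S.End < AndIdx || S.EndIsBlock) ? S.End : AndIdx;
    Out.push_back({Start, End, S.EndIsBlock});
  }
  // The real code additionally extends the range from the compare to the
  // s_and when the select's interval stops early (the compare killed it).
  return Out;
}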
LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' << *Cmp << '\t' << *And); @@ -204,31 +214,34 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) { LLVM_DEBUG(dbgs() << "=>\n\t" << *Andn2 << '\n'); - SlotIndex CmpIdx = LIS->getInstructionIndex(*Cmp); - SlotIndex SelIdx = LIS->getInstructionIndex(*Sel); - - LiveInterval *CmpLI = - CmpReg.isVirtual() ? &LIS->getInterval(CmpReg) : nullptr; - LiveInterval *SelLI = - SelReg.isVirtual() ? &LIS->getInterval(SelReg) : nullptr; - // Update live intervals for CCReg before potentially removing CmpReg/SelReg, // and their associated liveness information. + SlotIndex CmpIdx = LIS->getInstructionIndex(*Cmp); if (CCReg.isVirtual()) { - // Note: this ignores that SelLI might have multiple internal values - // or splits and simply extends the live range to cover all cases - // where the result of the v_cndmask_b32 was live (e.g. loops). - // This could yield worse register allocation in rare edge cases. - SlotIndex EndIdx = AndIdx.getRegSlot(); - if (SelLI && SelLI->endIndex() > EndIdx && SelLI->endIndex().isBlock()) - EndIdx = SelLI->endIndex(); + // Apply live ranges from SelLI to CCReg potentially matching splits + // and extending to loop boundaries. + + auto applyLiveRanges = [&](LiveRange &Dst, VNInfo *VNI) { + // Copy live ranges from SelLI, adjusting start and end as required + auto DefSegment = SelLI->FindSegmentContaining(SelIdx.getRegSlot()); + assert(DefSegment != SelLI->end() && + "No live interval segment covering definition?"); + for (auto I = DefSegment; I != SelLI->end(); ++I) { + SlotIndex Start = I->start < SelIdx.getRegSlot() ? + SelIdx.getRegSlot() : I->start; + SlotIndex End = I->end < AndIdx.getRegSlot() || I->end.isBlock() ? + I->end : AndIdx.getRegSlot(); + Dst.addSegment(LiveRange::Segment(Start, End, VNI)); + } + // If SelLI does not cover AndIdx (because Cmp killed Sel) then extend. + if (!SelLI->getSegmentContaining(AndIdx.getRegSlot())) + Dst.addSegment(LiveRange::Segment(CmpIdx.getRegSlot(), AndIdx.getRegSlot(), VNI)); + }; LiveInterval &CCLI = LIS->getInterval(CCReg); auto CCQ = CCLI.Query(SelIdx.getRegSlot()); - if (CCQ.valueIn()) { - CCLI.addSegment(LiveRange::Segment(SelIdx.getRegSlot(), - EndIdx, CCQ.valueIn())); - } + if (CCQ.valueIn()) + applyLiveRanges(CCLI, CCQ.valueIn()); if (CC->getSubReg()) { LaneBitmask Mask = TRI->getSubRegIndexLaneMask(CC->getSubReg()); @@ -237,10 +250,8 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) { Allocator, Mask, [=](LiveInterval::SubRange &SR) { auto CCQS = SR.Query(SelIdx.getRegSlot()); - if (CCQS.valueIn()) { - SR.addSegment(LiveRange::Segment( - SelIdx.getRegSlot(), EndIdx, CCQS.valueIn())); - } + if (CCQS.valueIn()) + applyLiveRanges(SR, CCQS.valueIn()); }, *LIS->getSlotIndexes(), *TRI); CCLI.removeEmptySubRanges(); @@ -253,7 +264,8 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) { // Try to remove compare. Cmp value should not used in between of cmp // and s_and_b64 if VCC or just unused if any other register. - if ((CmpReg.isVirtual() && CmpLI && CmpLI->Query(AndIdx.getRegSlot()).isKill()) || + LiveInterval *CmpLI = CmpReg.isVirtual() ? 
&LIS->getInterval(CmpReg) : nullptr; + if ((CmpLI && CmpLI->Query(AndIdx.getRegSlot()).isKill()) || (CmpReg == Register(CondReg) && std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(), [&](const MachineInstr &MI) { @@ -266,18 +278,16 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) { Cmp->eraseFromParent(); // Try to remove v_cndmask_b32. - if (SelLI) { - // Kill status must be checked before shrinking the live range. - bool IsKill = SelLI->Query(CmpIdx.getRegSlot()).isKill(); - LIS->shrinkToUses(SelLI); - bool IsDead = SelLI->Query(SelIdx.getRegSlot()).isDeadDef(); - if (MRI->use_nodbg_empty(SelReg) && (IsKill || IsDead)) { - LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n'); - - LIS->removeVRegDefAt(*SelLI, SelIdx.getRegSlot()); - LIS->RemoveMachineInstrFromMaps(*Sel); - Sel->eraseFromParent(); - } + // Kill status must be checked before shrinking the live range. + bool IsKill = SelLI->Query(CmpIdx.getRegSlot()).isKill(); + LIS->shrinkToUses(SelLI); + bool IsDead = SelLI->Query(SelIdx.getRegSlot()).isDeadDef(); + if (MRI->use_nodbg_empty(SelReg) && (IsKill || IsDead)) { + LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n'); + + LIS->removeVRegDefAt(*SelLI, SelIdx.getRegSlot()); + LIS->RemoveMachineInstrFromMaps(*Sel); + Sel->eraseFromParent(); } } diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h index b13afceba20e..553fb4cf496c 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h @@ -49,6 +49,8 @@ struct SIProgramInfo { uint32_t AccumOffset = 0; uint32_t TgSplit = 0; uint32_t NumSGPR = 0; + unsigned SGPRSpill = 0; + unsigned VGPRSpill = 0; uint32_t LDSSize = 0; bool FlatUsed = false; diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 882d13402a19..b7e8eadfe71d 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -119,13 +119,19 @@ class SM_Probe_Pseudo <string opName, string variant, RegisterClass baseClass, let PseudoInstr = opName # variant; } -class SM_Load_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> - : SM_Pseudo<opName, outs, ins, asmOps, pattern> { - RegisterClass BaseClass; +class SM_Load_Pseudo <string opName, RegisterClass baseClass, + RegisterClass dstClass, OffsetMode offsets> + : SM_Pseudo<opName, (outs dstClass:$sdst), + !con((ins baseClass:$sbase), offsets.Ins, (ins CPol:$cpol)), + " $sdst, $sbase, " # offsets.Asm # "$cpol", []> { + RegisterClass BaseClass = baseClass; let mayLoad = 1; let mayStore = 0; let has_glc = 1; let has_dlc = 1; + let has_offset = offsets.HasOffset; + let has_soffset = offsets.HasSOffset; + let PseudoInstr = opName # offsets.Variant; } class SM_Store_Pseudo <string opName, RegisterClass baseClass, @@ -158,40 +164,9 @@ class SM_Discard_Pseudo <string opName, string variant, dag offsets, multiclass SM_Pseudo_Loads<string opName, RegisterClass baseClass, RegisterClass dstClass> { - def _IMM : SM_Load_Pseudo <opName, - (outs dstClass:$sdst), - (ins baseClass:$sbase, i32imm:$offset, CPol:$cpol), - " $sdst, $sbase, $offset$cpol", []> { - let has_offset = 1; - let BaseClass = baseClass; - let PseudoInstr = opName # "_IMM"; - let has_glc = 1; - let has_dlc = 1; - } - - def _SGPR : SM_Load_Pseudo <opName, - (outs dstClass:$sdst), - (ins baseClass:$sbase, SReg_32:$soffset, CPol:$cpol), - " $sdst, $sbase, $soffset$cpol", []> { - let has_soffset = 1; - let BaseClass = baseClass; - let PseudoInstr 
= opName # "_SGPR"; - let has_glc = 1; - let has_dlc = 1; - } - - def _SGPR_IMM : SM_Load_Pseudo <opName, - (outs dstClass:$sdst), - (ins baseClass:$sbase, SReg_32:$soffset, - i32imm:$offset, CPol:$cpol), - " $sdst, $sbase, $soffset$offset$cpol", []> { - let has_offset = 1; - let has_soffset = 1; - let BaseClass = baseClass; - let PseudoInstr = opName # "_SGPR_IMM"; - let has_glc = 1; - let has_dlc = 1; - } + def _IMM : SM_Load_Pseudo <opName, baseClass, dstClass, IMM_Offset>; + def _SGPR : SM_Load_Pseudo <opName, baseClass, dstClass, SGPR_Offset>; + def _SGPR_IMM : SM_Load_Pseudo <opName, baseClass, dstClass, SGPR_IMM_Offset>; } multiclass SM_Pseudo_Stores<string opName, @@ -596,10 +571,10 @@ class SMEM_Real_vi <bits<8> op, SM_Pseudo ps> soffset{6-0}, ?); } -class SMEM_Real_Load_vi<bits<8> op, string ps, dag offsets> - : SMEM_Real_vi<op, !cast<SM_Pseudo>(ps)> { - RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps).BaseClass; - let InOperandList = !con((ins BaseClass:$sbase), offsets, (ins CPol:$cpol)); +class SMEM_Real_Load_vi<bits<8> op, string ps, OffsetMode offsets> + : SMEM_Real_vi<op, !cast<SM_Pseudo>(ps # offsets.Variant)> { + RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass; + let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol)); } // The alternative GFX9 SGPR encoding using soffset to encode the @@ -614,14 +589,12 @@ class SMEM_Real_SGPR_alt_gfx9 { } multiclass SM_Real_Loads_vi<bits<8> op, string ps> { - def _IMM_vi : SMEM_Real_Load_vi <op, ps#"_IMM", (ins smem_offset:$offset)>; - def _SGPR_vi : SMEM_Real_Load_vi <op, ps#"_SGPR", (ins SReg_32:$soffset)>; - def _SGPR_alt_gfx9 : SMEM_Real_Load_vi <op, ps#"_SGPR", - (ins SReg_32:$soffset)>, + def _IMM_vi : SMEM_Real_Load_vi <op, ps, IMM_Offset>; + def _SGPR_vi : SMEM_Real_Load_vi <op, ps, SGPR_Offset>; + def _SGPR_alt_gfx9 : SMEM_Real_Load_vi <op, ps, SGPR_Offset>, SMEM_Real_SGPR_alt_gfx9; let IsGFX9SpecificEncoding = true in - def _SGPR_IMM_gfx9 : SMEM_Real_Load_vi < - op, ps#"_SGPR_IMM", (ins SReg_32:$soffset, smem_offset_mod:$offset)>; + def _SGPR_IMM_gfx9 : SMEM_Real_Load_vi <op, ps, SGPR_IMM_Offset>; } class SMEM_Real_Store_Base_vi <bits<8> op, SM_Pseudo ps> : SMEM_Real_vi <op, ps> { @@ -883,6 +856,7 @@ def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformL def SMRDImm : ComplexPattern<iPTR, 2, "SelectSMRDImm">; def SMRDImm32 : ComplexPattern<iPTR, 2, "SelectSMRDImm32">; def SMRDSgpr : ComplexPattern<iPTR, 2, "SelectSMRDSgpr">; +def SMRDSgprImm : ComplexPattern<iPTR, 3, "SelectSMRDSgprImm">; def SMRDBufferImm : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">; def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">; @@ -903,11 +877,18 @@ multiclass SMRD_Pattern <string Instr, ValueType vt> { // 3. SGPR offset def : GCNPat < - (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)), - (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0)) + (smrd_load (SMRDSgpr i64:$sbase, i32:$soffset)), + (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $soffset, 0)) >; - // 4. No offset + // 4. SGPR+IMM offset + def : GCNPat < + (smrd_load (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)), + (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))> { + let OtherPredicates = [isGFX9Plus]; + } + + // 5. 
No offset def : GCNPat < (vt (smrd_load (i64 SReg_64:$sbase))), (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0)) @@ -1021,19 +1002,16 @@ class SMEM_Real_gfx10<bits<8> op, SM_Pseudo ps> let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ?); } -multiclass SM_Real_Loads_gfx10<bits<8> op, string ps, - SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM), - SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> { - def _IMM_gfx10 : SMEM_Real_gfx10<op, immPs> { - let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol); - } - def _SGPR_gfx10 : SMEM_Real_gfx10<op, sgprPs> { - let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$soffset, CPol:$cpol); - } - def _SGPR_IMM_gfx10 : SMEM_Real_gfx10<op, !cast<SM_Load_Pseudo>(ps#_SGPR_IMM)> { - let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$soffset, - smem_offset_mod:$offset, CPol:$cpol); - } +class SMEM_Real_Load_gfx10<bits<8> op, string ps, OffsetMode offsets> + : SMEM_Real_gfx10<op, !cast<SM_Pseudo>(ps # offsets.Variant)> { + RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass; + let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol)); +} + +multiclass SM_Real_Loads_gfx10<bits<8> op, string ps> { + def _IMM_gfx10 : SMEM_Real_Load_gfx10<op, ps, IMM_Offset>; + def _SGPR_gfx10 : SMEM_Real_Load_gfx10<op, ps, SGPR_Offset>; + def _SGPR_IMM_gfx10 : SMEM_Real_Load_gfx10<op, ps, SGPR_IMM_Offset>; } class SMEM_Real_Store_gfx10<bits<8> op, SM_Pseudo ps> : SMEM_Real_gfx10<op, ps> { @@ -1227,17 +1205,16 @@ class SMEM_Real_gfx11<bits<8> op, SM_Pseudo ps, string opName = ps.Mnemonic> : let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, 0); } -class SMEM_Real_Load_gfx11<bits<8> op, string ps, string opName, dag offsets> : - SMEM_Real_gfx11<op, !cast<SM_Pseudo>(ps), opName> { - RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps).BaseClass; - let InOperandList = !con((ins BaseClass:$sbase), offsets, (ins CPol:$cpol)); +class SMEM_Real_Load_gfx11<bits<8> op, string ps, string opName, OffsetMode offsets> : + SMEM_Real_gfx11<op, !cast<SM_Pseudo>(ps # offsets.Variant), opName> { + RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass; + let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol)); } multiclass SM_Real_Loads_gfx11<bits<8> op, string ps, string opName> { - def _IMM_gfx11 : SMEM_Real_Load_gfx11<op, ps#"_IMM", opName, (ins smem_offset:$offset)>; - def _SGPR_gfx11 : SMEM_Real_Load_gfx11<op, ps#"_SGPR", opName, (ins SReg_32:$soffset)>; - def _SGPR_IMM_gfx11 : SMEM_Real_Load_gfx11< - op, ps#"_SGPR_IMM", opName, (ins SReg_32:$soffset, smem_offset_mod:$offset)>; + def _IMM_gfx11 : SMEM_Real_Load_gfx11<op, ps, opName, IMM_Offset>; + def _SGPR_gfx11 : SMEM_Real_Load_gfx11<op, ps, opName, SGPR_Offset>; + def _SGPR_IMM_gfx11 : SMEM_Real_Load_gfx11<op, ps, opName, SGPR_IMM_Offset>; def : MnemonicAlias<!cast<SM_Pseudo>(ps#"_IMM").Mnemonic, opName>, Requires<[isGFX11Plus]>; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 2f334e211181..b5fb390c08e1 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -417,9 +417,9 @@ bool getMAIIsGFX940XDL(unsigned Opc) { CanBeVOPD getCanBeVOPD(unsigned Opc) { const VOPDComponentInfo *Info = getVOPDComponentHelper(Opc); if (Info) - return {Info->CanBeVOPDX, 1}; + return {Info->CanBeVOPDX, true}; else - return {0, 0}; + return {false, false}; } unsigned 
getVOPDOpcode(unsigned Opc) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h index 65ed02ca62de..a2d59abd3abb 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h @@ -30,7 +30,7 @@ namespace AMDGPU { Align getAlign(DataLayout const &DL, const GlobalVariable *GV); std::vector<GlobalVariable *> findVariablesToLower(Module &M, - const Function *F = nullptr); + const Function *F); /// Replace all uses of constant \p C with instructions in \p F. void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F); diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 1d374a9f90ba..73e4eb8cdc24 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -499,6 +499,59 @@ let SubtargetPredicate = isGFX9Only in { defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>; } // End SubtargetPredicate = isGFX9Only +class VOPProfile_Base_CVT_F32_F8<ValueType vt> : VOPProfileI2F <vt, i32> { + let HasExtSDWA = 1; + let HasExtSDWA9 = 1; + let HasExt = 1; + let DstRCSDWA = getVALUDstForVT<vt>.ret; + let InsSDWA = (ins Bin32SDWAInputMods:$src0_modifiers, Src0SDWA:$src0, + clampmod:$clamp, omod:$omod, src0_sel:$src0_sel); + let AsmSDWA = "$vdst, $src0_modifiers$clamp$omod $src0_sel"; // No dst_sel + let AsmSDWA9 = AsmSDWA; + let EmitDstSel = 0; +} + +def VOPProfileCVT_F32_F8 : VOPProfile_Base_CVT_F32_F8 <f32>; +def VOPProfileCVT_PK_F32_F8 : VOPProfile_Base_CVT_F32_F8 <v2f32>; + +let SubtargetPredicate = HasFP8Insts, mayRaiseFPException = 0, + SchedRW = [WriteFloatCvt] in { + defm V_CVT_F32_FP8 : VOP1Inst<"v_cvt_f32_fp8", VOPProfileCVT_F32_F8>; + defm V_CVT_F32_BF8 : VOP1Inst<"v_cvt_f32_bf8", VOPProfileCVT_F32_F8>; + defm V_CVT_PK_F32_FP8 : VOP1Inst<"v_cvt_pk_f32_fp8", VOPProfileCVT_PK_F32_F8>; + defm V_CVT_PK_F32_BF8 : VOP1Inst<"v_cvt_pk_f32_bf8", VOPProfileCVT_PK_F32_F8>; +} + +class Cvt_F32_F8_Pat<SDPatternOperator node, int index, + VOP1_Pseudo inst_e32, VOP1_SDWA_Pseudo inst_sdwa> : GCNPat< + (f32 (node i32:$src, index)), + !if (index, + (inst_sdwa 0, $src, 0, 0, index), + (inst_e32 $src)) +>; + +foreach Index = [0, 1, 2, 3] in { + def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_fp8, Index, + V_CVT_F32_FP8_e32, V_CVT_F32_FP8_sdwa>; + def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_bf8, Index, + V_CVT_F32_BF8_e32, V_CVT_F32_BF8_sdwa>; +} + +class Cvt_PK_F32_F8_Pat<SDPatternOperator node, int index, + VOP1_Pseudo inst_e32, VOP1_SDWA_Pseudo inst_sdwa> : GCNPat< + (v2f32 (node i32:$src, index)), + !if (index, + (inst_sdwa 0, $src, 0, 0, SDWA.WORD_1), + (inst_e32 $src)) +>; + +foreach Index = [0, -1] in { + def : Cvt_PK_F32_F8_Pat<int_amdgcn_cvt_pk_f32_fp8, Index, + V_CVT_PK_F32_FP8_e32, V_CVT_PK_F32_FP8_sdwa>; + def : Cvt_PK_F32_F8_Pat<int_amdgcn_cvt_pk_f32_bf8, Index, + V_CVT_PK_F32_BF8_e32, V_CVT_PK_F32_BF8_sdwa>; +} + let SubtargetPredicate = isGFX10Plus in { defm V_PIPEFLUSH : VOP1Inst<"v_pipeflush", VOP_NO_EXT<VOP_NONE>>; @@ -1106,11 +1159,36 @@ multiclass VOP1_Real_gfx9 <bits<10> op> { } +multiclass VOP1_Real_NoDstSel_SDWA_gfx9 <bits<10> op> { + let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in { + defm NAME : VOP1_Real_e32e64_vi <op>; + } + + foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in + def _sdwa_gfx9 : + VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, + VOP1_SDWA9Ae <op{7-0}, 
!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> { + let Inst{42-40} = 6; + } + + foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_gfx9 : + VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>, + VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>; +} + defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>; let AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX9" in defm V_MOV_B64 : VOP1_Real_gfx9 <0x38>; +let OtherPredicates = [HasFP8Insts] in { +defm V_CVT_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x54>; +defm V_CVT_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x55>; +defm V_CVT_PK_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x56>; +defm V_CVT_PK_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x57>; +} + //===----------------------------------------------------------------------===// // GFX10 //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index dddd0aacc140..a911483cade5 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -481,6 +481,30 @@ def shl_0_to_4 : PatFrag< }]; } +def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> { + let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0, + FP32InputMods:$src1_modifiers, Src1RC64:$src1, + VGPR_32:$vdst_in, op_sel0:$op_sel); + let HasClamp = 0; + let HasExtVOP3DPP = 0; +} + +def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>, + VOP3_OPSEL> { + let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0, + FP32InputMods:$src1_modifiers, Src1RC64:$src1, + FP32InputMods:$src2_modifiers, VGPR_32:$src2, + op_sel0:$op_sel); + let HasClamp = 0; + let HasSrc2 = 0; + let HasSrc2Mods = 1; + let AsmVOP3OpSel = !subst(", $src2_modifiers", "", + getAsmVOP3OpSel<3, HasClamp, + HasSrc0FloatMods, HasSrc1FloatMods, + HasSrc2FloatMods>.ret); + let HasExtVOP3DPP = 0; +} + let SubtargetPredicate = isGFX9Plus in { let isCommutable = 1, isReMaterializable = 1 in { defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; @@ -526,6 +550,43 @@ defm V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32 let SubtargetPredicate = isGFX940Plus in defm V_LSHL_ADD_U64 : VOP3Inst <"v_lshl_add_u64", VOP3_Profile<VOP_I64_I64_I32_I64>>; +let SubtargetPredicate = HasFP8Insts, mayRaiseFPException = 0, + SchedRW = [WriteFloatCvt] in { + let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in { + defm V_CVT_PK_FP8_F32 : VOP3Inst<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile>; + defm V_CVT_PK_BF8_F32 : VOP3Inst<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile>; + } + + // These instructions have non-standard use of op_sel. In particular they are + // using op_sel bits 2 and 3 while only having two sources. Therefore dummy + // src2 is used to hold the op_sel value. 
+ let Constraints = "$vdst = $src2", DisableEncoding = "$src2" in { + defm V_CVT_SR_FP8_F32 : VOP3Inst<"v_cvt_sr_fp8_f32", VOP3_CVT_SR_F8_F32_Profile>; + defm V_CVT_SR_BF8_F32 : VOP3Inst<"v_cvt_sr_bf8_f32", VOP3_CVT_SR_F8_F32_Profile>; + } +} + +class Cvt_PK_F8_F32_Pat<SDPatternOperator node, int index, VOP3_Pseudo inst> : GCNPat< + (i32 (node f32:$src0, f32:$src1, i32:$old, index)), + (inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, !if(index, SRCMODS.OP_SEL_0, 0)) +>; + +class Cvt_SR_F8_F32_Pat<SDPatternOperator node, bits<2> index, VOP3_Pseudo inst> : GCNPat< + (i32 (node f32:$src0, i32:$src1, i32:$old, index)), + (inst !if(index{1}, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, + !if(index{0}, SRCMODS.OP_SEL_0, 0), $old, !if(index{1}, SRCMODS.OP_SEL_0, 0)) +>; + +foreach Index = [0, -1] in { + def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_e64>; + def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_e64>; +} + +foreach Index = [0, 1, 2, 3] in { + def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_fp8_f32, Index, V_CVT_SR_FP8_F32_e64>; + def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_bf8_f32, Index, V_CVT_SR_BF8_F32_e64>; +} + class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat < // This matches (op2 (op1 i32:$src0, i32:$src1), i32:$src2) with conditions. (ThreeOpFrag<op1, op2> i32:$src0, i32:$src1, i32:$src2), @@ -699,15 +760,19 @@ def : DivFmasPat<f64, V_DIV_FMAS_F64_e64, VCC_LO>; } class VOP3_DOT_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3_Profile<P, Features> { - // FIXME VOP3 DPP versions are unsupported - let HasExtVOP3DPP = 0; let HasClamp = 0; let HasOMod = 0; - let InsVOP3OpSel = getInsVOP3OpSel<Src0RC64, Src1RC64, Src2RC64, - NumSrcArgs, HasClamp, HasOMod, - !if(isFloatType<Src0VT>.ret, FPVRegInputMods, IntOpSelMods), - !if(isFloatType<Src1VT>.ret, FPVRegInputMods, IntOpSelMods), - !if(isFloatType<Src2VT>.ret, FPVRegInputMods, IntOpSelMods)>.ret; + // Override modifiers for bf16(i16) (same as float modifiers). 
+ let HasSrc0Mods = 1; + let HasSrc1Mods = 1; + let HasSrc2Mods = 1; + let Src0ModDPP = FPVRegInputMods; + let Src1ModDPP = FPVRegInputMods; + let Src2ModVOP3DPP = FPVRegInputMods; + let InsVOP3OpSel = getInsVOP3OpSel<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, + HasClamp, HasOMod, FPVRegInputMods, + FPVRegInputMods, FPVRegInputMods>.ret; + let AsmVOP3OpSel = getAsmVOP3OpSel<NumSrcArgs, HasClamp, 1, 1, 1>.ret; } let SubtargetPredicate = isGFX11Plus in { @@ -723,7 +788,7 @@ let SubtargetPredicate = isGFX11Plus in { defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>; } // End SubtargetPredicate = isGFX11Plus -let SubtargetPredicate = HasDot8Insts in { +let SubtargetPredicate = HasDot8Insts, IsDOT=1 in { defm V_DOT2_F16_F16 : VOP3Inst<"v_dot2_f16_f16", VOP3_DOT_Profile<VOP_F16_V2F16_V2F16_F16>, int_amdgcn_fdot2_f16_f16>; defm V_DOT2_BF16_BF16 : VOP3Inst<"v_dot2_bf16_bf16", VOP3_DOT_Profile<VOP_I16_V2I16_V2I16_I16>, int_amdgcn_fdot2_bf16_bf16>; } @@ -848,9 +913,8 @@ defm V_MAXMIN_U32 : VOP3_Realtriple_gfx11<0x262>; defm V_MINMAX_U32 : VOP3_Realtriple_gfx11<0x263>; defm V_MAXMIN_I32 : VOP3_Realtriple_gfx11<0x264>; defm V_MINMAX_I32 : VOP3_Realtriple_gfx11<0x265>; -// FIXME VOP3 DPP Dot instructions are unsupported -defm V_DOT2_F16_F16 : VOP3_Real_Base_gfx11<0x266>; -defm V_DOT2_BF16_BF16 : VOP3_Real_Base_gfx11<0x267>; +defm V_DOT2_F16_F16 : VOP3Dot_Realtriple_gfx11<0x266>; +defm V_DOT2_BF16_BF16 : VOP3Dot_Realtriple_gfx11<0x267>; defm V_DIV_SCALE_F32 : VOP3be_Real_gfx11<0x2fc, "V_DIV_SCALE_F32", "v_div_scale_f32">; defm V_DIV_SCALE_F64 : VOP3be_Real_gfx11<0x2fd, "V_DIV_SCALE_F64", "v_div_scale_f64">; defm V_MAD_U64_U32_gfx11 : VOP3be_Real_gfx11<0x2fe, "V_MAD_U64_U32_gfx11", "v_mad_u64_u32">; @@ -1161,6 +1225,13 @@ multiclass VOP3OpSel_Real_gfx9<bits<10> op> { VOP3OpSel_gfx9 <op, !cast<VOP_Pseudo>(NAME#"_e64").Pfl>; } +multiclass VOP3OpSel_Real_gfx9_forced_opsel2<bits<10> op> { + def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>, + VOP3OpSel_gfx9 <op, !cast<VOP_Pseudo>(NAME#"_e64").Pfl> { + let Inst{13} = src2_modifiers{2}; // op_sel(2) + } +} + multiclass VOP3Interp_Real_vi<bits<10> op> { def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.VI>, VOP3Interp_vi <op, !cast<VOP_Pseudo>(NAME).Pfl>; @@ -1352,3 +1423,10 @@ defm V_CVT_PKNORM_I16_F16 : VOP3OpSel_Real_gfx9 <0x299>; defm V_CVT_PKNORM_U16_F16 : VOP3OpSel_Real_gfx9 <0x29a>; defm V_LSHL_ADD_U64 : VOP3_Real_vi <0x208>; + +let OtherPredicates = [HasFP8Insts] in { +defm V_CVT_PK_FP8_F32 : VOP3OpSel_Real_gfx9 <0x2a2>; +defm V_CVT_PK_BF8_F32 : VOP3OpSel_Real_gfx9 <0x2a3>; +defm V_CVT_SR_FP8_F32 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x2a4>; +defm V_CVT_SR_BF8_F32 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x2a5>; +} diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 59ce532af59b..f1ce613d613b 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -493,6 +493,8 @@ def VOPProfileMAI_I32_I64_X16 : VOPProfileMAI<VOP_V4I32_I64_I64_V4I32, A def VOPProfileMAI_I32_I64_X32 : VOPProfileMAI<VOP_V16I32_I64_I64_V16I32, AISrc_512_b32, ADst_512, AVSrc_64>; def VOPProfileMAI_F32_V2F32_X16 : VOPProfileMAI<VOP_V4F32_V2F32_V2F32_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>; def VOPProfileMAI_F32_V2F32_X32 : VOPProfileMAI<VOP_V16F32_V2F32_V2F32_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>; +def VOPProfileMAI_F32_I64_X32 : VOPProfileMAI<VOP_V4F32_I64_I64_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>; +def 
VOPProfileMAI_F32_I64_X16 : VOPProfileMAI<VOP_V16F32_I64_I64_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>; def VOPProfileMAI_F32_F32_X4_VCD : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32, VISrc_128_f32, VDst_128>; def VOPProfileMAI_F32_F32_X16_VCD : VOPProfileMAI<VOP_V16F32_F32_F32_V16F32, VISrc_512_f32, VDst_512>; @@ -515,6 +517,8 @@ def VOPProfileMAI_I32_I64_X16_VCD : VOPProfileMAI<VOP_V4I32_I64_I64_V4I32, def VOPProfileMAI_I32_I64_X32_VCD : VOPProfileMAI<VOP_V16I32_I64_I64_V16I32, VISrc_512_b32, VDst_512, AVSrc_64>; def VOPProfileMAI_F32_V2F32_X16_VCD : VOPProfileMAI<VOP_V4F32_V2F32_V2F32_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>; def VOPProfileMAI_F32_V2F32_X32_VCD : VOPProfileMAI<VOP_V16F32_V2F32_V2F32_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>; +def VOPProfileMAI_F32_I64_X32_VCD : VOPProfileMAI<VOP_V4F32_I64_I64_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>; +def VOPProfileMAI_F32_I64_X16_VCD : VOPProfileMAI<VOP_V16F32_I64_I64_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>; def VOPProfileSMFMAC_F32_16X16X32_F16 : VOPProfileSMFMAC<VOP_V4F32_V4F16_V8F16_I32, AVDst_128, AVSrc_64, AVSrc_128>; def VOPProfileSMFMAC_F32_32X32X16_F16 : VOPProfileSMFMAC<VOP_V16F32_V4F16_V8F16_I32, AVDst_512, AVSrc_64, AVSrc_128>; @@ -522,6 +526,8 @@ def VOPProfileSMFMAC_F32_16X16X32_I16 : VOPProfileSMFMAC<VOP_V4F32_V4I16_V8I16_I def VOPProfileSMFMAC_F32_32X32X16_I16 : VOPProfileSMFMAC<VOP_V16F32_V4I16_V8I16_I32, AVDst_512, AVSrc_64, AVSrc_128>; def VOPProfileSMFMAC_I32_16X16X64_I8 : VOPProfileSMFMAC<VOP_V4I32_V2I32_V4I32_I32, AVDst_128, AVSrc_64, AVSrc_128>; def VOPProfileSMFMAC_I32_32X32X32_I8 : VOPProfileSMFMAC<VOP_V16I32_V2I32_V4I32_I32, AVDst_512, AVSrc_64, AVSrc_128>; +def VOPProfileSMFMAC_F32_16X16X64_F8 : VOPProfileSMFMAC<VOP_V4F32_V2I32_V4I32_I32, AVDst_128, AVSrc_64, AVSrc_128>; +def VOPProfileSMFMAC_F32_32X32X32_F8 : VOPProfileSMFMAC<VOP_V16F32_V2I32_V4I32_I32, AVDst_512, AVSrc_64, AVSrc_128>; class MFMATable <bit is_mac, string Name> { bit IsMac = is_mac; @@ -638,6 +644,14 @@ let Predicates = [isGFX940Plus], is_gfx940_xdl = 1 in { defm V_MFMA_I32_16X16X32I8 : MAIInst<"v_mfma_i32_16x16x32i8", "I32_I64_X16", int_amdgcn_mfma_i32_16x16x32_i8>; defm V_MFMA_F32_16X16X8XF32 : MAIInst<"v_mfma_f32_16x16x8xf32", "F32_V2F32_X16", int_amdgcn_mfma_f32_16x16x8_xf32>; defm V_MFMA_F32_32X32X4XF32 : MAIInst<"v_mfma_f32_32x32x4xf32", "F32_V2F32_X32", int_amdgcn_mfma_f32_32x32x4_xf32>; + defm V_MFMA_F32_16X16X32_BF8_BF8 : MAIInst<"v_mfma_f32_16x16x32_bf8_bf8", "F32_I64_X32", int_amdgcn_mfma_f32_16x16x32_bf8_bf8>; + defm V_MFMA_F32_16X16X32_BF8_FP8 : MAIInst<"v_mfma_f32_16x16x32_bf8_fp8", "F32_I64_X32", int_amdgcn_mfma_f32_16x16x32_bf8_fp8>; + defm V_MFMA_F32_16X16X32_FP8_BF8 : MAIInst<"v_mfma_f32_16x16x32_fp8_bf8", "F32_I64_X32", int_amdgcn_mfma_f32_16x16x32_fp8_bf8>; + defm V_MFMA_F32_16X16X32_FP8_FP8 : MAIInst<"v_mfma_f32_16x16x32_fp8_fp8", "F32_I64_X32", int_amdgcn_mfma_f32_16x16x32_fp8_fp8>; + defm V_MFMA_F32_32X32X16_BF8_BF8 : MAIInst<"v_mfma_f32_32x32x16_bf8_bf8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_bf8_bf8>; + defm V_MFMA_F32_32X32X16_BF8_FP8 : MAIInst<"v_mfma_f32_32x32x16_bf8_fp8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_bf8_fp8>; + defm V_MFMA_F32_32X32X16_FP8_BF8 : MAIInst<"v_mfma_f32_32x32x16_fp8_bf8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_fp8_bf8>; + defm V_MFMA_F32_32X32X16_FP8_FP8 : MAIInst<"v_mfma_f32_32x32x16_fp8_fp8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_fp8_fp8>; } // End Predicates = [isGFX940Plus], is_gfx940_xdl = 1 multiclass SMFMACInst<string OpName, string P, SDPatternOperator node> { 
@@ -654,6 +668,14 @@ defm V_SMFMAC_F32_16X16X32_BF16 : SMFMACInst<"v_smfmac_f32_16x16x32_bf16", defm V_SMFMAC_F32_32X32X16_BF16 : SMFMACInst<"v_smfmac_f32_32x32x16_bf16", "F32_32X32X16_I16", int_amdgcn_smfmac_f32_32x32x16_bf16>; defm V_SMFMAC_I32_16X16X64_I8 : SMFMACInst<"v_smfmac_i32_16x16x64_i8", "I32_16X16X64_I8", int_amdgcn_smfmac_i32_16x16x64_i8>; defm V_SMFMAC_I32_32X32X32_I8 : SMFMACInst<"v_smfmac_i32_32x32x32_i8", "I32_32X32X32_I8", int_amdgcn_smfmac_i32_32x32x32_i8>; +defm V_SMFMAC_F32_16X16X64_BF8_BF8 : SMFMACInst<"v_smfmac_f32_16x16x64_bf8_bf8", "F32_16X16X64_F8", int_amdgcn_smfmac_f32_16x16x64_bf8_bf8>; +defm V_SMFMAC_F32_16X16X64_BF8_FP8 : SMFMACInst<"v_smfmac_f32_16x16x64_bf8_fp8", "F32_16X16X64_F8", int_amdgcn_smfmac_f32_16x16x64_bf8_fp8>; +defm V_SMFMAC_F32_16X16X64_FP8_BF8 : SMFMACInst<"v_smfmac_f32_16x16x64_fp8_bf8", "F32_16X16X64_F8", int_amdgcn_smfmac_f32_16x16x64_fp8_bf8>; +defm V_SMFMAC_F32_16X16X64_FP8_FP8 : SMFMACInst<"v_smfmac_f32_16x16x64_fp8_fp8", "F32_16X16X64_F8", int_amdgcn_smfmac_f32_16x16x64_fp8_fp8>; +defm V_SMFMAC_F32_32X32X32_BF8_BF8 : SMFMACInst<"v_smfmac_f32_32x32x32_bf8_bf8", "F32_32X32X32_F8", int_amdgcn_smfmac_f32_32x32x32_bf8_bf8>; +defm V_SMFMAC_F32_32X32X32_BF8_FP8 : SMFMACInst<"v_smfmac_f32_32x32x32_bf8_fp8", "F32_32X32X32_F8", int_amdgcn_smfmac_f32_32x32x32_bf8_fp8>; +defm V_SMFMAC_F32_32X32X32_FP8_BF8 : SMFMACInst<"v_smfmac_f32_32x32x32_fp8_bf8", "F32_32X32X32_F8", int_amdgcn_smfmac_f32_32x32x32_fp8_bf8>; +defm V_SMFMAC_F32_32X32X32_FP8_FP8 : SMFMACInst<"v_smfmac_f32_32x32x32_fp8_fp8", "F32_32X32X32_F8", int_amdgcn_smfmac_f32_32x32x32_fp8_fp8>; } def MAIInstInfoTable : GenericTable { @@ -1121,6 +1143,14 @@ defm V_MFMA_I32_32X32X16I8 : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x defm V_MFMA_I32_16X16X32I8 : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">; defm V_MFMA_F32_16X16X8XF32 : VOP3P_Real_MFMA_gfx940 <0x3e, "v_mfma_f32_16x16x8_xf32">; defm V_MFMA_F32_32X32X4XF32 : VOP3P_Real_MFMA_gfx940 <0x3f, "v_mfma_f32_32x32x4_xf32">; +defm V_MFMA_F32_16X16X32_BF8_BF8 : VOP3P_Real_MFMA_gfx940 <0x70>; +defm V_MFMA_F32_16X16X32_BF8_FP8 : VOP3P_Real_MFMA_gfx940 <0x71>; +defm V_MFMA_F32_16X16X32_FP8_BF8 : VOP3P_Real_MFMA_gfx940 <0x72>; +defm V_MFMA_F32_16X16X32_FP8_FP8 : VOP3P_Real_MFMA_gfx940 <0x73>; +defm V_MFMA_F32_32X32X16_BF8_BF8 : VOP3P_Real_MFMA_gfx940 <0x74>; +defm V_MFMA_F32_32X32X16_BF8_FP8 : VOP3P_Real_MFMA_gfx940 <0x75>; +defm V_MFMA_F32_32X32X16_FP8_BF8 : VOP3P_Real_MFMA_gfx940 <0x76>; +defm V_MFMA_F32_32X32X16_FP8_FP8 : VOP3P_Real_MFMA_gfx940 <0x77>; defm V_MFMA_F32_32X32X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5d, "v_mfma_f32_32x32x4_2b_bf16">; defm V_MFMA_F32_16X16X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5e, "v_mfma_f32_16x16x4_4b_bf16">; @@ -1137,6 +1167,14 @@ defm V_SMFMAC_F32_16X16X32_BF16 : VOP3P_Real_SMFMAC <0x66, "v_smfmac_f32_16x1 defm V_SMFMAC_F32_32X32X16_BF16 : VOP3P_Real_SMFMAC <0x68, "v_smfmac_f32_32x32x16bf16">; defm V_SMFMAC_I32_16X16X64_I8 : VOP3P_Real_SMFMAC <0x6a, "v_smfmac_i32_16x16x64i8">; defm V_SMFMAC_I32_32X32X32_I8 : VOP3P_Real_SMFMAC <0x6c, "v_smfmac_i32_32x32x32i8">; +defm V_SMFMAC_F32_16X16X64_BF8_BF8 : VOP3P_Real_SMFMAC <0x78, "v_smfmac_f32_16x16x64bf8bf8">; +defm V_SMFMAC_F32_16X16X64_BF8_FP8 : VOP3P_Real_SMFMAC <0x79, "v_smfmac_f32_16x16x64bf8fp8">; +defm V_SMFMAC_F32_16X16X64_FP8_BF8 : VOP3P_Real_SMFMAC <0x7a, "v_smfmac_f32_16x16x64fp8bf8">; +defm V_SMFMAC_F32_16X16X64_FP8_FP8 : VOP3P_Real_SMFMAC <0x7b, "v_smfmac_f32_16x16x64fp8fp8">; +defm V_SMFMAC_F32_32X32X32_BF8_BF8 : VOP3P_Real_SMFMAC <0x7c, 
"v_smfmac_f32_32x32x32bf8bf8">; +defm V_SMFMAC_F32_32X32X32_BF8_FP8 : VOP3P_Real_SMFMAC <0x7d, "v_smfmac_f32_32x32x32bf8fp8">; +defm V_SMFMAC_F32_32X32X32_FP8_BF8 : VOP3P_Real_SMFMAC <0x7e, "v_smfmac_f32_32x32x32fp8bf8">; +defm V_SMFMAC_F32_32X32X32_FP8_FP8 : VOP3P_Real_SMFMAC <0x7f, "v_smfmac_f32_32x32x32fp8fp8">; let SubtargetPredicate = HasPackedFP32Ops in { defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>; diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 33d3441e94c2..d489a089ac78 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -59,15 +59,17 @@ class VOPC_Profile<list<SchedReadWrite> sched, ValueType vt0, ValueType vt1 = vt "$src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl"); let AsmDPP8 = "$src0, $src1 $dpp8$fi"; let AsmDPP16 = AsmDPP#"$fi"; + // VOPC DPP Instructions do not need an old operand + let TieRegDPP = ""; let InsDPP = getInsDPP<VOPDstOperand<Src0DPP>, Src0DPP, Src1DPP, Src2DPP, NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP, - Src2ModDPP>.ret; + Src2ModDPP, 0/*HasOld*/>.ret; let InsDPP16 = getInsDPP16<VOPDstOperand<Src0DPP>, Src0DPP, Src1DPP, Src2DPP, NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP, - Src2ModDPP>.ret; + Src2ModDPP, 0/*HasOld*/>.ret; let InsDPP8 = getInsDPP8<VOPDstOperand<Src0DPP>, Src0DPP, Src1DPP, Src2DPP, NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP, - Src2ModDPP>.ret; + Src2ModDPP, 0/*HasOld*/>.ret; // The destination for 32-bit encoding is implicit. let HasDst32 = 0; @@ -76,9 +78,9 @@ class VOPC_Profile<list<SchedReadWrite> sched, ValueType vt0, ValueType vt1 = vt let Outs64 = (outs VOPDstS64orS32:$sdst); let OutsVOP3DPP = Outs64; let OutsVOP3DPP8 = Outs64; - let InsVOP3DPP = getInsVOP3DPP<InsVOP3Base, Src0VOP3DPP, NumSrcArgs>.ret; - let InsVOP3DPP16 = getInsVOP3DPP16<InsVOP3Base, Src0VOP3DPP, NumSrcArgs>.ret; - let InsVOP3DPP8 = getInsVOP3DPP8<InsVOP3Base, Src0VOP3DPP, NumSrcArgs>.ret; + let InsVOP3DPP = getInsVOP3DPP<InsVOP3Base, Src0VOP3DPP, NumSrcArgs, 0/*HasOld*/>.ret; + let InsVOP3DPP16 = getInsVOP3DPP16<InsVOP3Base, Src0VOP3DPP, NumSrcArgs, 0/*HasOld*/>.ret; + let InsVOP3DPP8 = getInsVOP3DPP8<InsVOP3Base, Src0VOP3DPP, NumSrcArgs, 0/*HasOld*/>.ret; list<SchedReadWrite> Schedule = sched; } @@ -293,7 +295,7 @@ multiclass VOPC_Pseudos <string opName, let Defs = !if(DefExec, [EXEC], []); let SchedRW = P.Schedule; let isCompare = 1; - let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $sdst", ""); + let Constraints = ""; } } // end SubtargetPredicate = isGFX11Plus @@ -711,7 +713,7 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> : VOPC_Profile<sched, vt, i32> { let AsmDPP = "$src0_modifiers, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; let AsmDPP16 = AsmDPP#"$fi"; - let InsDPP = (ins VGPR_32:$old, FPVRegInputMods:$src0_modifiers, VGPR_32:$src0, VGPR_32:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsDPP = (ins FPVRegInputMods:$src0_modifiers, VGPR_32:$src0, VGPR_32:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); let InsDPP16 = !con(InsDPP, (ins FI:$fi)); // DPP8 forbids modifiers and can inherit from VOPC_Profile @@ -793,7 +795,7 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec, def _e64_dpp : VOP3_DPP_Pseudo<opName, p> { let Defs = !if(DefExec, [EXEC], []); let SchedRW = p.Schedule; - let Constraints = !if(p.NumSrcArgs, p.TieRegDPP # " = $sdst", ""); + let Constraints = ""; } } 
// end SubtargetPredicate = isGFX11Plus } @@ -1068,7 +1070,6 @@ class VOPC_DPP16<bits<8> op, VOP_DPP_Pseudo ps, string opName = ps.OpName> let Uses = ps.Uses; let OtherPredicates = ps.OtherPredicates; let Constraints = ps.Constraints; - let AsmMatchConverter = "cvtVOPCNoDstDPP"; } class VOPC_DPP16_SIMC<bits<8> op, VOP_DPP_Pseudo ps, int subtarget, @@ -1084,7 +1085,6 @@ class VOPC_DPP8<bits<8> op, VOPC_Pseudo ps, string opName = ps.OpName> let Uses = ps.Uses; let OtherPredicates = ps.OtherPredicates; let Constraints = ""; - let AsmMatchConverter = "cvtVOPCNoDstDPP8"; } // VOPC64 @@ -1133,7 +1133,6 @@ class VOPC64_DPP16_NoDst<bits<10> op, VOP_DPP_Pseudo ps, string opName = ps.OpName> : VOPC64_DPP16<op, ps, opName> { let Inst{7-0} = ? ; - let AsmMatchConverter = "cvtVOPC64NoDstDPP"; } class VOPC64_DPP8_Base<bits<10> op, string OpName, VOPProfile P> @@ -1163,13 +1162,12 @@ class VOPC64_DPP8_Dst<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> : VOPC64_DPP8<op, ps, opName> { bits<8> sdst; let Inst{7-0} = sdst; - let Constraints = "$old = $sdst"; + let Constraints = ""; } class VOPC64_DPP8_NoDst<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> : VOPC64_DPP8<op, ps, opName> { let Inst{7-0} = ? ; - let AsmMatchConverter = "cvtVOPC64NoDstDPP8"; let Constraints = ""; } diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 187485ffa3ae..b65ca2d6b1b3 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -269,6 +269,10 @@ class VOP3OpSel_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> { class VOP3OpSel_gfx11<bits<10> op, VOPProfile p> : VOP3OpSel_gfx10<op, p>; +class VOP3DotOpSel_gfx11<bits<10> op, VOPProfile p> : VOP3OpSel_gfx11<op, p>{ + let Inst{11} = ?; + let Inst{12} = ?; +} // NB: For V_INTERP* opcodes, src0 is encoded as src1 and vice versa class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> { @@ -1270,6 +1274,8 @@ multiclass VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_f class Base_VOP3_DPP16<bits<10> op, VOP_DPP_Pseudo ps, string opName = ps.OpName> : VOP3_DPP<op, opName, ps.Pfl, 1> { + let VOP3_OPSEL = ps.Pfl.HasOpSel; + let IsDOT = ps.IsDOT; let hasSideEffects = ps.hasSideEffects; let Defs = ps.Defs; let SchedRW = ps.SchedRW; @@ -1285,6 +1291,8 @@ class VOP3_DPP16<bits<10> op, VOP_DPP_Pseudo ps, int subtarget, class Base_VOP3_DPP8<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> : VOP3_DPP8<op, opName, ps.Pfl> { + let VOP3_OPSEL = ps.Pfl.HasOpSel; + let IsDOT = ps.IsDOT; let hasSideEffects = ps.hasSideEffects; let Defs = ps.Defs; let SchedRW = ps.SchedRW; @@ -1326,6 +1334,15 @@ let AssemblerPredicate = isGFX11Only, VOP3e_gfx11<op, ps.Pfl>; } } + multiclass VOP3Dot_Real_Base_gfx11<bits<10> op, string opName = NAME, + bit isSingle = 0> { + defvar ps = !cast<VOP_Pseudo>(opName#"_e64"); + let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in { + def _e64_gfx11 : + VOP3_Real<ps, SIEncodingFamily.GFX11>, + VOP3DotOpSel_gfx11<op, ps.Pfl>; + } + } multiclass VOP3_Real_with_name_gfx11<bits<10> op, string opName, string asmName, bit isSingle = 0> { defvar ps = !cast<VOP_Pseudo>(opName#"_e64"); @@ -1355,6 +1372,15 @@ let AssemblerPredicate = isGFX11Only, let DecoderNamespace = "DPPGFX11"; } } + + multiclass VOP3Dot_Real_dpp_Base_gfx11<bits<10> op, string opName = NAME> { + def _e64_dpp_gfx11 : VOP3_DPP16<op, !cast<VOP_DPP_Pseudo>(opName#"_e64"#"_dpp"), SIEncodingFamily.GFX11> { + let Inst{11} = ?; + let Inst{12} = ?; + let DecoderNamespace = 
"DPPGFX11"; + } + } + multiclass VOP3_Real_dpp_with_name_gfx11<bits<10> op, string opName, string asmName> { defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); @@ -1368,6 +1394,16 @@ let AssemblerPredicate = isGFX11Only, let DecoderNamespace = "DPP8GFX11"; } } + + multiclass VOP3Dot_Real_dpp8_Base_gfx11<bits<10> op, string opName = NAME> { + defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); + def _e64_dpp8_gfx11 : Base_VOP3_DPP8<op, ps> { + let Inst{11} = ?; + let Inst{12} = ?; + let DecoderNamespace = "DPP8GFX11"; + } + } + multiclass VOP3_Real_dpp8_with_name_gfx11<bits<10> op, string opName, string asmName> { defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); @@ -1406,6 +1442,12 @@ multiclass VOP3_Realtriple_gfx11<bits<10> op, VOP3_Real_dpp_Base_gfx11<op, opName>, VOP3_Real_dpp8_Base_gfx11<op, opName>; +multiclass VOP3Dot_Realtriple_gfx11<bits<10> op, + bit isSingle = 0, string opName = NAME> : + VOP3Dot_Real_Base_gfx11<op, opName, isSingle>, + VOP3Dot_Real_dpp_Base_gfx11<op, opName>, + VOP3Dot_Real_dpp8_Base_gfx11<op, opName>; + multiclass VOP3Only_Realtriple_gfx11<bits<10> op> : VOP3_Realtriple_gfx11<op, 1>; diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 80ba7b5f0d2e..183febe756c1 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -6726,8 +6726,8 @@ bool ARMBaseInstrInfo::shouldOutlineFromFunctionByDefault( return Subtarget.isMClass() && MF.getFunction().hasMinSize(); } -bool ARMBaseInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, - AAResults *AA) const { +bool ARMBaseInstrInfo::isReallyTriviallyReMaterializable( + const MachineInstr &MI) const { // Try hard to rematerialize any VCTPs because if we spill P0, it will block // the tail predication conversion. This means that the element count // register has to be live for longer, but that has to be better than diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index 3b8f3403e3c3..453e3fa1b99b 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -480,8 +480,7 @@ private: MachineInstr *canFoldIntoMOVCC(Register Reg, const MachineRegisterInfo &MRI, const TargetInstrInfo *TII) const; - bool isReallyTriviallyReMaterializable(const MachineInstr &MI, - AAResults *AA) const override; + bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override; private: /// Modeling special VFP / NEON fp MLA / MLS hazards. diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 613904f702f0..e5347ed8e53a 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -1720,6 +1720,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, unsigned UxtOp, MachineBasicBlock::iterator &NextMBBI) { bool IsThumb = STI->isThumb(); + bool IsThumb1Only = STI->isThumb1Only(); MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); const MachineOperand &Dest = MI.getOperand(0); @@ -1794,7 +1795,8 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, MIB.addImm(0); // a 32-bit Thumb strex (only) allows an offset. MIB.add(predOps(ARMCC::AL)); - unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri; + unsigned CMPri = + IsThumb ? (IsThumb1Only ? 
ARM::tCMPi8 : ARM::t2CMPri) : ARM::CMPri; BuildMI(StoreBB, DL, TII->get(CMPri)) .addReg(TempReg, RegState::Kill) .addImm(0) @@ -1848,6 +1850,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI) { bool IsThumb = STI->isThumb(); + assert(!STI->isThumb1Only() && "CMP_SWAP_64 unsupported under Thumb1!"); MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); MachineOperand &Dest = MI.getOperand(0); @@ -3044,6 +3047,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, assert(STI->isThumb()); return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXH, ARM::t2STREXH, ARM::tUXTH, NextMBBI); + case ARM::tCMP_SWAP_32: + assert(STI->isThumb()); + return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREX, ARM::t2STREX, 0, NextMBBI); case ARM::CMP_SWAP_8: assert(!STI->isThumb()); @@ -3054,11 +3060,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXH, ARM::STREXH, ARM::UXTH, NextMBBI); case ARM::CMP_SWAP_32: - if (STI->isThumb()) - return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREX, ARM::t2STREX, 0, - NextMBBI); - else - return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREX, ARM::STREX, 0, NextMBBI); + assert(!STI->isThumb()); + return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREX, ARM::STREX, 0, NextMBBI); case ARM::CMP_SWAP_64: return ExpandCMP_SWAP_64(MBB, MBBI, NextMBBI); diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index e0e4ffd90e0e..afe16a3cd55c 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -3131,7 +3131,7 @@ bool ARMDAGToDAGISel::tryInsertVectorElt(SDNode *N) { // Else v8i16 pattern of an extract and an insert, with a optional vmovx for // extracting odd lanes. - if (VT == MVT::v8i16) { + if (VT == MVT::v8i16 && Subtarget->hasFullFP16()) { SDValue Inp1 = CurDAG->getTargetExtractSubreg( ARM::ssub_0 + ExtractLane1 / 2, dl, MVT::f32, Val1.getOperand(0)); SDValue Inp2 = CurDAG->getTargetExtractSubreg( @@ -3151,7 +3151,7 @@ bool ARMDAGToDAGISel::tryInsertVectorElt(SDNode *N) { // The inserted values are not extracted - if they are f16 then insert them // directly using a VINS. - if (VT == MVT::v8f16) { + if (VT == MVT::v8f16 && Subtarget->hasFullFP16()) { SDNode *VINS = CurDAG->getMachineNode(ARM::VINSH, dl, MVT::f32, Val2, Val1); SDValue NewIns = CurDAG->getTargetInsertSubreg(ARM::ssub_0 + Lane2 / 2, dl, MVT::v4f32, @@ -3512,7 +3512,7 @@ void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) { else if (MemTy == MVT::i16) Opcode = Subtarget->isThumb() ? ARM::tCMP_SWAP_16 : ARM::CMP_SWAP_16; else if (MemTy == MVT::i32) - Opcode = ARM::CMP_SWAP_32; + Opcode = Subtarget->isThumb() ? 
ARM::tCMP_SWAP_32 : ARM::CMP_SWAP_32; else llvm_unreachable("Unknown AtomicCmpSwap type"); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index e6be93e6480a..743cca9ff71f 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -13572,6 +13572,10 @@ static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG, bool ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const { + assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA || + N->getOpcode() == ISD::SRL) && + "Expected shift op"); + if (Level == BeforeLegalizeTypes) return true; @@ -13605,8 +13609,38 @@ ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N, return false; } +bool ARMTargetLowering::isDesirableToCommuteXorWithShift( + const SDNode *N) const { + assert(N->getOpcode() == ISD::XOR && + (N->getOperand(0).getOpcode() == ISD::SHL || + N->getOperand(0).getOpcode() == ISD::SRL) && + "Expected XOR(SHIFT) pattern"); + + // Only commute if the entire NOT mask is a hidden shifted mask. + auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1)); + auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1)); + if (XorC && ShiftC) { + unsigned MaskIdx, MaskLen; + if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) { + unsigned ShiftAmt = ShiftC->getZExtValue(); + unsigned BitWidth = N->getValueType(0).getScalarSizeInBits(); + if (N->getOperand(0).getOpcode() == ISD::SHL) + return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt); + return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt); + } + } + + return false; +} + bool ARMTargetLowering::shouldFoldConstantShiftPairToMask( const SDNode *N, CombineLevel Level) const { + assert(((N->getOpcode() == ISD::SHL && + N->getOperand(0).getOpcode() == ISD::SRL) || + (N->getOpcode() == ISD::SRL && + N->getOperand(0).getOpcode() == ISD::SHL)) && + "Expected shift-shift mask"); + if (!Subtarget->isThumb1Only()) return true; @@ -19962,6 +19996,14 @@ bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode( } break; } + case ARMISD::VBICIMM: { + SDValue Op0 = Op.getOperand(0); + unsigned ModImm = Op.getConstantOperandVal(1); + unsigned EltBits = 0; + uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits); + if ((OriginalDemandedBits & Mask) == 0) + return TLO.CombineTo(Op, Op0); + } } return TargetLowering::SimplifyDemandedBitsForTargetNode( diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 10f60ab93ae3..fae279ea7569 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -733,6 +733,8 @@ class VectorType; bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override; + bool isDesirableToCommuteXorWithShift(const SDNode *N) const override; + bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override; diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index 15c33014e988..9c03f72fe6ae 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -1882,6 +1882,7 @@ let Predicates = [HasMVEInt] in { def : Pat<(ARMvgetlaneu (v8f16 MQPR:$src), imm:$lane), (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane)>; // For i16's inserts being extracted from low lanes, then may use VINS. 
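A note on the isDesirableToCommuteXorWithShift() hunk above: the commute is only allowed when the XOR constant is exactly the shifted mask of bits that survive the shift, so the NOT can be hoisted through the shift unchanged. A small standalone check of that identity, with illustrative constants (not taken from the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t x = 0xDEADBEEFu;
      // srl case: 0x0FFFFFFF is a shifted mask with MaskIdx == 0 and
      // MaskLen == 32 - 4, so (x >> 4) ^ mask == (x ^ (mask << 4)) >> 4.
      assert(((x >> 4) ^ 0x0FFFFFFFu) == ((x ^ 0xFFFFFFF0u) >> 4));
      // shl case: 0xFFFFFFF0 has MaskIdx == 4 (the shift amount) and
      // MaskLen == 32 - 4, so (x << 4) ^ mask == (x ^ (mask >> 4)) << 4.
      assert(((x << 4) ^ 0xFFFFFFF0u) == ((x ^ 0x0FFFFFFFu) << 4));
      return 0;
    }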
+ let Predicates = [HasFullFP16] in { def : Pat<(ARMinsertelt (v8i16 MQPR:$src1), (ARMvgetlaneu (v8i16 MQPR:$src2), imm_even:$extlane), imm_odd:$inslane), @@ -1889,6 +1890,7 @@ let Predicates = [HasMVEInt] in { (VINSH (EXTRACT_SUBREG MQPR:$src1, (SSubReg_f16_reg imm_odd:$inslane)), (EXTRACT_SUBREG MQPR:$src2, (SSubReg_f16_reg imm_even:$extlane))), (SSubReg_f16_reg imm_odd:$inslane)), MQPR)>; + } def : Pat<(v16i8 (scalar_to_vector GPR:$src)), (MVE_VMOV_to_lane_8 (v16i8 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>; @@ -1905,17 +1907,21 @@ let Predicates = [HasMVEInt] in { def : Pat<(insertelt (v8f16 MQPR:$src1), (f16 HPR:$src2), imm_even:$lane), (MVE_VMOV_to_lane_16 MQPR:$src1, (COPY_TO_REGCLASS (f16 HPR:$src2), rGPR), imm:$lane)>; + let Predicates = [HasFullFP16] in { def : Pat<(insertelt (v8f16 MQPR:$src1), (f16 HPR:$src2), imm_odd:$lane), (COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$src1, MQPR)), (VINSH (EXTRACT_SUBREG MQPR:$src1, (SSubReg_f16_reg imm_odd:$lane)), (COPY_TO_REGCLASS HPR:$src2, SPR)), (SSubReg_f16_reg imm_odd:$lane)), MQPR)>; + } def : Pat<(extractelt (v8f16 MQPR:$src), imm_even:$lane), (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_even:$lane))>; + let Predicates = [HasFullFP16] in { def : Pat<(extractelt (v8f16 MQPR:$src), imm_odd:$lane), (COPY_TO_REGCLASS (VMOVH (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_odd:$lane))), HPR)>; + } def : Pat<(v2f64 (scalar_to_vector (f64 DPR:$src))), (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), DPR:$src, dsub_0)>; diff --git a/llvm/lib/Target/ARM/ARMInstrThumb.td b/llvm/lib/Target/ARM/ARMInstrThumb.td index 71527ae1ab11..8f7039a327b3 100644 --- a/llvm/lib/Target/ARM/ARMInstrThumb.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb.td @@ -1782,11 +1782,15 @@ def tLDRConstPool let Constraints = "@earlyclobber $Rd,@earlyclobber $temp", mayLoad = 1, mayStore = 1 in { -def tCMP_SWAP_8 : PseudoInst<(outs GPR:$Rd, GPR:$temp), +def tCMP_SWAP_8 : PseudoInst<(outs GPR:$Rd, tGPR:$temp), (ins GPR:$addr, tGPR:$desired, GPR:$new), NoItinerary, []>, Sched<[]>; -def tCMP_SWAP_16 : PseudoInst<(outs GPR:$Rd, GPR:$temp), +def tCMP_SWAP_16 : PseudoInst<(outs GPR:$Rd, tGPR:$temp), (ins GPR:$addr, tGPR:$desired, GPR:$new), NoItinerary, []>, Sched<[]>; + +def tCMP_SWAP_32 : PseudoInst<(outs GPR:$Rd, tGPR:$temp), + (ins GPR:$addr, GPR:$desired, GPR:$new), + NoItinerary, []>, Sched<[]>; } diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index ba1d806c8d81..3c102463ba08 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -20,8 +20,8 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" @@ -33,6 +33,7 @@ #include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -2197,12 +2198,9 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, return true; } -bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, - ScalarEvolution &SE, - AssumptionCache &AC, - TargetLibraryInfo *TLI, - DominatorTree *DT, - const LoopAccessInfo *LAI) { +bool 
ARMTTIImpl::preferPredicateOverEpilogue( + Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, + TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL) { if (!EnableTailPredication) { LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n"); return false; @@ -2244,7 +2242,7 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, return false; } - return canTailPredicateLoop(L, LI, SE, DL, LAI); + return canTailPredicateLoop(L, LI, SE, DL, LVL->getLAI()); } PredicationStyle ARMTTIImpl::emitGetActiveLaneMask() const { diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index dcf82e703a7f..9c3980d79e60 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -288,12 +288,10 @@ public: AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo); - bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, - ScalarEvolution &SE, - AssumptionCache &AC, - TargetLibraryInfo *TLI, + bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, + AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, - const LoopAccessInfo *LAI); + LoopVectorizationLegality *LVL); void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE); diff --git a/llvm/lib/Target/AVR/AVRSubtarget.h b/llvm/lib/Target/AVR/AVRSubtarget.h index 2325193bac0a..3dd71243387b 100644 --- a/llvm/lib/Target/AVR/AVRSubtarget.h +++ b/llvm/lib/Target/AVR/AVRSubtarget.h @@ -92,15 +92,15 @@ public: } /// Get I/O register addresses. - int getIORegRAMPZ(void) const { return hasELPM() ? 0x3b : -1; } - int getIORegEIND(void) const { return hasEIJMPCALL() ? 0x3c : -1; } - int getIORegSPL(void) const { return 0x3d; } - int getIORegSPH(void) const { return hasSmallStack() ? -1 : 0x3e; } - int getIORegSREG(void) const { return 0x3f; } + int getIORegRAMPZ() const { return hasELPM() ? 0x3b : -1; } + int getIORegEIND() const { return hasEIJMPCALL() ? 0x3c : -1; } + int getIORegSPL() const { return 0x3d; } + int getIORegSPH() const { return hasSmallStack() ? -1 : 0x3e; } + int getIORegSREG() const { return 0x3f; } /// Get GPR aliases. - int getRegTmpIndex(void) const { return hasTinyEncoding() ? 16 : 0; } - int getRegZeroIndex(void) const { return hasTinyEncoding() ? 17 : 1; } + int getRegTmpIndex() const { return hasTinyEncoding() ? 16 : 0; } + int getRegZeroIndex() const { return hasTinyEncoding() ? 17 : 1; } private: /// The ELF e_flags architecture. diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp index d490b385ac16..0bf739452fd2 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp @@ -518,7 +518,7 @@ void CSKYInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned Opcode = 0; if (CSKY::GPRRegClass.contains(DestReg, SrcReg)) - Opcode = CSKY::MOV32; + Opcode = STI.hasE2() ? 
CSKY::MOV32 : CSKY::MOV16; else if (v2sf && CSKY::sFPR32RegClass.contains(DestReg, SrcReg)) Opcode = CSKY::FMOV_S; else if (v3sf && CSKY::FPR32RegClass.contains(DestReg, SrcReg)) diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp index 3e09270a66d0..869433613620 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp @@ -13,6 +13,7 @@ #include "DXILBitcodeWriter.h" #include "DXILValueEnumerator.h" #include "PointerTypeAnalysis.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Triple.h" #include "llvm/Bitcode/BitcodeCommon.h" #include "llvm/Bitcode/BitcodeReader.h" @@ -2580,10 +2581,9 @@ void DXILBitcodeWriter::writeFunctionLevelValueSymbolTable( SortedTable.push_back(VI.second->getValueName()); } // The keys are unique, so there shouldn't be stability issues. - std::sort(SortedTable.begin(), SortedTable.end(), - [](const ValueName *A, const ValueName *B) { - return A->first() < B->first(); - }); + llvm::sort(SortedTable, [](const ValueName *A, const ValueName *B) { + return A->first() < B->first(); + }); for (const ValueName *SI : SortedTable) { auto &Name = *SI; diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp index 08944ee3f1fe..e2a41515de38 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp @@ -809,7 +809,7 @@ void ValueEnumerator::organizeMetadata() { // - by function, then // - by isa<MDString> // and then sort by the original/current ID. Since the IDs are guaranteed to - // be unique, the result of std::sort will be deterministic. There's no need + // be unique, the result of llvm::sort will be deterministic. There's no need // for std::stable_sort. llvm::sort(Order, [this](MDIndex LHS, MDIndex RHS) { return std::make_tuple(LHS.F, getMetadataTypeOrder(LHS.get(MDs)), LHS.ID) < diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp index abd84a188cfa..bd0232c71d48 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp @@ -85,7 +85,6 @@ public: int getAllocSizeOf(const Type *Ty) const; int getTypeAlignment(Type *Ty) const; - VectorType *getByteVectorTy(int ScLen) const; Constant *getNullValue(Type *Ty) const; Constant *getFullValue(Type *Ty) const; diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 4acf90bd9788..93c8864347bb 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -217,9 +217,8 @@ SDValue LoongArchTargetLowering::lowerGlobalAddress(SDValue Op, const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); unsigned ADDIOp = Subtarget.is64Bit() ? LoongArch::ADDI_D : LoongArch::ADDI_W; - // FIXME: Only support PC-relative addressing to access the symbol. - // TODO: Add target flags. - if (!isPositionIndependent()) { + // TODO: Support dso_preemptable and target flags. 
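This hunk changes the gate from "not PIC" to the per-symbol dso_local property, so locally binding globals keep the PCALAU12I + ADDI pair even under PIC. A minimal sketch of the query being relied on (helper name is illustrative, not from the patch); non-dso_local symbols are simply not taken by this fast path yet:

    #include "llvm/IR/GlobalValue.h"

    // Sketch: the per-symbol property the new condition keys on.
    static bool useDirectPCRelAddressing(const llvm::GlobalValue &GV) {
      return GV.isDSOLocal();
    }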
+ if (GV->isDSOLocal()) { SDValue GA = DAG.getTargetGlobalAddress(GV, DL, Ty); SDValue AddrHi(DAG.getMachineNode(LoongArch::PCALAU12I, DL, Ty, GA), 0); SDValue Addr(DAG.getMachineNode(ADDIOp, DL, Ty, AddrHi, GA), 0); diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp index 468c4f43cb90..2d08d5c674bc 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -38,9 +38,7 @@ static std::string computeDataLayout(const Triple &TT) { static Reloc::Model getEffectiveRelocModel(const Triple &TT, Optional<Reloc::Model> RM) { - if (!RM.hasValue()) - return Reloc::Static; - return *RM; + return RM.value_or(Reloc::Static); } LoongArchTargetMachine::LoongArchTargetMachine( diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index b98be4ae4b75..4dfc16526a00 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -1192,6 +1192,12 @@ bool MipsTargetLowering::hasBitTest(SDValue X, SDValue Y) const { bool MipsTargetLowering::shouldFoldConstantShiftPairToMask( const SDNode *N, CombineLevel Level) const { + assert(((N->getOpcode() == ISD::SHL && + N->getOperand(0).getOpcode() == ISD::SRL) || + (N->getOpcode() == ISD::SRL && + N->getOperand(0).getOpcode() == ISD::SHL)) && + "Expected shift-shift mask"); + if (N->getOperand(0).getValueType().isVector()) return false; return true; diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 9977d8ba0300..45e82e935772 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -73,8 +73,10 @@ #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MachineValueType.h" +#include "llvm/Support/NativeFormatting.h" #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLoweringObjectFile.h" @@ -354,8 +356,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { // PTX ABI requires all scalar return values to be at least 32 // bits in size. fp16 normally uses .b16 as its storage type in // PTX, so its size must be adjusted here, too. 
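The promoteScalarArgumentSize() helper used here is added to NVPTXUtilities.h later in this diff; it rounds a scalar's bit width up to the smallest PTX parameter width of at least 32 bits. A minimal standalone sketch of the same rule, with illustrative checks:

    #include <cassert>

    // Mirrors promoteScalarArgumentSize() from NVPTXUtilities.h: scalar
    // parameters and return values must be at least 32 bits wide in PTX,
    // and widths between 33 and 64 bits round up to 64.
    static unsigned promoteScalarArgumentSize(unsigned size) {
      if (size <= 32)
        return 32;
      if (size <= 64)
        return 64;
      return size;
    }

    int main() {
      assert(promoteScalarArgumentSize(8) == 32);    // i8  -> .param .b32
      assert(promoteScalarArgumentSize(48) == 64);   // i48 -> .param .b64
      assert(promoteScalarArgumentSize(128) == 128); // wider scalars unchanged
      return 0;
    }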
- if (size < 32) - size = 32; + size = promoteScalarArgumentSize(size); O << ".param .b" << size << " func_retval0"; } else if (isa<PointerType>(Ty)) { @@ -384,8 +385,8 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { for (unsigned j = 0, je = elems; j != je; ++j) { unsigned sz = elemtype.getSizeInBits(); - if (elemtype.isInteger() && (sz < 32)) - sz = 32; + if (elemtype.isInteger()) + sz = promoteScalarArgumentSize(sz); O << ".reg .b" << sz << " func_retval" << idx; if (j < je - 1) O << ", "; @@ -1168,31 +1169,37 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, GVar->hasInitializer()) { const Constant *Initializer = GVar->getInitializer(); if (!isa<UndefValue>(Initializer) && !Initializer->isNullValue()) { - AggBuffer aggBuffer(ElementSize, O, *this); + AggBuffer aggBuffer(ElementSize, *this); bufferAggregateConstant(Initializer, &aggBuffer); - if (aggBuffer.numSymbols) { - if (static_cast<const NVPTXTargetMachine &>(TM).is64Bit()) { - O << " .u64 "; + if (aggBuffer.numSymbols()) { + unsigned int ptrSize = MAI->getCodePointerSize(); + if (ElementSize % ptrSize || + !aggBuffer.allSymbolsAligned(ptrSize)) { + // Print in bytes and use the mask() operator for pointers. + if (!STI.hasMaskOperator()) + report_fatal_error( + "initialized packed aggregate with pointers '" + + GVar->getName() + + "' requires at least PTX ISA version 7.1"); + O << " .u8 "; getSymbol(GVar)->print(O, MAI); - O << "["; - O << ElementSize / 8; + O << "[" << ElementSize << "] = {"; + aggBuffer.printBytes(O); + O << "}"; } else { - O << " .u32 "; + O << " .u" << ptrSize * 8 << " "; getSymbol(GVar)->print(O, MAI); - O << "["; - O << ElementSize / 4; + O << "[" << ElementSize / ptrSize << "] = {"; + aggBuffer.printWords(O); + O << "}"; } - O << "]"; } else { O << " .b8 "; getSymbol(GVar)->print(O, MAI); - O << "["; - O << ElementSize; - O << "]"; + O << "[" << ElementSize << "] = {"; + aggBuffer.printBytes(O); + O << "}"; } - O << " = {"; - aggBuffer.print(); - O << "}"; } else { O << " .b8 "; getSymbol(GVar)->print(O, MAI); @@ -1219,6 +1226,80 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, O << ";\n"; } +void NVPTXAsmPrinter::AggBuffer::printSymbol(unsigned nSym, raw_ostream &os) { + const Value *v = Symbols[nSym]; + const Value *v0 = SymbolsBeforeStripping[nSym]; + if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) { + MCSymbol *Name = AP.getSymbol(GVar); + PointerType *PTy = dyn_cast<PointerType>(v0->getType()); + // Is v0 a generic pointer? 
+ bool isGenericPointer = PTy && PTy->getAddressSpace() == 0; + if (EmitGeneric && isGenericPointer && !isa<Function>(v)) { + os << "generic("; + Name->print(os, AP.MAI); + os << ")"; + } else { + Name->print(os, AP.MAI); + } + } else if (const ConstantExpr *CExpr = dyn_cast<ConstantExpr>(v0)) { + const MCExpr *Expr = AP.lowerConstantForGV(cast<Constant>(CExpr), false); + AP.printMCExpr(*Expr, os); + } else + llvm_unreachable("symbol type unknown"); +} + +void NVPTXAsmPrinter::AggBuffer::printBytes(raw_ostream &os) { + unsigned int ptrSize = AP.MAI->getCodePointerSize(); + symbolPosInBuffer.push_back(size); + unsigned int nSym = 0; + unsigned int nextSymbolPos = symbolPosInBuffer[nSym]; + for (unsigned int pos = 0; pos < size;) { + if (pos) + os << ", "; + if (pos != nextSymbolPos) { + os << (unsigned int)buffer[pos]; + ++pos; + continue; + } + // Generate a per-byte mask() operator for the symbol, which looks like: + // .global .u8 addr[] = {0xFF(foo), 0xFF00(foo), 0xFF0000(foo), ...}; + // See https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#initializers + std::string symText; + llvm::raw_string_ostream oss(symText); + printSymbol(nSym, oss); + for (unsigned i = 0; i < ptrSize; ++i) { + if (i) + os << ", "; + llvm::write_hex(os, 0xFFULL << i * 8, HexPrintStyle::PrefixUpper); + os << "(" << symText << ")"; + } + pos += ptrSize; + nextSymbolPos = symbolPosInBuffer[++nSym]; + assert(nextSymbolPos >= pos); + } +} + +void NVPTXAsmPrinter::AggBuffer::printWords(raw_ostream &os) { + unsigned int ptrSize = AP.MAI->getCodePointerSize(); + symbolPosInBuffer.push_back(size); + unsigned int nSym = 0; + unsigned int nextSymbolPos = symbolPosInBuffer[nSym]; + assert(nextSymbolPos % ptrSize == 0); + for (unsigned int pos = 0; pos < size; pos += ptrSize) { + if (pos) + os << ", "; + if (pos == nextSymbolPos) { + printSymbol(nSym, os); + nextSymbolPos = symbolPosInBuffer[++nSym]; + assert(nextSymbolPos % ptrSize == 0); + assert(nextSymbolPos >= pos + ptrSize); + } else if (ptrSize == 4) + os << support::endian::read32le(&buffer[pos]); + else + os << support::endian::read64le(&buffer[pos]); + } +} + void NVPTXAsmPrinter::emitDemotedVars(const Function *f, raw_ostream &O) { if (localDecls.find(f) == localDecls.end()) return; @@ -1494,8 +1575,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { unsigned sz = 0; if (isa<IntegerType>(Ty)) { sz = cast<IntegerType>(Ty)->getBitWidth(); - if (sz < 32) - sz = 32; + sz = promoteScalarArgumentSize(sz); } else if (isa<PointerType>(Ty)) sz = thePointerTy.getSizeInBits(); else if (Ty->isHalfTy()) @@ -1559,8 +1639,8 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { for (unsigned j = 0, je = elems; j != je; ++j) { unsigned sz = elemtype.getSizeInBits(); - if (elemtype.isInteger() && (sz < 32)) - sz = 32; + if (elemtype.isInteger()) + sz = promoteScalarArgumentSize(sz); O << "\t.reg .b" << sz << " "; printParamName(I, paramIndex, O); if (j < je - 1) diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h index cd61e99a103a..710c089e3325 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -61,24 +61,30 @@ class MCOperand; class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { class AggBuffer { - // Used to buffer the emitted string for initializing global - // aggregates. + // Used to buffer the emitted string for initializing global aggregates. 
// - // Normally an aggregate (array, vector or structure) is emitted - // as a u8[]. However, if one element/field of the aggregate - // is a non-NULL address, then the aggregate is emitted as u32[] - // or u64[]. + // Normally an aggregate (array, vector, or structure) is emitted as a u8[]. + // However, if either element/field of the aggregate is a non-NULL address, + // and all such addresses are properly aligned, then the aggregate is + // emitted as u32[] or u64[]. In the case of unaligned addresses, the + // aggregate is emitted as u8[], and the mask() operator is used for all + // pointers. // - // We first layout the aggregate in 'buffer' in bytes, except for - // those symbol addresses. For the i-th symbol address in the - //aggregate, its corresponding 4-byte or 8-byte elements in 'buffer' - // are filled with 0s. symbolPosInBuffer[i-1] records its position - // in 'buffer', and Symbols[i-1] records the Value*. + // We first layout the aggregate in 'buffer' in bytes, except for those + // symbol addresses. For the i-th symbol address in the aggregate, its + // corresponding 4-byte or 8-byte elements in 'buffer' are filled with 0s. + // symbolPosInBuffer[i-1] records its position in 'buffer', and Symbols[i-1] + // records the Value*. // - // Once we have this AggBuffer setup, we can choose how to print - // it out. + // Once we have this AggBuffer setup, we can choose how to print it out. public: - unsigned numSymbols; // number of symbol addresses + // number of symbol addresses + unsigned numSymbols() const { return Symbols.size(); } + + bool allSymbolsAligned(unsigned ptrSize) const { + return llvm::all_of(symbolPosInBuffer, + [=](unsigned pos) { return pos % ptrSize == 0; }); + } private: const unsigned size; // size of the buffer in bytes @@ -94,15 +100,13 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { // SymbolsBeforeStripping[i]. SmallVector<const Value *, 4> SymbolsBeforeStripping; unsigned curpos; - raw_ostream &O; NVPTXAsmPrinter &AP; bool EmitGeneric; public: - AggBuffer(unsigned size, raw_ostream &O, NVPTXAsmPrinter &AP) - : size(size), buffer(size), O(O), AP(AP) { + AggBuffer(unsigned size, NVPTXAsmPrinter &AP) + : size(size), buffer(size), AP(AP) { curpos = 0; - numSymbols = 0; EmitGeneric = AP.EmitGeneric; } @@ -135,63 +139,13 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { symbolPosInBuffer.push_back(curpos); Symbols.push_back(GVar); SymbolsBeforeStripping.push_back(GVarBeforeStripping); - numSymbols++; } - void print() { - if (numSymbols == 0) { - // print out in bytes - for (unsigned i = 0; i < size; i++) { - if (i) - O << ", "; - O << (unsigned int) buffer[i]; - } - } else { - // print out in 4-bytes or 8-bytes - unsigned int pos = 0; - unsigned int nSym = 0; - unsigned int nextSymbolPos = symbolPosInBuffer[nSym]; - unsigned int nBytes = 4; - if (static_cast<const NVPTXTargetMachine &>(AP.TM).is64Bit()) - nBytes = 8; - for (pos = 0; pos < size; pos += nBytes) { - if (pos) - O << ", "; - if (pos == nextSymbolPos) { - const Value *v = Symbols[nSym]; - const Value *v0 = SymbolsBeforeStripping[nSym]; - if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) { - MCSymbol *Name = AP.getSymbol(GVar); - PointerType *PTy = dyn_cast<PointerType>(v0->getType()); - bool IsNonGenericPointer = false; // Is v0 a non-generic pointer? 
- if (PTy && PTy->getAddressSpace() != 0) { - IsNonGenericPointer = true; - } - if (EmitGeneric && !isa<Function>(v) && !IsNonGenericPointer) { - O << "generic("; - Name->print(O, AP.MAI); - O << ")"; - } else { - Name->print(O, AP.MAI); - } - } else if (const ConstantExpr *CExpr = dyn_cast<ConstantExpr>(v0)) { - const MCExpr *Expr = - AP.lowerConstantForGV(cast<Constant>(CExpr), false); - AP.printMCExpr(*Expr, O); - } else - llvm_unreachable("symbol type unknown"); - nSym++; - if (nSym >= numSymbols) - nextSymbolPos = size + 1; - else - nextSymbolPos = symbolPosInBuffer[nSym]; - } else if (nBytes == 4) - O << *(unsigned int *)(&buffer[pos]); - else - O << *(unsigned long long *)(&buffer[pos]); - } - } - } + void printBytes(raw_ostream &os); + void printWords(raw_ostream &os); + + private: + void printSymbol(unsigned nSym, raw_ostream &os); }; friend class AggBuffer; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 6ad016dfa0a7..8264032b765a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -206,6 +206,40 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, } } +/// PromoteScalarIntegerPTX +/// Used to make sure the arguments/returns are suitable for passing +/// and promote them to a larger size if they're not. +/// +/// The promoted type is placed in \p PromoteVT if the function returns true. +static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) { + if (VT.isScalarInteger()) { + switch (PowerOf2Ceil(VT.getFixedSizeInBits())) { + default: + llvm_unreachable( + "Promotion is not suitable for scalars of size larger than 64-bits"); + case 1: + *PromotedVT = MVT::i1; + break; + case 2: + case 4: + case 8: + *PromotedVT = MVT::i8; + break; + case 16: + *PromotedVT = MVT::i16; + break; + case 32: + *PromotedVT = MVT::i32; + break; + case 64: + *PromotedVT = MVT::i64; + break; + } + return EVT(*PromotedVT) != VT; + } + return false; +} + // Check whether we can merge loads/stores of some of the pieces of a // flattened function parameter or return value into a single vector // load/store. @@ -1291,8 +1325,7 @@ std::string NVPTXTargetLowering::getPrototype( // PTX ABI requires all scalar return values to be at least 32 // bits in size. fp16 normally uses .b16 as its storage type in // PTX, so its size must be adjusted here, too. - if (size < 32) - size = 32; + size = promoteScalarArgumentSize(size); O << ".param .b" << size << " _"; } else if (isa<PointerType>(retTy)) { @@ -1343,8 +1376,7 @@ std::string NVPTXTargetLowering::getPrototype( unsigned sz = 0; if (isa<IntegerType>(Ty)) { sz = cast<IntegerType>(Ty)->getBitWidth(); - if (sz < 32) - sz = 32; + sz = promoteScalarArgumentSize(sz); } else if (isa<PointerType>(Ty)) { sz = PtrVT.getSizeInBits(); } else if (Ty->isHalfTy()) @@ -1515,11 +1547,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, NeedAlign = true; } else { // declare .param .b<size> .param<n>; - if ((VT.isInteger() || VT.isFloatingPoint()) && TypeSize < 4) { + if (VT.isInteger() || VT.isFloatingPoint()) { // PTX ABI requires integral types to be at least 32 bits in // size. FP16 is loaded/stored using i16, so it's handled // here as well. 
- TypeSize = 4; + TypeSize = promoteScalarArgumentSize(TypeSize * 8) / 8; } SDValue DeclareScalarParamOps[] = { Chain, DAG.getConstant(ParamCount, dl, MVT::i32), @@ -1556,6 +1588,17 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } SDValue StVal = OutVals[OIdx]; + + MVT PromotedVT; + if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) { + EltVT = EVT(PromotedVT); + } + if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) { + llvm::ISD::NodeType Ext = + Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + StVal = DAG.getNode(Ext, dl, PromotedVT, StVal); + } + if (IsByVal) { auto PtrVT = getPointerTy(DL); SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal, @@ -1638,9 +1681,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Plus, this behavior is consistent with nvcc's. if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() || (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) { - // Scalar needs to be at least 32bit wide - if (resultsz < 32) - resultsz = 32; + resultsz = promoteScalarArgumentSize(resultsz); SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), DAG.getConstant(resultsz, dl, MVT::i32), @@ -1778,6 +1819,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, EVT TheLoadType = VTs[i]; EVT EltType = Ins[i].VT; Align EltAlign = commonAlignment(RetAlign, Offsets[i]); + MVT PromotedVT; + + if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) { + TheLoadType = EVT(PromotedVT); + EltType = EVT(PromotedVT); + needTruncate = true; + } + if (ExtendIntegerRetVal) { TheLoadType = MVT::i32; EltType = MVT::i32; @@ -2558,6 +2607,13 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( // v2f16 was loaded as an i32. Now we must bitcast it back. else if (EltVT == MVT::v2f16) Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt); + + // If a promoted integer type is used, truncate down to the original + MVT PromotedVT; + if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) { + Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); + } + // Extend the element if necessary (e.g. an i8 is loaded // into an i16 register) if (Ins[InsIdx].VT.isInteger() && @@ -2627,11 +2683,26 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, return Chain; const DataLayout &DL = DAG.getDataLayout(); + SmallVector<SDValue, 16> PromotedOutVals; SmallVector<EVT, 16> VTs; SmallVector<uint64_t, 16> Offsets; ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets); assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); + for (unsigned i = 0, e = VTs.size(); i != e; ++i) { + SDValue PromotedOutVal = OutVals[i]; + MVT PromotedVT; + if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) { + VTs[i] = EVT(PromotedVT); + } + if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) { + llvm::ISD::NodeType Ext = + Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal); + } + PromotedOutVals.push_back(PromotedOutVal); + } + auto VectorInfo = VectorizePTXValueVTs( VTs, Offsets, RetTy->isSized() ? 
getFunctionParamOptimizedAlign(&F, RetTy, DL) @@ -2652,12 +2723,14 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); } - SDValue RetVal = OutVals[i]; + SDValue OutVal = OutVals[i]; + SDValue RetVal = PromotedOutVals[i]; + if (ExtendIntegerRetVal) { RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl, MVT::i32, RetVal); - } else if (RetVal.getValueSizeInBits() < 16) { + } else if (OutVal.getValueSizeInBits() < 16) { // Use 16-bit registers for small load-stores as it's the // smallest general purpose register size supported by NVPTX. RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 9a249d3da3d5..cea3dce3f1c5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -77,6 +77,7 @@ public: bool hasImageHandles() const; bool hasFP16Math() const { return SmVersion >= 53; } bool allowFP16Math() const; + bool hasMaskOperator() const { return PTXVersion >= 71; } unsigned int getSmVersion() const { return SmVersion; } std::string getTargetName() const { return TargetName; } diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index bf1524194cfb..6fee57b4664e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -59,6 +59,16 @@ bool isKernelFunction(const Function &); bool getAlign(const Function &, unsigned index, unsigned &); bool getAlign(const CallInst &, unsigned index, unsigned &); +// PTX ABI requires all scalar argument/return values to have +// bit-size as a power of two of at least 32 bits. +inline unsigned promoteScalarArgumentSize(unsigned size) { + if (size <= 32) + return 32; + else if (size <= 64) + return 64; + else + return size; +} } #endif diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 4247cf557c2a..14c4fd3a9ffa 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -5473,7 +5473,8 @@ void PPCDAGToDAGISel::Select(SDNode *N) { } case ISD::MUL: { SDValue Op1 = N->getOperand(1); - if (Op1.getOpcode() != ISD::Constant || Op1.getValueType() != MVT::i64) + if (Op1.getOpcode() != ISD::Constant || + (Op1.getValueType() != MVT::i64 && Op1.getValueType() != MVT::i32)) break; // If the multiplier fits int16, we can handle it with mulli. @@ -5486,13 +5487,27 @@ void PPCDAGToDAGISel::Select(SDNode *N) { // (mul X, c1 << c2) -> (rldicr (mulli X, c1) c2). We do this in ISEL due to // DAGCombiner prefers (shl (mul X, c1), c2) -> (mul X, c1 << c2). 
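A standalone numeric check of the PPC fold described in the comment above (values chosen for illustration): mul X, (c1 << c2) is rewritten as a MULLI by c1 followed by a left shift by c2 (RLDICR for i64, now RLWINM for i32), which is valid because multiplication distributes over the power-of-two factor modulo 2^N, and is only attempted when c1 fits a signed 16-bit MULLI immediate.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0x123456789abcdef0ULL;
  uint64_t Imm = 0x50000;     // 5 << 16; 5 fits in MULLI's signed 16-bit field
  unsigned Shift = 16;
  int16_t ImmSh = 5;
  // i64 form: MULLI8 then RLDICR (shift left, clear low bits).
  assert(X * Imm == (X * (uint64_t)ImmSh) << Shift);
  // i32 form: MULLI then RLWINM (rotate left and mask, i.e. a 32-bit shl).
  uint32_t X32 = 0x9abcdef0u;
  assert(X32 * (uint32_t)Imm == (X32 * (uint32_t)ImmSh) << Shift);
}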
uint64_t ImmSh = Imm >> Shift; - if (isInt<16>(ImmSh)) { - uint64_t SextImm = SignExtend64(ImmSh & 0xFFFF, 16); + if (!isInt<16>(ImmSh)) + break; + + uint64_t SextImm = SignExtend64(ImmSh & 0xFFFF, 16); + if (Op1.getValueType() == MVT::i64) { SDValue SDImm = CurDAG->getTargetConstant(SextImm, dl, MVT::i64); SDNode *MulNode = CurDAG->getMachineNode(PPC::MULLI8, dl, MVT::i64, N->getOperand(0), SDImm); - CurDAG->SelectNodeTo(N, PPC::RLDICR, MVT::i64, SDValue(MulNode, 0), - getI32Imm(Shift, dl), getI32Imm(63 - Shift, dl)); + + SDValue Ops[] = {SDValue(MulNode, 0), getI32Imm(Shift, dl), + getI32Imm(63 - Shift, dl)}; + CurDAG->SelectNodeTo(N, PPC::RLDICR, MVT::i64, Ops); + return; + } else { + SDValue SDImm = CurDAG->getTargetConstant(SextImm, dl, MVT::i32); + SDNode *MulNode = CurDAG->getMachineNode(PPC::MULLI, dl, MVT::i32, + N->getOperand(0), SDImm); + + SDValue Ops[] = {SDValue(MulNode, 0), getI32Imm(Shift, dl), + getI32Imm(0, dl), getI32Imm(31 - Shift, dl)}; + CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); return; } break; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 59486c323567..c85f57f04c7d 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1086,8 +1086,8 @@ unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, // For opcodes with the ReMaterializable flag set, this function is called to // verify the instruction is really rematable. -bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, - AliasAnalysis *AA) const { +bool PPCInstrInfo::isReallyTriviallyReMaterializable( + const MachineInstr &MI) const { switch (MI.getOpcode()) { default: // This function should only be called for opcodes with the ReMaterializable diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h index e22b0086bde8..980bb3107a8b 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -495,8 +495,7 @@ public: unsigned &SubIdx) const override; unsigned isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; - bool isReallyTriviallyReMaterializable(const MachineInstr &MI, - AAResults *AA) const override; + bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override; unsigned isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override; diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp index 4689c0638ca6..23703ac54d0e 100644 --- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp +++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp @@ -568,7 +568,7 @@ bool PPCLoopInstrFormPrep::rewriteLoadStoresForCommoningChains( const SCEVAddRecExpr *BasePtrSCEV = cast<SCEVAddRecExpr>(BaseSCEV); // Make sure the base is able to expand. - if (!isSafeToExpand(BasePtrSCEV->getStart(), *SE)) + if (!SCEVE.isSafeToExpand(BasePtrSCEV->getStart())) return MadeChange; assert(BasePtrSCEV->isAffine() && @@ -602,7 +602,7 @@ bool PPCLoopInstrFormPrep::rewriteLoadStoresForCommoningChains( // Make sure offset is able to expand. Only need to check one time as the // offsets are reused between different chains. 
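The PPCLoopInstrFormPrep hunks in this change switch from the free isSafeToExpand(S, SE) helper to asking the SCEVExpander that will later do the expansion. A rough sketch of the resulting idiom follows; the wrapper function and its parameters are made up for illustration, only the SCEVExpander calls come from the patch.

#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
using namespace llvm;

// Hypothetical helper: safety check and expansion now share one expander.
static Value *expandStartIfSafe(const SCEVAddRecExpr *AR, ScalarEvolution &SE,
                                const DataLayout &DL, Type *Ty,
                                Instruction *InsertPt) {
  SCEVExpander Expander(SE, DL, "loopprepare-formrewrite");
  if (!Expander.isSafeToExpand(AR->getStart()))
    return nullptr;             // caller bails out, as rewriteLoadStores does
  return Expander.expandCodeFor(AR->getStart(), Ty, InsertPt);
}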
if (!BaseElemIdx) - if (!isSafeToExpand(OffsetSCEV, *SE)) + if (!SCEVE.isSafeToExpand(OffsetSCEV)) return false; Value *OffsetValue = SCEVE.expandCodeFor( @@ -1018,14 +1018,13 @@ bool PPCLoopInstrFormPrep::rewriteLoadStores( if (!BasePtrSCEV->isAffine()) return MadeChange; - if (!isSafeToExpand(BasePtrSCEV->getStart(), *SE)) - return MadeChange; - - SmallPtrSet<Value *, 16> DeletedPtrs; - BasicBlock *Header = L->getHeader(); SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(), "loopprepare-formrewrite"); + if (!SCEVE.isSafeToExpand(BasePtrSCEV->getStart())) + return MadeChange; + + SmallPtrSet<Value *, 16> DeletedPtrs; // For some DS form load/store instructions, it can also be an update form, // if the stride is constant and is a multipler of 4. Use update form if diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index 917837a307ad..e6140edc8403 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -30,6 +30,9 @@ class MachineInstr; class MachineOperand; class PassRegistry; +FunctionPass *createRISCVCodeGenPreparePass(); +void initializeRISCVCodeGenPreparePass(PassRegistry &); + bool lowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, AsmPrinter &AP); bool lowerRISCVMachineOperandToMCOperand(const MachineOperand &MO, diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td index e783ef38b448..8a6f69c7f7ca 100644 --- a/llvm/lib/Target/RISCV/RISCV.td +++ b/llvm/lib/Target/RISCV/RISCV.td @@ -19,6 +19,19 @@ def HasStdExtM : Predicate<"Subtarget->hasStdExtM()">, AssemblerPredicate<(all_of FeatureStdExtM), "'M' (Integer Multiplication and Division)">; +def FeatureStdExtZmmul + : SubtargetFeature<"zmmul", "HasStdExtZmmul", "true", + "'Zmmul' (Integer Multiplication)">; +def HasStdExtZmmul : Predicate<"Subtarget->hasStdExtZmmul()">, + AssemblerPredicate<(all_of FeatureStdExtZmmul), + "'Zmmul' (Integer Multiplication)">; + +def HasStdExtMOrZmmul + : Predicate<"Subtarget->hasStdExtM() || Subtarget->hasStdExtZmmul()">, + AssemblerPredicate<(any_of FeatureStdExtM, FeatureStdExtZmmul), + "'M' (Integer Multiplication and Division) or " + "'Zmmul' (Integer Multiplication)">; + def FeatureStdExtA : SubtargetFeature<"a", "HasStdExtA", "true", "'A' (Atomic Instructions)">; @@ -465,7 +478,8 @@ def TuneNoDefaultUnroll "Disable default unroll preference.">; def TuneSiFive7 : SubtargetFeature<"sifive7", "RISCVProcFamily", "SiFive7", - "SiFive 7-Series processors">; + "SiFive 7-Series processors", + [TuneNoDefaultUnroll]>; //===----------------------------------------------------------------------===// // Named operands for CSR instructions. 
@@ -499,9 +513,9 @@ def : ProcessorModel<"rocket-rv32", RocketModel, []>; def : ProcessorModel<"rocket-rv64", RocketModel, [Feature64Bit]>; def : ProcessorModel<"sifive-7-rv32", SiFive7Model, [], - [TuneSiFive7, TuneNoDefaultUnroll]>; + [TuneSiFive7]>; def : ProcessorModel<"sifive-7-rv64", SiFive7Model, [Feature64Bit], - [TuneSiFive7, TuneNoDefaultUnroll]>; + [TuneSiFive7]>; def : ProcessorModel<"sifive-e20", RocketModel, [FeatureStdExtM, FeatureStdExtC]>; @@ -528,7 +542,7 @@ def : ProcessorModel<"sifive-e76", SiFive7Model, [FeatureStdExtM, FeatureStdExtA, FeatureStdExtF, FeatureStdExtC], - [TuneSiFive7, TuneNoDefaultUnroll]>; + [TuneSiFive7]>; def : ProcessorModel<"sifive-s21", RocketModel, [Feature64Bit, FeatureStdExtM, @@ -553,7 +567,7 @@ def : ProcessorModel<"sifive-s76", SiFive7Model, [Feature64Bit, FeatureStdExtF, FeatureStdExtD, FeatureStdExtC], - [TuneSiFive7, TuneNoDefaultUnroll]>; + [TuneSiFive7]>; def : ProcessorModel<"sifive-u54", RocketModel, [Feature64Bit, FeatureStdExtM, @@ -568,7 +582,7 @@ def : ProcessorModel<"sifive-u74", SiFive7Model, [Feature64Bit, FeatureStdExtF, FeatureStdExtD, FeatureStdExtC], - [TuneSiFive7, TuneNoDefaultUnroll]>; + [TuneSiFive7]>; //===----------------------------------------------------------------------===// // Define the RISC-V target. diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp new file mode 100644 index 000000000000..b700a9ede39b --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp @@ -0,0 +1,169 @@ +//===----- RISCVCodeGenPrepare.cpp ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is a RISCV specific version of CodeGenPrepare. +// It munges the code in the input function to better prepare it for +// SelectionDAG-based code generation. This works around limitations in it's +// basic-block-at-a-time approach. +// +//===----------------------------------------------------------------------===// + +#include "RISCV.h" +#include "RISCVTargetMachine.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" + +using namespace llvm; + +#define DEBUG_TYPE "riscv-codegenprepare" +#define PASS_NAME "RISCV CodeGenPrepare" + +STATISTIC(NumZExtToSExt, "Number of SExt instructions converted to ZExt"); + +namespace { + +class RISCVCodeGenPrepare : public FunctionPass { + const DataLayout *DL; + const RISCVSubtarget *ST; + +public: + static char ID; + + RISCVCodeGenPrepare() : FunctionPass(ID) {} + + bool runOnFunction(Function &F) override; + + StringRef getPassName() const override { return PASS_NAME; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<TargetPassConfig>(); + } + +private: + bool optimizeZExt(ZExtInst *I); + bool optimizeAndExt(BinaryOperator *BO); +}; + +} // end anonymous namespace + +bool RISCVCodeGenPrepare::optimizeZExt(ZExtInst *ZExt) { + if (!ST->is64Bit()) + return false; + + Value *Src = ZExt->getOperand(0); + + // We only care about ZExt from i32 to i64. 
+ if (!ZExt->getType()->isIntegerTy(64) || !Src->getType()->isIntegerTy(32)) + return false; + + // Look for an opportunity to replace (i64 (zext (i32 X))) with a sext if we + // can determine that the sign bit of X is zero via a dominating condition. + // This often occurs with widened induction variables. + if (isImpliedByDomCondition(ICmpInst::ICMP_SGE, Src, + Constant::getNullValue(Src->getType()), ZExt, + *DL)) { + auto *SExt = new SExtInst(Src, ZExt->getType(), "", ZExt); + SExt->takeName(ZExt); + SExt->setDebugLoc(ZExt->getDebugLoc()); + + ZExt->replaceAllUsesWith(SExt); + ZExt->eraseFromParent(); + ++NumZExtToSExt; + return true; + } + + return false; +} + +// Try to optimize (i64 (and (zext/sext (i32 X), C1))) if C1 has bit 31 set, +// but bits 63:32 are zero. If we can prove that bit 31 of X is 0, we can fill +// the upper 32 bits with ones. A separate transform will turn (zext X) into +// (sext X) for the same condition. +bool RISCVCodeGenPrepare::optimizeAndExt(BinaryOperator *BO) { + if (!ST->is64Bit()) + return false; + + if (BO->getOpcode() != Instruction::And) + return false; + + if (!BO->getType()->isIntegerTy(64)) + return false; + + // Left hand side should be sext or zext. + Instruction *LHS = dyn_cast<Instruction>(BO->getOperand(0)); + if (!LHS || (!isa<SExtInst>(LHS) && !isa<ZExtInst>(LHS))) + return false; + + Value *LHSSrc = LHS->getOperand(0); + if (!LHSSrc->getType()->isIntegerTy(32)) + return false; + + // Right hand side should be a constant. + Value *RHS = BO->getOperand(1); + + auto *CI = dyn_cast<ConstantInt>(RHS); + if (!CI) + return false; + uint64_t C = CI->getZExtValue(); + + // Look for constants that fit in 32 bits but not simm12, and can be made + // into simm12 by sign extending bit 31. This will allow use of ANDI. + // TODO: Is worth making simm32? + if (!isUInt<32>(C) || isInt<12>(C) || !isInt<12>(SignExtend64<32>(C))) + return false; + + // If we can determine the sign bit of the input is 0, we can replace the + // And mask constant. + if (!isImpliedByDomCondition(ICmpInst::ICMP_SGE, LHSSrc, + Constant::getNullValue(LHSSrc->getType()), + LHS, *DL)) + return false; + + // Sign extend the constant and replace the And operand. 
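A numeric check of the mask-widening trick above, with a constant chosen for illustration: 0xFFFFF800 fits in 32 bits and is not a simm12, but its sign extension from bit 31 is -2048, which is. When bit 31 of the 32-bit source is known to be zero, filling bits 63:32 of the mask with ones cannot change the result, so the AND can use the simm12 constant (ANDI).

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Src = 0x7FFFF123u;                     // bit 31 known to be zero
  uint64_t X = (uint64_t)Src;                     // zext; sext gives the same value here
  uint64_t C = 0xFFFFF800u;                       // original 32-bit mask, bit 31 set
  uint64_t CSext = (uint64_t)(int64_t)(int32_t)C; // 0xFFFFFFFFFFFFF800 == -2048, a simm12
  assert((X & C) == (X & CSext));                 // widening the mask is safe
}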
+ C = SignExtend64<32>(C); + BO->setOperand(1, ConstantInt::get(LHS->getType(), C)); + + return true; +} + +bool RISCVCodeGenPrepare::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + auto &TPC = getAnalysis<TargetPassConfig>(); + auto &TM = TPC.getTM<RISCVTargetMachine>(); + ST = &TM.getSubtarget<RISCVSubtarget>(F); + + DL = &F.getParent()->getDataLayout(); + + bool MadeChange = false; + for (auto &BB : F) { + for (Instruction &I : llvm::make_early_inc_range(BB)) { + if (auto *ZExt = dyn_cast<ZExtInst>(&I)) + MadeChange |= optimizeZExt(ZExt); + else if (I.getOpcode() == Instruction::And) + MadeChange |= optimizeAndExt(cast<BinaryOperator>(&I)); + } + } + + return MadeChange; +} + +INITIALIZE_PASS_BEGIN(RISCVCodeGenPrepare, DEBUG_TYPE, PASS_NAME, false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(RISCVCodeGenPrepare, DEBUG_TYPE, PASS_NAME, false, false) + +char RISCVCodeGenPrepare::ID = 0; + +FunctionPass *llvm::createRISCVCodeGenPreparePass() { + return new RISCVCodeGenPrepare(); +} diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 5b823af1e9b8..d5826b46d738 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -690,6 +690,14 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // 32 trailing ones should use srliw via tablegen pattern. if (TrailingOnes == 32 || ShAmt >= TrailingOnes) break; + // If C2 is (1 << ShAmt) use bexti if possible. + if (Subtarget->hasStdExtZbs() && ShAmt + 1 == TrailingOnes) { + SDNode *BEXTI = + CurDAG->getMachineNode(RISCV::BEXTI, DL, VT, N0->getOperand(0), + CurDAG->getTargetConstant(ShAmt, DL, VT)); + ReplaceNode(Node, BEXTI); + return; + } unsigned LShAmt = Subtarget->getXLen() - TrailingOnes; SDNode *SLLI = CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0->getOperand(0), @@ -939,18 +947,17 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (!isMask_64(C2)) break; - // This should be the only use of the AND unless we will use - // (SRLI (SLLI X, 32), 32). We don't use a shift pair for other AND - // constants. - if (!N0.hasOneUse() && C2 != UINT64_C(0xFFFFFFFF)) - break; - - // If this can be an ANDI, ZEXT.H or ZEXT.W we don't need to do this - // optimization. - if (isInt<12>(C2) || + // If this can be an ANDI, ZEXT.H or ZEXT.W, don't do this if the ANDI/ZEXT + // has multiple users or the constant is a simm12. This prevents inserting + // a shift and still have uses of the AND/ZEXT. Shifting a simm12 will + // likely make it more costly to materialize. Otherwise, using a SLLI + // might allow it to be compressed. + bool IsANDIOrZExt = + isInt<12>(C2) || (C2 == UINT64_C(0xFFFF) && (Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbp())) || - (C2 == UINT64_C(0xFFFFFFFF) && Subtarget->hasStdExtZba())) + (C2 == UINT64_C(0xFFFFFFFF) && Subtarget->hasStdExtZba()); + if (IsANDIOrZExt && (isInt<12>(N1C->getSExtValue()) || !N0.hasOneUse())) break; // We need to shift left the AND input and C1 by a total of XLen bits. 
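A standalone check of the BEXTI special case added earlier in this hunk: when the AND mask has ShAmt + 1 trailing ones and the value is then shifted right by ShAmt, only bit ShAmt survives, which is exactly what bexti rd, rs, ShAmt extracts.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0xDEADBEEFCAFEF00DULL;
  unsigned TrailingOnes = 8, ShAmt = 7;          // mask 0xFF, shift by 7
  uint64_t Mask = (1ULL << TrailingOnes) - 1;
  assert(((X & Mask) >> ShAmt) == ((X >> ShAmt) & 1)); // single-bit extract
}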
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 658865703079..1702546b58a6 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -215,21 +215,26 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setLibcallName(RTLIB::MULO_I64, nullptr); } - if (!Subtarget.hasStdExtM()) { - setOperationAction({ISD::MUL, ISD::MULHS, ISD::MULHU, ISD::SDIV, ISD::UDIV, - ISD::SREM, ISD::UREM}, - XLenVT, Expand); + if (!Subtarget.hasStdExtM() && !Subtarget.hasStdExtZmmul()) { + setOperationAction({ISD::MUL, ISD::MULHS, ISD::MULHU}, XLenVT, Expand); } else { if (Subtarget.is64Bit()) { setOperationAction(ISD::MUL, {MVT::i32, MVT::i128}, Custom); - - setOperationAction({ISD::SDIV, ISD::UDIV, ISD::UREM}, - {MVT::i8, MVT::i16, MVT::i32}, Custom); } else { setOperationAction(ISD::MUL, MVT::i64, Custom); } } + if (!Subtarget.hasStdExtM()) { + setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, + XLenVT, Expand); + } else { + if (Subtarget.is64Bit()) { + setOperationAction({ISD::SDIV, ISD::UDIV, ISD::UREM}, + {MVT::i8, MVT::i16, MVT::i32}, Custom); + } + } + setOperationAction( {ISD::SDIVREM, ISD::UDIVREM, ISD::SMUL_LOHI, ISD::UMUL_LOHI}, XLenVT, Expand); @@ -294,7 +299,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, XLenVT, Custom); } - static constexpr ISD::NodeType FPLegalNodeTypes[] = { + static const unsigned FPLegalNodeTypes[] = { ISD::FMINNUM, ISD::FMAXNUM, ISD::LRINT, ISD::LLRINT, ISD::LROUND, ISD::LLROUND, ISD::STRICT_LRINT, ISD::STRICT_LLRINT, ISD::STRICT_LROUND, @@ -307,7 +312,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUNE, ISD::SETGT, ISD::SETGE, ISD::SETNE, ISD::SETO, ISD::SETUO}; - static const ISD::NodeType FPOpToExpand[] = { + static const unsigned FPOpToExpand[] = { ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOW, ISD::FREM, ISD::FP16_TO_FP, ISD::FP_TO_FP16}; @@ -315,8 +320,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITCAST, MVT::i16, Custom); if (Subtarget.hasStdExtZfh()) { - for (auto NT : FPLegalNodeTypes) - setOperationAction(NT, MVT::f16, Legal); + setOperationAction(FPLegalNodeTypes, MVT::f16, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Legal); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal); setCondCodeAction(FPCCToExpand, MVT::f16, Expand); @@ -340,14 +344,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, } if (Subtarget.hasStdExtF()) { - for (auto NT : FPLegalNodeTypes) - setOperationAction(NT, MVT::f32, Legal); + setOperationAction(FPLegalNodeTypes, MVT::f32, Legal); setCondCodeAction(FPCCToExpand, MVT::f32, Expand); setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::BR_CC, MVT::f32, Expand); - for (auto Op : FPOpToExpand) - setOperationAction(Op, MVT::f32, Expand); + setOperationAction(FPOpToExpand, MVT::f32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); } @@ -356,8 +358,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITCAST, MVT::i32, Custom); if (Subtarget.hasStdExtD()) { - for (auto NT : FPLegalNodeTypes) - setOperationAction(NT, MVT::f64, Legal); + setOperationAction(FPLegalNodeTypes, MVT::f64, Legal); 
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); setCondCodeAction(FPCCToExpand, MVT::f64, Expand); @@ -366,8 +367,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BR_CC, MVT::f64, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); - for (auto Op : FPOpToExpand) - setOperationAction(Op, MVT::f64, Expand); + setOperationAction(FPOpToExpand, MVT::f64, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); } @@ -458,17 +458,22 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_SETCC, ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}; + static const unsigned IntegerVecReduceOps[] = { + ISD::VECREDUCE_ADD, ISD::VECREDUCE_AND, ISD::VECREDUCE_OR, + ISD::VECREDUCE_XOR, ISD::VECREDUCE_SMAX, ISD::VECREDUCE_SMIN, + ISD::VECREDUCE_UMAX, ISD::VECREDUCE_UMIN}; + + static const unsigned FloatingPointVecReduceOps[] = { + ISD::VECREDUCE_FADD, ISD::VECREDUCE_SEQ_FADD, ISD::VECREDUCE_FMIN, + ISD::VECREDUCE_FMAX}; + if (!Subtarget.is64Bit()) { // We must custom-lower certain vXi64 operations on RV32 due to the vector // element type being illegal. setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT}, MVT::i64, Custom); - setOperationAction({ISD::VECREDUCE_ADD, ISD::VECREDUCE_AND, - ISD::VECREDUCE_OR, ISD::VECREDUCE_XOR, - ISD::VECREDUCE_SMAX, ISD::VECREDUCE_SMIN, - ISD::VECREDUCE_UMAX, ISD::VECREDUCE_UMIN}, - MVT::i64, Custom); + setOperationAction(IntegerVecReduceOps, MVT::i64, Custom); setOperationAction({ISD::VP_REDUCE_ADD, ISD::VP_REDUCE_AND, ISD::VP_REDUCE_OR, ISD::VP_REDUCE_XOR, @@ -581,11 +586,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // Custom-lower reduction operations to set up the corresponding custom // nodes' operands. - setOperationAction({ISD::VECREDUCE_ADD, ISD::VECREDUCE_AND, - ISD::VECREDUCE_OR, ISD::VECREDUCE_XOR, - ISD::VECREDUCE_SMAX, ISD::VECREDUCE_SMIN, - ISD::VECREDUCE_UMAX, ISD::VECREDUCE_UMIN}, - VT, Custom); + setOperationAction(IntegerVecReduceOps, VT, Custom); setOperationAction(IntegerVPOps, VT, Custom); @@ -661,9 +662,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FFLOOR, ISD::FROUND}, VT, Custom); - setOperationAction({ISD::VECREDUCE_FADD, ISD::VECREDUCE_SEQ_FADD, - ISD::VECREDUCE_FMIN, ISD::VECREDUCE_FMAX}, - VT, Custom); + setOperationAction(FloatingPointVecReduceOps, VT, Custom); // Expand FP operations that need libcalls. 
setOperationAction(ISD::FREM, VT, Expand); @@ -905,17 +904,14 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FFLOOR, ISD::FROUND}, VT, Custom); - for (auto CC : VFPCCToExpand) - setCondCodeAction(CC, VT, Expand); + setCondCodeAction(VFPCCToExpand, VT, Expand); setOperationAction({ISD::VSELECT, ISD::SELECT}, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::BITCAST, VT, Custom); - setOperationAction({ISD::VECREDUCE_FADD, ISD::VECREDUCE_SEQ_FADD, - ISD::VECREDUCE_FMIN, ISD::VECREDUCE_FMAX}, - VT, Custom); + setOperationAction(FloatingPointVecReduceOps, VT, Custom); setOperationAction(FloatingPointVPOps, VT, Custom); } @@ -943,7 +939,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setJumpIsExpensive(); setTargetDAGCombine({ISD::INTRINSIC_WO_CHAIN, ISD::ADD, ISD::SUB, ISD::AND, - ISD::OR, ISD::XOR}); + ISD::OR, ISD::XOR, ISD::SETCC}); if (Subtarget.is64Bit()) setTargetDAGCombine(ISD::SRA); @@ -1374,6 +1370,23 @@ unsigned RISCVTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context // with 1/-1. static void translateSetCCForBranch(const SDLoc &DL, SDValue &LHS, SDValue &RHS, ISD::CondCode &CC, SelectionDAG &DAG) { + // If this is a single bit test that can't be handled by ANDI, shift the + // bit to be tested to the MSB and perform a signed compare with 0. + if (isIntEqualitySetCC(CC) && isNullConstant(RHS) && + LHS.getOpcode() == ISD::AND && LHS.hasOneUse() && + isa<ConstantSDNode>(LHS.getOperand(1))) { + uint64_t Mask = LHS.getConstantOperandVal(1); + if (isPowerOf2_64(Mask) && !isInt<12>(Mask)) { + CC = CC == ISD::SETEQ ? ISD::SETGE : ISD::SETLT; + unsigned ShAmt = LHS.getValueSizeInBits() - 1 - Log2_64(Mask); + LHS = LHS.getOperand(0); + if (ShAmt != 0) + LHS = DAG.getNode(ISD::SHL, DL, LHS.getValueType(), LHS, + DAG.getConstant(ShAmt, DL, LHS.getValueType())); + return; + } + } + // Convert X > -1 to X >= 0. if (CC == ISD::SETGT && isAllOnesConstant(RHS)) { RHS = DAG.getConstant(0, DL, RHS.getValueType()); @@ -3707,10 +3720,7 @@ SDValue RISCVTargetLowering::lowerGlobalAddress(SDValue Op, SDLoc DL(Op); GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op); assert(N->getOffset() == 0 && "unexpected offset in global node"); - - const GlobalValue *GV = N->getGlobal(); - bool IsLocal = getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV); - return getAddr(N, DAG, IsLocal); + return getAddr(N, DAG, N->getGlobal()->isDSOLocal()); } SDValue RISCVTargetLowering::lowerBlockAddress(SDValue Op, @@ -8130,6 +8140,50 @@ static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG) { return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false); } +// Replace (seteq (i64 (and X, 0xffffffff)), C1) with +// (seteq (i64 (sext_inreg (X, i32)), C1')) where C1' is C1 sign extended from +// bit 31. Same for setne. C1' may be cheaper to materialize and the sext_inreg +// can become a sext.w instead of a shift pair. +static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N->getValueType(0); + EVT OpVT = N0.getValueType(); + + if (OpVT != MVT::i64 || !Subtarget.is64Bit()) + return SDValue(); + + // RHS needs to be a constant. + auto *N1C = dyn_cast<ConstantSDNode>(N1); + if (!N1C) + return SDValue(); + + // LHS needs to be (and X, 0xffffffff). 
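A numeric check of the performSETCCCombine rewrite introduced above (illustrative values): comparing the zero-extended low 32 bits of X against C1 gives the same answer as comparing the sign-extended low 32 bits (sext_inreg X, i32, i.e. sext.w) against C1 sign-extended from bit 31, as long as C1 has no active bits above bit 31.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t C1 = 0x80000001u;               // bit 31 set, fits in 32 bits
  int64_t C1Sext = (int64_t)(int32_t)C1;   // C1' from the comment above
  for (uint64_t X : {0x80000001ULL, 0x123400000001ULL, 0x7FFFFFFFULL}) {
    bool ZextCmp = (X & 0xFFFFFFFFULL) == C1;      // (seteq (and X, 0xffffffff), C1)
    bool SextCmp = (int64_t)(int32_t)X == C1Sext;  // (seteq (sext_inreg X, i32), C1')
    assert(ZextCmp == SextCmp);
  }
  // If C1 needed more than 32 bits, the original compare could never be true.
}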
+ if (N0.getOpcode() != ISD::AND || !N0.hasOneUse() || + !isa<ConstantSDNode>(N0.getOperand(1)) || + N0.getConstantOperandVal(1) != UINT64_C(0xffffffff)) + return SDValue(); + + // Looking for an equality compare. + ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get(); + if (!isIntEqualitySetCC(Cond)) + return SDValue(); + + const APInt &C1 = cast<ConstantSDNode>(N1)->getAPIntValue(); + + SDLoc dl(N); + // If the constant is larger than 2^32 - 1 it is impossible for both sides + // to be equal. + if (C1.getActiveBits() > 32) + return DAG.getBoolConstant(Cond == ISD::SETNE, dl, VT, OpVT); + + SDValue SExtOp = DAG.getNode(ISD::SIGN_EXTEND_INREG, N, OpVT, + N0.getOperand(0), DAG.getValueType(MVT::i32)); + return DAG.getSetCC(dl, VT, SExtOp, DAG.getConstant(C1.trunc(32).sext(64), + dl, OpVT), Cond); +} + static SDValue performSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { @@ -8658,6 +8712,75 @@ static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG, DAG.getConstant(32 - ShAmt, DL, MVT::i64)); } +// Perform common combines for BR_CC and SELECT_CC condtions. +static bool combine_CC(SDValue &LHS, SDValue &RHS, SDValue &CC, const SDLoc &DL, + SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { + ISD::CondCode CCVal = cast<CondCodeSDNode>(CC)->get(); + if (!ISD::isIntEqualitySetCC(CCVal)) + return false; + + // Fold ((setlt X, Y), 0, ne) -> (X, Y, lt) + // Sometimes the setcc is introduced after br_cc/select_cc has been formed. + if (LHS.getOpcode() == ISD::SETCC && isNullConstant(RHS) && + LHS.getOperand(0).getValueType() == Subtarget.getXLenVT()) { + // If we're looking for eq 0 instead of ne 0, we need to invert the + // condition. + bool Invert = CCVal == ISD::SETEQ; + CCVal = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); + if (Invert) + CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType()); + + RHS = LHS.getOperand(1); + LHS = LHS.getOperand(0); + translateSetCCForBranch(DL, LHS, RHS, CCVal, DAG); + + CC = DAG.getCondCode(CCVal); + return true; + } + + // Fold ((xor X, Y), 0, eq/ne) -> (X, Y, eq/ne) + if (LHS.getOpcode() == ISD::XOR && isNullConstant(RHS)) { + RHS = LHS.getOperand(1); + LHS = LHS.getOperand(0); + return true; + } + + // Fold ((srl (and X, 1<<C), C), 0, eq/ne) -> ((shl X, XLen-1-C), 0, ge/lt) + if (isNullConstant(RHS) && LHS.getOpcode() == ISD::SRL && LHS.hasOneUse() && + LHS.getOperand(1).getOpcode() == ISD::Constant) { + SDValue LHS0 = LHS.getOperand(0); + if (LHS0.getOpcode() == ISD::AND && + LHS0.getOperand(1).getOpcode() == ISD::Constant) { + uint64_t Mask = LHS0.getConstantOperandVal(1); + uint64_t ShAmt = LHS.getConstantOperandVal(1); + if (isPowerOf2_64(Mask) && Log2_64(Mask) == ShAmt) { + CCVal = CCVal == ISD::SETEQ ? ISD::SETGE : ISD::SETLT; + CC = DAG.getCondCode(CCVal); + + ShAmt = LHS.getValueSizeInBits() - 1 - ShAmt; + LHS = LHS0.getOperand(0); + if (ShAmt != 0) + LHS = + DAG.getNode(ISD::SHL, DL, LHS.getValueType(), LHS0.getOperand(0), + DAG.getConstant(ShAmt, DL, LHS.getValueType())); + return true; + } + } + } + + // (X, 1, setne) -> // (X, 0, seteq) if we can prove X is 0/1. + // This can occur when legalizing some floating point comparisons. 
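A standalone check of the single-bit test rewrite used above (in translateSetCCForBranch and in the srl/and fold of combine_CC): when the tested bit cannot be reached with ANDI, shifting it into the sign position turns the equality test against zero into a signed comparison with zero.

#include <cassert>
#include <cstdint>

int main() {
  unsigned K = 40;                                  // 1 << 40 is not a simm12
  for (uint64_t X : {0x0ULL, 1ULL << 40, ~0ULL}) {
    bool BitClear = (X & (1ULL << K)) == 0;         // original seteq against 0
    bool SignTest = (int64_t)(X << (63 - K)) >= 0;  // rewritten setge against 0
    assert(BitClear == SignTest);
  }
}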
+ APInt Mask = APInt::getBitsSetFrom(LHS.getValueSizeInBits(), 1); + if (isOneConstant(RHS) && DAG.MaskedValueIsZero(LHS, Mask)) { + CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType()); + CC = DAG.getCondCode(CCVal); + RHS = DAG.getConstant(0, DL, LHS.getValueType()); + return true; + } + + return false; +} + SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -8872,6 +8995,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, case ISD::FMAXNUM: case ISD::FMINNUM: return combineBinOpToReduce(N, DAG); + case ISD::SETCC: + return performSETCCCombine(N, DAG, Subtarget); case ISD::SIGN_EXTEND_INREG: return performSIGN_EXTEND_INREGCombine(N, DAG, Subtarget); case ISD::ZERO_EXTEND: @@ -8900,110 +9025,32 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, // Transform SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); + SDValue CC = N->getOperand(2); SDValue TrueV = N->getOperand(3); SDValue FalseV = N->getOperand(4); + SDLoc DL(N); // If the True and False values are the same, we don't need a select_cc. if (TrueV == FalseV) return TrueV; - ISD::CondCode CCVal = cast<CondCodeSDNode>(N->getOperand(2))->get(); - if (!ISD::isIntEqualitySetCC(CCVal)) - break; - - // Fold (select_cc (setlt X, Y), 0, ne, trueV, falseV) -> - // (select_cc X, Y, lt, trueV, falseV) - // Sometimes the setcc is introduced after select_cc has been formed. - if (LHS.getOpcode() == ISD::SETCC && isNullConstant(RHS) && - LHS.getOperand(0).getValueType() == Subtarget.getXLenVT()) { - // If we're looking for eq 0 instead of ne 0, we need to invert the - // condition. - bool Invert = CCVal == ISD::SETEQ; - CCVal = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); - if (Invert) - CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType()); - - SDLoc DL(N); - RHS = LHS.getOperand(1); - LHS = LHS.getOperand(0); - translateSetCCForBranch(DL, LHS, RHS, CCVal, DAG); - - SDValue TargetCC = DAG.getCondCode(CCVal); - return DAG.getNode(RISCVISD::SELECT_CC, DL, N->getValueType(0), - {LHS, RHS, TargetCC, TrueV, FalseV}); - } - - // Fold (select_cc (xor X, Y), 0, eq/ne, trueV, falseV) -> - // (select_cc X, Y, eq/ne, trueV, falseV) - if (LHS.getOpcode() == ISD::XOR && isNullConstant(RHS)) - return DAG.getNode(RISCVISD::SELECT_CC, SDLoc(N), N->getValueType(0), - {LHS.getOperand(0), LHS.getOperand(1), - N->getOperand(2), TrueV, FalseV}); - // (select_cc X, 1, setne, trueV, falseV) -> - // (select_cc X, 0, seteq, trueV, falseV) if we can prove X is 0/1. - // This can occur when legalizing some floating point comparisons. - APInt Mask = APInt::getBitsSetFrom(LHS.getValueSizeInBits(), 1); - if (isOneConstant(RHS) && DAG.MaskedValueIsZero(LHS, Mask)) { - SDLoc DL(N); - CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType()); - SDValue TargetCC = DAG.getCondCode(CCVal); - RHS = DAG.getConstant(0, DL, LHS.getValueType()); + if (combine_CC(LHS, RHS, CC, DL, DAG, Subtarget)) return DAG.getNode(RISCVISD::SELECT_CC, DL, N->getValueType(0), - {LHS, RHS, TargetCC, TrueV, FalseV}); - } + {LHS, RHS, CC, TrueV, FalseV}); - break; + return SDValue(); } case RISCVISD::BR_CC: { SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); - ISD::CondCode CCVal = cast<CondCodeSDNode>(N->getOperand(3))->get(); - if (!ISD::isIntEqualitySetCC(CCVal)) - break; - - // Fold (br_cc (setlt X, Y), 0, ne, dest) -> - // (br_cc X, Y, lt, dest) - // Sometimes the setcc is introduced after br_cc has been formed. 
- if (LHS.getOpcode() == ISD::SETCC && isNullConstant(RHS) && - LHS.getOperand(0).getValueType() == Subtarget.getXLenVT()) { - // If we're looking for eq 0 instead of ne 0, we need to invert the - // condition. - bool Invert = CCVal == ISD::SETEQ; - CCVal = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); - if (Invert) - CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType()); - - SDLoc DL(N); - RHS = LHS.getOperand(1); - LHS = LHS.getOperand(0); - translateSetCCForBranch(DL, LHS, RHS, CCVal, DAG); + SDValue CC = N->getOperand(3); + SDLoc DL(N); + if (combine_CC(LHS, RHS, CC, DL, DAG, Subtarget)) return DAG.getNode(RISCVISD::BR_CC, DL, N->getValueType(0), - N->getOperand(0), LHS, RHS, DAG.getCondCode(CCVal), - N->getOperand(4)); - } - - // Fold (br_cc (xor X, Y), 0, eq/ne, dest) -> - // (br_cc X, Y, eq/ne, trueV, falseV) - if (LHS.getOpcode() == ISD::XOR && isNullConstant(RHS)) - return DAG.getNode(RISCVISD::BR_CC, SDLoc(N), N->getValueType(0), - N->getOperand(0), LHS.getOperand(0), LHS.getOperand(1), - N->getOperand(3), N->getOperand(4)); - - // (br_cc X, 1, setne, br_cc) -> - // (br_cc X, 0, seteq, br_cc) if we can prove X is 0/1. - // This can occur when legalizing some floating point comparisons. - APInt Mask = APInt::getBitsSetFrom(LHS.getValueSizeInBits(), 1); - if (isOneConstant(RHS) && DAG.MaskedValueIsZero(LHS, Mask)) { - SDLoc DL(N); - CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType()); - SDValue TargetCC = DAG.getCondCode(CCVal); - RHS = DAG.getConstant(0, DL, LHS.getValueType()); - return DAG.getNode(RISCVISD::BR_CC, DL, N->getValueType(0), - N->getOperand(0), LHS, RHS, TargetCC, - N->getOperand(4)); - } - break; + N->getOperand(0), LHS, RHS, CC, N->getOperand(4)); + + return SDValue(); } case ISD::BITREVERSE: return performBITREVERSECombine(N, DAG, Subtarget); @@ -9299,6 +9346,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, bool RISCVTargetLowering::isDesirableToCommuteWithShift( const SDNode *N, CombineLevel Level) const { + assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA || + N->getOpcode() == ISD::SRL) && + "Expected shift op"); + // The following folds are only desirable if `(OP _, c1 << c2)` can be // materialised in fewer instructions than `(OP _, c1)`: // @@ -9357,7 +9408,8 @@ bool RISCVTargetLowering::targetShrinkDemandedConstant( return false; // Only handle AND for now. - if (Op.getOpcode() != ISD::AND) + unsigned Opcode = Op.getOpcode(); + if (Opcode != ISD::AND && Opcode != ISD::OR && Opcode != ISD::XOR) return false; ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); @@ -9376,12 +9428,13 @@ bool RISCVTargetLowering::targetShrinkDemandedConstant( auto IsLegalMask = [ShrunkMask, ExpandedMask](const APInt &Mask) -> bool { return ShrunkMask.isSubsetOf(Mask) && Mask.isSubsetOf(ExpandedMask); }; - auto UseMask = [Mask, Op, VT, &TLO](const APInt &NewMask) -> bool { + auto UseMask = [Mask, Op, &TLO](const APInt &NewMask) -> bool { if (NewMask == Mask) return true; SDLoc DL(Op); - SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT); - SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC); + SDValue NewC = TLO.DAG.getConstant(NewMask, DL, Op.getValueType()); + SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), + Op.getOperand(0), NewC); return TLO.CombineTo(Op, NewOp); }; @@ -9390,18 +9443,21 @@ bool RISCVTargetLowering::targetShrinkDemandedConstant( if (ShrunkMask.isSignedIntN(12)) return false; - // Preserve (and X, 0xffff) when zext.h is supported. 
- if (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbp()) { - APInt NewMask = APInt(Mask.getBitWidth(), 0xffff); - if (IsLegalMask(NewMask)) - return UseMask(NewMask); - } + // And has a few special cases for zext. + if (Opcode == ISD::AND) { + // Preserve (and X, 0xffff) when zext.h is supported. + if (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbp()) { + APInt NewMask = APInt(Mask.getBitWidth(), 0xffff); + if (IsLegalMask(NewMask)) + return UseMask(NewMask); + } - // Try to preserve (and X, 0xffffffff), the (zext_inreg X, i32) pattern. - if (VT == MVT::i64) { - APInt NewMask = APInt(64, 0xffffffff); - if (IsLegalMask(NewMask)) - return UseMask(NewMask); + // Try to preserve (and X, 0xffffffff), the (zext_inreg X, i32) pattern. + if (VT == MVT::i64) { + APInt NewMask = APInt(64, 0xffffffff); + if (IsLegalMask(NewMask)) + return UseMask(NewMask); + } } // For the remaining optimizations, we need to be able to make a negative @@ -9414,10 +9470,11 @@ bool RISCVTargetLowering::targetShrinkDemandedConstant( // Try to make a 12 bit negative immediate. If that fails try to make a 32 // bit negative immediate unless the shrunk immediate already fits in 32 bits. + // If we can't create a simm12, we shouldn't change opaque constants. APInt NewMask = ShrunkMask; if (MinSignedBits <= 12) NewMask.setBitsFrom(11); - else if (MinSignedBits <= 32 && !ShrunkMask.isSignedIntN(32)) + else if (!C->isOpaque() && MinSignedBits <= 32 && !ShrunkMask.isSignedIntN(32)) NewMask.setBitsFrom(31); else return false; @@ -10015,15 +10072,15 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI, LastSelectPseudo = &*SequenceMBBI; SequenceMBBI->collectDebugValues(SelectDebugValues); SelectDests.insert(SequenceMBBI->getOperand(0).getReg()); - } else { - if (SequenceMBBI->hasUnmodeledSideEffects() || - SequenceMBBI->mayLoadOrStore()) - break; - if (llvm::any_of(SequenceMBBI->operands(), [&](MachineOperand &MO) { - return MO.isReg() && MO.isUse() && SelectDests.count(MO.getReg()); - })) - break; + continue; } + if (SequenceMBBI->hasUnmodeledSideEffects() || + SequenceMBBI->mayLoadOrStore()) + break; + if (llvm::any_of(SequenceMBBI->operands(), [&](MachineOperand &MO) { + return MO.isReg() && MO.isUse() && SelectDests.count(MO.getReg()); + })) + break; } const RISCVInstrInfo &TII = *Subtarget.getInstrInfo(); @@ -12159,7 +12216,8 @@ bool RISCVTargetLowering::isVScaleKnownToBeAPowerOfTwo() const { // FIXME: This doesn't work for zve32, but that's already broken // elsewhere for the same reason. assert(Subtarget.getRealMinVLen() >= 64 && "zve32* unsupported"); - assert(RISCV::RVVBitsPerBlock == 64 && "RVVBitsPerBlock changed, audit needed"); + static_assert(RISCV::RVVBitsPerBlock == 64, + "RVVBitsPerBlock changed, audit needed"); return true; } @@ -12214,10 +12272,12 @@ bool RISCVTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) bool RISCVTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const { // Check integral scalar types. + const bool HasExtMOrZmmul = + Subtarget.hasStdExtM() || Subtarget.hasStdExtZmmul(); if (VT.isScalarInteger()) { // Omit the optimization if the sub target has the M extension and the data // size exceeds XLen. - if (Subtarget.hasStdExtM() && VT.getSizeInBits() > Subtarget.getXLen()) + if (HasExtMOrZmmul && VT.getSizeInBits() > Subtarget.getXLen()) return false; if (auto *ConstNode = dyn_cast<ConstantSDNode>(C.getNode())) { // Break the MUL to a SLLI and an ADD/SUB. 
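A quick check of the decomposition mentioned in the comment above (illustrative values): a multiply by 2^k +/- 1 can be emitted as one shift plus one add or sub, which is why decomposeMulByConstant only defers to a real multiplier when M or Zmmul is available and the type is wide enough.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0x0123456789ABCDEFULL;
  assert(X * 9 == (X << 3) + X);   // 2^3 + 1: SLLI + ADD
  assert(X * 7 == (X << 3) - X);   // 2^3 - 1: SLLI + SUB
  assert(X * 33 == (X << 5) + X);  // 2^5 + 1: SLLI + ADD
}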
@@ -12232,7 +12292,7 @@ bool RISCVTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT, return true; // Omit the following optimization if the sub target has the M extension // and the data size >= XLen. - if (Subtarget.hasStdExtM() && VT.getSizeInBits() >= Subtarget.getXLen()) + if (HasExtMOrZmmul && VT.getSizeInBits() >= Subtarget.getXLen()) return false; // Break the MUL to two SLLI instructions and an ADD/SUB, if Imm needs // a pair of LUI/ADDI. diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 685604ad9a59..75a79895330f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -637,6 +637,64 @@ void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, } } +MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl( + MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS, + VirtRegMap *VRM) const { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + + // The below optimizations narrow the load so they are only valid for little + // endian. + // TODO: Support big endian by adding an offset into the frame object? + if (MF.getDataLayout().isBigEndian()) + return nullptr; + + // Fold load from stack followed by sext.w into lw. + // TODO: Fold with sext.b, sext.h, zext.b, zext.h, zext.w? + if (Ops.size() != 1 || Ops[0] != 1) + return nullptr; + + unsigned LoadOpc; + switch (MI.getOpcode()) { + default: + if (RISCV::isSEXT_W(MI)) { + LoadOpc = RISCV::LW; + break; + } + if (RISCV::isZEXT_W(MI)) { + LoadOpc = RISCV::LWU; + break; + } + if (RISCV::isZEXT_B(MI)) { + LoadOpc = RISCV::LBU; + break; + } + return nullptr; + case RISCV::SEXT_H: + LoadOpc = RISCV::LH; + break; + case RISCV::SEXT_B: + LoadOpc = RISCV::LB; + break; + case RISCV::ZEXT_H_RV32: + case RISCV::ZEXT_H_RV64: + LoadOpc = RISCV::LHU; + break; + } + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FrameIndex), + MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex), + MFI.getObjectAlign(FrameIndex)); + + Register DstReg = MI.getOperand(0).getReg(); + return BuildMI(*MI.getParent(), InsertPt, MI.getDebugLoc(), get(LoadOpc), + DstReg) + .addFrameIndex(FrameIndex) + .addImm(0) + .addMemOperand(MMO); +} + void RISCVInstrInfo::movImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register DstReg, uint64_t Val, @@ -1799,17 +1857,30 @@ Register RISCVInstrInfo::getVLENFactoredAmount(MachineFunction &MF, .addReg(VL, RegState::Kill) .addImm(ShiftAmount) .setMIFlag(Flag); - } else if ((NumOfVReg == 3 || NumOfVReg == 5 || NumOfVReg == 9) && - STI.hasStdExtZba()) { - // We can use Zba SHXADD instructions for multiply in some cases. - // TODO: Generalize to SHXADD+SLLI. + } else if (STI.hasStdExtZba() && + ((NumOfVReg % 3 == 0 && isPowerOf2_64(NumOfVReg / 3)) || + (NumOfVReg % 5 == 0 && isPowerOf2_64(NumOfVReg / 5)) || + (NumOfVReg % 9 == 0 && isPowerOf2_64(NumOfVReg / 9)))) { + // We can use Zba SHXADD+SLLI instructions for multiply in some cases. 
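A standalone check of the Zba-based scaling generalized above: when the VLEN factor is (2^s)*3, (2^s)*5 or (2^s)*9, one SLLI plus one SHxADD covers it; the values below are illustrative, and the shift/add order matches the emitted sequence (shift first, then SHxADD of the shifted value with itself).

#include <cassert>
#include <cstdint>

int main() {
  uint64_t VL = 0x1000;
  uint64_t Shifted = VL << 3;                   // SLLI  vl, vl, 3   (24 = 8*3)
  uint64_t Result  = (Shifted << 1) + Shifted;  // SH1ADD vl, vl, vl (x3)
  assert(Result == VL * 24);

  Shifted = VL << 2;                            // SLLI  vl, vl, 2   (20 = 4*5)
  Result  = (Shifted << 2) + Shifted;           // SH2ADD vl, vl, vl (x5)
  assert(Result == VL * 20);
}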
unsigned Opc; - switch (NumOfVReg) { - default: llvm_unreachable("Unexpected number of vregs"); - case 3: Opc = RISCV::SH1ADD; break; - case 5: Opc = RISCV::SH2ADD; break; - case 9: Opc = RISCV::SH3ADD; break; + uint32_t ShiftAmount; + if (NumOfVReg % 9 == 0) { + Opc = RISCV::SH3ADD; + ShiftAmount = Log2_64(NumOfVReg / 9); + } else if (NumOfVReg % 5 == 0) { + Opc = RISCV::SH2ADD; + ShiftAmount = Log2_64(NumOfVReg / 5); + } else if (NumOfVReg % 3 == 0) { + Opc = RISCV::SH1ADD; + ShiftAmount = Log2_64(NumOfVReg / 3); + } else { + llvm_unreachable("Unexpected number of vregs"); } + if (ShiftAmount) + BuildMI(MBB, II, DL, get(RISCV::SLLI), VL) + .addReg(VL, RegState::Kill) + .addImm(ShiftAmount) + .setMIFlag(Flag); BuildMI(MBB, II, DL, get(Opc), VL) .addReg(VL, RegState::Kill) .addReg(VL) @@ -1839,10 +1910,11 @@ Register RISCVInstrInfo::getVLENFactoredAmount(MachineFunction &MF, } else { Register N = MRI.createVirtualRegister(&RISCV::GPRRegClass); movImm(MBB, II, DL, N, NumOfVReg, Flag); - if (!STI.hasStdExtM()) + if (!STI.hasStdExtM() && !STI.hasStdExtZmmul()) MF.getFunction().getContext().diagnose(DiagnosticInfoUnsupported{ MF.getFunction(), - "M-extension must be enabled to calculate the vscaled size/offset."}); + "M- or Zmmul-extension must be enabled to calculate the vscaled size/" + "offset."}); BuildMI(MBB, II, DL, get(RISCV::MUL), VL) .addReg(VL, RegState::Kill) .addReg(N, RegState::Kill) @@ -1852,6 +1924,24 @@ Register RISCVInstrInfo::getVLENFactoredAmount(MachineFunction &MF, return VL; } +// Returns true if this is the sext.w pattern, addiw rd, rs1, 0. +bool RISCV::isSEXT_W(const MachineInstr &MI) { + return MI.getOpcode() == RISCV::ADDIW && MI.getOperand(1).isReg() && + MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0; +} + +// Returns true if this is the zext.w pattern, adduw rd, rs1, x0. +bool RISCV::isZEXT_W(const MachineInstr &MI) { + return MI.getOpcode() == RISCV::ADD_UW && MI.getOperand(1).isReg() && + MI.getOperand(2).isReg() && MI.getOperand(2).getReg() == RISCV::X0; +} + +// Returns true if this is the zext.b pattern, andi rd, rs1, 255. +bool RISCV::isZEXT_B(const MachineInstr &MI) { + return MI.getOpcode() == RISCV::ANDI && MI.getOperand(1).isReg() && + MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 255; +} + static bool isRVVWholeLoadStore(unsigned Opcode) { switch (Opcode) { default: diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index 5368437618bd..4aa9ded5b3a2 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -69,6 +69,14 @@ public: int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; + using TargetInstrInfo::foldMemoryOperandImpl; + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, + ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, + int FrameIndex, + LiveIntervals *LIS = nullptr, + VirtRegMap *VRM = nullptr) const override; + // Materializes the given integer Val into DstReg. void movImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register DstReg, uint64_t Val, @@ -183,6 +191,11 @@ protected: namespace RISCV { +// Returns true if this is the sext.w pattern, addiw rd, rs1, 0. +bool isSEXT_W(const MachineInstr &MI); +bool isZEXT_W(const MachineInstr &MI); +bool isZEXT_B(const MachineInstr &MI); + // Returns true if the given MI is an RVV instruction opcode for which we may // expect to see a FrameIndex operand. 
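A numeric check of the idioms recognized by the isSEXT_W/isZEXT_B helpers added above: addiw rd, rs1, 0 writes the low 32 bits sign-extended, and andi rd, rs1, 255 keeps only the low byte, so they really are canonical sign/zero extensions (add.uw rd, rs1, x0 is the analogous 32-bit zero extension).

#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0xAABBCCDD80000081ULL;
  int64_t AddiW0 = (int64_t)(int32_t)(uint32_t)X;   // addiw rd, rs1, 0
  assert(AddiW0 == (int64_t)0xFFFFFFFF80000081ULL); // low word, sign-extended
  uint64_t Andi255 = X & 255;                       // andi rd, rs1, 255
  assert(Andi255 == 0x81);                          // low byte, zero-extended
}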
bool isRVVSpill(const MachineInstr &MI); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 06a90438838e..78fd09fbf387 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1278,6 +1278,13 @@ def : Pat<(setgt GPR:$rs1, simm12_minus1_nonzero:$imm), def : Pat<(setugt GPR:$rs1, simm12_minus1_nonzero:$imm), (XORI (SLTIU GPR:$rs1, (ImmPlus1 simm12_minus1_nonzero:$imm)), 1)>; +// If negating a pattern that requires an XORI above, we can fold the XORI with +// the NEG. The XORI is equivalent to 1-X and negating gives X-1. +def : Pat<(ineg (setuge GPR:$rs1, GPR:$rs2)), (ADDI (SLTU GPR:$rs1, GPR:$rs2), -1)>; +def : Pat<(ineg (setule GPR:$rs1, GPR:$rs2)), (ADDI (SLTU GPR:$rs2, GPR:$rs1), -1)>; +def : Pat<(ineg (setge GPR:$rs1, GPR:$rs2)), (ADDI (SLT GPR:$rs1, GPR:$rs2), -1)>; +def : Pat<(ineg (setle GPR:$rs1, GPR:$rs2)), (ADDI (SLT GPR:$rs2, GPR:$rs1), -1)>; + def IntCCtoRISCVCC : SDNodeXForm<riscv_selectcc, [{ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); RISCVCC::CondCode BrCC = getRISCVCCForIntCC(CC); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td index 72ba8460116f..662604b138d2 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td @@ -24,7 +24,7 @@ def riscv_remuw : SDNode<"RISCVISD::REMUW", SDT_RISCVIntBinOpW>; // Instructions //===----------------------------------------------------------------------===// -let Predicates = [HasStdExtM] in { +let Predicates = [HasStdExtMOrZmmul] in { def MUL : ALU_rr<0b0000001, 0b000, "mul", /*Commutable*/1>, Sched<[WriteIMul, ReadIMul, ReadIMul]>; def MULH : ALU_rr<0b0000001, 0b001, "mulh", /*Commutable*/1>, @@ -33,6 +33,9 @@ def MULHSU : ALU_rr<0b0000001, 0b010, "mulhsu">, Sched<[WriteIMul, ReadIMul, ReadIMul]>; def MULHU : ALU_rr<0b0000001, 0b011, "mulhu", /*Commutable*/1>, Sched<[WriteIMul, ReadIMul, ReadIMul]>; +} // Predicates = [HasStdExtMOrZmmul] + +let Predicates = [HasStdExtM] in { def DIV : ALU_rr<0b0000001, 0b100, "div">, Sched<[WriteIDiv, ReadIDiv, ReadIDiv]>; def DIVU : ALU_rr<0b0000001, 0b101, "divu">, @@ -43,9 +46,12 @@ def REMU : ALU_rr<0b0000001, 0b111, "remu">, Sched<[WriteIDiv, ReadIDiv, ReadIDiv]>; } // Predicates = [HasStdExtM] -let Predicates = [HasStdExtM, IsRV64] in { +let Predicates = [HasStdExtMOrZmmul, IsRV64] in { def MULW : ALUW_rr<0b0000001, 0b000, "mulw", /*Commutable*/1>, Sched<[WriteIMul32, ReadIMul32, ReadIMul32]>; +} // Predicates = [HasStdExtMOrZmmul, IsRV64] + +let Predicates = [HasStdExtM, IsRV64] in { def DIVW : ALUW_rr<0b0000001, 0b100, "divw">, Sched<[WriteIDiv32, ReadIDiv32, ReadIDiv32]>; def DIVUW : ALUW_rr<0b0000001, 0b101, "divuw">, @@ -60,21 +66,25 @@ def REMUW : ALUW_rr<0b0000001, 0b111, "remuw">, // Pseudo-instructions and codegen patterns //===----------------------------------------------------------------------===// -let Predicates = [HasStdExtM] in { +let Predicates = [HasStdExtMOrZmmul] in { def : PatGprGpr<mul, MUL>; def : PatGprGpr<mulhs, MULH>; def : PatGprGpr<mulhu, MULHU>; def : PatGprGpr<riscv_mulhsu, MULHSU>; +} // Predicates = [HasStdExtMOrZmmul] + +let Predicates = [HasStdExtM] in { def : PatGprGpr<sdiv, DIV>; def : PatGprGpr<udiv, DIVU>; def : PatGprGpr<srem, REM>; def : PatGprGpr<urem, REMU>; } // Predicates = [HasStdExtM] -let Predicates = [HasStdExtM, IsRV64] in { // Select W instructions if only the lower 32-bits of the result are used. 
+let Predicates = [HasStdExtMOrZmmul, IsRV64] in def : PatGprGpr<binop_allwusers<mul>, MULW>; +let Predicates = [HasStdExtM, IsRV64] in { def : PatGprGpr<riscv_divw, DIVW>; def : PatGprGpr<riscv_divuw, DIVUW>; def : PatGprGpr<riscv_remuw, REMUW>; @@ -96,11 +106,11 @@ def : Pat<(srem (sexti32 (i64 GPR:$rs1)), (sexti32 (i64 GPR:$rs2))), (REMW GPR:$rs1, GPR:$rs2)>; } // Predicates = [HasStdExtM, IsRV64] -let Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba] in { +let Predicates = [HasStdExtMOrZmmul, IsRV64, NotHasStdExtZba] in { // Special case for calculating the full 64-bit product of a 32x32 unsigned // multiply where the inputs aren't known to be zero extended. We can shift the // inputs left by 32 and use a MULHU. This saves two SRLIs needed to finish // zeroing the upper 32 bits. def : Pat<(i64 (mul (and GPR:$rs1, 0xffffffff), (and GPR:$rs2, 0xffffffff))), (MULHU (SLLI GPR:$rs1, 32), (SLLI GPR:$rs2, 32))>; -} // Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba] +} // Predicates = [HasStdExtMOrZmmul, IsRV64, NotHasStdExtZba] diff --git a/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp b/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp index dadf8f81a2c0..920729e9ebbf 100644 --- a/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp +++ b/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp @@ -443,8 +443,7 @@ bool RISCVSExtWRemoval::runOnMachineFunction(MachineFunction &MF) { MachineInstr *MI = &*I++; // We're looking for the sext.w pattern ADDIW rd, rs1, 0. - if (MI->getOpcode() != RISCV::ADDIW || !MI->getOperand(2).isImm() || - MI->getOperand(2).getImm() != 0 || !MI->getOperand(1).isReg()) + if (!RISCV::isSEXT_W(*MI)) continue; // Input should be a virtual register. diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index 7589b44b81d3..0446edefa979 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -202,11 +202,9 @@ bool RISCVSubtarget::useRVVForFixedLengthVectors() const { } bool RISCVSubtarget::enableSubRegLiveness() const { - if (EnableSubRegLiveness.getNumOccurrences()) - return EnableSubRegLiveness; - // Enable subregister liveness for RVV to better handle LMUL>1 and segment - // load/store. - return hasVInstructions(); + // FIXME: Enable subregister liveness by default for RVV to better handle + // LMUL>1 and segment load/store. 
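A standalone check of the MULHU special case kept above for the full 64-bit product of two non-zero-extended 32-bit values: shifting both inputs into the top half and taking the high 64 bits of the product avoids the two SRLIs that explicit zero extension would need. The 128-bit type below is a GCC/Clang extension used only to model MULHU.

#include <cassert>
#include <cstdint>

static uint64_t mulhu(uint64_t A, uint64_t B) {
  return (uint64_t)(((__uint128_t)A * B) >> 64);   // high half, like MULHU
}

int main() {
  uint64_t A = 0xFFFFFFFF12345678ULL, B = 0x00000001DEADBEEFULL;
  uint64_t Expected = (A & 0xFFFFFFFFULL) * (B & 0xFFFFFFFFULL);
  assert(mulhu(A << 32, B << 32) == Expected);
}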
+ return EnableSubRegLiveness; } void RISCVSubtarget::getPostRAMutations( diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 831f7fadaa62..6eb949fa551c 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -89,6 +89,7 @@ private: bool HasStdExtZicbom = false; bool HasStdExtZicboz = false; bool HasStdExtZicbop = false; + bool HasStdExtZmmul = false; bool HasRV64 = false; bool IsRV32E = false; bool EnableLinkerRelax = false; @@ -184,6 +185,7 @@ public: bool hasStdExtZicbom() const { return HasStdExtZicbom; } bool hasStdExtZicboz() const { return HasStdExtZicboz; } bool hasStdExtZicbop() const { return HasStdExtZicbop; } + bool hasStdExtZmmul() const { return HasStdExtZmmul; } bool is64Bit() const { return HasRV64; } bool isRV32E() const { return IsRV32E; } bool enableLinkerRelax() const { return EnableLinkerRelax; } diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index b2707b753e87..50fcb00e6c63 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -49,6 +49,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { initializeGlobalISel(*PR); initializeRISCVMakeCompressibleOptPass(*PR); initializeRISCVGatherScatterLoweringPass(*PR); + initializeRISCVCodeGenPreparePass(*PR); initializeRISCVMergeBaseOffsetOptPass(*PR); initializeRISCVSExtWRemovalPass(*PR); initializeRISCVExpandPseudoPass(*PR); @@ -187,7 +188,11 @@ TargetPassConfig *RISCVTargetMachine::createPassConfig(PassManagerBase &PM) { void RISCVPassConfig::addIRPasses() { addPass(createAtomicExpandPass()); - addPass(createRISCVGatherScatterLoweringPass()); + if (getOptLevel() != CodeGenOpt::None) + addPass(createRISCVGatherScatterLoweringPass()); + + if (getOptLevel() != CodeGenOpt::None) + addPass(createRISCVCodeGenPreparePass()); TargetPassConfig::addIRPasses(); } diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 29d3c5e491de..f9cd5ffb512b 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -65,7 +65,7 @@ InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb()) return TTI::TCC_Free; // zext.w - if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZbb()) + if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba()) return TTI::TCC_Free; LLVM_FALLTHROUGH; case Instruction::Add: @@ -198,6 +198,9 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, // vid.v v9 // vrsub.vx v10, v9, a0 // vrgather.vv v9, v8, v10 + if (Tp->getElementType()->isIntegerTy(1)) + // Mask operation additionally required extend and truncate + return LT.first * 9; return LT.first * 6; } } diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp index 1a3e35a5f901..220fd76305aa 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp @@ -1068,5 +1068,15 @@ StringRef getKernelProfilingInfoName(KernelProfilingInfo e) { } llvm_unreachable("Unexpected operand"); } + +std::string getExtInstSetName(InstructionSet e) { + switch (e) { + CASE(InstructionSet, OpenCL_std) + CASE(InstructionSet, GLSL_std_450) + CASE(InstructionSet, SPV_AMD_shader_trinary_minmax) + break; + } + llvm_unreachable("Unexpected 
operand"); +} } // namespace SPIRV } // namespace llvm diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h index 2aa9f076c78e..9482723993a2 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h @@ -706,6 +706,19 @@ enum class KernelProfilingInfo : uint32_t { CmdExecTime = 0x1, }; StringRef getKernelProfilingInfoName(KernelProfilingInfo e); + +enum class InstructionSet : uint32_t { + OpenCL_std = 0, + GLSL_std_450 = 1, + SPV_AMD_shader_trinary_minmax = 2, +}; +std::string getExtInstSetName(InstructionSet e); + +// TODO: implement other mnemonics. +enum class Opcode : uint32_t { + InBoundsPtrAccessChain = 70, + PtrCastToGeneric = 121, +}; } // namespace SPIRV } // namespace llvm diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp index 3105baa02c90..d60e61f36270 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp @@ -59,7 +59,7 @@ void SPIRVInstPrinter::printOpConstantVarOps(const MCInst *MI, } void SPIRVInstPrinter::recordOpExtInstImport(const MCInst *MI) { - llvm_unreachable("Unimplemented recordOpExtInstImport"); + // TODO: insert {Reg, Set} into ExtInstSetIDs map. } void SPIRVInstPrinter::printInst(const MCInst *MI, uint64_t Address, @@ -176,7 +176,18 @@ void SPIRVInstPrinter::printInst(const MCInst *MI, uint64_t Address, } void SPIRVInstPrinter::printOpExtInst(const MCInst *MI, raw_ostream &O) { - llvm_unreachable("Unimplemented printOpExtInst"); + // The fixed operands have already been printed, so just need to decide what + // type of ExtInst operands to print based on the instruction set and number. + MCInstrDesc MCDesc = MII.get(MI->getOpcode()); + unsigned NumFixedOps = MCDesc.getNumOperands(); + const auto NumOps = MI->getNumOperands(); + if (NumOps == NumFixedOps) + return; + + O << ' '; + + // TODO: implement special printing for OpenCLExtInst::vstor*. 
+ printRemainingVariableOps(MI, NumFixedOps, O, true); } void SPIRVInstPrinter::printOpDecorate(const MCInst *MI, raw_ostream &O) { diff --git a/llvm/lib/Target/SPIRV/SPIRV.h b/llvm/lib/Target/SPIRV/SPIRV.h index 8da54a5d6e61..5a7f2e51afb8 100644 --- a/llvm/lib/Target/SPIRV/SPIRV.h +++ b/llvm/lib/Target/SPIRV/SPIRV.h @@ -19,6 +19,7 @@ class SPIRVSubtarget; class InstructionSelector; class RegisterBankInfo; +ModulePass *createSPIRVPrepareFunctionsPass(); FunctionPass *createSPIRVPreLegalizerPass(); FunctionPass *createSPIRVEmitIntrinsicsPass(SPIRVTargetMachine *TM); InstructionSelector * diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp index 605bf949187f..6d60bd5e3c97 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp @@ -21,6 +21,7 @@ #include "SPIRVUtils.h" #include "TargetInfo/SPIRVTargetInfo.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -58,9 +59,14 @@ public: void outputModuleSection(SPIRV::ModuleSectionType MSType); void outputEntryPoints(); void outputDebugSourceAndStrings(const Module &M); + void outputOpExtInstImports(const Module &M); void outputOpMemoryModel(); void outputOpFunctionEnd(); void outputExtFuncDecls(); + void outputExecutionModeFromMDNode(Register Reg, MDNode *Node, + SPIRV::ExecutionMode EM); + void outputExecutionMode(const Module &M); + void outputAnnotations(const Module &M); void outputModuleSections(); void emitInstruction(const MachineInstr *MI) override; @@ -127,6 +133,8 @@ void SPIRVAsmPrinter::emitFunctionBodyEnd() { } void SPIRVAsmPrinter::emitOpLabel(const MachineBasicBlock &MBB) { + if (MAI->MBBsToSkip.contains(&MBB)) + return; MCInst LabelInst; LabelInst.setOpcode(SPIRV::OpLabel); LabelInst.addOperand(MCOperand::createReg(MAI->getOrCreateMBBRegister(MBB))); @@ -237,6 +245,13 @@ void SPIRVAsmPrinter::outputModuleSection(SPIRV::ModuleSectionType MSType) { } void SPIRVAsmPrinter::outputDebugSourceAndStrings(const Module &M) { + // Output OpSourceExtensions. + for (auto &Str : MAI->SrcExt) { + MCInst Inst; + Inst.setOpcode(SPIRV::OpSourceExtension); + addStringImm(Str.first(), Inst); + outputMCInst(Inst); + } // Output OpSource. MCInst Inst; Inst.setOpcode(SPIRV::OpSource); @@ -246,6 +261,19 @@ void SPIRVAsmPrinter::outputDebugSourceAndStrings(const Module &M) { outputMCInst(Inst); } +void SPIRVAsmPrinter::outputOpExtInstImports(const Module &M) { + for (auto &CU : MAI->ExtInstSetMap) { + unsigned Set = CU.first; + Register Reg = CU.second; + MCInst Inst; + Inst.setOpcode(SPIRV::OpExtInstImport); + Inst.addOperand(MCOperand::createReg(Reg)); + addStringImm(getExtInstSetName(static_cast<SPIRV::InstructionSet>(Set)), + Inst); + outputMCInst(Inst); + } +} + void SPIRVAsmPrinter::outputOpMemoryModel() { MCInst Inst; Inst.setOpcode(SPIRV::OpMemoryModel); @@ -301,6 +329,135 @@ void SPIRVAsmPrinter::outputExtFuncDecls() { } } +// Encode LLVM type by SPIR-V execution mode VecTypeHint. 
+static unsigned encodeVecTypeHint(Type *Ty) { + if (Ty->isHalfTy()) + return 4; + if (Ty->isFloatTy()) + return 5; + if (Ty->isDoubleTy()) + return 6; + if (IntegerType *IntTy = dyn_cast<IntegerType>(Ty)) { + switch (IntTy->getIntegerBitWidth()) { + case 8: + return 0; + case 16: + return 1; + case 32: + return 2; + case 64: + return 3; + default: + llvm_unreachable("invalid integer type"); + } + } + if (FixedVectorType *VecTy = dyn_cast<FixedVectorType>(Ty)) { + Type *EleTy = VecTy->getElementType(); + unsigned Size = VecTy->getNumElements(); + return Size << 16 | encodeVecTypeHint(EleTy); + } + llvm_unreachable("invalid type"); +} + +static void addOpsFromMDNode(MDNode *MDN, MCInst &Inst, + SPIRV::ModuleAnalysisInfo *MAI) { + for (const MDOperand &MDOp : MDN->operands()) { + if (auto *CMeta = dyn_cast<ConstantAsMetadata>(MDOp)) { + Constant *C = CMeta->getValue(); + if (ConstantInt *Const = dyn_cast<ConstantInt>(C)) { + Inst.addOperand(MCOperand::createImm(Const->getZExtValue())); + } else if (auto *CE = dyn_cast<Function>(C)) { + Register FuncReg = MAI->getFuncReg(CE->getName().str()); + assert(FuncReg.isValid()); + Inst.addOperand(MCOperand::createReg(FuncReg)); + } + } + } +} + +void SPIRVAsmPrinter::outputExecutionModeFromMDNode(Register Reg, MDNode *Node, + SPIRV::ExecutionMode EM) { + MCInst Inst; + Inst.setOpcode(SPIRV::OpExecutionMode); + Inst.addOperand(MCOperand::createReg(Reg)); + Inst.addOperand(MCOperand::createImm(static_cast<unsigned>(EM))); + addOpsFromMDNode(Node, Inst, MAI); + outputMCInst(Inst); +} + +void SPIRVAsmPrinter::outputExecutionMode(const Module &M) { + NamedMDNode *Node = M.getNamedMetadata("spirv.ExecutionMode"); + if (Node) { + for (unsigned i = 0; i < Node->getNumOperands(); i++) { + MCInst Inst; + Inst.setOpcode(SPIRV::OpExecutionMode); + addOpsFromMDNode(cast<MDNode>(Node->getOperand(i)), Inst, MAI); + outputMCInst(Inst); + } + } + for (auto FI = M.begin(), E = M.end(); FI != E; ++FI) { + const Function &F = *FI; + if (F.isDeclaration()) + continue; + Register FReg = MAI->getFuncReg(F.getGlobalIdentifier()); + assert(FReg.isValid()); + if (MDNode *Node = F.getMetadata("reqd_work_group_size")) + outputExecutionModeFromMDNode(FReg, Node, + SPIRV::ExecutionMode::LocalSize); + if (MDNode *Node = F.getMetadata("work_group_size_hint")) + outputExecutionModeFromMDNode(FReg, Node, + SPIRV::ExecutionMode::LocalSizeHint); + if (MDNode *Node = F.getMetadata("intel_reqd_sub_group_size")) + outputExecutionModeFromMDNode(FReg, Node, + SPIRV::ExecutionMode::SubgroupSize); + if (MDNode *Node = F.getMetadata("vec_type_hint")) { + MCInst Inst; + Inst.setOpcode(SPIRV::OpExecutionMode); + Inst.addOperand(MCOperand::createReg(FReg)); + unsigned EM = static_cast<unsigned>(SPIRV::ExecutionMode::VecTypeHint); + Inst.addOperand(MCOperand::createImm(EM)); + unsigned TypeCode = encodeVecTypeHint(getMDOperandAsType(Node, 0)); + Inst.addOperand(MCOperand::createImm(TypeCode)); + outputMCInst(Inst); + } + } +} + +void SPIRVAsmPrinter::outputAnnotations(const Module &M) { + outputModuleSection(SPIRV::MB_Annotations); + // Process llvm.global.annotations special global variable. + for (auto F = M.global_begin(), E = M.global_end(); F != E; ++F) { + if ((*F).getName() != "llvm.global.annotations") + continue; + const GlobalVariable *V = &(*F); + const ConstantArray *CA = cast<ConstantArray>(V->getOperand(0)); + for (Value *Op : CA->operands()) { + ConstantStruct *CS = cast<ConstantStruct>(Op); + // The first field of the struct contains a pointer to + // the annotated variable. 
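Aside on the VecTypeHint encoding defined by encodeVecTypeHint earlier in this hunk: the scalar type code (0 through 3 for i8/i16/i32/i64, then 4/5/6 for half/float/double) occupies the low 16 bits and the vector element count the high 16 bits of the OpExecutionMode operand. A minimal, LLVM-free sketch of the same packing, for illustration only:

    #include <cassert>
    #include <cstdint>

    // Mirrors the packing done by encodeVecTypeHint in the patch above:
    // high 16 bits = element count, low 16 bits = scalar type code.
    static uint32_t packVecTypeHint(uint32_t NumElements, uint32_t ScalarCode) {
      return (NumElements << 16) | ScalarCode;
    }

    int main() {
      // float is code 5, so a vec_type_hint of <4 x float> encodes as 0x40005.
      assert(packVecTypeHint(4, 5) == 0x40005u);
      return 0;
    }
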
+ Value *AnnotatedVar = CS->getOperand(0)->stripPointerCasts(); + if (!isa<Function>(AnnotatedVar)) + llvm_unreachable("Unsupported value in llvm.global.annotations"); + Function *Func = cast<Function>(AnnotatedVar); + Register Reg = MAI->getFuncReg(Func->getGlobalIdentifier()); + + // The second field contains a pointer to a global annotation string. + GlobalVariable *GV = + cast<GlobalVariable>(CS->getOperand(1)->stripPointerCasts()); + + StringRef AnnotationString; + getConstantStringInfo(GV, AnnotationString); + MCInst Inst; + Inst.setOpcode(SPIRV::OpDecorate); + Inst.addOperand(MCOperand::createReg(Reg)); + unsigned Dec = static_cast<unsigned>(SPIRV::Decoration::UserSemantic); + Inst.addOperand(MCOperand::createImm(Dec)); + addStringImm(AnnotationString, Inst); + outputMCInst(Inst); + } + } +} + void SPIRVAsmPrinter::outputModuleSections() { const Module *M = MMI->getModule(); // Get the global subtarget to output module-level info. @@ -311,13 +468,14 @@ void SPIRVAsmPrinter::outputModuleSections() { // Output instructions according to the Logical Layout of a Module: // TODO: 1,2. All OpCapability instructions, then optional OpExtension // instructions. - // TODO: 3. Optional OpExtInstImport instructions. + // 3. Optional OpExtInstImport instructions. + outputOpExtInstImports(*M); // 4. The single required OpMemoryModel instruction. outputOpMemoryModel(); // 5. All entry point declarations, using OpEntryPoint. outputEntryPoints(); // 6. Execution-mode declarations, using OpExecutionMode or OpExecutionModeId. - // TODO: + outputExecutionMode(*M); // 7a. Debug: all OpString, OpSourceExtension, OpSource, and // OpSourceContinued, without forward references. outputDebugSourceAndStrings(*M); @@ -326,7 +484,7 @@ void SPIRVAsmPrinter::outputModuleSections() { // 7c. Debug: all OpModuleProcessed instructions. outputModuleSection(SPIRV::MB_DebugModuleProcessed); // 8. All annotation instructions (all decorations). - outputModuleSection(SPIRV::MB_Annotations); + outputAnnotations(*M); // 9. All type declarations (OpTypeXXX instructions), all constant // instructions, and all global variable declarations. This section is // the first section to allow use of: OpLine and OpNoLine debug information; diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index 5b6b82aebf30..e8fedfeffde7 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -24,9 +24,8 @@ using namespace llvm; SPIRVCallLowering::SPIRVCallLowering(const SPIRVTargetLowering &TLI, - const SPIRVSubtarget &ST, SPIRVGlobalRegistry *GR) - : CallLowering(&TLI), ST(ST), GR(GR) {} + : CallLowering(&TLI), GR(GR) {} bool SPIRVCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, ArrayRef<Register> VRegs, @@ -36,11 +35,13 @@ bool SPIRVCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, // TODO: handle the case of multiple registers. 
if (VRegs.size() > 1) return false; - if (Val) + if (Val) { + const auto &STI = MIRBuilder.getMF().getSubtarget(); return MIRBuilder.buildInstr(SPIRV::OpReturnValue) .addUse(VRegs[0]) - .constrainAllUses(MIRBuilder.getTII(), *ST.getRegisterInfo(), - *ST.getRegBankInfo()); + .constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(), + *STI.getRegBankInfo()); + } MIRBuilder.buildInstr(SPIRV::OpReturn); return true; } @@ -63,6 +64,56 @@ static uint32_t getFunctionControl(const Function &F) { return FuncControl; } +static ConstantInt *getConstInt(MDNode *MD, unsigned NumOp) { + if (MD->getNumOperands() > NumOp) { + auto *CMeta = dyn_cast<ConstantAsMetadata>(MD->getOperand(NumOp)); + if (CMeta) + return dyn_cast<ConstantInt>(CMeta->getValue()); + } + return nullptr; +} + +// This code restores function args/retvalue types for composite cases +// because the final types should still be aggregate whereas they're i32 +// during the translation to cope with aggregate flattening etc. +static FunctionType *getOriginalFunctionType(const Function &F) { + auto *NamedMD = F.getParent()->getNamedMetadata("spv.cloned_funcs"); + if (NamedMD == nullptr) + return F.getFunctionType(); + + Type *RetTy = F.getFunctionType()->getReturnType(); + SmallVector<Type *, 4> ArgTypes; + for (auto &Arg : F.args()) + ArgTypes.push_back(Arg.getType()); + + auto ThisFuncMDIt = + std::find_if(NamedMD->op_begin(), NamedMD->op_end(), [&F](MDNode *N) { + return isa<MDString>(N->getOperand(0)) && + cast<MDString>(N->getOperand(0))->getString() == F.getName(); + }); + // TODO: probably one function can have numerous type mutations, + // so we should support this. + if (ThisFuncMDIt != NamedMD->op_end()) { + auto *ThisFuncMD = *ThisFuncMDIt; + MDNode *MD = dyn_cast<MDNode>(ThisFuncMD->getOperand(1)); + assert(MD && "MDNode operand is expected"); + ConstantInt *Const = getConstInt(MD, 0); + if (Const) { + auto *CMeta = dyn_cast<ConstantAsMetadata>(MD->getOperand(1)); + assert(CMeta && "ConstantAsMetadata operand is expected"); + assert(Const->getSExtValue() >= -1); + // Currently -1 indicates return value, greater values mean + // argument numbers. + if (Const->getSExtValue() == -1) + RetTy = CMeta->getType(); + else + ArgTypes[Const->getSExtValue()] = CMeta->getType(); + } + } + + return FunctionType::get(RetTy, ArgTypes, F.isVarArg()); +} + bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef<ArrayRef<Register>> VRegs, @@ -71,7 +122,8 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, GR->setCurrentFunc(MIRBuilder.getMF()); // Assign types and names to all args, and store their types for later. - SmallVector<Register, 4> ArgTypeVRegs; + FunctionType *FTy = getOriginalFunctionType(F); + SmallVector<SPIRVType *, 4> ArgTypeVRegs; if (VRegs.size() > 0) { unsigned i = 0; for (const auto &Arg : F.args()) { @@ -79,9 +131,18 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, // TODO: handle the case of multiple registers. 
if (VRegs[i].size() > 1) return false; - auto *SpirvTy = - GR->assignTypeToVReg(Arg.getType(), VRegs[i][0], MIRBuilder); - ArgTypeVRegs.push_back(GR->getSPIRVTypeID(SpirvTy)); + Type *ArgTy = FTy->getParamType(i); + SPIRV::AccessQualifier AQ = SPIRV::AccessQualifier::ReadWrite; + MDNode *Node = F.getMetadata("kernel_arg_access_qual"); + if (Node && i < Node->getNumOperands()) { + StringRef AQString = cast<MDString>(Node->getOperand(i))->getString(); + if (AQString.compare("read_only") == 0) + AQ = SPIRV::AccessQualifier::ReadOnly; + else if (AQString.compare("write_only") == 0) + AQ = SPIRV::AccessQualifier::WriteOnly; + } + auto *SpirvTy = GR->assignTypeToVReg(ArgTy, VRegs[i][0], MIRBuilder, AQ); + ArgTypeVRegs.push_back(SpirvTy); if (Arg.hasName()) buildOpName(VRegs[i][0], Arg.getName(), MIRBuilder); @@ -92,8 +153,10 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, SPIRV::Decoration::MaxByteOffset, {DerefBytes}); } if (Arg.hasAttribute(Attribute::Alignment)) { + auto Alignment = static_cast<unsigned>( + Arg.getAttribute(Attribute::Alignment).getValueAsInt()); buildOpDecorate(VRegs[i][0], MIRBuilder, SPIRV::Decoration::Alignment, - {static_cast<unsigned>(Arg.getParamAlignment())}); + {Alignment}); } if (Arg.hasAttribute(Attribute::ReadOnly)) { auto Attr = @@ -107,6 +170,38 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, buildOpDecorate(VRegs[i][0], MIRBuilder, SPIRV::Decoration::FuncParamAttr, {Attr}); } + if (Arg.hasAttribute(Attribute::NoAlias)) { + auto Attr = + static_cast<unsigned>(SPIRV::FunctionParameterAttribute::NoAlias); + buildOpDecorate(VRegs[i][0], MIRBuilder, + SPIRV::Decoration::FuncParamAttr, {Attr}); + } + Node = F.getMetadata("kernel_arg_type_qual"); + if (Node && i < Node->getNumOperands()) { + StringRef TypeQual = cast<MDString>(Node->getOperand(i))->getString(); + if (TypeQual.compare("volatile") == 0) + buildOpDecorate(VRegs[i][0], MIRBuilder, SPIRV::Decoration::Volatile, + {}); + } + Node = F.getMetadata("spirv.ParameterDecorations"); + if (Node && i < Node->getNumOperands() && + isa<MDNode>(Node->getOperand(i))) { + MDNode *MD = cast<MDNode>(Node->getOperand(i)); + for (const MDOperand &MDOp : MD->operands()) { + MDNode *MD2 = dyn_cast<MDNode>(MDOp); + assert(MD2 && "Metadata operand is expected"); + ConstantInt *Const = getConstInt(MD2, 0); + assert(Const && "MDOperand should be ConstantInt"); + auto Dec = static_cast<SPIRV::Decoration>(Const->getZExtValue()); + std::vector<uint32_t> DecVec; + for (unsigned j = 1; j < MD2->getNumOperands(); j++) { + ConstantInt *Const = getConstInt(MD2, j); + assert(Const && "MDOperand should be ConstantInt"); + DecVec.push_back(static_cast<uint32_t>(Const->getZExtValue())); + } + buildOpDecorate(VRegs[i][0], MIRBuilder, Dec, DecVec); + } + } ++i; } } @@ -117,30 +212,30 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, MRI->setRegClass(FuncVReg, &SPIRV::IDRegClass); if (F.isDeclaration()) GR->add(&F, &MIRBuilder.getMF(), FuncVReg); - - auto *FTy = F.getFunctionType(); - auto FuncTy = GR->assignTypeToVReg(FTy, FuncVReg, MIRBuilder); + SPIRVType *RetTy = GR->getOrCreateSPIRVType(FTy->getReturnType(), MIRBuilder); + SPIRVType *FuncTy = GR->getOrCreateOpTypeFunctionWithArgs( + FTy, RetTy, ArgTypeVRegs, MIRBuilder); // Build the OpTypeFunction declaring it. 
- Register ReturnTypeID = FuncTy->getOperand(1).getReg(); uint32_t FuncControl = getFunctionControl(F); MIRBuilder.buildInstr(SPIRV::OpFunction) .addDef(FuncVReg) - .addUse(ReturnTypeID) + .addUse(GR->getSPIRVTypeID(RetTy)) .addImm(FuncControl) .addUse(GR->getSPIRVTypeID(FuncTy)); // Add OpFunctionParameters. - const unsigned NumArgs = ArgTypeVRegs.size(); - for (unsigned i = 0; i < NumArgs; ++i) { + int i = 0; + for (const auto &Arg : F.args()) { assert(VRegs[i].size() == 1 && "Formal arg has multiple vregs"); MRI->setRegClass(VRegs[i][0], &SPIRV::IDRegClass); MIRBuilder.buildInstr(SPIRV::OpFunctionParameter) .addDef(VRegs[i][0]) - .addUse(ArgTypeVRegs[i]); + .addUse(GR->getSPIRVTypeID(ArgTypeVRegs[i])); if (F.isDeclaration()) - GR->add(F.getArg(i), &MIRBuilder.getMF(), VRegs[i][0]); + GR->add(&Arg, &MIRBuilder.getMF(), VRegs[i][0]); + i++; } // Name the function. if (F.hasName()) @@ -169,48 +264,51 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // TODO: handle the case of multiple registers. if (Info.OrigRet.Regs.size() > 1) return false; + MachineFunction &MF = MIRBuilder.getMF(); + GR->setCurrentFunc(MF); + FunctionType *FTy = nullptr; + const Function *CF = nullptr; - GR->setCurrentFunc(MIRBuilder.getMF()); - Register ResVReg = - Info.OrigRet.Regs.empty() ? Register(0) : Info.OrigRet.Regs[0]; // Emit a regular OpFunctionCall. If it's an externally declared function, - // be sure to emit its type and function declaration here. It will be - // hoisted globally later. + // be sure to emit its type and function declaration here. It will be hoisted + // globally later. if (Info.Callee.isGlobal()) { - auto *CF = dyn_cast_or_null<const Function>(Info.Callee.getGlobal()); + CF = dyn_cast_or_null<const Function>(Info.Callee.getGlobal()); // TODO: support constexpr casts and indirect calls. if (CF == nullptr) return false; - if (CF->isDeclaration()) { - // Emit the type info and forward function declaration to the first MBB - // to ensure VReg definition dependencies are valid across all MBBs. - MachineBasicBlock::iterator OldII = MIRBuilder.getInsertPt(); - MachineBasicBlock &OldBB = MIRBuilder.getMBB(); - MachineBasicBlock &FirstBB = *MIRBuilder.getMF().getBlockNumbered(0); - MIRBuilder.setInsertPt(FirstBB, FirstBB.instr_end()); - - SmallVector<ArrayRef<Register>, 8> VRegArgs; - SmallVector<SmallVector<Register, 1>, 8> ToInsert; - for (const Argument &Arg : CF->args()) { - if (MIRBuilder.getDataLayout().getTypeStoreSize(Arg.getType()).isZero()) - continue; // Don't handle zero sized types. - ToInsert.push_back({MIRBuilder.getMRI()->createGenericVirtualRegister( - LLT::scalar(32))}); - VRegArgs.push_back(ToInsert.back()); - } - // TODO: Reuse FunctionLoweringInfo. - FunctionLoweringInfo FuncInfo; - lowerFormalArguments(MIRBuilder, *CF, VRegArgs, FuncInfo); - MIRBuilder.setInsertPt(OldBB, OldII); + FTy = getOriginalFunctionType(*CF); + } + + Register ResVReg = + Info.OrigRet.Regs.empty() ? Register(0) : Info.OrigRet.Regs[0]; + if (CF && CF->isDeclaration() && + !GR->find(CF, &MIRBuilder.getMF()).isValid()) { + // Emit the type info and forward function declaration to the first MBB + // to ensure VReg definition dependencies are valid across all MBBs. 
+ MachineIRBuilder FirstBlockBuilder; + FirstBlockBuilder.setMF(MF); + FirstBlockBuilder.setMBB(*MF.getBlockNumbered(0)); + + SmallVector<ArrayRef<Register>, 8> VRegArgs; + SmallVector<SmallVector<Register, 1>, 8> ToInsert; + for (const Argument &Arg : CF->args()) { + if (MIRBuilder.getDataLayout().getTypeStoreSize(Arg.getType()).isZero()) + continue; // Don't handle zero sized types. + ToInsert.push_back( + {MIRBuilder.getMRI()->createGenericVirtualRegister(LLT::scalar(32))}); + VRegArgs.push_back(ToInsert.back()); } + // TODO: Reuse FunctionLoweringInfo + FunctionLoweringInfo FuncInfo; + lowerFormalArguments(FirstBlockBuilder, *CF, VRegArgs, FuncInfo); } // Make sure there's a valid return reg, even for functions returning void. - if (!ResVReg.isValid()) { + if (!ResVReg.isValid()) ResVReg = MIRBuilder.getMRI()->createVirtualRegister(&SPIRV::IDRegClass); - } SPIRVType *RetType = - GR->assignTypeToVReg(Info.OrigRet.Ty, ResVReg, MIRBuilder); + GR->assignTypeToVReg(FTy->getReturnType(), ResVReg, MIRBuilder); // Emit the OpFunctionCall and its args. auto MIB = MIRBuilder.buildInstr(SPIRV::OpFunctionCall) @@ -224,6 +322,7 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, return false; MIB.addUse(Arg.Regs[0]); } - return MIB.constrainAllUses(MIRBuilder.getTII(), *ST.getRegisterInfo(), - *ST.getRegBankInfo()); + const auto &STI = MF.getSubtarget(); + return MIB.constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(), + *STI.getRegBankInfo()); } diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.h b/llvm/lib/Target/SPIRV/SPIRVCallLowering.h index c179bb35154b..c2d6ad82d507 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.h +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.h @@ -13,23 +13,21 @@ #ifndef LLVM_LIB_TARGET_SPIRV_SPIRVCALLLOWERING_H #define LLVM_LIB_TARGET_SPIRV_SPIRVCALLLOWERING_H +#include "SPIRVGlobalRegistry.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" namespace llvm { class SPIRVGlobalRegistry; -class SPIRVSubtarget; class SPIRVTargetLowering; class SPIRVCallLowering : public CallLowering { private: - const SPIRVSubtarget &ST; // Used to create and assign function, argument, and return type information. SPIRVGlobalRegistry *GR; public: - SPIRVCallLowering(const SPIRVTargetLowering &TLI, const SPIRVSubtarget &ST, - SPIRVGlobalRegistry *GR); + SPIRVCallLowering(const SPIRVTargetLowering &TLI, SPIRVGlobalRegistry *GR); // Built OpReturn or OpReturnValue. bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val, diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp index 57cd4bafd351..1926977ea66e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp @@ -92,4 +92,4 @@ void SPIRVGeneralDuplicatesTracker::buildDepsGraph( } } } -}
\ No newline at end of file +} diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h index 58ae1f86ce42..ab22c3d2a647 100644 --- a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h +++ b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h @@ -169,6 +169,8 @@ public: Register find(const Argument *Arg, const MachineFunction *MF) { return AT.find(const_cast<Argument *>(Arg), MF); } + + const SPIRVDuplicatesTracker<Type> *getTypes() { return &TT; } }; } // namespace llvm -#endif
\ No newline at end of file +#endif // LLVM_LIB_TARGET_SPIRV_SPIRVDUPLICATESTRACKER_H diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 9624482e3622..0075f547b6d6 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -87,6 +87,7 @@ public: Instruction *visitLoadInst(LoadInst &I); Instruction *visitStoreInst(StoreInst &I); Instruction *visitAllocaInst(AllocaInst &I); + Instruction *visitAtomicCmpXchgInst(AtomicCmpXchgInst &I); bool runOnFunction(Function &F) override; }; } // namespace @@ -103,7 +104,7 @@ static inline bool isAssignTypeInstr(const Instruction *I) { static bool isMemInstrToReplace(Instruction *I) { return isa<StoreInst>(I) || isa<LoadInst>(I) || isa<InsertValueInst>(I) || - isa<ExtractValueInst>(I); + isa<ExtractValueInst>(I) || isa<AtomicCmpXchgInst>(I); } static bool isAggrToReplace(const Value *V) { @@ -134,13 +135,14 @@ void SPIRVEmitIntrinsics::replaceMemInstrUses(Instruction *Old, Instruction *New) { while (!Old->user_empty()) { auto *U = Old->user_back(); - if (isMemInstrToReplace(U) || isa<ReturnInst>(U)) { - U->replaceUsesOfWith(Old, New); - } else if (isAssignTypeInstr(U)) { + if (isAssignTypeInstr(U)) { IRB->SetInsertPoint(U); SmallVector<Value *, 2> Args = {New, U->getOperand(1)}; IRB->CreateIntrinsic(Intrinsic::spv_assign_type, {New->getType()}, Args); U->eraseFromParent(); + } else if (isMemInstrToReplace(U) || isa<ReturnInst>(U) || + isa<CallInst>(U)) { + U->replaceUsesOfWith(Old, New); } else { llvm_unreachable("illegal aggregate intrinsic user"); } @@ -301,10 +303,10 @@ Instruction *SPIRVEmitIntrinsics::visitStoreInst(StoreInst &I) { MachineMemOperand::Flags Flags = TLI->getStoreMemOperandFlags(I, F->getParent()->getDataLayout()); auto *PtrOp = I.getPointerOperand(); - auto *NewI = - IRB->CreateIntrinsic(Intrinsic::spv_store, {PtrOp->getType()}, - {I.getValueOperand(), PtrOp, IRB->getInt16(Flags), - IRB->getInt8(I.getAlign().value())}); + auto *NewI = IRB->CreateIntrinsic( + Intrinsic::spv_store, {I.getValueOperand()->getType(), PtrOp->getType()}, + {I.getValueOperand(), PtrOp, IRB->getInt16(Flags), + IRB->getInt8(I.getAlign().value())}); I.eraseFromParent(); return NewI; } @@ -314,6 +316,22 @@ Instruction *SPIRVEmitIntrinsics::visitAllocaInst(AllocaInst &I) { return &I; } +Instruction *SPIRVEmitIntrinsics::visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) { + assert(I.getType()->isAggregateType() && "Aggregate result is expected"); + SmallVector<Value *> Args; + for (auto &Op : I.operands()) + Args.push_back(Op); + Args.push_back(IRB->getInt32(I.getSyncScopeID())); + Args.push_back(IRB->getInt32( + static_cast<uint32_t>(getMemSemantics(I.getSuccessOrdering())))); + Args.push_back(IRB->getInt32( + static_cast<uint32_t>(getMemSemantics(I.getFailureOrdering())))); + auto *NewI = IRB->CreateIntrinsic(Intrinsic::spv_cmpxchg, + {I.getPointerOperand()->getType()}, {Args}); + replaceMemInstrUses(&I, NewI); + return NewI; +} + void SPIRVEmitIntrinsics::processGlobalValue(GlobalVariable &GV) { // Skip special artifical variable llvm.global.annotations. if (GV.getName() == "llvm.global.annotations") @@ -351,14 +369,13 @@ void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I) { // Check GetElementPtrConstantExpr case. 
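Aside on visitAtomicCmpXchgInst above: it rewrites a cmpxchg into the spv_cmpxchg intrinsic, passing the original pointer, comparand and new-value operands followed by three extra i32 arguments, the sync-scope id and the success/failure orderings translated to SPIR-V memory-semantics bits. A small standalone sketch of that ordering-to-semantics mapping, using the bit values from the SPIR-V specification (names here are illustrative, not quoted from the patch):

    #include <cassert>
    #include <cstdint>

    // SPIR-V MemorySemantics ordering bits (SPIR-V spec values); the
    // getMemSemantics utility used above maps LLVM atomic orderings onto these.
    enum MemorySemantics : uint32_t {
      None = 0x0,
      Acquire = 0x2,
      Release = 0x4,
      AcquireRelease = 0x8,
      SequentiallyConsistent = 0x10,
    };

    enum class Ordering { Monotonic, Acquire, Release, AcqRel, SeqCst };

    // Simplified mapping, for illustration only.
    static uint32_t toSemantics(Ordering O) {
      switch (O) {
      case Ordering::Acquire: return Acquire;
      case Ordering::Release: return Release;
      case Ordering::AcqRel:  return AcquireRelease;
      case Ordering::SeqCst:  return SequentiallyConsistent;
      default:                return None;
      }
    }

    int main() {
      // A `cmpxchg ... seq_cst seq_cst` gets 0x10 for both the success and
      // failure memory-semantics operands appended by visitAtomicCmpXchgInst.
      assert(toSemantics(Ordering::SeqCst) == 0x10u);
      return 0;
    }
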
(isa<ConstantExpr>(Op) && isa<GEPOperator>(Op))) { IRB->SetInsertPoint(I); - buildIntrWithMD(Intrinsic::spv_assign_type, {Op->getType()}, Op, Op); + if (isa<UndefValue>(Op) && Op->getType()->isAggregateType()) + buildIntrWithMD(Intrinsic::spv_assign_type, {IRB->getInt32Ty()}, Op, + UndefValue::get(IRB->getInt32Ty())); + else + buildIntrWithMD(Intrinsic::spv_assign_type, {Op->getType()}, Op, Op); } } - // StoreInst's operand type can be changed in the next stage so we need to - // store it in the set. - if (isa<StoreInst>(I) && - cast<StoreInst>(I)->getValueOperand()->getType()->isAggregateType()) - AggrStores.insert(I); } void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I) { @@ -378,7 +395,7 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I) { if ((isa<ConstantAggregateZero>(Op) && Op->getType()->isVectorTy()) || isa<PHINode>(I) || isa<SwitchInst>(I)) TrackConstants = false; - if (isa<ConstantData>(Op) && TrackConstants) { + if ((isa<ConstantData>(Op) || isa<ConstantExpr>(Op)) && TrackConstants) { unsigned OpNo = Op.getOperandNo(); if (II && ((II->getIntrinsicID() == Intrinsic::spv_gep && OpNo == 0) || (II->paramHasAttr(OpNo, Attribute::ImmArg)))) @@ -405,8 +422,20 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { AggrConsts.clear(); AggrStores.clear(); - IRB->SetInsertPoint(&Func.getEntryBlock().front()); + // StoreInst's operand type can be changed during the next transformations, + // so we need to store it in the set. Also store already transformed types. + for (auto &I : instructions(Func)) { + StoreInst *SI = dyn_cast<StoreInst>(&I); + if (!SI) + continue; + Type *ElTy = SI->getValueOperand()->getType(); + PointerType *PTy = cast<PointerType>(SI->getOperand(1)->getType()); + if (ElTy->isAggregateType() || ElTy->isVectorTy() || + !PTy->isOpaqueOrPointeeTypeMatches(ElTy)) + AggrStores.insert(&I); + } + IRB->SetInsertPoint(&Func.getEntryBlock().front()); for (auto &GV : Func.getParent()->globals()) processGlobalValue(GV); diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 5f890c003cbc..5c8fa7adfbdf 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -24,6 +24,24 @@ using namespace llvm; SPIRVGlobalRegistry::SPIRVGlobalRegistry(unsigned PointerSize) : PointerSize(PointerSize) {} +SPIRVType *SPIRVGlobalRegistry::assignIntTypeToVReg(unsigned BitWidth, + Register VReg, + MachineInstr &I, + const SPIRVInstrInfo &TII) { + SPIRVType *SpirvType = getOrCreateSPIRVIntegerType(BitWidth, I, TII); + assignSPIRVTypeToVReg(SpirvType, VReg, *CurMF); + return SpirvType; +} + +SPIRVType *SPIRVGlobalRegistry::assignVectTypeToVReg( + SPIRVType *BaseType, unsigned NumElements, Register VReg, MachineInstr &I, + const SPIRVInstrInfo &TII) { + SPIRVType *SpirvType = + getOrCreateSPIRVVectorType(BaseType, NumElements, I, TII); + assignSPIRVTypeToVReg(SpirvType, VReg, *CurMF); + return SpirvType; +} + SPIRVType *SPIRVGlobalRegistry::assignTypeToVReg( const Type *Type, Register VReg, MachineIRBuilder &MIRBuilder, SPIRV::AccessQualifier AccessQual, bool EmitIR) { @@ -96,6 +114,65 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeVector(uint32_t NumElems, return MIB; } +std::tuple<Register, ConstantInt *, bool> +SPIRVGlobalRegistry::getOrCreateConstIntReg(uint64_t Val, SPIRVType *SpvType, + MachineIRBuilder *MIRBuilder, + MachineInstr *I, + const SPIRVInstrInfo *TII) { + const IntegerType *LLVMIntTy; + if (SpvType) + LLVMIntTy = 
cast<IntegerType>(getTypeForSPIRVType(SpvType)); + else + LLVMIntTy = IntegerType::getInt32Ty(CurMF->getFunction().getContext()); + bool NewInstr = false; + // Find a constant in DT or build a new one. + ConstantInt *CI = ConstantInt::get(const_cast<IntegerType *>(LLVMIntTy), Val); + Register Res = DT.find(CI, CurMF); + if (!Res.isValid()) { + unsigned BitWidth = SpvType ? getScalarOrVectorBitWidth(SpvType) : 32; + LLT LLTy = LLT::scalar(32); + Res = CurMF->getRegInfo().createGenericVirtualRegister(LLTy); + if (MIRBuilder) + assignTypeToVReg(LLVMIntTy, Res, *MIRBuilder); + else + assignIntTypeToVReg(BitWidth, Res, *I, *TII); + DT.add(CI, CurMF, Res); + NewInstr = true; + } + return std::make_tuple(Res, CI, NewInstr); +} + +Register SPIRVGlobalRegistry::getOrCreateConstInt(uint64_t Val, MachineInstr &I, + SPIRVType *SpvType, + const SPIRVInstrInfo &TII) { + assert(SpvType); + ConstantInt *CI; + Register Res; + bool New; + std::tie(Res, CI, New) = + getOrCreateConstIntReg(Val, SpvType, nullptr, &I, &TII); + // If we have found Res register which is defined by the passed G_CONSTANT + // machine instruction, a new constant instruction should be created. + if (!New && (!I.getOperand(0).isReg() || Res != I.getOperand(0).getReg())) + return Res; + MachineInstrBuilder MIB; + MachineBasicBlock &BB = *I.getParent(); + if (Val) { + MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantI)) + .addDef(Res) + .addUse(getSPIRVTypeID(SpvType)); + addNumImm(APInt(getScalarOrVectorBitWidth(SpvType), Val), MIB); + } else { + MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull)) + .addDef(Res) + .addUse(getSPIRVTypeID(SpvType)); + } + const auto &ST = CurMF->getSubtarget(); + constrainSelectedInstRegOperands(*MIB, *ST.getInstrInfo(), + *ST.getRegisterInfo(), *ST.getRegBankInfo()); + return Res; +} + Register SPIRVGlobalRegistry::buildConstantInt(uint64_t Val, MachineIRBuilder &MIRBuilder, SPIRVType *SpvType, @@ -112,14 +189,32 @@ Register SPIRVGlobalRegistry::buildConstantInt(uint64_t Val, Register Res = DT.find(ConstInt, &MF); if (!Res.isValid()) { unsigned BitWidth = SpvType ? getScalarOrVectorBitWidth(SpvType) : 32; - Res = MF.getRegInfo().createGenericVirtualRegister(LLT::scalar(BitWidth)); - assignTypeToVReg(LLVMIntTy, Res, MIRBuilder); - if (EmitIR) + LLT LLTy = LLT::scalar(EmitIR ? BitWidth : 32); + Res = MF.getRegInfo().createGenericVirtualRegister(LLTy); + assignTypeToVReg(LLVMIntTy, Res, MIRBuilder, + SPIRV::AccessQualifier::ReadWrite, EmitIR); + DT.add(ConstInt, &MIRBuilder.getMF(), Res); + if (EmitIR) { MIRBuilder.buildConstant(Res, *ConstInt); - else - MIRBuilder.buildInstr(SPIRV::OpConstantI) - .addDef(Res) - .addImm(ConstInt->getSExtValue()); + } else { + MachineInstrBuilder MIB; + if (Val) { + assert(SpvType); + MIB = MIRBuilder.buildInstr(SPIRV::OpConstantI) + .addDef(Res) + .addUse(getSPIRVTypeID(SpvType)); + addNumImm(APInt(BitWidth, Val), MIB); + } else { + assert(SpvType); + MIB = MIRBuilder.buildInstr(SPIRV::OpConstantNull) + .addDef(Res) + .addUse(getSPIRVTypeID(SpvType)); + } + const auto &Subtarget = CurMF->getSubtarget(); + constrainSelectedInstRegOperands(*MIB, *Subtarget.getInstrInfo(), + *Subtarget.getRegisterInfo(), + *Subtarget.getRegBankInfo()); + } } return Res; } @@ -142,11 +237,63 @@ Register SPIRVGlobalRegistry::buildConstantFP(APFloat Val, unsigned BitWidth = SpvType ? 
getScalarOrVectorBitWidth(SpvType) : 32; Res = MF.getRegInfo().createGenericVirtualRegister(LLT::scalar(BitWidth)); assignTypeToVReg(LLVMFPTy, Res, MIRBuilder); + DT.add(ConstFP, &MF, Res); MIRBuilder.buildFConstant(Res, *ConstFP); } return Res; } +Register +SPIRVGlobalRegistry::getOrCreateConsIntVector(uint64_t Val, MachineInstr &I, + SPIRVType *SpvType, + const SPIRVInstrInfo &TII) { + const Type *LLVMTy = getTypeForSPIRVType(SpvType); + assert(LLVMTy->isVectorTy()); + const FixedVectorType *LLVMVecTy = cast<FixedVectorType>(LLVMTy); + Type *LLVMBaseTy = LLVMVecTy->getElementType(); + // Find a constant vector in DT or build a new one. + const auto ConstInt = ConstantInt::get(LLVMBaseTy, Val); + auto ConstVec = + ConstantVector::getSplat(LLVMVecTy->getElementCount(), ConstInt); + Register Res = DT.find(ConstVec, CurMF); + if (!Res.isValid()) { + unsigned BitWidth = getScalarOrVectorBitWidth(SpvType); + SPIRVType *SpvBaseType = getOrCreateSPIRVIntegerType(BitWidth, I, TII); + // SpvScalConst should be created before SpvVecConst to avoid undefined ID + // error on validation. + // TODO: can moved below once sorting of types/consts/defs is implemented. + Register SpvScalConst; + if (Val) + SpvScalConst = getOrCreateConstInt(Val, I, SpvBaseType, TII); + // TODO: maybe use bitwidth of base type. + LLT LLTy = LLT::scalar(32); + Register SpvVecConst = + CurMF->getRegInfo().createGenericVirtualRegister(LLTy); + const unsigned ElemCnt = SpvType->getOperand(2).getImm(); + assignVectTypeToVReg(SpvBaseType, ElemCnt, SpvVecConst, I, TII); + DT.add(ConstVec, CurMF, SpvVecConst); + MachineInstrBuilder MIB; + MachineBasicBlock &BB = *I.getParent(); + if (Val) { + MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantComposite)) + .addDef(SpvVecConst) + .addUse(getSPIRVTypeID(SpvType)); + for (unsigned i = 0; i < ElemCnt; ++i) + MIB.addUse(SpvScalConst); + } else { + MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull)) + .addDef(SpvVecConst) + .addUse(getSPIRVTypeID(SpvType)); + } + const auto &Subtarget = CurMF->getSubtarget(); + constrainSelectedInstRegOperands(*MIB, *Subtarget.getInstrInfo(), + *Subtarget.getRegisterInfo(), + *Subtarget.getRegBankInfo()); + return SpvVecConst; + } + return Res; +} + Register SPIRVGlobalRegistry::buildGlobalVariable( Register ResVReg, SPIRVType *BaseType, StringRef Name, const GlobalValue *GV, SPIRV::StorageClass Storage, @@ -169,7 +316,13 @@ Register SPIRVGlobalRegistry::buildGlobalVariable( } GV = GVar; } - Register Reg; + Register Reg = DT.find(GVar, &MIRBuilder.getMF()); + if (Reg.isValid()) { + if (Reg != ResVReg) + MIRBuilder.buildCopy(ResVReg, Reg); + return ResVReg; + } + auto MIB = MIRBuilder.buildInstr(SPIRV::OpVariable) .addDef(ResVReg) .addUse(getSPIRVTypeID(BaseType)) @@ -234,14 +387,76 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeArray(uint32_t NumElems, return MIB; } +SPIRVType *SPIRVGlobalRegistry::getOpTypeOpaque(const StructType *Ty, + MachineIRBuilder &MIRBuilder) { + assert(Ty->hasName()); + const StringRef Name = Ty->hasName() ? 
Ty->getName() : ""; + Register ResVReg = createTypeVReg(MIRBuilder); + auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeOpaque).addDef(ResVReg); + addStringImm(Name, MIB); + buildOpName(ResVReg, Name, MIRBuilder); + return MIB; +} + +SPIRVType *SPIRVGlobalRegistry::getOpTypeStruct(const StructType *Ty, + MachineIRBuilder &MIRBuilder, + bool EmitIR) { + SmallVector<Register, 4> FieldTypes; + for (const auto &Elem : Ty->elements()) { + SPIRVType *ElemTy = findSPIRVType(Elem, MIRBuilder); + assert(ElemTy && ElemTy->getOpcode() != SPIRV::OpTypeVoid && + "Invalid struct element type"); + FieldTypes.push_back(getSPIRVTypeID(ElemTy)); + } + Register ResVReg = createTypeVReg(MIRBuilder); + auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeStruct).addDef(ResVReg); + for (const auto &Ty : FieldTypes) + MIB.addUse(Ty); + if (Ty->hasName()) + buildOpName(ResVReg, Ty->getName(), MIRBuilder); + if (Ty->isPacked()) + buildOpDecorate(ResVReg, MIRBuilder, SPIRV::Decoration::CPacked, {}); + return MIB; +} + +static bool isOpenCLBuiltinType(const StructType *SType) { + return SType->isOpaque() && SType->hasName() && + SType->getName().startswith("opencl."); +} + +static bool isSPIRVBuiltinType(const StructType *SType) { + return SType->isOpaque() && SType->hasName() && + SType->getName().startswith("spirv."); +} + +static bool isSpecialType(const Type *Ty) { + if (auto PType = dyn_cast<PointerType>(Ty)) { + if (!PType->isOpaque()) + Ty = PType->getNonOpaquePointerElementType(); + } + if (auto SType = dyn_cast<StructType>(Ty)) + return isOpenCLBuiltinType(SType) || isSPIRVBuiltinType(SType); + return false; +} + SPIRVType *SPIRVGlobalRegistry::getOpTypePointer(SPIRV::StorageClass SC, SPIRVType *ElemType, - MachineIRBuilder &MIRBuilder) { - auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypePointer) - .addDef(createTypeVReg(MIRBuilder)) - .addImm(static_cast<uint32_t>(SC)) - .addUse(getSPIRVTypeID(ElemType)); - return MIB; + MachineIRBuilder &MIRBuilder, + Register Reg) { + if (!Reg.isValid()) + Reg = createTypeVReg(MIRBuilder); + return MIRBuilder.buildInstr(SPIRV::OpTypePointer) + .addDef(Reg) + .addImm(static_cast<uint32_t>(SC)) + .addUse(getSPIRVTypeID(ElemType)); +} + +SPIRVType * +SPIRVGlobalRegistry::getOpTypeForwardPointer(SPIRV::StorageClass SC, + MachineIRBuilder &MIRBuilder) { + return MIRBuilder.buildInstr(SPIRV::OpTypeForwardPointer) + .addUse(createTypeVReg(MIRBuilder)) + .addImm(static_cast<uint32_t>(SC)); } SPIRVType *SPIRVGlobalRegistry::getOpTypeFunction( @@ -255,10 +470,49 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeFunction( return MIB; } +SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeFunctionWithArgs( + const Type *Ty, SPIRVType *RetType, + const SmallVectorImpl<SPIRVType *> &ArgTypes, + MachineIRBuilder &MIRBuilder) { + Register Reg = DT.find(Ty, &MIRBuilder.getMF()); + if (Reg.isValid()) + return getSPIRVTypeForVReg(Reg); + SPIRVType *SpirvType = getOpTypeFunction(RetType, ArgTypes, MIRBuilder); + return finishCreatingSPIRVType(Ty, SpirvType); +} + +SPIRVType *SPIRVGlobalRegistry::findSPIRVType(const Type *Ty, + MachineIRBuilder &MIRBuilder, + SPIRV::AccessQualifier AccQual, + bool EmitIR) { + Register Reg = DT.find(Ty, &MIRBuilder.getMF()); + if (Reg.isValid()) + return getSPIRVTypeForVReg(Reg); + if (ForwardPointerTypes.find(Ty) != ForwardPointerTypes.end()) + return ForwardPointerTypes[Ty]; + return restOfCreateSPIRVType(Ty, MIRBuilder, AccQual, EmitIR); +} + +Register SPIRVGlobalRegistry::getSPIRVTypeID(const SPIRVType *SpirvType) const { + assert(SpirvType && "Attempting to get type id for 
nullptr type."); + if (SpirvType->getOpcode() == SPIRV::OpTypeForwardPointer) + return SpirvType->uses().begin()->getReg(); + return SpirvType->defs().begin()->getReg(); +} + SPIRVType *SPIRVGlobalRegistry::createSPIRVType(const Type *Ty, MachineIRBuilder &MIRBuilder, SPIRV::AccessQualifier AccQual, bool EmitIR) { + assert(!isSpecialType(Ty)); + auto &TypeToSPIRVTypeMap = DT.getTypes()->getAllUses(); + auto t = TypeToSPIRVTypeMap.find(Ty); + if (t != TypeToSPIRVTypeMap.end()) { + auto tt = t->second.find(&MIRBuilder.getMF()); + if (tt != t->second.end()) + return getSPIRVTypeForVReg(tt->second); + } + if (auto IType = dyn_cast<IntegerType>(Ty)) { const unsigned Width = IType->getBitWidth(); return Width == 1 ? getOpTypeBool(MIRBuilder) @@ -269,21 +523,25 @@ SPIRVType *SPIRVGlobalRegistry::createSPIRVType(const Type *Ty, if (Ty->isVoidTy()) return getOpTypeVoid(MIRBuilder); if (Ty->isVectorTy()) { - auto El = getOrCreateSPIRVType(cast<FixedVectorType>(Ty)->getElementType(), - MIRBuilder); + SPIRVType *El = + findSPIRVType(cast<FixedVectorType>(Ty)->getElementType(), MIRBuilder); return getOpTypeVector(cast<FixedVectorType>(Ty)->getNumElements(), El, MIRBuilder); } if (Ty->isArrayTy()) { - auto *El = getOrCreateSPIRVType(Ty->getArrayElementType(), MIRBuilder); + SPIRVType *El = findSPIRVType(Ty->getArrayElementType(), MIRBuilder); return getOpTypeArray(Ty->getArrayNumElements(), El, MIRBuilder, EmitIR); } - assert(!isa<StructType>(Ty) && "Unsupported StructType"); + if (auto SType = dyn_cast<StructType>(Ty)) { + if (SType->isOpaque()) + return getOpTypeOpaque(SType, MIRBuilder); + return getOpTypeStruct(SType, MIRBuilder, EmitIR); + } if (auto FType = dyn_cast<FunctionType>(Ty)) { - SPIRVType *RetTy = getOrCreateSPIRVType(FType->getReturnType(), MIRBuilder); + SPIRVType *RetTy = findSPIRVType(FType->getReturnType(), MIRBuilder); SmallVector<SPIRVType *, 4> ParamTypes; for (const auto &t : FType->params()) { - ParamTypes.push_back(getOrCreateSPIRVType(t, MIRBuilder)); + ParamTypes.push_back(findSPIRVType(t, MIRBuilder)); } return getOpTypeFunction(RetTy, ParamTypes, MIRBuilder); } @@ -292,24 +550,51 @@ SPIRVType *SPIRVGlobalRegistry::createSPIRVType(const Type *Ty, // At the moment, all opaque pointers correspond to i8 element type. // TODO: change the implementation once opaque pointers are supported // in the SPIR-V specification. - if (PType->isOpaque()) { + if (PType->isOpaque()) SpvElementType = getOrCreateSPIRVIntegerType(8, MIRBuilder); - } else { - Type *ElemType = PType->getNonOpaquePointerElementType(); - // TODO: support OpenCL and SPIRV builtins like image2d_t that are passed - // as pointers, but should be treated as custom types like OpTypeImage. - assert(!isa<StructType>(ElemType) && "Unsupported StructType pointer"); - - // Otherwise, treat it as a regular pointer type. - SpvElementType = getOrCreateSPIRVType( - ElemType, MIRBuilder, SPIRV::AccessQualifier::ReadWrite, EmitIR); - } + else + SpvElementType = + findSPIRVType(PType->getNonOpaquePointerElementType(), MIRBuilder, + SPIRV::AccessQualifier::ReadWrite, EmitIR); auto SC = addressSpaceToStorageClass(PType->getAddressSpace()); - return getOpTypePointer(SC, SpvElementType, MIRBuilder); + // Null pointer means we have a loop in type definitions, make and + // return corresponding OpTypeForwardPointer. 
+ if (SpvElementType == nullptr) { + if (ForwardPointerTypes.find(Ty) == ForwardPointerTypes.end()) + ForwardPointerTypes[PType] = getOpTypeForwardPointer(SC, MIRBuilder); + return ForwardPointerTypes[PType]; + } + Register Reg(0); + // If we have forward pointer associated with this type, use its register + // operand to create OpTypePointer. + if (ForwardPointerTypes.find(PType) != ForwardPointerTypes.end()) + Reg = getSPIRVTypeID(ForwardPointerTypes[PType]); + + return getOpTypePointer(SC, SpvElementType, MIRBuilder, Reg); } llvm_unreachable("Unable to convert LLVM type to SPIRVType"); } +SPIRVType *SPIRVGlobalRegistry::restOfCreateSPIRVType( + const Type *Ty, MachineIRBuilder &MIRBuilder, + SPIRV::AccessQualifier AccessQual, bool EmitIR) { + if (TypesInProcessing.count(Ty) && !Ty->isPointerTy()) + return nullptr; + TypesInProcessing.insert(Ty); + SPIRVType *SpirvType = createSPIRVType(Ty, MIRBuilder, AccessQual, EmitIR); + TypesInProcessing.erase(Ty); + VRegToTypeMap[&MIRBuilder.getMF()][getSPIRVTypeID(SpirvType)] = SpirvType; + SPIRVToLLVMType[SpirvType] = Ty; + Register Reg = DT.find(Ty, &MIRBuilder.getMF()); + // Do not add OpTypeForwardPointer to DT, a corresponding normal pointer type + // will be added later. For special types it is already added to DT. + if (SpirvType->getOpcode() != SPIRV::OpTypeForwardPointer && !Reg.isValid() && + !isSpecialType(Ty)) + DT.add(Ty, &MIRBuilder.getMF(), getSPIRVTypeID(SpirvType)); + + return SpirvType; +} + SPIRVType *SPIRVGlobalRegistry::getSPIRVTypeForVReg(Register VReg) const { auto t = VRegToTypeMap.find(CurMF); if (t != VRegToTypeMap.end()) { @@ -321,13 +606,26 @@ SPIRVType *SPIRVGlobalRegistry::getSPIRVTypeForVReg(Register VReg) const { } SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVType( - const Type *Type, MachineIRBuilder &MIRBuilder, + const Type *Ty, MachineIRBuilder &MIRBuilder, SPIRV::AccessQualifier AccessQual, bool EmitIR) { - Register Reg = DT.find(Type, &MIRBuilder.getMF()); + Register Reg = DT.find(Ty, &MIRBuilder.getMF()); if (Reg.isValid()) return getSPIRVTypeForVReg(Reg); - SPIRVType *SpirvType = createSPIRVType(Type, MIRBuilder, AccessQual, EmitIR); - return restOfCreateSPIRVType(Type, SpirvType); + TypesInProcessing.clear(); + SPIRVType *STy = restOfCreateSPIRVType(Ty, MIRBuilder, AccessQual, EmitIR); + // Create normal pointer types for the corresponding OpTypeForwardPointers. 
+ for (auto &CU : ForwardPointerTypes) { + const Type *Ty2 = CU.first; + SPIRVType *STy2 = CU.second; + if ((Reg = DT.find(Ty2, &MIRBuilder.getMF())).isValid()) + STy2 = getSPIRVTypeForVReg(Reg); + else + STy2 = restOfCreateSPIRVType(Ty2, MIRBuilder, AccessQual, EmitIR); + if (Ty == Ty2) + STy = STy2; + } + ForwardPointerTypes.clear(); + return STy; } bool SPIRVGlobalRegistry::isScalarOfType(Register VReg, @@ -393,8 +691,8 @@ SPIRVGlobalRegistry::getOrCreateSPIRVIntegerType(unsigned BitWidth, MIRBuilder); } -SPIRVType *SPIRVGlobalRegistry::restOfCreateSPIRVType(const Type *LLVMTy, - SPIRVType *SpirvType) { +SPIRVType *SPIRVGlobalRegistry::finishCreatingSPIRVType(const Type *LLVMTy, + SPIRVType *SpirvType) { assert(CurMF == SpirvType->getMF()); VRegToTypeMap[CurMF][getSPIRVTypeID(SpirvType)] = SpirvType; SPIRVToLLVMType[SpirvType] = LLVMTy; @@ -413,7 +711,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVIntegerType( .addDef(createTypeVReg(CurMF->getRegInfo())) .addImm(BitWidth) .addImm(0); - return restOfCreateSPIRVType(LLVMTy, MIB); + return finishCreatingSPIRVType(LLVMTy, MIB); } SPIRVType * @@ -423,6 +721,19 @@ SPIRVGlobalRegistry::getOrCreateSPIRVBoolType(MachineIRBuilder &MIRBuilder) { MIRBuilder); } +SPIRVType * +SPIRVGlobalRegistry::getOrCreateSPIRVBoolType(MachineInstr &I, + const SPIRVInstrInfo &TII) { + Type *LLVMTy = IntegerType::get(CurMF->getFunction().getContext(), 1); + Register Reg = DT.find(LLVMTy, CurMF); + if (Reg.isValid()) + return getSPIRVTypeForVReg(Reg); + MachineBasicBlock &BB = *I.getParent(); + auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpTypeBool)) + .addDef(createTypeVReg(CurMF->getRegInfo())); + return finishCreatingSPIRVType(LLVMTy, MIB); +} + SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVVectorType( SPIRVType *BaseType, unsigned NumElements, MachineIRBuilder &MIRBuilder) { return getOrCreateSPIRVType( @@ -436,12 +747,15 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVVectorType( const SPIRVInstrInfo &TII) { Type *LLVMTy = FixedVectorType::get( const_cast<Type *>(getTypeForSPIRVType(BaseType)), NumElements); + Register Reg = DT.find(LLVMTy, CurMF); + if (Reg.isValid()) + return getSPIRVTypeForVReg(Reg); MachineBasicBlock &BB = *I.getParent(); auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpTypeVector)) .addDef(createTypeVReg(CurMF->getRegInfo())) .addUse(getSPIRVTypeID(BaseType)) .addImm(NumElements); - return restOfCreateSPIRVType(LLVMTy, MIB); + return finishCreatingSPIRVType(LLVMTy, MIB); } SPIRVType * @@ -460,10 +774,39 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVPointerType( Type *LLVMTy = PointerType::get(const_cast<Type *>(getTypeForSPIRVType(BaseType)), storageClassToAddressSpace(SC)); + Register Reg = DT.find(LLVMTy, CurMF); + if (Reg.isValid()) + return getSPIRVTypeForVReg(Reg); MachineBasicBlock &BB = *I.getParent(); auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpTypePointer)) .addDef(createTypeVReg(CurMF->getRegInfo())) .addImm(static_cast<uint32_t>(SC)) .addUse(getSPIRVTypeID(BaseType)); - return restOfCreateSPIRVType(LLVMTy, MIB); + return finishCreatingSPIRVType(LLVMTy, MIB); +} + +Register SPIRVGlobalRegistry::getOrCreateUndef(MachineInstr &I, + SPIRVType *SpvType, + const SPIRVInstrInfo &TII) { + assert(SpvType); + const Type *LLVMTy = getTypeForSPIRVType(SpvType); + assert(LLVMTy); + // Find a constant in DT or build a new one. 
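Aside on the new SPIRVGlobalRegistry helpers in this file: getOrCreateConstIntReg, getOrCreateConstInt, getOrCreateConsIntVector, getOrCreateUndef and the DT.find checks added to buildConstantInt/buildConstantFP and the type getters all share one shape. They query the duplicates tracker for an id that was already emitted, and only when the lookup fails do they allocate a fresh register, record it, and build the OpConstant*/OpUndef/OpType* definition, so later queries reuse the same id. A minimal, LLVM-free sketch of that look-up-or-create pattern (hypothetical names, illustration only):

    #include <cassert>
    #include <map>

    // Generic look-up-or-create: return the id already recorded for Key,
    // otherwise allocate a fresh id, record it, then build the definition.
    template <typename Key>
    unsigned getOrCreate(std::map<Key, unsigned> &Table, const Key &K,
                         unsigned &NextId) {
      auto It = Table.find(K);
      if (It != Table.end())
        return It->second;   // reuse; avoids duplicate OpConstant*/OpType* defs
      unsigned Id = NextId++;
      Table.emplace(K, Id);  // record before emitting the definition
      return Id;
    }

    int main() {
      std::map<int, unsigned> Table;
      unsigned NextId = 1;
      unsigned A = getOrCreate(Table, 42, NextId);
      assert(getOrCreate(Table, 42, NextId) == A); // second query reuses the id
      return 0;
    }
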
+ UndefValue *UV = UndefValue::get(const_cast<Type *>(LLVMTy)); + Register Res = DT.find(UV, CurMF); + if (Res.isValid()) + return Res; + LLT LLTy = LLT::scalar(32); + Res = CurMF->getRegInfo().createGenericVirtualRegister(LLTy); + assignSPIRVTypeToVReg(SpvType, Res, *CurMF); + DT.add(UV, CurMF, Res); + + MachineInstrBuilder MIB; + MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpUndef)) + .addDef(Res) + .addUse(getSPIRVTypeID(SpvType)); + const auto &ST = CurMF->getSubtarget(); + constrainSelectedInstRegOperands(*MIB, *ST.getInstrInfo(), + *ST.getRegisterInfo(), *ST.getRegBankInfo()); + return Res; } diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h index 13dcc20a3e0a..59ac2712a02f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h @@ -30,7 +30,7 @@ class SPIRVGlobalRegistry { // Do not confuse this with DuplicatesTracker as DT maps Type* to <MF, Reg> // where Reg = OpType... // while VRegToTypeMap tracks SPIR-V type assigned to other regs (i.e. not - // type-declaring ones) + // type-declaring ones). DenseMap<const MachineFunction *, DenseMap<Register, SPIRVType *>> VRegToTypeMap; @@ -38,6 +38,9 @@ class SPIRVGlobalRegistry { DenseMap<SPIRVType *, const Type *> SPIRVToLLVMType; + SmallPtrSet<const Type *, 4> TypesInProcessing; + DenseMap<const Type *, SPIRVType *> ForwardPointerTypes; + // Number of bits pointers and size_t integers require. const unsigned PointerSize; @@ -46,6 +49,14 @@ class SPIRVGlobalRegistry { createSPIRVType(const Type *Type, MachineIRBuilder &MIRBuilder, SPIRV::AccessQualifier AQ = SPIRV::AccessQualifier::ReadWrite, bool EmitIR = true); + SPIRVType *findSPIRVType( + const Type *Ty, MachineIRBuilder &MIRBuilder, + SPIRV::AccessQualifier accessQual = SPIRV::AccessQualifier::ReadWrite, + bool EmitIR = true); + SPIRVType *restOfCreateSPIRVType(const Type *Type, + MachineIRBuilder &MIRBuilder, + SPIRV::AccessQualifier AccessQual, + bool EmitIR); public: SPIRVGlobalRegistry(unsigned PointerSize); @@ -91,6 +102,11 @@ public: const Type *Type, Register VReg, MachineIRBuilder &MIRBuilder, SPIRV::AccessQualifier AQ = SPIRV::AccessQualifier::ReadWrite, bool EmitIR = true); + SPIRVType *assignIntTypeToVReg(unsigned BitWidth, Register VReg, + MachineInstr &I, const SPIRVInstrInfo &TII); + SPIRVType *assignVectTypeToVReg(SPIRVType *BaseType, unsigned NumElements, + Register VReg, MachineInstr &I, + const SPIRVInstrInfo &TII); // In cases where the SPIR-V type is already known, this function can be // used to map it to the given VReg via an ASSIGN_TYPE instruction. @@ -123,10 +139,7 @@ public: } // Return the VReg holding the result of the given OpTypeXXX instruction. 
- Register getSPIRVTypeID(const SPIRVType *SpirvType) const { - assert(SpirvType && "Attempting to get type id for nullptr type."); - return SpirvType->defs().begin()->getReg(); - } + Register getSPIRVTypeID(const SPIRVType *SpirvType) const; void setCurrentFunc(MachineFunction &MF) { CurMF = &MF; } @@ -167,19 +180,38 @@ private: SPIRVType *getOpTypeArray(uint32_t NumElems, SPIRVType *ElemType, MachineIRBuilder &MIRBuilder, bool EmitIR = true); + SPIRVType *getOpTypeOpaque(const StructType *Ty, + MachineIRBuilder &MIRBuilder); + + SPIRVType *getOpTypeStruct(const StructType *Ty, MachineIRBuilder &MIRBuilder, + bool EmitIR = true); + SPIRVType *getOpTypePointer(SPIRV::StorageClass SC, SPIRVType *ElemType, - MachineIRBuilder &MIRBuilder); + MachineIRBuilder &MIRBuilder, Register Reg); + + SPIRVType *getOpTypeForwardPointer(SPIRV::StorageClass SC, + MachineIRBuilder &MIRBuilder); SPIRVType *getOpTypeFunction(SPIRVType *RetType, const SmallVectorImpl<SPIRVType *> &ArgTypes, MachineIRBuilder &MIRBuilder); - SPIRVType *restOfCreateSPIRVType(const Type *LLVMTy, SPIRVType *SpirvType); + std::tuple<Register, ConstantInt *, bool> getOrCreateConstIntReg( + uint64_t Val, SPIRVType *SpvType, MachineIRBuilder *MIRBuilder, + MachineInstr *I = nullptr, const SPIRVInstrInfo *TII = nullptr); + SPIRVType *finishCreatingSPIRVType(const Type *LLVMTy, SPIRVType *SpirvType); public: Register buildConstantInt(uint64_t Val, MachineIRBuilder &MIRBuilder, SPIRVType *SpvType = nullptr, bool EmitIR = true); + Register getOrCreateConstInt(uint64_t Val, MachineInstr &I, + SPIRVType *SpvType, const SPIRVInstrInfo &TII); Register buildConstantFP(APFloat Val, MachineIRBuilder &MIRBuilder, SPIRVType *SpvType = nullptr); + Register getOrCreateConsIntVector(uint64_t Val, MachineInstr &I, + SPIRVType *SpvType, + const SPIRVInstrInfo &TII); + Register getOrCreateUndef(MachineInstr &I, SPIRVType *SpvType, + const SPIRVInstrInfo &TII); Register buildGlobalVariable(Register Reg, SPIRVType *BaseType, StringRef Name, const GlobalValue *GV, SPIRV::StorageClass Storage, @@ -193,19 +225,24 @@ public: SPIRVType *getOrCreateSPIRVIntegerType(unsigned BitWidth, MachineInstr &I, const SPIRVInstrInfo &TII); SPIRVType *getOrCreateSPIRVBoolType(MachineIRBuilder &MIRBuilder); + SPIRVType *getOrCreateSPIRVBoolType(MachineInstr &I, + const SPIRVInstrInfo &TII); SPIRVType *getOrCreateSPIRVVectorType(SPIRVType *BaseType, unsigned NumElements, MachineIRBuilder &MIRBuilder); SPIRVType *getOrCreateSPIRVVectorType(SPIRVType *BaseType, unsigned NumElements, MachineInstr &I, const SPIRVInstrInfo &TII); - SPIRVType *getOrCreateSPIRVPointerType( SPIRVType *BaseType, MachineIRBuilder &MIRBuilder, SPIRV::StorageClass SClass = SPIRV::StorageClass::Function); SPIRVType *getOrCreateSPIRVPointerType( SPIRVType *BaseType, MachineInstr &I, const SPIRVInstrInfo &TII, SPIRV::StorageClass SClass = SPIRV::StorageClass::Function); + SPIRVType *getOrCreateOpTypeFunctionWithArgs( + const Type *Ty, SPIRVType *RetType, + const SmallVectorImpl<SPIRVType *> &ArgTypes, + MachineIRBuilder &MIRBuilder); }; } // end namespace llvm #endif // LLLVM_LIB_TARGET_SPIRV_SPIRVTYPEMANAGER_H diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp index 754906308114..66d8b17b4296 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp @@ -52,7 +52,7 @@ bool SPIRVInstrInfo::isTypeDeclInstr(const MachineInstr &MI) const { auto DefRegClass = MRI.getRegClassOrNull(MI.getOperand(0).getReg()); return 
DefRegClass && DefRegClass->getID() == SPIRV::TYPERegClass.getID(); } else { - return false; + return MI.getOpcode() == SPIRV::OpTypeForwardPointer; } } @@ -193,3 +193,15 @@ void SPIRVInstrInfo::copyPhysReg(MachineBasicBlock &MBB, auto &MRI = I->getMF()->getRegInfo(); MRI.replaceRegWith(DstOp.getReg(), SrcOp.getReg()); } + +bool SPIRVInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { + if (MI.getOpcode() == SPIRV::GET_ID || MI.getOpcode() == SPIRV::GET_fID || + MI.getOpcode() == SPIRV::GET_pID || MI.getOpcode() == SPIRV::GET_vfID || + MI.getOpcode() == SPIRV::GET_vID) { + auto &MRI = MI.getMF()->getRegInfo(); + MRI.replaceRegWith(MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); + MI.eraseFromParent(); + return true; + } + return false; +} diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h index 2600d9cfca2e..334351c8eeae 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h @@ -48,6 +48,7 @@ public: void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const override; + bool expandPostRAPseudo(MachineInstr &MI) const override; }; } // namespace llvm diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index d6fec5fd0785..d1c20795f804 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -449,6 +449,7 @@ def OpCopyLogical: UnOp<"OpCopyLogical", 400>; def OpSNegate: UnOp<"OpSNegate", 126>; def OpFNegate: UnOpTyped<"OpFNegate", 127, fID, fneg>; +def OpFNegateV: UnOpTyped<"OpFNegate", 127, vfID, fneg>; defm OpIAdd: BinOpTypedGen<"OpIAdd", 128, add, 0, 1>; defm OpFAdd: BinOpTypedGen<"OpFAdd", 129, fadd, 1, 1>; @@ -618,8 +619,10 @@ def OpAtomicCompareExchange: Op<230, (outs ID:$res), (ins TYPE:$ty, ID:$ptr, ID:$sc, ID:$eq, ID:$neq, ID:$val, ID:$cmp), "$res = OpAtomicCompareExchange $ty $ptr $sc $eq $neq $val $cmp">; -// TODO Currently the following deprecated opcode is missing: -// OpAtomicCompareExchangeWeak +def OpAtomicCompareExchangeWeak: Op<231, (outs ID:$res), + (ins TYPE:$ty, ID:$ptr, ID:$sc, ID:$eq, + ID:$neq, ID:$val, ID:$cmp), + "$res = OpAtomicCompareExchangeWeak $ty $ptr $sc $eq $neq $val $cmp">; def OpAtomicIIncrement: AtomicOp<"OpAtomicIIncrement", 232>; def OpAtomicIDecrement: AtomicOp<"OpAtomicIDecrement", 233>; @@ -660,6 +663,11 @@ def OpMemoryNamedBarrier: Op<329, (outs), (ins ID:$barr, ID:$mem, ID:$sem), // 3.42.21. Group and Subgroup Instructions +def OpGroupAsyncCopy: Op<259, (outs ID:$res), (ins TYPE:$ty, ID:$scope, + ID:$dst, ID:$src, ID:$nelts, ID:$stride, ID:$event), + "$res = OpGroupAsyncCopy $ty $scope $dst $src $nelts $stride $event">; +def OpGroupWaitEvents: Op<260, (outs), (ins ID:$scope, ID:$nelts, ID:$elist), + "OpGroupWaitEvents $scope $nelts $elist">; def OpGroupAll: Op<261, (outs ID:$res), (ins TYPE:$ty, ID:$scope, ID:$pr), "$res = OpGroupAll $ty $scope $pr">; def OpGroupAny: Op<262, (outs ID:$res), (ins TYPE:$ty, ID:$scope, ID:$pr), @@ -680,6 +688,18 @@ def OpGroupUMax: OpGroup<"UMax", 270>; def OpGroupSMax: OpGroup<"SMax", 271>; // TODO: 3.42.22. 
Device-Side Enqueue Instructions +def OpRetainEvent: Op<297, (outs), (ins ID:$event), "OpRetainEvent $event">; +def OpReleaseEvent: Op<298, (outs), (ins ID:$event), "OpReleaseEvent $event">; +def OpCreateUserEvent: Op<299, (outs ID:$res), (ins TYPE:$type), + "$res = OpCreateUserEvent $type">; +def OpIsValidEvent: Op<300, (outs ID:$res), (ins TYPE:$type, ID:$event), + "$res = OpIsValidEvent $type $event ">; +def OpSetUserEventStatus: Op<301, (outs), (ins ID:$event, ID:$status), + "OpSetUserEventStatus $event $status">; +def OpCaptureEventProfilingInfo: Op<302, (outs), + (ins ID:$event, ID:$info, ID:$value), + "OpCaptureEventProfilingInfo $event $info $value">; + // TODO: 3.42.23. Pipe Instructions // 3.42.24. Non-Uniform Instructions diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 90b921a06f21..9365fd22e4e7 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -197,6 +197,8 @@ void SPIRVInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB, InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI); } +static bool isImm(const MachineOperand &MO, MachineRegisterInfo *MRI); + // Defined in SPIRVLegalizerInfo.cpp. extern bool isTypeFoldingSupported(unsigned Opcode); @@ -335,6 +337,30 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg, return selectUnOp(ResVReg, ResType, I, SPIRV::OpBitcast); case TargetOpcode::G_ADDRSPACE_CAST: return selectAddrSpaceCast(ResVReg, ResType, I); + case TargetOpcode::G_PTR_ADD: { + // Currently, we get G_PTR_ADD only as a result of translating + // global variables, initialized with constant expressions like GV + Const + // (see test opencl/basic/progvar_prog_scope_init.ll). + // TODO: extend the handler once we have other cases. 
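For orientation, the kind of source that produces the GV + Const constant expression mentioned in the comment above is a global initialized with the address of another global plus an offset. A minimal standalone C++ sketch (Buffer and MiddlePtr are illustrative names, not taken from the referenced test):

// A global whose initializer is "GV + Const": the front end folds &Buffer[4]
// into a constant-expression GEP, which, per the comment above, is currently
// the only way a G_PTR_ADD reaches this selector.
int Buffer[8];
int *MiddlePtr = &Buffer[4]; // pointer constant expression: @Buffer plus a constant offset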
+ assert(I.getOperand(1).isReg() && I.getOperand(2).isReg()); + Register GV = I.getOperand(1).getReg(); + MachineRegisterInfo::def_instr_iterator II = MRI->def_instr_begin(GV); + assert(((*II).getOpcode() == TargetOpcode::G_GLOBAL_VALUE || + (*II).getOpcode() == TargetOpcode::COPY || + (*II).getOpcode() == SPIRV::OpVariable) && + isImm(I.getOperand(2), MRI)); + Register Idx = buildZerosVal(GR.getOrCreateSPIRVIntegerType(32, I, TII), I); + MachineBasicBlock &BB = *I.getParent(); + auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpSpecConstantOp)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addImm(static_cast<uint32_t>( + SPIRV::Opcode::InBoundsPtrAccessChain)) + .addUse(GV) + .addUse(Idx) + .addUse(I.getOperand(2).getReg()); + return MIB.constrainAllUses(TII, TRI, RBI); + } case TargetOpcode::G_ATOMICRMW_OR: return selectAtomicRMW(ResVReg, ResType, I, SPIRV::OpAtomicOr); @@ -387,23 +413,6 @@ bool SPIRVInstructionSelector::selectUnOp(Register ResVReg, Opcode); } -static SPIRV::MemorySemantics getMemSemantics(AtomicOrdering Ord) { - switch (Ord) { - case AtomicOrdering::Acquire: - return SPIRV::MemorySemantics::Acquire; - case AtomicOrdering::Release: - return SPIRV::MemorySemantics::Release; - case AtomicOrdering::AcquireRelease: - return SPIRV::MemorySemantics::AcquireRelease; - case AtomicOrdering::SequentiallyConsistent: - return SPIRV::MemorySemantics::SequentiallyConsistent; - case AtomicOrdering::Unordered: - case AtomicOrdering::Monotonic: - case AtomicOrdering::NotAtomic: - return SPIRV::MemorySemantics::None; - } -} - static SPIRV::Scope getScope(SyncScope::ID Ord) { switch (Ord) { case SyncScope::SingleThread: @@ -484,16 +493,15 @@ bool SPIRVInstructionSelector::selectMemOperation(Register ResVReg, MachineInstr &I) const { MachineBasicBlock &BB = *I.getParent(); auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCopyMemorySized)) - .addDef(I.getOperand(0).getReg()) + .addUse(I.getOperand(0).getReg()) .addUse(I.getOperand(1).getReg()) .addUse(I.getOperand(2).getReg()); if (I.getNumMemOperands()) addMemoryOperands(*I.memoperands_begin(), MIB); bool Result = MIB.constrainAllUses(TII, TRI, RBI); - if (ResVReg.isValid() && ResVReg != MIB->getOperand(0).getReg()) { + if (ResVReg.isValid() && ResVReg != MIB->getOperand(0).getReg()) BuildMI(BB, I, I.getDebugLoc(), TII.get(TargetOpcode::COPY), ResVReg) .addUse(MIB->getOperand(0).getReg()); - } return Result; } @@ -541,36 +549,71 @@ bool SPIRVInstructionSelector::selectFence(MachineInstr &I) const { bool SPIRVInstructionSelector::selectAtomicCmpXchg(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const { - assert(I.hasOneMemOperand()); - const MachineMemOperand *MemOp = *I.memoperands_begin(); - uint32_t Scope = static_cast<uint32_t>(getScope(MemOp->getSyncScopeID())); - Register ScopeReg = buildI32Constant(Scope, I); - + Register ScopeReg; + Register MemSemEqReg; + Register MemSemNeqReg; Register Ptr = I.getOperand(2).getReg(); + if (I.getOpcode() != TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS) { + assert(I.hasOneMemOperand()); + const MachineMemOperand *MemOp = *I.memoperands_begin(); + unsigned Scope = static_cast<uint32_t>(getScope(MemOp->getSyncScopeID())); + ScopeReg = buildI32Constant(Scope, I); + + unsigned ScSem = static_cast<uint32_t>( + getMemSemanticsForStorageClass(GR.getPointerStorageClass(Ptr))); + AtomicOrdering AO = MemOp->getSuccessOrdering(); + unsigned MemSemEq = static_cast<uint32_t>(getMemSemantics(AO)) | ScSem; + MemSemEqReg = buildI32Constant(MemSemEq, I); + AtomicOrdering 
FO = MemOp->getFailureOrdering(); + unsigned MemSemNeq = static_cast<uint32_t>(getMemSemantics(FO)) | ScSem; + MemSemNeqReg = + MemSemEq == MemSemNeq ? MemSemEqReg : buildI32Constant(MemSemNeq, I); + } else { + ScopeReg = I.getOperand(5).getReg(); + MemSemEqReg = I.getOperand(6).getReg(); + MemSemNeqReg = I.getOperand(7).getReg(); + } + Register Cmp = I.getOperand(3).getReg(); Register Val = I.getOperand(4).getReg(); - SPIRVType *SpvValTy = GR.getSPIRVTypeForVReg(Val); - SPIRV::StorageClass SC = GR.getPointerStorageClass(Ptr); - uint32_t ScSem = static_cast<uint32_t>(getMemSemanticsForStorageClass(SC)); - AtomicOrdering AO = MemOp->getSuccessOrdering(); - uint32_t MemSemEq = static_cast<uint32_t>(getMemSemantics(AO)) | ScSem; - Register MemSemEqReg = buildI32Constant(MemSemEq, I); - AtomicOrdering FO = MemOp->getFailureOrdering(); - uint32_t MemSemNeq = static_cast<uint32_t>(getMemSemantics(FO)) | ScSem; - Register MemSemNeqReg = - MemSemEq == MemSemNeq ? MemSemEqReg : buildI32Constant(MemSemNeq, I); + Register ACmpRes = MRI->createVirtualRegister(&SPIRV::IDRegClass); const DebugLoc &DL = I.getDebugLoc(); - return BuildMI(*I.getParent(), I, DL, TII.get(SPIRV::OpAtomicCompareExchange)) - .addDef(ResVReg) - .addUse(GR.getSPIRVTypeID(SpvValTy)) - .addUse(Ptr) - .addUse(ScopeReg) - .addUse(MemSemEqReg) - .addUse(MemSemNeqReg) - .addUse(Val) - .addUse(Cmp) - .constrainAllUses(TII, TRI, RBI); + bool Result = + BuildMI(*I.getParent(), I, DL, TII.get(SPIRV::OpAtomicCompareExchange)) + .addDef(ACmpRes) + .addUse(GR.getSPIRVTypeID(SpvValTy)) + .addUse(Ptr) + .addUse(ScopeReg) + .addUse(MemSemEqReg) + .addUse(MemSemNeqReg) + .addUse(Val) + .addUse(Cmp) + .constrainAllUses(TII, TRI, RBI); + Register CmpSuccReg = MRI->createVirtualRegister(&SPIRV::IDRegClass); + SPIRVType *BoolTy = GR.getOrCreateSPIRVBoolType(I, TII); + Result |= BuildMI(*I.getParent(), I, DL, TII.get(SPIRV::OpIEqual)) + .addDef(CmpSuccReg) + .addUse(GR.getSPIRVTypeID(BoolTy)) + .addUse(ACmpRes) + .addUse(Cmp) + .constrainAllUses(TII, TRI, RBI); + Register TmpReg = MRI->createVirtualRegister(&SPIRV::IDRegClass); + Result |= BuildMI(*I.getParent(), I, DL, TII.get(SPIRV::OpCompositeInsert)) + .addDef(TmpReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(ACmpRes) + .addUse(GR.getOrCreateUndef(I, ResType, TII)) + .addImm(0) + .constrainAllUses(TII, TRI, RBI); + Result |= BuildMI(*I.getParent(), I, DL, TII.get(SPIRV::OpCompositeInsert)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(CmpSuccReg) + .addUse(TmpReg) + .addImm(1) + .constrainAllUses(TII, TRI, RBI); + return Result; } static bool isGenericCastablePtr(SPIRV::StorageClass SC) { @@ -592,6 +635,27 @@ static bool isGenericCastablePtr(SPIRV::StorageClass SC) { bool SPIRVInstructionSelector::selectAddrSpaceCast(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const { + // If the AddrSpaceCast user is single and in OpConstantComposite or + // OpVariable, we should select OpSpecConstantOp. 
+ auto UIs = MRI->use_instructions(ResVReg); + if (!UIs.empty() && ++UIs.begin() == UIs.end() && + (UIs.begin()->getOpcode() == SPIRV::OpConstantComposite || + UIs.begin()->getOpcode() == SPIRV::OpVariable || + isSpvIntrinsic(*UIs.begin(), Intrinsic::spv_init_global))) { + Register NewReg = I.getOperand(1).getReg(); + MachineBasicBlock &BB = *I.getParent(); + SPIRVType *SpvBaseTy = GR.getOrCreateSPIRVIntegerType(8, I, TII); + ResType = GR.getOrCreateSPIRVPointerType(SpvBaseTy, I, TII, + SPIRV::StorageClass::Generic); + bool Result = + BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpSpecConstantOp)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addImm(static_cast<uint32_t>(SPIRV::Opcode::PtrCastToGeneric)) + .addUse(NewReg) + .constrainAllUses(TII, TRI, RBI); + return Result; + } Register SrcPtr = I.getOperand(1).getReg(); SPIRVType *SrcPtrTy = GR.getSPIRVTypeForVReg(SrcPtr); SPIRV::StorageClass SrcSC = GR.getPointerStorageClass(SrcPtr); @@ -842,7 +906,9 @@ bool SPIRVInstructionSelector::selectFCmp(Register ResVReg, Register SPIRVInstructionSelector::buildZerosVal(const SPIRVType *ResType, MachineInstr &I) const { - return buildI32Constant(0, I, ResType); + if (ResType->getOpcode() == SPIRV::OpTypeVector) + return GR.getOrCreateConsIntVector(0, I, ResType, TII); + return GR.getOrCreateConstInt(0, I, ResType, TII); } Register SPIRVInstructionSelector::buildOnesVal(bool AllOnes, @@ -851,20 +917,9 @@ Register SPIRVInstructionSelector::buildOnesVal(bool AllOnes, unsigned BitWidth = GR.getScalarOrVectorBitWidth(ResType); APInt One = AllOnes ? APInt::getAllOnesValue(BitWidth) : APInt::getOneBitSet(BitWidth, 0); - Register OneReg = buildI32Constant(One.getZExtValue(), I, ResType); - if (ResType->getOpcode() == SPIRV::OpTypeVector) { - const unsigned NumEles = ResType->getOperand(2).getImm(); - Register OneVec = MRI->createVirtualRegister(&SPIRV::IDRegClass); - unsigned Opcode = SPIRV::OpConstantComposite; - auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode)) - .addDef(OneVec) - .addUse(GR.getSPIRVTypeID(ResType)); - for (unsigned i = 0; i < NumEles; ++i) - MIB.addUse(OneReg); - constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); - return OneVec; - } - return OneReg; + if (ResType->getOpcode() == SPIRV::OpTypeVector) + return GR.getOrCreateConsIntVector(One.getZExtValue(), I, ResType, TII); + return GR.getOrCreateConstInt(One.getZExtValue(), I, ResType, TII); } bool SPIRVInstructionSelector::selectSelect(Register ResVReg, @@ -959,13 +1014,23 @@ bool SPIRVInstructionSelector::selectConst(Register ResVReg, const SPIRVType *ResType, const APInt &Imm, MachineInstr &I) const { - assert(ResType->getOpcode() != SPIRV::OpTypePointer || Imm.isNullValue()); + unsigned TyOpcode = ResType->getOpcode(); + assert(TyOpcode != SPIRV::OpTypePointer || Imm.isNullValue()); MachineBasicBlock &BB = *I.getParent(); - if (ResType->getOpcode() == SPIRV::OpTypePointer && Imm.isNullValue()) { + if ((TyOpcode == SPIRV::OpTypePointer || TyOpcode == SPIRV::OpTypeEvent) && + Imm.isNullValue()) return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull)) .addDef(ResVReg) .addUse(GR.getSPIRVTypeID(ResType)) .constrainAllUses(TII, TRI, RBI); + if (TyOpcode == SPIRV::OpTypeInt) { + Register Reg = GR.getOrCreateConstInt(Imm.getZExtValue(), I, ResType, TII); + if (Reg == ResVReg) + return true; + return BuildMI(BB, I, I.getDebugLoc(), TII.get(TargetOpcode::COPY)) + .addDef(ResVReg) + .addUse(Reg) + .constrainAllUses(TII, TRI, RBI); } auto MIB = BuildMI(BB, I, I.getDebugLoc(), 
TII.get(SPIRV::OpConstantI)) .addDef(ResVReg) @@ -1006,29 +1071,29 @@ bool SPIRVInstructionSelector::selectInsertVal(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const { MachineBasicBlock &BB = *I.getParent(); - return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCompositeInsert)) - .addDef(ResVReg) - .addUse(GR.getSPIRVTypeID(ResType)) - // object to insert - .addUse(I.getOperand(3).getReg()) - // composite to insert into - .addUse(I.getOperand(2).getReg()) - // TODO: support arbitrary number of indices - .addImm(foldImm(I.getOperand(4), MRI)) - .constrainAllUses(TII, TRI, RBI); + auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCompositeInsert)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + // object to insert + .addUse(I.getOperand(3).getReg()) + // composite to insert into + .addUse(I.getOperand(2).getReg()); + for (unsigned i = 4; i < I.getNumOperands(); i++) + MIB.addImm(foldImm(I.getOperand(i), MRI)); + return MIB.constrainAllUses(TII, TRI, RBI); } bool SPIRVInstructionSelector::selectExtractVal(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const { MachineBasicBlock &BB = *I.getParent(); - return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCompositeExtract)) - .addDef(ResVReg) - .addUse(GR.getSPIRVTypeID(ResType)) - .addUse(I.getOperand(2).getReg()) - // TODO: support arbitrary number of indices - .addImm(foldImm(I.getOperand(3), MRI)) - .constrainAllUses(TII, TRI, RBI); + auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCompositeExtract)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(I.getOperand(2).getReg()); + for (unsigned i = 3; i < I.getNumOperands(); i++) + MIB.addImm(foldImm(I.getOperand(i), MRI)); + return MIB.constrainAllUses(TII, TRI, RBI); } bool SPIRVInstructionSelector::selectInsertElt(Register ResVReg, @@ -1154,6 +1219,9 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, } return MIB.constrainAllUses(TII, TRI, RBI); } break; + case Intrinsic::spv_cmpxchg: + return selectAtomicCmpXchg(ResVReg, ResType, I); + break; default: llvm_unreachable("Intrinsic selection not implemented"); } @@ -1239,8 +1307,32 @@ bool SPIRVInstructionSelector::selectGlobalValue( GV->getType(), MIRBuilder, SPIRV::AccessQualifier::ReadWrite, false); std::string GlobalIdent = GV->getGlobalIdentifier(); - // TODO: suport @llvm.global.annotations. + // We have functions as operands in tests with blocks of instruction e.g. in + // transcoding/global_block.ll. These operands are not used and should be + // substituted by zero constants. Their type is expected to be always + // OpTypePointer Function %uchar. 
+ if (isa<Function>(GV)) { + const Constant *ConstVal = GV; + MachineBasicBlock &BB = *I.getParent(); + Register NewReg = GR.find(ConstVal, GR.CurMF); + if (!NewReg.isValid()) { + SPIRVType *SpvBaseTy = GR.getOrCreateSPIRVIntegerType(8, I, TII); + ResType = GR.getOrCreateSPIRVPointerType(SpvBaseTy, I, TII); + Register NewReg = ResVReg; + GR.add(ConstVal, GR.CurMF, NewReg); + return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull)) + .addDef(NewReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .constrainAllUses(TII, TRI, RBI); + } + assert(NewReg != ResVReg); + return BuildMI(BB, I, I.getDebugLoc(), TII.get(TargetOpcode::COPY)) + .addDef(ResVReg) + .addUse(NewReg) + .constrainAllUses(TII, TRI, RBI); + } auto GlobalVar = cast<GlobalVariable>(GV); + assert(GlobalVar->getName() != "llvm.global.annotations"); bool HasInit = GlobalVar->hasInitializer() && !isa<UndefValue>(GlobalVar->getInitializer()); diff --git a/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp b/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp index 8e4ab973bf07..8aaac50c94d7 100644 --- a/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp @@ -45,7 +45,12 @@ void SPIRVMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI, break; } case MachineOperand::MO_Immediate: - MCOp = MCOperand::createImm(MO.getImm()); + if (MI->getOpcode() == SPIRV::OpExtInst && i == 2) { + Register Reg = MAI->getExtInstSetReg(MO.getImm()); + MCOp = MCOperand::createReg(Reg); + } else { + MCOp = MCOperand::createImm(MO.getImm()); + } break; case MachineOperand::MO_FPImmediate: MCOp = MCOperand::createDFPImm( diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index a39df5234935..143ddf7297dc 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -60,62 +60,50 @@ void SPIRVModuleAnalysis::setBaseInfo(const Module &M) { MAI.InstrsToDelete.clear(); MAI.FuncNameMap.clear(); MAI.GlobalVarList.clear(); + MAI.ExtInstSetMap.clear(); // TODO: determine memory model and source language from the configuratoin. - MAI.Mem = SPIRV::MemoryModel::OpenCL; - MAI.SrcLang = SPIRV::SourceLanguage::OpenCL_C; - unsigned PtrSize = ST->getPointerSize(); - MAI.Addr = PtrSize == 32 ? SPIRV::AddressingModel::Physical32 - : PtrSize == 64 ? SPIRV::AddressingModel::Physical64 - : SPIRV::AddressingModel::Logical; + if (auto MemModel = M.getNamedMetadata("spirv.MemoryModel")) { + auto MemMD = MemModel->getOperand(0); + MAI.Addr = static_cast<SPIRV::AddressingModel>(getMetadataUInt(MemMD, 0)); + MAI.Mem = static_cast<SPIRV::MemoryModel>(getMetadataUInt(MemMD, 1)); + } else { + MAI.Mem = SPIRV::MemoryModel::OpenCL; + unsigned PtrSize = ST->getPointerSize(); + MAI.Addr = PtrSize == 32 ? SPIRV::AddressingModel::Physical32 + : PtrSize == 64 ? SPIRV::AddressingModel::Physical64 + : SPIRV::AddressingModel::Logical; + } // Get the OpenCL version number from metadata. // TODO: support other source languages. - MAI.SrcLangVersion = 0; if (auto VerNode = M.getNamedMetadata("opencl.ocl.version")) { - // Construct version literal according to OpenCL 2.2 environment spec. + MAI.SrcLang = SPIRV::SourceLanguage::OpenCL_C; + // Construct version literal in accordance with SPIRV-LLVM-Translator. + // TODO: support multiple OCL version metadata. 
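A quick worked check of the source-language version literal computed in the hunk below, which follows the SPIRV-LLVM-Translator convention (Major * 100 + Minor) * 1000 + Rev. This is a standalone sketch; makeOCLVersion is an illustrative name and not part of the patch:

constexpr unsigned makeOCLVersion(unsigned Major, unsigned Minor, unsigned Rev) {
  // Same arithmetic as the MAI.SrcLangVersion assignment below.
  return (Major * 100 + Minor) * 1000 + Rev;
}
static_assert(makeOCLVersion(1, 2, 0) == 102000, "OpenCL C 1.2");
static_assert(makeOCLVersion(2, 0, 0) == 200000, "OpenCL C 2.0");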
+ assert(VerNode->getNumOperands() > 0 && "Invalid SPIR"); auto VersionMD = VerNode->getOperand(0); unsigned MajorNum = getMetadataUInt(VersionMD, 0, 2); unsigned MinorNum = getMetadataUInt(VersionMD, 1); unsigned RevNum = getMetadataUInt(VersionMD, 2); - MAI.SrcLangVersion = 0 | (MajorNum << 16) | (MinorNum << 8) | RevNum; + MAI.SrcLangVersion = (MajorNum * 100 + MinorNum) * 1000 + RevNum; + } else { + MAI.SrcLang = SPIRV::SourceLanguage::Unknown; + MAI.SrcLangVersion = 0; } -} -// True if there is an instruction in the MS list with all the same operands as -// the given instruction has (after the given starting index). -// TODO: maybe it needs to check Opcodes too. -static bool findSameInstrInMS(const MachineInstr &A, - SPIRV::ModuleSectionType MSType, - SPIRV::ModuleAnalysisInfo &MAI, - bool UpdateRegAliases, - unsigned StartOpIndex = 0) { - for (const auto *B : MAI.MS[MSType]) { - const unsigned NumAOps = A.getNumOperands(); - if (NumAOps == B->getNumOperands() && A.getNumDefs() == B->getNumDefs()) { - bool AllOpsMatch = true; - for (unsigned i = StartOpIndex; i < NumAOps && AllOpsMatch; ++i) { - if (A.getOperand(i).isReg() && B->getOperand(i).isReg()) { - Register RegA = A.getOperand(i).getReg(); - Register RegB = B->getOperand(i).getReg(); - AllOpsMatch = MAI.getRegisterAlias(A.getMF(), RegA) == - MAI.getRegisterAlias(B->getMF(), RegB); - } else { - AllOpsMatch = A.getOperand(i).isIdenticalTo(B->getOperand(i)); - } - } - if (AllOpsMatch) { - if (UpdateRegAliases) { - assert(A.getOperand(0).isReg() && B->getOperand(0).isReg()); - Register LocalReg = A.getOperand(0).getReg(); - Register GlobalReg = - MAI.getRegisterAlias(B->getMF(), B->getOperand(0).getReg()); - MAI.setRegisterAlias(A.getMF(), LocalReg, GlobalReg); - } - return true; - } + if (auto ExtNode = M.getNamedMetadata("opencl.used.extensions")) { + for (unsigned I = 0, E = ExtNode->getNumOperands(); I != E; ++I) { + MDNode *MD = ExtNode->getOperand(I); + if (!MD || MD->getNumOperands() == 0) + continue; + for (unsigned J = 0, N = MD->getNumOperands(); J != N; ++J) + MAI.SrcExt.insert(cast<MDString>(MD->getOperand(J))->getString()); } } - return false; + + // TODO: check if it's required by default. + MAI.ExtInstSetMap[static_cast<unsigned>(SPIRV::InstructionSet::OpenCL_std)] = + Register::index2VirtReg(MAI.getNextID()); } // Collect MI which defines the register in the given machine function. @@ -135,7 +123,7 @@ void SPIRVModuleAnalysis::collectGlobalEntities( const std::vector<SPIRV::DTSortableEntry *> &DepsGraph, SPIRV::ModuleSectionType MSType, std::function<bool(const SPIRV::DTSortableEntry *)> Pred, - bool UsePreOrder) { + bool UsePreOrder = false) { DenseSet<const SPIRV::DTSortableEntry *> Visited; for (const auto *E : DepsGraph) { std::function<void(const SPIRV::DTSortableEntry *)> RecHoistUtil; @@ -188,13 +176,41 @@ void SPIRVModuleAnalysis::processDefInstrs(const Module &M) { collectGlobalEntities( DepsGraph, SPIRV::MB_TypeConstVars, - [](const SPIRV::DTSortableEntry *E) { return !E->getIsFunc(); }, false); + [](const SPIRV::DTSortableEntry *E) { return !E->getIsFunc(); }); collectGlobalEntities( DepsGraph, SPIRV::MB_ExtFuncDecls, [](const SPIRV::DTSortableEntry *E) { return E->getIsFunc(); }, true); } +// True if there is an instruction in the MS list with all the same operands as +// the given instruction has (after the given starting index). +// TODO: maybe it needs to check Opcodes too. 
+static bool findSameInstrInMS(const MachineInstr &A, + SPIRV::ModuleSectionType MSType, + SPIRV::ModuleAnalysisInfo &MAI, + unsigned StartOpIndex = 0) { + for (const auto *B : MAI.MS[MSType]) { + const unsigned NumAOps = A.getNumOperands(); + if (NumAOps != B->getNumOperands() || A.getNumDefs() != B->getNumDefs()) + continue; + bool AllOpsMatch = true; + for (unsigned i = StartOpIndex; i < NumAOps && AllOpsMatch; ++i) { + if (A.getOperand(i).isReg() && B->getOperand(i).isReg()) { + Register RegA = A.getOperand(i).getReg(); + Register RegB = B->getOperand(i).getReg(); + AllOpsMatch = MAI.getRegisterAlias(A.getMF(), RegA) == + MAI.getRegisterAlias(B->getMF(), RegB); + } else { + AllOpsMatch = A.getOperand(i).isIdenticalTo(B->getOperand(i)); + } + } + if (AllOpsMatch) + return true; + } + return false; +} + // Look for IDs declared with Import linkage, and map the imported name string // to the register defining that variable (which will usually be the result of // an OpFunction). This lets us call externally imported functions using @@ -228,12 +244,16 @@ void SPIRVModuleAnalysis::collectFuncNames(MachineInstr &MI, // numbering has already occurred by this point. We can directly compare reg // arguments when detecting duplicates. static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI, - SPIRV::ModuleSectionType MSType) { + SPIRV::ModuleSectionType MSType, + bool Append = true) { MAI.setSkipEmission(&MI); - if (findSameInstrInMS(MI, MSType, MAI, false)) + if (findSameInstrInMS(MI, MSType, MAI)) return; // Found a duplicate, so don't add it. // No duplicates, so add it. - MAI.MS[MSType].push_back(&MI); + if (Append) + MAI.MS[MSType].push_back(&MI); + else + MAI.MS[MSType].insert(MAI.MS[MSType].begin(), &MI); } // Some global instructions make reference to function-local ID regs, so cannot @@ -256,15 +276,22 @@ void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) { } else if (TII->isDecorationInstr(MI)) { collectOtherInstr(MI, MAI, SPIRV::MB_Annotations); collectFuncNames(MI, *F); + } else if (TII->isConstantInstr(MI)) { + // Now OpSpecConstant*s are not in DT, + // but they need to be collected anyway. + collectOtherInstr(MI, MAI, SPIRV::MB_TypeConstVars); } else if (OpCode == SPIRV::OpFunction) { collectFuncNames(MI, *F); + } else if (OpCode == SPIRV::OpTypeForwardPointer) { + collectOtherInstr(MI, MAI, SPIRV::MB_TypeConstVars, false); } } } } // Number registers in all functions globally from 0 onwards and store -// the result in global register alias table. +// the result in global register alias table. Some registers are already +// numbered in collectGlobalEntities. void SPIRVModuleAnalysis::numberRegistersGlobally(const Module &M) { for (auto F = M.begin(), E = M.end(); F != E; ++F) { if ((*F).isDeclaration()) @@ -282,11 +309,50 @@ void SPIRVModuleAnalysis::numberRegistersGlobally(const Module &M) { Register NewReg = Register::index2VirtReg(MAI.getNextID()); MAI.setRegisterAlias(MF, Reg, NewReg); } + if (MI.getOpcode() != SPIRV::OpExtInst) + continue; + auto Set = MI.getOperand(2).getImm(); + if (MAI.ExtInstSetMap.find(Set) == MAI.ExtInstSetMap.end()) + MAI.ExtInstSetMap[Set] = Register::index2VirtReg(MAI.getNextID()); } } } } +// Find OpIEqual and OpBranchConditional instructions originating from +// OpSwitches, mark them skipped for emission. Also mark MBB skipped if it +// contains only these instructions. 
+static void processSwitches(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, + MachineModuleInfo *MMI) { + DenseSet<Register> SwitchRegs; + for (auto F = M.begin(), E = M.end(); F != E; ++F) { + MachineFunction *MF = MMI->getMachineFunction(*F); + if (!MF) + continue; + for (MachineBasicBlock &MBB : *MF) + for (MachineInstr &MI : MBB) { + if (MAI.getSkipEmission(&MI)) + continue; + if (MI.getOpcode() == SPIRV::OpSwitch) { + assert(MI.getOperand(0).isReg()); + SwitchRegs.insert(MI.getOperand(0).getReg()); + } + if (MI.getOpcode() != SPIRV::OpIEqual || !MI.getOperand(2).isReg() || + !SwitchRegs.contains(MI.getOperand(2).getReg())) + continue; + Register CmpReg = MI.getOperand(0).getReg(); + MachineInstr *CBr = MI.getNextNode(); + assert(CBr && CBr->getOpcode() == SPIRV::OpBranchConditional && + CBr->getOperand(0).isReg() && + CBr->getOperand(0).getReg() == CmpReg); + MAI.setSkipEmission(&MI); + MAI.setSkipEmission(CBr); + if (&MBB.front() == &MI && &MBB.back() == CBr) + MAI.MBBsToSkip.insert(&MBB); + } + } +} + struct SPIRV::ModuleAnalysisInfo SPIRVModuleAnalysis::MAI; void SPIRVModuleAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { @@ -305,7 +371,9 @@ bool SPIRVModuleAnalysis::runOnModule(Module &M) { setBaseInfo(M); - // TODO: Process type/const/global var/func decl instructions, number their + processSwitches(M, MAI, MMI); + + // Process type/const/global var/func decl instructions, number their // destination registers from 0 to N, collect Extensions and Capabilities. processDefInstrs(M); diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h index 585868909d28..9bcdf6e9ae2a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h @@ -52,6 +52,9 @@ struct ModuleAnalysisInfo { SPIRV::AddressingModel Addr; SPIRV::SourceLanguage SrcLang; unsigned SrcLangVersion; + StringSet<> SrcExt; + // Maps ExtInstSet to corresponding ID register. + DenseMap<unsigned, Register> ExtInstSetMap; // Contains the list of all global OpVariables in the module. SmallVector<MachineInstr *, 4> GlobalVarList; // Maps function names to coresponding function ID registers. @@ -59,6 +62,9 @@ struct ModuleAnalysisInfo { // The set contains machine instructions which are necessary // for correct MIR but will not be emitted in function bodies. DenseSet<MachineInstr *> InstrsToDelete; + // The set contains machine basic blocks which are necessary + // for correct MIR but will not be emitted. + DenseSet<MachineBasicBlock *> MBBsToSkip; // The table contains global aliases of local registers for each machine // function. The aliases are used to substitute local registers during // code emission. 
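As a rough mental model of the alias table described above: each machine function maps its local virtual registers to the module-wide registers assigned during global numbering, and emission rewrites operands through that map. A minimal standalone sketch, assuming nothing beyond the comment (RegisterAliasSketch and the plain std::map are illustrative, not the actual DenseMap-based members):

#include <map>

struct RegisterAliasSketch {
  // Per-function map: local virtual register -> globally numbered register.
  std::map<const void *, std::map<unsigned, unsigned>> Aliases;

  void setRegisterAlias(const void *MF, unsigned Local, unsigned Global) {
    Aliases[MF][Local] = Global;
  }
  unsigned getRegisterAlias(const void *MF, unsigned Local) const {
    return Aliases.at(MF).at(Local); // emission substitutes Local with this
  }
};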
@@ -75,6 +81,7 @@ struct ModuleAnalysisInfo { assert(FuncReg != FuncNameMap.end() && "Cannot find function Id"); return FuncReg->second; } + Register getExtInstSetReg(unsigned SetNum) { return ExtInstSetMap[SetNum]; } InstrList &getMSInstrs(unsigned MSType) { return MS[MSType]; } void setSkipEmission(MachineInstr *MI) { InstrsToDelete.insert(MI); } bool getSkipEmission(const MachineInstr *MI) { @@ -123,7 +130,6 @@ public: private: void setBaseInfo(const Module &M); - template <typename T> void collectTypesConstsVars(); void collectGlobalEntities( const std::vector<SPIRV::DTSortableEntry *> &DepsGraph, SPIRV::ModuleSectionType MSType, diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index 687f84046650..e620226dcc7a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -39,11 +39,58 @@ public: }; } // namespace -static bool isSpvIntrinsic(MachineInstr &MI, Intrinsic::ID IntrinsicID) { - if (MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS && - MI.getIntrinsicID() == IntrinsicID) - return true; - return false; +static void addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + DenseMap<MachineInstr *, Register> RegsAlreadyAddedToDT; + SmallVector<MachineInstr *, 10> ToErase, ToEraseComposites; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (!isSpvIntrinsic(MI, Intrinsic::spv_track_constant)) + continue; + ToErase.push_back(&MI); + auto *Const = + cast<Constant>(cast<ConstantAsMetadata>( + MI.getOperand(3).getMetadata()->getOperand(0)) + ->getValue()); + if (auto *GV = dyn_cast<GlobalValue>(Const)) { + Register Reg = GR->find(GV, &MF); + if (!Reg.isValid()) + GR->add(GV, &MF, MI.getOperand(2).getReg()); + else + RegsAlreadyAddedToDT[&MI] = Reg; + } else { + Register Reg = GR->find(Const, &MF); + if (!Reg.isValid()) { + if (auto *ConstVec = dyn_cast<ConstantDataVector>(Const)) { + auto *BuildVec = MRI.getVRegDef(MI.getOperand(2).getReg()); + assert(BuildVec && + BuildVec->getOpcode() == TargetOpcode::G_BUILD_VECTOR); + for (unsigned i = 0; i < ConstVec->getNumElements(); ++i) + GR->add(ConstVec->getElementAsConstant(i), &MF, + BuildVec->getOperand(1 + i).getReg()); + } + GR->add(Const, &MF, MI.getOperand(2).getReg()); + } else { + RegsAlreadyAddedToDT[&MI] = Reg; + // This MI is unused and will be removed. If the MI uses + // const_composite, it will be unused and should be removed too. + assert(MI.getOperand(2).isReg() && "Reg operand is expected"); + MachineInstr *SrcMI = MRI.getVRegDef(MI.getOperand(2).getReg()); + if (SrcMI && isSpvIntrinsic(*SrcMI, Intrinsic::spv_const_composite)) + ToEraseComposites.push_back(SrcMI); + } + } + } + } + for (MachineInstr *MI : ToErase) { + Register Reg = MI->getOperand(2).getReg(); + if (RegsAlreadyAddedToDT.find(MI) != RegsAlreadyAddedToDT.end()) + Reg = RegsAlreadyAddedToDT[MI]; + MRI.replaceRegWith(MI->getOperand(0).getReg(), Reg); + MI->eraseFromParent(); + } + for (MachineInstr *MI : ToEraseComposites) + MI->eraseFromParent(); } static void foldConstantsIntoIntrinsics(MachineFunction &MF) { @@ -120,6 +167,7 @@ static SPIRVType *propagateSPIRVType(MachineInstr *MI, SPIRVGlobalRegistry *GR, } case TargetOpcode::G_TRUNC: case TargetOpcode::G_ADDRSPACE_CAST: + case TargetOpcode::G_PTR_ADD: case TargetOpcode::COPY: { MachineOperand &Op = MI->getOperand(1); MachineInstr *Def = Op.isReg() ? 
MRI.getVRegDef(Op.getReg()) : nullptr; @@ -308,6 +356,22 @@ static void processInstrsWithTypeFolding(MachineFunction &MF, processInstr(MI, MIB, MRI, GR); } } + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + // We need to rewrite dst types for ASSIGN_TYPE instrs to be able + // to perform tblgen'erated selection and we can't do that on Legalizer + // as it operates on gMIR only. + if (MI.getOpcode() != SPIRV::ASSIGN_TYPE) + continue; + Register SrcReg = MI.getOperand(1).getReg(); + if (!isTypeFoldingSupported(MRI.getVRegDef(SrcReg)->getOpcode())) + continue; + Register DstReg = MI.getOperand(0).getReg(); + if (MRI.getType(DstReg).isVector()) + MRI.setRegClass(DstReg, &SPIRV::IDRegClass); + MRI.setType(DstReg, LLT::scalar(32)); + } + } } static void processSwitches(MachineFunction &MF, SPIRVGlobalRegistry *GR, @@ -421,6 +485,7 @@ bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) { SPIRVGlobalRegistry *GR = ST.getSPIRVGlobalRegistry(); GR->setCurrentFunc(MF); MachineIRBuilder MIB(MF); + addConstantsToTrack(MF, GR); foldConstantsIntoIntrinsics(MF); insertBitcasts(MF, GR, MIB); generateAssignInstrs(MF, GR, MIB); diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp new file mode 100644 index 000000000000..13c3c12c1b41 --- /dev/null +++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp @@ -0,0 +1,288 @@ +//===-- SPIRVPrepareFunctions.cpp - modify function signatures --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass modifies function signatures containing aggregate arguments +// and/or return value. Also it substitutes some llvm intrinsic calls by +// function calls, generating these functions as the translator does. +// +// NOTE: this pass is a module-level one due to the necessity to modify +// GVs/functions. 
+// +//===----------------------------------------------------------------------===// + +#include "SPIRV.h" +#include "SPIRVTargetMachine.h" +#include "SPIRVUtils.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/LowerMemIntrinsics.h" + +using namespace llvm; + +namespace llvm { +void initializeSPIRVPrepareFunctionsPass(PassRegistry &); +} + +namespace { + +class SPIRVPrepareFunctions : public ModulePass { + Function *processFunctionSignature(Function *F); + +public: + static char ID; + SPIRVPrepareFunctions() : ModulePass(ID) { + initializeSPIRVPrepareFunctionsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override; + + StringRef getPassName() const override { return "SPIRV prepare functions"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + ModulePass::getAnalysisUsage(AU); + } +}; + +} // namespace + +char SPIRVPrepareFunctions::ID = 0; + +INITIALIZE_PASS(SPIRVPrepareFunctions, "prepare-functions", + "SPIRV prepare functions", false, false) + +Function *SPIRVPrepareFunctions::processFunctionSignature(Function *F) { + IRBuilder<> B(F->getContext()); + + bool IsRetAggr = F->getReturnType()->isAggregateType(); + bool HasAggrArg = + std::any_of(F->arg_begin(), F->arg_end(), [](Argument &Arg) { + return Arg.getType()->isAggregateType(); + }); + bool DoClone = IsRetAggr || HasAggrArg; + if (!DoClone) + return F; + SmallVector<std::pair<int, Type *>, 4> ChangedTypes; + Type *RetType = IsRetAggr ? B.getInt32Ty() : F->getReturnType(); + if (IsRetAggr) + ChangedTypes.push_back(std::pair<int, Type *>(-1, F->getReturnType())); + SmallVector<Type *, 4> ArgTypes; + for (const auto &Arg : F->args()) { + if (Arg.getType()->isAggregateType()) { + ArgTypes.push_back(B.getInt32Ty()); + ChangedTypes.push_back( + std::pair<int, Type *>(Arg.getArgNo(), Arg.getType())); + } else + ArgTypes.push_back(Arg.getType()); + } + FunctionType *NewFTy = + FunctionType::get(RetType, ArgTypes, F->getFunctionType()->isVarArg()); + Function *NewF = + Function::Create(NewFTy, F->getLinkage(), F->getName(), *F->getParent()); + + ValueToValueMapTy VMap; + auto NewFArgIt = NewF->arg_begin(); + for (auto &Arg : F->args()) { + StringRef ArgName = Arg.getName(); + NewFArgIt->setName(ArgName); + VMap[&Arg] = &(*NewFArgIt++); + } + SmallVector<ReturnInst *, 8> Returns; + + CloneFunctionInto(NewF, F, VMap, CloneFunctionChangeType::LocalChangesOnly, + Returns); + NewF->takeName(F); + + NamedMDNode *FuncMD = + F->getParent()->getOrInsertNamedMetadata("spv.cloned_funcs"); + SmallVector<Metadata *, 2> MDArgs; + MDArgs.push_back(MDString::get(B.getContext(), NewF->getName())); + for (auto &ChangedTyP : ChangedTypes) + MDArgs.push_back(MDNode::get( + B.getContext(), + {ConstantAsMetadata::get(B.getInt32(ChangedTyP.first)), + ValueAsMetadata::get(Constant::getNullValue(ChangedTyP.second))})); + MDNode *ThisFuncMD = MDNode::get(B.getContext(), MDArgs); + FuncMD->addOperand(ThisFuncMD); + + for (auto *U : make_early_inc_range(F->users())) { + if (auto *CI = dyn_cast<CallInst>(U)) + CI->mutateFunctionType(NewF->getFunctionType()); + U->replaceUsesOfWith(F, NewF); + } + return NewF; +} + +std::string lowerLLVMIntrinsicName(IntrinsicInst *II) { + Function *IntrinsicFunc = II->getCalledFunction(); + assert(IntrinsicFunc && "Missing function"); + std::string FuncName = IntrinsicFunc->getName().str(); + std::replace(FuncName.begin(), FuncName.end(), '.', '_'); + FuncName = "spirv." 
+ FuncName; + return FuncName; +} + +static Function *getOrCreateFunction(Module *M, Type *RetTy, + ArrayRef<Type *> ArgTypes, + StringRef Name) { + FunctionType *FT = FunctionType::get(RetTy, ArgTypes, false); + Function *F = M->getFunction(Name); + if (F && F->getFunctionType() == FT) + return F; + Function *NewF = Function::Create(FT, GlobalValue::ExternalLinkage, Name, M); + if (F) + NewF->setDSOLocal(F->isDSOLocal()); + NewF->setCallingConv(CallingConv::SPIR_FUNC); + return NewF; +} + +static void lowerFunnelShifts(Module *M, IntrinsicInst *FSHIntrinsic) { + // Get a separate function - otherwise, we'd have to rework the CFG of the + // current one. Then simply replace the intrinsic uses with a call to the new + // function. + // Generate LLVM IR for i* @spirv.llvm_fsh?_i* (i* %a, i* %b, i* %c) + FunctionType *FSHFuncTy = FSHIntrinsic->getFunctionType(); + Type *FSHRetTy = FSHFuncTy->getReturnType(); + const std::string FuncName = lowerLLVMIntrinsicName(FSHIntrinsic); + Function *FSHFunc = + getOrCreateFunction(M, FSHRetTy, FSHFuncTy->params(), FuncName); + + if (!FSHFunc->empty()) { + FSHIntrinsic->setCalledFunction(FSHFunc); + return; + } + BasicBlock *RotateBB = BasicBlock::Create(M->getContext(), "rotate", FSHFunc); + IRBuilder<> IRB(RotateBB); + Type *Ty = FSHFunc->getReturnType(); + // Build the actual funnel shift rotate logic. + // In the comments, "int" is used interchangeably with "vector of int + // elements". + FixedVectorType *VectorTy = dyn_cast<FixedVectorType>(Ty); + Type *IntTy = VectorTy ? VectorTy->getElementType() : Ty; + unsigned BitWidth = IntTy->getIntegerBitWidth(); + ConstantInt *BitWidthConstant = IRB.getInt({BitWidth, BitWidth}); + Value *BitWidthForInsts = + VectorTy + ? IRB.CreateVectorSplat(VectorTy->getNumElements(), BitWidthConstant) + : BitWidthConstant; + Value *RotateModVal = + IRB.CreateURem(/*Rotate*/ FSHFunc->getArg(2), BitWidthForInsts); + Value *FirstShift = nullptr, *SecShift = nullptr; + if (FSHIntrinsic->getIntrinsicID() == Intrinsic::fshr) { + // Shift the less significant number right, the "rotate" number of bits + // will be 0-filled on the left as a result of this regular shift. + FirstShift = IRB.CreateLShr(FSHFunc->getArg(1), RotateModVal); + } else { + // Shift the more significant number left, the "rotate" number of bits + // will be 0-filled on the right as a result of this regular shift. + FirstShift = IRB.CreateShl(FSHFunc->getArg(0), RotateModVal); + } + // We want the "rotate" number of the more significant int's LSBs (MSBs) to + // occupy the leftmost (rightmost) "0 space" left by the previous operation. + // Therefore, subtract the "rotate" number from the integer bitsize... + Value *SubRotateVal = IRB.CreateSub(BitWidthForInsts, RotateModVal); + if (FSHIntrinsic->getIntrinsicID() == Intrinsic::fshr) { + // ...and left-shift the more significant int by this number, zero-filling + // the LSBs. + SecShift = IRB.CreateShl(FSHFunc->getArg(0), SubRotateVal); + } else { + // ...and right-shift the less significant int by this number, zero-filling + // the MSBs. + SecShift = IRB.CreateLShr(FSHFunc->getArg(1), SubRotateVal); + } + // A simple binary addition of the shifted ints yields the final result. + IRB.CreateRet(IRB.CreateOr(FirstShift, SecShift)); + + FSHIntrinsic->setCalledFunction(FSHFunc); +} + +static void buildUMulWithOverflowFunc(Module *M, Function *UMulFunc) { + // The function body is already created. 
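To make the funnel-shift expansion above concrete, here is a host-side C++ sketch of the scalar i8 fshl case (standalone and illustrative; fshl8 is not part of the pass, and the zero-rotate guard is only there to avoid an out-of-range shift in C++):

#include <cassert>
#include <cstdint>

// fshl concatenates the two operands, shifts the pair left by the rotate
// amount, and returns the most significant half.
static uint8_t fshl8(uint8_t MoreSig, uint8_t LessSig, uint8_t Rotate) {
  const unsigned BitWidth = 8;
  unsigned RotateMod = Rotate % BitWidth;                         // RotateModVal
  uint8_t First = uint8_t(MoreSig << RotateMod);                  // FirstShift (shl)
  uint8_t Sec =
      RotateMod ? uint8_t(LessSig >> (BitWidth - RotateMod)) : 0; // SecShift (lshr)
  return First | Sec;                                             // final or
}

int main() {
  // 0xAB:0xCD shifted left by 4 is 0xBCD0; its high byte is 0xBC.
  assert(fshl8(0xAB, 0xCD, 4) == 0xBC);
  assert(fshl8(0xAB, 0xCD, 0) == 0xAB); // zero rotate returns the first operand
  return 0;
}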
+ if (!UMulFunc->empty()) + return; + + BasicBlock *EntryBB = BasicBlock::Create(M->getContext(), "entry", UMulFunc); + IRBuilder<> IRB(EntryBB); + // Build the actual unsigned multiplication logic with the overflow + // indication. Do unsigned multiplication Mul = A * B. Then check + // if unsigned division Div = Mul / A is not equal to B. If so, + // then overflow has happened. + Value *Mul = IRB.CreateNUWMul(UMulFunc->getArg(0), UMulFunc->getArg(1)); + Value *Div = IRB.CreateUDiv(Mul, UMulFunc->getArg(0)); + Value *Overflow = IRB.CreateICmpNE(UMulFunc->getArg(0), Div); + + // umul.with.overflow intrinsic return a structure, where the first element + // is the multiplication result, and the second is an overflow bit. + Type *StructTy = UMulFunc->getReturnType(); + Value *Agg = IRB.CreateInsertValue(UndefValue::get(StructTy), Mul, {0}); + Value *Res = IRB.CreateInsertValue(Agg, Overflow, {1}); + IRB.CreateRet(Res); +} + +static void lowerUMulWithOverflow(Module *M, IntrinsicInst *UMulIntrinsic) { + // Get a separate function - otherwise, we'd have to rework the CFG of the + // current one. Then simply replace the intrinsic uses with a call to the new + // function. + FunctionType *UMulFuncTy = UMulIntrinsic->getFunctionType(); + Type *FSHLRetTy = UMulFuncTy->getReturnType(); + const std::string FuncName = lowerLLVMIntrinsicName(UMulIntrinsic); + Function *UMulFunc = + getOrCreateFunction(M, FSHLRetTy, UMulFuncTy->params(), FuncName); + buildUMulWithOverflowFunc(M, UMulFunc); + UMulIntrinsic->setCalledFunction(UMulFunc); +} + +static void substituteIntrinsicCalls(Module *M, Function *F) { + for (BasicBlock &BB : *F) { + for (Instruction &I : BB) { + auto Call = dyn_cast<CallInst>(&I); + if (!Call) + continue; + Call->setTailCall(false); + Function *CF = Call->getCalledFunction(); + if (!CF || !CF->isIntrinsic()) + continue; + auto *II = cast<IntrinsicInst>(Call); + if (II->getIntrinsicID() == Intrinsic::fshl || + II->getIntrinsicID() == Intrinsic::fshr) + lowerFunnelShifts(M, II); + else if (II->getIntrinsicID() == Intrinsic::umul_with_overflow) + lowerUMulWithOverflow(M, II); + } + } +} + +bool SPIRVPrepareFunctions::runOnModule(Module &M) { + for (Function &F : M) + substituteIntrinsicCalls(&M, &F); + + std::vector<Function *> FuncsWorklist; + bool Changed = false; + for (auto &F : M) + FuncsWorklist.push_back(&F); + + for (auto *Func : FuncsWorklist) { + Function *F = processFunctionSignature(Func); + + bool CreatedNewF = F != Func; + + if (Func->isDeclaration()) { + Changed |= CreatedNewF; + continue; + } + + if (CreatedNewF) + Func->eraseFromParent(); + } + + return Changed; +} + +ModulePass *llvm::createSPIRVPrepareFunctionsPass() { + return new SPIRVPrepareFunctions(); +} diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp index cdf3a160f373..00549c7b5768 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp @@ -46,8 +46,7 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, PointerSize(computePointerSize(TT)), SPIRVVersion(0), InstrInfo(), FrameLowering(initSubtargetDependencies(CPU, FS)), TLInfo(TM, *this) { GR = std::make_unique<SPIRVGlobalRegistry>(PointerSize); - CallLoweringInfo = - std::make_unique<SPIRVCallLowering>(TLInfo, *this, GR.get()); + CallLoweringInfo = std::make_unique<SPIRVCallLowering>(TLInfo, GR.get()); Legalizer = std::make_unique<SPIRVLegalizerInfo>(*this); RegBankInfo = std::make_unique<SPIRVRegisterBankInfo>(); InstSelector.reset( diff --git 
a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp index f7c88a5c6d4a..7f5f14dc3ce8 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp @@ -140,7 +140,10 @@ TargetPassConfig *SPIRVTargetMachine::createPassConfig(PassManagerBase &PM) { return new SPIRVPassConfig(*this, PM); } -void SPIRVPassConfig::addIRPasses() { TargetPassConfig::addIRPasses(); } +void SPIRVPassConfig::addIRPasses() { + TargetPassConfig::addIRPasses(); + addPass(createSPIRVPrepareFunctionsPass()); +} void SPIRVPassConfig::addISelPrepare() { addPass(createSPIRVEmitIntrinsicsPass(&getTM<SPIRVTargetMachine>())); diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp index b92dc12735f8..15671ef3e512 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp @@ -45,6 +45,14 @@ static size_t getPaddedLen(const StringRef &Str) { return (Len % 4 == 0) ? Len : Len + (4 - (Len % 4)); } +void addStringImm(const StringRef &Str, MCInst &Inst) { + const size_t PaddedLen = getPaddedLen(Str); + for (unsigned i = 0; i < PaddedLen; i += 4) { + // Add an operand for the 32-bits of chars or padding. + Inst.addOperand(MCOperand::createImm(convertCharsToWord(Str, i))); + } +} + void addStringImm(const StringRef &Str, MachineInstrBuilder &MIB) { const size_t PaddedLen = getPaddedLen(Str); for (unsigned i = 0; i < PaddedLen; i += 4) { @@ -182,6 +190,24 @@ SPIRV::MemorySemantics getMemSemanticsForStorageClass(SPIRV::StorageClass SC) { } } +SPIRV::MemorySemantics getMemSemantics(AtomicOrdering Ord) { + switch (Ord) { + case AtomicOrdering::Acquire: + return SPIRV::MemorySemantics::Acquire; + case AtomicOrdering::Release: + return SPIRV::MemorySemantics::Release; + case AtomicOrdering::AcquireRelease: + return SPIRV::MemorySemantics::AcquireRelease; + case AtomicOrdering::SequentiallyConsistent: + return SPIRV::MemorySemantics::SequentiallyConsistent; + case AtomicOrdering::Unordered: + case AtomicOrdering::Monotonic: + case AtomicOrdering::NotAtomic: + default: + return SPIRV::MemorySemantics::None; + } +} + MachineInstr *getDefInstrMaybeConstant(Register &ConstReg, const MachineRegisterInfo *MRI) { MachineInstr *ConstInstr = MRI->getVRegDef(ConstReg); @@ -202,6 +228,11 @@ uint64_t getIConstVal(Register ConstReg, const MachineRegisterInfo *MRI) { return MI->getOperand(1).getCImm()->getValue().getZExtValue(); } +bool isSpvIntrinsic(MachineInstr &MI, Intrinsic::ID IntrinsicID) { + return MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS && + MI.getIntrinsicID() == IntrinsicID; +} + Type *getMDOperandAsType(const MDNode *N, unsigned I) { return cast<ValueAsMetadata>(N->getOperand(I))->getType(); } diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h index ffa82c9c1fe4..35e24b076570 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.h +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h @@ -32,6 +32,7 @@ class SPIRVInstrInfo; // Add the given string as a series of integer operand, inserting null // terminators and padding to make sure the operands all have 32-bit // little-endian words. 
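The comment above describes the packing that both addStringImm overloads rely on: four characters per 32-bit little-endian word, with the null terminator and padding always emitted. A standalone sketch of the resulting words, assuming only what the comment and getPaddedLen state (packStringWords is an illustrative name, not the LLVM helper):

#include <cstdint>
#include <string>
#include <vector>

// "abc" packs into the single word 0x00636261; "abcd" needs a second,
// all-zero word that carries the terminator and padding.
static std::vector<uint32_t> packStringWords(const std::string &Str) {
  size_t Len = Str.size() + 1; // count the '\0'
  size_t Padded = (Len % 4 == 0) ? Len : Len + (4 - Len % 4);
  std::vector<uint32_t> Words;
  for (size_t I = 0; I < Padded; I += 4) {
    uint32_t Word = 0;
    for (size_t J = 0; J < 4 && I + J < Str.size(); ++J)
      Word |= uint32_t(uint8_t(Str[I + J])) << (8 * J); // little-endian chars
    Words.push_back(Word);
  }
  return Words;
}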
+void addStringImm(const llvm::StringRef &Str, llvm::MCInst &Inst); void addStringImm(const llvm::StringRef &Str, llvm::MachineInstrBuilder &MIB); void addStringImm(const llvm::StringRef &Str, llvm::IRBuilder<> &B, std::vector<llvm::Value *> &Args); @@ -67,6 +68,8 @@ llvm::SPIRV::StorageClass addressSpaceToStorageClass(unsigned AddrSpace); llvm::SPIRV::MemorySemantics getMemSemanticsForStorageClass(llvm::SPIRV::StorageClass SC); +llvm::SPIRV::MemorySemantics getMemSemantics(llvm::AtomicOrdering Ord); + // Find def instruction for the given ConstReg, walking through // spv_track_constant and ASSIGN_TYPE instructions. Updates ConstReg by def // of OpConstant instruction. @@ -78,6 +81,9 @@ getDefInstrMaybeConstant(llvm::Register &ConstReg, uint64_t getIConstVal(llvm::Register ConstReg, const llvm::MachineRegisterInfo *MRI); +// Check if MI is a SPIR-V specific intrinsic call. +bool isSpvIntrinsic(llvm::MachineInstr &MI, llvm::Intrinsic::ID IntrinsicID); + // Get type of i-th operand of the metadata node. llvm::Type *getMDOperandAsType(const llvm::MDNode *N, unsigned I); #endif // LLVM_LIB_TARGET_SPIRV_SPIRVUTILS_H diff --git a/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp b/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp index 1138788ac7fa..1f8837eb0194 100644 --- a/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp +++ b/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp @@ -24,10 +24,10 @@ Target &llvm::getTheSparcelTarget() { } extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcTargetInfo() { - RegisterTarget<Triple::sparc, /*HasJIT=*/true> X(getTheSparcTarget(), "sparc", - "Sparc", "Sparc"); - RegisterTarget<Triple::sparcv9, /*HasJIT=*/true> Y( + RegisterTarget<Triple::sparc, /*HasJIT=*/false> X(getTheSparcTarget(), + "sparc", "Sparc", "Sparc"); + RegisterTarget<Triple::sparcv9, /*HasJIT=*/false> Y( getTheSparcV9Target(), "sparcv9", "Sparc V9", "Sparc"); - RegisterTarget<Triple::sparcel, /*HasJIT=*/true> Z( + RegisterTarget<Triple::sparcel, /*HasJIT=*/false> Z( getTheSparcelTarget(), "sparcel", "Sparc LE", "Sparc"); } diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp b/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp index 9c73757d7f5c..86eb8365d527 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp @@ -28,7 +28,3 @@ const MCPhysReg SystemZ::XPLINK64ArgGPRs[SystemZ::XPLINK64NumArgGPRs] = { const MCPhysReg SystemZ::XPLINK64ArgFPRs[SystemZ::XPLINK64NumArgFPRs] = { SystemZ::F0D, SystemZ::F2D, SystemZ::F4D, SystemZ::F6D }; - -const MCPhysReg SystemZ::XPLINK64ArgVRs[SystemZ::XPLINK64NumArgVRs] = { - SystemZ::V24, SystemZ::V25, SystemZ::V26, SystemZ::V27, - SystemZ::V28, SystemZ::V29, SystemZ::V30, SystemZ::V31}; diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.h b/llvm/lib/Target/SystemZ/SystemZCallingConv.h index f82c61c0f344..387411942aba 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.h +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.h @@ -27,9 +27,6 @@ namespace SystemZ { const unsigned XPLINK64NumArgFPRs = 4; extern const MCPhysReg XPLINK64ArgFPRs[XPLINK64NumArgFPRs]; - - const unsigned XPLINK64NumArgVRs = 8; - extern const MCPhysReg XPLINK64ArgVRs[XPLINK64NumArgVRs]; } // end namespace SystemZ class SystemZCCState : public CCState { @@ -205,41 +202,6 @@ inline bool CC_XPLINK64_Allocate128BitVararg(unsigned &ValNo, MVT &ValVT, return false; } -inline bool CC_XPLINK64_Shadow_Stack(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy 
&ArgFlags, - CCState &State) { - ArrayRef<MCPhysReg> RegList; - - switch (LocVT.SimpleTy) { - case MVT::i64: - RegList = SystemZ::XPLINK64ArgGPRs; - break; - case MVT::v16i8: - case MVT::v8i16: - case MVT::v4i32: - case MVT::v2i64: - case MVT::v4f32: - case MVT::v2f64: - RegList = SystemZ::XPLINK64ArgVRs; - break; - case MVT::f32: - case MVT::f64: - case MVT::f128: - RegList = SystemZ::XPLINK64ArgFPRs; - break; - default: - return false; - } - - unsigned UnallocatedRegisterIndex = State.getFirstUnallocated(RegList); - // Every time we can allocate a register, allocate on the stack. - if (UnallocatedRegisterIndex < RegList.size()) - State.AllocateStack(LocVT.getSizeInBits() / 8, Align(8)); - - return false; -} - inline bool RetCC_SystemZ_Error(unsigned &, MVT &, MVT &, CCValAssign::LocInfo &, ISD::ArgFlagsTy &, CCState &) { diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td index fdd82a01f211..29b4a26736b2 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td @@ -221,9 +221,10 @@ def CC_SystemZ_XPLINK64 : CallingConv<[ // XPLINK64 ABI compliant code widens integral types smaller than i64 // to i64 before placing the parameters either on the stack or in registers. CCIfType<[i32], CCIfExtend<CCPromoteToType<i64>>>, - // Promote f32 to f64 and bitcast to i64, if it needs to be passed in GPRS. - CCIfType<[f32], CCIfNotFixed<CCPromoteToType<f64>>>, - CCIfType<[f64], CCIfNotFixed<CCBitConvertToType<i64>>>, + // Promote f32 to f64 and bitcast to i64, if it needs to be passed in GPRs. + // Although we assign the f32 vararg to be bitcast, it will first be promoted + // to an f64 within convertValVTToLocVT(). + CCIfType<[f32, f64], CCIfNotFixed<CCBitConvertToType<i64>>>, // long double, can only be passed in GPR2 and GPR3, if available, // hence R2Q CCIfType<[f128], CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>, @@ -246,34 +247,29 @@ def CC_SystemZ_XPLINK64 : CallingConv<[ // The first 3 integer arguments are passed in registers R1D-R3D. // The rest will be passed in the user area. The address offset of the user // area can be found in register R4D. - CCIfType<[i64], CCCustom<"CC_XPLINK64_Shadow_Stack">>, - CCIfType<[i64], CCAssignToReg<[R1D, R2D, R3D]>>, + CCIfType<[i64], CCAssignToRegAndStack<[R1D, R2D, R3D], 8, 8>>, - // The first 8 named vector arguments are passed in V24-V31. Sub-128 vectors + // The first 8 named vector arguments are passed in V24-V31. Sub-128 vectors // are passed in the same way, but they're widened to one of these types // during type legalization. CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>>, - CCIfSubtarget<"hasVector()", - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Stack">>>>, + CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>>, CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCIfFixed<CCAssignToReg<[V24, V25, V26, V27, - V28, V29, V30, V31]>>>>, + CCIfFixed<CCAssignToRegAndStack<[V24, V25, V26, V27, + V28, V29, V30, V31], 16, 8>>>>, - // The first 4 named float and double arguments are passed in registers FPR0-FPR6. - // The rest will be passed in the user area. + // The first 4 named float and double arguments are passed in registers + // FPR0-FPR6. The rest will be passed in the user area. 
CCIfType<[f32, f64], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>, - CCIfType<[f32, f64], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Stack">>>, - CCIfType<[f32], CCIfFixed<CCAssignToReg<[F0S, F2S, F4S, F6S]>>>, - CCIfType<[f64], CCIfFixed<CCAssignToReg<[F0D, F2D, F4D, F6D]>>>, + CCIfType<[f32], CCIfFixed<CCAssignToRegAndStack<[F0S, F2S, F4S, F6S], 4, 8>>>, + CCIfType<[f64], CCIfFixed<CCAssignToRegAndStack<[F0D, F2D, F4D, F6D], 8, 8>>>, + // The first 2 long double arguments are passed in register FPR0/FPR2 // and FPR4/FPR6. The rest will be passed in the user area. CCIfType<[f128], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>, - CCIfType<[f128], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Stack">>>, - CCIfType<[f128], CCIfFixed<CCAssignToReg<[F0Q, F4Q]>>>, + CCIfType<[f128], CCIfFixed<CCAssignToRegAndStack<[F0Q, F4Q], 16, 8>>>, // Other arguments are passed in 8-byte-aligned 8-byte stack slots. CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index 975eb8862e82..d943507b4112 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -911,6 +911,54 @@ SystemZXPLINKFrameLowering::SystemZXPLINKFrameLowering() XPLINKSpillOffsetTable[I].Offset; } +// Checks if the function is a potential candidate for being a XPLeaf routine. +static bool isXPLeafCandidate(const MachineFunction &MF) { + const MachineFrameInfo &MFFrame = MF.getFrameInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>(); + auto *Regs = + static_cast<SystemZXPLINK64Registers *>(Subtarget.getSpecialRegisters()); + + // If function calls other functions including alloca, then it is not a XPLeaf + // routine. + if (MFFrame.hasCalls()) + return false; + + // If the function has var Sized Objects, then it is not a XPLeaf routine. + if (MFFrame.hasVarSizedObjects()) + return false; + + // If the function adjusts the stack, then it is not a XPLeaf routine. + if (MFFrame.adjustsStack()) + return false; + + // If function modifies the stack pointer register, then it is not a XPLeaf + // routine. + if (MRI.isPhysRegModified(Regs->getStackPointerRegister())) + return false; + + // If function modifies the ADA register, then it is not a XPLeaf routine. + if (MRI.isPhysRegModified(Regs->getAddressOfCalleeRegister())) + return false; + + // If function modifies the return address register, then it is not a XPLeaf + // routine. + if (MRI.isPhysRegModified(Regs->getReturnFunctionAddressRegister())) + return false; + + // If the backchain pointer should be stored, then it is not a XPLeaf routine. + if (MF.getFunction().hasFnAttribute("backchain")) + return false; + + // If function acquires its own stack frame, then it is not a XPLeaf routine. + // At the time this function is called, only slots for local variables are + // allocated, so this is a very rough estimate. 
+ if (MFFrame.estimateStackSize(MF) > 0) + return false; + + return true; +} + bool SystemZXPLINKFrameLowering::assignCalleeSavedSpillSlots( MachineFunction &MF, const TargetRegisterInfo *TRI, std::vector<CalleeSavedInfo> &CSI) const { @@ -920,6 +968,18 @@ bool SystemZXPLINKFrameLowering::assignCalleeSavedSpillSlots( auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>(); auto &GRRegClass = SystemZ::GR64BitRegClass; + // At this point, the result of isXPLeafCandidate() is not accurate because + // the size of the save area has not yet been determined. If + // isXPLeafCandidate() indicates a potential leaf function, and there are no + // callee-save registers, then it is indeed a leaf function, and we can early + // exit. + // TODO: It is possible for leaf functions to use callee-saved registers. + // It can use the 0-2k range between R4 and the caller's stack frame without + // acquiring its own stack frame. + bool IsLeaf = CSI.empty() && isXPLeafCandidate(MF); + if (IsLeaf) + return true; + // For non-leaf functions: // - the address of callee (entry point) register R6 must be saved CSI.push_back(CalleeSavedInfo(Regs.getAddressOfCalleeRegister())); @@ -1137,16 +1197,16 @@ void SystemZXPLINKFrameLowering::emitPrologue(MachineFunction &MF, auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>(); MachineFrameInfo &MFFrame = MF.getFrameInfo(); MachineInstr *StoreInstr = nullptr; + + determineFrameLayout(MF); + bool HasFP = hasFP(MF); // Debug location must be unknown since the first debug location is used // to determine the end of the prologue. DebugLoc DL; uint64_t Offset = 0; - // TODO: Support leaf functions; only add size of save+reserved area when - // function is non-leaf. - MFFrame.setStackSize(MFFrame.getStackSize() + Regs.getCallFrameSize()); - uint64_t StackSize = MFFrame.getStackSize(); + const uint64_t StackSize = MFFrame.getStackSize(); if (ZFI->getSpillGPRRegs().LowGPR) { // Skip over the GPR saves. @@ -1213,8 +1273,8 @@ void SystemZXPLINKFrameLowering::emitPrologue(MachineFunction &MF, // Mark the FramePtr as live at the beginning of every block except // the entry block. (We'll have marked R8 as live on entry when // saving the GPRs.) - for (auto I = std::next(MF.begin()), E = MF.end(); I != E; ++I) - I->addLiveIn(Regs.getFramePointerRegister()); + for (MachineBasicBlock &B : llvm::drop_begin(MF)) + B.addLiveIn(Regs.getFramePointerRegister()); } } @@ -1321,3 +1381,32 @@ void SystemZXPLINKFrameLowering::processFunctionBeforeFrameFinalized( // Setup stack frame offset MFFrame.setOffsetAdjustment(Regs.getStackPointerBias()); } + +// Determines the size of the frame, and creates the deferred spill objects. +void SystemZXPLINKFrameLowering::determineFrameLayout( + MachineFunction &MF) const { + MachineFrameInfo &MFFrame = MF.getFrameInfo(); + const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>(); + auto *Regs = + static_cast<SystemZXPLINK64Registers *>(Subtarget.getSpecialRegisters()); + + uint64_t StackSize = MFFrame.getStackSize(); + if (StackSize == 0) + return; + + // Add the size of the register save area and the reserved area to the size. + StackSize += Regs->getCallFrameSize(); + MFFrame.setStackSize(StackSize); + + // We now know the stack size. Create the fixed spill stack objects for the + // register save area now. This has no impact on the stack frame layout, as + // this is already computed. However, it makes sure that all callee saved + // registers have a valid frame index assigned. 
+ const unsigned RegSize = MF.getDataLayout().getPointerSize(); + for (auto &CS : MFFrame.getCalleeSavedInfo()) { + int Offset = RegSpillOffsets[CS.getReg()]; + if (Offset >= 0) + CS.setFrameIdx( + MFFrame.CreateFixedSpillStackObject(RegSize, Offset - StackSize)); + } +} diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h index bec83a9457e0..95f30e3c0d99 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h @@ -134,6 +134,8 @@ public: void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS) const override; + + void determineFrameLayout(MachineFunction &MF) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 42c1c77f14e4..ac4531262187 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1404,8 +1404,12 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, const SDLoc &DL, return DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Value); case CCValAssign::BCvt: { assert(VA.getLocVT() == MVT::i64 || VA.getLocVT() == MVT::i128); - assert(VA.getValVT().isVector() || VA.getValVT() == MVT::f64 || - VA.getValVT() == MVT::f128); + assert(VA.getValVT().isVector() || VA.getValVT() == MVT::f32 || + VA.getValVT() == MVT::f64 || VA.getValVT() == MVT::f128); + // For an f32 vararg we need to first promote it to an f64 and then + // bitcast it to an i64. + if (VA.getValVT() == MVT::f32 && VA.getLocVT() == MVT::i64) + Value = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f64, Value); MVT BitCastToType = VA.getValVT().isVector() && VA.getLocVT() == MVT::i64 ? MVT::v2i64 : VA.getLocVT(); diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp index 94ebb59c4c77..46bb85606a62 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.cpp +++ b/llvm/lib/Target/VE/VEInstrInfo.cpp @@ -418,7 +418,9 @@ unsigned VEInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, if (MI.getOpcode() == VE::LDrii || // I64 MI.getOpcode() == VE::LDLSXrii || // I32 MI.getOpcode() == VE::LDUrii || // F32 - MI.getOpcode() == VE::LDQrii // F128 (pseudo) + MI.getOpcode() == VE::LDQrii || // F128 (pseudo) + MI.getOpcode() == VE::LDVMrii || // VM (pseudo) + MI.getOpcode() == VE::LDVM512rii // VM512 (pseudo) ) { if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0 && MI.getOperand(3).isImm() && @@ -437,10 +439,12 @@ unsigned VEInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, /// any side effects other than storing to the stack slot. 
unsigned VEInstrInfo::isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const { - if (MI.getOpcode() == VE::STrii || // I64 - MI.getOpcode() == VE::STLrii || // I32 - MI.getOpcode() == VE::STUrii || // F32 - MI.getOpcode() == VE::STQrii // F128 (pseudo) + if (MI.getOpcode() == VE::STrii || // I64 + MI.getOpcode() == VE::STLrii || // I32 + MI.getOpcode() == VE::STUrii || // F32 + MI.getOpcode() == VE::STQrii || // F128 (pseudo) + MI.getOpcode() == VE::STVMrii || // VM (pseudo) + MI.getOpcode() == VE::STVM512rii // VM512 (pseudo) ) { if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0 && MI.getOperand(2).isImm() && @@ -496,6 +500,20 @@ void VEInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addImm(0) .addReg(SrcReg, getKillRegState(isKill)) .addMemOperand(MMO); + } else if (RC == &VE::VMRegClass) { + BuildMI(MBB, I, DL, get(VE::STVMrii)) + .addFrameIndex(FI) + .addImm(0) + .addImm(0) + .addReg(SrcReg, getKillRegState(isKill)) + .addMemOperand(MMO); + } else if (VE::VM512RegClass.hasSubClassEq(RC)) { + BuildMI(MBB, I, DL, get(VE::STVM512rii)) + .addFrameIndex(FI) + .addImm(0) + .addImm(0) + .addReg(SrcReg, getKillRegState(isKill)) + .addMemOperand(MMO); } else report_fatal_error("Can't store this register to stack slot"); } @@ -539,6 +557,18 @@ void VEInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, .addImm(0) .addImm(0) .addMemOperand(MMO); + } else if (RC == &VE::VMRegClass) { + BuildMI(MBB, I, DL, get(VE::LDVMrii), DestReg) + .addFrameIndex(FI) + .addImm(0) + .addImm(0) + .addMemOperand(MMO); + } else if (VE::VM512RegClass.hasSubClassEq(RC)) { + BuildMI(MBB, I, DL, get(VE::LDVM512rii), DestReg) + .addFrameIndex(FI) + .addImm(0) + .addImm(0) + .addMemOperand(MMO); } else report_fatal_error("Can't load this register from stack slot"); } diff --git a/llvm/lib/Target/VE/VEInstrPatternsVec.td b/llvm/lib/Target/VE/VEInstrPatternsVec.td index 71199717a3a2..0b2f5039e3f3 100644 --- a/llvm/lib/Target/VE/VEInstrPatternsVec.td +++ b/llvm/lib/Target/VE/VEInstrPatternsVec.td @@ -25,6 +25,20 @@ def: Pat<(i64 (repl_i32 i32:$val)), (zero_f32 (i2l $val)), (SLLri (i2l $val), 32))>; +///// Mask Load & Store ///// + +// Store for v256i1, v512i1 are implemented in 2 ways. These STVM/STVM512 +// pseudo instruction is used for frameindex related load/store instructions. +// Custom Lowering is used for other load/store instructions. + +def : Pat<(v256i1 (load ADDRrii:$addr)), + (LDVMrii ADDRrii:$addr)>; +def : Pat<(v512i1 (load ADDRrii:$addr)), + (LDVM512rii ADDRrii:$addr)>; +def : Pat<(store v256i1:$vx, ADDRrii:$addr), + (STVMrii ADDRrii:$addr, $vx)>; +def : Pat<(store v512i1:$vx, ADDRrii:$addr), + (STVM512rii ADDRrii:$addr, $vx)>; multiclass vbrd_elem32<ValueType v32, ValueType s32, SDPatternOperator ImmOp, SDNodeXForm ImmCast, OutPatFrag SuperRegCast> { diff --git a/llvm/lib/Target/VE/VEInstrVec.td b/llvm/lib/Target/VE/VEInstrVec.td index 4a8476f7288a..327ad9ceacc5 100644 --- a/llvm/lib/Target/VE/VEInstrVec.td +++ b/llvm/lib/Target/VE/VEInstrVec.td @@ -2,6 +2,33 @@ // Vector Instructions //===----------------------------------------------------------------------===// +// Pseudo instructions for VM/VM512 spill/restore +// +// These pseudo instructions are used for only spill/restore since +// InlineSpiller assumes storeRegToStackSlot/loadRegFromStackSlot +// functions emit only single instruction. Those functions emit a +// single store/load instruction or one of these pseudo store/load +// instructions. 
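A rough sketch of the slot layout these pseudos imply, matching the EliminateFrameIndex expansion further down in this patch: a 256-bit VM register is spilled as four 8-byte chunks and a VM512 pair as eight, at consecutive offsets from the slot base.

    #include <cstdio>
    #include <initializer_list>

    int main() {
      const int ChunkBytes = 8; // one SVMmr/LVMir transfer moves 64 bits
      for (int Bits : {256, 512}) {
        int Chunks = Bits / (ChunkBytes * 8);
        printf("v%di1 spill: %d chunk stores at offsets", Bits, Chunks);
        for (int I = 0; I < Chunks; ++I)
          printf(" +%d", I * ChunkBytes); // +0 +8 +16 +24 (... +56 for VM512)
        printf("\n");
      }
    }
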
+// +// Specifies hasSideEffects = 0 to disable UnmodeledSideEffects. + +let mayLoad = 1, hasSideEffects = 0 in { +def LDVMrii : Pseudo< + (outs VM:$vmx), (ins MEMrii:$addr), + "# pseudo ldvm $vmx, $addr", []>; +def LDVM512rii : Pseudo< + (outs VM512:$vmx), (ins MEMrii:$addr), + "# pseudo ldvm512 $vmx, $addr", []>; +} +let mayStore = 1, hasSideEffects = 0 in { +def STVMrii : Pseudo< + (outs), (ins MEMrii:$addr, VM:$vmx), + "# pseudo stvm $addr, $vmx", []>; +def STVM512rii : Pseudo< + (outs), (ins MEMrii:$addr, VM512:$vmx), + "# pseudo stvm512 $addr, $vmx", []>; +} + //===----------------------------------------------------------------------===// // Pseudo instructions for VM512 modifications //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/VE/VERegisterInfo.cpp b/llvm/lib/Target/VE/VERegisterInfo.cpp index f334af128162..397ea09c9a02 100644 --- a/llvm/lib/Target/VE/VERegisterInfo.cpp +++ b/llvm/lib/Target/VE/VERegisterInfo.cpp @@ -180,6 +180,16 @@ class EliminateFrameIndex { int FIOperandNum); void processLDQ(MachineInstr &MI, Register FrameReg, int64_t Offset, int FIOperandNum); + // Expand and eliminate Frame Index of pseudo STVMrii and LDVMrii. + void processSTVM(MachineInstr &MI, Register FrameReg, int64_t Offset, + int FIOperandNum); + void processLDVM(MachineInstr &MI, Register FrameReg, int64_t Offset, + int FIOperandNum); + // Expand and eliminate Frame Index of pseudo STVM512rii and LDVM512rii. + void processSTVM512(MachineInstr &MI, Register FrameReg, int64_t Offset, + int FIOperandNum); + void processLDVM512(MachineInstr &MI, Register FrameReg, int64_t Offset, + int FIOperandNum); public: EliminateFrameIndex(const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, @@ -271,6 +281,185 @@ void EliminateFrameIndex::processLDQ(MachineInstr &MI, Register FrameReg, replaceFI(MI, FrameReg, Offset, FIOperandNum); } +void EliminateFrameIndex::processSTVM(MachineInstr &MI, Register FrameReg, + int64_t Offset, int FIOperandNum) { + assert(MI.getOpcode() == VE::STVMrii); + LLVM_DEBUG(dbgs() << "processSTVM: "; MI.dump()); + + // Original MI is: + // STVMrii frame-index, 0, offset, reg (, memory operand) + // Convert it to: + // SVMi tmp-reg, reg, 0 + // STrii frame-reg, 0, offset, tmp-reg + // SVMi tmp-reg, reg, 1 + // STrii frame-reg, 0, offset+8, tmp-reg + // SVMi tmp-reg, reg, 2 + // STrii frame-reg, 0, offset+16, tmp-reg + // SVMi tmp-reg, reg, 3 + // STrii frame-reg, 0, offset+24, tmp-reg + + prepareReplaceFI(MI, FrameReg, Offset, 24); + + Register SrcReg = MI.getOperand(3).getReg(); + bool isKill = MI.getOperand(3).isKill(); + // FIXME: it would be better to scavenge a register here instead of + // reserving SX16 all of the time. 
+ Register TmpReg = VE::SX16; + for (int i = 0; i < 3; ++i) { + build(VE::SVMmr, TmpReg).addReg(SrcReg).addImm(i); + MachineInstr *StMI = + build(VE::STrii).addReg(FrameReg).addImm(0).addImm(0).addReg( + TmpReg, getKillRegState(true)); + replaceFI(*StMI, FrameReg, Offset, 0); + Offset += 8; + } + build(VE::SVMmr, TmpReg).addReg(SrcReg, getKillRegState(isKill)).addImm(3); + MI.setDesc(get(VE::STrii)); + MI.getOperand(3).ChangeToRegister(TmpReg, false, false, true); + replaceFI(MI, FrameReg, Offset, FIOperandNum); +} + +void EliminateFrameIndex::processLDVM(MachineInstr &MI, Register FrameReg, + int64_t Offset, int FIOperandNum) { + assert(MI.getOpcode() == VE::LDVMrii); + LLVM_DEBUG(dbgs() << "processLDVM: "; MI.dump()); + + // Original MI is: + // LDVMri reg, frame-index, 0, offset (, memory operand) + // Convert it to: + // LDrii tmp-reg, frame-reg, 0, offset + // LVMir vm, 0, tmp-reg + // LDrii tmp-reg, frame-reg, 0, offset+8 + // LVMir_m vm, 1, tmp-reg, vm + // LDrii tmp-reg, frame-reg, 0, offset+16 + // LVMir_m vm, 2, tmp-reg, vm + // LDrii tmp-reg, frame-reg, 0, offset+24 + // LVMir_m vm, 3, tmp-reg, vm + + prepareReplaceFI(MI, FrameReg, Offset, 24); + + Register DestReg = MI.getOperand(0).getReg(); + // FIXME: it would be better to scavenge a register here instead of + // reserving SX16 all of the time. + unsigned TmpReg = VE::SX16; + for (int i = 0; i < 4; ++i) { + if (i != 3) { + MachineInstr *StMI = + build(VE::LDrii, TmpReg).addReg(FrameReg).addImm(0).addImm(0); + replaceFI(*StMI, FrameReg, Offset, 1); + Offset += 8; + } else { + // Last LDrii replace the target instruction. + MI.setDesc(get(VE::LDrii)); + MI.getOperand(0).ChangeToRegister(TmpReg, true); + } + // First LVM is LVMir. Others are LVMir_m. Last LVM places at the + // next of the target instruction. + if (i == 0) + build(VE::LVMir, DestReg).addImm(i).addReg(TmpReg, getKillRegState(true)); + else if (i != 3) + build(VE::LVMir_m, DestReg) + .addImm(i) + .addReg(TmpReg, getKillRegState(true)) + .addReg(DestReg); + else + BuildMI(*MI.getParent(), std::next(II), DL, get(VE::LVMir_m), DestReg) + .addImm(3) + .addReg(TmpReg, getKillRegState(true)) + .addReg(DestReg); + } + replaceFI(MI, FrameReg, Offset, FIOperandNum); +} + +void EliminateFrameIndex::processSTVM512(MachineInstr &MI, Register FrameReg, + int64_t Offset, int FIOperandNum) { + assert(MI.getOpcode() == VE::STVM512rii); + LLVM_DEBUG(dbgs() << "processSTVM512: "; MI.dump()); + + prepareReplaceFI(MI, FrameReg, Offset, 56); + + Register SrcReg = MI.getOperand(3).getReg(); + Register SrcLoReg = getSubReg(SrcReg, VE::sub_vm_odd); + Register SrcHiReg = getSubReg(SrcReg, VE::sub_vm_even); + bool isKill = MI.getOperand(3).isKill(); + // FIXME: it would be better to scavenge a register here instead of + // reserving SX16 all of the time. 
+ Register TmpReg = VE::SX16; + // store low part of VMP + MachineInstr *LastMI = nullptr; + for (int i = 0; i < 4; ++i) { + LastMI = build(VE::SVMmr, TmpReg).addReg(SrcLoReg).addImm(i); + MachineInstr *StMI = + build(VE::STrii).addReg(FrameReg).addImm(0).addImm(0).addReg( + TmpReg, getKillRegState(true)); + replaceFI(*StMI, FrameReg, Offset, 0); + Offset += 8; + } + if (isKill) + LastMI->addRegisterKilled(SrcLoReg, &TRI, true); + // store high part of VMP + for (int i = 0; i < 3; ++i) { + build(VE::SVMmr, TmpReg).addReg(SrcHiReg).addImm(i); + MachineInstr *StMI = + build(VE::STrii).addReg(FrameReg).addImm(0).addImm(0).addReg( + TmpReg, getKillRegState(true)); + replaceFI(*StMI, FrameReg, Offset, 0); + Offset += 8; + } + LastMI = build(VE::SVMmr, TmpReg).addReg(SrcHiReg).addImm(3); + if (isKill) { + LastMI->addRegisterKilled(SrcHiReg, &TRI, true); + // Add implicit super-register kills to the particular MI. + LastMI->addRegisterKilled(SrcReg, &TRI, true); + } + MI.setDesc(get(VE::STrii)); + MI.getOperand(3).ChangeToRegister(TmpReg, false, false, true); + replaceFI(MI, FrameReg, Offset, FIOperandNum); +} + +void EliminateFrameIndex::processLDVM512(MachineInstr &MI, Register FrameReg, + int64_t Offset, int FIOperandNum) { + assert(MI.getOpcode() == VE::LDVM512rii); + LLVM_DEBUG(dbgs() << "processLDVM512: "; MI.dump()); + + prepareReplaceFI(MI, FrameReg, Offset, 56); + + Register DestReg = MI.getOperand(0).getReg(); + Register DestLoReg = getSubReg(DestReg, VE::sub_vm_odd); + Register DestHiReg = getSubReg(DestReg, VE::sub_vm_even); + // FIXME: it would be better to scavenge a register here instead of + // reserving SX16 all of the time. + Register TmpReg = VE::SX16; + build(VE::IMPLICIT_DEF, DestReg); + for (int i = 0; i < 4; ++i) { + MachineInstr *LdMI = + build(VE::LDrii, TmpReg).addReg(FrameReg).addImm(0).addImm(0); + replaceFI(*LdMI, FrameReg, Offset, 1); + build(VE::LVMir_m, DestLoReg) + .addImm(i) + .addReg(TmpReg, getKillRegState(true)) + .addReg(DestLoReg); + Offset += 8; + } + for (int i = 0; i < 3; ++i) { + MachineInstr *LdMI = + build(VE::LDrii, TmpReg).addReg(FrameReg).addImm(0).addImm(0); + replaceFI(*LdMI, FrameReg, Offset, 1); + build(VE::LVMir_m, DestHiReg) + .addImm(i) + .addReg(TmpReg, getKillRegState(true)) + .addReg(DestHiReg); + Offset += 8; + } + MI.setDesc(get(VE::LDrii)); + MI.getOperand(0).ChangeToRegister(TmpReg, true); + BuildMI(*MI.getParent(), std::next(II), DL, get(VE::LVMir_m), DestHiReg) + .addImm(3) + .addReg(TmpReg, getKillRegState(true)) + .addReg(DestHiReg); + replaceFI(MI, FrameReg, Offset, FIOperandNum); +} + void EliminateFrameIndex::processMI(MachineInstr &MI, Register FrameReg, int64_t Offset, int FIOperandNum) { switch (MI.getOpcode()) { @@ -280,6 +469,18 @@ void EliminateFrameIndex::processMI(MachineInstr &MI, Register FrameReg, case VE::LDQrii: processLDQ(MI, FrameReg, Offset, FIOperandNum); return; + case VE::STVMrii: + processSTVM(MI, FrameReg, Offset, FIOperandNum); + return; + case VE::LDVMrii: + processLDVM(MI, FrameReg, Offset, FIOperandNum); + return; + case VE::STVM512rii: + processSTVM512(MI, FrameReg, Offset, FIOperandNum); + return; + case VE::LDVM512rii: + processLDVM512(MI, FrameReg, Offset, FIOperandNum); + return; } prepareReplaceFI(MI, FrameReg, Offset); replaceFI(MI, FrameReg, Offset, FIOperandNum); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index 9316826e3d92..d7720604d6dc 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ 
b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -40,7 +40,7 @@ WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI) RI(STI.getTargetTriple()) {} bool WebAssemblyInstrInfo::isReallyTriviallyReMaterializable( - const MachineInstr &MI, AAResults *AA) const { + const MachineInstr &MI) const { switch (MI.getOpcode()) { case WebAssembly::CONST_I32: case WebAssembly::CONST_I64: diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h index f45a3792467a..29d700bdf83f 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h @@ -43,8 +43,7 @@ public: const WebAssemblyRegisterInfo &getRegisterInfo() const { return RI; } - bool isReallyTriviallyReMaterializable(const MachineInstr &MI, - AAResults *AA) const override; + bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index d3ad47147ac8..f9ef45bfb41c 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -49,7 +49,6 @@ class WebAssemblyRegStackify final : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<MachineDominatorTree>(); AU.addRequired<LiveIntervals>(); AU.addPreserved<MachineBlockFrequencyInfo>(); @@ -164,15 +163,15 @@ static void queryCallee(const MachineInstr &MI, bool &Read, bool &Write, // Determine whether MI reads memory, writes memory, has side effects, // and/or uses the stack pointer value. -static void query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read, - bool &Write, bool &Effects, bool &StackPointer) { +static void query(const MachineInstr &MI, bool &Read, bool &Write, + bool &Effects, bool &StackPointer) { assert(!MI.isTerminator()); if (MI.isDebugInstr() || MI.isPosition()) return; // Check for loads. - if (MI.mayLoad() && !MI.isDereferenceableInvariantLoad(&AA)) + if (MI.mayLoad() && !MI.isDereferenceableInvariantLoad()) Read = true; // Check for stores. @@ -255,9 +254,9 @@ static void query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read, } // Test whether Def is safe and profitable to rematerialize. -static bool shouldRematerialize(const MachineInstr &Def, AliasAnalysis &AA, +static bool shouldRematerialize(const MachineInstr &Def, const WebAssemblyInstrInfo *TII) { - return Def.isAsCheapAsAMove() && TII->isTriviallyReMaterializable(Def, &AA); + return Def.isAsCheapAsAMove() && TII->isTriviallyReMaterializable(Def); } // Identify the definition for this register at this point. This is a @@ -311,7 +310,7 @@ static bool hasOneUse(unsigned Reg, MachineInstr *Def, MachineRegisterInfo &MRI, // TODO: Compute memory dependencies in a way that uses AliasAnalysis to be // more precise. 
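A minimal sketch of the conservative interference test that isSafeToMove (below) relies on now that alias analysis is no longer consulted. The struct and helper are hypothetical; the real code additionally tracks stack-pointer uses and walks every instruction between the def and the insertion point.

    // Hypothetical summary of what query() computes per instruction.
    struct MemSummary {
      bool Read = false;    // may load from non-invariant memory
      bool Write = false;   // may store
      bool Effects = false; // other unmodeled side effects
    };

    // Can the defining instruction be moved past one intervening instruction?
    bool interferes(const MemSummary &Def, const MemSummary &Between) {
      if (Def.Effects && Between.Effects)
        return true;  // two side-effecting instructions must not be reordered
      if (Def.Read && Between.Write)
        return true;  // a load may not move past a store
      if (Def.Write && (Between.Read || Between.Write))
        return true;  // a store may not move past any other memory access
      return false;
    }
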
static bool isSafeToMove(const MachineOperand *Def, const MachineOperand *Use, - const MachineInstr *Insert, AliasAnalysis &AA, + const MachineInstr *Insert, const WebAssemblyFunctionInfo &MFI, const MachineRegisterInfo &MRI) { const MachineInstr *DefI = Def->getParent(); @@ -391,7 +390,7 @@ static bool isSafeToMove(const MachineOperand *Def, const MachineOperand *Use, } bool Read = false, Write = false, Effects = false, StackPointer = false; - query(*DefI, AA, Read, Write, Effects, StackPointer); + query(*DefI, Read, Write, Effects, StackPointer); // If the instruction does not access memory and has no side effects, it has // no additional dependencies. @@ -406,7 +405,7 @@ static bool isSafeToMove(const MachineOperand *Def, const MachineOperand *Use, bool InterveningWrite = false; bool InterveningEffects = false; bool InterveningStackPointer = false; - query(*I, AA, InterveningRead, InterveningWrite, InterveningEffects, + query(*I, InterveningRead, InterveningWrite, InterveningEffects, InterveningStackPointer); if (Effects && InterveningEffects) return false; @@ -808,7 +807,6 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>(); const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo(); const auto *TRI = MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo(); - AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); auto &MDT = getAnalysis<MachineDominatorTree>(); auto &LIS = getAnalysis<LiveIntervals>(); @@ -872,8 +870,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { // supports intra-block moves) and it's MachineSink's job to catch all // the sinking opportunities anyway. bool SameBlock = DefI->getParent() == &MBB; - bool CanMove = SameBlock && - isSafeToMove(Def, &Use, Insert, AA, MFI, MRI) && + bool CanMove = SameBlock && isSafeToMove(Def, &Use, Insert, MFI, MRI) && !TreeWalker.isOnStack(Reg); if (CanMove && hasOneUse(Reg, DefI, MRI, MDT, LIS)) { Insert = moveForSingleUse(Reg, Use, DefI, MBB, Insert, LIS, MFI, MRI); @@ -883,7 +880,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { // TODO: Encode this properly as a stackified value. if (MFI.isFrameBaseVirtual() && MFI.getFrameBaseVreg() == Reg) MFI.clearFrameBaseVreg(); - } else if (shouldRematerialize(*DefI, AA, TII)) { + } else if (shouldRematerialize(*DefI, TII)) { Insert = rematerializeCheapDef(Reg, Use, *DefI, MBB, Insert->getIterator(), LIS, MFI, MRI, TII, TRI); diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index a859176220c7..fa0a6bd415dc 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -1277,7 +1277,7 @@ class ProcModel<string Name, SchedMachineModel Model, // enabled. It has no effect on code generation. // NOTE: As a default tuning, "generic" aims to produce code optimized for the // most common X86 processors. The tunings might be changed over time. It is -// recommended to use "x86-64" in lit tests for consistency. +// recommended to use "tune-cpu"="x86-64" in function attribute for consistency. 
def : ProcModel<"generic", SandyBridgeModel, [FeatureX87, FeatureCX8, FeatureX86_64], [TuningSlow3OpsLEA, diff --git a/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/llvm/lib/Target/X86/X86FixupBWInsts.cpp index 16bff201dd03..db6923416177 100644 --- a/llvm/lib/Target/X86/X86FixupBWInsts.cpp +++ b/llvm/lib/Target/X86/X86FixupBWInsts.cpp @@ -393,12 +393,12 @@ MachineInstr *FixupBWInstPass::tryReplaceInstr(MachineInstr *MI, switch (MI->getOpcode()) { case X86::MOV8rm: - // Only replace 8 bit loads with the zero extending versions if - // in an inner most loop and not optimizing for size. This takes - // an extra byte to encode, and provides limited performance upside. - if (MachineLoop *ML = MLI->getLoopFor(&MBB)) - if (ML->begin() == ML->end() && !OptForSize) - return tryReplaceLoad(X86::MOVZX32rm8, MI); + // Replace 8-bit loads with the zero-extending version if not optimizing + // for size. The extending op is cheaper across a wide range of uarch and + // it avoids a potentially expensive partial register stall. It takes an + // extra byte to encode, however, so don't do this when optimizing for size. + if (!OptForSize) + return tryReplaceLoad(X86::MOVZX32rm8, MI); break; case X86::MOV16rm: diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 12af6087cb47..5a4533c4bac4 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -555,6 +555,39 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); + auto setF16Action = [&] (MVT VT, LegalizeAction Action) { + setOperationAction(ISD::FABS, VT, Action); + setOperationAction(ISD::FNEG, VT, Action); + setOperationAction(ISD::FCOPYSIGN, VT, Expand); + setOperationAction(ISD::FREM, VT, Action); + setOperationAction(ISD::FMA, VT, Action); + setOperationAction(ISD::FMINNUM, VT, Action); + setOperationAction(ISD::FMAXNUM, VT, Action); + setOperationAction(ISD::FMINIMUM, VT, Action); + setOperationAction(ISD::FMAXIMUM, VT, Action); + setOperationAction(ISD::FSIN, VT, Action); + setOperationAction(ISD::FCOS, VT, Action); + setOperationAction(ISD::FSINCOS, VT, Action); + setOperationAction(ISD::FSQRT, VT, Action); + setOperationAction(ISD::FPOW, VT, Action); + setOperationAction(ISD::FLOG, VT, Action); + setOperationAction(ISD::FLOG2, VT, Action); + setOperationAction(ISD::FLOG10, VT, Action); + setOperationAction(ISD::FEXP, VT, Action); + setOperationAction(ISD::FEXP2, VT, Action); + setOperationAction(ISD::FCEIL, VT, Action); + setOperationAction(ISD::FFLOOR, VT, Action); + setOperationAction(ISD::FNEARBYINT, VT, Action); + setOperationAction(ISD::FRINT, VT, Action); + setOperationAction(ISD::BR_CC, VT, Action); + setOperationAction(ISD::SETCC, VT, Action); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::SELECT_CC, VT, Action); + setOperationAction(ISD::FROUND, VT, Action); + setOperationAction(ISD::FROUNDEVEN, VT, Action); + setOperationAction(ISD::FTRUNC, VT, Action); + }; + if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { // f16, f32 and f64 use SSE. // Set up the FP register classes. @@ -592,40 +625,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Half type will be promoted by default. 
- setOperationAction(ISD::FABS, MVT::f16, Promote); - setOperationAction(ISD::FNEG, MVT::f16, Promote); - setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); + setF16Action(MVT::f16, Promote); setOperationAction(ISD::FADD, MVT::f16, Promote); setOperationAction(ISD::FSUB, MVT::f16, Promote); setOperationAction(ISD::FMUL, MVT::f16, Promote); setOperationAction(ISD::FDIV, MVT::f16, Promote); - setOperationAction(ISD::FREM, MVT::f16, Promote); - setOperationAction(ISD::FMA, MVT::f16, Promote); - setOperationAction(ISD::FMINNUM, MVT::f16, Promote); - setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); - setOperationAction(ISD::FMINIMUM, MVT::f16, Promote); - setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote); - setOperationAction(ISD::FSIN, MVT::f16, Promote); - setOperationAction(ISD::FCOS, MVT::f16, Promote); - setOperationAction(ISD::FSINCOS, MVT::f16, Promote); - setOperationAction(ISD::FSQRT, MVT::f16, Promote); - setOperationAction(ISD::FPOW, MVT::f16, Promote); - setOperationAction(ISD::FLOG, MVT::f16, Promote); - setOperationAction(ISD::FLOG2, MVT::f16, Promote); - setOperationAction(ISD::FLOG10, MVT::f16, Promote); - setOperationAction(ISD::FEXP, MVT::f16, Promote); - setOperationAction(ISD::FEXP2, MVT::f16, Promote); - setOperationAction(ISD::FCEIL, MVT::f16, Promote); - setOperationAction(ISD::FFLOOR, MVT::f16, Promote); - setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote); - setOperationAction(ISD::FRINT, MVT::f16, Promote); - setOperationAction(ISD::BR_CC, MVT::f16, Promote); - setOperationAction(ISD::SETCC, MVT::f16, Promote); - setOperationAction(ISD::SELECT, MVT::f16, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f16, Promote); - setOperationAction(ISD::FROUND, MVT::f16, Promote); - setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote); - setOperationAction(ISD::FTRUNC, MVT::f16, Promote); setOperationAction(ISD::FP_ROUND, MVT::f16, LibCall); setOperationAction(ISD::FP_EXTEND, MVT::f32, LibCall); setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); @@ -1003,6 +1007,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, : &X86::VR128RegClass); addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); + addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass + : &X86::VR128RegClass); addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass @@ -1084,7 +1090,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } - for (auto VT : { MVT::v2f64, MVT::v2i64 }) { + for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); @@ -1095,19 +1101,25 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } + setF16Action(MVT::v8f16, Expand); + setOperationAction(ISD::FADD, MVT::v8f16, Expand); + setOperationAction(ISD::FSUB, MVT::v8f16, Expand); + setOperationAction(ISD::FMUL, MVT::v8f16, Expand); + setOperationAction(ISD::FDIV, MVT::v8f16, Expand); // Custom lower v2i64 and v2f64 selects. 
setOperationAction(ISD::SELECT, MVT::v2f64, Custom); setOperationAction(ISD::SELECT, MVT::v2i64, Custom); setOperationAction(ISD::SELECT, MVT::v4i32, Custom); setOperationAction(ISD::SELECT, MVT::v8i16, Custom); + setOperationAction(ISD::SELECT, MVT::v8f16, Custom); setOperationAction(ISD::SELECT, MVT::v16i8, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom); - setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom); // Custom legalize these to avoid over promotion or custom promotion. @@ -1118,8 +1130,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom); } - setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); - setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom); @@ -1304,6 +1316,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, : &X86::VR256RegClass); addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); + addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass + : &X86::VR256RegClass); addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? 
&X86::VR256XRegClass @@ -1340,12 +1354,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32); - setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom); - setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); - setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal); @@ -1356,7 +1372,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal); @@ -1386,6 +1401,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v8i32, Custom); setOperationAction(ISD::SELECT, MVT::v16i16, Custom); + setOperationAction(ISD::SELECT, MVT::v16f16, Custom); setOperationAction(ISD::SELECT, MVT::v32i8, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); @@ -1507,7 +1523,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Custom lower several nodes for 256-bit types. 
for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, - MVT::v8f32, MVT::v4f64 }) { + MVT::v16f16, MVT::v8f32, MVT::v4f64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); @@ -1518,6 +1534,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::STORE, VT, Custom); } + setF16Action(MVT::v16f16, Expand); + setOperationAction(ISD::FADD, MVT::v16f16, Expand); + setOperationAction(ISD::FSUB, MVT::v16f16, Expand); + setOperationAction(ISD::FMUL, MVT::v16f16, Expand); + setOperationAction(ISD::FDIV, MVT::v16f16, Expand); if (HasInt256) { setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); @@ -1532,11 +1553,23 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } } - if (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) { - setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); - setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); - setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); + if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() && + Subtarget.hasF16C()) { + for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) { + setOperationAction(ISD::FP_ROUND, VT, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom); + } + for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32 }) { + setOperationAction(ISD::FP_EXTEND, VT, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom); + } + for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) { + setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32); + setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32); + } + + setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal); } // This block controls legalization of the mask vector sizes that are @@ -1619,6 +1652,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v8i64, &X86::VR512RegClass); addRegisterClass(MVT::v8f64, &X86::VR512RegClass); addRegisterClass(MVT::v32i16, &X86::VR512RegClass); + addRegisterClass(MVT::v32f16, &X86::VR512RegClass); addRegisterClass(MVT::v64i8, &X86::VR512RegClass); for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { @@ -1645,14 +1679,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32); } - setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); - setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); - setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal); - setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal); - setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); - setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal); - setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, 
MVT::v16i32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom); setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal); @@ -1664,7 +1700,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); @@ -1799,15 +1834,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FSHR, MVT::v16i32, Custom); if (Subtarget.hasDQI()) { - setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); - setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal); - setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); - setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); - setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal); - setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal); - + for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP, + ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT, + ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) + setOperationAction(Opc, MVT::v8i64, Custom); setOperationAction(ISD::MUL, MVT::v8i64, Legal); } @@ -1831,7 +1861,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64, - MVT::v16f32, MVT::v8f64 }) { + MVT::v32f16, MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::SELECT, VT, Custom); @@ -1842,6 +1872,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); } + setF16Action(MVT::v32f16, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal); + for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) { + setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32); + setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32); + } for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::MLOAD, VT, Legal); @@ -1881,23 +1920,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // These operations are handled on non-VLX by artificially widening in // isel patterns. - setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, - Subtarget.hasVLX() ? 
Legal : Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, - Subtarget.hasVLX() ? Legal : Custom); if (Subtarget.hasDQI()) { // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion. @@ -1934,25 +1959,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MSCATTER, VT, Custom); if (Subtarget.hasDQI()) { - for (auto VT : { MVT::v2i64, MVT::v4i64 }) { - setOperationAction(ISD::SINT_TO_FP, VT, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::UINT_TO_FP, VT, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::STRICT_SINT_TO_FP, VT, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::STRICT_UINT_TO_FP, VT, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::FP_TO_SINT, VT, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::FP_TO_UINT, VT, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::STRICT_FP_TO_SINT, VT, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::STRICT_FP_TO_UINT, VT, - Subtarget.hasVLX() ? Legal : Custom); - setOperationAction(ISD::MUL, VT, Legal); + for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP, + ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT, + ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) { + setOperationAction(Opc, MVT::v2i64, Custom); + setOperationAction(Opc, MVT::v4i64, Custom); } + setOperationAction(ISD::MUL, MVT::v2i64, Legal); + setOperationAction(ISD::MUL, MVT::v4i64, Legal); } if (Subtarget.hasCDI()) { @@ -2052,7 +2066,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // AVX512_FP16 scalar operations setGroup(MVT::f16); - addRegisterClass(MVT::f16, &X86::FR16XRegClass); setOperationAction(ISD::FREM, MVT::f16, Promote); setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote); setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); @@ -2066,6 +2079,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal); setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal); setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand); @@ -2073,14 +2087,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget.useAVX512Regs()) { setGroup(MVT::v32f16); - addRegisterClass(MVT::v32f16, &X86::VR512RegClass); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal); + setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal); + 
setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom); @@ -2112,8 +2129,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } if (Subtarget.hasVLX()) { - addRegisterClass(MVT::v8f16, &X86::VR128XRegClass); - addRegisterClass(MVT::v16f16, &X86::VR256XRegClass); setGroup(MVT::v8f16); setGroup(MVT::v16f16); @@ -2132,8 +2147,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom); + setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal); // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom); @@ -2347,7 +2366,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, ISD::FP16_TO_FP, ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND, - ISD::FP_ROUND}); + ISD::FP_ROUND, + ISD::STRICT_FP_ROUND}); computeRegisterProperties(Subtarget.getRegisterInfo()); @@ -2404,6 +2424,10 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const { return TypeSplitVector; if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && + !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16) + return TypeSplitVector; + + if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && VT.getVectorElementType() != MVT::i1) return TypeWidenVector; @@ -2447,22 +2471,21 @@ handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC, MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && - Subtarget.hasAVX512()) { - unsigned NumElts = VT.getVectorNumElements(); + if (VT.isVector()) { + if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) { + unsigned NumElts = VT.getVectorNumElements(); - MVT RegisterVT; - unsigned NumRegisters; - std::tie(RegisterVT, NumRegisters) = - handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); - if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) - return RegisterVT; - } + MVT RegisterVT; + unsigned NumRegisters; + std::tie(RegisterVT, NumRegisters) = + handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); + if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) + return RegisterVT; + } - // v3f16 will be widen to v4f16. But we don't assign register class for v4f16. - // So its default register type is f16. We override the type to v8f16 here. - if (VT == MVT::v3f16 && Subtarget.hasFP16()) - return MVT::v8f16; + if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8) + return MVT::v8f16; + } // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled. 
if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() && @@ -2475,22 +2498,21 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && - Subtarget.hasAVX512()) { - unsigned NumElts = VT.getVectorNumElements(); + if (VT.isVector()) { + if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) { + unsigned NumElts = VT.getVectorNumElements(); - MVT RegisterVT; - unsigned NumRegisters; - std::tie(RegisterVT, NumRegisters) = - handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); - if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) - return NumRegisters; - } + MVT RegisterVT; + unsigned NumRegisters; + std::tie(RegisterVT, NumRegisters) = + handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); + if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) + return NumRegisters; + } - // v3f16 will be widen to v4f16. But we don't assign register class for v4f16. - // So its default register number is 3. We override the number to 1 here. - if (VT == MVT::v3f16 && Subtarget.hasFP16()) - return 1; + if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8) + return 1; + } // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if // x87 is disabled. @@ -9646,13 +9668,13 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, EVT CVT = Ld.getValueType(); assert(!CVT.isVector() && "Must not broadcast a vector type"); - // Splat f32, i32, v4f64, v4i64 in all cases with AVX2. + // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2. // For size optimization, also splat v2f64 and v2i64, and for size opt // with AVX2, also splat i8 and i16. // With pattern matching, the VBROADCAST node may become a VMOVDDUP. if (ScalarSize == 32 || (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) || - (ScalarSize == 16 && Subtarget.hasFP16() && CVT.isFloatingPoint()) || + CVT == MVT::f16 || (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) { const Constant *C = nullptr; if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) @@ -14129,6 +14151,16 @@ static bool isShuffleFoldableLoad(SDValue V) { ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode()); } +template<typename T> +static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) { + return VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16(); +} + +template<typename T> +bool X86TargetLowering::isSoftFP16(T VT) const { + return ::isSoftFP16(VT, Subtarget); +} + /// Try to lower insertion of a single element into a zero vector. 
/// /// This is a common pattern that we have especially efficient patterns to lower @@ -14140,6 +14172,9 @@ static SDValue lowerShuffleAsElementInsertion( MVT ExtVT = VT; MVT EltVT = VT.getVectorElementType(); + if (isSoftFP16(EltVT, Subtarget)) + return SDValue(); + int V2Index = find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) - Mask.begin(); @@ -19444,6 +19479,15 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + if (isSoftFP16(VT)) { + MVT NVT = VT.changeVectorElementTypeToInteger(); + return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond, + DAG.getBitcast(NVT, LHS), + DAG.getBitcast(NVT, RHS))); + } + // A vselect where all conditions and data are constants can be optimized into // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) && @@ -19467,8 +19511,6 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { if (!Subtarget.hasSSE41()) return SDValue(); - SDLoc dl(Op); - MVT VT = Op.getSimpleValueType(); unsigned EltSize = VT.getScalarSizeInBits(); unsigned NumElts = VT.getVectorNumElements(); @@ -20856,16 +20898,6 @@ static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG, return Cvt; } -template<typename T> -static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) { - return VT == MVT::f16 && !Subtarget.hasFP16(); -} - -template<typename T> -bool X86TargetLowering::isSoftFP16(T VT) const { - return ::isSoftFP16(VT, Subtarget); -} - static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) { bool IsStrict = Op->isStrictFPOpcode(); SDValue Src = Op.getOperand(IsStrict ? 
1 : 0); @@ -20885,6 +20917,26 @@ static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) { DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd); } +static bool isLegalConversion(MVT VT, bool IsSigned, + const X86Subtarget &Subtarget) { + if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned) + return true; + if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned) + return true; + if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32)) + return true; + if (Subtarget.useAVX512Regs()) { + if (VT == MVT::v16i32) + return true; + if (VT == MVT::v8i64 && Subtarget.hasDQI()) + return true; + } + if (Subtarget.hasDQI() && Subtarget.hasVLX() && + (VT == MVT::v2i64 || VT == MVT::v4i64)) + return true; + return false; +} + SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { bool IsStrict = Op->isStrictFPOpcode(); @@ -20897,6 +20949,8 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, if (isSoftFP16(VT)) return promoteXINT_TO_FP(Op, DAG); + else if (isLegalConversion(SrcVT, true, Subtarget)) + return Op; if (Subtarget.isTargetWin64() && SrcVT == MVT::i128) return LowerWin64_INT128_TO_FP(Op, DAG); @@ -21400,6 +21454,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, if (isSoftFP16(DstVT)) return promoteXINT_TO_FP(Op, DAG); + else if (isLegalConversion(SrcVT, false, Subtarget)) + return Op; if (DstVT.isVector()) return lowerUINT_TO_FP_vec(Op, DAG, Subtarget); @@ -22229,6 +22285,8 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { {NVT, MVT::Other}, {Chain, Src})}); return DAG.getNode(Op.getOpcode(), dl, VT, DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src)); + } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) { + return Op; } if (VT.isVector()) { @@ -22826,7 +22884,7 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { return Op; if (SVT.getVectorElementType() == MVT::f16) { - assert(Subtarget.hasFP16() && Subtarget.hasVLX() && "Unexpected features!"); + assert(Subtarget.hasF16C() && "Unexpected features!"); if (SVT == MVT::v2f16) In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In, DAG.getUNDEF(MVT::v2f16)); @@ -22836,6 +22894,8 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other}, {Op->getOperand(0), Res}); return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res); + } else if (VT == MVT::v4f64 || VT == MVT::v8f64) { + return Op; } assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); @@ -22854,34 +22914,19 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); SDValue In = Op.getOperand(IsStrict ? 1 : 0); - SDValue Op2 = Op.getOperand(IsStrict ? 
2 : 1); MVT VT = Op.getSimpleValueType(); MVT SVT = In.getSimpleValueType(); if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80)) return SDValue(); - if (VT == MVT::f16) { - if (Subtarget.hasFP16()) - return Op; - - if (SVT != MVT::f32) { - if (IsStrict) - return DAG.getNode( - ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other}, - {Chain, - DAG.getNode(ISD::STRICT_FP_ROUND, DL, {MVT::f32, MVT::Other}, - {Chain, In, Op2}), - Op2}); - - return DAG.getNode(ISD::FP_ROUND, DL, VT, - DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, In, Op2), - Op2); - } - - if (!Subtarget.hasF16C()) + if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) { + if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32) return SDValue(); + if (VT.isVector()) + return Op; + SDValue Res; SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL, MVT::i32); @@ -24176,10 +24221,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, SDLoc dl(Op); if (isFP) { -#ifndef NDEBUG MVT EltVT = Op0.getSimpleValueType().getVectorElementType(); assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64); -#endif + if (isSoftFP16(EltVT, Subtarget)) + return SDValue(); bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); @@ -24741,6 +24786,9 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get(); + if (isSoftFP16(Op0.getValueType())) + return SDValue(); + // Handle f128 first, since one possible outcome is a normal integer // comparison which gets handled by emitFlagsForSetcc. if (Op0.getValueType() == MVT::f128) { @@ -24931,10 +24979,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op1.getSimpleValueType(); SDValue CC; - if (isSoftFP16(VT)) - return DAG.getBitcast(MVT::f16, DAG.getNode(ISD::SELECT, DL, MVT::i16, Cond, - DAG.getBitcast(MVT::i16, Op1), - DAG.getBitcast(MVT::i16, Op2))); + if (isSoftFP16(VT)) { + MVT NVT = VT.changeTypeToInteger(); + return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond, + DAG.getBitcast(NVT, Op1), + DAG.getBitcast(NVT, Op2))); + } // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops // are available or VBLENDV if AVX is available. @@ -27268,27 +27318,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); } - case Intrinsic::swift_async_context_addr: { - auto &MF = DAG.getMachineFunction(); - auto X86FI = MF.getInfo<X86MachineFunctionInfo>(); - if (Subtarget.is64Bit()) { - MF.getFrameInfo().setFrameAddressIsTaken(true); - X86FI->setHasSwiftAsyncContext(true); - return SDValue( - DAG.getMachineNode( - X86::SUB64ri8, dl, MVT::i64, - DAG.getCopyFromReg(DAG.getEntryNode(), dl, X86::RBP, MVT::i64), - DAG.getTargetConstant(8, dl, MVT::i32)), - 0); - } else { - // 32-bit so no special extended frame, create or reuse an existing stack - // slot. 
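A rough way to picture what the 64-bit lowering below computes: the intrinsic resolves to the frame pointer minus 8, the slot the extended frame reserves right below the saved frame pointer. The helper is purely illustrative and not part of LLVM:

    #include <cstdio>

    // Illustration only: the address corresponds to FP - 8, the same
    // computation as the SUB64ri8 from RBP emitted below.
    void *async_context_slot() {
      char *fp = (char *)__builtin_frame_address(0); // GCC/Clang builtin
      return fp - 8;
    }

    int main() {
      printf("%p\n", async_context_slot());
    }
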
- if (!X86FI->getSwiftAsyncContextFrameIdx()) - X86FI->setSwiftAsyncContextFrameIdx( - MF.getFrameInfo().CreateStackObject(4, Align(4), false)); - return DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32); - } - } case Intrinsic::x86_avx512_vp2intersect_q_512: case Intrinsic::x86_avx512_vp2intersect_q_256: case Intrinsic::x86_avx512_vp2intersect_q_128: @@ -27668,6 +27697,37 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo); if (!IntrData) { switch (IntNo) { + + case Intrinsic::swift_async_context_addr: { + SDLoc dl(Op); + auto &MF = DAG.getMachineFunction(); + auto X86FI = MF.getInfo<X86MachineFunctionInfo>(); + if (Subtarget.is64Bit()) { + MF.getFrameInfo().setFrameAddressIsTaken(true); + X86FI->setHasSwiftAsyncContext(true); + SDValue Chain = Op->getOperand(0); + SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64); + SDValue Result = + SDValue(DAG.getMachineNode(X86::SUB64ri8, dl, MVT::i64, CopyRBP, + DAG.getTargetConstant(8, dl, MVT::i32)), + 0); + // Return { result, chain }. + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, + CopyRBP.getValue(1)); + } else { + // 32-bit so no special extended frame, create or reuse an existing + // stack slot. + if (!X86FI->getSwiftAsyncContextFrameIdx()) + X86FI->setSwiftAsyncContextFrameIdx( + MF.getFrameInfo().CreateStackObject(4, Align(4), false)); + SDValue Result = + DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32); + // Return { result, chain }. + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, + Op->getOperand(0)); + } + } + case llvm::Intrinsic::x86_seh_ehregnode: return MarkEHRegistrationNode(Op, DAG); case llvm::Intrinsic::x86_seh_ehguard: @@ -32901,20 +32961,39 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::STRICT_FP_ROUND: case ISD::FP_ROUND: { bool IsStrict = N->isStrictFPOpcode(); + SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); SDValue Src = N->getOperand(IsStrict ? 1 : 0); + SDValue Rnd = N->getOperand(IsStrict ? 2 : 1); + EVT SrcVT = Src.getValueType(); EVT VT = N->getValueType(0); - EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32; + SDValue V; if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) { SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32) : DAG.getUNDEF(MVT::v2f32); Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext); } + if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) { + assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C"); + if (SrcVT.getVectorElementType() != MVT::f32) + return; + + if (IsStrict) + V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other}, + {Chain, Src, Rnd}); + else + V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd); + + Results.push_back(DAG.getBitcast(MVT::v8f16, V)); + if (IsStrict) + Results.push_back(V.getValue(1)); + return; + } if (!isTypeLegal(Src.getValueType())) return; - SDValue V; + EVT NewVT = VT.getVectorElementType() == MVT::f16 ? 
MVT::v8f16 : MVT::v4f32; if (IsStrict) V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other}, - {N->getOperand(0), Src}); + {Chain, Src}); else V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src); Results.push_back(V); @@ -37342,6 +37421,7 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask, bool IsUnary) { unsigned NumMaskElts = Mask.size(); unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); + unsigned SizeInBits = MaskVT.getSizeInBits(); if (MaskVT.is128BitVector()) { if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) && @@ -37409,7 +37489,10 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask, // Attempt to match against a OR if we're performing a blend shuffle and the // non-blended source element is zero in each case. - if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 && + // TODO: Handle cases where V1/V2 sizes doesn't match SizeInBits. + if (SizeInBits == V1.getValueSizeInBits() && + SizeInBits == V2.getValueSizeInBits() && + (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 && (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) { bool IsBlend = true; unsigned NumV1Elts = V1.getValueType().getVectorNumElements(); @@ -39652,11 +39735,6 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, SmallVector<int, 4> Mask; unsigned Opcode = N.getOpcode(); - // FIXME: Remove this after we support vector FP16 - if (isSoftFP16(peekThroughBitcasts(N.getOperand(0)).getSimpleValueType(), - Subtarget)) - return SDValue(); - if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG)) return R; @@ -40947,12 +41025,20 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( EltBits)) { OpBits.clearAllBits(); OpElts.clearAllBits(); - for (int I = 0; I != NumElts; ++I) - if (DemandedElts[I] && ((Invert && !EltBits[I].isAllOnes()) || - (!Invert && !EltBits[I].isZero()))) { + for (int I = 0; I != NumElts; ++I) { + if (!DemandedElts[I]) + continue; + if (UndefElts[I]) { + // We can't assume an undef src element gives an undef dst - the + // other src might be zero. + OpBits.setAllBits(); + OpElts.setBit(I); + } else if ((Invert && !EltBits[I].isAllOnes()) || + (!Invert && !EltBits[I].isZero())) { OpBits |= Invert ? ~EltBits[I] : EltBits[I]; OpElts.setBit(I); } + } } return std::make_pair(OpBits, OpElts); }; @@ -44715,7 +44801,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, } // Early exit check - if (!TLI.isTypeLegal(VT)) + if (!TLI.isTypeLegal(VT) || isSoftFP16(VT, Subtarget)) return SDValue(); if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget)) @@ -47798,11 +47884,17 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, EltBits)) { DemandedBits.clearAllBits(); DemandedElts.clearAllBits(); - for (int I = 0; I != NumElts; ++I) - if (!EltBits[I].isZero()) { + for (int I = 0; I != NumElts; ++I) { + if (UndefElts[I]) { + // We can't assume an undef src element gives an undef dst - the + // other src might be zero. 
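A small worked illustration of the blend-to-OR fold that the new SizeInBits guards in matchBinaryShuffle protect; the lane values are made up for this note only.

// Blend mask {0, 5, 2, 7} takes lanes 0 and 2 from V1 and lanes 1 and 3 from V2.
//   V1 = <a, 0, c, 0>   // lanes that are not selected are known to be zero
//   V2 = <0, f, 0, h>
// The blended result <a, f, c, h> is exactly OR(V1, V2), so the shuffle can be
// matched as a plain OR. The added checks make sure V1 and V2 really are
// full-width (SizeInBits) operands before the per-lane zero test is applied.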
+ DemandedBits.setAllBits(); + DemandedElts.setBit(I); + } else if (!EltBits[I].isZero()) { DemandedBits |= EltBits[I]; DemandedElts.setBit(I); } + } } return std::make_pair(DemandedBits, DemandedElts); }; @@ -51042,6 +51134,8 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); MVT VT = N->getSimpleValueType(0); + int NumElts = VT.getVectorNumElements(); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); // ANDNP(undef, x) -> 0 // ANDNP(x, undef) -> 0 @@ -51060,6 +51154,19 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, if (SDValue Not = IsNOT(N0, DAG)) return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), N1); + // Constant Folding + APInt Undefs0, Undefs1; + SmallVector<APInt> EltBits0, EltBits1; + if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0) && + getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1)) { + SDLoc DL(N); + SmallVector<APInt> ResultBits; + for (int I = 0; I != NumElts; ++I) + ResultBits.push_back(~EltBits0[I] & EltBits1[I]); + APInt ResultUndefs = APInt::getZero(NumElts); + return getConstVector(ResultBits, ResultUndefs, VT, DAG, DL); + } + // TODO: Constant fold NOT(N0) to allow us to use AND. // TODO: Do this in IsNOT with suitable oneuse checks? @@ -51074,20 +51181,24 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) { APInt UndefElts; SmallVector<APInt> EltBits; - int NumElts = VT.getVectorNumElements(); - int EltSizeInBits = VT.getScalarSizeInBits(); APInt DemandedBits = APInt::getAllOnes(EltSizeInBits); APInt DemandedElts = APInt::getAllOnes(NumElts); if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits)) { DemandedBits.clearAllBits(); DemandedElts.clearAllBits(); - for (int I = 0; I != NumElts; ++I) - if ((Invert && !EltBits[I].isAllOnes()) || - (!Invert && !EltBits[I].isZero())) { + for (int I = 0; I != NumElts; ++I) { + if (UndefElts[I]) { + // We can't assume an undef src element gives an undef dst - the + // other src might be zero. + DemandedBits.setAllBits(); + DemandedElts.setBit(I); + } else if ((Invert && !EltBits[I].isAllOnes()) || + (!Invert && !EltBits[I].isZero())) { DemandedBits |= Invert ? ~EltBits[I] : EltBits[I]; DemandedElts.setBit(I); } + } } return std::make_pair(DemandedBits, DemandedElts); }; @@ -54714,8 +54825,9 @@ static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasFP16()) return SDValue(); + bool IsStrict = N->isStrictFPOpcode(); EVT VT = N->getValueType(0); - SDValue Src = N->getOperand(0); + SDValue Src = N->getOperand(IsStrict ? 1 : 0); EVT SrcVT = Src.getValueType(); if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 || @@ -54736,8 +54848,15 @@ static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, // Destination is v8i16 with at least 8 elements. EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts)); - SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, - DAG.getTargetConstant(4, dl, MVT::i32)); + SDValue Cvt, Chain; + SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32); + if (IsStrict) { + Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other}, + {N->getOperand(0), Src, Rnd}); + Chain = Cvt.getValue(1); + } else { + Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd); + } // Extract down to real number of elements. 
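The constant folding added to combineAndnp computes each result element as ~EltBits0[I] & EltBits1[I]. A tiny standalone example of that per-lane arithmetic, with values chosen arbitrarily:

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t lane0 = 0x0000FFFFu;    // element taken from N0
  uint32_t lane1 = 0x12345678u;    // element taken from N1
  uint32_t andnp = ~lane0 & lane1; // ANDNP semantics: 0x12340000
  std::printf("0x%08x\n", andnp);
  return 0;
}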
if (NumElts < 8) { @@ -54746,7 +54865,12 @@ static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, DAG.getIntPtrConstant(0, dl)); } - return DAG.getBitcast(VT, Cvt); + Cvt = DAG.getBitcast(VT, Cvt); + + if (IsStrict) + return DAG.getMergeValues({Cvt, Chain}, dl); + + return Cvt; } static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) { @@ -54954,6 +55078,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget); case ISD::STRICT_FP_EXTEND: case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget); + case ISD::STRICT_FP_ROUND: case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget); case X86ISD::VBROADCAST_LOAD: case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI); diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 48da7b3ac882..c105bde78ad1 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -3769,12 +3769,16 @@ let Predicates = [HasAVX512] in { (VMOVDQA64Zrm addr:$src)>; def : Pat<(alignedloadv32i16 addr:$src), (VMOVDQA64Zrm addr:$src)>; + def : Pat<(alignedloadv32f16 addr:$src), + (VMOVAPSZrm addr:$src)>; def : Pat<(alignedloadv64i8 addr:$src), (VMOVDQA64Zrm addr:$src)>; def : Pat<(loadv16i32 addr:$src), (VMOVDQU64Zrm addr:$src)>; def : Pat<(loadv32i16 addr:$src), (VMOVDQU64Zrm addr:$src)>; + def : Pat<(loadv32f16 addr:$src), + (VMOVUPSZrm addr:$src)>; def : Pat<(loadv64i8 addr:$src), (VMOVDQU64Zrm addr:$src)>; @@ -3783,12 +3787,16 @@ let Predicates = [HasAVX512] in { (VMOVDQA64Zmr addr:$dst, VR512:$src)>; def : Pat<(alignedstore (v32i16 VR512:$src), addr:$dst), (VMOVDQA64Zmr addr:$dst, VR512:$src)>; + def : Pat<(alignedstore (v32f16 VR512:$src), addr:$dst), + (VMOVAPSZmr addr:$dst, VR512:$src)>; def : Pat<(alignedstore (v64i8 VR512:$src), addr:$dst), (VMOVDQA64Zmr addr:$dst, VR512:$src)>; def : Pat<(store (v16i32 VR512:$src), addr:$dst), (VMOVDQU64Zmr addr:$dst, VR512:$src)>; def : Pat<(store (v32i16 VR512:$src), addr:$dst), (VMOVDQU64Zmr addr:$dst, VR512:$src)>; + def : Pat<(store (v32f16 VR512:$src), addr:$dst), + (VMOVUPSZmr addr:$dst, VR512:$src)>; def : Pat<(store (v64i8 VR512:$src), addr:$dst), (VMOVDQU64Zmr addr:$dst, VR512:$src)>; } @@ -3799,12 +3807,16 @@ let Predicates = [HasVLX] in { (VMOVDQA64Z128rm addr:$src)>; def : Pat<(alignedloadv8i16 addr:$src), (VMOVDQA64Z128rm addr:$src)>; + def : Pat<(alignedloadv8f16 addr:$src), + (VMOVAPSZ128rm addr:$src)>; def : Pat<(alignedloadv16i8 addr:$src), (VMOVDQA64Z128rm addr:$src)>; def : Pat<(loadv4i32 addr:$src), (VMOVDQU64Z128rm addr:$src)>; def : Pat<(loadv8i16 addr:$src), (VMOVDQU64Z128rm addr:$src)>; + def : Pat<(loadv8f16 addr:$src), + (VMOVUPSZ128rm addr:$src)>; def : Pat<(loadv16i8 addr:$src), (VMOVDQU64Z128rm addr:$src)>; @@ -3813,12 +3825,16 @@ let Predicates = [HasVLX] in { (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>; def : Pat<(alignedstore (v8i16 VR128X:$src), addr:$dst), (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>; + def : Pat<(alignedstore (v8f16 VR128X:$src), addr:$dst), + (VMOVAPSZ128mr addr:$dst, VR128X:$src)>; def : Pat<(alignedstore (v16i8 VR128X:$src), addr:$dst), (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>; def : Pat<(store (v4i32 VR128X:$src), addr:$dst), (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>; def : Pat<(store (v8i16 VR128X:$src), addr:$dst), (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>; + def : Pat<(store (v8f16 VR128X:$src), addr:$dst), + (VMOVUPSZ128mr addr:$dst, VR128X:$src)>; def : Pat<(store (v16i8 
VR128X:$src), addr:$dst), (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>; @@ -3827,12 +3843,16 @@ let Predicates = [HasVLX] in { (VMOVDQA64Z256rm addr:$src)>; def : Pat<(alignedloadv16i16 addr:$src), (VMOVDQA64Z256rm addr:$src)>; + def : Pat<(alignedloadv16f16 addr:$src), + (VMOVAPSZ256rm addr:$src)>; def : Pat<(alignedloadv32i8 addr:$src), (VMOVDQA64Z256rm addr:$src)>; def : Pat<(loadv8i32 addr:$src), (VMOVDQU64Z256rm addr:$src)>; def : Pat<(loadv16i16 addr:$src), (VMOVDQU64Z256rm addr:$src)>; + def : Pat<(loadv16f16 addr:$src), + (VMOVUPSZ256rm addr:$src)>; def : Pat<(loadv32i8 addr:$src), (VMOVDQU64Z256rm addr:$src)>; @@ -3841,12 +3861,16 @@ let Predicates = [HasVLX] in { (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>; def : Pat<(alignedstore (v16i16 VR256X:$src), addr:$dst), (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>; + def : Pat<(alignedstore (v16f16 VR256X:$src), addr:$dst), + (VMOVAPSZ256mr addr:$dst, VR256X:$src)>; def : Pat<(alignedstore (v32i8 VR256X:$src), addr:$dst), (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>; def : Pat<(store (v8i32 VR256X:$src), addr:$dst), (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>; def : Pat<(store (v16i16 VR256X:$src), addr:$dst), (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>; + def : Pat<(store (v16f16 VR256X:$src), addr:$dst), + (VMOVUPSZ256mr addr:$dst, VR256X:$src)>; def : Pat<(store (v32i8 VR256X:$src), addr:$dst), (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>; } @@ -3855,16 +3879,12 @@ let Predicates = [HasBWI] in { (VMOVDQU16Zrrk VR512:$src0, VK32WM:$mask, VR512:$src1)>; def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), v32f16_info.ImmAllZerosV)), (VMOVDQU16Zrrkz VK32WM:$mask, VR512:$src1)>; - def : Pat<(v32f16 (alignedloadv32f16 addr:$src)), - (VMOVAPSZrm addr:$src)>; def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 (alignedloadv32f16 addr:$src)), (v32f16 VR512:$src0))), (VMOVDQU16Zrmk VR512:$src0, VK32WM:$mask, addr:$src)>; def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 (alignedloadv32f16 addr:$src)), v32f16_info.ImmAllZerosV)), (VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>; - def : Pat<(v32f16 (loadv32f16 addr:$src)), - (VMOVUPSZrm addr:$src)>; def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 (loadv32f16 addr:$src)), (v32f16 VR512:$src0))), (VMOVDQU16Zrmk VR512:$src0, VK32WM:$mask, addr:$src)>; @@ -3878,10 +3898,6 @@ let Predicates = [HasBWI] in { def : Pat<(v32f16 (masked_load addr:$src, VK32WM:$mask, v32f16_info.ImmAllZerosV)), (VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>; - def : Pat<(alignedstore (v32f16 VR512:$src), addr:$dst), - (VMOVAPSZmr addr:$dst, VR512:$src)>; - def : Pat<(store (v32f16 VR512:$src), addr:$dst), - (VMOVUPSZmr addr:$dst, VR512:$src)>; def : Pat<(masked_store (v32f16 VR512:$src), addr:$dst, VK32WM:$mask), (VMOVDQU16Zmrk addr:$dst, VK32WM:$mask, VR512:$src)>; } @@ -3890,16 +3906,12 @@ let Predicates = [HasBWI, HasVLX] in { (VMOVDQU16Z256rrk VR256X:$src0, VK16WM:$mask, VR256X:$src1)>; def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), v16f16x_info.ImmAllZerosV)), (VMOVDQU16Z256rrkz VK16WM:$mask, VR256X:$src1)>; - def : Pat<(v16f16 (alignedloadv16f16 addr:$src)), - (VMOVAPSZ256rm addr:$src)>; def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 (alignedloadv16f16 addr:$src)), (v16f16 VR256X:$src0))), (VMOVDQU16Z256rmk VR256X:$src0, VK16WM:$mask, addr:$src)>; def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 (alignedloadv16f16 addr:$src)), v16f16x_info.ImmAllZerosV)), (VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>; - def : Pat<(v16f16 (loadv16f16 addr:$src)), - (VMOVUPSZ256rm addr:$src)>; def : Pat<(v16f16 (vselect VK16WM:$mask, 
(v16f16 (loadv16f16 addr:$src)), (v16f16 VR256X:$src0))), (VMOVDQU16Z256rmk VR256X:$src0, VK16WM:$mask, addr:$src)>; @@ -3913,10 +3925,6 @@ let Predicates = [HasBWI, HasVLX] in { def : Pat<(v16f16 (masked_load addr:$src, VK16WM:$mask, v16f16x_info.ImmAllZerosV)), (VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>; - def : Pat<(alignedstore (v16f16 VR256X:$src), addr:$dst), - (VMOVAPSZ256mr addr:$dst, VR256X:$src)>; - def : Pat<(store (v16f16 VR256X:$src), addr:$dst), - (VMOVUPSZ256mr addr:$dst, VR256X:$src)>; def : Pat<(masked_store (v16f16 VR256X:$src), addr:$dst, VK16WM:$mask), (VMOVDQU16Z256mrk addr:$dst, VK16WM:$mask, VR256X:$src)>; @@ -3924,16 +3932,12 @@ let Predicates = [HasBWI, HasVLX] in { (VMOVDQU16Z128rrk VR128X:$src0, VK8WM:$mask, VR128X:$src1)>; def : Pat<(v8f16 (vselect VK8WM:$mask, (v8f16 VR128X:$src1), v8f16x_info.ImmAllZerosV)), (VMOVDQU16Z128rrkz VK8WM:$mask, VR128X:$src1)>; - def : Pat<(v8f16 (alignedloadv8f16 addr:$src)), - (VMOVAPSZ128rm addr:$src)>; def : Pat<(v8f16 (vselect VK8WM:$mask, (v8f16 (alignedloadv8f16 addr:$src)), (v8f16 VR128X:$src0))), (VMOVDQU16Z128rmk VR128X:$src0, VK8WM:$mask, addr:$src)>; def : Pat<(v8f16 (vselect VK8WM:$mask, (v8f16 (alignedloadv8f16 addr:$src)), v8f16x_info.ImmAllZerosV)), (VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>; - def : Pat<(v8f16 (loadv8f16 addr:$src)), - (VMOVUPSZ128rm addr:$src)>; def : Pat<(v8f16 (vselect VK8WM:$mask, (v8f16 (loadv8f16 addr:$src)), (v8f16 VR128X:$src0))), (VMOVDQU16Z128rmk VR128X:$src0, VK8WM:$mask, addr:$src)>; @@ -3947,10 +3951,6 @@ let Predicates = [HasBWI, HasVLX] in { def : Pat<(v8f16 (masked_load addr:$src, VK8WM:$mask, v8f16x_info.ImmAllZerosV)), (VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>; - def : Pat<(alignedstore (v8f16 VR128X:$src), addr:$dst), - (VMOVAPSZ128mr addr:$dst, VR128X:$src)>; - def : Pat<(store (v8f16 VR128X:$src), addr:$dst), - (VMOVUPSZ128mr addr:$dst, VR128X:$src)>; def : Pat<(masked_store (v8f16 VR128X:$src), addr:$dst, VK8WM:$mask), (VMOVDQU16Z128mrk addr:$dst, VK8WM:$mask, VR128X:$src)>; } diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index ec32ac2acad1..74ef831e1658 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -742,8 +742,8 @@ static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) { return isPICBase; } -bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, - AAResults *AA) const { +bool X86InstrInfo::isReallyTriviallyReMaterializable( + const MachineInstr &MI) const { switch (MI.getOpcode()) { default: // This function should only be called for opcodes with the ReMaterializable @@ -869,7 +869,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, MI.getOperand(1 + X86::AddrScaleAmt).isImm() && MI.getOperand(1 + X86::AddrIndexReg).isReg() && MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 && - MI.isDereferenceableInvariantLoad(AA)) { + MI.isDereferenceableInvariantLoad()) { Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg(); if (BaseReg == 0 || BaseReg == X86::RIP) return true; @@ -3892,6 +3892,10 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, Register DestReg, int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { + const MachineFunction &MF = *MBB.getParent(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) && + "Load size exceeds stack slot"); if (RC->getID() == X86::TILERegClassID) { unsigned 
Opc = X86::TILELOADD; // tileloadd (%sp, %idx), %tmm @@ -3913,8 +3917,6 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx); } else { - const MachineFunction &MF = *MBB.getParent(); - const MachineFrameInfo &MFI = MF.getFrameInfo(); unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16); bool isAligned = (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) || diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 4943d2152fd2..98da00c39bdb 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -240,8 +240,7 @@ public: unsigned isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override; - bool isReallyTriviallyReMaterializable(const MachineInstr &MI, - AAResults *AA) const override; + bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override; void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 06cb280e860a..c5557bd5df4e 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -140,6 +140,7 @@ def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", let Predicates = [NoAVX512] in { def : Pat<(v16i8 immAllZerosV), (V_SET0)>; def : Pat<(v8i16 immAllZerosV), (V_SET0)>; +def : Pat<(v8f16 immAllZerosV), (V_SET0)>; def : Pat<(v4i32 immAllZerosV), (V_SET0)>; def : Pat<(v2i64 immAllZerosV), (V_SET0)>; def : Pat<(v2f64 immAllZerosV), (V_SET0)>; @@ -159,6 +160,7 @@ def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "", let Predicates = [NoAVX512] in { def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>; def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>; +def : Pat<(v16f16 immAllZerosV), (AVX_SET0)>; def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>; def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>; def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>; @@ -572,6 +574,23 @@ let Predicates = [HasAVX, NoVLX] in { (VMOVUPSYmr addr:$dst, VR256:$src)>; def : Pat<(store (v32i8 VR256:$src), addr:$dst), (VMOVUPSYmr addr:$dst, VR256:$src)>; + + def : Pat<(alignedloadv8f16 addr:$src), + (VMOVAPSrm addr:$src)>; + def : Pat<(loadv8f16 addr:$src), + (VMOVUPSrm addr:$src)>; + def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v8f16 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedloadv16f16 addr:$src), + (VMOVAPSYrm addr:$src)>; + def : Pat<(loadv16f16 addr:$src), + (VMOVUPSYrm addr:$src)>; + def : Pat<(alignedstore (v16f16 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v16f16 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; } // Use movaps / movups for SSE integer load / store (one byte shorter). 
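The v8f16/v16f16 load and store patterns above deliberately reuse the existing (V)MOVAPS/(V)MOVUPS encodings: a 128-bit or 256-bit move is element-type agnostic, so no FP16-specific instructions are needed. A small illustration using a public intrinsic; the helper name is invented for this note.

#include <immintrin.h>

// Loads 16 raw bytes; the same instruction serves v4f32, v8i16 and v8f16 data.
__m128 load_16_bytes(const void *p) {
  return _mm_loadu_ps(static_cast<const float *>(p)); // vmovups / movups
}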
@@ -613,6 +632,17 @@ let Predicates = [UseSSE1] in { (MOVUPSmr addr:$dst, VR128:$src)>; } +let Predicates = [UseSSE2] in { + def : Pat<(alignedloadv8f16 addr:$src), + (MOVAPSrm addr:$src)>; + def : Pat<(loadv8f16 addr:$src), + (MOVUPSrm addr:$src)>; + def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v8f16 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; +} + //===----------------------------------------------------------------------===// // SSE 1 & 2 - Move Low packed FP Instructions //===----------------------------------------------------------------------===// @@ -3136,6 +3166,8 @@ let Predicates = [HasAVX, NoVLX] in { (VMOVNTDQYmr addr:$dst, VR256:$src)>; def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst), (VMOVNTDQYmr addr:$dst, VR256:$src)>; + def : Pat<(alignednontemporalstore (v16f16 VR256:$src), addr:$dst), + (VMOVNTDQYmr addr:$dst, VR256:$src)>; def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst), (VMOVNTDQYmr addr:$dst, VR256:$src)>; @@ -3143,6 +3175,8 @@ let Predicates = [HasAVX, NoVLX] in { (VMOVNTDQmr addr:$dst, VR128:$src)>; def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), (VMOVNTDQmr addr:$dst, VR128:$src)>; + def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst), + (VMOVNTDQmr addr:$dst, VR128:$src)>; def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), (VMOVNTDQmr addr:$dst, VR128:$src)>; } @@ -3152,6 +3186,8 @@ let Predicates = [UseSSE2] in { (MOVNTDQmr addr:$dst, VR128:$src)>; def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), (MOVNTDQmr addr:$dst, VR128:$src)>; + def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst), + (MOVNTDQmr addr:$dst, VR128:$src)>; def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), (MOVNTDQmr addr:$dst, VR128:$src)>; } @@ -3374,12 +3410,16 @@ let Predicates = [HasAVX, NoVLX] in { (VMOVDQArm addr:$src)>; def : Pat<(alignedloadv8i16 addr:$src), (VMOVDQArm addr:$src)>; + def : Pat<(alignedloadv8f16 addr:$src), + (VMOVDQArm addr:$src)>; def : Pat<(alignedloadv16i8 addr:$src), (VMOVDQArm addr:$src)>; def : Pat<(loadv4i32 addr:$src), (VMOVDQUrm addr:$src)>; def : Pat<(loadv8i16 addr:$src), (VMOVDQUrm addr:$src)>; + def : Pat<(loadv8f16 addr:$src), + (VMOVDQUrm addr:$src)>; def : Pat<(loadv16i8 addr:$src), (VMOVDQUrm addr:$src)>; @@ -3387,12 +3427,16 @@ let Predicates = [HasAVX, NoVLX] in { (VMOVDQAmr addr:$dst, VR128:$src)>; def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), (VMOVDQAmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst), + (VMOVDQAmr addr:$dst, VR128:$src)>; def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), (VMOVDQAmr addr:$dst, VR128:$src)>; def : Pat<(store (v4i32 VR128:$src), addr:$dst), (VMOVDQUmr addr:$dst, VR128:$src)>; def : Pat<(store (v8i16 VR128:$src), addr:$dst), (VMOVDQUmr addr:$dst, VR128:$src)>; + def : Pat<(store (v8f16 VR128:$src), addr:$dst), + (VMOVDQUmr addr:$dst, VR128:$src)>; def : Pat<(store (v16i8 VR128:$src), addr:$dst), (VMOVDQUmr addr:$dst, VR128:$src)>; } @@ -6431,6 +6475,8 @@ let Predicates = [HasAVX2, NoVLX] in { (VMOVNTDQAYrm addr:$src)>; def : Pat<(v16i16 (alignednontemporalload addr:$src)), (VMOVNTDQAYrm addr:$src)>; + def : Pat<(v16f16 (alignednontemporalload addr:$src)), + (VMOVNTDQAYrm addr:$src)>; def : Pat<(v32i8 (alignednontemporalload addr:$src)), (VMOVNTDQAYrm addr:$src)>; } @@ -6446,6 +6492,8 @@ let Predicates = [HasAVX, NoVLX] in { (VMOVNTDQArm addr:$src)>; def : 
Pat<(v8i16 (alignednontemporalload addr:$src)), (VMOVNTDQArm addr:$src)>; + def : Pat<(v8f16 (alignednontemporalload addr:$src)), + (VMOVNTDQArm addr:$src)>; def : Pat<(v16i8 (alignednontemporalload addr:$src)), (VMOVNTDQArm addr:$src)>; } @@ -6461,6 +6509,8 @@ let Predicates = [UseSSE41] in { (MOVNTDQArm addr:$src)>; def : Pat<(v8i16 (alignednontemporalload addr:$src)), (MOVNTDQArm addr:$src)>; + def : Pat<(v8f16 (alignednontemporalload addr:$src)), + (MOVNTDQArm addr:$src)>; def : Pat<(v16i8 (alignednontemporalload addr:$src)), (MOVNTDQArm addr:$src)>; } @@ -7050,6 +7100,8 @@ def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTF128 addr:$src)>; def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTF128 addr:$src)>; +def : Pat<(v16f16 (X86SubVBroadcastld128 addr:$src)), + (VBROADCASTF128 addr:$src)>; def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTF128 addr:$src)>; } @@ -7095,6 +7147,7 @@ let Predicates = [HasAVX1Only] in { defm : vperm2x128_lowering<"VPERM2F128", v4i64, loadv4i64>; defm : vperm2x128_lowering<"VPERM2F128", v8i32, loadv8i32>; defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>; + defm : vperm2x128_lowering<"VPERM2F128", v16f16, loadv16f16>; defm : vperm2x128_lowering<"VPERM2F128", v32i8, loadv32i8>; } @@ -7150,6 +7203,8 @@ let Predicates = [HasAVX1Only] in { defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2i64, v4i64, loadv2i64, loadv4i64>; defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4i32, v8i32, loadv4i32, loadv8i32>; defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8i16, v16i16, loadv8i16, loadv16i16>; + defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8f16, v16f16, loadv8f16, loadv16f16>; + defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v16i8, v32i8, loadv16i8, loadv32i8>; defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v16i8, v32i8, loadv16i8, loadv32i8>; } @@ -7189,6 +7244,8 @@ let Predicates = [HasAVX1Only] in { defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>; defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>; defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>; + defm : vextract_lowering<"VEXTRACTF128", v16f16, v8f16>; + defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>; defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>; } @@ -7503,6 +7560,10 @@ def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)) (VBLENDPSYrri VR256:$src1, (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src2, sub_xmm), 0xf)>; +def : Pat<(insert_subvector (v16f16 VR256:$src1), (v8f16 VR128:$src2), (iPTR 0)), + (VBLENDPSYrri VR256:$src1, + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src2, sub_xmm), 0xf)>; def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)), (VBLENDPSYrri VR256:$src1, (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), @@ -7517,6 +7578,9 @@ def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0 def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)), (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; +def : Pat<(insert_subvector (loadv16f16 addr:$src2), (v8f16 VR128:$src1), (iPTR 0)), + (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)), (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; @@ -7759,6 +7823,8 @@ let Predicates = [HasAVX2] in { defm : 
vperm2x128_lowering<"VPERM2I128", v4i64, loadv4i64>; defm : vperm2x128_lowering<"VPERM2I128", v8i32, loadv8i32>; defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>; + defm : vperm2x128_lowering<"VPERM2I128", v16f16, loadv16f16>; + defm : vperm2x128_lowering<"VPERM2I128", v32i8, loadv32i8>; defm : vperm2x128_lowering<"VPERM2I128", v32i8, loadv32i8>; } @@ -7781,6 +7847,8 @@ let Predicates = [HasAVX2, NoVLX] in { defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v2i64, v4i64, loadv2i64, loadv4i64>; defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v4i32, v8i32, loadv4i32, loadv8i32>; defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8i16, v16i16, loadv8i16, loadv16i16>; + defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8f16, v16f16, loadv8f16, loadv16f16>; + defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8, loadv16i8, loadv32i8>; defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8, loadv16i8, loadv32i8>; } @@ -7801,6 +7869,8 @@ let Predicates = [HasAVX2, NoVLX] in { defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>; defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>; defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>; + defm : vextract_lowering<"VEXTRACTI128", v16f16, v8f16>; + defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>; defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>; } diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index f4e25e4194db..1de2a1725954 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -254,8 +254,12 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const { StringRef CPU = CPUAttr.isValid() ? CPUAttr.getValueAsString() : (StringRef)TargetCPU; - StringRef TuneCPU = - TuneAttr.isValid() ? TuneAttr.getValueAsString() : (StringRef)CPU; + // "x86-64" is a default target setting for many front ends. In these cases, + // they actually request for "generic" tuning unless the "tune-cpu" was + // specified. + StringRef TuneCPU = TuneAttr.isValid() ? TuneAttr.getValueAsString() + : CPU == "x86-64" ? "generic" + : (StringRef)CPU; StringRef FS = FSAttr.isValid() ? 
FSAttr.getValueAsString() : (StringRef)TargetFS; diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index b36f8a3d06d0..b27aac9c4e93 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1297,29 +1297,6 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, LT.first = NumOfDests * NumOfShufflesPerDest; } - static const CostTblEntry AVX512FP16ShuffleTbl[] = { - {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw - {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw - {TTI::SK_Broadcast, MVT::v8f16, 1}, // vpbroadcastw - - {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw - {TTI::SK_Reverse, MVT::v16f16, 2}, // vpermw - {TTI::SK_Reverse, MVT::v8f16, 1}, // vpshufb - - {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw - {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw - {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // vpshufb - - {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w - {TTI::SK_PermuteTwoSrc, MVT::v16f16, 2}, // vpermt2w - {TTI::SK_PermuteTwoSrc, MVT::v8f16, 2} // vpermt2w - }; - - if (!ST->useSoftFloat() && ST->hasFP16()) - if (const auto *Entry = - CostTableLookup(AVX512FP16ShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; - static const CostTblEntry AVX512VBMIShuffleTbl[] = { {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb @@ -1339,17 +1316,22 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, static const CostTblEntry AVX512BWShuffleTbl[] = { {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw + {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw + {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2 {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw + {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw + {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16 {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w + {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1 @@ -1369,6 +1351,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw + {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd @@ -1376,6 +1359,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd {TTI::SK_Reverse, MVT::v32i16, 7}, // per mca + {TTI::SK_Reverse, MVT::v32f16, 7}, // per mca {TTI::SK_Reverse, MVT::v64i8, 7}, // per mca {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd @@ -1408,11 +1392,14 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, // FIXME: This just applies the type legalization cost rules above // assuming these completely split. 
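The X86TargetMachine change above means a function carrying only "target-cpu"="x86-64" is now tuned as "generic" unless an explicit "tune-cpu" attribute is present. A minimal sketch of how a front end could pin the old behaviour via the C++ API; the function name and the chosen tuning value are assumptions for this note.

#include "llvm/IR/Function.h"

void pinX8664Tuning(llvm::Function &F) {
  F.addFnAttr("target-cpu", "x86-64");
  F.addFnAttr("tune-cpu", "x86-64"); // opt back out of the new "generic" default
}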
{TTI::SK_PermuteSingleSrc, MVT::v32i16, 14}, + {TTI::SK_PermuteSingleSrc, MVT::v32f16, 14}, {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14}, {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42}, + {TTI::SK_PermuteTwoSrc, MVT::v32f16, 42}, {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42}, {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq + {TTI::SK_Select, MVT::v32f16, 1}, // vpternlogq {TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq {TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps @@ -1430,6 +1417,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw + {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd @@ -1437,9 +1425,11 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb + {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb + {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd @@ -1448,6 +1438,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb // + vpblendvb + {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb + // + vpblendvb {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb // + vpblendvb @@ -1457,6 +1449,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb // + vpblendvb + {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb + // + vpblendvb {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb // + vpblendvb }; @@ -1493,6 +1487,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128 + {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128 {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128 {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd @@ -1501,6 +1496,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb // + vinsertf128 + {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb + // + vinsertf128 {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb // + vinsertf128 @@ -1509,6 +1506,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, {TTI::SK_Select, MVT::v8i32, 1}, // vblendps {TTI::SK_Select, MVT::v8f32, 1}, // vblendps {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor + {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd @@ 
-1517,6 +1515,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb // + 2*por + vinsertf128 + {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb + // + 2*por + vinsertf128 {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb // + 2*por + vinsertf128 @@ -1526,6 +1526,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb // + 4*por + vinsertf128 + {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb + // + 4*por + vinsertf128 {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb // + 4*por + vinsertf128 }; @@ -1540,6 +1542,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, {TTI::SK_Select, MVT::v4i32, 1}, // pblendw {TTI::SK_Select, MVT::v4f32, 1}, // blendps {TTI::SK_Select, MVT::v8i16, 1}, // pblendw + {TTI::SK_Select, MVT::v8f16, 1}, // pblendw {TTI::SK_Select, MVT::v16i8, 1} // pblendvb }; @@ -1549,18 +1552,23 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, static const CostTblEntry SSSE3ShuffleTbl[] = { {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb + {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb + {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por + {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb + {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por + {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por }; @@ -1573,12 +1581,14 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd + {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd + {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw // + 2*pshufd + 2*unpck + packus @@ -1586,6 +1596,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, {TTI::SK_Select, MVT::v2f64, 1}, // movsd {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por + {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd @@ -1593,6 +1604,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw // + pshufd/unpck + {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw + // + pshufd/unpck { 
   TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
                                               // + 2*pshufd + 2*unpck + 2*packus
@@ -1600,6 +1613,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
   { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 },  // shufpd
   { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 },  // 2*{unpck,movsd,pshufd}
   { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 },  // blend+permute
+  { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 },  // blend+permute
   { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
 };
@@ -5219,7 +5233,7 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
   if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
     return true;
-  if (ScalarTy->isHalfTy() && ST->hasBWI() && ST->hasFP16())
+  if (ScalarTy->isHalfTy() && ST->hasBWI())
     return true;
   if (!ScalarTy->isIntegerTy())
@@ -5674,8 +5688,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
     if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
         EltTy->isIntegerTy(32) || EltTy->isPointerTy())
       return true;
-    if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) ||
-        (!ST->useSoftFloat() && ST->hasFP16() && EltTy->isHalfTy()))
+    if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
       return HasBW;
     return false;
   };
diff --git a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
index 19ebcb3ea3e8..2fb06e29bf3b 100644
--- a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -27,7 +27,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetOptions.h"
-#include <algorithm> // std::sort
+#include <algorithm>
 using namespace llvm;
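With the relaxed isLegalMaskedLoad check above (BWI alone suffices for half elements), the vectorizer may now form masked loads of <8 x half> on AVX512BW targets that lack AVX512-FP16. A hedged sketch of emitting such a load through IRBuilder; the surrounding function, pointer, mask and pass-through values are assumed.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

llvm::Value *emitMaskedHalfLoad(llvm::IRBuilder<> &B, llvm::Value *Ptr,
                                llvm::Value *Mask, llvm::Value *PassThru) {
  // Build the <8 x half> type and emit an @llvm.masked.load with 2-byte alignment.
  auto *VTy = llvm::FixedVectorType::get(B.getHalfTy(), 8);
  return B.CreateMaskedLoad(VTy, Ptr, llvm::Align(2), Mask, PassThru);
}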