Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td | 244
1 file changed, 209 insertions, 35 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
index 7be63ae6964b..829669157893 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -14,12 +14,24 @@ class GCNPat<dag pattern, dag result> : Pat<pattern, result>, GCNPredicateContro
 }
 
+class UniformSextInreg<ValueType VT> : PatFrag<
+  (ops node:$src),
+  (sext_inreg $src, VT),
+  [{ return !N->isDivergent(); }]>;
+
+class DivergentSextInreg<ValueType VT> : PatFrag<
+  (ops node:$src),
+  (sext_inreg $src, VT),
+  [{ return N->isDivergent(); }]>;
+
 include "SOPInstructions.td"
 include "VOPInstructions.td"
 include "SMInstructions.td"
 include "FLATInstructions.td"
 include "BUFInstructions.td"
 include "EXPInstructions.td"
+include "LDSDIRInstructions.td"
+include "VINTERPInstructions.td"
 
 //===----------------------------------------------------------------------===//
 // VINTRP Instructions
@@ -176,19 +188,33 @@ def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
   let mayStore = 0;
 }
 
+// Pseudo instructions used for @llvm.fptrunc.round upward
+// and @llvm.fptrunc.round downward.
+// These intrinsics will be legalized to G_FPTRUNC_ROUND_UPWARD
+// and G_FPTRUNC_ROUND_DOWNWARD before being lowered to
+// FPTRUNC_UPWARD_PSEUDO and FPTRUNC_DOWNWARD_PSEUDO.
+// The final codegen is done in the ModeRegister pass.
+let Uses = [MODE, EXEC] in {
+def FPTRUNC_UPWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
+  (ins VGPR_32:$src0),
+  [(set f16:$vdst, (SIfptrunc_round_upward f32:$src0))]>;
+
+def FPTRUNC_DOWNWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
+  (ins VGPR_32:$src0),
+  [(set f16:$vdst, (SIfptrunc_round_downward f32:$src0))]>;
+} // End Uses = [MODE, EXEC]
+
 // Invert the exec mask and overwrite the inactive lanes of dst with inactive,
 // restoring it after we're done.
 let Defs = [SCC] in {
 def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
-  (ins VGPR_32: $src, VSrc_b32:$inactive),
+  (ins VSrc_b32: $src, VSrc_b32:$inactive),
   [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
-  let Constraints = "$src = $vdst";
 }
 
 def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
-  (ins VReg_64: $src, VSrc_b64:$inactive),
+  (ins VSrc_b64: $src, VSrc_b64:$inactive),
   [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
-  let Constraints = "$src = $vdst";
 }
 } // End Defs = [SCC]
 
@@ -287,6 +313,20 @@ def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
   let isConvergent = 1;
   let FixedSize = 1;
   let Size = 0;
+  let isMeta = 1;
+}
+
+def SCHED_BARRIER : SPseudoInstSI<(outs), (ins i32imm:$mask),
+  [(int_amdgcn_sched_barrier (i32 timm:$mask))]> {
+  let SchedRW = [];
+  let hasNoSchedulingInfo = 1;
+  let hasSideEffects = 1;
+  let mayLoad = 0;
+  let mayStore = 0;
+  let isConvergent = 1;
+  let FixedSize = 1;
+  let Size = 0;
+  let isMeta = 1;
 }
 
 // SI pseudo instructions. These are used by the CFG structurizer pass
@@ -424,6 +464,7 @@ def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
   let Size = 0;
   let hasNoSchedulingInfo = 1;
   let FixedSize = 1;
+  let isMeta = 1;
 }
 
 // Used as an isel pseudo to directly emit initialization with an
@@ -459,11 +500,14 @@ def SI_RETURN_TO_EPILOG : SPseudoInstSI <
   let hasNoSchedulingInfo = 1;
   let DisableWQM = 1;
   let FixedSize = 1;
+
+  // TODO: Should this be true?
+  let isMeta = 0;
 }
 
 // Return for returning function calls.
 def SI_RETURN : SPseudoInstSI <
-  (outs), (ins), [],
+  (outs), (ins), [(AMDGPUret_flag)],
   "; return"> {
   let isTerminator = 1;
   let isBarrier = 1;
@@ -496,6 +540,7 @@ def : GCNPat<
 def SI_CALL : SPseudoInstSI <
   (outs SReg_64:$dst), (ins SSrc_b64:$src0, unknown:$callee)> {
   let Size = 4;
+  let FixedSize = 1;
   let isCall = 1;
   let UseNamedOperandTable = 1;
   let SchedRW = [WriteBranch];
@@ -508,6 +553,7 @@ def SI_TCRETURN : SPseudoInstSI <(outs),
   (ins SReg_64:$src0, unknown:$callee, i32imm:$fpdiff),
   [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
   let Size = 4;
+  let FixedSize = 1;
   let isCall = 1;
   let isTerminator = 1;
   let isReturn = 1;
@@ -1212,6 +1258,26 @@ def : Pat <
   (v4f16 (EXTRACT_SUBREG v8f16:$vec, sub2_sub3))
 >;
 
+def : Pat <
+  (extract_subvector v16i16:$vec, (i32 0)),
+  (v8i16 (EXTRACT_SUBREG v16i16:$vec, sub0_sub1_sub2_sub3))
+>;
+
+def : Pat <
+  (extract_subvector v16i16:$vec, (i32 8)),
+  (v8i16 (EXTRACT_SUBREG v16i16:$vec, sub4_sub5_sub6_sub7))
+>;
+
+def : Pat <
+  (extract_subvector v16f16:$vec, (i32 0)),
+  (v8f16 (EXTRACT_SUBREG v16f16:$vec, sub0_sub1_sub2_sub3))
+>;
+
+def : Pat <
+  (extract_subvector v16f16:$vec, (i32 8)),
+  (v8f16 (EXTRACT_SUBREG v16f16:$vec, sub4_sub5_sub6_sub7))
+>;
+
 foreach Index = 0-31 in {
   def Extract_Element_v32i32_#Index : Extract_Element <
     i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -1371,7 +1437,18 @@ def : BitConvert <v8i32, v4i64, VReg_256>;
 def : BitConvert <v8f32, v4i64, VReg_256>;
 def : BitConvert <v8i32, v4f64, VReg_256>;
 def : BitConvert <v8f32, v4f64, VReg_256>;
-
+def : BitConvert <v16i16, v16f16, SReg_256>;
+def : BitConvert <v16f16, v16i16, SReg_256>;
+def : BitConvert <v16i16, v16f16, VReg_256>;
+def : BitConvert <v16f16, v16i16, VReg_256>;
+def : BitConvert <v16f16, v8i32, VReg_256>;
+def : BitConvert <v16i16, v8i32, VReg_256>;
+def : BitConvert <v16f16, v8f32, VReg_256>;
+def : BitConvert <v16i16, v8f32, VReg_256>;
+def : BitConvert <v8i32, v16f16, VReg_256>;
+def : BitConvert <v8i32, v16i16, VReg_256>;
+def : BitConvert <v8f32, v16f16, VReg_256>;
+def : BitConvert <v8f32, v16i16, VReg_256>;
 
 // 512-bit bitcast
 def : BitConvert <v16i32, v16f32, VReg_512>;
@@ -1941,12 +2018,6 @@ def : GCNPat <
 //===----------------------------------------------------------------------===//
 // Conversion Patterns
 //===----------------------------------------------------------------------===//
-
-class UniformSextInreg<ValueType VT> : PatFrag<
-  (ops node:$src),
-  (sext_inreg $src, VT),
-  [{ return !N->isDivergent(); }]>;
-
 def : GCNPat<(i32 (UniformSextInreg<i1> i32:$src)),
   (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16
 
@@ -1981,23 +2052,28 @@ def : GCNPat <
   (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16
 >;
 
-
-class DivergentSextInreg<ValueType VT> : PatFrag<
-  (ops node:$src),
-  (sext_inreg $src, VT),
-  [{ return N->isDivergent(); }]>;
-
-def : GCNPat<(i32 (DivergentSextInreg<i1> i32:$src)),
+def : GCNPat<
+  (i32 (DivergentSextInreg<i1> i32:$src)),
   (V_BFE_I32_e64 i32:$src, (i32 0), (i32 1))>;
 
 def : GCNPat <
   (i16 (DivergentSextInreg<i1> i16:$src)),
-  (V_BFE_I32_e64 $src, (i32 0), (i32 1)) // 0 | 1 << 16
+  (V_BFE_I32_e64 $src, (i32 0), (i32 1))
 >;
 
 def : GCNPat <
   (i16 (DivergentSextInreg<i8> i16:$src)),
-  (V_BFE_I32_e64 $src, (i32 0), (i32 8)) // 0 | 8 << 16
+  (V_BFE_I32_e64 $src, (i32 0), (i32 8))
+>;
+
+def : GCNPat<
+  (i32 (DivergentSextInreg<i8> i32:$src)),
+  (V_BFE_I32_e64 i32:$src, (i32 0), (i32 8))
+>;
+
+def : GCNPat <
+  (i32 (DivergentSextInreg<i16> i32:$src)),
+  (V_BFE_I32_e64 $src, (i32 0), (i32 16))
 >;
 
 def : GCNPat
 <
@@ -2010,14 +2086,14 @@ def : GCNPat <
   (i64 (DivergentSextInreg<i8> i64:$src)),
   (REG_SEQUENCE VReg_64,
-    (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8)/* 0 | 8 << 16 */), sub0,
+    (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8)), sub0,
     (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8))), sub1)
 >;
 
 def : GCNPat <
   (i64 (DivergentSextInreg<i16> i64:$src)),
   (REG_SEQUENCE VReg_64,
-    (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16)/* 0 | 16 << 16 */), sub0,
+    (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16)), sub0,
     (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16))), sub1)
 >;
 
@@ -2053,12 +2129,18 @@ def : ZExt_i64_i1_Pat<anyext>;
 
 // FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
 // REG_SEQUENCE patterns don't support instructions with multiple outputs.
 def : GCNPat <
-  (i64 (sext i32:$src)),
+  (i64 (UniformUnaryFrag<sext> i32:$src)),
     (REG_SEQUENCE SReg_64, $src, sub0,
     (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1)
 >;
 
 def : GCNPat <
+  (i64 (DivergentUnaryFrag<sext> i32:$src)),
+    (REG_SEQUENCE VReg_64, $src, sub0,
+    (i32 (COPY_TO_REGCLASS (V_ASHRREV_I32_e64 (i32 31), $src), VGPR_32)), sub1)
+>;
+
+def : GCNPat <
   (i64 (sext i1:$src)),
   (REG_SEQUENCE VReg_64,
     (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
@@ -2235,6 +2317,30 @@ def : GCNPat <
 // the src is lowered. e.g. fptrunc + fma may be lowered to a
 // v_fma_mix* instruction which does not zero, or may not.
 def : GCNPat<
+  (i32 (DivergentUnaryFrag<abs> i32:$src)),
+  (V_MAX_I32_e64 (V_SUB_CO_U32_e32 (i32 0), $src), $src)>;
+
+let AddedComplexity = 1 in {
+def : GCNPat<
+  (i32 (DivergentUnaryFrag<abs> i32:$src)),
+  (V_MAX_I32_e64 (V_SUB_U32_e32 (i32 0), $src), $src)>{
+  let SubtargetPredicate = HasAddNoCarryInsts;
+}
+} // AddedComplexity = 1
+
+def : GCNPat<
+  (i32 (DivergentUnaryFrag<zext> i16:$src)),
+  (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src)
+>;
+
+def : GCNPat<
+  (i64 (DivergentUnaryFrag<zext> i16:$src)),
+  (REG_SEQUENCE VReg_64,
+    (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src), sub0,
+    (S_MOV_B32 (i32 0)), sub1)
+>;
+
+def : GCNPat<
   (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
   (COPY VSrc_b16:$src)>;
 
@@ -2269,6 +2375,34 @@ def : GCNPat <
   (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
 >;
 
+def IMMBitSelConst : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(1ULL << N->getZExtValue(), SDLoc(N),
+                                   MVT::i32);
+}]>;
+
+// Matching separate SRL and TRUNC instructions
+// with dependent operands (SRL dest is source of TRUNC)
+// generates three instructions. However, by using bit shifts,
+// the V_LSHRREV_B32_e64 result can be directly used in the
+// operand of the V_AND_B32_e64 instruction:
+// (trunc i32 (srl i32 $a, i32 $b)) ->
+//     v_and_b32_e64 $a, (1 << $b), $a
+//     v_cmp_ne_u32_e64 $a, 0, $a
+
+// Handle the VALU case.
+def : GCNPat <
+  (i1 (DivergentUnaryFrag<trunc> (i32 (srl i32:$a, (i32 imm:$b))))),
+  (V_CMP_NE_U32_e64 (V_AND_B32_e64 (i32 (IMMBitSelConst $b)), $a),
+    (i32 0))
+>;
+
+// Handle the scalar case.
+def : GCNPat <
+  (i1 (UniformUnaryFrag<trunc> (i32 (srl i32:$a, (i32 imm:$b))))),
+  (S_CMP_LG_U32 (S_AND_B32 (i32 (IMMBitSelConst $b)), $a),
+    (i32 0))
+>;
+
 def : GCNPat <
   (i1 (DivergentUnaryFrag<trunc> i64:$a)),
   (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1),
@@ -2350,6 +2484,11 @@ def : GCNPat <
 
 }
 
+def : GCNPat<
+  (i64 (DivergentUnaryFrag<bitreverse> i64:$a)),
+  (REG_SEQUENCE VReg_64,
+    (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1))), sub0,
+    (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0))), sub1)>;
 
 // Prefer selecting to max when legal, but using mul is always valid.
 let AddedComplexity = -5 in {
@@ -2508,12 +2647,12 @@ def : GCNPat <
 >;
 
 def : GCNPat <
-  (v2i16 (build_vector (i16 SReg_32:$src0), (i16 undef))),
+  (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 undef))),
   (COPY_TO_REGCLASS SReg_32:$src0, SReg_32)
 >;
 
 def : GCNPat <
-  (v2i16 (build_vector (i16 VGPR_32:$src0), (i16 undef))),
+  (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src0), (i16 undef))),
   (COPY_TO_REGCLASS VGPR_32:$src0, VGPR_32)
 >;
 
@@ -2597,6 +2736,15 @@ def : GCNPat <
 >;
 } // End SubtargetPredicate = HasVOP3PInsts
 
+// With multiple uses of the shift, this will duplicate the shift and
+// increase register pressure.
+let SubtargetPredicate = isGFX11Plus in
+def : GCNPat <
+  (v2i16 (build_vector (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), (i16 SReg_32:$src1))),
+  (v2i16 (S_PACK_HL_B32_B16 SReg_32:$src0, SReg_32:$src1))
+>;
+
+
 def : GCNPat <
   (v2f16 (scalar_to_vector f16:$src0)),
   (COPY $src0)
@@ -2678,18 +2826,18 @@ def : GCNPat <
 // an inline immediate than -c.
 // TODO: Also do for 64-bit.
 def : GCNPat<
-  (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
+  (UniformBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)),
   (S_SUB_I32 SReg_32:$src0, NegSubInlineConst32:$src1)
 >;
 
 def : GCNPat<
-  (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
+  (DivergentBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)),
   (V_SUB_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> {
   let SubtargetPredicate = HasAddNoCarryInsts;
 }
 
 def : GCNPat<
-  (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
+  (DivergentBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)),
   (V_SUB_CO_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> {
   let SubtargetPredicate = NotHasAddNoCarryInsts;
 }
@@ -2703,20 +2851,21 @@ def : GCNPat<
   (S_MOV_B32 SReg_32:$src)
 >;
 
-multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
+multiclass BFMPatterns <ValueType vt, PatFrag SHL, PatFrag ADD, InstSI BFM> {
   def : GCNPat <
-    (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
+    (vt (SHL (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
     (BFM $a, $b)
   >;
 
   def : GCNPat <
-    (vt (add (vt (shl 1, vt:$a)), -1)),
-    (BFM $a, (MOV (i32 0)))
+    (vt (ADD (vt (shl 1, vt:$a)), -1)),
+    (BFM $a, (i32 0))
   >;
 }
 
-defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
-// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
+defm : BFMPatterns <i32, UniformBinFrag<shl>, UniformBinFrag<add>, S_BFM_B32>;
+// FIXME: defm : BFMPatterns <i64, UniformBinFrag<shl>, UniformBinFrag<add>, S_BFM_B64>;
+defm : BFMPatterns <i32, DivergentBinFrag<shl>, DivergentBinFrag<add>, V_BFM_B32_e64>;
 
 // Bitfield extract patterns
 
@@ -3007,6 +3156,19 @@ def G_AMDGPU_CLAMP : AMDGPUGenericInstruction {
   let hasSideEffects = 0;
 }
 
+// Integer multiply-add: arg0 * arg1 + arg2.
+//
+// arg0 and arg1 are 32-bit integers (interpreted as signed or unsigned),
+// arg2 is a 64-bit integer. Result is a 64-bit integer and a 1-bit carry-out.
+class G_AMDGPU_MAD_64_32 : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst, type1:$carry_out);
+  let InOperandList = (ins type2:$arg0, type2:$arg1, type0:$arg2);
+  let hasSideEffects = 0;
+}
+
+def G_AMDGPU_MAD_U64_U32 : G_AMDGPU_MAD_64_32;
+def G_AMDGPU_MAD_I64_I32 : G_AMDGPU_MAD_64_32;
+
 // Atomic cmpxchg. $cmpval ad $newval are packed in a single vector
 // operand Expects a MachineMemOperand in addition to explicit
 // operands.
@@ -3130,3 +3292,15 @@ def G_SI_CALL : AMDGPUGenericInstruction {
   // TODO: Should really base this on the call target
   let isConvergent = 1;
 }
+
+def G_FPTRUNC_ROUND_UPWARD : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$vdst);
+  let InOperandList = (ins type1:$src0);
+  let hasSideEffects = 0;
+}
+
+def G_FPTRUNC_ROUND_DOWNWARD : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$vdst);
+  let InOperandList = (ins type1:$src0);
+  let hasSideEffects = 0;
+}
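The `IMMBitSelConst` transform in the patch rewrites `trunc (srl $a, $b)` into a single-bit test: instead of shifting `$a` right by `$b` and truncating, it masks `$a` with the constant `1 << $b` and compares against zero. A minimal standalone C++ sketch of the identity the transform relies on (illustrative only, not taken from the patch or the LLVM tree):

```cpp
#include <cassert>
#include <cstdint>

// Truncating (a >> b) to i1 keeps only bit b of a, which is the same as
// testing a & (1 << b) != 0 -- the mask IMMBitSelConst materializes.
static bool truncOfShift(uint32_t a, uint32_t b) { return (a >> b) & 1u; }
static bool maskedTest(uint32_t a, uint32_t b) { return (a & (1u << b)) != 0u; }

int main() {
  const uint32_t samples[] = {0u, 1u, 0x80000000u, 0xDEADBEEFu, 0xFFFFFFFFu};
  for (uint32_t b = 0; b < 32; ++b)
    for (uint32_t a : samples)
      assert(truncOfShift(a, b) == maskedTest(a, b));
  return 0;
}
```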
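Similarly, the reworked `BFMPatterns` multiclass matches the DAG shape `shl(add(shl(1, a), -1), b)`, i.e. `((1 << a) - 1) << b`, and selects it to a single bitfield-mask instruction (S_BFM_B32 for uniform values, V_BFM_B32_e64 for divergent ones). A sketch checking that this expression really is a run of `a` one-bits starting at bit `b`, under the assumption `0 < a` and `a + b <= 32` (the semantics here are a simplification, not a statement about the hardware encoding):

```cpp
#include <cassert>
#include <cstdint>

// The DAG shape the pattern matches: shl(add(shl(1, a), -1), b).
static uint32_t matchedDag(uint32_t a, uint32_t b) {
  return ((1u << a) - 1u) << b;
}

// Bitfield mask built a different way: 'width' consecutive ones,
// starting at bit 'offset'.
static uint32_t bitfieldMask(uint32_t width, uint32_t offset) {
  return (0xFFFFFFFFu >> (32u - width)) << offset;
}

int main() {
  for (uint32_t width = 1; width < 32; ++width)
    for (uint32_t offset = 0; width + offset <= 32; ++offset)
      assert(matchedDag(width, offset) == bitfieldMask(width, offset));
  return 0;
}
```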
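Finally, the new `G_AMDGPU_MAD_U64_U32` / `G_AMDGPU_MAD_I64_I32` opcodes model the multiply-add described in the patch comment: a 32x32-bit multiply added to a 64-bit addend, yielding a 64-bit result plus a 1-bit carry-out. A sketch of the unsigned case using the GCC/Clang `__int128` extension; the semantics are inferred from that comment, not from an AMDGPU ISA reference:

```cpp
#include <cstdint>

struct Mad64x32 {
  uint64_t result; // the 64-bit $dst operand
  bool carry;      // the 1-bit $carry_out operand
};

// mad_u64_u32: arg0 * arg1 + arg2 computed at full precision; anything
// above bit 63 of the exact value shows up as the carry-out.
static Mad64x32 madU64U32(uint32_t arg0, uint32_t arg1, uint64_t arg2) {
  unsigned __int128 wide =
      static_cast<unsigned __int128>(arg0) * arg1 + arg2;
  return {static_cast<uint64_t>(wide), static_cast<bool>(wide >> 64)};
}

int main() {
  // (2^32-1)^2 + (2^64-1) exceeds 2^64, so the carry must be set.
  Mad64x32 r = madU64U32(0xFFFFFFFFu, 0xFFFFFFFFu, ~0ull);
  return r.carry ? 0 : 1;
}
```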